diff --git a/doc/index.rst b/doc/index.rst index 69f08730cd8b51af15b8dc06bf168f4dfa2e620b..0644b34c41adf9dbfff1575f8e08f91a8f7f7a96 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -68,7 +68,7 @@ Please check :ref:`installation` to get started. ref_creation ref_kernel ref_transform - ref_scoped_functions + ref_call ref_other misc diff --git a/doc/ref_scoped_functions.rst b/doc/ref_call.rst similarity index 59% rename from doc/ref_scoped_functions.rst rename to doc/ref_call.rst index c2deaca6732ad28012302c6623b1cfef57c2d0cb..46edc533ce16cf59b0cf96d4c0a28b3f2196642c 100644 --- a/doc/ref_scoped_functions.rst +++ b/doc/ref_call.rst @@ -1,5 +1,5 @@ -ScopedFunctions -=============== +Calling Loopy Kernels and External Functions +============================================ ``ScopedFunctions`` are pymbolic nodes within expressions in a ``Loo.py`` kernel, whose name has been resolved by the kernel. @@ -21,8 +21,8 @@ is resolved by one of the ``function_scoper`` in a :attr:`LoopKernel.scoped_func as its functionality is superseded by ``lp.register_function_scoper(...)``. -Expressions after a function is scoped. ---------------------------------------- +Expressions after a function is scoped +-------------------------------------- Consider the following expression. @@ -127,12 +127,12 @@ Description Inference Although this step has no significance for a ``ScalarCallable``, it forms a very important part of ``CallableKernel``. In which the -``dim_tags``, ``shape`` and ``mem_scope`` of the arguments of the +``dim_tags``, ``shape`` and ``address_space`` of the arguments of the callable kernel is altered. - The ``dim_tags`` attribute helps to ensure that the memory layout between the caller and the callee kernel is coherent. -- The ``mem_scope`` attribute ensures that, while writing the device +- The ``address_space`` attribute ensures that, while writing the device code we emit the appropriate scope qualifiers for the function declaration arguments. 
- The ``shape`` attribute helps in: @@ -150,121 +150,16 @@ developments of the ``sin`` pymbolic call expression node. (Type Inference) -> ScopedFunction(Variable('sin_0')) -> (Descriptor Inference) -> ScopedFunction(Variable('sin_1')) -Changes on the target side to accommodate the new function interface. ---------------------------------------------------------------------- +Changes on the target side to accommodate the new function interface +-------------------------------------------------------------------- The earlier "function\_mangler" as a member method of the class ``lp.ASTBuilderBase`` will be replaced by ``function_scopers``. The function scopers would return a list of functions with the signature ``(target, identifier)->lp.InKernelCallable``. -An example of registering Vector callables is shown below. ----------------------------------------------------------- - -.. code:: python +An example: Calling BLAS +------------------------ - import loopy as lp - import numpy as np - from loopy.diagnostic import LoopyError - from loopy.target.c import CTarget - - - # {{{ blas callable - - class BLASCallable(lp.ScalarCallable): - def with_types(self, arg_id_to_dtype, kernel): - for i in range(0, 2): - if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: - # the types provided aren't mature enough to specialize the - # callable - return self.copy(arg_id_to_dtype=arg_id_to_dtype) - - mat_dtype = arg_id_to_dtype[0].numpy_dtype - vec_dtype = arg_id_to_dtype[1].numpy_dtype - - if mat_dtype != vec_dtype: - raise LoopyError("DGEMV should have same dtype for matrix and " - "vector") - - if vec_dtype == np.float32: - name_in_target = "cblas_sgemv" - elif vec_dtype == np.float64: - name_in_target = "cblas_dgemv" - else: - raise LoopyError("GEMV only supported for float32 and float64 " - "types") - - from loopy.types import NumpyType - return self.copy(name_in_target=name_in_target, - arg_id_to_dtype={0: NumpyType(vec_dtype), 1: NumpyType(vec_dtype), - -1: 
NumpyType(vec_dtype)}) - - def emit_call_insn(self, insn, target, expression_to_code_mapper): - assert self.is_ready_for_codegen() - - from loopy.kernel.instruction import CallInstruction - - assert isinstance(insn, CallInstruction) - - parameters = insn.expression.parameters - - parameters = list(parameters) - par_dtypes = [self.arg_id_to_dtype[i] for i, _ in enumerate(parameters)] - - parameters.append(insn.assignees[0]) - par_dtypes.append(self.arg_id_to_dtype[-1]) - - # no type casting in array calls. - from loopy.expression import dtype_to_type_context - from pymbolic.mapper.stringifier import PREC_NONE - from loopy.symbolic import SubArrayRef - from pymbolic import var - - mat_descr = self.arg_id_to_descr[0] - - c_parameters = [ - expression_to_code_mapper(par, PREC_NONE, - dtype_to_type_context(target, par_dtype), - par_dtype).expr if isinstance(par, SubArrayRef) else - expression_to_code_mapper(par, PREC_NONE, - dtype_to_type_context(target, par_dtype), - par_dtype).expr - for par, par_dtype in zip( - parameters, par_dtypes)] - c_parameters.insert(0, var('CblasRowMajor')) - c_parameters.insert(1, var('CblasNoTrans')) - c_parameters.insert(2, mat_descr.shape[0]) - c_parameters.insert(3, mat_descr.shape[1]) - c_parameters.insert(4, 1) - c_parameters.insert(6, 1) - c_parameters.insert(8, 1) - c_parameters.insert(10, 1) - return var(self.name_in_target)(*c_parameters), False - - def generate_preambles(self, target): - assert isinstance(target, CTarget) - yield("99_cblas", "#include ") - return - - - def blas_fn_lookup(target, identifier): - if identifier == 'gemv': - return BLASCallable(name='gemv') - return None - - # }}} - - - n = 10 - - knl = lp.make_kernel( - "{[i]: 0<=i<10}", - """ - y[:] = gemv(A[:, :], x[:]) - """, [ - lp.ArrayArg('A', dtype=np.float64, shape=(n, n)), - lp.ArrayArg('x', dtype=np.float64, shape=(n, )), - lp.ArrayArg('y', shape=(n, )), ...], - target=CTarget()) - knl = lp.register_function_lookup(knl, blas_fn_lookup) +.. 
literalinclude:: ../examples/python/call-external.py diff --git a/doc/ref_kernel.rst b/doc/ref_kernel.rst index 07b7836d82596892f1d94e336dfa81e1b5a7a881..c9ce206260c04fc883a0f980df0b18a9a826bbd9 100644 --- a/doc/ref_kernel.rst +++ b/doc/ref_kernel.rst @@ -363,9 +363,9 @@ C Block Instructions Atomic Operations ^^^^^^^^^^^^^^^^^ -.. autoclass:: memory_ordering +.. autoclass:: MemoryOrdering -.. autoclass:: memory_scope +.. autoclass:: MemoryScope .. autoclass:: VarAtomicity @@ -586,7 +586,7 @@ Do not create :class:`LoopKernel` objects directly. Instead, refer to .. autoclass:: LoopKernel -.. autoclass:: kernel_state +.. autoclass:: KernelState :members: :undoc-members: diff --git a/examples/python/call-external.py b/examples/python/call-external.py new file mode 100644 index 0000000000000000000000000000000000000000..904270472391a3f3c776ba6c254450a9755eed13 --- /dev/null +++ b/examples/python/call-external.py @@ -0,0 +1,105 @@ +import loopy as lp +import numpy as np +from loopy.diagnostic import LoopyError +from loopy.target.c import CTarget + + +# {{{ blas callable + +class BLASCallable(lp.ScalarCallable): + def with_types(self, arg_id_to_dtype, kernel): + for i in range(0, 2): + if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: + # the types provided aren't mature enough to specialize the + # callable + return self.copy(arg_id_to_dtype=arg_id_to_dtype) + + mat_dtype = arg_id_to_dtype[0].numpy_dtype + vec_dtype = arg_id_to_dtype[1].numpy_dtype + + if mat_dtype != vec_dtype: + raise LoopyError("DGEMV should have same dtype for matrix and " + "vector") + + if vec_dtype == np.float32: + name_in_target = "cblas_sgemv" + elif vec_dtype == np.float64: + name_in_target = "cblas_dgemv" + else: + raise LoopyError("GEMV only supported for float32 and float64 " + "types") + + from loopy.types import NumpyType + return self.copy(name_in_target=name_in_target, + arg_id_to_dtype={0: NumpyType(vec_dtype), 1: NumpyType(vec_dtype), + -1: NumpyType(vec_dtype)}) + + def 
emit_call_insn(self, insn, target, expression_to_code_mapper): + assert self.is_ready_for_codegen() + + from loopy.kernel.instruction import CallInstruction + + assert isinstance(insn, CallInstruction) + + parameters = insn.expression.parameters + + parameters = list(parameters) + par_dtypes = [self.arg_id_to_dtype[i] for i, _ in enumerate(parameters)] + + parameters.append(insn.assignees[0]) + par_dtypes.append(self.arg_id_to_dtype[-1]) + + # no type casting in array calls. + from loopy.expression import dtype_to_type_context + from pymbolic.mapper.stringifier import PREC_NONE + from loopy.symbolic import SubArrayRef + from pymbolic import var + + mat_descr = self.arg_id_to_descr[0] + + c_parameters = [ + expression_to_code_mapper(par, PREC_NONE, + dtype_to_type_context(target, par_dtype), + par_dtype).expr if isinstance(par, SubArrayRef) else + expression_to_code_mapper(par, PREC_NONE, + dtype_to_type_context(target, par_dtype), + par_dtype).expr + for par, par_dtype in zip( + parameters, par_dtypes)] + c_parameters.insert(0, var('CblasRowMajor')) + c_parameters.insert(1, var('CblasNoTrans')) + c_parameters.insert(2, mat_descr.shape[0]) + c_parameters.insert(3, mat_descr.shape[1]) + c_parameters.insert(4, 1) + c_parameters.insert(6, 1) + c_parameters.insert(8, 1) + c_parameters.insert(10, 1) + return var(self.name_in_target)(*c_parameters), False + + def generate_preambles(self, target): + assert isinstance(target, CTarget) + yield("99_cblas", "#include <cblas.h>") + return + + +def blas_fn_lookup(target, identifier): + if identifier == 'gemv': + return BLASCallable(name='gemv') + return None + +# }}} + + +n = 10 + +knl = lp.make_kernel( + "{[i]: 0<=i<10}", + """ + y[:] = gemv(A[:, :], x[:]) + """, [ + lp.GlobalArg('A', dtype=np.float64, shape=(n, n)), + lp.GlobalArg('x', dtype=np.float64, shape=(n, )), + lp.GlobalArg('y', shape=(n, )), ...], + target=CTarget()) + +knl = lp.register_function_lookup(knl, blas_fn_lookup) diff --git a/loopy/__init__.py b/loopy/__init__.py index 
a5931d03ae94038931dbc1c5d74d72d6ee125143..a552e498e6a118cd054d56660e2320003c272a97 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -37,7 +37,9 @@ from loopy.library.function import ( default_function_mangler, single_arg_function_mangler) from loopy.kernel.instruction import ( - memory_ordering, memory_scope, VarAtomicity, AtomicInit, AtomicUpdate, + MemoryOrdering, memory_ordering, + MemoryScope, memory_scope, + VarAtomicity, AtomicInit, AtomicUpdate, InstructionBase, MultiAssignmentBase, Assignment, ExpressionInstruction, CallInstruction, CInstruction, NoOpInstruction, BarrierInstruction) @@ -45,13 +47,14 @@ from loopy.kernel.data import ( auto, KernelArgument, ValueArg, ArrayArg, GlobalArg, ConstantArg, ImageArg, - temp_var_scope, TemporaryVariable, AddressSpace, + AddressSpace, temp_var_scope, + TemporaryVariable, SubstitutionRule, CallMangleInfo) from loopy.kernel.function_interface import ( ScalarCallable) -from loopy.kernel import LoopKernel, kernel_state +from loopy.kernel import LoopKernel, KernelState, kernel_state from loopy.kernel.tools import ( get_dot_dependency_graph, show_dependency_graph, @@ -118,7 +121,7 @@ from loopy.transform.batch import to_batched from loopy.transform.parameter import assume, fix_parameters from loopy.transform.save import save_and_reload_temporaries from loopy.transform.add_barrier import add_barrier -from loopy.transform.register_callable import (register_callable_kernel, +from loopy.transform.callable import (register_callable_kernel, register_function_lookup, inline_callable_kernel) from loopy.transform.pack_and_unpack_args import pack_and_unpack_args_for_call @@ -158,9 +161,13 @@ __all__ = [ "auto", - "LoopKernel", "kernel_state", + "LoopKernel", + "KernelState", "kernel_state", # lower case is deprecated - "memory_ordering", "memory_scope", "VarAtomicity", + "MemoryOrdering", "memory_ordering", # lower case is deprecated + "MemoryScope", "memory_scope", # lower case is deprecated + + "VarAtomicity", 
"AtomicInit", "AtomicUpdate", "InstructionBase", "MultiAssignmentBase", "Assignment", "ExpressionInstruction", @@ -171,7 +178,8 @@ __all__ = [ "KernelArgument", "ValueArg", "ArrayArg", "GlobalArg", "ConstantArg", "ImageArg", - "AddressSpace", "temp_var_scope", "TemporaryVariable", + "AddressSpace", "temp_var_scope", # temp_var_scope is deprecated + "TemporaryVariable", "SubstitutionRule", "CallMangleInfo", diff --git a/loopy/auto_test.py b/loopy/auto_test.py index 8e647b02d9ebb8ae79dc1f06fd2051ba3d17758c..015c82dd1fa5f81665f062f974149c2e93a324a9 100644 --- a/loopy/auto_test.py +++ b/loopy/auto_test.py @@ -515,11 +515,11 @@ def auto_test_vs_ref( properties=cl.command_queue_properties.PROFILING_ENABLE) args = None - from loopy.kernel import kernel_state + from loopy.kernel import KernelState from loopy.target.pyopencl import PyOpenCLTarget if test_knl.state not in [ - kernel_state.PREPROCESSED, - kernel_state.SCHEDULED]: + KernelState.PREPROCESSED, + KernelState.SCHEDULED]: if isinstance(test_knl.target, PyOpenCLTarget): test_knl = test_knl.copy(target=PyOpenCLTarget(ctx.devices[0])) diff --git a/loopy/check.py b/loopy/check.py index 4a340e6dd7955201cf56e9660f2f030812787fdd..86d0d48d3d3fccb8e3dc9138275a53cf6d2a8b78 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -124,7 +124,8 @@ def check_functions_are_scoped(kernel): elif isinstance(insn, (CInstruction, _DataObliviousInstruction)): pass else: - raise NotImplementedError("Unknown type of instruction %s." % type(insn)) + raise NotImplementedError( + "Unknown type of instruction %s" % type(insn).__name__) # }}} @@ -185,14 +186,15 @@ def check_loop_priority_inames_known(kernel): def _get_all_unique_iname_tags(kernel): - """ Returns a list of all the unique iname tags in the *kernel*. + """Returns a set of all the iname tags used in *kernel* that + inherit from :class:`loopy.kernel.data.UniqueTag`. 
""" from loopy.kernel.data import UniqueTag iname_tags = [kernel.iname_to_tag.get(iname) for iname in kernel.all_inames()] - unique_iname_tags = [tag for tag in iname_tags if - isinstance(tag, UniqueTag)] - return unique_iname_tags + return set( + tag for tag in iname_tags if + isinstance(tag, UniqueTag)) def check_multiple_tags_allowed(kernel): @@ -225,13 +227,13 @@ def check_for_double_use_of_hw_axes(kernel): insn_tag_keys.add(key) - # checking usage of iname tags in the callee kernel. + # check usage of iname tags in the callee kernel if isinstance(insn, CallInstruction): in_knl_callable = kernel.scoped_functions[ insn.expression.function.name] if isinstance(in_knl_callable, CallableKernel): - # checking for collision in iname_tag keys in the instruction - # due to the callee kernel. + # check for collision in iname_tag keys in the instruction + # due to the callee kernel common_iname_tags = [tag for tag in _get_all_unique_iname_tags(in_knl_callable.subkernel) if tag.key in insn_tag_keys] @@ -257,25 +259,25 @@ def _is_racing_iname_tag(tv, tag): from loopy.kernel.data import (AddressSpace, LocalIndexTagBase, GroupIndexTag, ConcurrentTag, auto) - if tv.scope == AddressSpace.PRIVATE: + if tv.address_space == AddressSpace.PRIVATE: return ( isinstance(tag, ConcurrentTag) and not isinstance(tag, (LocalIndexTagBase, GroupIndexTag))) - elif tv.scope == AddressSpace.LOCAL: + elif tv.address_space == AddressSpace.LOCAL: return ( isinstance(tag, ConcurrentTag) and not isinstance(tag, GroupIndexTag)) - elif tv.scope == AddressSpace.GLOBAL: + elif tv.address_space == AddressSpace.GLOBAL: return isinstance(tag, ConcurrentTag) - elif tv.scope == auto: + elif tv.address_space == auto: raise LoopyError("scope of temp var '%s' has not yet been" "determined" % tv.name) else: - raise ValueError("unexpected value of temp_var.scope for " + raise ValueError("unexpected value of temp_var.address_space for " "temporary variable '%s'" % tv.name) @@ -542,13 +544,13 @@ class 
IndirectDependencyEdgeFinder(object): return False -def declares_nosync_with(kernel, var_scope, dep_a, dep_b): +def declares_nosync_with(kernel, var_address_space, dep_a, dep_b): from loopy.kernel.data import AddressSpace - if var_scope == AddressSpace.GLOBAL: + if var_address_space == AddressSpace.GLOBAL: search_scopes = ["global", "any"] - elif var_scope == AddressSpace.LOCAL: + elif var_address_space == AddressSpace.LOCAL: search_scopes = ["local", "any"] - elif var_scope == AddressSpace.PRIVATE: + elif var_address_space == AddressSpace.PRIVATE: search_scopes = ["any"] else: raise ValueError("unexpected value of 'AddressSpace'") @@ -597,19 +599,19 @@ def _check_variable_access_ordered_inner(kernel): continue if name in kernel.temporary_variables: - scope = kernel.temporary_variables[name].scope + address_space = kernel.temporary_variables[name].address_space else: arg = kernel.arg_dict[name] if isinstance(arg, ArrayArg): - scope = arg.memory_address_space + address_space = arg.address_space elif isinstance(arg, ValueArg): - scope = AddressSpace.PRIVATE + address_space = AddressSpace.PRIVATE else: # No need to consider ConstantArg and ImageArg (for now) # because those won't be written. - raise ValueError("could not determine scope of '%s'" % name) + raise ValueError("could not determine address_space of '%s'" % name) - # Check even for PRIVATE scope, to ensure intentional program order. + # Check even for PRIVATE address space, to ensure intentional program order. 
from loopy.symbolic import AccessRangeOverlapChecker overlap_checker = AccessRangeOverlapChecker(kernel) @@ -623,7 +625,7 @@ def _check_variable_access_ordered_inner(kernel): other = kernel.id_to_insn[other_id] has_dependency_relationship = ( - declares_nosync_with(kernel, scope, other, writer) + declares_nosync_with(kernel, address_space, other, writer) or depfind(writer_id, other_id) or @@ -907,7 +909,7 @@ def check_that_temporaries_are_defined_in_subkernels_where_used(kernel): "aliases have a definition" % (temporary, subkernel)) continue - if tval.scope in (AddressSpace.PRIVATE, AddressSpace.LOCAL): + if tval.address_space in (AddressSpace.PRIVATE, AddressSpace.LOCAL): from loopy.diagnostic import MissingDefinitionError raise MissingDefinitionError("temporary variable '%s' gets used " "in subkernel '%s' without a definition (maybe you forgot " diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index e5938dbc40be13769b333fca78c4aa7c74c30dab..e9d30d01300779215cbde187cc26f47a41838274 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -415,8 +415,8 @@ def generate_code_v2(kernel): :returns: a :class:`CodeGenerationResult` """ - from loopy.kernel import kernel_state - if kernel.state == kernel_state.INITIAL: + from loopy.kernel import KernelState + if kernel.state == KernelState.INITIAL: from loopy.preprocess import preprocess_kernel kernel = preprocess_kernel(kernel) @@ -424,7 +424,7 @@ def generate_code_v2(kernel): from loopy.schedule import get_one_scheduled_kernel kernel = get_one_scheduled_kernel(kernel) - if kernel.state != kernel_state.SCHEDULED: + if kernel.state != KernelState.SCHEDULED: raise LoopyError("cannot generate code for a kernel that has not been " "scheduled") @@ -510,17 +510,18 @@ def generate_code_v2(kernel): from loopy.codegen.result import generate_host_or_device_program - # {{{ collecting ASTs of auxiliary kernels + # {{{ collect ASTs of auxiliary kernels auxiliary_dev_progs = [] - # scanning through 
all the call instructions if there is any instance of + # scan through all the call instructions if there is any instance of # CallableKernel, whose code is to be generated. + from loopy.kernel.function_interface import CallableKernel + for insn in kernel.instructions: if isinstance(insn, CallInstruction): in_knl_callable = kernel.scoped_functions[ insn.expression.function.name] - from loopy.kernel.function_interface import CallableKernel if isinstance(in_knl_callable, CallableKernel): auxiliary_dev_prog = generate_code_v2( in_knl_callable.subkernel.copy( @@ -528,20 +529,22 @@ def generate_code_v2(kernel): target=kernel.target) ).device_programs[0].ast auxiliary_dev_progs.append(auxiliary_dev_prog) + elif isinstance(insn, (Assignment, NoOpInstruction, Assignment, BarrierInstruction, CInstruction, _DataObliviousInstruction)): pass + else: - raise NotImplementedError("Unknown type of instruction %s." % ( - str(type(insn)))) + raise NotImplementedError("Unknown type of instruction %s" % ( + type(insn).__name__)) codegen_result = generate_host_or_device_program( codegen_state, schedule_index=0) - # Modifying the first device program to add the auxiliary kernels - # as functions. + # Modify the first device program to add the auxiliary kernels + # as functions new_dev_prog = codegen_result.device_programs[0] for auxiliary_dev_prog in auxiliary_dev_progs: new_dev_prog = new_dev_prog.copy( @@ -580,7 +583,7 @@ def generate_code_v2(kernel): for prea_gen in preamble_generators: preambles.extend(prea_gen(preamble_info)) - # {{{ collecting preambles from all the in kernel callables. + # {{{ collect preambles from all the in kernel callables. 
in_knl_callable_collector = InKernelCallablesCollector(kernel) @@ -592,7 +595,9 @@ def generate_code_v2(kernel): elif isinstance(insn, (CInstruction, _DataObliviousInstruction)): pass else: - raise NotImplementedError("Unkown instruction %s" % type(insn)) + raise NotImplementedError( + "Unknown instruction type '%s'" + % type(insn).__name__) # }}} diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 9969f6ad0ba51e3a0def3e11c83eb49a204c14ab..45e2a18c4b93665275cc61f84b221a2a0e504d32 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -72,7 +72,7 @@ def synthesize_idis_for_extra_args(kernel, schedule_index): for arg in sched_item.extra_args: temporary = kernel.temporary_variables[arg] - assert temporary.scope == AddressSpace.GLOBAL + assert temporary.address_space == AddressSpace.GLOBAL idis.extend( temporary.decl_info( kernel.target, diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index 70415c333d5a40cb822ba0ad1c721cb7a5633c81..bcbe41874d8613eaabd84ae71dd65317558f0185 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -679,7 +679,7 @@ class F2LoopyTranslator(FTreeWalkerBase): if dims is not None: # default order is set to "F" in kernel creation below kernel_data.append( - lp.ArrayArg( + lp.GlobalArg( arg_name, dtype=sub.get_type(arg_name), shape=sub.get_loopy_shape(arg_name), diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index 847eb0d9700398478703b823eaa508ac042cf3e4..1de0b621a46ffec95bd0df48875739e11976dda2 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -82,6 +82,9 @@ def make_slab(space, iname, start, stop, step=1): An instance of :class:`int` or an instance of :class:`islpy._isl.Aff` indicating the upper bound of ``step*iname``. + + :arg step: + An instance of :class:`int`. 
""" zero = isl.Aff.zero_on_domain(space) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 4141ac4cb78ce049087047fac216afb92fb94a1b..fd1550ccbcf9b26311ce97d92a7d41790cb17a06 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -94,12 +94,16 @@ class _UniqueVarNameGenerator(UniqueNameGenerator): # {{{ loop kernel object -class kernel_state: # noqa +class KernelState: # noqa INITIAL = 0 PREPROCESSED = 1 SCHEDULED = 2 +# FIXME Introduce noisy deprecation goop +kernel_state = KernelState + + class LoopKernel(ImmutableRecordWithoutPickling): """These correspond more or less directly to arguments of :func:`loopy.make_kernel`. @@ -189,7 +193,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): .. attribute:: state - A value from :class:`kernel_state`. + A value from :class:`KernelState`. .. attribute:: target @@ -227,7 +231,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): index_dtype=np.int32, options=None, - state=kernel_state.INITIAL, + state=KernelState.INITIAL, is_called_from_host=True, target=None, @@ -302,9 +306,9 @@ class LoopKernel(ImmutableRecordWithoutPickling): raise TypeError("index_dtype must be signed") if state not in [ - kernel_state.INITIAL, - kernel_state.PREPROCESSED, - kernel_state.SCHEDULED, + KernelState.INITIAL, + KernelState.PREPROCESSED, + KernelState.SCHEDULED, ]: raise ValueError("invalid value for 'state'") @@ -320,9 +324,10 @@ class LoopKernel(ImmutableRecordWithoutPickling): assert assumptions.get_ctx() == isl.DEFAULT_CONTEXT if function_scopers is None: - from loopy.library.function import loopy_specific_callable_scopers - # populating the function scopers from the target and the loopy + # populate the function scopers from the target and the loopy # specific callable scopers + + from loopy.library.function import loopy_specific_callable_scopers function_scopers = [loopy_specific_callable_scopers] + ( target.get_device_ast_builder().function_scopers()) @@ -982,7 +987,7 @@ class 
LoopKernel(ImmutableRecordWithoutPickling): | set( tv.name for tv in six.itervalues(self.temporary_variables) - if tv.scope == AddressSpace.GLOBAL)) + if tv.address_space == AddressSpace.GLOBAL)) # }}} @@ -1217,13 +1222,13 @@ class LoopKernel(ImmutableRecordWithoutPickling): return set( tv.name for tv in six.itervalues(self.temporary_variables) - if tv.scope == AddressSpace.LOCAL) + if tv.address_space == AddressSpace.LOCAL) def local_mem_use(self): from loopy.kernel.data import AddressSpace return sum( tv.nbytes for tv in six.itervalues(self.temporary_variables) - if tv.scope == AddressSpace.LOCAL) + if tv.address_space == AddressSpace.LOCAL) # }}} diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index f808c42c286beb51275c3765fbb382387d10e7a0..aa53d8ec8093932734451368314d271d080e6a99 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -35,7 +35,7 @@ from loopy.symbolic import ( from loopy.kernel.data import ( InstructionBase, MultiAssignmentBase, Assignment, - SubstitutionRule) + SubstitutionRule, AddressSpace) from loopy.kernel.instruction import (CInstruction, _DataObliviousInstruction, CallInstruction) from loopy.diagnostic import LoopyError, warn_with_kernel @@ -1156,14 +1156,18 @@ class ArgumentGuesser: # other writable type of variable is an argument. 
return ArrayArg(arg_name, - shape=lp.auto, offset=self.default_offset) + shape=lp.auto, + offset=self.default_offset, + address_space=AddressSpace.GLOBAL) irank = self.find_index_rank(arg_name) if irank == 0: # read-only, no indices return ValueArg(arg_name) else: - return ArrayArg(arg_name, shape=lp.auto, offset=self.default_offset) + return ArrayArg( + arg_name, shape=lp.auto, offset=self.default_offset, + address_space=AddressSpace.GLOBAL) def convert_names_to_full_args(self, kernel_args): new_kernel_args = [] @@ -1449,7 +1453,7 @@ def create_temporaries(knl, default_order): new_temp_vars[assignee_name] = lp.TemporaryVariable( name=assignee_name, dtype=temp_var_type, - scope=lp.auto, + address_space=lp.auto, base_indices=lp.auto, shape=lp.auto, order=default_order, @@ -1848,7 +1852,7 @@ class FunctionScoper(RuleAwareIdentityMapper): returns an instance of :class:`loopy.kernel.function_interface.InKernelCallable`. - **Example**: If given an expression of the form ``sin(x) + unknown_function(y) + + **Example:** If given an expression of the form ``sin(x) + unknown_function(y) + log(z)``, then the mapper would return ``ScopedFunction('sin')(x) + unknown_function(y) + ScopedFunction('log')(z)``. @@ -1866,12 +1870,12 @@ class FunctionScoper(RuleAwareIdentityMapper): from loopy.symbolic import ScopedFunction if not isinstance(expr.function, ScopedFunction): - # searching the kernel for the function. + # search the kernel for the function in_knl_callable = self.kernel.find_scoped_function_identifier( expr.function.name) if in_knl_callable: - # Associating the newly created ScopedFunction with the - # resolved in-kernel callable. 
+ # associate the newly created ScopedFunction with the + # resolved in-kernel callable self.scoped_functions[expr.function.name] = in_knl_callable return type(expr)( @@ -1879,20 +1883,22 @@ class FunctionScoper(RuleAwareIdentityMapper): tuple(self.rec(child, expn_state) for child in expr.parameters)) - # This is an unknown function as of yet, hence not modifying it. + # this is an unknown function as of yet, do not modify it return super(FunctionScoper, self).map_call(expr, expn_state) def map_call_with_kwargs(self, expr, expn_state): + # FIXME duplicated logic with map_call + from loopy.symbolic import ScopedFunction if not isinstance(expr.function, ScopedFunction): - # searching the kernel for the function. + # search the kernel for the function. in_knl_callable = self.kernel.find_scoped_function_identifier( expr.function.name) if in_knl_callable: - # Associating the newly created ScopedFunction with the - # resolved in-kernel callable. + # associate the newly created ScopedFunction with the + # resolved in-kernel callable self.scoped_functions[expr.function.name] = in_knl_callable return type(expr)( ScopedFunction(expr.function.name), @@ -1903,7 +1909,7 @@ class FunctionScoper(RuleAwareIdentityMapper): for key, val in six.iteritems(expr.kw_parameters)) ) - # This is an unknown function as of yet, hence not modifying it. + # this is an unknown function as of yet, do not modify it return super(FunctionScoper, self).map_call_with_kwargs(expr, expn_state) @@ -1914,7 +1920,12 @@ class FunctionScoper(RuleAwareIdentityMapper): SegmentedOp) from loopy.library.reduction import ArgExtOp - # Noting down the extra functions arising due to certain reductions. + # note down the extra functions arising due to certain reductions + + # FIXME Discuss this. It cannot stay the way it is, because non-built-in + # reductions cannot add themselves to this list. We may need to change + # the reduction interface. Why don't reductions generate scoped functions + # in the first place? 
if isinstance(expr.operation, MaxReductionOperation): self.scoped_functions["max"] = ( self.kernel.find_scoped_function_identifier("max")) @@ -2015,16 +2026,16 @@ class SliceToInameReplacer(IdentityMapper): """ Converts slices to instances of :class:`loopy.symbolic.SubArrayRef`. - :attribute var_name_gen: + .. attribute:: var_name_gen Variable name generator, in order to generate unique inames within the kernel domain. - :attribute knl: + .. attribute:: knl An instance of :class:`loopy.LoopKernel` - :attribute iname_domains: + .. attribute:: iname_domains An instance of :class:`dict` to store the slices enountered in the expressions as a mapping from ``iname`` to a tuple of ``(start, stop, @@ -2047,7 +2058,7 @@ class SliceToInameReplacer(IdentityMapper): swept_inames = [] for i, index in enumerate(expr.index_tuple): if isinstance(index, Slice): - unique_var_name = self.var_name_gen(based_on="islice") + unique_var_name = self.var_name_gen(based_on="i") if expr.aggregate.name in self.knl.arg_dict: domain_length = self.knl.arg_dict[expr.aggregate.name].shape[i] elif expr.aggregate.name in self.knl.temporary_variables: @@ -2436,7 +2447,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): knl = create_temporaries(knl, default_order) - # Convert slices to iname domains + # convert slices to iname domains knl = realize_slices_as_sub_array_refs(knl) # ------------------------------------------------------------------------- @@ -2476,7 +2487,6 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): check_for_duplicate_names(knl) check_written_variable_names(knl) - # Function Lookup knl = scope_functions(knl) from loopy.preprocess import prepare_for_caching diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 83f98ecd19eab030ac902c1ea05b1ace02f1dad8..f75e1a8c4103ee884d459ceb92b4fd6528503f57 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -32,8 +32,8 @@ from loopy.kernel.array import ArrayBase from 
loopy.diagnostic import LoopyError from loopy.kernel.instruction import ( # noqa InstructionBase, - memory_ordering, - memory_scope, + MemoryOrdering, + MemoryScope, VarAtomicity, AtomicInit, AtomicUpdate, @@ -43,11 +43,12 @@ from loopy.kernel.instruction import ( # noqa CallInstruction, make_assignment, CInstruction) +from warnings import warn class auto(object): # noqa """A generic placeholder object for something that should be automatically - detected. See, for example, the *shape* or *strides* argument of + determined. See, for example, the *shape* or *strides* argument of :class:`GlobalArg`. """ @@ -243,9 +244,8 @@ def parse_tag(tag): # {{{ memory address space -class AddressSpace: - """ - Storage location of a variable. +class AddressSpace(object): + """Storage location of a variable. .. attribute:: PRIVATE .. attribute:: LOCAL @@ -268,7 +268,38 @@ class AddressSpace: elif val == cls.GLOBAL: return "global" else: - raise ValueError("unexpected value of MemoryAddressScope") + raise ValueError("unexpected value of AddressSpace") + + +class _deprecated_temp_var_scope_property(property): # noqa + def __get__(self, cls, owner): + warn("'temp_var_scope' is deprecated. Use 'AddressSpace'.", + DeprecationWarning, stacklevel=2) + + return classmethod(self.fget).__get__(None, owner)() + + +class temp_var_scope(object): # noqa + """Deprecated. Use :class:`AddressSpace` instead. + """ + + @_deprecated_temp_var_scope_property + def PRIVATE(self): + return AddressSpace.PRIVATE + + @_deprecated_temp_var_scope_property + def LOCAL(self): + return AddressSpace.LOCAL + + @_deprecated_temp_var_scope_property + def GLOBAL(self): + return AddressSpace.GLOBAL + + @classmethod + def stringify(cls, val): + warn("'temp_var_scope' is deprecated. 
Use 'AddressSpace'.", + DeprecationWarning, stacklevel=2) + return AddressSpace.stringify(val) # }}} @@ -297,7 +328,6 @@ class KernelArgument(ImmutableRecord): import loopy as lp if dtype is lp.auto: - from warnings import warn warn("Argument/temporary data type should be None if unspecified, " "not auto. This usage will be disallowed in 2018.", DeprecationWarning, stacklevel=2) @@ -313,26 +343,24 @@ class KernelArgument(ImmutableRecord): class ArrayArg(ArrayBase, KernelArgument): __doc__ = ArrayBase.__doc__ + ( """ - .. attribute:: memory_address_space + .. attribute:: address_space An attribute of :class:`AddressSpace` defining the address - space in which the array resides in the target memory layout. - Defaults to ``AddressSpace.GLOBAL`` + space in which the array resides. .. attribute:: is_output_only - An instance of :class:`bool`. If set to *TRUE*, recorded to be + An instance of :class:`bool`. If set to *True*, recorded to be returned from the kernel. """) allowed_extra_kwargs = [ - "memory_address_space", + "address_space", "is_output_only"] def __init__(self, *args, **kwargs): - # Defaulting the memory_address_space to be GLOBAL. 
- kwargs["memory_address_space"] = kwargs.pop( - "memory_address_space", AddressSpace.GLOBAL) + if "address_space" not in kwargs: + raise TypeError("'address_space' must be specified") kwargs["is_output_only"] = kwargs.pop("is_output_only", None) super(ArrayArg, self).__init__(*args, **kwargs) @@ -342,16 +370,19 @@ class ArrayArg(ArrayBase, KernelArgument): def get_arg_decl(self, ast_builder, name_suffix, shape, dtype, is_written): return ast_builder.get_array_arg_decl(self.name + name_suffix, - self.memory_address_space, shape, dtype, is_written) + self.address_space, shape, dtype, is_written) -class GlobalArg(ArrayBase, KernelArgument): - def __new__(cls, *args, **kwargs): - from warnings import warn - warn("Use of 'GlobalArg' is deprecated, use 'ArrayArg' instead.", - DeprecationWarning, stacklevel=2) +# Making this a function prevents incorrect use in isinstance. +# Note: This is *not* deprecated, as it is super-common and +# incrementally more convenient to use than ArrayArg directly. +def GlobalArg(*args, **kwargs): + address_space = kwargs.pop("address_space", None) + if address_space is not None: + raise TypeError("may not pass 'address_space' to GlobalArg") + kwargs["address_space"] = AddressSpace.GLOBAL - return ArrayArg(*args, **kwargs) + return ArrayArg(*args, **kwargs) class ConstantArg(ArrayBase, KernelArgument): @@ -423,43 +454,12 @@ class InameArg(ValueArg): # {{{ temporary variable -class _deprecated_temp_var_scope_property(property): # noqa - def __get__(self, cls, owner): - from warnings import warn - warn("'temp_var_scope' is deprecated. Use 'AddressSpace'.", - DeprecationWarning, stacklevel=2) - - return classmethod(self.fget).__get__(None, owner)() - -class temp_var_scope: # noqa - """Deprecated. Use :class:`mem_adress_space` instead. 
- """ - - @_deprecated_temp_var_scope_property - def PRIVATE(self): - return AddressSpace.PRIVATE - - @_deprecated_temp_var_scope_property - def LOCAL(self): - return AddressSpace.LOCAL - - @_deprecated_temp_var_scope_property - def GLOBAL(self): - return AddressSpace.GLOBAL - - @classmethod - def stringify(cls, val): - from warnings import warn - warn("'temp_var_scope' is deprecated. Use 'AddressSpace'.", - DeprecationWarning, stacklevel=2) - return AddressSpace.stringify - class TemporaryVariable(ArrayBase): __doc__ = ArrayBase.__doc__ + """ .. attribute:: storage_shape .. attribute:: base_indices - .. attribute:: scope + .. attribute:: address_space What memory this temporary variable lives in. One of the values in :class:`AddressSpace`, @@ -472,10 +472,6 @@ class TemporaryVariable(ArrayBase): hold the data in this temporary. Note that this storage array must not match any existing variable names. - .. attribute:: scope - - One of :class:`AddressSpace`. - .. attribute:: initializer *None* or a :class:`numpy.ndarray` of data to be used to initialize the @@ -501,14 +497,14 @@ class TemporaryVariable(ArrayBase): allowed_extra_kwargs = [ "storage_shape", "base_indices", - "scope", + "address_space", "base_storage", "initializer", "read_only", "_base_storage_access_may_be_aliasing", ] - def __init__(self, name, dtype=None, shape=(), scope=auto, + def __init__(self, name, dtype=None, shape=(), address_space=None, dim_tags=None, offset=0, dim_names=None, strides=None, order=None, base_indices=None, storage_shape=None, base_storage=None, initializer=None, read_only=False, @@ -519,6 +515,28 @@ class TemporaryVariable(ArrayBase): :arg base_indices: :class:`loopy.auto` or a tuple of base indices """ + scope = kwargs.pop("scope", None) + if scope is not None: + warn("Passing 'scope' is deprecated. 
Use 'address_space' instead.", + DeprecationWarning, stacklevel=2) + + if address_space is not None: + raise ValueError("only one of 'scope' and 'address_space' " + "may be specified") + else: + address_space = scope + + del scope + + if address_space is None: + address_space = auto + + if address_space is None: + raise LoopyError( + "temporary variable '%s': " + "address_space must not be None" + % name) + if initializer is None: pass elif isinstance(initializer, np.ndarray): @@ -579,7 +597,8 @@ class TemporaryVariable(ArrayBase): dtype=dtype, shape=shape, strides=strides, dim_tags=dim_tags, offset=offset, dim_names=dim_names, order=order, - base_indices=base_indices, scope=scope, + base_indices=base_indices, + address_space=address_space, storage_shape=storage_shape, base_storage=base_storage, initializer=initializer, @@ -589,20 +608,33 @@ class TemporaryVariable(ArrayBase): **kwargs) @property - def is_local(self): - """One of :class:`loopy.AddressSpace`.""" - - if self.scope is auto: - return auto - elif self.scope == AddressSpace.LOCAL: - return True - elif self.scope == AddressSpace.PRIVATE: - return False - elif self.scope == AddressSpace.GLOBAL: - raise LoopyError("TemporaryVariable.is_local called on " - "global temporary variable '%s'" % self.name) - else: - raise LoopyError("unexpected value of TemporaryVariable.scope") + def scope(self): + warn("Use of 'TemporaryVariable.scope' is deprecated, " + "use 'TemporaryVariable.address_space' instead.", + DeprecationWarning, stacklevel=2) + + return self.address_space + + def copy(self, **kwargs): + address_space = kwargs.pop("address_space", None) + scope = kwargs.pop("scope", None) + + if scope is not None: + warn("Passing 'scope' is deprecated. 
Use 'address_space' instead.", + DeprecationWarning, stacklevel=2) + + if address_space is not None: + raise ValueError("only one of 'scope' and 'address_space' " + "may be specified") + else: + address_space = scope + + del scope + + if address_space is not None: + kwargs["address_space"] = address_space + + return super(TemporaryVariable, self).copy(**kwargs) @property def nbytes(self): @@ -619,7 +651,7 @@ class TemporaryVariable(ArrayBase): shape_override=self.storage_shape) def get_arg_decl(self, ast_builder, name_suffix, shape, dtype, is_written): - if self.scope == AddressSpace.GLOBAL: + if self.address_space == AddressSpace.GLOBAL: return ast_builder.get_array_arg_decl(self.name + name_suffix, AddressSpace.GLOBAL, shape, dtype, is_written) else: @@ -627,10 +659,10 @@ class TemporaryVariable(ArrayBase): "non-global temporary") def __str__(self): - if self.scope is auto: + if self.address_space is auto: scope_str = "auto" else: - scope_str = AddressSpace.stringify(self.scope) + scope_str = AddressSpace.stringify(self.address_space) return ( self.stringify(include_typename=False) @@ -642,7 +674,7 @@ class TemporaryVariable(ArrayBase): super(TemporaryVariable, self).__eq__(other) and self.storage_shape == other.storage_shape and self.base_indices == other.base_indices - and self.scope == other.scope + and self.address_space == other.address_space and self.base_storage == other.base_storage and ( (self.initializer is None and other.initializer is None) @@ -661,7 +693,7 @@ class TemporaryVariable(ArrayBase): self.update_persistent_hash_for_shape(key_hash, key_builder, self.storage_shape) key_builder.rec(key_hash, self.base_indices) - key_builder.rec(key_hash, self.scope) + key_builder.rec(key_hash, self.address_space) key_builder.rec(key_hash, self.base_storage) initializer = self.initializer diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 089b6cb3687583ef7f1bc1e6b993b407f2a8b9c4..edb222ec25a583eb0607774730688be01ec45a6b 
100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -35,13 +35,7 @@ from pymbolic.primitives import Variable from loopy.symbolic import parse_tagged_name from loopy.symbolic import (ScopedFunction, SubstitutionRuleMappingContext, - RuleAwareIdentityMapper, SubstitutionRuleExpander, SubstitutionMapper, - CombineMapper) - -from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, - _DataObliviousInstruction) - -from functools import reduce + RuleAwareIdentityMapper, SubstitutionRuleExpander) # {{{ argument descriptors @@ -61,7 +55,7 @@ class ArrayArgDescriptor(ImmutableRecord): Shape of the array. - .. attribute:: mem_scope + .. attribute:: address_space An attribute of :class:`loopy.kernel.data.AddressSpace`. @@ -69,9 +63,10 @@ class ArrayArgDescriptor(ImmutableRecord): A tuple of instances of :class:`loopy.kernel.array._StrideArrayDimTagBase` """ - fields = set(['shape', 'mem_scope', 'dim_tags']) - def __init__(self, shape, mem_scope, dim_tags): + fields = set(['shape', 'address_space', 'dim_tags']) + + def __init__(self, shape, address_space, dim_tags): # {{{ sanity checks @@ -79,6 +74,8 @@ class ArrayArgDescriptor(ImmutableRecord): assert isinstance(shape, tuple) assert isinstance(dim_tags, tuple) + + # FIXME at least vector dim tags should be supported assert all(isinstance(dim_tag, FixedStrideArrayDimTag) for dim_tag in dim_tags) @@ -86,7 +83,7 @@ class ArrayArgDescriptor(ImmutableRecord): super(ArrayArgDescriptor, self).__init__( shape=shape, - mem_scope=mem_scope, + address_space=address_space, dim_tags=dim_tags) # }}} @@ -176,7 +173,8 @@ class InKernelCallable(ImmutableRecord): .. note:: - Negative ids in the mapping attributes indicate the result arguments + Negative "id" values ``-i`` in the mapping attributes indicate + return value with (0-based) index *i*. .. automethod:: __init__ .. 
automethod:: with_types @@ -470,120 +468,6 @@ class ScalarCallable(InKernelCallable): # }}} -# {{{ kernel inliner mapper - -class KernelInliner(SubstitutionMapper): - """Mapper to replace variables (indices, temporaries, arguments) in the - callee kernel with variables in the caller kernel. - - :arg caller: the caller kernel - :arg arg_map: dict of argument name to variables in caller - :arg arg_dict: dict of argument name to arguments in callee - """ - - def __init__(self, subst_func, caller, arg_map, arg_dict): - super(KernelInliner, self).__init__(subst_func) - self.caller = caller - self.arg_map = arg_map - self.arg_dict = arg_dict - - def map_subscript(self, expr): - if expr.aggregate.name in self.arg_map: - - aggregate = self.subst_func(expr.aggregate) - sar = self.arg_map[expr.aggregate.name] # SubArrayRef in caller - callee_arg = self.arg_dict[expr.aggregate.name] # Arg in callee - if aggregate.name in self.caller.arg_dict: - caller_arg = self.caller.arg_dict[aggregate.name] # Arg in caller - else: - caller_arg = self.caller.temporary_variables[aggregate.name] - - # Firstly, map inner inames to outer inames. - outer_indices = self.map_tuple(expr.index_tuple) - - # Next, reshape to match dimension of outer arrays. - # We can have e.g. 
A[3, 2] from outside and B[6] from inside - from numbers import Integral - if not all(isinstance(d, Integral) for d in callee_arg.shape): - raise LoopyError( - "Argument: {0} in callee kernel: {1} does not have " - "constant shape.".format(callee_arg)) - - flatten_index = 0 - for i, idx in enumerate(sar.get_begin_subscript().index_tuple): - flatten_index += idx*caller_arg.dim_tags[i].stride - - flatten_index += sum( - idx * tag.stride - for idx, tag in zip(outer_indices, callee_arg.dim_tags)) - - from loopy.isl_helpers import simplify_via_aff - flatten_index = simplify_via_aff(flatten_index) - - new_indices = [] - for dim_tag in caller_arg.dim_tags: - ind = flatten_index // dim_tag.stride - flatten_index -= (dim_tag.stride * ind) - new_indices.append(ind) - - new_indices = tuple(simplify_via_aff(i) for i in new_indices) - - return aggregate.index(tuple(new_indices)) - else: - return super(KernelInliner, self).map_subscript(expr) - - -class CalleeScopedCallsCollector(CombineMapper): - """ - Collects the scoped functions which are a part of the callee kernel and - must be transferred to the caller kernel before inlining. - - :returns: - An :class:`frozenset` of function names that are not scoped in - the caller kernel. - - .. note:: - :class:`loopy.library.reduction.ArgExtOp` are ignored, as they are - never scoped in the pipeline. 
- """ - - def __init__(self, callee_scoped_functions): - self.callee_scoped_functions = callee_scoped_functions - - def combine(self, values): - import operator - return reduce(operator.or_, values, frozenset()) - - def map_call(self, expr): - if expr.function.name in self.callee_scoped_functions: - return (frozenset([(expr, - self.callee_scoped_functions[expr.function.name])]) | - self.combine((self.rec(child) for child in expr.parameters))) - else: - return self.combine((self.rec(child) for child in expr.parameters)) - - def map_call_with_kwargs(self, expr): - if expr.function.name in self.callee_scoped_functions: - return (frozenset([(expr, - self.callee_scoped_functions[expr.function.name])]) | - self.combine((self.rec(child) for child in expr.parameters - + tuple(expr.kw_parameters.values())))) - else: - return self.combine((self.rec(child) for child in - expr.parameters+tuple(expr.kw_parameters.values()))) - - def map_constant(self, expr): - return frozenset() - - map_variable = map_constant - map_function_symbol = map_constant - map_tagged_variable = map_constant - map_type_cast = map_constant - - -# }}} - - # {{{ callable kernel class CallableKernel(InKernelCallable): @@ -594,15 +478,16 @@ class CallableKernel(InKernelCallable): in order to initiate association between a function in caller kernel and the callee kernel. - The :meth:`CallableKernel.with_types` should be called in order to match + :meth:`CallableKernel.with_types` should be called in order to match the ``dtypes`` of the arguments that are shared between the caller and the callee kernel. - The :meth:`CallableKernel.with_descrs` should be called in order to match - the ``dim_tags, shape, mem_scopes`` of the arguments shared between the + :meth:`CallableKernel.with_descrs` should be called in order to match + :attr:`ArrayArgDescriptor.dim_tags`, :attr:`ArrayArgDescriptor.shape`, + :attr:`ArrayArgDescriptor.address_space`` of the arguments shared between the caller and the callee kernel. 
- The :meth:`CallableKernel.with_hw_axes` should be called to set the grid + :meth:`CallableKernel.with_hw_axes` should be called to set the grid sizes for the :attr:`subkernel` of the callable. """ @@ -652,43 +537,43 @@ class CallableKernel(InKernelCallable): pre_specialized_subkernel = self.subkernel.copy( args=new_args) - # inferring the types of the written variables based on the knowledge + # infer the types of the written variables based on the knowledge # of the types of the arguments supplied specialized_kernel = infer_unknown_types(pre_specialized_subkernel, expect_completion=True) new_arg_id_to_dtype = {} for arg in specialized_kernel.args: - # associating the updated_arg_id_to_dtype with keyword as well as - # positional id. + # associate the updated_arg_id_to_dtype with keyword as well as + # positional id new_arg_id_to_dtype[arg.name] = arg.dtype new_arg_id_to_dtype[kw_to_pos[arg.name]] = arg.dtype - # Returning the kernel call with specialized subkernel and the corresponding + # Return the kernel call with specialized subkernel and the corresponding # new arg_id_to_dtype return self.copy(subkernel=specialized_kernel, arg_id_to_dtype=new_arg_id_to_dtype) def with_descrs(self, arg_id_to_descr): - # tuning the subkernel so that we have the the matching shapes and - # dim_tags. 
+ # tune the subkernel so that we have the matching shapes and + # dim_tags new_args = self.subkernel.args[:] kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) - for id, descr in arg_id_to_descr.items(): - if isinstance(id, int): - id = pos_to_kw[id] - assert isinstance(id, str) + for arg_id, descr in arg_id_to_descr.items(): + if isinstance(arg_id, int): + arg_id = pos_to_kw[arg_id] + assert isinstance(arg_id, str) if isinstance(descr, ArrayArgDescriptor): - new_arg = self.subkernel.arg_dict[id].copy( + new_arg = self.subkernel.arg_dict[arg_id].copy( shape=descr.shape, dim_tags=descr.dim_tags, - memory_address_space=descr.mem_scope) + address_space=descr.address_space) # replacing the new arg with the arg of the same name - new_args = [new_arg if arg.name == id else arg for arg in + new_args = [new_arg if arg.name == arg_id else arg for arg in new_args] elif isinstance(descr, ValueArgDescriptor): pass @@ -712,7 +597,7 @@ class CallableKernel(InKernelCallable): arg_id_to_descr[pos] = ArrayArgDescriptor( shape=arg.shape, dim_tags=arg.dim_tags, - mem_scope=AddressSpace.GLOBAL) + address_space=AddressSpace.GLOBAL) return self.copy(subkernel=self.subkernel, arg_id_to_descr=arg_id_to_descr) @@ -724,7 +609,6 @@ class CallableKernel(InKernelCallable): GridOverrideForCalleeKernel(lsize, gsize)))) def is_ready_for_codegen(self): - return (self.arg_id_to_dtype is not None and self.arg_id_to_descr is not None and self.name_in_target is not None) @@ -732,7 +616,7 @@ class CallableKernel(InKernelCallable): def generate_preambles(self, target): """ Yields the *target* specific preambles. """ - # TODO: This is not correct, as the code code preamble generated + # FIXME TODO: This is not correct, as the code code preamble generated # during the code generationg of the child kernel, does not guarantee # that this thing would be updated. 
for preamble in self.subkernel.preambles: @@ -740,194 +624,6 @@ class CallableKernel(InKernelCallable): return - def inline_within_kernel(self, kernel, instruction): - """ - Returns a copy of *kernel* with the *instruction* in the *kernel* - replaced by inlining :attr:`subkernel` within it. - """ - callee_knl = self.subkernel - - import islpy as isl - - callee_label = callee_knl.name[:4] + "_" - - # {{{ duplicate and rename inames - - vng = kernel.get_var_name_generator() - ing = kernel.get_instruction_id_generator() - dim_type = isl.dim_type.set - - iname_map = {} - for iname in callee_knl.all_inames(): - iname_map[iname] = vng(callee_label+iname) - - new_domains = [] - new_iname_to_tags = kernel.iname_to_tags.copy() - - # transferring iname tags info from the callee to the caller kernel - for domain in callee_knl.domains: - new_domain = domain.copy() - for i in range(new_domain.n_dim()): - iname = new_domain.get_dim_name(dim_type, i) - - if iname in callee_knl.iname_to_tags: - new_iname_to_tags[iname_map[iname]] = ( - callee_knl.iname_to_tags[iname]) - new_domain = new_domain.set_dim_name( - dim_type, i, iname_map[iname]) - new_domains.append(new_domain) - - kernel = kernel.copy(domains=kernel.domains + new_domains, - iname_to_tags=new_iname_to_tags) - - # }}} - - # {{{ rename temporaries - - temp_map = {} - new_temps = kernel.temporary_variables.copy() - for name, temp in six.iteritems(callee_knl.temporary_variables): - new_name = vng(callee_label+name) - temp_map[name] = new_name - new_temps[new_name] = temp.copy(name=new_name) - - kernel = kernel.copy(temporary_variables=new_temps) - - # }}} - - # {{{ match kernel arguments - - arg_map = {} # callee arg name -> caller symbols (e.g. 
SubArrayRef) - - assignees = instruction.assignees # writes - parameters = instruction.expression.parameters # reads - - # add keyword parameters - from pymbolic.primitives import CallWithKwargs - - if isinstance(instruction.expression, CallWithKwargs): - from loopy.kernel.function_interface import get_kw_pos_association - - _, pos_to_kw = get_kw_pos_association(callee_knl) - kw_parameters = instruction.expression.kw_parameters - for i in range(len(parameters), len(parameters) + len(kw_parameters)): - parameters = parameters + (kw_parameters[pos_to_kw[i]],) - - assignee_pos = 0 - parameter_pos = 0 - for i, arg in enumerate(callee_knl.args): - if arg.is_output_only: - arg_map[arg.name] = assignees[assignee_pos] - assignee_pos += 1 - else: - arg_map[arg.name] = parameters[parameter_pos] - parameter_pos += 1 - - # }}} - - # {{{ rewrite instructions - - import pymbolic.primitives as p - from pymbolic.mapper.substitutor import make_subst_func - - var_map = dict((p.Variable(k), p.Variable(v)) - for k, v in six.iteritems(iname_map)) - var_map.update(dict((p.Variable(k), p.Variable(v)) - for k, v in six.iteritems(temp_map))) - var_map.update(dict((p.Variable(k), p.Variable(v.subscript.aggregate.name)) - for k, v in six.iteritems(arg_map))) - subst_mapper = KernelInliner( - make_subst_func(var_map), kernel, arg_map, callee_knl.arg_dict) - - insn_id = {} - for insn in callee_knl.instructions: - insn_id[insn.id] = ing(callee_label+insn.id) - - # {{{ root and leave instructions in callee kernel - - dep_map = callee_knl.recursive_insn_dep_map() - # roots depend on nothing - heads = set(insn for insn, deps in six.iteritems(dep_map) if not deps) - # leaves have nothing that depends on them - tails = set(dep_map.keys()) - for insn, deps in six.iteritems(dep_map): - tails = tails - deps - - # }}} - - # {{{ use NoOp to mark the start and end of callee kernel - - from loopy.kernel.instruction import NoOpInstruction - - noop_start = NoOpInstruction( - id=ing(callee_label+"_start"), - 
within_inames=instruction.within_inames, - depends_on=instruction.depends_on - ) - noop_end = NoOpInstruction( - id=instruction.id, - within_inames=instruction.within_inames, - depends_on=frozenset(insn_id[insn] for insn in tails) - ) - # }}} - - inner_insns = [noop_start] - - for insn in callee_knl.instructions: - insn = insn.with_transformed_expressions(subst_mapper) - within_inames = frozenset(map(iname_map.get, insn.within_inames)) - within_inames = within_inames | instruction.within_inames - depends_on = frozenset(map(insn_id.get, insn.depends_on)) | ( - instruction.depends_on) - if insn.id in heads: - depends_on = depends_on | set([noop_start.id]) - insn = insn.copy( - id=insn_id[insn.id], - within_inames=within_inames, - # TODO: probaby need to keep priority in callee kernel - priority=instruction.priority, - depends_on=depends_on - ) - inner_insns.append(insn) - - inner_insns.append(noop_end) - - new_insns = [] - for insn in kernel.instructions: - if insn == instruction: - new_insns.extend(inner_insns) - else: - new_insns.append(insn) - - kernel = kernel.copy(instructions=new_insns) - - # }}} - - # {{{ transferring the scoped functions from callee to caller - - callee_scoped_calls_collector = CalleeScopedCallsCollector( - callee_knl.scoped_functions) - callee_scoped_calls_dict = {} - - for insn in kernel.instructions: - if isinstance(insn, MultiAssignmentBase): - callee_scoped_calls_dict.update(dict(callee_scoped_calls_collector( - insn.expression))) - elif isinstance(insn, (CInstruction, _DataObliviousInstruction)): - pass - else: - raise NotImplementedError("Unknown type of instruction %s." 
% type( - insn)) - - from loopy.kernel.function_interface import ( - register_pymbolic_calls_to_knl_callables) - kernel = register_pymbolic_calls_to_knl_callables(kernel, - callee_scoped_calls_dict) - - # }}} - - return kernel - def emit_call_insn(self, insn, target, expression_to_code_mapper): assert self.is_ready_for_codegen() @@ -951,7 +647,7 @@ class CallableKernel(InKernelCallable): parameters.append(kw_parameters[pos_to_kw[i]]) par_dtypes.append(self.arg_id_to_dtype[pos_to_kw[i]]) - # inserting the assigness at the required positions. + # insert the assigness at the required positions assignee_write_count = -1 for i, arg in enumerate(self.subkernel.args): if arg.is_output_only: @@ -960,7 +656,7 @@ class CallableKernel(InKernelCallable): par_dtypes.insert(i, self.arg_id_to_dtype[assignee_write_count]) assignee_write_count -= 1 - # no type casting in array calls. + # no type casting in array calls from loopy.expression import dtype_to_type_context from pymbolic.mapper.stringifier import PREC_NONE from loopy.symbolic import SubArrayRef @@ -1015,10 +711,10 @@ class ManglerCallable(ScalarCallable): def with_types(self, arg_id_to_dtype, kernel): if self.arg_id_to_dtype is not None: # specializing an already specialized function. - for id, dtype in arg_id_to_dtype.items(): + for arg_id, dtype in arg_id_to_dtype.items(): # only checking for the ones which have been provided # if does not match, returns an error. - if self.arg_id_to_dtype[id] != arg_id_to_dtype[id]: + if self.arg_id_to_dtype[arg_id] != arg_id_to_dtype[arg_id]: raise LoopyError("Overwriting a specialized" " function is illegal--maybe start with new instance of" " ManglerCallable?") @@ -1057,12 +753,14 @@ class ManglerCallable(ScalarCallable): # {{{ new pymbolic calls to scoped functions +# FIXME Are these identifiers guaranteed to be available? Is there a var name +# generator somewhere ensuring that that's the case? 
def next_indexed_variable(function): """ Returns an instance of :class:`str` with the next indexed-name in the sequence for the name of *function*. - :Example: ``Variable('sin_0')`` will return ``'sin_1'``. + **Example:** ``Variable('sin_0')`` will return ``'sin_1'``. :arg function: Either an instance of :class:`pymbolic.primitives.Variable` or :class:`loopy.reduction.ArgExtOp` or @@ -1149,6 +847,9 @@ class ScopedFunctionNameChanger(RuleAwareIdentityMapper): def register_pymbolic_calls_to_knl_callables(kernel, pymbolic_exprs_to_knl_callables): + # FIXME This could use an example. I have no idea what this does. + # Surely I can't associate arbitrary pymbolic expresions (3+a?) + # with callables? """ Returns a copy of :arg:`kernel` which includes an association with the given pymbolic expressions to the instances of :class:`InKernelCallable` for the @@ -1156,7 +857,7 @@ def register_pymbolic_calls_to_knl_callables(kernel, :arg kernel: An instance of :class:`loopy.kernel.LoopKernel`. - :arg pymbolic_exprs_to_knl_callables: A mapping from pymbolic expressions + :arg pymbolic_exprs_to_knl_callables: A mapping from :mod:`pymbolic` expressions to the instances of :class:`loopy.kernel.function_interface.InKernelCallable`. """ @@ -1182,7 +883,7 @@ def register_pymbolic_calls_to_knl_callables(kernel, pymbolic_call_function = pymbolic_call.function.function else: raise NotImplementedError("Unknown type %s for pymbolic call " - "function." 
% type(pymbolic_call)) + "function" % type(pymbolic_call).__name__) unique_var = next_indexed_variable(pymbolic_call_function) from loopy.library.reduction import ArgExtOp, SegmentedOp @@ -1203,7 +904,7 @@ def register_pymbolic_calls_to_knl_callables(kernel, pymbolic_calls_to_new_names[pymbolic_call] = ( scoped_functions_to_names[in_knl_callable]) - # Using the data populated in pymbolic_calls_to_new_names to change the + # Use the data populated in pymbolic_calls_to_new_names to change the # names of the scoped functions of all the calls in the kernel. rule_mapping_context = SubstitutionRuleMappingContext( kernel.substitutions, kernel.get_var_name_generator()) diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index fafebf37d02a4d703efb2a05705ca19b61603e62..b09931373b5c6f3b699b8252764481a743818f72 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -538,64 +538,78 @@ def _get_assignee_subscript_deps(expr): # {{{ atomic ops -class memory_ordering: # noqa +class MemoryOrdering: # noqa """Ordering of atomic operations, defined as in C11 and OpenCL. - .. attribute:: relaxed - .. attribute:: acquire - .. attribute:: release - .. attribute:: acq_rel - .. attribute:: seq_cst + .. attribute:: RELAXED + .. attribute:: ACQUIRE + .. attribute:: RELEASE + .. attribute:: ACQ_REL + .. 
attribute:: SEQ_CST """ - relaxed = 0 - acquire = 1 - release = 2 - acq_rel = 3 - seq_cst = 4 + RELAXED = 0 + ACQUIRE = 1 + RELEASE = 2 + ACQ_REL = 3 + SEQ_CST = 4 + + # FIXME Introduce compat/deprecation goop for now-upper-case enum + # constants @staticmethod def to_string(v): - for i in dir(memory_ordering): + for i in dir(MemoryOrdering): if i.startswith("_"): continue - if getattr(memory_ordering, i) == v: + if getattr(MemoryOrdering, i) == v: return i - raise ValueError("Unknown value of memory_ordering") + raise ValueError("Unknown value of MemoryOrdering") + + +# FIXME Introduce noisy deprecation goop +memory_ordering = MemoryOrdering -class memory_scope: # noqa +class MemoryScope: # noqa """Scope of atomicity, defined as in OpenCL. .. attribute:: auto Scope matches the accessibility of the variable. - .. attribute:: work_item - .. attribute:: work_group - .. attribute:: work_device - .. attribute:: all_svm_devices + .. attribute:: WORK_ITEM + .. attribute:: WORK_GROUP + .. attribute:: WORK_DEVICE + .. attribute:: ALL_SVM_DEVICES """ - work_item = 0 - work_group = 1 - device = 2 - all_svm_devices = 2 + WORK_ITEM = 0 + WORK_GROUP = 1 + DEVICE = 2 + ALL_SVM_DEVICES = 2 + + # FIXME Introduce compat/deprecation goop for now-upper-case enum + # constants auto = -1 @staticmethod def to_string(v): - for i in dir(memory_scope): + for i in dir(MemoryScope): if i.startswith("_"): continue - if getattr(memory_scope, i) == v: + if getattr(MemoryScope, i) == v: return i - raise ValueError("Unknown value of memory_scope") + raise ValueError("Unknown value of MemoryScope") + + +# FIXME Introduce noisy deprecation goop +memory_scope = MemoryScope class VarAtomicity(object): @@ -628,15 +642,15 @@ class OrderedAtomic(VarAtomicity): .. attribute:: ordering - One of the values from :class:`memory_ordering` + One of the values from :class:`MemoryOrdering` .. 
attribute:: scope - One of the values from :class:`memory_scope` + One of the values from :class:`MemoryScope` """ - ordering = memory_ordering.seq_cst - scope = memory_scope.auto + ordering = MemoryOrdering.SEQ_CST + scope = MemoryScope.auto def update_persistent_hash(self, key_hash, key_builder): """Custom hash computation function for use with @@ -657,8 +671,8 @@ class OrderedAtomic(VarAtomicity): return "%s[%s]%s/%s" % ( self.op_name, self.var_name, - memory_ordering.to_string(self.ordering), - memory_scope.to_string(self.scope)) + MemoryOrdering.to_string(self.ordering), + MemoryScope.to_string(self.scope)) class AtomicInit(OrderedAtomic): @@ -667,11 +681,11 @@ class AtomicInit(OrderedAtomic): .. attribute:: ordering - One of the values from :class:`memory_ordering` + One of the values from :class:`MemoryOrdering` .. attribute:: scope - One of the values from :class:`memory_scope` + One of the values from :class:`MemoryScope` """ op_name = 'init' @@ -681,11 +695,11 @@ class AtomicUpdate(OrderedAtomic): .. attribute:: ordering - One of the values from :class:`memory_ordering` + One of the values from :class:`MemoryOrdering` .. attribute:: scope - One of the values from :class:`memory_scope` + One of the values from :class:`MemoryScope` """ op_name = 'update' @@ -695,11 +709,11 @@ class AtomicLoad(OrderedAtomic): .. attribute:: ordering - One of the values from :class:`memory_ordering` + One of the values from :class:`MemoryOrdering` .. attribute:: scope - One of the values from :class:`memory_scope` + One of the values from :class:`MemoryScope` """ op_name = 'load' diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index fb57133e9be1647570def6e4c1678c5ec7ea3532..ed739c0fdd5861638ea5a28652d8768a536f0ec9 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -1725,8 +1725,8 @@ def get_subkernels(kernel): See also :class:`loopy.schedule.CallKernel`. 
""" - from loopy.kernel import kernel_state - if kernel.state != kernel_state.SCHEDULED: + from loopy.kernel import KernelState + if kernel.state != KernelState.SCHEDULED: raise LoopyError("Kernel must be scheduled") from loopy.schedule import CallKernel @@ -1742,8 +1742,8 @@ def get_subkernel_to_insn_id_map(kernel): consisting of the instruction ids scheduled within the subkernel. The kernel must be scheduled. """ - from loopy.kernel import kernel_state - if kernel.state != kernel_state.SCHEDULED: + from loopy.kernel import KernelState + if kernel.state != KernelState.SCHEDULED: raise LoopyError("Kernel must be scheduled") from loopy.schedule import ( diff --git a/loopy/preprocess.py b/loopy/preprocess.py index c4719ace5575efe2897b46963f8a7edd6bd38df1..777cc1c640249480b798810d4e5b7e91e5d3649b 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -137,7 +137,7 @@ def check_reduction_iname_uniqueness(kernel): # }}} -# {{{ decide temporary scope +# {{{ decide temporary address space def _get_compute_inames_tagged(kernel, insn, tag_base): return set(iname for iname in kernel.insn_inames(insn.id) @@ -154,8 +154,8 @@ def _get_assignee_inames_tagged(kernel, insn, tag_base, tv_names): if kernel.iname_tags_of_type(iname, tag_base)) -def find_temporary_scope(kernel): - logger.debug("%s: find temporary scope" % kernel.name) +def find_temporary_address_space(kernel): + logger.debug("%s: find temporary address space" % kernel.name) new_temp_vars = {} from loopy.kernel.data import (LocalIndexTagBase, GroupIndexTag, @@ -183,7 +183,7 @@ def find_temporary_scope(kernel): # Only fill out for variables that do not yet know if they're # local. (I.e. those generated by implicit temporary generation.) 
- if temp_var.scope is not lp.auto: + if temp_var.address_space is not lp.auto: new_temp_vars[temp_var.name] = temp_var continue @@ -194,7 +194,7 @@ def find_temporary_scope(kernel): for alias in base_storage_to_aliases.get(temp_var.base_storage, []): my_writers = my_writers | writers.get(alias, frozenset()) - desired_scope_per_insn = [] + desired_aspace_per_insn = [] for insn_id in my_writers: insn = kernel.id_to_insn[insn_id] @@ -220,8 +220,8 @@ def find_temporary_scope(kernel): assert locparallel_assignee_inames <= locparallel_compute_inames assert grpparallel_assignee_inames <= grpparallel_compute_inames - desired_scope = AddressSpace.PRIVATE - for iname_descr, scope_descr, apin, cpin, scope in [ + desired_aspace = AddressSpace.PRIVATE + for iname_descr, aspace_descr, apin, cpin, aspace in [ ("local", "local", locparallel_assignee_inames, locparallel_compute_inames, AddressSpace.LOCAL), ("group", "global", grpparallel_assignee_inames, @@ -231,46 +231,45 @@ def find_temporary_scope(kernel): if (apin != cpin and bool(apin)): warn_with_kernel( kernel, - "write_race_%s(%s)" % (scope_descr, insn_id), + "write_race_%s(%s)" % (aspace_descr, insn_id), "instruction '%s' looks invalid: " "it assigns to indices based on %s IDs, but " "its temporary '%s' cannot be made %s because " "a write race across the iname(s) '%s' would emerge. 
" "(Do you need to add an extra iname to your prefetch?)" - % (insn_id, iname_descr, temp_var.name, scope_descr, + % (insn_id, iname_descr, temp_var.name, aspace_descr, ", ".join(cpin - apin)), WriteRaceConditionWarning) if (apin == cpin - - # doesn't want to be in this scope if there aren't any - # parallel inames of that kind: + # doesn't want to be in this address space if there + # aren't any parallel inames of that kind and bool(cpin)): - desired_scope = max(desired_scope, scope) + desired_aspace = max(desired_aspace, aspace) - desired_scope_per_insn.append(desired_scope) + desired_aspace_per_insn.append(desired_aspace) - if not desired_scope_per_insn: + if not desired_aspace_per_insn: if temp_var.initializer is None: warn_with_kernel(kernel, "temp_to_write(%s)" % temp_var.name, "temporary variable '%s' never written, eliminating" % temp_var.name, LoopyAdvisory) else: raise LoopyError("temporary variable '%s': never written, " - "cannot automatically determine scope" + "cannot automatically determine address space" % temp_var.name) continue - overall_scope = max(desired_scope_per_insn) + overall_aspace = max(desired_aspace_per_insn) from pytools import all - if not all(iscope == overall_scope for iscope in desired_scope_per_insn): + if not all(iaspace == overall_aspace for iaspace in desired_aspace_per_insn): raise LoopyError("not all instructions agree on the " - "the desired scope (private/local/global) of the " + "the desired address space (private/local/global) of the " "temporary '%s'" % temp_var.name) - new_temp_vars[temp_var.name] = temp_var.copy(scope=overall_scope) + new_temp_vars[temp_var.name] = temp_var.copy(address_space=overall_aspace) return kernel.copy(temporary_variables=new_temp_vars) @@ -785,7 +784,7 @@ def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel): if ( assignee_var_name in kernel.temporary_variables and - (kernel.temporary_variables[assignee_var_name].scope + 
(kernel.temporary_variables[assignee_var_name].address_space == AddressSpace.PRIVATE)): new_assignees.append(assignee) continue @@ -1026,7 +1025,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, nvars=nresults, shape=(), dtypes=reduction_dtypes, - scope=AddressSpace.PRIVATE) + address_space=AddressSpace.PRIVATE) init_insn_depends_on = frozenset() @@ -1161,14 +1160,14 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, nvars=nresults, shape=(), dtypes=reduction_dtypes, - scope=AddressSpace.PRIVATE) + address_space=AddressSpace.PRIVATE) acc_var_names = make_temporaries( name_based_on="acc_"+red_iname, nvars=nresults, shape=outer_local_iname_sizes + (size,), dtypes=reduction_dtypes, - scope=AddressSpace.LOCAL) + address_space=AddressSpace.LOCAL) acc_vars = tuple(var(n) for n in acc_var_names) @@ -1354,7 +1353,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, return mapper(expr, temp_kernel, None) - def make_temporaries(name_based_on, nvars, shape, dtypes, scope): + def make_temporaries(name_based_on, nvars, shape, dtypes, address_space): var_names = [ var_name_gen(name_based_on.format(index=i)) for i in range(nvars)] @@ -1366,7 +1365,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, name=name, shape=shape, dtype=dtype, - scope=scope) + address_space=address_space) return var_names @@ -1394,7 +1393,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, nvars=nresults, shape=(), dtypes=reduction_dtypes, - scope=AddressSpace.PRIVATE) + address_space=AddressSpace.PRIVATE) from pymbolic import var acc_vars = tuple(var(n) for n in acc_var_names) @@ -1516,14 +1515,14 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, nvars=nresults, shape=(), dtypes=reduction_dtypes, - scope=AddressSpace.PRIVATE) + address_space=AddressSpace.PRIVATE) acc_var_names = make_temporaries( name_based_on="acc_"+scan_iname, nvars=nresults, 
shape=outer_local_iname_sizes + (scan_size,), dtypes=reduction_dtypes, - scope=AddressSpace.LOCAL) + address_space=AddressSpace.LOCAL) acc_vars = tuple(var(n) for n in acc_var_names) read_vars = tuple(var(n) for n in read_var_names) @@ -2134,6 +2133,7 @@ class ArgDescrInferenceMapper(CombineMapper): import operator return reduce(operator.or_, values, frozenset()) + # FIXME logic duplication between map_call and map_call_with_kwargs def map_call(self, expr, **kwargs): from loopy.kernel.function_interface import ValueArgDescriptor from loopy.symbolic import SubArrayRef, ScopedFunction @@ -2363,6 +2363,7 @@ class FunctionsNotReadyForCodegenCollector(CombineMapper): def combine(self, values): return all(values) + # FIXME logic duplication between map_call and map_call_with_kwargs def map_call(self, expr, *args, **kwargs): from loopy.library.reduction import ArgExtOp, SegmentedOp from pymbolic.primitives import Variable @@ -2470,8 +2471,8 @@ def preprocess_kernel(kernel, device=None): warn("passing 'device' to preprocess_kernel() is deprecated", DeprecationWarning, stacklevel=2) - from loopy.kernel import kernel_state - if kernel.state >= kernel_state.PREPROCESSED: + from loopy.kernel import KernelState + if kernel.state >= KernelState.PREPROCESSED: return kernel # {{{ cache retrieval @@ -2536,7 +2537,7 @@ def preprocess_kernel(kernel, device=None): kernel = realize_ilp(kernel) - kernel = find_temporary_scope(kernel) + kernel = find_temporary_address_space(kernel) # inferring the shape and dim_tags of the arguments involved in a function # call. 
@@ -2561,7 +2562,7 @@ def preprocess_kernel(kernel, device=None): logger.info("%s: preprocess done" % kernel.name) kernel = kernel.copy( - state=kernel_state.PREPROCESSED) + state=KernelState.PREPROCESSED) # {{{ prepare for caching diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 440ac22cb890bd9f1b47f909ee96681c39c33975..652f8b8933ee79935f8bf08e7de2356972922ccc 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -1850,8 +1850,8 @@ def generate_loop_schedules(kernel, debug_args={}): def generate_loop_schedules_inner(kernel, debug_args={}): - from loopy.kernel import kernel_state - if kernel.state not in (kernel_state.PREPROCESSED, kernel_state.SCHEDULED): + from loopy.kernel import KernelState + if kernel.state not in (KernelState.PREPROCESSED, KernelState.SCHEDULED): raise LoopyError("cannot schedule a kernel that has not been " "preprocessed") @@ -1862,7 +1862,7 @@ def generate_loop_schedules_inner(kernel, debug_args={}): debug = ScheduleDebugger(**debug_args) - preschedule = kernel.schedule if kernel.state == kernel_state.SCHEDULED else () + preschedule = kernel.schedule if kernel.state == KernelState.SCHEDULED else () prescheduled_inames = set( insn.iname @@ -1914,7 +1914,7 @@ def generate_loop_schedules_inner(kernel, debug_args={}): unscheduled_insn_ids=set(insn.id for insn in kernel.instructions), scheduled_insn_ids=frozenset(), - within_subkernel=kernel.state != kernel_state.SCHEDULED, + within_subkernel=kernel.state != KernelState.SCHEDULED, may_schedule_global_barriers=True, preschedule=preschedule, @@ -1984,11 +1984,11 @@ def generate_loop_schedules_inner(kernel, debug_args={}): new_kernel = kernel.copy( schedule=gen_sched, - state=kernel_state.SCHEDULED) + state=KernelState.SCHEDULED) from loopy.schedule.device_mapping import \ map_schedule_onto_host_or_device - if kernel.state != kernel_state.SCHEDULED: + if kernel.state != KernelState.SCHEDULED: # Device mapper only gets run once. 
new_kernel = map_schedule_onto_host_or_device(new_kernel) diff --git a/loopy/schedule/device_mapping.py b/loopy/schedule/device_mapping.py index 5c41f03997e5193333f5be213f2f87d38147b6df..59afb07d2e9b7713dbe86c2c5aef7356decbbcff 100644 --- a/loopy/schedule/device_mapping.py +++ b/loopy/schedule/device_mapping.py @@ -30,8 +30,8 @@ from loopy.schedule.tools import get_block_boundaries def map_schedule_onto_host_or_device(kernel): # FIXME: Should be idempotent. - from loopy.kernel import kernel_state - assert kernel.state == kernel_state.SCHEDULED + from loopy.kernel import KernelState + assert kernel.state == KernelState.SCHEDULED from functools import partial device_prog_name_gen = partial( diff --git a/loopy/schedule/tools.py b/loopy/schedule/tools.py index d1e3a85e9489be6c276e06792d6bd8d187e2f436..e0129fd98417f26a501138a92de4a67614f1a139 100644 --- a/loopy/schedule/tools.py +++ b/loopy/schedule/tools.py @@ -91,7 +91,8 @@ def add_extra_args_to_schedule(kernel): more_args = set(tv for tv in used_temporaries if - kernel.temporary_variables[tv].scope == AddressSpace.GLOBAL + kernel.temporary_variables[tv].address_space + == AddressSpace.GLOBAL and kernel.temporary_variables[tv].initializer is None and diff --git a/loopy/statistics.py b/loopy/statistics.py index 521eaeb5a04cc244ce0f0fff511d273952ec8a2a..6c012ca2175a65dd813a1fa70e92a72c306c1080 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -919,7 +919,7 @@ class LocalMemAccessCounter(MemAccessCounter): if name in self.knl.temporary_variables: array = self.knl.temporary_variables[name] if isinstance(array, TemporaryVariable) and ( - array.scope == AddressSpace.LOCAL): + array.address_space == AddressSpace.LOCAL): if index is None: # no subscript sub_map[MemAccess( @@ -1739,8 +1739,8 @@ def gather_access_footprints(kernel, ignore_uncountable=False): from loopy.preprocess import preprocess_kernel, infer_unknown_types kernel = infer_unknown_types(kernel, expect_completion=True) - from loopy.kernel import 
kernel_state - if kernel.state < kernel_state.PREPROCESSED: + from loopy.kernel import KernelState + if kernel.state < KernelState.PREPROCESSED: kernel = preprocess_kernel(kernel) write_footprints = [] @@ -1793,8 +1793,8 @@ def gather_access_footprint_bytes(kernel, ignore_uncountable=False): from loopy.preprocess import preprocess_kernel, infer_unknown_types kernel = infer_unknown_types(kernel, expect_completion=True) - from loopy.kernel import kernel_state - if kernel.state < kernel_state.PREPROCESSED: + from loopy.kernel import KernelState + if kernel.state < KernelState.PREPROCESSED: kernel = preprocess_kernel(kernel) result = {} diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 09e6e57477a48f5b39d798b7d445c945bb6e3952..2c235a0d13384006563f12470101a6252520be70 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -836,13 +836,13 @@ class SubArrayRef(p.Expression): name = self.subscript.aggregate.name if name in kernel.temporary_variables: - arg = kernel.temporary_variables[name] - mem_scope = arg.scope assert name not in kernel.arg_dict + arg = kernel.temporary_variables[name] else: assert name in kernel.arg_dict arg = kernel.arg_dict[name] - mem_scope = arg.memory_address_space + + aspace = arg.address_space from loopy.kernel.array import FixedStrideArrayDimTag as DimTag from loopy.isl_helpers import simplify_via_aff @@ -861,7 +861,8 @@ class SubArrayRef(p.Expression): kernel.get_iname_bounds(iname.name).upper_bound_pw_aff)+1 for iname in self.swept_inames) - return ArrayArgDescriptor(mem_scope=mem_scope, + return ArrayArgDescriptor( + address_space=aspace, dim_tags=sub_dim_tags, shape=sub_shape) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 9be9db38c207bbd4c19af8059901ad830ac3262c..eab1e6afc67d3b99aebb431ea8e694db3e3cbb66 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -512,7 +512,7 @@ class CASTBuilder(ASTBuilderBase): six.itervalues(kernel.temporary_variables), key=lambda tv: tv.name): - if 
tv.scope == AddressSpace.GLOBAL and ( + if tv.address_space == AddressSpace.GLOBAL and ( tv.initializer is not None): assert tv.read_only @@ -606,12 +606,12 @@ class CASTBuilder(ASTBuilderBase): if not tv.base_storage: for idi in decl_info: # global temp vars are mapped to arguments or global declarations - if tv.scope != AddressSpace.GLOBAL and ( + if tv.address_space != AddressSpace.GLOBAL and ( tv.name in sub_knl_temps): decl = self.wrap_temporary_decl( self.get_temporary_decl( codegen_state, schedule_index, tv, idi), - tv.scope) + tv.address_space) if tv.initializer is not None: assert tv.read_only @@ -627,7 +627,7 @@ class CASTBuilder(ASTBuilderBase): base_storage_sizes.setdefault(tv.base_storage, []).append( tv.nbytes) base_storage_to_scope.setdefault(tv.base_storage, []).append( - tv.scope) + tv.address_space) align_size = tv.dtype.itemsize @@ -643,9 +643,9 @@ class CASTBuilder(ASTBuilderBase): cast_decl = POD(self, idi.dtype, "") temp_var_decl = POD(self, idi.dtype, idi.name) - cast_decl = self.wrap_temporary_decl(cast_decl, tv.scope) + cast_decl = self.wrap_temporary_decl(cast_decl, tv.address_space) temp_var_decl = self.wrap_temporary_decl( - temp_var_decl, tv.scope) + temp_var_decl, tv.address_space) if tv._base_storage_access_may_be_aliasing: ptrtype = _ConstPointer diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index abe49a2414face070156463e7e11ae027e136ff0..0464270a348b019e93092fb5d1d30a8ceaf5788d 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -82,7 +82,7 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper): def map_variable(self, expr, type_context): tv = self.kernel.temporary_variables.get(expr.name) - if tv is not None and tv.scope == AddressSpace.PRIVATE: + if tv is not None and tv.address_space == AddressSpace.PRIVATE: # FIXME: This is a pretty coarse way of deciding what # private temporaries get duplicated. Refine? 
(See also # below in decl generation) @@ -102,7 +102,7 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper): ary = self.find_array(expr) if (isinstance(ary, TemporaryVariable) - and ary.scope == AddressSpace.PRIVATE): + and ary.address_space == AddressSpace.PRIVATE): # generate access code for acccess to private-index temporaries gsize, lsize = self.kernel.get_grid_size_upper_bounds_as_exprs() @@ -308,7 +308,7 @@ class ISPCASTBuilder(CASTBuilder): shape = decl_info.shape - if temp_var.scope == AddressSpace.PRIVATE: + if temp_var.address_space == AddressSpace.PRIVATE: # FIXME: This is a pretty coarse way of deciding what # private temporaries get duplicated. Refine? (See also # above in expr to code mapper) diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 85af4ece364a45bc20aa9076a50d1e058d0a32a4..6ee5969b39afe045ecfddbe8d29f9400c1bdbf89 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -647,20 +647,20 @@ class OpenCLCASTBuilder(CASTBuilder): if ( isinstance(lhs_var, ArrayArg) and - lhs_var.memory_address_space == AddressSpace.GLOBAL): + lhs_var.address_space == AddressSpace.GLOBAL): var_kind = "__global" elif ( isinstance(lhs_var, ArrayArg) and - lhs_var.memory_address_space == AddressSpace.LOCAL): + lhs_var.address_space == AddressSpace.LOCAL): var_kind = "__local" elif ( isinstance(lhs_var, TemporaryVariable) - and lhs_var.scope == AddressSpace.LOCAL): + and lhs_var.address_space == AddressSpace.LOCAL): var_kind = "__local" elif ( isinstance(lhs_var, TemporaryVariable) - and lhs_var.scope == AddressSpace.GLOBAL): + and lhs_var.address_space == AddressSpace.GLOBAL): var_kind = "__global" else: raise LoopyError("unexpected kind of variable '%s' in " diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 7355ceb2c19211d2da0185de2492fd7fa3d68abe..27c4f4ab4f92056169fd85cc5c71334d011e240c 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -56,7 +56,7 @@ def 
adjust_local_temp_var_storage(kernel, device): lmem_size = cl_char.usable_local_mem_size(device) for temp_var in six.itervalues(kernel.temporary_variables): - if temp_var.scope != AddressSpace.LOCAL: + if temp_var.address_space != AddressSpace.LOCAL: new_temp_vars[temp_var.name] = \ temp_var.copy(storage_shape=temp_var.shape) continue @@ -69,7 +69,7 @@ def adjust_local_temp_var_storage(kernel, device): other_loctemp_nbytes = [ tv.nbytes for tv in six.itervalues(kernel.temporary_variables) - if tv.scope == AddressSpace.LOCAL + if tv.address_space == AddressSpace.LOCAL and tv.name != temp_var.name] storage_shape = temp_var.storage_shape @@ -702,7 +702,7 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase): global_temporaries = sorted( (tv for tv in six.itervalues(codegen_state.kernel.temporary_variables) - if tv.scope == AddressSpace.GLOBAL), + if tv.address_space == AddressSpace.GLOBAL), key=lambda tv: tv.name) from pymbolic.mapper.stringifier import PREC_NONE diff --git a/loopy/transform/batch.py b/loopy/transform/batch.py index 0d3db360df3142307924782bb24032399edcd125..f0b9814c43698a64af23f1555a27e910ef89762e 100644 --- a/loopy/transform/batch.py +++ b/loopy/transform/batch.py @@ -46,7 +46,7 @@ def temp_needs_batching_if_not_sequential(tv, batch_varying_args): # do not batch read_only temps if not in # `batch_varying_args` return False - if tv.scope == AddressSpace.PRIVATE: + if tv.address_space == AddressSpace.PRIVATE: # do not batch private temps if not in `batch_varying args` return False return True diff --git a/loopy/transform/register_callable.py b/loopy/transform/callable.py similarity index 50% rename from loopy/transform/register_callable.py rename to loopy/transform/callable.py index 455c2e51ec3b5ff3577dac899e9f4bc54e6c4be3..092cef8876d377083e961853b19568d4d83af5a1 100644 --- a/loopy/transform/register_callable.py +++ b/loopy/transform/callable.py @@ -22,15 +22,19 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 
SOFTWARE. """ +import six + +import islpy as isl +from pymbolic.primitives import CallWithKwargs + from loopy.kernel import LoopKernel from loopy.kernel.function_interface import CallableKernel from pytools import ImmutableRecord from loopy.diagnostic import LoopyError from loopy.kernel.instruction import (CallInstruction, MultiAssignmentBase, CInstruction, _DataObliviousInstruction) -from loopy.symbolic import IdentityMapper +from loopy.symbolic import IdentityMapper, SubstitutionMapper, CombineMapper from loopy.isl_helpers import simplify_via_aff -from pymbolic.primitives import CallWithKwargs from loopy.kernel.function_interface import (get_kw_pos_association, register_pymbolic_calls_to_knl_callables) @@ -144,7 +148,7 @@ def register_callable_kernel(caller_kernel, function_name, callee_kernel): name=function_name, is_called_from_host=False)) - # disabling global barriers for callee kernel + # FIXME disabling global barriers for callee kernel (for now) from loopy import set_options callee_kernel = set_options(callee_kernel, "disable_global_barriers") @@ -154,12 +158,321 @@ def register_callable_kernel(caller_kernel, function_name, callee_kernel): # }}} +# {{{ callee scoped calls collector (to support inlining) + +class CalleeScopedCallsCollector(CombineMapper): + """ + Collects the scoped functions which are a part of the callee kernel and + must be transferred to the caller kernel before inlining. + + :returns: + A :class:`frozenset` of ``(call expression, callable)`` pairs for the + calls whose function name is in *callee_scoped_functions*. + + .. note:: + :class:`loopy.library.reduction.ArgExtOp` are ignored, as they are + never scoped in the pipeline.
+ """ + + def __init__(self, callee_scoped_functions): + self.callee_scoped_functions = callee_scoped_functions + + def combine(self, values): + import operator + from functools import reduce + return reduce(operator.or_, values, frozenset()) + + def map_call(self, expr): + if expr.function.name in self.callee_scoped_functions: + return (frozenset([(expr, + self.callee_scoped_functions[expr.function.name])]) | + self.combine((self.rec(child) for child in expr.parameters))) + else: + return self.combine((self.rec(child) for child in expr.parameters)) + + def map_call_with_kwargs(self, expr): + if expr.function.name in self.callee_scoped_functions: + return (frozenset([(expr, + self.callee_scoped_functions[expr.function.name])]) | + self.combine((self.rec(child) for child in expr.parameters + + tuple(expr.kw_parameters.values())))) + else: + return self.combine((self.rec(child) for child in + expr.parameters+tuple(expr.kw_parameters.values()))) + + def map_constant(self, expr): + return frozenset() + + map_variable = map_constant + map_function_symbol = map_constant + map_tagged_variable = map_constant + map_type_cast = map_constant + +# }}} + + +# {{{ kernel inliner mapper + +class KernelInliner(SubstitutionMapper): + """Mapper to replace variables (indices, temporaries, arguments) in the + callee kernel with variables in the caller kernel. 
+ + :arg caller: the caller kernel + :arg arg_map: dict of argument name to variables in caller + :arg arg_dict: dict of argument name to arguments in callee + """ + + def __init__(self, subst_func, caller, arg_map, arg_dict): + super(KernelInliner, self).__init__(subst_func) + self.caller = caller + self.arg_map = arg_map + self.arg_dict = arg_dict + + def map_subscript(self, expr): + if expr.aggregate.name in self.arg_map: + + aggregate = self.subst_func(expr.aggregate) + sar = self.arg_map[expr.aggregate.name] # SubArrayRef in caller + callee_arg = self.arg_dict[expr.aggregate.name] # Arg in callee + if aggregate.name in self.caller.arg_dict: + caller_arg = self.caller.arg_dict[aggregate.name] # Arg in caller + else: + caller_arg = self.caller.temporary_variables[aggregate.name] + + # Firstly, map inner inames to outer inames. + outer_indices = self.map_tuple(expr.index_tuple) + + # Next, reshape to match dimension of outer arrays. + # We can have e.g. A[3, 2] from outside and B[6] from inside + from numbers import Integral + if not all(isinstance(d, Integral) for d in callee_arg.shape): + raise LoopyError( + "Argument: {0} in callee kernel does not have " + "constant shape.".format(callee_arg)) + + flatten_index = 0 + for i, idx in enumerate(sar.get_begin_subscript().index_tuple): + flatten_index += idx*caller_arg.dim_tags[i].stride + + flatten_index += sum( + idx * tag.stride + for idx, tag in zip(outer_indices, callee_arg.dim_tags)) + + from loopy.isl_helpers import simplify_via_aff + flatten_index = simplify_via_aff(flatten_index) + + new_indices = [] + for dim_tag in caller_arg.dim_tags: + ind = flatten_index // dim_tag.stride + flatten_index -= (dim_tag.stride * ind) + new_indices.append(ind) + + new_indices = tuple(simplify_via_aff(i) for i in new_indices) + + return aggregate.index(tuple(new_indices)) + else: + return super(KernelInliner, self).map_subscript(expr) + +# }}} + + +# {{{ inlining of a single call instruction + +def
_inline_call_instruction(kernel, callee_knl, instruction): + """ + Returns a copy of *kernel* with the *instruction* in the *kernel* + replaced by inlining :attr:`subkernel` within it. + """ + callee_label = callee_knl.name[:4] + "_" + + # {{{ duplicate and rename inames + + vng = kernel.get_var_name_generator() + ing = kernel.get_instruction_id_generator() + dim_type = isl.dim_type.set + + iname_map = {} + for iname in callee_knl.all_inames(): + iname_map[iname] = vng(callee_label+iname) + + new_domains = [] + new_iname_to_tags = kernel.iname_to_tags.copy() + + # transferring iname tags info from the callee to the caller kernel + for domain in callee_knl.domains: + new_domain = domain.copy() + for i in range(new_domain.n_dim()): + iname = new_domain.get_dim_name(dim_type, i) + + if iname in callee_knl.iname_to_tags: + new_iname_to_tags[iname_map[iname]] = ( + callee_knl.iname_to_tags[iname]) + new_domain = new_domain.set_dim_name( + dim_type, i, iname_map[iname]) + new_domains.append(new_domain) + + kernel = kernel.copy(domains=kernel.domains + new_domains, + iname_to_tags=new_iname_to_tags) + + # }}} + + # {{{ rename temporaries + + temp_map = {} + new_temps = kernel.temporary_variables.copy() + for name, temp in six.iteritems(callee_knl.temporary_variables): + new_name = vng(callee_label+name) + temp_map[name] = new_name + new_temps[new_name] = temp.copy(name=new_name) + + kernel = kernel.copy(temporary_variables=new_temps) + + # }}} + + # {{{ match kernel arguments + + arg_map = {} # callee arg name -> caller symbols (e.g. 
SubArrayRef) + + assignees = instruction.assignees # writes + parameters = instruction.expression.parameters # reads + + # add keyword parameters + from pymbolic.primitives import CallWithKwargs + + if isinstance(instruction.expression, CallWithKwargs): + from loopy.kernel.function_interface import get_kw_pos_association + + _, pos_to_kw = get_kw_pos_association(callee_knl) + kw_parameters = instruction.expression.kw_parameters + for i in range(len(parameters), len(parameters) + len(kw_parameters)): + parameters = parameters + (kw_parameters[pos_to_kw[i]],) + + assignee_pos = 0 + parameter_pos = 0 + for i, arg in enumerate(callee_knl.args): + if arg.is_output_only: + arg_map[arg.name] = assignees[assignee_pos] + assignee_pos += 1 + else: + arg_map[arg.name] = parameters[parameter_pos] + parameter_pos += 1 + + # }}} + + # {{{ rewrite instructions + + import pymbolic.primitives as p + from pymbolic.mapper.substitutor import make_subst_func + + var_map = dict((p.Variable(k), p.Variable(v)) + for k, v in six.iteritems(iname_map)) + var_map.update(dict((p.Variable(k), p.Variable(v)) + for k, v in six.iteritems(temp_map))) + var_map.update(dict((p.Variable(k), p.Variable(v.subscript.aggregate.name)) + for k, v in six.iteritems(arg_map))) + subst_mapper = KernelInliner( + make_subst_func(var_map), kernel, arg_map, callee_knl.arg_dict) + + insn_id = {} + for insn in callee_knl.instructions: + insn_id[insn.id] = ing(callee_label+insn.id) + + # {{{ root and leaf instructions in callee kernel + + dep_map = callee_knl.recursive_insn_dep_map() + # roots depend on nothing + heads = set(insn for insn, deps in six.iteritems(dep_map) if not deps) + # leaves have nothing that depends on them + tails = set(dep_map.keys()) + for insn, deps in six.iteritems(dep_map): + tails = tails - deps + + # }}} + + # {{{ use NoOp to mark the start and end of callee kernel + + from loopy.kernel.instruction import NoOpInstruction + + noop_start = NoOpInstruction( + id=ing(callee_label+"_start"), +
within_inames=instruction.within_inames, + depends_on=instruction.depends_on + ) + noop_end = NoOpInstruction( + id=instruction.id, + within_inames=instruction.within_inames, + depends_on=frozenset(insn_id[insn] for insn in tails) + ) + # }}} + + inner_insns = [noop_start] + + for insn in callee_knl.instructions: + insn = insn.with_transformed_expressions(subst_mapper) + within_inames = frozenset(map(iname_map.get, insn.within_inames)) + within_inames = within_inames | instruction.within_inames + depends_on = frozenset(map(insn_id.get, insn.depends_on)) | ( + instruction.depends_on) + if insn.id in heads: + depends_on = depends_on | set([noop_start.id]) + insn = insn.copy( + id=insn_id[insn.id], + within_inames=within_inames, + # TODO: probably need to keep priority in callee kernel + priority=instruction.priority, + depends_on=depends_on + ) + inner_insns.append(insn) + + inner_insns.append(noop_end) + + new_insns = [] + for insn in kernel.instructions: + if insn == instruction: + new_insns.extend(inner_insns) + else: + new_insns.append(insn) + + kernel = kernel.copy(instructions=new_insns) + + # }}} + + # {{{ transferring the scoped functions from callee to caller + + callee_scoped_calls_collector = CalleeScopedCallsCollector( + callee_knl.scoped_functions) + callee_scoped_calls_dict = {} + + for insn in kernel.instructions: + if isinstance(insn, MultiAssignmentBase): + callee_scoped_calls_dict.update(dict(callee_scoped_calls_collector( + insn.expression))) + elif isinstance(insn, (CInstruction, _DataObliviousInstruction)): + pass + else: + raise NotImplementedError("Unknown type of instruction %s."
% type( + insn)) + + from loopy.kernel.function_interface import ( + register_pymbolic_calls_to_knl_callables) + kernel = register_pymbolic_calls_to_knl_callables(kernel, + callee_scoped_calls_dict) + + # }}} + + return kernel + +# }}} + + # {{{ inline callable kernel +# FIXME This should take a 'within' parameter to be able to only inline +# *some* calls to a kernel, but not others. def inline_callable_kernel(kernel, function_name): """ - Returns a copy of *kernel* with the callable kernel addresed by - *function_name* inlined. + Returns a copy of *kernel* with the callable kernel addressed by + (scoped) name *function_name* inlined. """ from loopy.preprocess import infer_arg_descr kernel = infer_arg_descr(kernel) @@ -167,25 +480,33 @@ def inline_callable_kernel(kernel, function_name): old_insns = kernel.instructions for insn in old_insns: if isinstance(insn, CallInstruction): + # FIXME This seems to use identifiers across namespaces. Why not + # check whether the function is a scoped function first? if insn.expression.function.name in kernel.scoped_functions: in_knl_callable = kernel.scoped_functions[ insn.expression.function.name] from loopy.kernel.function_interface import CallableKernel if isinstance(in_knl_callable, CallableKernel) and ( in_knl_callable.subkernel.name == function_name): - kernel = in_knl_callable.inline_within_kernel(kernel, insn) + kernel = _inline_call_instruction( + kernel, in_knl_callable.subkernel, insn) elif isinstance(insn, (MultiAssignmentBase, CInstruction, _DataObliviousInstruction)): pass else: - raise NotImplementedError("Unknown instruction %s." % type(insn)) + raise NotImplementedError( + "Unknown instruction type %s" + % type(insn).__name__) return kernel # }}} -# {{{ matching caller to callee args if dimenstions dont match +# {{{ tools to match caller to callee args by (guessed) automatic reshaping + +# (This is undocumented and not recommended, but it is currently needed +# to support Firedrake.) 
class DimChanger(IdentityMapper): """ diff --git a/loopy/transform/data.py b/loopy/transform/data.py index 19414424d29229a8e7a2c9a0580335b39f6c07f5..5b1ee6ccafa4c3b76609f197cc31691de562aaa5 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -147,7 +147,7 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, rule_name=None, temporary_name=None, - temporary_scope=None, temporary_is_local=None, + temporary_address_space=None, temporary_scope=None, footprint_subscripts=None, fetch_bounding_box=False, fetch_outer_inames=None): @@ -184,9 +184,8 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, :arg rule_name: base name of the generated temporary variable. :arg temporary_name: The name of the temporary to be used. - :arg temporary_scope: The :class:`AddressSpace` to use for the + :arg temporary_address_space: The :class:`AddressSpace` to use for the temporary. - :arg temporary_is_local: Deprecated, use *temporary_scope* instead. :arg footprint_subscripts: A list of tuples indicating the index (i.e. subscript) tuples used to generate the footprint. 
@@ -335,7 +334,8 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, default_tag=default_tag, dtype=arg.dtype, fetch_bounding_box=fetch_bounding_box, temporary_name=temporary_name, - temporary_scope=temporary_scope, temporary_is_local=temporary_is_local, + temporary_address_space=temporary_address_space, + temporary_scope=temporary_scope, precompute_outer_inames=fetch_outer_inames) # {{{ remove inames that were temporarily added by slice sweeps diff --git a/loopy/transform/diff.py b/loopy/transform/diff.py index f1a01541328fc73ceae3ee0589c681808331bbf1..d0edcfd7812685938fca6c12bf4c35fe47031c2e 100644 --- a/loopy/transform/diff.py +++ b/loopy/transform/diff.py @@ -336,7 +336,7 @@ class DifferentiationContext(object): if var_name in self.kernel.arg_dict: self.new_args.append( - lp.ArrayArg( + lp.GlobalArg( new_var_name, arg.dtype, shape=shape, diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py index 8f8593c2c03542f75e721f9037577507dd70eef6..49e30a7516cbbf00a07aace34831eb857a877432 100644 --- a/loopy/transform/fusion.py +++ b/loopy/transform/fusion.py @@ -130,8 +130,8 @@ def _merge_values(item_name, val_a, val_b): # {{{ two-kernel fusion def _fuse_two_kernels(knla, knlb): - from loopy.kernel import kernel_state - if knla.state != kernel_state.INITIAL or knlb.state != kernel_state.INITIAL: + from loopy.kernel import KernelState + if knla.state != KernelState.INITIAL or knlb.state != KernelState.INITIAL: raise LoopyError("can only fuse kernels in INITIAL state") # {{{ fuse domains diff --git a/loopy/transform/pack_and_unpack_args.py b/loopy/transform/pack_and_unpack_args.py index 663c60b2a84b20a6c48d838860642c44e0f1f58d..87136d017d891fa3b36057c24e07be5a9e5f0510 100644 --- a/loopy/transform/pack_and_unpack_args.py +++ b/loopy/transform/pack_and_unpack_args.py @@ -33,8 +33,6 @@ __doc__ = """ """ -# {{{ main entrypoint - def pack_and_unpack_args_for_call(kernel, call_name, args_to_pack=None, args_to_unpack=None): """ @@ -141,12 
+139,12 @@ def pack_and_unpack_args_for_call(kernel, call_name, args_to_pack=None, from loopy.symbolic import SubstitutionMapper # dict to store the new assignees and parameters, the mapping pattern - # from id to parameters is identical to InKernelCallable.arg_id_to_dtype + # from arg_id to parameters is identical to InKernelCallable.arg_id_to_dtype id_to_parameters = tuple(enumerate(parameters)) + tuple( (-i-1, assignee) for i, assignee in enumerate(insn.assignees)) new_id_to_parameters = {} - for id, p in id_to_parameters: + for arg_id, p in id_to_parameters: if isinstance(p, SubArrayRef) and (p.subscript.aggregate.name in args_to_pack): new_pack_inames = ilp_inames_map.copy() # packing-specific inames @@ -185,8 +183,8 @@ def pack_and_unpack_args_for_call(kernel, call_name, args_to_pack=None, pack_tmp = TemporaryVariable( name=pack_name, dtype=arg_in_caller.dtype, - dim_tags=in_knl_callable.arg_id_to_descr[id].dim_tags, - shape=in_knl_callable.arg_id_to_descr[id].shape, + dim_tags=in_knl_callable.arg_id_to_descr[arg_id].dim_tags, + shape=in_knl_callable.arg_id_to_descr[arg_id].shape, scope=temp_var_scope.PRIVATE, ) @@ -207,7 +205,7 @@ def pack_and_unpack_args_for_call(kernel, call_name, args_to_pack=None, zip(arg_in_caller.dim_tags, p.subscript.index_tuple))) new_indices = [] - for dim_tag in in_knl_callable.arg_id_to_descr[id].dim_tags: + for dim_tag in in_knl_callable.arg_id_to_descr[arg_id].dim_tags: ind = flatten_index // dim_tag.stride flatten_index -= (dim_tag.stride * ind) new_indices.append(ind) @@ -249,7 +247,7 @@ def pack_and_unpack_args_for_call(kernel, call_name, args_to_pack=None, updated_swept_inames = [] for i, _ in enumerate( - in_knl_callable.arg_id_to_descr[id].shape): + in_knl_callable.arg_id_to_descr[arg_id].shape): updated_swept_inames.append(var(vng("i_packsweep_"+arg))) ctx = kernel.isl_context @@ -257,17 +255,18 @@ def pack_and_unpack_args_for_call(kernel, call_name, args_to_pack=None, set=[iname.name for iname in updated_swept_inames]) 
iname_set = isl.BasicSet.universe(space) for iname, axis_length in zip(updated_swept_inames, - in_knl_callable.arg_id_to_descr[id].shape): + in_knl_callable.arg_id_to_descr[arg_id].shape): iname_set = iname_set & make_slab(space, iname.name, 0, axis_length) new_domains = new_domains + [iname_set] # }}} - new_id_to_parameters[id] = SubArrayRef(tuple(updated_swept_inames), - (var(pack_name).index(tuple(updated_swept_inames)))) + new_id_to_parameters[arg_id] = SubArrayRef( + tuple(updated_swept_inames), + (var(pack_name).index(tuple(updated_swept_inames)))) else: - new_id_to_parameters[id] = p + new_id_to_parameters[arg_id] = p if packing_insns: subst_mapper = SubstitutionMapper(make_subst_func(ilp_inames_map)) @@ -315,7 +314,4 @@ def pack_and_unpack_args_for_call(kernel, call_name, args_to_pack=None, return kernel -# }}} - - # vim: foldmethod=marker diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index acc21b09db709c78d3c0b07c91279bcf9affe412..52d568975216699f53b6a038d0ce775b89dbc4b0 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -268,8 +268,9 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, dtype=None, fetch_bounding_box=False, - temporary_scope=None, temporary_is_local=None, - compute_insn_id=None): + temporary_address_space=None, + compute_insn_id=None, + **kwargs): """Precompute the expression described in the substitution rule determined by *subst_use* and store it in a temporary array. A precomputation needs two things to operate, a list of *sweep_inames* (order irrelevant) and an @@ -355,27 +356,30 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, eliminated. 
""" - # {{{ unify temporary_scope / temporary_is_local + # {{{ unify temporary_address_space / temporary_scope + + temporary_scope = kwargs.pop("temporary_scope", None) from loopy.kernel.data import AddressSpace - if temporary_is_local is not None: + if temporary_scope is not None: from warnings import warn - warn("temporary_is_local is deprecated. Use temporary_scope instead", + warn("temporary_scope is deprecated. Use temporary_address_space instead", DeprecationWarning, stacklevel=2) - if temporary_scope is not None: - raise LoopyError("may not specify both temporary_is_local and " + if temporary_address_space is not None: + raise LoopyError("may not specify both temporary_address_space and " "temporary_scope") - if temporary_is_local: - temporary_scope = AddressSpace.LOCAL - else: - temporary_scope = AddressSpace.PRIVATE + temporary_address_space = temporary_scope - del temporary_is_local + del temporary_scope # }}} + if kwargs: + raise TypeError("unrecognized keyword arguments: %s" + % ", ".join(kwargs.keys())) + # {{{ check, standardize arguments if isinstance(sweep_inames, str): @@ -847,7 +851,7 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, compute_dep_id = compute_insn_id added_compute_insns = [compute_insn] - if temporary_scope == AddressSpace.GLOBAL: + if temporary_address_space == AddressSpace.GLOBAL: barrier_insn_id = kernel.make_unique_instruction_id( based_on=c_subst_name+"_barrier") from loopy.kernel.instruction import BarrierInstruction @@ -959,8 +963,8 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, import loopy as lp - if temporary_scope is None: - temporary_scope = lp.auto + if temporary_address_space is None: + temporary_address_space = lp.auto new_temp_shape = tuple(abm.non1_storage_shape) @@ -971,7 +975,7 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, dtype=dtype, base_indices=(0,)*len(new_temp_shape), shape=tuple(abm.non1_storage_shape), - scope=temporary_scope, + 
address_space=temporary_address_space, dim_names=tuple(non1_storage_axis_names)) else: @@ -1009,20 +1013,20 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, temp_var = temp_var.copy(shape=new_temp_shape) - if temporary_scope == temp_var.scope: + if temporary_address_space == temp_var.address_space: pass - elif temporary_scope is lp.auto: - temporary_scope = temp_var.scope - elif temp_var.scope is lp.auto: + elif temporary_address_space is lp.auto: + temporary_address_space = temp_var.address_space + elif temp_var.address_space is lp.auto: pass else: raise LoopyError("Existing and new temporary '%s' do not " "have matching scopes (existing: %s, new: %s)" % (temporary_name, - AddressSpace.stringify(temp_var.scope), - AddressSpace.stringify(temporary_scope))) + AddressSpace.stringify(temp_var.address_space), + AddressSpace.stringify(temporary_address_space))) - temp_var = temp_var.copy(scope=temporary_scope) + temp_var = temp_var.copy(address_space=temporary_address_space) # }}} diff --git a/loopy/transform/save.py b/loopy/transform/save.py index 0283b84f970a74d3e78d1f09d6f428c7daf5b7ee..cca62bc522bb110ec4aeb190b538e5b6e8583abf 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -441,7 +441,7 @@ class TemporarySaver(object): group_sizes, local_sizes = ( self.kernel.get_grid_sizes_for_insn_ids_as_exprs(accessor_insn_ids)) - if temporary.scope == lp.AddressSpace.LOCAL: + if temporary.address_space == lp.AddressSpace.LOCAL: # Elide local axes in the save slot for local temporaries. 
del local_tags[:] local_sizes = () @@ -454,7 +454,7 @@ class TemporarySaver(object): def auto_promote_temporary(self, temporary_name): temporary = self.kernel.temporary_variables[temporary_name] - if temporary.scope == AddressSpace.GLOBAL: + if temporary.address_space == AddressSpace.GLOBAL: # Nothing to be done for global temporaries (I hope) return None @@ -673,7 +673,7 @@ class TemporarySaver(object): domain = domain.set_dim_name( isl.dim_type.set, orig_dim + dim_idx, new_iname) - if orig_temporary.is_local: + if orig_temporary.address_space == AddressSpace.LOCAL: # If the temporary has local scope, then loads / stores can # be done in parallel. from loopy.kernel.data import AutoFitLocalIndexTag diff --git a/test/test_callables.py b/test/test_callables.py new file mode 100644 index 0000000000000000000000000000000000000000..3b27b2d5b66d1da63277dcbf6af7bda9b3a3f9d7 --- /dev/null +++ b/test/test_callables.py @@ -0,0 +1,415 @@ +from __future__ import division, absolute_import, print_function + +__copyright__ = "Copyright (C) 2018 Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import numpy as np +import pyopencl as cl +import pyopencl.clrandom # noqa: F401 +import loopy as lp +import pytest +import sys + + +from pyopencl.tools import ( # noqa: F401 + pytest_generate_tests_for_pyopencl + as pytest_generate_tests) + +from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa: F401 + + +def test_register_function_lookup(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + from testlib import register_log2_lookup + + x = np.random.rand(10) + ctx = cl.create_some_context() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "{[i]: 0<=i<10}", + """ + y[i] = log2(x[i]) + """) + knl = lp.register_function_lookup(knl, register_log2_lookup) + + evt, (out, ) = knl(queue, x=x) + + assert np.linalg.norm(np.log2(x)-out)/np.linalg.norm(np.log2(x)) < 1e-15 + + +@pytest.mark.parametrize("inline", [False, True]) +def test_register_knl(ctx_factory, inline): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + n = 2 ** 4 + + x = np.random.rand(n, n, n, n, n) + y = np.random.rand(n, n, n, n, n) + + grandchild_knl = lp.make_kernel( + "{[i, j]:0<= i, j< 16}", + """ + c[i, j] = 2*a[i, j] + 3*b[i, j] + """) + + child_knl = lp.make_kernel( + "{[i, j]:0<=i, j < 16}", + """ + [i, j]: g[i, j] = linear_combo1([i, j]: e[i, j], [i, j]: f[i, j]) + """) + + parent_knl = lp.make_kernel( + "{[i, j, k, l, m]: 0<=i, j, k, l, m<16}", + """ + [j, l]: z[i, j, k, l, m] = linear_combo2([j, l]: x[i, j, k, l, m], + [j, l]: y[i, j, k, l, m]) + """, + kernel_data=[ + lp.GlobalArg( + name='x', + dtype=np.float64, + shape=(16, 16, 16, 16, 16)), + lp.GlobalArg( + name='y', + dtype=np.float64, + shape=(16, 16, 16, 16, 16)), '...'], + ) + + child_knl = lp.register_callable_kernel( + 
child_knl, 'linear_combo1', grandchild_knl) + knl = lp.register_callable_kernel( + parent_knl, 'linear_combo2', child_knl) + if inline: + knl = lp.inline_callable_kernel(knl, 'linear_combo2') + knl = lp.inline_callable_kernel(knl, 'linear_combo1') + + evt, (out, ) = knl(queue, x=x, y=y) + + assert (np.linalg.norm(2*x+3*y-out)/( + np.linalg.norm(2*x+3*y))) < 1e-15 + + +@pytest.mark.parametrize("inline", [False, True]) +def test_slices_with_negative_step(ctx_factory, inline): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + n = 2 ** 4 + + x = np.random.rand(n, n, n, n, n) + y = np.random.rand(n, n, n, n, n) + + child_knl = lp.make_kernel( + "{[i, j]:0<=i, j < 16}", + """ + g[i, j] = 2*e[i, j] + 3*f[i, j] + """) + + parent_knl = lp.make_kernel( + "{[i, k, m]: 0<=i, k, m<16}", + """ + z[i, 15:-1:-1, k, :, m] = linear_combo(x[i, :, k, :, m], + y[i, :, k, :, m]) + """, + kernel_data=[ + lp.GlobalArg( + name='x', + dtype=np.float64, + shape=(16, 16, 16, 16, 16)), + lp.GlobalArg( + name='y', + dtype=np.float64, + shape=(16, 16, 16, 16, 16)), + lp.GlobalArg( + name='z', + dtype=np.float64, + shape=(16, 16, 16, 16, 16)), '...'], + ) + + knl = lp.register_callable_kernel( + parent_knl, 'linear_combo', child_knl) + if inline: + knl = lp.inline_callable_kernel(knl, 'linear_combo') + + evt, (out, ) = knl(queue, x=x, y=y) + + assert (np.linalg.norm(2*x+3*y-out[:, ::-1, :, :, :])/( + np.linalg.norm(2*x+3*y))) < 1e-15 + + +@pytest.mark.parametrize("inline", [False, True]) +def test_register_knl_with_call_with_kwargs(ctx_factory, inline): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + n = 2 ** 2 + + a_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float32) + b_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float32) + c_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) + + callee_knl = lp.make_kernel( + "{[i, j]:0<=i, j < %d}" % n, + """ + h[i, j] = 2 * e[i, j] + 3*f[i, j] + 4*g[i, j] + <>f1[i, j] = 2*f[i, j] + p[i, j] = 7 * e[i, j] + 4*f1[i, j] + 
2*g[i, j] + """, + [ + lp.GlobalArg('f'), + lp.GlobalArg('e'), + lp.GlobalArg('h'), + lp.GlobalArg('g'), + '...']) + + caller_knl = lp.make_kernel( + "{[i, j, k, l, m]: 0<=i, j, k, l, m<%d}" % n, + """ + <> d[i, j, k, l, m] = 2*b[i, j, k, l, m] + [j, l]: x[i, j, k, l, m], [j, l]: y[i, j, k, l, m] = linear_combo( + f=[j, l]: a[i, j, k, l, m], + g=[j, l]: d[i, j, k, l, m], + e=[j, l]: c[i, j, k, l, m]) + """) + + knl = lp.register_callable_kernel( + caller_knl, 'linear_combo', callee_knl) + if inline: + knl = lp.inline_callable_kernel(knl, 'linear_combo') + + evt, (out1, out2, ) = knl(queue, a=a_dev, b=b_dev, c=c_dev) + + a = a_dev.get() + b = b_dev.get() + c = c_dev.get() + + h = out1.get() # h = 2c + 3a + 8b + p = out2.get() # p = 7c + 8a + 4b + h_exact = 3*a + 8*b + 2*c + p_exact = 8*a + 4*b + 7*c + + assert np.linalg.norm(h-h_exact)/np.linalg.norm(h_exact) < 1e-7 + assert np.linalg.norm(p-p_exact)/np.linalg.norm(p_exact) < 1e-7 + + +@pytest.mark.parametrize("inline", [False, True]) +def test_register_knl_with_hw_axes(ctx_factory, inline): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + n = 2 ** 4 + + x_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) + y_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) + + callee_knl = lp.make_kernel( + "{[i, j]:0<=i, j < 16}", + """ + g[i, j] = 2*e[i, j] + 3*f[i, j] + """) + + callee_knl = lp.split_iname(callee_knl, "i", 4, inner_tag="l.0", outer_tag="g.0") + + caller_knl = lp.make_kernel( + "{[i, j, k, l, m]: 0<=i, j, k, l, m<16}", + """ + [j, l]: z[i, j, k, l, m] = linear_combo([j, l]: x[i, j, k, l, m], + [j, l]: y[i, j, k, l, m]) + """ + ) + caller_knl = lp.split_iname(caller_knl, "i", 4, inner_tag="l.1", outer_tag="g.1") + + knl = lp.register_callable_kernel( + caller_knl, 'linear_combo', callee_knl) + + if inline: + knl = lp.inline_callable_kernel(knl, 'linear_combo') + + evt, (out, ) = knl(queue, x=x_dev, y=y_dev) + + x_host = x_dev.get() + y_host = y_dev.get() + + assert 
np.linalg.norm(2*x_host+3*y_host-out.get())/np.linalg.norm( + 2*x_host+3*y_host) < 1e-15 + + +@pytest.mark.parametrize("inline", [False, True]) +def test_shape_translation_through_sub_array_ref(ctx_factory, inline): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + x1 = cl.clrandom.rand(queue, (3, 2), dtype=np.float64) + x2 = cl.clrandom.rand(queue, (6, ), dtype=np.float64) + x3 = cl.clrandom.rand(queue, (6, 6), dtype=np.float64) + + callee1 = lp.make_kernel( + "{[i]: 0<=i<6}", + """ + a[i] = 2*abs(b[i]) + """) + + callee2 = lp.make_kernel( + "{[i, j]: 0<=i<3 and 0 <= j < 2}", + """ + a[i, j] = 3*b[i, j] + """) + + callee3 = lp.make_kernel( + "{[i]: 0<=i<6}", + """ + a[i] = 5*b[i] + """) + + knl = lp.make_kernel( + "{[i, j, k, l]: 0<= i < 6 and 0 <= j < 3 and 0 <= k < 2 and 0<=l<6}", + """ + [i]: y1[i//2, i%2] = callee_fn1([i]: x1[i//2, i%2]) + [j, k]: y2[2*j+k] = callee_fn2([j, k]: x2[2*j+k]) + [l]: y3[l, l] = callee_fn3([l]: x3[l, l]) + """) + + knl = lp.register_callable_kernel(knl, 'callee_fn1', callee1) + knl = lp.register_callable_kernel(knl, 'callee_fn2', callee2) + knl = lp.register_callable_kernel(knl, 'callee_fn3', callee3) + + if inline: + knl = lp.inline_callable_kernel(knl, 'callee_fn1') + knl = lp.inline_callable_kernel(knl, 'callee_fn2') + knl = lp.inline_callable_kernel(knl, 'callee_fn3') + + knl = lp.set_options(knl, "write_cl") + knl = lp.set_options(knl, "return_dict") + evt, out_dict = knl(queue, x1=x1, x2=x2, x3=x3) + + y1 = out_dict['y1'].get() + y2 = out_dict['y2'].get() + y3 = out_dict['y3'].get() + + assert (np.linalg.norm(y1-2*x1.get())) < 1e-15 + assert (np.linalg.norm(y2-3*x2.get())) < 1e-15 + assert (np.linalg.norm(np.diag(y3-5*x3.get()))) < 1e-15 + + +def test_multi_arg_array_call(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + import pymbolic.primitives as p + n = 10 + acc_i = p.Variable("acc_i") + i = p.Variable("i") + index = p.Variable("index") + a_i = p.Subscript(p.Variable("a"), p.Variable("i")) + 
argmin_kernel = lp.make_kernel( + "{[i]: 0 <= i < n}", + [ + lp.Assignment(id="init2", assignee=index, + expression=0), + lp.Assignment(id="init1", assignee=acc_i, + expression="214748367"), + lp.Assignment(id="insn", assignee=index, + expression=p.If(p.Expression.eq(acc_i, a_i), i, index), + depends_on="update"), + lp.Assignment(id="update", assignee=acc_i, + expression=p.Variable("min")(acc_i, a_i), + depends_on="init1,init2")]) + + argmin_kernel = lp.fix_parameters(argmin_kernel, n=n) + + knl = lp.make_kernel( + "{[i]:0<=i 1: + exec(sys.argv[1]) + else: + from pytest import main + main([__file__]) + +# vim: foldmethod=marker diff --git a/test/test_loopy.py b/test/test_loopy.py index c069916e5233e91cf8ed0f042b6c7747ec69bee2..accf9c1dff5a1f660871dd63d6af3337aced6490 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -69,7 +69,7 @@ def test_globals_decl_once_with_multi_subprogram(ctx_factory): """, [lp.TemporaryVariable( 'cnst', shape=('n'), initializer=cnst, - scope=lp.temp_var_scope.GLOBAL, + scope=lp.AddressSpace.GLOBAL, read_only=True), '...']) knl = lp.fix_parameters(knl, n=16) knl = lp.add_barrier(knl, "id:first", "id:second") @@ -1070,7 +1070,7 @@ def test_atomic(ctx_factory, dtype): def test_atomic_load(ctx_factory, dtype): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - from loopy.kernel.data import temp_var_scope as scopes + from loopy.kernel.data import AddressSpace n = 10 vec_width = 4 @@ -1108,7 +1108,7 @@ def test_atomic_load(ctx_factory, dtype): lp.GlobalArg("a", dtype, shape=lp.auto), lp.GlobalArg("b", dtype, shape=lp.auto), lp.TemporaryVariable('temp', dtype, for_atomic=True, - scope=scopes.LOCAL), + scope=AddressSpace.LOCAL), "..." 
], silenced_warnings=["write_race(init)", "write_race(temp_sum)"]) @@ -1895,8 +1895,8 @@ def test_global_barrier(ctx_factory): print(knl) knl = lp.preprocess_kernel(knl) - assert knl.temporary_variables["z"].scope == lp.temp_var_scope.GLOBAL - assert knl.temporary_variables["v"].scope == lp.temp_var_scope.GLOBAL + assert knl.temporary_variables["z"].address_space == lp.AddressSpace.GLOBAL + assert knl.temporary_variables["v"].address_space == lp.AddressSpace.GLOBAL print(knl) @@ -2023,7 +2023,7 @@ def test_temp_initializer(ctx_factory, src_order, tmp_order): lp.TemporaryVariable("tmp", initializer=a, shape=lp.auto, - scope=lp.temp_var_scope.PRIVATE, + scope=lp.AddressSpace.PRIVATE, read_only=True, order=tmp_order), "..." @@ -2048,7 +2048,7 @@ def test_const_temp_with_initializer_not_saved(): lp.TemporaryVariable("tmp", initializer=np.arange(10), shape=lp.auto, - scope=lp.temp_var_scope.PRIVATE, + scope=lp.AddressSpace.PRIVATE, read_only=True), "..." ], @@ -2264,7 +2264,6 @@ def test_integer_reduction(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - from loopy.kernel.data import temp_var_scope as scopes from loopy.types import to_loopy_type n = 200 @@ -2272,7 +2271,7 @@ def test_integer_reduction(ctx_factory): var_int = np.random.randint(1000, size=n).astype(vtype) var_lp = lp.TemporaryVariable('var', initializer=var_int, read_only=True, - scope=scopes.PRIVATE, + scope=lp.AddressSpace.PRIVATE, dtype=to_loopy_type(vtype), shape=lp.auto) @@ -2453,8 +2452,6 @@ def test_barrier_insertion_near_bottom_of_loop(): def test_barrier_in_overridden_get_grid_size_expanded_kernel(): - from loopy.kernel.data import temp_var_scope as scopes - # make simple barrier'd kernel knl = lp.make_kernel('{[i]: 0 <= i < 10}', """ @@ -2465,7 +2462,7 @@ def test_barrier_in_overridden_get_grid_size_expanded_kernel(): end """, [lp.TemporaryVariable("a", np.float32, shape=(10,), order='C', - scope=scopes.LOCAL), + scope=lp.AddressSpace.LOCAL), lp.GlobalArg("b", np.float32, 
shape=(11,), order='C')], seq_dependencies=True) @@ -2690,7 +2687,6 @@ def test_wildcard_dep_matching(): def test_preamble_with_separate_temporaries(ctx_factory): - from loopy.kernel.data import temp_var_scope as scopes # create a function mangler # and finally create a test @@ -2717,7 +2713,8 @@ def test_preamble_with_separate_temporaries(ctx_factory): """, [lp.GlobalArg('out', shape=('n',)), lp.TemporaryVariable( - 'offsets', shape=(offsets.size,), initializer=offsets, scope=scopes.GLOBAL, + 'offsets', shape=(offsets.size,), initializer=offsets, + scope=lp.AddressSpace.GLOBAL, read_only=True), lp.GlobalArg('data', shape=(data.size,), dtype=np.float64)], ) @@ -2851,7 +2848,7 @@ def test_no_barriers_for_nonoverlapping_access(second_index, expect_barrier): """ % second_index, [ lp.TemporaryVariable("a", lp.auto, shape=(256,), - scope=lp.temp_var_scope.LOCAL), + scope=lp.AddressSpace.LOCAL), ]) knl = lp.tag_inames(knl, "i:l.0") diff --git a/test/test_transform.py b/test/test_transform.py index 6e441976aac739bec2774cbf69e15a855d4f86a4..ed184fb50c099d5fb2a6a0941d2f2c22c3b757bc 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -182,370 +182,6 @@ def test_add_barrier(ctx_factory): assert (np.linalg.norm(out-2*a.T) < 1e-16) -def test_register_function_lookup(ctx_factory): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - - from testlib import register_log2_lookup - - x = np.random.rand(10) - ctx = cl.create_some_context() - queue = cl.CommandQueue(ctx) - - knl = lp.make_kernel( - "{[i]: 0<=i<10}", - """ - y[i] = log2(x[i]) - """) - knl = lp.register_function_lookup(knl, register_log2_lookup) - - evt, (out, ) = knl(queue, x=x) - - assert np.linalg.norm(np.log2(x)-out)/np.linalg.norm(np.log2(x)) < 1e-15 - - -@pytest.mark.parametrize("inline", [False, True]) -def test_register_knl(ctx_factory, inline): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - n = 2 ** 4 - - x = np.random.rand(n, n, n, n, n) - y = np.random.rand(n, n, n, n, n) - - 
grandchild_knl = lp.make_kernel( - "{[i, j]:0<= i, j< 16}", - """ - c[i, j] = 2*a[i, j] + 3*b[i, j] - """) - - child_knl = lp.make_kernel( - "{[i, j]:0<=i, j < 16}", - """ - [i, j]: g[i, j] = linear_combo1([i, j]: e[i, j], [i, j]: f[i, j]) - """) - - parent_knl = lp.make_kernel( - "{[i, j, k, l, m]: 0<=i, j, k, l, m<16}", - """ - [j, l]: z[i, j, k, l, m] = linear_combo2([j, l]: x[i, j, k, l, m], - [j, l]: y[i, j, k, l, m]) - """, - kernel_data=[ - lp.GlobalArg( - name='x', - dtype=np.float64, - shape=(16, 16, 16, 16, 16)), - lp.GlobalArg( - name='y', - dtype=np.float64, - shape=(16, 16, 16, 16, 16)), '...'], - ) - - child_knl = lp.register_callable_kernel( - child_knl, 'linear_combo1', grandchild_knl) - knl = lp.register_callable_kernel( - parent_knl, 'linear_combo2', child_knl) - if inline: - knl = lp.inline_callable_kernel(knl, 'linear_combo2') - knl = lp.inline_callable_kernel(knl, 'linear_combo1') - - evt, (out, ) = knl(queue, x=x, y=y) - - assert (np.linalg.norm(2*x+3*y-out)/( - np.linalg.norm(2*x+3*y))) < 1e-15 - - -@pytest.mark.parametrize("inline", [False, True]) -def test_slices_with_negative_step(ctx_factory, inline): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - n = 2 ** 4 - - x = np.random.rand(n, n, n, n, n) - y = np.random.rand(n, n, n, n, n) - - child_knl = lp.make_kernel( - "{[i, j]:0<=i, j < 16}", - """ - g[i, j] = 2*e[i, j] + 3*f[i, j] - """) - - parent_knl = lp.make_kernel( - "{[i, k, m]: 0<=i, k, m<16}", - """ - z[i, 15:-1:-1, k, :, m] = linear_combo(x[i, :, k, :, m], - y[i, :, k, :, m]) - """, - kernel_data=[ - lp.GlobalArg( - name='x', - dtype=np.float64, - shape=(16, 16, 16, 16, 16)), - lp.GlobalArg( - name='y', - dtype=np.float64, - shape=(16, 16, 16, 16, 16)), - lp.GlobalArg( - name='z', - dtype=np.float64, - shape=(16, 16, 16, 16, 16)), '...'], - ) - - knl = lp.register_callable_kernel( - parent_knl, 'linear_combo', child_knl) - if inline: - knl = lp.inline_callable_kernel(knl, 'linear_combo') - - evt, (out, ) = knl(queue, x=x, 
y=y) - - assert (np.linalg.norm(2*x+3*y-out[:, ::-1, :, :, :])/( - np.linalg.norm(2*x+3*y))) < 1e-15 - - -@pytest.mark.parametrize("inline", [False, True]) -def test_register_knl_with_call_with_kwargs(ctx_factory, inline): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - - n = 2 ** 2 - - a_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float32) - b_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float32) - c_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) - - callee_knl = lp.make_kernel( - "{[i, j]:0<=i, j < %d}" % n, - """ - h[i, j] = 2 * e[i, j] + 3*f[i, j] + 4*g[i, j] - <>f1[i, j] = 2*f[i, j] - p[i, j] = 7 * e[i, j] + 4*f1[i, j] + 2*g[i, j] - """, - [lp.ArrayArg('f'), lp.ArrayArg('e'), lp.ArrayArg('h'), - lp.ArrayArg('g'), '...']) - - caller_knl = lp.make_kernel( - "{[i, j, k, l, m]: 0<=i, j, k, l, m<%d}" % n, - """ - <> d[i, j, k, l, m] = 2*b[i, j, k, l, m] - [j, l]: x[i, j, k, l, m], [j, l]: y[i, j, k, l, m] = linear_combo( - f=[j, l]: a[i, j, k, l, m], - g=[j, l]: d[i, j, k, l, m], - e=[j, l]: c[i, j, k, l, m]) - """) - - knl = lp.register_callable_kernel( - caller_knl, 'linear_combo', callee_knl) - if inline: - knl = lp.inline_callable_kernel(knl, 'linear_combo') - - evt, (out1, out2, ) = knl(queue, a=a_dev, b=b_dev, c=c_dev) - - a = a_dev.get() - b = b_dev.get() - c = c_dev.get() - - h = out1.get() # h = 2c + 3a + 8b - p = out2.get() # p = 7c + 8a + 4b - h_exact = 3*a + 8*b + 2*c - p_exact = 8*a + 4*b + 7*c - - assert np.linalg.norm(h-h_exact)/np.linalg.norm(h_exact) < 1e-7 - assert np.linalg.norm(p-p_exact)/np.linalg.norm(p_exact) < 1e-7 - - -@pytest.mark.parametrize("inline", [False, True]) -def test_register_knl_with_hw_axes(ctx_factory, inline): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - - n = 2 ** 4 - - x_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) - y_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) - - callee_knl = lp.make_kernel( - "{[i, j]:0<=i, j < 16}", - """ - g[i, j] = 2*e[i, j] + 
3*f[i, j] - """) - - callee_knl = lp.split_iname(callee_knl, "i", 4, inner_tag="l.0", outer_tag="g.0") - - caller_knl = lp.make_kernel( - "{[i, j, k, l, m]: 0<=i, j, k, l, m<16}", - """ - [j, l]: z[i, j, k, l, m] = linear_combo([j, l]: x[i, j, k, l, m], - [j, l]: y[i, j, k, l, m]) - """ - ) - caller_knl = lp.split_iname(caller_knl, "i", 4, inner_tag="l.1", outer_tag="g.1") - - knl = lp.register_callable_kernel( - caller_knl, 'linear_combo', callee_knl) - - if inline: - knl = lp.inline_callable_kernel(knl, 'linear_combo') - - evt, (out, ) = knl(queue, x=x_dev, y=y_dev) - - x_host = x_dev.get() - y_host = y_dev.get() - - assert np.linalg.norm(2*x_host+3*y_host-out.get())/np.linalg.norm( - 2*x_host+3*y_host) < 1e-15 - - -@pytest.mark.parametrize("inline", [False, True]) -def test_shape_translation_through_sub_array_ref(ctx_factory, inline): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - - x1 = cl.clrandom.rand(queue, (3, 2), dtype=np.float64) - x2 = cl.clrandom.rand(queue, (6, ), dtype=np.float64) - x3 = cl.clrandom.rand(queue, (6, 6), dtype=np.float64) - - callee1 = lp.make_kernel( - "{[i]: 0<=i<6}", - """ - a[i] = 2*abs(b[i]) - """) - - callee2 = lp.make_kernel( - "{[i, j]: 0<=i<3 and 0 <= j < 2}", - """ - a[i, j] = 3*b[i, j] - """) - - callee3 = lp.make_kernel( - "{[i]: 0<=i<6}", - """ - a[i] = 5*b[i] - """) - - knl = lp.make_kernel( - "{[i, j, k, l]: 0<= i < 6 and 0 <= j < 3 and 0 <= k < 2 and 0<=l<6}", - """ - [i]: y1[i//2, i%2] = callee_fn1([i]: x1[i//2, i%2]) - [j, k]: y2[2*j+k] = callee_fn2([j, k]: x2[2*j+k]) - [l]: y3[l, l] = callee_fn3([l]: x3[l, l]) - """) - - knl = lp.register_callable_kernel(knl, 'callee_fn1', callee1) - knl = lp.register_callable_kernel(knl, 'callee_fn2', callee2) - knl = lp.register_callable_kernel(knl, 'callee_fn3', callee3) - - if inline: - knl = lp.inline_callable_kernel(knl, 'callee_fn1') - knl = lp.inline_callable_kernel(knl, 'callee_fn2') - knl = lp.inline_callable_kernel(knl, 'callee_fn3') - - knl = lp.set_options(knl, 
"write_cl") - knl = lp.set_options(knl, "return_dict") - evt, out_dict = knl(queue, x1=x1, x2=x2, x3=x3) - - y1 = out_dict['y1'].get() - y2 = out_dict['y2'].get() - y3 = out_dict['y3'].get() - - assert (np.linalg.norm(y1-2*x1.get())) < 1e-15 - assert (np.linalg.norm(y2-3*x2.get())) < 1e-15 - assert (np.linalg.norm(np.diag(y3-5*x3.get()))) < 1e-15 - - -def test_multi_arg_array_call(ctx_factory): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - import pymbolic.primitives as p - n = 10 - acc_i = p.Variable("acc_i") - i = p.Variable("i") - index = p.Variable("index") - a_i = p.Subscript(p.Variable("a"), p.Variable("i")) - argmin_kernel = lp.make_kernel( - "{[i]: 0 <= i < n}", - [ - lp.Assignment(id="init2", assignee=index, - expression=0), - lp.Assignment(id="init1", assignee=acc_i, - expression="214748367"), - lp.Assignment(id="insn", assignee=index, - expression=p.If(p.Expression.eq(acc_i, a_i), i, index), - depends_on="update"), - lp.Assignment(id="update", assignee=acc_i, - expression=p.Variable("min")(acc_i, a_i), - depends_on="init1,init2")]) - - argmin_kernel = lp.fix_parameters(argmin_kernel, n=n) - - knl = lp.make_kernel( - "{[i]:0<=i