diff --git a/README.rst b/README.rst index 0e551fbede0460a2e7c76167b54d672afdf81286..94b485a64c18b5e9e727a561f70127269d04a16c 100644 --- a/README.rst +++ b/README.rst @@ -25,7 +25,7 @@ It can capture the following types of optimizations: * Vector and multi-core parallelism in the OpenCL/CUDA model * Data layout transformations (structure of arrays to array of structures) -* Loopy Unrolling +* Loop unrolling * Loop tiling with efficient handling of boundary cases * Prefetching/copy optimizations * Instruction level parallelism diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 69f89548618e86b408a31af240bee84678c859c1..7196dad863474d9b6ea9df9d9d0ae90b3e14986d 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -111,9 +111,9 @@ always see loopy's view of a kernel by printing it. KERNEL: loopy_kernel --------------------------------------------------------------------------- ARGUMENTS: - a: GlobalArg, type: <runtime>, shape: (n), dim_tags: (N0:stride:1) - n: ValueArg, type: <runtime> - out: GlobalArg, type: <runtime>, shape: (n), dim_tags: (N0:stride:1) + a: GlobalArg, type: <auto/runtime>, shape: (n), dim_tags: (N0:stride:1) + n: ValueArg, type: <auto/runtime> + out: GlobalArg, type: <auto/runtime>, shape: (n), dim_tags: (N0:stride:1) --------------------------------------------------------------------------- DOMAINS: [n] -> { [i] : 0 <= i < n } @@ -154,7 +154,7 @@ following: See :ref:`specifying-arguments`. * Loopy has not determined the type of ``a`` and ``out``. The data type is - given as ``<runtime>``, which means that these types will be determined + given as ``<auto/runtime>``, which means that these types will be determined by the data passed in when the kernel is invoked. Loopy generates (and caches!) a copy of the kernel for each combination of types passed in. 
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 88a5717642af6d9ebc1bd7770936ae44e8cbf44b..038ef23ac08ce3bbc71a1fd1fce40181c6f8d9bb 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1081,7 +1081,9 @@ class LoopKernel(ImmutableRecordWithoutPickling): warn_with_kernel(self, "iname-order", "get_visual_iname_order_embedding() could not determine a " - "consistent iname nesting order") + "consistent iname nesting order. This is a possible indication " + "that the kernel may not schedule successfully, but for now " + "it only impacts printing of the kernel.") embedding = dict((iname, iname) for iname in self.all_inames()) return embedding diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py index 5d4240b9ab3e1ce2ad356a93b5e21b3bbf4d499e..b672f0227b1b8ba931b844b80a24b75c9625286d 100644 --- a/loopy/kernel/array.py +++ b/loopy/kernel/array.py @@ -549,15 +549,55 @@ class ArrayBase(ImmutableRecord): .. attribute :: name .. attribute :: dtype + The :class:`loopy.LoopyType` of the array. + If this is *None*, :mod:`loopy` will try to continue without + knowing the type of this array, where the idea is that precise + knowledge of the type will become available at invocation time. + :class:`loopy.CompiledKernel` (and thereby + :meth:`loopy.LoopKernel.__call__`) automatically add this type + information based on invocation arguments. + + Note that some transformations, such as :func:`loopy.add_padding` + cannot be performed without knowledge of the exact *dtype*. .. attribute :: shape + May be one of the following: + + * *None*. In this case, no shape is intended to be specified, + only the strides will be used to access the array. Bounds checking + will not be performed. + + * :class:`loopy.auto`. The shape will be determined by finding the + access footprint. + + * a tuple like :attr:`numpy.ndarray.shape`. 
+ + Each entry of the tuple is also allowed to be a :mod:`pymbolic` + expression involving kernel parameters, or a (potentially-comma + separated) or a string that can be parsed to such an expression. + + Any element of the shape tuple not used to compute strides + may be *None*. + .. attribute:: dim_tags See :ref:`data-dim-tags`. .. attribute:: offset + Offset from the beginning of the buffer to the point from + which the strides are counted. May be one of + + * 0 or None + * a string (that is interpreted as an argument name). + * a pymbolic expression + * :class:`loopy.auto`, in which case an offset argument + is added automatically, immediately following this argument. + :class:`loopy.CompiledKernel` is even smarter in its treatment of + this case and will compile custom versions of the kernel based on + whether the passed arrays have offsets or not. + .. attribute:: dim_names A tuple of strings providing names for the array axes, or *None*. @@ -568,6 +608,21 @@ class ArrayBase(ImmutableRecord): to generate more informative names than could be achieved by axis numbers. + .. attribute:: alignment + + Memory alignment of the array in bytes. For temporary arrays, + this ensures they are allocated with this alignment. For arguments, + this entails a promise that the incoming array obeys this alignment + restriction. + + Defaults to *None*. + + If an integer N is given, the array would be declared + with ``__attribute__((aligned(N)))`` in code generation for + :class:`loopy.CTarget`. + + .. versionadded:: 2018.1 + .. automethod:: __init__ .. automethod:: __eq__ .. automethod:: num_user_axes @@ -584,46 +639,18 @@ class ArrayBase(ImmutableRecord): def __init__(self, name, dtype=None, shape=None, dim_tags=None, offset=0, dim_names=None, strides=None, order=None, for_atomic=False, - target=None, + target=None, alignment=None, **kwargs): """ All of the following (except *name*) are optional. Specify either strides or shape. 
- :arg name: May contain multiple names separated by - commas, in which case multiple arguments, - each with identical properties, are created - for each name. - :arg dtype: the :class:`numpy.dtype` of the array. - If this is *None*, :mod:`loopy` will try to continue without - knowing the type of this array, where the idea is that precise - knowledge of the type will become available at invocation time. - :class:`loopy.CompiledKernel` (and thereby - :meth:`loopy.LoopKernel.__call__`) automatically add this type - information based on invocation arguments. - - Note that some transformations, such as :func:`loopy.add_padding` - cannot be performed without knowledge of the exact *dtype*. + :arg name: When passed to :class:`loopy.make_kernel`, this may contain + multiple names separated by commas, in which case multiple arguments, + each with identical properties, are created for each name. - :arg shape: May be one of the following: - - * *None*. In this case, no shape is intended to be specified, - only the strides will be used to access the array. Bounds checking - will not be performed. - - * :class:`loopy.auto`. The shape will be determined by finding the - access footprint. - - * a tuple like like :attr:`numpy.ndarray.shape`. - - Each entry of the tuple is also allowed to be a :mod:`pymbolic` - expression involving kernel parameters, or a (potentially-comma - separated) or a string that can be parsed to such an expression. - - Any element of the shape tuple not used to compute strides - may be *None*. - - * A string which can be parsed into the previous form. + :arg shape: May be any of the things specified under :attr:`shape`, + or a string which can be parsed into the previous form. :arg dim_tags: A comma-separated list of tags as understood by :func:`parse_array_dim_tag`. @@ -649,17 +676,9 @@ class ArrayBase(ImmutableRecord): :arg for_atomic: Whether the array is declared for atomic access, and, if necessary, using atomic-capable data types. 
- :arg offset: Offset from the beginning of the buffer to the point from - which the strides are counted. May be one of + :arg offset: (See :attr:`offset`) + :arg alignment: memory alignment in bytes - * 0 or None - * a string (that is interpreted as an argument name). - * a pymbolic expression - * :class:`loopy.auto`, in which case an offset argument - is added automatically, immediately following this argument. - :class:`loopy.CompiledKernel` is even smarter in its treatment of - this case and will compile custom versions of the kernel based on - whether the passed arrays have offsets or not. """ for kwarg_name in kwargs: @@ -672,6 +691,14 @@ class ArrayBase(ImmutableRecord): dtype = to_loopy_type(dtype, allow_auto=True, allow_none=True, for_atomic=for_atomic, target=target) + if dtype is lp.auto: + from warnings import warn + warn("Argument/temporary data type should be None if unspecified, " + "not auto. This usage will be disallowed in 2018.", + DeprecationWarning, stacklevel=2) + + dtype = None + strides_known = strides is not None and strides is not lp.auto shape_known = shape is not None and shape is not lp.auto @@ -805,6 +832,7 @@ class ArrayBase(ImmutableRecord): offset=offset, dim_names=dim_names, order=order, + alignment=alignment, **kwargs) def __eq__(self, other): @@ -832,10 +860,10 @@ class ArrayBase(ImmutableRecord): if include_typename: info_entries.append(type(self).__name__) - if self.dtype is lp.auto: - type_str = "<auto>" - elif self.dtype is None: - type_str = "<runtime>" + assert self.dtype is not lp.auto + + if self.dtype is None: + type_str = "<auto/runtime>" else: type_str = str(self.dtype) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index f7667ca639e649a8f25b6e5d8975710742aef9a6..4a08c28bd8091425293892384e01d20447413cd5 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1004,7 +1004,7 @@ def _find_existentially_quantified_inames(dom_str): def parse_domains(domains, defines): - if 
isinstance(domains, str): + if isinstance(domains, (isl.BasicSet, str)): domains = [domains] result = [] @@ -1106,6 +1106,9 @@ class ArgumentGuesser: self.all_written_names = set() from loopy.symbolic import get_dependencies for insn in instructions: + for pred in insn.predicates: + self.all_names.update(get_dependencies(self.submap(pred))) + if isinstance(insn, MultiAssignmentBase): for assignee_var_name in insn.assignee_var_names(): self.all_written_names.add(assignee_var_name) diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index a4e6036cbac0235590d7cc66a201c47ac87d6030..c90e8a64b6f47a87e87c5e64d2ef930232d34894 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -219,9 +219,20 @@ class KernelArgument(ImmutableRecord): dtype = kwargs.pop("dtype", None) from loopy.types import to_loopy_type - kwargs["dtype"] = to_loopy_type( + dtype = to_loopy_type( dtype, allow_auto=True, allow_none=True, target=target) + import loopy as lp + if dtype is lp.auto: + from warnings import warn + warn("Argument/temporary data type should be None if unspecified, " + "not auto. 
This usage will be disallowed in 2018.", + DeprecationWarning, stacklevel=2) + + dtype = None + + kwargs["dtype"] = dtype + ImmutableRecord.__init__(self, **kwargs) @@ -268,10 +279,10 @@ class ValueArg(KernelArgument): def __str__(self): import loopy as lp - if self.dtype is lp.auto: - type_str = "<auto>" - elif self.dtype is None: - type_str = "<runtime>" + assert self.dtype is not lp.auto + + if self.dtype is None: + type_str = "<auto/runtime>" else: type_str = str(self.dtype) @@ -449,7 +460,7 @@ class TemporaryVariable(ArrayBase): % name) ArrayBase.__init__(self, name=intern(name), - dtype=dtype, shape=shape, + dtype=dtype, shape=shape, strides=strides, dim_tags=dim_tags, offset=offset, dim_names=dim_names, order=order, base_indices=base_indices, scope=scope, diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index a65e7fb4ceefd28a909dcb6cee24ea437f15a60e..fbc4238c21e966cb61d1c074ce6924fd9af26084 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -107,7 +107,7 @@ def get_arguments_with_incomplete_dtype(knl): if arg.dtype is None] -def add_and_infer_dtypes(knl, dtype_dict): +def add_and_infer_dtypes(knl, dtype_dict, expect_completion=False): processed_dtype_dict = {} for k, v in six.iteritems(dtype_dict): @@ -119,7 +119,7 @@ def add_and_infer_dtypes(knl, dtype_dict): knl = add_dtypes(knl, processed_dtype_dict) from loopy.type_inference import infer_unknown_types - return infer_unknown_types(knl, expect_completion=True) + return infer_unknown_types(knl, expect_completion=expect_completion) def _add_and_infer_dtypes_overdetermined(knl, dtype_dict): diff --git a/loopy/match.py b/loopy/match.py index ab0038af8dc5e9189a382bb76115998f57aef74e..3c047e463939cd67a4878d202a754c0cab48058d 100644 --- a/loopy/match.py +++ b/loopy/match.py @@ -134,6 +134,12 @@ class All(MatchExpressionBase): def __call__(self, kernel, matchable): return True + def __str__(self): + return "all" + + def __repr__(self): + return "%s()" % (type(self).__name__) + def 
update_persistent_hash(self, key_hash, key_builder): key_builder.rec(key_hash, "all_match_expr") @@ -144,18 +150,21 @@ class All(MatchExpressionBase): return hash(type(self)) -class And(MatchExpressionBase): +class MultiChildMatchExpressionBase(MatchExpressionBase): def __init__(self, children): self.children = children - def __call__(self, kernel, matchable): - return all(ch(kernel, matchable) for ch in self.children) - def __str__(self): - return "(%s)" % (" and ".join(str(ch) for ch in self.children)) + joiner = " %s " % type(self).__name__.lower() + return "(%s)" % (joiner.join(str(ch) for ch in self.children)) + + def __repr__(self): + return "%s(%s)" % ( + type(self).__name__, + ", ".join(repr(ch) for ch in self.children)) def update_persistent_hash(self, key_hash, key_builder): - key_builder.rec(key_hash, "and_match_expr") + key_builder.rec(key_hash, type(self).__name__) key_builder.rec(key_hash, self.children) def __eq__(self, other): @@ -166,26 +175,14 @@ class And(MatchExpressionBase): return hash((type(self), self.children)) -class Or(MatchExpressionBase): - def __init__(self, children): - self.children = children - +class And(MultiChildMatchExpressionBase): def __call__(self, kernel, matchable): - return any(ch(kernel, matchable) for ch in self.children) - - def __str__(self): - return "(%s)" % (" or ".join(str(ch) for ch in self.children)) - - def update_persistent_hash(self, key_hash, key_builder): - key_builder.rec(key_hash, "or_match_expr") - key_builder.rec(key_hash, self.children) + return all(ch(kernel, matchable) for ch in self.children) - def __eq__(self, other): - return (type(self) == type(other) - and self.children == other.children) - def __hash__(self): - return hash((type(self), self.children)) +class Or(MultiChildMatchExpressionBase): + def __call__(self, kernel, matchable): + return any(ch(kernel, matchable) for ch in self.children) class Not(MatchExpressionBase): @@ -198,6 +195,9 @@ class Not(MatchExpressionBase): def __str__(self): 
return "(not %s)" % str(self.child) + def __repr__(self): + return "%s(%r)" % (type(self).__name__, self.child) + def update_persistent_hash(self, key_hash, key_builder): key_builder.rec(key_hash, "not_match_expr") key_builder.rec(key_hash, self.child) @@ -222,6 +222,9 @@ class GlobMatchExpressionBase(MatchExpressionBase): descr = type(self).__name__ return descr.lower() + ":" + self.glob + def __repr__(self): + return "%s(%r)" % (type(self).__name__, self. glob) + def update_persistent_hash(self, key_hash, key_builder): key_builder.rec(key_hash, type(self).__name__) key_builder.rec(key_hash, self.glob) @@ -273,7 +276,7 @@ def parse_match(expr): """Syntax examples:: * ``id:yoink and writes:a_temp`` - * ``id:yoink and (not writes:a_temp or tagged:input)`` + * ``id:yoink and (not writes:a_temp or tag:input)`` """ if not expr: return All() diff --git a/loopy/options.py b/loopy/options.py index 25bb7014ce07a30c49f7f78d5a6325eaba36291d..13d0b752dfcfa0f0da233880f27f09a963ab4c81 100644 --- a/loopy/options.py +++ b/loopy/options.py @@ -112,6 +112,15 @@ class Options(ImmutableRecord): Do not check for or accept :mod:`numpy` arrays as arguments. + Defaults to *False*. + + .. attribute:: cl_exec_manage_array_events + + Within the PyOpenCL executor, respect and update + :attr:`pyopencl.array.Array.events`. + + Defaults to *True*. + .. 
attribute:: return_dict Have kernels return a :class:`dict` instead of a tuple as @@ -196,6 +205,7 @@ class Options(ImmutableRecord): skip_arg_checks=kwargs.get("skip_arg_checks", False), no_numpy=kwargs.get("no_numpy", False), + cl_exec_manage_array_events=kwargs.get("no_numpy", True), return_dict=kwargs.get("return_dict", False), write_wrapper=kwargs.get("write_wrapper", False), write_code=kwargs.get("write_code", False), diff --git a/loopy/preprocess.py b/loopy/preprocess.py index f2b5e7a87022e01bd51368cc3ef3cc60d507d958..ad119e94e74b294e16cdc15c5ab1f723cf7f254b 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -797,11 +797,10 @@ def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel): newly_added_assignments_ids.add(new_assignment_id) - import loopy as lp new_temporaries[new_assignee_name] = ( TemporaryVariable( name=new_assignee_name, - dtype=lp.auto, + dtype=None, scope=temp_var_scope.PRIVATE)) from pymbolic import var @@ -987,7 +986,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, new_temporary_variables[name] = TemporaryVariable( name=name, shape=(), - dtype=lp.auto, + dtype=None, scope=temp_var_scope.PRIVATE) from pymbolic import var diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 9536fc711a2266a5fae10e83d3d8de8974fc66c5..177daa02948b9c07ef1d9856dc04019e69e24897 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -194,7 +194,6 @@ def generate_array_literal(codegen_state, array, value): ecm = codegen_state.expression_to_code_mapper - from pymbolic.mapper.stringifier import PREC_NONE from loopy.expression import dtype_to_type_context from loopy.symbolic import ArrayLiteral @@ -203,7 +202,7 @@ def generate_array_literal(codegen_state, array, value): codegen_state.ast_builder.get_c_expression_to_code_mapper(), ArrayLiteral( tuple( - ecm(d_i, PREC_NONE, type_context, array.dtype).expr + ecm.map_constant(d_i, type_context) for d_i in data))) # }}} 
@@ -710,13 +709,18 @@ class CASTBuilder(ASTBuilderBase): ecm(p.flattened_product(decl_info.shape), prec=PREC_NONE, type_context="i")) + if temp_var.alignment: + from cgen import AlignedAttribute + temp_var_decl = AlignedAttribute(temp_var.alignment, temp_var_decl) + return temp_var_decl def wrap_temporary_decl(self, decl, scope): return decl def wrap_global_constant(self, decl): - return decl + from cgen import Static + return Static(decl) def get_value_arg_decl(self, name, shape, dtype, is_written): assert shape == () diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index 5efc58bb7cd8692594018a5f7a9bcf75278a3b9b..d8b76d32afa64d308648420904f4f4bf8e2e2316 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -105,12 +105,23 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): kernel_arg.dtype.numpy_dtype), order=order)) + expected_strides = tuple( + var("_lpy_expected_strides_%s" % i) + for i in range(num_axes)) + + gen("%s = %s.strides" % (strify(expected_strides), arg.name)) + #check strides if not skip_arg_checks: - gen("assert %(strides)s == %(name)s.strides, " + strides_check_expr = self.get_strides_check_expr( + (strify(s) for s in sym_shape), + (strify(s) for s in sym_strides), + (strify(s) for s in expected_strides)) + gen("assert %(strides_check)s, " "'Strides of loopy created array %(name)s, " "do not match expected.'" % - dict(name=arg.name, + dict(strides_check=strides_check_expr, + name=arg.name, strides=strify(sym_strides))) for i in range(num_axes): gen("del _lpy_shape_%d" % i) @@ -133,11 +144,13 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): # {{{ generate invocation - def generate_invocation(self, gen, kernel_name, args): + def generate_invocation(self, gen, kernel_name, args, + kernel, implemented_data_info): gen("for knl in _lpy_c_kernels:") with Indentation(gen): gen('knl({args})'.format( args=", ".join(args))) + # }}} # {{{ diff --git 
a/loopy/target/execution.py b/loopy/target/execution.py index 0304ec6f09eb2b014bb01a7b30889e24910e0dd9..3a3ea0a70fe9a9229aa3499ad0bdbfeb87f751ed 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -351,6 +351,13 @@ class ExecutionWrapperGeneratorBase(object): def get_arg_pass(self, arg): raise NotImplementedError() + def get_strides_check_expr(self, shape, strides, sym_strides): + # Returns an expression suitable for use for checking the strides of an + # argument. Arguments should be sequences of strings. + return " and ".join( + "(%s == 1 or %s == %s)" % elem + for elem in zip(shape, strides, sym_strides)) + # {{{ arg setup def generate_arg_setup( @@ -516,13 +523,34 @@ class ExecutionWrapperGeneratorBase(object): itemsize = kernel_arg.dtype.numpy_dtype.itemsize sym_strides = tuple( itemsize*s_i for s_i in arg.unvec_strides) - gen("if %s.strides != %s:" - % (arg.name, strify(sym_strides))) + + ndim = len(arg.unvec_shape) + shape = ["_lpy_shape_%d" % i for i in range(ndim)] + strides = ["_lpy_stride_%d" % i for i in range(ndim)] + + gen("(%s,) = %s.shape" % (", ".join(shape), arg.name)) + gen("(%s,) = %s.strides" % (", ".join(strides), arg.name)) + + gen("if not %s:" + % self.get_strides_check_expr( + shape, strides, + (strify(s) for s in sym_strides))) with Indentation(gen): + gen("_lpy_got = tuple(stride " + "for (dim, stride) in zip(%s.shape, %s.strides) " + "if dim > 1)" + % (arg.name, arg.name)) + gen("_lpy_expected = tuple(stride " + "for (dim, stride) in zip(%s.shape, %s) " + "if dim > 1)" + % (arg.name, strify_tuple(sym_strides))) + gen("raise TypeError(\"strides mismatch on " - "argument '%s' (got: %%s, expected: %%s)\" " - "%% (%s.strides, %s))" - % (arg.name, arg.name, strify(sym_strides))) + "argument '%s' " + "(after removing unit length dims, " + "got: %%s, expected: %%s)\" " + "%% (_lpy_got, _lpy_expected))" + % arg.name) if not arg.allows_offset: gen("if hasattr(%s, 'offset') and %s.offset:" % ( @@ -571,7 +599,8 @@ class 
ExecutionWrapperGeneratorBase(object): # {{{ generate invocation - def generate_invocation(self, gen, kernel_name, args): + def generate_invocation(self, gen, kernel_name, args, + kernel, implemented_data_info): raise NotImplementedError() # }}} @@ -632,7 +661,8 @@ class ExecutionWrapperGeneratorBase(object): args = self.generate_arg_setup( gen, kernel, implemented_data_info, options) - self.generate_invocation(gen, codegen_result.host_program.name, args) + self.generate_invocation(gen, codegen_result.host_program.name, args, + kernel, implemented_data_info) self.generate_output_handler(gen, options, kernel, implemented_data_info) diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index f24b115fd5a35af94e4a6d437550bccf86b5bee0..744c03d8ed091bc0f05e4fc41aa14e88ec89276a 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -61,6 +61,11 @@ def adjust_local_temp_var_storage(kernel, device): temp_var.copy(storage_shape=temp_var.shape) continue + if not temp_var.shape: + # scalar, no need to mess with storage shape + new_temp_vars[temp_var.name] = temp_var + continue + other_loctemp_nbytes = [ tv.nbytes for tv in six.itervalues(kernel.temporary_variables) @@ -441,7 +446,9 @@ def generate_value_arg_setup(kernel, devices, implemented_data_info): warn("{knl_name}: device not supplied to PyOpenCLTarget--" "workarounds for broken OpenCL implementations " "(such as those relating to complex numbers) " - "may not be enabled when needed" + "may not be enabled when needed. To avoid this, " + "pass target=lp.PyOpenCLTarget(dev) when creating " + "the kernel." 
.format(knl_name=kernel.name)) if any(count_bug_per_dev): diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index cc0b48a6ac17e23f318c5489d45fca6710bb3392..bef3152d03c193c14b11ce6f9ba3f20fdfcff6ad 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -151,7 +151,24 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): # {{{ generate invocation - def generate_invocation(self, gen, kernel_name, args): + def generate_invocation(self, gen, kernel_name, args, + kernel, implemented_data_info): + if kernel.options.cl_exec_manage_array_events: + gen(""" + if wait_for is None: + wait_for = [] + """) + + gen("") + from loopy.kernel.data import GlobalArg + for arg in implemented_data_info: + if issubclass(arg.arg_class, GlobalArg): + gen( + "wait_for.extend({arg_name}.events)" + .format(arg_name=arg.name)) + + gen("") + gen("_lpy_evt = {kernel_name}({args})" .format( kernel_name=kernel_name, @@ -160,6 +177,14 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): + args + ["wait_for=wait_for"]))) + if kernel.options.cl_exec_manage_array_events: + gen("") + from loopy.kernel.data import GlobalArg + for arg in implemented_data_info: + if (issubclass(arg.arg_class, GlobalArg) + and arg.base_name in kernel.get_written_variables()): + gen("{arg_name}.add_event(_lpy_evt)".format(arg_name=arg.name)) + # }}} # {{{ diff --git a/loopy/transform/batch.py b/loopy/transform/batch.py index e7a86300f9d040cba1688e5bb0f3dcbbd926f783..7e6b03581e39d03bc06d2f6d37f65a1d4ac6a386 100644 --- a/loopy/transform/batch.py +++ b/loopy/transform/batch.py @@ -38,6 +38,20 @@ __doc__ = """ # {{{ to_batched +def temp_needs_batching_if_not_sequential(tv, batch_varying_args): + from loopy.kernel.data import temp_var_scope + if tv.name in batch_varying_args: + return True + if tv.initializer is not None and tv.read_only: + # do not batch read_only temps if not in + # `batch_varying_args` + return 
False + if tv.scope == temp_var_scope.PRIVATE: + # do not batch private temps if not in `batch_varying_args` + return False + return True + + class _BatchVariableChanger(RuleAwareIdentityMapper): def __init__(self, rule_mapping_context, kernel, batch_varying_args, batch_iname_expr, sequential): @@ -50,14 +64,17 @@ class _BatchVariableChanger(RuleAwareIdentityMapper): def needs_batch_subscript(self, name): tv = self.kernel.temporary_variables.get(name) - return ( - (not self.sequential - and (tv is not None - and not ( - tv.initializer is not None - and tv.read_only))) - or - name in self.batch_varying_args) + + if name in self.batch_varying_args: + return True + if not self.sequential: + if tv is None: + return False + if not temp_needs_batching_if_not_sequential(tv, + self.batch_varying_args): + return False + + return True def map_subscript(self, expr, expn_state): if not self.needs_batch_subscript(expr.aggregate.name): @@ -89,6 +106,10 @@ def to_batched(knl, nbatches, batch_varying_args, batch_iname_prefix="ibatch", sequential=False): """Takes in a kernel that carries out an operation and returns a kernel that carries out a batch of these operations. + .. note:: + For temporaries in a kernel that are private or read only + globals and if `sequential=True`, loopy does not batch these + variables unless explicitly mentioned in `batch_varying_args`. :arg nbatches: the number of batches. May be a constant non-negative integer or a string, which will be added as an integer argument. 
@@ -144,13 +165,13 @@ def to_batched(knl, nbatches, batch_varying_args, batch_iname_prefix="ibatch", new_temps = {} for temp in six.itervalues(knl.temporary_variables): - if temp.initializer is not None and temp.read_only: - new_temps[temp.name] = temp - else: + if temp_needs_batching_if_not_sequential(temp, batch_varying_args): new_temps[temp.name] = temp.copy( shape=(nbatches_expr,) + temp.shape, dim_tags=("c",) * (len(temp.shape) + 1), dim_names=_add_unique_dim_name("ibatch", temp.dim_names)) + else: + new_temps[temp.name] = temp knl = knl.copy(temporary_variables=new_temps) else: diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index cd8ec409cce1a3f210554a05daf4bd358781fb20..2347cef3c04d2a44cef91782700e097a20e19712 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -854,23 +854,23 @@ def duplicate_inames(knl, inames, within, new_inames=None, suffix=None, # {{{ iname duplication for schedulability -def _get_iname_duplication_options(insn_deps, old_common_inames=frozenset([])): - # Remove common inames of the current insn_deps, as they are not relevant +def _get_iname_duplication_options(insn_iname_sets, old_common_inames=frozenset([])): + # Remove common inames of the current insn_iname_sets, as they are not relevant # for splitting. 
- common = frozenset([]).union(*insn_deps).intersection(*insn_deps) + common = frozenset([]).union(*insn_iname_sets).intersection(*insn_iname_sets) # If common inames were found, we reduce the problem and go into recursion if common: # Remove the common inames from the instruction dependencies - insn_deps = ( - frozenset(dep - common for dep in insn_deps) + insn_iname_sets = ( + frozenset(iname_set - common for iname_set in insn_iname_sets) - frozenset([frozenset([])])) # Join the common inames with those previously found common = common.union(old_common_inames) # Go into recursion - for option in _get_iname_duplication_options(insn_deps, common): + for option in _get_iname_duplication_options(insn_iname_sets, common): yield option # Do not yield anything beyond here! return @@ -880,7 +880,7 @@ def _get_iname_duplication_options(insn_deps, old_common_inames=frozenset([])): def join_sets_if_not_disjoint(sets): for s1 in sets: for s2 in sets: - if s1 != s2 and s1.intersection(s2): + if s1 != s2 and s1 & s2: return ( (sets - frozenset([s1, s2])) | frozenset([s1 | s2]) @@ -888,7 +888,7 @@ def _get_iname_duplication_options(insn_deps, old_common_inames=frozenset([])): return sets, True - partitioning = insn_deps + partitioning = insn_iname_sets stop = False while not stop: partitioning, stop = join_sets_if_not_disjoint(partitioning) @@ -897,7 +897,7 @@ def _get_iname_duplication_options(insn_deps, old_common_inames=frozenset([])): # subproblems if len(partitioning) > 1: for part in partitioning: - working_set = frozenset(s for s in insn_deps if s.issubset(part)) + working_set = frozenset(s for s in insn_iname_sets if s <= part) for option in _get_iname_duplication_options(working_set, old_common_inames): yield option @@ -908,7 +908,9 @@ def _get_iname_duplication_options(insn_deps, old_common_inames=frozenset([])): # There are splitting options for all inames for iname in inames: iname_insns = frozenset( - insn for insn in insn_deps if frozenset([iname]).issubset(insn)) 
+ insn + for insn in insn_iname_sets + if frozenset([iname]) <= insn) import itertools as it # For a given iname, the set of instructions containing this iname @@ -919,7 +921,7 @@ def _get_iname_duplication_options(insn_deps, old_common_inames=frozenset([])): for l in range(1, len(iname_insns))): yield ( iname, - tuple(insn.union(old_common_inames) for insn in insns_to_dup)) + tuple(insn | old_common_inames for insn in insns_to_dup)) # If partitioning was empty, we have recursed successfully and yield nothing @@ -951,12 +953,12 @@ def get_iname_duplication_options(knl, use_boostable_into=False): * duplicating j in instruction i2 * duplicating i in instruction i2 and i3 - Use :func:`has_schedulable_iname_nesting` to decide, whether an iname needs to be + Use :func:`has_schedulable_iname_nesting` to decide whether an iname needs to be duplicated in a given kernel. """ # First we extract the minimal necessary information from the kernel if use_boostable_into: - insn_deps = ( + insn_iname_sets = ( frozenset(insn.within_inames.union( insn.boostable_into if insn.boostable_into is not None else frozenset([])) @@ -964,20 +966,20 @@ def get_iname_duplication_options(knl, use_boostable_into=False): - frozenset([frozenset([])])) else: - insn_deps = ( + insn_iname_sets = ( frozenset(insn.within_inames for insn in knl.instructions) - frozenset([frozenset([])])) # Get the duplication options as a tuple of iname and a set - for iname, insns in _get_iname_duplication_options(insn_deps): + for iname, insns in _get_iname_duplication_options(insn_iname_sets): # Check whether this iname has a parallel tag and discard it if so from loopy.kernel.data import ConcurrentTag if (iname in knl.iname_to_tag and isinstance(knl.iname_to_tag[iname], ConcurrentTag)): continue - # If we find a duplication option and fo not use boostable_into + # If we find a duplication option and do not use boostable_into # information, we restart this generator with use_boostable_into=True if not
use_boostable_into and not knl.options.ignore_boostable_into: for option in get_iname_duplication_options(knl, True): diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 6ffc1dff5220ab48c6c87ec29fec6e44d57ba133..fcf8f965b68fd258b0c0f1eae94ec84a39a5b7ee 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -312,15 +312,8 @@ class TypeInferenceMapper(CombineMapper): from loopy.kernel.data import TemporaryVariable, KernelArgument import loopy as lp - if isinstance(obj, TemporaryVariable): - result = [obj.dtype] - if result[0] is lp.auto: - self.symbols_with_unknown_types.add(expr.name) - return [] - else: - return result - - elif isinstance(obj, KernelArgument): + if isinstance(obj, (KernelArgument, TemporaryVariable)): + assert obj.dtype is not lp.auto result = [obj.dtype] if result[0] is None: self.symbols_with_unknown_types.add(expr.name) @@ -515,10 +508,12 @@ def infer_unknown_types(kernel, expect_completion=False): import loopy as lp for tv in six.itervalues(kernel.temporary_variables): - if tv.dtype is lp.auto: + assert tv.dtype is not lp.auto + if tv.dtype is None: names_for_type_inference.append(tv.name) for arg in kernel.args: + assert arg.dtype is not lp.auto if arg.dtype is None: names_for_type_inference.append(arg.name) @@ -588,6 +583,9 @@ def infer_unknown_types(kernel, expect_completion=False): failed = not result if not failed: new_dtype, = result + if new_dtype.target is None: + new_dtype = new_dtype.with_target(kernel.target) + debug(" success: %s", new_dtype) if new_dtype != item.dtype: debug(" changed from: %s", item.dtype) diff --git a/loopy/types.py b/loopy/types.py index f095d1d58f9eaebb7dcc9c8d41afa73951f2ba84..8f0f310c305b3d5b24bd6e771b501bb6d9c69224 100644 --- a/loopy/types.py +++ b/loopy/types.py @@ -177,13 +177,20 @@ class AtomicNumpyType(NumpyType, AtomicType): # }}} -def to_loopy_type(dtype, allow_none=False, allow_auto=False, for_atomic=False, +def to_loopy_type(dtype, allow_auto=False, allow_none=False, 
for_atomic=False, target=None): from loopy.kernel.data import auto - if allow_none and dtype is None: - return dtype - elif allow_auto and dtype is auto: - return dtype + if dtype is None: + if allow_none: + return None + else: + raise LoopyError("dtype may not be none") + + elif dtype is auto: + if allow_auto: + return dtype + else: + raise LoopyError("dtype may not be auto") numpy_dtype = None diff --git a/loopy/version.py b/loopy/version.py index c415c29a15063b6fd92335fdbaa37ba75ff4019f..7141a678297ded5e0d6e2f16f065f035a034d540 100644 --- a/loopy/version.py +++ b/loopy/version.py @@ -21,7 +21,7 @@ THE SOFTWARE. """ -VERSION = (2017, 2) +VERSION = (2017, 2, 1) VERSION_STATUS = "" VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS @@ -32,4 +32,4 @@ except ImportError: else: _islpy_version = islpy.version.VERSION_TEXT -DATA_MODEL_VERSION = "v72-islpy%s" % _islpy_version +DATA_MODEL_VERSION = "v76-islpy%s" % _islpy_version diff --git a/requirements.txt b/requirements.txt index 1a23022821116aea068b76eab72f9a5596694eea..a3e88cfea99e7413211c35d11464932f98e23758 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ git+https://github.com/inducer/pymbolic.git git+https://github.com/inducer/genpy.git git+https://github.com/inducer/codepy.git -hg+https://bitbucket.org/inducer/f2py +git+https://github.com/inducer/f2py # Optional, needed for using the C preprocessor on Fortran ply>=3.6 diff --git a/test/test_loopy.py b/test/test_loopy.py index 8bb8f37f08b909444f384387e0007c51c8eee587..e36a4c2c3cb3f7e70a5b039ea631bbce20923be8 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2462,16 +2462,9 @@ def test_barrier_in_overridden_get_grid_size_expanded_kernel(): vecsize = 16 knl = lp.split_iname(knl, 'i', vecsize, inner_tag='l.0') - # artifically expand via overridden_get_grid_sizes_for_insn_ids - class GridOverride(object): - def __init__(self, clean, vecsize=vecsize): - self.clean = clean - self.vecsize = vecsize - - def __call__(self, 
insn_ids, ignore_auto=True): - gsize, _ = self.clean.get_grid_sizes_for_insn_ids(insn_ids, ignore_auto) - return gsize, (self.vecsize,) + from testlib import GridOverride + # artificially expand via overridden_get_grid_sizes_for_insn_ids knl = knl.copy(overridden_get_grid_sizes_for_insn_ids=GridOverride( knl.copy(), vecsize)) # make sure we can generate the code @@ -2741,6 +2734,36 @@ def test_preamble_with_separate_temporaries(ctx_factory): queue, data=data.flatten('C'))[1][0], data[offsets[:-1] + 1]) +def test_arg_inference_for_predicates(): + knl = lp.make_kernel("{[i]: 0 <= i < 10}", + """ + if incr[i] + a = a + 1 + end + """) + + assert "incr" in knl.arg_dict + assert knl.arg_dict["incr"].shape == (10,) + + +def test_relaxed_stride_checks(ctx_factory): + # Check that loopy is compatible with numpy's relaxed stride rules. + ctx = ctx_factory() + + knl = lp.make_kernel("{[i,j]: 0 <= i <= n and 0 <= j <= m}", + """ + a[i] = sum(j, A[i,j] * b[j]) + """) + + with cl.CommandQueue(ctx) as queue: + mat = np.zeros((1, 10), order="F") + b = np.zeros(10) + + evt, (a,) = knl(queue, A=mat, b=b) + + assert a == 0 + + def test_add_prefetch_works_in_lhs_index(): knl = lp.make_kernel( "{ [n,k,l,k1,l1,k2,l2]: " diff --git a/test/test_transform.py b/test/test_transform.py index e50605b46672f8e9c1817431f1577742b1f6fb4c..0e10db362f36b7fc258059c2ec7ed1a344b97212 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -96,13 +96,65 @@ def test_to_batched(ctx_factory): knl = lp.make_kernel( ''' { [i,j]: 0<=i,j<n } ''', ''' out[i] = sum(j, a[i,j]*x[j])''') + knl = lp.add_and_infer_dtypes(knl, dict(out=np.float32, + x=np.float32, + a=np.float32)) bknl = lp.to_batched(knl, "nbatches", "out,x") + ref_knl = lp.make_kernel( + ''' { [i,j,k]: 0<=i,j<n and 0<=k<nbatches} ''', + '''out[k, i] = sum(j, a[i,j]*x[k, j])''') + ref_knl = lp.add_and_infer_dtypes(ref_knl, dict(out=np.float32, + x=np.float32, + a=np.float32)) + + a = np.random.randn(5, 5).astype(np.float32) + x =
np.random.randn(7, 5).astype(np.float32) + + # Running both the kernels + evt, (out1, ) = bknl(queue, a=a, x=x, n=5, nbatches=7) + evt, (out2, ) = ref_knl(queue, a=a, x=x, n=5, nbatches=7) + + # checking that the outputs are the same + assert np.linalg.norm(out1-out2) < 1e-15 + + +def test_to_batched_temp(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + ''' { [i,j]: 0<=i,j<n } ''', + ''' cnst = 2.0 + out[i] = sum(j, cnst*a[i,j]*x[j])''', + [lp.TemporaryVariable( + "cnst", + dtype=np.float32, + shape=(), + scope=lp.temp_var_scope.PRIVATE), '...']) + knl = lp.add_and_infer_dtypes(knl, dict(out=np.float32, + x=np.float32, + a=np.float32)) + ref_knl = lp.make_kernel( + ''' { [i,j]: 0<=i,j<n } ''', + '''out[i] = sum(j, 2.0*a[i,j]*x[j])''') + ref_knl = lp.add_and_infer_dtypes(ref_knl, dict(out=np.float32, + x=np.float32, + a=np.float32)) + + bknl = lp.to_batched(knl, "nbatches", "out,x") + bref_knl = lp.to_batched(ref_knl, "nbatches", "out,x") + + # checking that cnst is not being batched + assert bknl.temporary_variables['cnst'].shape == () + a = np.random.randn(5, 5) x = np.random.randn(7, 5) - bknl(queue, a=a, x=x) + # Checking that the program compiles and the logic is correct + lp.auto_test_vs_ref( + bref_knl, ctx, bknl, + parameters=dict(a=a, x=x, n=5, nbatches=7)) def test_add_barrier(ctx_factory): diff --git a/test/testlib.py b/test/testlib.py index 3fae05a38ad0f0c414f42a182e36ed26c5b50da5..73de4199d31736230026eb7f2eb7939a93806369 100644 --- a/test/testlib.py +++ b/test/testlib.py @@ -1,6 +1,20 @@ import loopy as lp +# {{{ test_barrier_in_overridden_get_grid_size_expanded_kernel + +class GridOverride(object): + def __init__(self, clean, vecsize): + self.clean = clean + self.vecsize = vecsize + + def __call__(self, insn_ids, ignore_auto=True): + gsize, _ = self.clean.get_grid_sizes_for_insn_ids(insn_ids, ignore_auto) + return gsize, (self.vecsize,) + +# }}} + + # {{{ test_preamble_with_separate_temporaries class SeparateTemporariesPreambleTestHelper: @@
-99,3 +113,5 @@ class SeparateTemporariesPreambleTestHelper: yield (desc, '\n'.join([str(decl), code])) # }}} + +# vim: foldmethod=marker