Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • tasmith4/loopy
  • ben_sepanski/loopy
  • arghdos/loopy
  • inducer/loopy
  • wence-/loopy
  • isuruf/loopy
  • fikl2/loopy
  • xywei/loopy
  • kaushikcfd/loopy
  • zweiner2/loopy
10 results
Show changes
Commits on Source (60)
Showing
with 312 additions and 149 deletions
......@@ -25,7 +25,7 @@ It can capture the following types of optimizations:
* Vector and multi-core parallelism in the OpenCL/CUDA model
* Data layout transformations (structure of arrays to array of structures)
* Loopy Unrolling
* Loop unrolling
* Loop tiling with efficient handling of boundary cases
* Prefetching/copy optimizations
* Instruction level parallelism
......
......@@ -111,9 +111,9 @@ always see loopy's view of a kernel by printing it.
KERNEL: loopy_kernel
---------------------------------------------------------------------------
ARGUMENTS:
a: GlobalArg, type: <runtime>, shape: (n), dim_tags: (N0:stride:1)
n: ValueArg, type: <runtime>
out: GlobalArg, type: <runtime>, shape: (n), dim_tags: (N0:stride:1)
a: GlobalArg, type: <auto/runtime>, shape: (n), dim_tags: (N0:stride:1)
n: ValueArg, type: <auto/runtime>
out: GlobalArg, type: <auto/runtime>, shape: (n), dim_tags: (N0:stride:1)
---------------------------------------------------------------------------
DOMAINS:
[n] -> { [i] : 0 <= i < n }
......@@ -154,7 +154,7 @@ following:
See :ref:`specifying-arguments`.
* Loopy has not determined the type of ``a`` and ``out``. The data type is
given as ``<runtime>``, which means that these types will be determined
given as ``<auto/runtime>``, which means that these types will be determined
by the data passed in when the kernel is invoked. Loopy generates (and
caches!) a copy of the kernel for each combination of types passed in.
......
......@@ -1081,7 +1081,9 @@ class LoopKernel(ImmutableRecordWithoutPickling):
warn_with_kernel(self,
"iname-order",
"get_visual_iname_order_embedding() could not determine a "
"consistent iname nesting order")
"consistent iname nesting order. This is a possible indication "
"that the kernel may not schedule successfully, but for now "
"it only impacts printing of the kernel.")
embedding = dict((iname, iname) for iname in self.all_inames())
return embedding
......
......@@ -549,15 +549,55 @@ class ArrayBase(ImmutableRecord):
.. attribute :: name
.. attribute :: dtype
The :class:`loopy.LoopyType` of the array.
If this is *None*, :mod:`loopy` will try to continue without
knowing the type of this array, where the idea is that precise
knowledge of the type will become available at invocation time.
:class:`loopy.CompiledKernel` (and thereby
:meth:`loopy.LoopKernel.__call__`) automatically add this type
information based on invocation arguments.
Note that some transformations, such as :func:`loopy.add_padding`
cannot be performed without knowledge of the exact *dtype*.
.. attribute :: shape
May be one of the following:
* *None*. In this case, no shape is intended to be specified,
only the strides will be used to access the array. Bounds checking
will not be performed.
* :class:`loopy.auto`. The shape will be determined by finding the
access footprint.
* a tuple like :attr:`numpy.ndarray.shape`.
Each entry of the tuple is also allowed to be a :mod:`pymbolic`
expression involving kernel parameters, or a (potentially
comma-separated) string that can be parsed to such an expression.
Any element of the shape tuple not used to compute strides
may be *None*.
.. attribute:: dim_tags
See :ref:`data-dim-tags`.
.. attribute:: offset
Offset from the beginning of the buffer to the point from
which the strides are counted. May be one of
* 0 or None
* a string (that is interpreted as an argument name).
* a pymbolic expression
* :class:`loopy.auto`, in which case an offset argument
is added automatically, immediately following this argument.
:class:`loopy.CompiledKernel` is even smarter in its treatment of
this case and will compile custom versions of the kernel based on
whether the passed arrays have offsets or not.
.. attribute:: dim_names
A tuple of strings providing names for the array axes, or *None*.
......@@ -568,6 +608,21 @@ class ArrayBase(ImmutableRecord):
to generate more informative names than could be achieved by
axis numbers.
.. attribute:: alignment
Memory alignment of the array in bytes. For temporary arrays,
this ensures they are allocated with this alignment. For arguments,
this entails a promise that the incoming array obeys this alignment
restriction.
Defaults to *None*.
If an integer N is given, the array would be declared
with ``__attribute__((aligned(N)))`` in code generation for
:class:`loopy.CTarget`.
.. versionadded:: 2018.1
.. automethod:: __init__
.. automethod:: __eq__
.. automethod:: num_user_axes
......@@ -584,46 +639,18 @@ class ArrayBase(ImmutableRecord):
def __init__(self, name, dtype=None, shape=None, dim_tags=None, offset=0,
dim_names=None, strides=None, order=None, for_atomic=False,
target=None,
target=None, alignment=None,
**kwargs):
"""
All of the following (except *name*) are optional.
Specify either strides or shape.
:arg name: May contain multiple names separated by
commas, in which case multiple arguments,
each with identical properties, are created
for each name.
:arg dtype: the :class:`numpy.dtype` of the array.
If this is *None*, :mod:`loopy` will try to continue without
knowing the type of this array, where the idea is that precise
knowledge of the type will become available at invocation time.
:class:`loopy.CompiledKernel` (and thereby
:meth:`loopy.LoopKernel.__call__`) automatically add this type
information based on invocation arguments.
Note that some transformations, such as :func:`loopy.add_padding`
cannot be performed without knowledge of the exact *dtype*.
:arg name: When passed to :class:`loopy.make_kernel`, this may contain
multiple names separated by commas, in which case multiple arguments,
each with identical properties, are created for each name.
:arg shape: May be one of the following:
* *None*. In this case, no shape is intended to be specified,
only the strides will be used to access the array. Bounds checking
will not be performed.
* :class:`loopy.auto`. The shape will be determined by finding the
access footprint.
* a tuple like :attr:`numpy.ndarray.shape`.
Each entry of the tuple is also allowed to be a :mod:`pymbolic`
expression involving kernel parameters, or a (potentially
comma-separated) string that can be parsed to such an expression.
Any element of the shape tuple not used to compute strides
may be *None*.
* A string which can be parsed into the previous form.
:arg shape: May be any of the things specified under :attr:`shape`,
or a string which can be parsed into the previous form.
:arg dim_tags: A comma-separated list of tags as understood by
:func:`parse_array_dim_tag`.
......@@ -649,17 +676,9 @@ class ArrayBase(ImmutableRecord):
:arg for_atomic:
Whether the array is declared for atomic access, and, if necessary,
using atomic-capable data types.
:arg offset: Offset from the beginning of the buffer to the point from
which the strides are counted. May be one of
:arg offset: (See :attr:`offset`)
:arg alignment: memory alignment in bytes
* 0 or None
* a string (that is interpreted as an argument name).
* a pymbolic expression
* :class:`loopy.auto`, in which case an offset argument
is added automatically, immediately following this argument.
:class:`loopy.CompiledKernel` is even smarter in its treatment of
this case and will compile custom versions of the kernel based on
whether the passed arrays have offsets or not.
"""
for kwarg_name in kwargs:
......@@ -672,6 +691,14 @@ class ArrayBase(ImmutableRecord):
dtype = to_loopy_type(dtype, allow_auto=True, allow_none=True,
for_atomic=for_atomic, target=target)
if dtype is lp.auto:
from warnings import warn
warn("Argument/temporary data type should be None if unspecified, "
"not auto. This usage will be disallowed in 2018.",
DeprecationWarning, stacklevel=2)
dtype = None
strides_known = strides is not None and strides is not lp.auto
shape_known = shape is not None and shape is not lp.auto
......@@ -805,6 +832,7 @@ class ArrayBase(ImmutableRecord):
offset=offset,
dim_names=dim_names,
order=order,
alignment=alignment,
**kwargs)
def __eq__(self, other):
......@@ -832,10 +860,10 @@ class ArrayBase(ImmutableRecord):
if include_typename:
info_entries.append(type(self).__name__)
if self.dtype is lp.auto:
type_str = "<auto>"
elif self.dtype is None:
type_str = "<runtime>"
assert self.dtype is not lp.auto
if self.dtype is None:
type_str = "<auto/runtime>"
else:
type_str = str(self.dtype)
......
......@@ -1004,7 +1004,7 @@ def _find_existentially_quantified_inames(dom_str):
def parse_domains(domains, defines):
if isinstance(domains, str):
if isinstance(domains, (isl.BasicSet, str)):
domains = [domains]
result = []
......@@ -1106,6 +1106,9 @@ class ArgumentGuesser:
self.all_written_names = set()
from loopy.symbolic import get_dependencies
for insn in instructions:
for pred in insn.predicates:
self.all_names.update(get_dependencies(self.submap(pred)))
if isinstance(insn, MultiAssignmentBase):
for assignee_var_name in insn.assignee_var_names():
self.all_written_names.add(assignee_var_name)
......
......@@ -219,9 +219,20 @@ class KernelArgument(ImmutableRecord):
dtype = kwargs.pop("dtype", None)
from loopy.types import to_loopy_type
kwargs["dtype"] = to_loopy_type(
dtype = to_loopy_type(
dtype, allow_auto=True, allow_none=True, target=target)
import loopy as lp
if dtype is lp.auto:
from warnings import warn
warn("Argument/temporary data type should be None if unspecified, "
"not auto. This usage will be disallowed in 2018.",
DeprecationWarning, stacklevel=2)
dtype = None
kwargs["dtype"] = dtype
ImmutableRecord.__init__(self, **kwargs)
......@@ -268,10 +279,10 @@ class ValueArg(KernelArgument):
def __str__(self):
import loopy as lp
if self.dtype is lp.auto:
type_str = "<auto>"
elif self.dtype is None:
type_str = "<runtime>"
assert self.dtype is not lp.auto
if self.dtype is None:
type_str = "<auto/runtime>"
else:
type_str = str(self.dtype)
......@@ -449,7 +460,7 @@ class TemporaryVariable(ArrayBase):
% name)
ArrayBase.__init__(self, name=intern(name),
dtype=dtype, shape=shape,
dtype=dtype, shape=shape, strides=strides,
dim_tags=dim_tags, offset=offset, dim_names=dim_names,
order=order,
base_indices=base_indices, scope=scope,
......
......@@ -107,7 +107,7 @@ def get_arguments_with_incomplete_dtype(knl):
if arg.dtype is None]
def add_and_infer_dtypes(knl, dtype_dict):
def add_and_infer_dtypes(knl, dtype_dict, expect_completion=False):
processed_dtype_dict = {}
for k, v in six.iteritems(dtype_dict):
......@@ -119,7 +119,7 @@ def add_and_infer_dtypes(knl, dtype_dict):
knl = add_dtypes(knl, processed_dtype_dict)
from loopy.type_inference import infer_unknown_types
return infer_unknown_types(knl, expect_completion=True)
return infer_unknown_types(knl, expect_completion=expect_completion)
def _add_and_infer_dtypes_overdetermined(knl, dtype_dict):
......
......@@ -134,6 +134,12 @@ class All(MatchExpressionBase):
def __call__(self, kernel, matchable):
return True
def __str__(self):
return "all"
def __repr__(self):
return "%s()" % (type(self).__name__)
def update_persistent_hash(self, key_hash, key_builder):
key_builder.rec(key_hash, "all_match_expr")
......@@ -144,18 +150,21 @@ class All(MatchExpressionBase):
return hash(type(self))
class And(MatchExpressionBase):
class MultiChildMatchExpressionBase(MatchExpressionBase):
def __init__(self, children):
self.children = children
def __call__(self, kernel, matchable):
return all(ch(kernel, matchable) for ch in self.children)
def __str__(self):
return "(%s)" % (" and ".join(str(ch) for ch in self.children))
joiner = " %s " % type(self).__name__.lower()
return "(%s)" % (joiner.join(str(ch) for ch in self.children))
def __repr__(self):
return "%s(%s)" % (
type(self).__name__,
", ".join(repr(ch) for ch in self.children))
def update_persistent_hash(self, key_hash, key_builder):
key_builder.rec(key_hash, "and_match_expr")
key_builder.rec(key_hash, type(self).__name__)
key_builder.rec(key_hash, self.children)
def __eq__(self, other):
......@@ -166,26 +175,14 @@ class And(MatchExpressionBase):
return hash((type(self), self.children))
class Or(MatchExpressionBase):
def __init__(self, children):
self.children = children
class And(MultiChildMatchExpressionBase):
def __call__(self, kernel, matchable):
return any(ch(kernel, matchable) for ch in self.children)
def __str__(self):
return "(%s)" % (" or ".join(str(ch) for ch in self.children))
def update_persistent_hash(self, key_hash, key_builder):
key_builder.rec(key_hash, "or_match_expr")
key_builder.rec(key_hash, self.children)
return all(ch(kernel, matchable) for ch in self.children)
def __eq__(self, other):
return (type(self) == type(other)
and self.children == other.children)
def __hash__(self):
return hash((type(self), self.children))
class Or(MultiChildMatchExpressionBase):
def __call__(self, kernel, matchable):
return any(ch(kernel, matchable) for ch in self.children)
class Not(MatchExpressionBase):
......@@ -198,6 +195,9 @@ class Not(MatchExpressionBase):
def __str__(self):
return "(not %s)" % str(self.child)
def __repr__(self):
return "%s(%r)" % (type(self).__name__, self.child)
def update_persistent_hash(self, key_hash, key_builder):
key_builder.rec(key_hash, "not_match_expr")
key_builder.rec(key_hash, self.child)
......@@ -222,6 +222,9 @@ class GlobMatchExpressionBase(MatchExpressionBase):
descr = type(self).__name__
return descr.lower() + ":" + self.glob
def __repr__(self):
return "%s(%r)" % (type(self).__name__, self. glob)
def update_persistent_hash(self, key_hash, key_builder):
key_builder.rec(key_hash, type(self).__name__)
key_builder.rec(key_hash, self.glob)
......@@ -273,7 +276,7 @@ def parse_match(expr):
"""Syntax examples::
* ``id:yoink and writes:a_temp``
* ``id:yoink and (not writes:a_temp or tagged:input)``
* ``id:yoink and (not writes:a_temp or tag:input)``
"""
if not expr:
return All()
......
......@@ -112,6 +112,15 @@ class Options(ImmutableRecord):
Do not check for or accept :mod:`numpy` arrays as
arguments.
Defaults to *False*.
.. attribute:: cl_exec_manage_array_events
Within the PyOpenCL executor, respect and update
:attr:`pyopencl.array.Array.event`.
Defaults to *True*.
.. attribute:: return_dict
Have kernels return a :class:`dict` instead of a tuple as
......@@ -196,6 +205,7 @@ class Options(ImmutableRecord):
skip_arg_checks=kwargs.get("skip_arg_checks", False),
no_numpy=kwargs.get("no_numpy", False),
# Look up this option under its own name so that a user-supplied value is
# honoured and the setting does not silently follow "no_numpy".
cl_exec_manage_array_events=kwargs.get("cl_exec_manage_array_events", True),
return_dict=kwargs.get("return_dict", False),
write_wrapper=kwargs.get("write_wrapper", False),
write_code=kwargs.get("write_code", False),
......
......@@ -797,11 +797,10 @@ def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel):
newly_added_assignments_ids.add(new_assignment_id)
import loopy as lp
new_temporaries[new_assignee_name] = (
TemporaryVariable(
name=new_assignee_name,
dtype=lp.auto,
dtype=None,
scope=temp_var_scope.PRIVATE))
from pymbolic import var
......@@ -987,7 +986,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
new_temporary_variables[name] = TemporaryVariable(
name=name,
shape=(),
dtype=lp.auto,
dtype=None,
scope=temp_var_scope.PRIVATE)
from pymbolic import var
......
......@@ -194,7 +194,6 @@ def generate_array_literal(codegen_state, array, value):
ecm = codegen_state.expression_to_code_mapper
from pymbolic.mapper.stringifier import PREC_NONE
from loopy.expression import dtype_to_type_context
from loopy.symbolic import ArrayLiteral
......@@ -203,7 +202,7 @@ def generate_array_literal(codegen_state, array, value):
codegen_state.ast_builder.get_c_expression_to_code_mapper(),
ArrayLiteral(
tuple(
ecm(d_i, PREC_NONE, type_context, array.dtype).expr
ecm.map_constant(d_i, type_context)
for d_i in data)))
# }}}
......@@ -710,13 +709,18 @@ class CASTBuilder(ASTBuilderBase):
ecm(p.flattened_product(decl_info.shape),
prec=PREC_NONE, type_context="i"))
if temp_var.alignment:
from cgen import AlignedAttribute
temp_var_decl = AlignedAttribute(temp_var.alignment, temp_var_decl)
return temp_var_decl
def wrap_temporary_decl(self, decl, scope):
return decl
def wrap_global_constant(self, decl):
return decl
from cgen import Static
return Static(decl)
def get_value_arg_decl(self, name, shape, dtype, is_written):
assert shape == ()
......
......@@ -105,12 +105,23 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
kernel_arg.dtype.numpy_dtype),
order=order))
expected_strides = tuple(
var("_lpy_expected_strides_%s" % i)
for i in range(num_axes))
gen("%s = %s.strides" % (strify(expected_strides), arg.name))
#check strides
if not skip_arg_checks:
gen("assert %(strides)s == %(name)s.strides, "
strides_check_expr = self.get_strides_check_expr(
(strify(s) for s in sym_shape),
(strify(s) for s in sym_strides),
(strify(s) for s in expected_strides))
gen("assert %(strides_check)s, "
"'Strides of loopy created array %(name)s, "
"do not match expected.'" %
dict(name=arg.name,
dict(strides_check=strides_check_expr,
name=arg.name,
strides=strify(sym_strides)))
for i in range(num_axes):
gen("del _lpy_shape_%d" % i)
......@@ -133,11 +144,13 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
# {{{ generate invocation
def generate_invocation(self, gen, kernel_name, args):
def generate_invocation(self, gen, kernel_name, args,
kernel, implemented_data_info):
gen("for knl in _lpy_c_kernels:")
with Indentation(gen):
gen('knl({args})'.format(
args=", ".join(args)))
# }}}
# {{{
......
......@@ -351,6 +351,13 @@ class ExecutionWrapperGeneratorBase(object):
def get_arg_pass(self, arg):
raise NotImplementedError()
def get_strides_check_expr(self, shape, strides, sym_strides):
    """Return Python source for an expression that validates array strides.

    :arg shape: sequence of strings, one expression per axis giving the
        axis length.
    :arg strides: sequence of strings, one expression per axis giving the
        stride to be checked.
    :arg sym_strides: sequence of strings, one expression per axis giving
        the stride to compare against.
    :returns: a string of the form
        ``(len == 1 or stride == other) and ...`` with one clause per axis.
        An axis of length 1 is accepted with any stride. If there are no
        axes at all, ``"True"`` is returned so that the result is always
        a valid expression when spliced into generated code.
    """
    per_axis_checks = [
        "(%s == 1 or %s == %s)" % elem
        for elem in zip(shape, strides, sym_strides)]

    # Joining zero clauses would give "", which turns generated statements
    # such as "if not <expr>:" into a syntax error; fall back to a tautology.
    return " and ".join(per_axis_checks) or "True"
# {{{ arg setup
def generate_arg_setup(
......@@ -516,13 +523,34 @@ class ExecutionWrapperGeneratorBase(object):
itemsize = kernel_arg.dtype.numpy_dtype.itemsize
sym_strides = tuple(
itemsize*s_i for s_i in arg.unvec_strides)
gen("if %s.strides != %s:"
% (arg.name, strify(sym_strides)))
ndim = len(arg.unvec_shape)
shape = ["_lpy_shape_%d" % i for i in range(ndim)]
strides = ["_lpy_stride_%d" % i for i in range(ndim)]
gen("(%s,) = %s.shape" % (", ".join(shape), arg.name))
gen("(%s,) = %s.strides" % (", ".join(strides), arg.name))
gen("if not %s:"
% self.get_strides_check_expr(
shape, strides,
(strify(s) for s in sym_strides)))
with Indentation(gen):
gen("_lpy_got = tuple(stride "
"for (dim, stride) in zip(%s.shape, %s.strides) "
"if dim > 1)"
% (arg.name, arg.name))
gen("_lpy_expected = tuple(stride "
"for (dim, stride) in zip(%s.shape, %s) "
"if dim > 1)"
% (arg.name, strify_tuple(sym_strides)))
gen("raise TypeError(\"strides mismatch on "
"argument '%s' (got: %%s, expected: %%s)\" "
"%% (%s.strides, %s))"
% (arg.name, arg.name, strify(sym_strides)))
"argument '%s' "
"(after removing unit length dims, "
"got: %%s, expected: %%s)\" "
"%% (_lpy_got, _lpy_expected))"
% arg.name)
if not arg.allows_offset:
gen("if hasattr(%s, 'offset') and %s.offset:" % (
......@@ -571,7 +599,8 @@ class ExecutionWrapperGeneratorBase(object):
# {{{ generate invocation
def generate_invocation(self, gen, kernel_name, args):
def generate_invocation(self, gen, kernel_name, args,
kernel, implemented_data_info):
raise NotImplementedError()
# }}}
......@@ -632,7 +661,8 @@ class ExecutionWrapperGeneratorBase(object):
args = self.generate_arg_setup(
gen, kernel, implemented_data_info, options)
self.generate_invocation(gen, codegen_result.host_program.name, args)
self.generate_invocation(gen, codegen_result.host_program.name, args,
kernel, implemented_data_info)
self.generate_output_handler(gen, options, kernel, implemented_data_info)
......
......@@ -61,6 +61,11 @@ def adjust_local_temp_var_storage(kernel, device):
temp_var.copy(storage_shape=temp_var.shape)
continue
if not temp_var.shape:
# scalar, no need to mess with storage shape
new_temp_vars[temp_var.name] = temp_var
continue
other_loctemp_nbytes = [
tv.nbytes
for tv in six.itervalues(kernel.temporary_variables)
......@@ -441,7 +446,9 @@ def generate_value_arg_setup(kernel, devices, implemented_data_info):
warn("{knl_name}: device not supplied to PyOpenCLTarget--"
"workarounds for broken OpenCL implementations "
"(such as those relating to complex numbers) "
"may not be enabled when needed"
"may not be enabled when needed. To avoid this, "
"pass target=lp.PyOpenCLTarget(dev) when creating "
"the kernel."
.format(knl_name=kernel.name))
if any(count_bug_per_dev):
......
......@@ -151,7 +151,24 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
# {{{ generate invocation
def generate_invocation(self, gen, kernel_name, args):
def generate_invocation(self, gen, kernel_name, args,
kernel, implemented_data_info):
if kernel.options.cl_exec_manage_array_events:
gen("""
if wait_for is None:
wait_for = []
""")
gen("")
from loopy.kernel.data import GlobalArg
for arg in implemented_data_info:
if issubclass(arg.arg_class, GlobalArg):
gen(
"wait_for.extend({arg_name}.events)"
.format(arg_name=arg.name))
gen("")
gen("_lpy_evt = {kernel_name}({args})"
.format(
kernel_name=kernel_name,
......@@ -160,6 +177,14 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
+ args
+ ["wait_for=wait_for"])))
if kernel.options.cl_exec_manage_array_events:
gen("")
from loopy.kernel.data import GlobalArg
for arg in implemented_data_info:
if (issubclass(arg.arg_class, GlobalArg)
and arg.base_name in kernel.get_written_variables()):
gen("{arg_name}.add_event(_lpy_evt)".format(arg_name=arg.name))
# }}}
# {{{
......
......@@ -38,6 +38,20 @@ __doc__ = """
# {{{ to_batched
def temp_needs_batching_if_not_sequential(tv, batch_varying_args):
    """Tell whether temporary *tv* needs batching.

    :arg tv: the temporary variable; its ``name``, ``initializer``,
        ``read_only`` and ``scope`` attributes are inspected.
    :arg batch_varying_args: container of names; membership of ``tv.name``
        is tested.
    :returns: *True* if *tv* is named in *batch_varying_args*. Otherwise
        *False* for a read-only temporary that has an initializer and for
        a temporary with private scope, and *True* for everything else.
    """
    from loopy.kernel.data import temp_var_scope

    # Being listed explicitly overrides every other consideration.
    if tv.name in batch_varying_args:
        return True

    # Read-only temporaries with an initializer are not batched unless
    # they are listed in `batch_varying_args`.
    has_read_only_initializer = tv.initializer is not None and tv.read_only
    if has_read_only_initializer:
        return False

    # Private temporaries are not batched either unless listed; any other
    # temporary is.
    return not (tv.scope == temp_var_scope.PRIVATE)
class _BatchVariableChanger(RuleAwareIdentityMapper):
def __init__(self, rule_mapping_context, kernel, batch_varying_args,
batch_iname_expr, sequential):
......@@ -50,14 +64,17 @@ class _BatchVariableChanger(RuleAwareIdentityMapper):
def needs_batch_subscript(self, name):
tv = self.kernel.temporary_variables.get(name)
return (
(not self.sequential
and (tv is not None
and not (
tv.initializer is not None
and tv.read_only)))
or
name in self.batch_varying_args)
if name in self.batch_varying_args:
return True
if not self.sequential:
if tv is None:
return False
if not temp_needs_batching_if_not_sequential(tv,
self.batch_varying_args):
return False
return True
def map_subscript(self, expr, expn_state):
if not self.needs_batch_subscript(expr.aggregate.name):
......@@ -89,6 +106,10 @@ def to_batched(knl, nbatches, batch_varying_args, batch_iname_prefix="ibatch",
sequential=False):
"""Takes in a kernel that carries out an operation and returns a kernel
that carries out a batch of these operations.
.. note::
For temporaries in a kernel that are private or read-only
globals, and if `sequential=False`, loopy does not batch these
variables unless explicitly mentioned in `batch_varying_args`.
:arg nbatches: the number of batches. May be a constant non-negative
integer or a string, which will be added as an integer argument.
......@@ -144,13 +165,13 @@ def to_batched(knl, nbatches, batch_varying_args, batch_iname_prefix="ibatch",
new_temps = {}
for temp in six.itervalues(knl.temporary_variables):
if temp.initializer is not None and temp.read_only:
new_temps[temp.name] = temp
else:
if temp_needs_batching_if_not_sequential(temp, batch_varying_args):
new_temps[temp.name] = temp.copy(
shape=(nbatches_expr,) + temp.shape,
dim_tags=("c",) * (len(temp.shape) + 1),
dim_names=_add_unique_dim_name("ibatch", temp.dim_names))
else:
new_temps[temp.name] = temp
knl = knl.copy(temporary_variables=new_temps)
else:
......
......@@ -854,23 +854,23 @@ def duplicate_inames(knl, inames, within, new_inames=None, suffix=None,
# {{{ iname duplication for schedulability
def _get_iname_duplication_options(insn_deps, old_common_inames=frozenset([])):
# Remove common inames of the current insn_deps, as they are not relevant
def _get_iname_duplication_options(insn_iname_sets, old_common_inames=frozenset([])):
# Remove common inames of the current insn_iname_sets, as they are not relevant
# for splitting.
common = frozenset([]).union(*insn_deps).intersection(*insn_deps)
common = frozenset([]).union(*insn_iname_sets).intersection(*insn_iname_sets)
# If common inames were found, we reduce the problem and go into recursion
if common:
# Remove the common inames from the instruction dependencies
insn_deps = (
frozenset(dep - common for dep in insn_deps)
insn_iname_sets = (
frozenset(iname_set - common for iname_set in insn_iname_sets)
-
frozenset([frozenset([])]))
# Join the common inames with those previously found
common = common.union(old_common_inames)
# Go into recursion
for option in _get_iname_duplication_options(insn_deps, common):
for option in _get_iname_duplication_options(insn_iname_sets, common):
yield option
# Do not yield anything beyond here!
return
......@@ -880,7 +880,7 @@ def _get_iname_duplication_options(insn_deps, old_common_inames=frozenset([])):
def join_sets_if_not_disjoint(sets):
for s1 in sets:
for s2 in sets:
if s1 != s2 and s1.intersection(s2):
if s1 != s2 and s1 & s2:
return (
(sets - frozenset([s1, s2]))
| frozenset([s1 | s2])
......@@ -888,7 +888,7 @@ def _get_iname_duplication_options(insn_deps, old_common_inames=frozenset([])):
return sets, True
partitioning = insn_deps
partitioning = insn_iname_sets
stop = False
while not stop:
partitioning, stop = join_sets_if_not_disjoint(partitioning)
......@@ -897,7 +897,7 @@ def _get_iname_duplication_options(insn_deps, old_common_inames=frozenset([])):
# subproblems
if len(partitioning) > 1:
for part in partitioning:
working_set = frozenset(s for s in insn_deps if s.issubset(part))
working_set = frozenset(s for s in insn_iname_sets if s <= part)
for option in _get_iname_duplication_options(working_set,
old_common_inames):
yield option
......@@ -908,7 +908,9 @@ def _get_iname_duplication_options(insn_deps, old_common_inames=frozenset([])):
# There are splitting options for all inames
for iname in inames:
iname_insns = frozenset(
insn for insn in insn_deps if frozenset([iname]).issubset(insn))
insn
for insn in insn_iname_sets
if frozenset([iname]) <= insn)
import itertools as it
# For a given iname, the set of instructions containing this iname
......@@ -919,7 +921,7 @@ def _get_iname_duplication_options(insn_deps, old_common_inames=frozenset([])):
for l in range(1, len(iname_insns))):
yield (
iname,
tuple(insn.union(old_common_inames) for insn in insns_to_dup))
tuple(insn | old_common_inames for insn in insns_to_dup))
# If partitioning was empty, we have recursed successfully and yield nothing
......@@ -951,12 +953,12 @@ def get_iname_duplication_options(knl, use_boostable_into=False):
* duplicating j in instruction i2
* duplicating i in instruction i2 and i3
Use :func:`has_schedulable_iname_nesting` to decide, whether an iname needs to be
Use :func:`has_schedulable_iname_nesting` to decide whether an iname needs to be
duplicated in a given kernel.
"""
# First we extract the minimal necessary information from the kernel
if use_boostable_into:
insn_deps = (
insn_iname_sets = (
frozenset(insn.within_inames.union(
insn.boostable_into if insn.boostable_into is not None
else frozenset([]))
......@@ -964,20 +966,20 @@ def get_iname_duplication_options(knl, use_boostable_into=False):
-
frozenset([frozenset([])]))
else:
insn_deps = (
insn_iname_sets = (
frozenset(insn.within_inames for insn in knl.instructions)
-
frozenset([frozenset([])]))
# Get the duplication options as a tuple of iname and a set
for iname, insns in _get_iname_duplication_options(insn_deps):
for iname, insns in _get_iname_duplication_options(insn_iname_sets):
# Check whether this iname has a parallel tag and discard it if so
from loopy.kernel.data import ConcurrentTag
if (iname in knl.iname_to_tag
and isinstance(knl.iname_to_tag[iname], ConcurrentTag)):
continue
# If we find a duplication option and fo not use boostable_into
# If we find a duplication option and do not use boostable_into
# information, we restart this generator with use_boostable_into=True
if not use_boostable_into and not knl.options.ignore_boostable_into:
for option in get_iname_duplication_options(knl, True):
......
......@@ -312,15 +312,8 @@ class TypeInferenceMapper(CombineMapper):
from loopy.kernel.data import TemporaryVariable, KernelArgument
import loopy as lp
if isinstance(obj, TemporaryVariable):
result = [obj.dtype]
if result[0] is lp.auto:
self.symbols_with_unknown_types.add(expr.name)
return []
else:
return result
elif isinstance(obj, KernelArgument):
if isinstance(obj, (KernelArgument, TemporaryVariable)):
assert obj.dtype is not lp.auto
result = [obj.dtype]
if result[0] is None:
self.symbols_with_unknown_types.add(expr.name)
......@@ -515,10 +508,12 @@ def infer_unknown_types(kernel, expect_completion=False):
import loopy as lp
for tv in six.itervalues(kernel.temporary_variables):
if tv.dtype is lp.auto:
assert tv.dtype is not lp.auto
if tv.dtype is None:
names_for_type_inference.append(tv.name)
for arg in kernel.args:
assert arg.dtype is not lp.auto
if arg.dtype is None:
names_for_type_inference.append(arg.name)
......@@ -588,6 +583,9 @@ def infer_unknown_types(kernel, expect_completion=False):
failed = not result
if not failed:
new_dtype, = result
if new_dtype.target is None:
new_dtype = new_dtype.with_target(kernel.target)
debug(" success: %s", new_dtype)
if new_dtype != item.dtype:
debug(" changed from: %s", item.dtype)
......
......@@ -177,13 +177,20 @@ class AtomicNumpyType(NumpyType, AtomicType):
# }}}
def to_loopy_type(dtype, allow_none=False, allow_auto=False, for_atomic=False,
def to_loopy_type(dtype, allow_auto=False, allow_none=False, for_atomic=False,
target=None):
from loopy.kernel.data import auto
if allow_none and dtype is None:
return dtype
elif allow_auto and dtype is auto:
return dtype
if dtype is None:
if allow_none:
return None
else:
raise LoopyError("dtype may not be none")
elif dtype is auto:
if allow_auto:
return dtype
else:
raise LoopyError("dtype may not be auto")
numpy_dtype = None
......
......@@ -21,7 +21,7 @@ THE SOFTWARE.
"""
VERSION = (2017, 2)
VERSION = (2017, 2, 1)
VERSION_STATUS = ""
VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS
......@@ -32,4 +32,4 @@ except ImportError:
else:
_islpy_version = islpy.version.VERSION_TEXT
DATA_MODEL_VERSION = "v72-islpy%s" % _islpy_version
DATA_MODEL_VERSION = "v76-islpy%s" % _islpy_version