diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index d206faad5bd84e3a1c7e7c061673f3d5d1144c84..789c00d33b7bb41816e6901e24046d4b0eefb27d 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -1,8 +1,6 @@ """Loop nest build top-level control/hoisting.""" -from __future__ import division -from __future__ import absolute_import -import six +from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -26,12 +24,13 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ - +import six from loopy.codegen.result import merge_codegen_results, wrap_in_if import islpy as isl from loopy.schedule import ( EnterLoop, LeaveLoop, RunInstruction, Barrier, CallKernel, gather_schedule_block, generate_sub_sched_items) +from loopy.diagnostic import LoopyError def get_admissible_conditional_inames_for(codegen_state, sched_index): @@ -150,15 +149,36 @@ def generate_code_for_sched_index(codegen_state, sched_index): return func(codegen_state, sched_index) elif isinstance(sched_item, Barrier): + # {{{ emit barrier code + + from loopy.codegen.result import CodeGenerationResult + if codegen_state.is_generating_device_code: - return codegen_state.ast_builder.emit_barrier( + barrier_ast = codegen_state.ast_builder.emit_barrier( sched_item.kind, sched_item.comment) - from loopy.codegen.result import CodeGenerationResult - return CodeGenerationResult( - host_program=None, - device_programs=[], - implemented_domains={}, - implemented_data_info=codegen_state.implemented_data_info) + if sched_item.originating_insn_id: + return CodeGenerationResult.new( + codegen_state, + sched_item.originating_insn_id, + barrier_ast, + codegen_state.implemented_domain) + else: + return barrier_ast + else: + # host code + if sched_item.kind in ["global", "local"]: + # host code is assumed globally and locally synchronous + return CodeGenerationResult( + host_program=None, + device_programs=[], + 
implemented_domains={}, + implemented_data_info=codegen_state.implemented_data_info) + + else: + raise LoopyError("do not know how to emit code for barrier kind '%s'" + " in host code" % sched_item.kind) + + # }}} elif isinstance(sched_item, RunInstruction): insn = kernel.id_to_insn[sched_item.insn_id] diff --git a/loopy/execution.py b/loopy/execution.py index dac5b2ff80767ae00c126aba31c2851cfe3769ef..07e28f06d33e5884ac57c9505593c9ee916c3171 100644 --- a/loopy/execution.py +++ b/loopy/execution.py @@ -187,7 +187,12 @@ class KernelExecutorBase(object): def get_typed_and_scheduled_kernel(self, arg_to_dtype_set): from loopy import CACHING_ENABLED - cache_key = (type(self).__name__, self.kernel, arg_to_dtype_set) + from loopy.preprocess import prepare_for_caching + # prepare_for_caching() gets run by preprocess, but the kernel at this + # stage is not guaranteed to be preprocessed. + cacheable_kernel = prepare_for_caching(self.kernel) + cache_key = (type(self).__name__, cacheable_kernel, arg_to_dtype_set) + if CACHING_ENABLED: try: return typed_and_scheduled_cache[cache_key] diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index 50272e5fece9322db3f63104698e21f68c4f21db..e801d09dcf10750ce09af647e0b14f4641fa1fb2 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -212,6 +212,7 @@ class F2LoopyTranslator(FTreeWalkerBase): self.instruction_tags = [] self.conditions = [] + self.conditions_data = [] self.filename = filename @@ -312,7 +313,16 @@ class F2LoopyTranslator(FTreeWalkerBase): def dtype_from_stmt(self, stmt): length, kind = stmt.selector - assert not kind + + if kind and not length: + length = kind + elif length and not kind: + pass + elif not length and not kind: + pass + else: + raise RuntimeError("both length and kind specified") + return np.dtype(self.TYPE_MAP[(type(stmt).__name__.lower(), length)]) def map_type_decl(self, node): @@ -442,7 +452,7 @@ class
F2LoopyTranslator(FTreeWalkerBase): # node.expr # node.content[0] - def map_IfThen(self, node): + def realize_conditional(self, node, context_cond=None): scope = self.scope_stack[-1] cond_name = intern("loopy_cond%d" % self.condition_id_counter) @@ -457,22 +467,53 @@ class F2LoopyTranslator(FTreeWalkerBase): self.add_expression_instruction( cond_var, self.parse_expr(node, node.expr)) - self.conditions.append(cond_name) + cond_expr = cond_var + if context_cond is not None: + from pymbolic.primitives import LogicalAnd + cond_expr = LogicalAnd((cond_var, context_cond)) + self.conditions_data.append((context_cond, cond_var)) + else: + self.conditions_data.append((None, cond_var)) + + self.conditions.append(cond_expr) + + def map_IfThen(self, node): self.block_nest.append("if") + self.realize_conditional(node, None) + for c in node.content: self.rec(c) + def construct_else_condition(self): + context_cond, prev_cond = self.conditions_data.pop() + if prev_cond is None: + raise RuntimeError("else if may not follow else") + + self.conditions.pop() + + from pymbolic.primitives import LogicalNot, LogicalAnd + else_expr = LogicalNot(prev_cond) + if context_cond is not None: + else_expr = LogicalAnd((else_expr, context_cond)) + + return else_expr + def map_Else(self, node): - cond_name = self.conditions.pop() - self.conditions.append("!" 
+ cond_name) + else_cond = self.construct_else_condition() + self.conditions.append(else_cond) + self.conditions_data.append((else_cond, None)) + + def map_ElseIf(self, node): + self.realize_conditional(node, self.construct_else_condition()) def map_EndIfThen(self, node): if not self.block_nest: - raise TranslationError("no if block started at end do") + raise TranslationError("no if block started at end if") if self.block_nest.pop() != "if": raise TranslationError("mismatched end if") + self.conditions_data.pop() self.conditions.pop() def map_Do(self, node): diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py index a02fc58d97f370d45f36a465c38fa3caf3da9d41..531cc822e1bc76573ef6e0812970d16bd6df0b17 100644 --- a/loopy/kernel/array.py +++ b/loopy/kernel/array.py @@ -862,6 +862,16 @@ class ArrayBase(ImmutableRecord): def __repr__(self): return "<%s>" % self.__str__() + def update_persistent_hash_for_shape(self, key_hash, key_builder, shape): + if isinstance(shape, tuple): + for shape_i in shape: + if shape_i is None: + key_builder.rec(key_hash, shape_i) + else: + key_builder.update_for_pymbolic_expression(key_hash, shape_i) + else: + key_builder.rec(key_hash, shape) + def update_persistent_hash(self, key_hash, key_builder): """Custom hash computation function for use with :class:`pytools.persistent_dict.PersistentDict`. 
@@ -869,14 +879,7 @@ class ArrayBase(ImmutableRecord): key_builder.rec(key_hash, self.name) key_builder.rec(key_hash, self.dtype) - if isinstance(self.shape, tuple): - for shape_i in self.shape: - if shape_i is None: - key_builder.rec(key_hash, shape_i) - else: - key_builder.update_for_pymbolic_expression(key_hash, shape_i) - else: - key_builder.rec(key_hash, self.shape) + self.update_persistent_hash_for_shape(key_hash, key_builder, self.shape) key_builder.rec(key_hash, self.dim_tags) key_builder.rec(key_hash, self.offset) key_builder.rec(key_hash, self.dim_names) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 14b18150f5b84218f39ba23662eb6106ffb596a0..89cb5f26a4940656cca1ab09841311148e113275 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -366,55 +366,55 @@ def parse_insn_options(opt_dict, options_str, assignee_names=None): # {{{ parse one instruction WITH_OPTIONS_RE = re.compile( - "^" - "\s*with\s*" - "\{(?P<options>.+)\}" - "\s*$") + r"^" + r"\s*with\s*" + r"\{(?P<options>.+)\}" + r"\s*$") FOR_RE = re.compile( - "^" - "\s*(for)\s+" - "(?P<inames>[ ,\w]*)" - "\s*$") + r"^" + r"\s*(for)\s+" + r"(?P<inames>[ ,\w]*)" + r"\s*$") IF_RE = re.compile( - "^" - "\s*if\s+" - "(?P<predicate>.+)" - "\s*$") + r"^" + r"\s*if\s+" + r"(?P<predicate>.+)" + r"\s*$") ELIF_RE = re.compile( - "^" - "\s*elif\s+" - "(?P<predicate>.+)" - "\s*$") + r"^" + r"\s*elif\s+" + r"(?P<predicate>.+)" + r"\s*$") -ELSE_RE = re.compile("^\s*else\s*$") +ELSE_RE = re.compile(r"^\s*else\s*$") INSN_RE = re.compile( - "^" - "\s*" - "(?P<lhs>[^{]+?)" - "\s*(?<!\:)=\s*" - "(?P<rhs>.+?)" - "\s*?" - "(?:\{(?P<options>.+)\}\s*)?$") + r"^" + r"\s*" + r"(?P<lhs>[^{]+?)" + r"\s*(?<!\:)=\s*" + r"(?P<rhs>.+?)" + r"\s*?" + r"(?:\{(?P<options>.+)\}\s*)?$") EMPTY_LHS_INSN_RE = re.compile( - "^" - "\s*" - "(?P<rhs>.+?)" - "\s*?" - "(?:\{(?P<options>.+)\}\s*)?$") + r"^" + r"\s*" + r"(?P<rhs>.+?)" + r"\s*?" 
+ r"(?:\{(?P<options>.+)\}\s*)?$") SPECIAL_INSN_RE = re.compile( - "^" - "\s*" - "\.\.\." - "\s*" - "(?P<kind>[a-z]+?)" - "\s*?" - "(?:\{(?P<options>.+)\}\s*)?$") + r"^" + r"\s*" + r"\.\.\." + r"\s*" + r"(?P<kind>[a-z]+?)" + r"\s*?" + r"(?:\{(?P<options>.+)\}\s*)?$") SUBST_RE = re.compile( r"^\s*(?P<lhs>.+?)\s*:=\s*(?P<rhs>.+)\s*$") @@ -582,6 +582,9 @@ def parse_special_insn(groups, insn_options): if special_insn_kind == "gbarrier": cls = BarrierInstruction kwargs["kind"] = "global" + elif special_insn_kind == "lbarrier": + cls = BarrierInstruction + kwargs["kind"] = "local" elif special_insn_kind == "nop": cls = NoOpInstruction else: diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 001dd06326edcad14d8ecd39e29229dd45de8ef2..94b31df12dae516d3539438b7e4ed66ed765e697 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -497,7 +497,8 @@ class TemporaryVariable(ArrayBase): """ super(TemporaryVariable, self).update_persistent_hash(key_hash, key_builder) - key_builder.rec(key_hash, self.storage_shape) + self.update_persistent_hash_for_shape(key_hash, key_builder, + self.storage_shape) key_builder.rec(key_hash, self.base_indices) initializer = self.initializer @@ -510,7 +511,7 @@ class TemporaryVariable(ArrayBase): # }}} -# {{{ subsitution rule +# {{{ substitution rule class SubstitutionRule(ImmutableRecord): """ diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index 08268ca9f27623a6d17a195d3c04acb55e5ec68a..d5c388af60a39987c09092fc93325f067a8f4cf7 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -1312,11 +1312,12 @@ class BarrierInstruction(_DataObliviousInstruction): .. attribute:: kind - A string, currently only ``"global"``. + A string, ``"global"`` or ``"local"``. The textual syntax in a :mod:`loopy` kernel is:: ... gbarrier + ... 
lbarrier """ fields = _DataObliviousInstruction.fields | set(["kind"]) @@ -1328,7 +1329,6 @@ class BarrierInstruction(_DataObliviousInstruction): priority=None, boostable=None, boostable_into=None, predicates=None, tags=None, kind="global"): - assert kind == "global" if predicates: raise LoopyError("conditional barriers are not supported") diff --git a/loopy/library/function.py b/loopy/library/function.py index efa590371bb632cbc9776078ea6b5c64f626d46a..9d557ac9fe5c4c040608dc181b96daa812405a65 100644 --- a/loopy/library/function.py +++ b/loopy/library/function.py @@ -26,7 +26,7 @@ THE SOFTWARE. def default_function_mangler(kernel, name, arg_dtypes): from loopy.library.reduction import reduction_function_mangler - manglers = [reduction_function_mangler] + manglers = [reduction_function_mangler, tuple_function_mangler] for mangler in manglers: result = mangler(kernel, name, arg_dtypes) if result is not None: @@ -45,4 +45,15 @@ def single_arg_function_mangler(kernel, name, arg_dtypes): return None +def tuple_function_mangler(kernel, name, arg_dtypes): + if name == "make_tuple": + from loopy.kernel.data import CallMangleInfo + return CallMangleInfo( + target_name="loopy_make_tuple", + result_dtypes=arg_dtypes, + arg_dtypes=arg_dtypes) + + return None + + # vim: foldmethod=marker diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index f9648bde7dc4d685ca9daf63ecf15b69496c8651..0e5a093b76b8d09d331edead7c69fcc2e3134601 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -123,6 +123,7 @@ class ScalarReductionOperation(ReductionOperation): class SumReductionOperation(ScalarReductionOperation): def neutral_element(self, dtype): + # FIXME: Document that we always use an int here. 
return 0 def __call__(self, dtype, operand1, operand2): @@ -131,6 +132,7 @@ class SumReductionOperation(ScalarReductionOperation): class ProductReductionOperation(ScalarReductionOperation): def neutral_element(self, dtype): + # FIXME: Document that we always use an int here. return 1 def __call__(self, dtype, operand1, operand2): @@ -189,8 +191,30 @@ class MinReductionOperation(ScalarReductionOperation): return var("min")(operand1, operand2) +# {{{ base class for symbolic reduction ops + +class ReductionOpFunction(FunctionIdentifier): + init_arg_names = ("reduction_op",) + + def __init__(self, reduction_op): + self.reduction_op = reduction_op + + def __getinitargs__(self): + return (self.reduction_op,) + + @property + def name(self): + return self.__class__.__name__ + +# }}} + + # {{{ segmented reduction +class SegmentedOp(ReductionOpFunction): + pass + + class _SegmentedScalarReductionOperation(ReductionOperation): def __init__(self, **kwargs): self.inner_reduction = self.base_reduction_class(**kwargs) @@ -205,7 +229,9 @@ class _SegmentedScalarReductionOperation(ReductionOperation): segment_flag_dtype.numpy_dtype.type.__name__) def neutral_element(self, scalar_dtype, segment_flag_dtype): - return SegmentedFunction(self, (scalar_dtype, segment_flag_dtype), "init")() + scalar_neutral_element = self.inner_reduction.neutral_element(scalar_dtype) + return var("make_tuple")(scalar_neutral_element, + segment_flag_dtype.numpy_dtype.type(0)) def result_dtypes(self, kernel, scalar_dtype, segment_flag_dtype): return (self.inner_reduction.result_dtypes(kernel, scalar_dtype) @@ -221,7 +247,7 @@ class _SegmentedScalarReductionOperation(ReductionOperation): return type(self) == type(other) def __call__(self, dtypes, operand1, operand2): - return SegmentedFunction(self, dtypes, "update")(*(operand1 + operand2)) + return SegmentedOp(self)(*(operand1 + operand2)) class SegmentedSumReductionOperation(_SegmentedScalarReductionOperation): @@ -236,45 +262,14 @@ class 
SegmentedProductReductionOperation(_SegmentedScalarReductionOperation): which = "product" -class SegmentedFunction(FunctionIdentifier): - init_arg_names = ("reduction_op", "dtypes", "name") - - def __init__(self, reduction_op, dtypes, name): - """ - :arg dtypes: A :class:`tuple` of `(scalar_dtype, segment_flag_dtype)` - """ - self.reduction_op = reduction_op - self.dtypes = dtypes - self.name = name - - @property - def scalar_dtype(self): - return self.dtypes[0] - - @property - def segment_flag_dtype(self): - return self.dtypes[1] - - def __getinitargs__(self): - return (self.reduction_op, self.dtypes, self.name) - - -def get_segmented_function_preamble(kernel, func_id): +def get_segmented_function_preamble(kernel, func_id, arg_dtypes): op = func_id.reduction_op - prefix = op.prefix(func_id.scalar_dtype, func_id.segment_flag_dtype) - - from pymbolic.mapper.c_code import CCodeMapper - - c_code_mapper = CCodeMapper() + scalar_dtype = arg_dtypes[0] + segment_flag_dtype = arg_dtypes[1] + prefix = op.prefix(scalar_dtype, segment_flag_dtype) return (prefix, """ - inline %(scalar_t)s %(prefix)s_init(%(segment_flag_t)s *segment_flag_out) - { - *segment_flag_out = 0; - return %(neutral)s; - } - - inline %(scalar_t)s %(prefix)s_update( + inline %(scalar_t)s %(prefix)s_op( %(scalar_t)s op1, %(segment_flag_t)s segment_flag1, %(scalar_t)s op2, %(segment_flag_t)s segment_flag2, %(segment_flag_t)s *segment_flag_out) @@ -283,32 +278,36 @@ def get_segmented_function_preamble(kernel, func_id): return segment_flag2 ? 
op2 : %(combined)s; } """ % dict( - scalar_t=kernel.target.dtype_to_typename(func_id.scalar_dtype), + scalar_t=kernel.target.dtype_to_typename(scalar_dtype), prefix=prefix, - segment_flag_t=kernel.target.dtype_to_typename( - func_id.segment_flag_dtype), - neutral=c_code_mapper( - op.inner_reduction.neutral_element(func_id.scalar_dtype)), + segment_flag_t=kernel.target.dtype_to_typename(segment_flag_dtype), combined=op.op % ("op1", "op2"), )) - # }}} # {{{ argmin/argmax +class ArgExtOp(ReductionOpFunction): + pass + + class _ArgExtremumReductionOperation(ReductionOperation): def prefix(self, scalar_dtype, index_dtype): return "loopy_arg%s_%s_%s" % (self.which, - index_dtype.numpy_dtype.type.__name__, - scalar_dtype.numpy_dtype.type.__name__) + scalar_dtype.numpy_dtype.type.__name__, + index_dtype.numpy_dtype.type.__name__) def result_dtypes(self, kernel, scalar_dtype, index_dtype): return (scalar_dtype, index_dtype) def neutral_element(self, scalar_dtype, index_dtype): - return ArgExtFunction(self, (scalar_dtype, index_dtype), "init")() + scalar_neutral_func = ( + get_ge_neutral if self.neutral_sign < 0 else get_le_neutral) + scalar_neutral_element = scalar_neutral_func(scalar_dtype) + return var("make_tuple")(scalar_neutral_element, + index_dtype.numpy_dtype.type(-1)) def __str__(self): return self.which @@ -324,7 +323,7 @@ class _ArgExtremumReductionOperation(ReductionOperation): return 2 def __call__(self, dtypes, operand1, operand2): - return ArgExtFunction(self, dtypes, "update")(*(operand1 + operand2)) + return ArgExtOp(self)(*(operand1 + operand2)) class ArgMaxReductionOperation(_ArgExtremumReductionOperation): @@ -339,44 +338,15 @@ class ArgMinReductionOperation(_ArgExtremumReductionOperation): neutral_sign = +1 -class ArgExtFunction(FunctionIdentifier): - init_arg_names = ("reduction_op", "dtypes", "name") - - def __init__(self, reduction_op, dtypes, name): - self.reduction_op = reduction_op - self.dtypes = dtypes - self.name = name - - @property - def 
scalar_dtype(self): - return self.dtypes[0] - - @property - def index_dtype(self): - return self.dtypes[1] - - def __getinitargs__(self): - return (self.reduction_op, self.dtypes, self.name) - - -def get_argext_preamble(kernel, func_id): +def get_argext_preamble(kernel, func_id, arg_dtypes): op = func_id.reduction_op - prefix = op.prefix(func_id.scalar_dtype, func_id.index_dtype) - - from pymbolic.mapper.c_code import CCodeMapper + scalar_dtype = arg_dtypes[0] + index_dtype = arg_dtypes[1] - c_code_mapper = CCodeMapper() - - neutral = get_ge_neutral if op.neutral_sign < 0 else get_le_neutral + prefix = op.prefix(scalar_dtype, index_dtype) return (prefix, """ - inline %(scalar_t)s %(prefix)s_init(%(index_t)s *index_out) - { - *index_out = INT_MIN; - return %(neutral)s; - } - - inline %(scalar_t)s %(prefix)s_update( + inline %(scalar_t)s %(prefix)s_op( %(scalar_t)s op1, %(index_t)s index1, %(scalar_t)s op2, %(index_t)s index2, %(index_t)s *index_out) @@ -393,10 +363,9 @@ def get_argext_preamble(kernel, func_id): } } """ % dict( - scalar_t=kernel.target.dtype_to_typename(func_id.scalar_dtype), + scalar_t=kernel.target.dtype_to_typename(scalar_dtype), prefix=prefix, - index_t=kernel.target.dtype_to_typename(func_id.index_dtype), - neutral=c_code_mapper(neutral(func_id.scalar_dtype)), + index_t=kernel.target.dtype_to_typename(index_dtype), comp=op.update_comparison, )) @@ -454,76 +423,48 @@ def parse_reduction_op(name): def reduction_function_mangler(kernel, func_id, arg_dtypes): - if isinstance(func_id, ArgExtFunction) and func_id.name == "init": + if isinstance(func_id, ArgExtOp): from loopy.target.opencl import CTarget if not isinstance(kernel.target, CTarget): raise LoopyError("%s: only C-like targets supported for now" % func_id) op = func_id.reduction_op + scalar_dtype = arg_dtypes[0] + index_dtype = arg_dtypes[1] from loopy.kernel.data import CallMangleInfo return CallMangleInfo( - target_name="%s_init" % op.prefix( - func_id.scalar_dtype, func_id.index_dtype), + 
target_name="%s_op" % op.prefix( + scalar_dtype, index_dtype), result_dtypes=op.result_dtypes( - kernel, func_id.scalar_dtype, func_id.index_dtype), - arg_dtypes=(), - ) - - elif isinstance(func_id, ArgExtFunction) and func_id.name == "update": - from loopy.target.opencl import CTarget - if not isinstance(kernel.target, CTarget): - raise LoopyError("%s: only C-like targets supported for now" % func_id) - - op = func_id.reduction_op - - from loopy.kernel.data import CallMangleInfo - return CallMangleInfo( - target_name="%s_update" % op.prefix( - func_id.scalar_dtype, func_id.index_dtype), - result_dtypes=op.result_dtypes( - kernel, func_id.scalar_dtype, func_id.index_dtype), + kernel, scalar_dtype, index_dtype), arg_dtypes=( - func_id.scalar_dtype, - kernel.index_dtype, - func_id.scalar_dtype, - kernel.index_dtype), - ) - - elif isinstance(func_id, SegmentedFunction) and func_id.name == "init": - from loopy.target.opencl import CTarget - if not isinstance(kernel.target, CTarget): - raise LoopyError("%s: only C-like targets supported for now" % func_id) - - op = func_id.reduction_op - - from loopy.kernel.data import CallMangleInfo - return CallMangleInfo( - target_name="%s_init" % op.prefix( - func_id.scalar_dtype, func_id.segment_flag_dtype), - result_dtypes=op.result_dtypes( - kernel, func_id.scalar_dtype, func_id.segment_flag_dtype), - arg_dtypes=(), + scalar_dtype, + index_dtype, + scalar_dtype, + index_dtype), ) - elif isinstance(func_id, SegmentedFunction) and func_id.name == "update": + elif isinstance(func_id, SegmentedOp): from loopy.target.opencl import CTarget if not isinstance(kernel.target, CTarget): raise LoopyError("%s: only C-like targets supported for now" % func_id) op = func_id.reduction_op + scalar_dtype = arg_dtypes[0] + segment_flag_dtype = arg_dtypes[1] from loopy.kernel.data import CallMangleInfo return CallMangleInfo( - target_name="%s_update" % op.prefix( - func_id.scalar_dtype, func_id.segment_flag_dtype), + target_name="%s_op" % op.prefix( 
+ scalar_dtype, segment_flag_dtype), result_dtypes=op.result_dtypes( - kernel, func_id.scalar_dtype, func_id.segment_flag_dtype), + kernel, scalar_dtype, segment_flag_dtype), arg_dtypes=( - func_id.scalar_dtype, - func_id.segment_flag_dtype, - func_id.scalar_dtype, - func_id.segment_flag_dtype), + scalar_dtype, + segment_flag_dtype, + scalar_dtype, + segment_flag_dtype), ) return None @@ -533,16 +474,18 @@ def reduction_preamble_generator(preamble_info): from loopy.target.opencl import OpenCLTarget for func in preamble_info.seen_functions: - if isinstance(func.name, ArgExtFunction): + if isinstance(func.name, ArgExtOp): if not isinstance(preamble_info.kernel.target, OpenCLTarget): raise LoopyError("only OpenCL supported for now") - yield get_argext_preamble(preamble_info.kernel, func.name) + yield get_argext_preamble(preamble_info.kernel, func.name, + func.arg_dtypes) - elif isinstance(func.name, SegmentedFunction): + elif isinstance(func.name, SegmentedOp): if not isinstance(preamble_info.kernel.target, OpenCLTarget): raise LoopyError("only OpenCL supported for now") - yield get_segmented_function_preamble(preamble_info.kernel, func.name) + yield get_segmented_function_preamble(preamble_info.kernel, func.name, + func.arg_dtypes) # vim: fdm=marker diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 57cf74b808ae1a7107e76a18a3876785ab8baabd..4281e50bd006a3cddf5a3cae0ffffe3d78abcfac 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -363,9 +363,12 @@ def gen_dependencies_except(kernel, insn_id, except_insn_ids): def get_priority_tiers(wanted, priorities): # Get highest priority tier candidates: These are the first inames # of all the given priority constraints - candidates = set(next(iter(p for p in prio if p in wanted)) - for prio in priorities - ) + candidates = set() + for prio in priorities: + for p in prio: + if p in wanted: + candidates.add(p) + break # Now shrink this set by removing those inames that are 
prohibited # by other constraints @@ -383,19 +386,19 @@ def get_priority_tiers(wanted, priorities): candidates = candidates - set(bad_candidates) if candidates: - # We found a valid priority tier! + # We found a valid priority tier yield candidates else: - # If we did not, we stop the generator! + # If we did not, stop the generator return - # Now reduce the input data for recursion! + # Now reduce the input data for recursion priorities = frozenset([tuple(i for i in prio if i not in candidates) for prio in priorities ]) - frozenset([()]) wanted = wanted - candidates - # Yield recursively! + # Yield recursively for tier in get_priority_tiers(wanted, priorities): yield tier @@ -596,7 +599,8 @@ class SchedulerState(ImmutableRecord): .. attribute:: preschedule A sequence of schedule items that must be inserted into the - schedule, maintaining the same ordering + schedule, maintaining the same relative ordering. Newly scheduled + items may interleave this sequence. .. attribute:: prescheduled_insn_ids @@ -728,13 +732,15 @@ def generate_loop_schedules_internal( # }}} - # {{{ see if there are pending local barriers in the preschedule + # {{{ see if there are pending barriers in the preschedule - # Local barriers do not have associated instructions, so they need to - # be handled separately from instructions. + # Barriers that do not have an originating instruction are handled here. + # (These are automatically inserted by insert_barriers().) Barriers with + # originating instructions are handled as part of normal instruction + # scheduling below. 
if ( isinstance(next_preschedule_item, Barrier) - and next_preschedule_item.kind == "local"): + and next_preschedule_item.originating_insn_id is None): for result in generate_loop_schedules_internal( sched_state.copy( schedule=sched_state.schedule + (next_preschedule_item,), @@ -810,10 +816,7 @@ def generate_loop_schedules_internal( if insn_id in sched_state.prescheduled_insn_ids: if isinstance(next_preschedule_item, RunInstruction): next_preschedule_insn_id = next_preschedule_item.insn_id - elif ( - isinstance(next_preschedule_item, Barrier) - and next_preschedule_item.kind == "global"): - assert hasattr(next_preschedule_item, "originating_insn_id") + elif isinstance(next_preschedule_item, Barrier): assert next_preschedule_item.originating_insn_id is not None next_preschedule_insn_id = next_preschedule_item.originating_insn_id else: @@ -1073,28 +1076,6 @@ def generate_loop_schedules_internal( % iname) continue - if ( - not sched_state.within_subkernel - and iname not in sched_state.prescheduled_inames): - # Avoid messing up some orderings such as picking: - # - # EnterLoop(temporary.reload) - # CallKernel - # ... - # - # instead of - # - # CallKernel - # EnterLoop(temporary.reload) - # ... - # - # This serves a heuristic to catch some bad decisions early, the - # scheduler will not allow the first variant regardless. 
- if debug_mode: - print("scheduling '%s' prohibited because we are outside " - "a subkernel" % iname) - continue - currently_accessible_inames = ( active_inames_set | sched_state.parallel_inames) if ( @@ -1624,7 +1605,10 @@ def append_barrier_or_raise_error(schedule, dep, verify_only): comment = "for %s (%s)" % ( dep.variable, dep.dep_descr.format( tgt=dep.target.id, src=dep.source.id)) - schedule.append(Barrier(comment=comment, kind=dep.var_kind)) + schedule.append(Barrier( + comment=comment, + kind=dep.var_kind, + originating_insn_id=None)) def insert_barriers(kernel, schedule, kind, verify_only, level=0): @@ -1771,15 +1755,10 @@ def insert_barriers(kernel, schedule, kind, verify_only, level=0): # {{{ main scheduling entrypoint def generate_loop_schedules(kernel, debug_args={}): - import sys - rec_limit = sys.getrecursionlimit() - new_limit = max(rec_limit, len(kernel.instructions) * 2) - sys.setrecursionlimit(new_limit) - try: + from pytools import MinRecursionLimit + with MinRecursionLimit(len(kernel.instructions) * 2): for sched in generate_loop_schedules_inner(kernel, debug_args=debug_args): yield sched - finally: - sys.setrecursionlimit(rec_limit) def generate_loop_schedules_inner(kernel, debug_args={}): diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 6f82eadd70443facf729711cd922bd8b754a2065..3b4fed215a35fa13a52e3f3901955dab2621dff0 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -656,6 +656,29 @@ class CASTBuilder(ASTBuilderBase): lhs_expr, rhs_expr, lhs_dtype): raise NotImplementedError("atomic updates in %s" % type(self).__name__) + def emit_tuple_assignment(self, codegen_state, insn): + ecm = codegen_state.expression_to_code_mapper + + from cgen import Assign, block_if_necessary + assignments = [] + + for i, (assignee, parameter) in enumerate( + zip(insn.assignees, insn.expression.parameters)): + lhs_code = ecm(assignee, prec=PREC_NONE, type_context=None) + assignee_var_name = 
insn.assignee_var_names()[i] + lhs_var = codegen_state.kernel.get_var_descriptor(assignee_var_name) + lhs_dtype = lhs_var.dtype + + from loopy.expression import dtype_to_type_context + rhs_type_context = dtype_to_type_context( + codegen_state.kernel.target, lhs_dtype) + rhs_code = ecm(parameter, prec=PREC_NONE, + type_context=rhs_type_context, needed_dtype=lhs_dtype) + + assignments.append(Assign(lhs_code, rhs_code)) + + return block_if_necessary(assignments) + def emit_multiple_assignment(self, codegen_state, insn): ecm = codegen_state.expression_to_code_mapper @@ -682,6 +705,10 @@ class CASTBuilder(ASTBuilderBase): assert mangle_result.arg_dtypes is not None + if mangle_result.target_name == "loopy_make_tuple": + # This shortcut avoids actually having to emit a 'make_tuple' function. + return self.emit_tuple_assignment(codegen_state, insn) + from loopy.expression import dtype_to_type_context c_parameters = [ ecm(par, PREC_NONE, diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 01e56405e30285705be7cb8eb6d75479c8658ef5..a5f7562c41c3ec8eca673904550e078d2a992241 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -390,10 +390,11 @@ class OpenCLCASTBuilder(CASTBuilder): def preamble_generators(self): from loopy.library.reduction import reduction_preamble_generator + return ( super(OpenCLCASTBuilder, self).preamble_generators() + [ opencl_preamble_generator, - reduction_preamble_generator + reduction_preamble_generator, ]) # }}} diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 78d817ce73724d90a6cc6f380b24290971f6c1e7..409cbbc5ebd5feb13b04eeba1671f639663bfcf1 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -232,7 +232,8 @@ class TypeInferenceMapper(CombineMapper): # Codegen for complex types depends on exactly correct types. # Refuse temptation to guess. raise TypeInferenceFailure("Complex constant '%s' needs to " - "be sized for type inference " % expr) + "be sized (i.e.
as numpy.complex64/128) for type inference " + % expr) else: raise TypeInferenceFailure("Cannot deduce type of constant '%s'" % expr) diff --git a/test/test_loopy.py b/test/test_loopy.py index 2ac1026c0660573d97cd2f65c0502ec69b63803d..d7b1f37c18f71527bcb31f856d2f6d09fbc9df9a 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1046,6 +1046,24 @@ def test_within_inames_and_reduction(): print(k.stringify(with_dependencies=True)) +def test_literal_local_barrier(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{ [i]: 0<=i<n }", + """ + for i + ... lbarrier + end + """, seq_dependencies=True) + + knl = lp.fix_parameters(knl, n=128) + + ref_knl = knl + + lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5)) + + def test_kernel_splitting(ctx_factory): ctx = ctx_factory() @@ -1317,6 +1335,28 @@ def test_save_of_local_array(ctx_factory, debug=False): save_and_reload_temporaries_test(queue, knl, np.arange(8), debug) +def test_save_of_local_array_with_explicit_local_barrier(ctx_factory, debug=False): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "{ [i,j]: 0<=i,j<8 }", + """ + for i, j + <>t[2*j] = j + ... lbarrier + t[2*j+1] = t[2*j] + ... 
gbarrier + out[i] = t[2*i] + end + """, seq_dependencies=True) + + knl = lp.set_temporary_scope(knl, "t", "local") + knl = lp.tag_inames(knl, dict(i="g.0", j="l.0")) + + save_and_reload_temporaries_test(queue, knl, np.arange(8), debug) + + def test_save_local_multidim_array(ctx_factory, debug=False): ctx = ctx_factory() queue = cl.CommandQueue(ctx) @@ -2087,6 +2127,47 @@ def test_integer_reduction(ctx_factory): assert function(out) +def test_complicated_argmin_reduction(ctx_factory): + cl_ctx = ctx_factory() + knl = lp.make_kernel( + "{[ictr,itgt,idim]: " + "0<=itgt<ntargets " + "and 0<=ictr<ncenters " + "and 0<=idim<ambient_dim}", + + """ + for itgt + for ictr + <> dist_sq = sum(idim, + (tgt[idim,itgt] - center[idim,ictr])**2) + <> in_disk = dist_sq < (radius[ictr]*1.05)**2 + <> matches = ( + (in_disk + and qbx_forced_limit == 0) + or (in_disk + and qbx_forced_limit != 0 + and qbx_forced_limit * center_side[ictr] > 0) + ) + + <> post_dist_sq = if(matches, dist_sq, HUGE) + end + <> min_dist_sq, <> min_ictr = argmin(ictr, ictr, post_dist_sq) + + tgt_to_qbx_center[itgt] = if(min_dist_sq < HUGE, min_ictr, -1) + end + """) + + knl = lp.fix_parameters(knl, ambient_dim=2) + knl = lp.add_and_infer_dtypes(knl, { + "tgt,center,radius,HUGE": np.float32, + "center_side,qbx_forced_limit": np.int32, + }) + + lp.auto_test_vs_ref(knl, cl_ctx, knl, parameters={ + "HUGE": 1e20, "ncenters": 200, "ntargets": 300, + "qbx_forced_limit": 1}) + + def test_nosync_option_parsing(): knl = lp.make_kernel( "{[i]: 0 <= i < 10}", @@ -2335,6 +2416,21 @@ def test_kernel_var_name_generator(): assert vng("b") != "b" +def test_execution_backend_can_cache_dtypes(ctx_factory): + # When the kernel is invoked, the execution backend uses it as a cache key + # for the type inference and scheduling cache. This tests to make sure that + # dtypes in the kernel can be cached, even though they may not have a + # target. 
+ + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel("{[i]: 0 <= i < 10}", "<>tmp[i] = i") + knl = lp.add_dtypes(knl, dict(tmp=int)) + + knl(queue) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) diff --git a/test/test_target.py b/test/test_target.py index b656383e7bbe008892f45159faadd2d195d67a3b..ad0cb7439bfdd6200e020c0becadcd73072ceef4 100644 --- a/test/test_target.py +++ b/test/test_target.py @@ -176,6 +176,22 @@ def test_random123(ctx_factory, tp): assert (0 <= out).all() +def test_tuple(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "{ [i]: 0 = i }", + """ + a, b = make_tuple(1, 2.) + """) + + evt, (a, b) = knl(queue) + + assert a.get() == 1 + assert b.get() == 2. + + def test_clamp(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx)