diff --git a/loopy/library/function.py b/loopy/library/function.py index efa590371bb632cbc9776078ea6b5c64f626d46a..9d557ac9fe5c4c040608dc181b96daa812405a65 100644 --- a/loopy/library/function.py +++ b/loopy/library/function.py @@ -26,7 +26,7 @@ THE SOFTWARE. def default_function_mangler(kernel, name, arg_dtypes): from loopy.library.reduction import reduction_function_mangler - manglers = [reduction_function_mangler] + manglers = [reduction_function_mangler, tuple_function_mangler] for mangler in manglers: result = mangler(kernel, name, arg_dtypes) if result is not None: @@ -45,4 +45,15 @@ def single_arg_function_mangler(kernel, name, arg_dtypes): return None +def tuple_function_mangler(kernel, name, arg_dtypes): + if name == "make_tuple": + from loopy.kernel.data import CallMangleInfo + return CallMangleInfo( + target_name="loopy_make_tuple", + result_dtypes=arg_dtypes, + arg_dtypes=arg_dtypes) + + return None + + # vim: foldmethod=marker diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index f9648bde7dc4d685ca9daf63ecf15b69496c8651..0e5a093b76b8d09d331edead7c69fcc2e3134601 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -123,6 +123,7 @@ class ScalarReductionOperation(ReductionOperation): class SumReductionOperation(ScalarReductionOperation): def neutral_element(self, dtype): + # FIXME: Document that we always use an int here. return 0 def __call__(self, dtype, operand1, operand2): @@ -131,6 +132,7 @@ class SumReductionOperation(ScalarReductionOperation): class ProductReductionOperation(ScalarReductionOperation): def neutral_element(self, dtype): + # FIXME: Document that we always use an int here. return 1 def __call__(self, dtype, operand1, operand2): @@ -189,8 +191,30 @@ class MinReductionOperation(ScalarReductionOperation): return var("min")(operand1, operand2) +# {{{ base class for symbolic reduction ops + +class ReductionOpFunction(FunctionIdentifier): + init_arg_names = ("reduction_op",) + + def __init__(self, reduction_op): + self.reduction_op = reduction_op + + def __getinitargs__(self): + return (self.reduction_op,) + + @property + def name(self): + return self.__class__.__name__ + +# }}} + + # {{{ segmented reduction +class SegmentedOp(ReductionOpFunction): + pass + + class _SegmentedScalarReductionOperation(ReductionOperation): def __init__(self, **kwargs): self.inner_reduction = self.base_reduction_class(**kwargs) @@ -205,7 +229,9 @@ class _SegmentedScalarReductionOperation(ReductionOperation): segment_flag_dtype.numpy_dtype.type.__name__) def neutral_element(self, scalar_dtype, segment_flag_dtype): - return SegmentedFunction(self, (scalar_dtype, segment_flag_dtype), "init")() + scalar_neutral_element = self.inner_reduction.neutral_element(scalar_dtype) + return var("make_tuple")(scalar_neutral_element, + segment_flag_dtype.numpy_dtype.type(0)) def result_dtypes(self, kernel, scalar_dtype, segment_flag_dtype): return (self.inner_reduction.result_dtypes(kernel, scalar_dtype) @@ -221,7 +247,7 @@ class _SegmentedScalarReductionOperation(ReductionOperation): return type(self) == type(other) def __call__(self, dtypes, operand1, operand2): - return SegmentedFunction(self, dtypes, "update")(*(operand1 + operand2)) + return SegmentedOp(self)(*(operand1 + operand2)) class SegmentedSumReductionOperation(_SegmentedScalarReductionOperation): @@ -236,45 +262,14 @@ class SegmentedProductReductionOperation(_SegmentedScalarReductionOperation): which = "product" -class SegmentedFunction(FunctionIdentifier): - init_arg_names = ("reduction_op", "dtypes", "name") - - def __init__(self, reduction_op, dtypes, name): - """ - :arg dtypes: A :class:`tuple` of `(scalar_dtype, segment_flag_dtype)` - """ - self.reduction_op = reduction_op - self.dtypes = dtypes - self.name = name - - @property - def scalar_dtype(self): - return self.dtypes[0] - - @property - def segment_flag_dtype(self): - return self.dtypes[1] - - def __getinitargs__(self): - return (self.reduction_op, self.dtypes, self.name) - - -def get_segmented_function_preamble(kernel, func_id): +def get_segmented_function_preamble(kernel, func_id, arg_dtypes): op = func_id.reduction_op - prefix = op.prefix(func_id.scalar_dtype, func_id.segment_flag_dtype) - - from pymbolic.mapper.c_code import CCodeMapper - - c_code_mapper = CCodeMapper() + scalar_dtype = arg_dtypes[0] + segment_flag_dtype = arg_dtypes[1] + prefix = op.prefix(scalar_dtype, segment_flag_dtype) return (prefix, """ - inline %(scalar_t)s %(prefix)s_init(%(segment_flag_t)s *segment_flag_out) - { - *segment_flag_out = 0; - return %(neutral)s; - } - - inline %(scalar_t)s %(prefix)s_update( + inline %(scalar_t)s %(prefix)s_op( %(scalar_t)s op1, %(segment_flag_t)s segment_flag1, %(scalar_t)s op2, %(segment_flag_t)s segment_flag2, %(segment_flag_t)s *segment_flag_out) @@ -283,32 +278,36 @@ def get_segmented_function_preamble(kernel, func_id): return segment_flag2 ? op2 : %(combined)s; } """ % dict( - scalar_t=kernel.target.dtype_to_typename(func_id.scalar_dtype), + scalar_t=kernel.target.dtype_to_typename(scalar_dtype), prefix=prefix, - segment_flag_t=kernel.target.dtype_to_typename( - func_id.segment_flag_dtype), - neutral=c_code_mapper( - op.inner_reduction.neutral_element(func_id.scalar_dtype)), + segment_flag_t=kernel.target.dtype_to_typename(segment_flag_dtype), combined=op.op % ("op1", "op2"), )) - # }}} # {{{ argmin/argmax +class ArgExtOp(ReductionOpFunction): + pass + + class _ArgExtremumReductionOperation(ReductionOperation): def prefix(self, scalar_dtype, index_dtype): return "loopy_arg%s_%s_%s" % (self.which, - index_dtype.numpy_dtype.type.__name__, - scalar_dtype.numpy_dtype.type.__name__) + scalar_dtype.numpy_dtype.type.__name__, + index_dtype.numpy_dtype.type.__name__) def result_dtypes(self, kernel, scalar_dtype, index_dtype): return (scalar_dtype, index_dtype) def neutral_element(self, scalar_dtype, index_dtype): - return ArgExtFunction(self, (scalar_dtype, index_dtype), "init")() + scalar_neutral_func = ( + get_ge_neutral if self.neutral_sign < 0 else get_le_neutral) + scalar_neutral_element = scalar_neutral_func(scalar_dtype) + return var("make_tuple")(scalar_neutral_element, + index_dtype.numpy_dtype.type(-1)) def __str__(self): return self.which @@ -324,7 +323,7 @@ class _ArgExtremumReductionOperation(ReductionOperation): return 2 def __call__(self, dtypes, operand1, operand2): - return ArgExtFunction(self, dtypes, "update")(*(operand1 + operand2)) + return ArgExtOp(self)(*(operand1 + operand2)) class ArgMaxReductionOperation(_ArgExtremumReductionOperation): @@ -339,44 +338,15 @@ class ArgMinReductionOperation(_ArgExtremumReductionOperation): neutral_sign = +1 -class ArgExtFunction(FunctionIdentifier): - init_arg_names = ("reduction_op", "dtypes", "name") - - def __init__(self, reduction_op, dtypes, name): - self.reduction_op = reduction_op - self.dtypes = dtypes - self.name = name - - @property - def scalar_dtype(self): - return self.dtypes[0] - - @property - def index_dtype(self): - return self.dtypes[1] - - def __getinitargs__(self): - return (self.reduction_op, self.dtypes, self.name) - - -def get_argext_preamble(kernel, func_id): +def get_argext_preamble(kernel, func_id, arg_dtypes): op = func_id.reduction_op - prefix = op.prefix(func_id.scalar_dtype, func_id.index_dtype) - - from pymbolic.mapper.c_code import CCodeMapper + scalar_dtype = arg_dtypes[0] + index_dtype = arg_dtypes[1] - c_code_mapper = CCodeMapper() - - neutral = get_ge_neutral if op.neutral_sign < 0 else get_le_neutral + prefix = op.prefix(scalar_dtype, index_dtype) return (prefix, """ - inline %(scalar_t)s %(prefix)s_init(%(index_t)s *index_out) - { - *index_out = INT_MIN; - return %(neutral)s; - } - - inline %(scalar_t)s %(prefix)s_update( + inline %(scalar_t)s %(prefix)s_op( %(scalar_t)s op1, %(index_t)s index1, %(scalar_t)s op2, %(index_t)s index2, %(index_t)s *index_out) @@ -393,10 +363,9 @@ def get_argext_preamble(kernel, func_id): } } """ % dict( - scalar_t=kernel.target.dtype_to_typename(func_id.scalar_dtype), + scalar_t=kernel.target.dtype_to_typename(scalar_dtype), prefix=prefix, - index_t=kernel.target.dtype_to_typename(func_id.index_dtype), - neutral=c_code_mapper(neutral(func_id.scalar_dtype)), + index_t=kernel.target.dtype_to_typename(index_dtype), comp=op.update_comparison, )) @@ -454,76 +423,48 @@ def parse_reduction_op(name): def reduction_function_mangler(kernel, func_id, arg_dtypes): - if isinstance(func_id, ArgExtFunction) and func_id.name == "init": + if isinstance(func_id, ArgExtOp): from loopy.target.opencl import CTarget if not isinstance(kernel.target, CTarget): raise LoopyError("%s: only C-like targets supported for now" % func_id) op = func_id.reduction_op + scalar_dtype = arg_dtypes[0] + index_dtype = arg_dtypes[1] from loopy.kernel.data import CallMangleInfo return CallMangleInfo( - target_name="%s_init" % op.prefix( - func_id.scalar_dtype, func_id.index_dtype), + target_name="%s_op" % op.prefix( + scalar_dtype, index_dtype), result_dtypes=op.result_dtypes( - kernel, func_id.scalar_dtype, func_id.index_dtype), - arg_dtypes=(), - ) - - elif isinstance(func_id, ArgExtFunction) and func_id.name == "update": - from loopy.target.opencl import CTarget - if not isinstance(kernel.target, CTarget): - raise LoopyError("%s: only C-like targets supported for now" % func_id) - - op = func_id.reduction_op - - from loopy.kernel.data import CallMangleInfo - return CallMangleInfo( - target_name="%s_update" % op.prefix( - func_id.scalar_dtype, func_id.index_dtype), - result_dtypes=op.result_dtypes( - kernel, func_id.scalar_dtype, func_id.index_dtype), + kernel, scalar_dtype, index_dtype), arg_dtypes=( - func_id.scalar_dtype, - kernel.index_dtype, - func_id.scalar_dtype, - kernel.index_dtype), - ) - - elif isinstance(func_id, SegmentedFunction) and func_id.name == "init": - from loopy.target.opencl import CTarget - if not isinstance(kernel.target, CTarget): - raise LoopyError("%s: only C-like targets supported for now" % func_id) - - op = func_id.reduction_op - - from loopy.kernel.data import CallMangleInfo - return CallMangleInfo( - target_name="%s_init" % op.prefix( - func_id.scalar_dtype, func_id.segment_flag_dtype), - result_dtypes=op.result_dtypes( - kernel, func_id.scalar_dtype, func_id.segment_flag_dtype), - arg_dtypes=(), + scalar_dtype, + index_dtype, + scalar_dtype, + index_dtype), ) - elif isinstance(func_id, SegmentedFunction) and func_id.name == "update": + elif isinstance(func_id, SegmentedOp): from loopy.target.opencl import CTarget if not isinstance(kernel.target, CTarget): raise LoopyError("%s: only C-like targets supported for now" % func_id) op = func_id.reduction_op + scalar_dtype = arg_dtypes[0] + segment_flag_dtype = arg_dtypes[1] from loopy.kernel.data import CallMangleInfo return CallMangleInfo( - target_name="%s_update" % op.prefix( - func_id.scalar_dtype, func_id.segment_flag_dtype), + target_name="%s_op" % op.prefix( + scalar_dtype, segment_flag_dtype), result_dtypes=op.result_dtypes( - kernel, func_id.scalar_dtype, func_id.segment_flag_dtype), + kernel, scalar_dtype, segment_flag_dtype), arg_dtypes=( - func_id.scalar_dtype, - func_id.segment_flag_dtype, - func_id.scalar_dtype, - func_id.segment_flag_dtype), + scalar_dtype, + segment_flag_dtype, + scalar_dtype, + segment_flag_dtype), ) return None @@ -533,16 +474,18 @@ def reduction_preamble_generator(preamble_info): from loopy.target.opencl import OpenCLTarget for func in preamble_info.seen_functions: - if isinstance(func.name, ArgExtFunction): + if isinstance(func.name, ArgExtOp): if not isinstance(preamble_info.kernel.target, OpenCLTarget): raise LoopyError("only OpenCL supported for now") - yield get_argext_preamble(preamble_info.kernel, func.name) + yield get_argext_preamble(preamble_info.kernel, func.name, + func.arg_dtypes) - elif isinstance(func.name, SegmentedFunction): + elif isinstance(func.name, SegmentedOp): if not isinstance(preamble_info.kernel.target, OpenCLTarget): raise LoopyError("only OpenCL supported for now") - yield get_segmented_function_preamble(preamble_info.kernel, func.name) + yield get_segmented_function_preamble(preamble_info.kernel, func.name, + func.arg_dtypes) # vim: fdm=marker diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index e4835a3638b43619f9b93f45958382218b701642..e9457233f854fb47ec169cf69d988ab7aba4d6b4 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -648,6 +648,29 @@ class CASTBuilder(ASTBuilderBase): lhs_expr, rhs_expr, lhs_dtype): raise NotImplementedError("atomic updates in %s" % type(self).__name__) + def emit_tuple_assignment(self, codegen_state, insn): + ecm = codegen_state.expression_to_code_mapper + + from cgen import Assign, block_if_necessary + assignments = [] + + for i, (assignee, parameter) in enumerate( + zip(insn.assignees, insn.expression.parameters)): + lhs_code = ecm(assignee, prec=PREC_NONE, type_context=None) + assignee_var_name = insn.assignee_var_names()[i] + lhs_var = codegen_state.kernel.get_var_descriptor(assignee_var_name) + lhs_dtype = lhs_var.dtype + + from loopy.expression import dtype_to_type_context + rhs_type_context = dtype_to_type_context( + codegen_state.kernel.target, lhs_dtype) + rhs_code = ecm(parameter, prec=PREC_NONE, + type_context=rhs_type_context, needed_dtype=lhs_dtype) + + assignments.append(Assign(lhs_code, rhs_code)) + + return block_if_necessary(assignments) + def emit_multiple_assignment(self, codegen_state, insn): ecm = codegen_state.expression_to_code_mapper @@ -674,6 +697,9 @@ class CASTBuilder(ASTBuilderBase): assert mangle_result.arg_dtypes is not None + if mangle_result.target_name == "loopy_make_tuple": + return self.emit_tuple_assignment(codegen_state, insn) + from loopy.expression import dtype_to_type_context c_parameters = [ ecm(par, PREC_NONE, diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 01e56405e30285705be7cb8eb6d75479c8658ef5..a5f7562c41c3ec8eca673904550e078d2a992241 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -390,10 +390,11 @@ class OpenCLCASTBuilder(CASTBuilder): def preamble_generators(self): from loopy.library.reduction import reduction_preamble_generator + return ( super(OpenCLCASTBuilder, self).preamble_generators() + [ opencl_preamble_generator, - reduction_preamble_generator + reduction_preamble_generator, ]) # }}} diff --git a/test/test_loopy.py b/test/test_loopy.py index 61f992c5d566128a5cd73bc526c6222df3500997..78633abbd41408ae700aa8516e8a9c6f70f018a9 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2087,6 +2087,47 @@ def test_integer_reduction(ctx_factory): assert function(out) +def test_complicated_argmin_reduction(ctx_factory): + cl_ctx = ctx_factory() + knl = lp.make_kernel( + "{[ictr,itgt,idim]: " + "0<=itgt dist_sq = sum(idim, + (tgt[idim,itgt] - center[idim,ictr])**2) + <> in_disk = dist_sq < (radius[ictr]*1.05)**2 + <> matches = ( + (in_disk + and qbx_forced_limit == 0) + or (in_disk + and qbx_forced_limit != 0 + and qbx_forced_limit * center_side[ictr] > 0) + ) + + <> post_dist_sq = if(matches, dist_sq, HUGE) + end + <> min_dist_sq, <> min_ictr = argmin(ictr, ictr, post_dist_sq) + + tgt_to_qbx_center[itgt] = if(min_dist_sq < HUGE, min_ictr, -1) + end + """) + + knl = lp.fix_parameters(knl, ambient_dim=2) + knl = lp.add_and_infer_dtypes(knl, { + "tgt,center,radius,HUGE": np.float32, + "center_side,qbx_forced_limit": np.int32, + }) + + lp.auto_test_vs_ref(knl, cl_ctx, knl, parameters={ + "HUGE": 1e20, "ncenters": 200, "ntargets": 300, + "qbx_forced_limit": 1}) + + def test_nosync_option_parsing(): knl = lp.make_kernel( "{[i]: 0 <= i < 10}", diff --git a/test/test_target.py b/test/test_target.py index b656383e7bbe008892f45159faadd2d195d67a3b..ad0cb7439bfdd6200e020c0becadcd73072ceef4 100644 --- a/test/test_target.py +++ b/test/test_target.py @@ -176,6 +176,22 @@ def test_random123(ctx_factory, tp): assert (0 <= out).all() +def test_tuple(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "{ [i]: 0 = i }", + """ + a, b = make_tuple(1, 2.) + """) + + evt, (a, b) = knl(queue) + + assert a.get() == 1 + assert b.get() == 2. + + def test_clamp(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx)