From a8aa6521358255d3e5ede0bfb5968552e66503f0 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 18 Sep 2019 23:25:40 -0500 Subject: [PATCH] Merge 'kernel_callables_v3' into 'kernel_callables_v3-edit1' --- doc/tutorial.rst | 4 +- .../fortran/ipython-integration-demo.ipynb | 17 +- examples/fortran/matmul.floopy | 4 +- examples/fortran/sparse.floopy | 4 +- examples/fortran/tagging.floopy | 4 +- examples/fortran/volumeKernel.floopy | 4 +- loopy/__init__.py | 14 +- loopy/check.py | 8 +- loopy/frontend/fortran/__init__.py | 53 ++++- loopy/ipython_ext.py | 2 +- loopy/kernel/creation.py | 94 ++++---- loopy/kernel/instruction.py | 4 +- loopy/library/reduction.py | 193 ++++++++++++---- loopy/preprocess.py | 216 ++++++++++-------- loopy/program.py | 64 +++--- loopy/symbolic.py | 12 +- loopy/target/opencl.py | 16 +- loopy/transform/callable.py | 32 ++- loopy/transform/fusion.py | 5 + loopy/type_inference.py | 2 +- test/test_callables.py | 71 +++--- test/test_fortran.py | 8 +- test/test_numa_diff.py | 20 +- 23 files changed, 520 insertions(+), 331 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index befa5e30b..e6ef54b66 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1157,7 +1157,7 @@ this, :mod:`loopy` will complain that global barrier needs to be inserted: >>> cgr = lp.generate_code_v2(knl) Traceback (most recent call last): ... - loopy.diagnostic.MissingBarrierError: Dependency 'rotate depends on maketmp' (for variable 'arr') requires synchronization by a global barrier (add a 'no_sync_with' instruction option to state that no synchronization is needed) + loopy.diagnostic.MissingBarrierError: rotate_v1: Dependency 'rotate depends on maketmp' (for variable 'arr') requires synchronization by a global barrier (add a 'no_sync_with' instruction option to state that no synchronization is needed) The syntax for a inserting a global barrier instruction is ``... gbarrier``. :mod:`loopy` also supports manually inserting local @@ -1554,7 +1554,7 @@ information provided. Now we will count the operations: >>> op_map = lp.get_op_map(knl, subgroup_size=32) >>> print(lp.stringify_stats_mapping(op_map)) - Op(np:dtype('float32'), add, subgroup) : ... + Op(np:dtype('float32'), add, subgroup, loopy_kernel) : ... Each line of output will look roughly like:: diff --git a/examples/fortran/ipython-integration-demo.ipynb b/examples/fortran/ipython-integration-demo.ipynb index 7a5c8257b..1b0a9df8d 100644 --- a/examples/fortran/ipython-integration-demo.ipynb +++ b/examples/fortran/ipython-integration-demo.ipynb @@ -62,9 +62,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "split_amount = 128" @@ -91,7 +89,7 @@ "\n", "!$loopy begin\n", "!\n", - "! tr_fill, = lp.parse_fortran(SOURCE)\n", + "! tr_fill = lp.parse_fortran(SOURCE)\n", "! tr_fill = lp.split_iname(tr_fill, \"i\", split_amount,\n", "! outer_tag=\"g.0\", inner_tag=\"l.0\")\n", "! 
RESULT = [tr_fill]\n", @@ -107,15 +105,6 @@ "source": [ "print(tr_fill)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] } ], "metadata": { @@ -134,7 +123,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.4" + "version": "3.6.8" } }, "nbformat": 4, diff --git a/examples/fortran/matmul.floopy b/examples/fortran/matmul.floopy index 4b3552204..a8377bedd 100644 --- a/examples/fortran/matmul.floopy +++ b/examples/fortran/matmul.floopy @@ -13,7 +13,7 @@ subroutine dgemm(m,n,l,alpha,a,b,c) end subroutine !$loopy begin -! dgemm, = lp.parse_fortran(SOURCE, FILENAME) +! dgemm = lp.parse_fortran(SOURCE, FILENAME) ! dgemm = lp.split_iname(dgemm, "i", 16, ! outer_tag="g.0", inner_tag="l.1") ! dgemm = lp.split_iname(dgemm, "j", 8, @@ -24,5 +24,5 @@ end subroutine ! dgemm = lp.extract_subst(dgemm, "b_acc", "b[i1,i2]", parameters="i1, i2") ! dgemm = lp.precompute(dgemm, "a_acc", "k_inner,i_inner", default_tag="l.auto") ! dgemm = lp.precompute(dgemm, "b_acc", "j_inner,k_inner", default_tag="l.auto") -! RESULT = [dgemm] +! RESULT = dgemm !$loopy end diff --git a/examples/fortran/sparse.floopy b/examples/fortran/sparse.floopy index 18542e6b0..2b156bdd7 100644 --- a/examples/fortran/sparse.floopy +++ b/examples/fortran/sparse.floopy @@ -23,11 +23,11 @@ subroutine sparse(rowstarts, colindices, values, m, n, nvals, x, y) end !$loopy begin -! sparse, = lp.parse_fortran(SOURCE, FILENAME) +! sparse = lp.parse_fortran(SOURCE, FILENAME) ! sparse = lp.split_iname(sparse, "i", 128) ! sparse = lp.tag_inames(sparse, {"i_outer": "g.0"}) ! sparse = lp.tag_inames(sparse, {"i_inner": "l.0"}) ! sparse = lp.split_iname(sparse, "j", 4) ! sparse = lp.tag_inames(sparse, {"j_inner": "unr"}) -! RESULT = [sparse] +! RESULT = sparse !$loopy end diff --git a/examples/fortran/tagging.floopy b/examples/fortran/tagging.floopy index 87aacba68..c7ebb7566 100644 --- a/examples/fortran/tagging.floopy +++ b/examples/fortran/tagging.floopy @@ -23,13 +23,13 @@ end ! "factor 4.0", ! "real_type real*8", ! ]) -! fill, = lp.parse_fortran(SOURCE, FILENAME) +! fill = lp.parse_fortran(SOURCE, FILENAME) ! fill = lp.add_barrier(fill, "tag:init", "tag:mult", "gb1") ! fill = lp.split_iname(fill, "i", 128, ! outer_tag="g.0", inner_tag="l.0") ! fill = lp.split_iname(fill, "i_1", 128, ! outer_tag="g.0", inner_tag="l.0") -! RESULT = [fill] +! RESULT = fill ! !$loopy end diff --git a/examples/fortran/volumeKernel.floopy b/examples/fortran/volumeKernel.floopy index c5784b634..211c38049 100644 --- a/examples/fortran/volumeKernel.floopy +++ b/examples/fortran/volumeKernel.floopy @@ -67,7 +67,7 @@ end subroutine volumeKernel !$loopy begin ! -! volumeKernel, = lp.parse_fortran(SOURCE, FILENAME) +! volumeKernel = lp.parse_fortran(SOURCE, FILENAME) ! volumeKernel = lp.split_iname(volumeKernel, ! "e", 32, outer_tag="g.1", inner_tag="g.0") ! volumeKernel = lp.fix_parameters(volumeKernel, @@ -76,6 +76,6 @@ end subroutine volumeKernel ! i="l.0", j="l.1", k="l.2", ! i_1="l.0", j_1="l.1", k_1="l.2" ! )) -! RESULT = [volumeKernel] +! RESULT = volumeKernel ! 
!$loopy end diff --git a/loopy/__init__.py b/loopy/__init__.py index 1439cb1ff..058bc93ef 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -130,10 +130,10 @@ from loopy.type_inference import infer_unknown_types from loopy.preprocess import (preprocess_kernel, realize_reduction, preprocess_program) from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel -from loopy.statistics import (ToCountMap, CountGranularity, - Op, MemAccess, get_op_map, get_mem_access_map, - get_synchronization_map, - gather_access_footprints, gather_access_footprint_bytes) +from loopy.statistics import (ToCountMap, ToCountPolynomialMap, + CountGranularity, stringify_stats_mapping, Op, MemAccess, get_op_map, + get_mem_access_map, get_synchronization_map, + gather_access_footprints, gather_access_footprint_bytes, Sync) from loopy.codegen import ( PreambleInfo, generate_code, generate_code_v2, generate_body) @@ -269,9 +269,11 @@ __all__ = [ "PreambleInfo", "generate_code", "generate_code_v2", "generate_body", - "ToCountMap", "CountGranularity", "Op", - "MemAccess", "get_op_map", "get_mem_access_map", "get_synchronization_map", + "ToCountMap", "ToCountPolynomialMap", "CountGranularity", + "stringify_stats_mapping", "Op", "MemAccess", "get_op_map", + "get_mem_access_map", "get_synchronization_map", "gather_access_footprints", "gather_access_footprint_bytes", + "Sync", "CompiledKernel", diff --git a/loopy/check.py b/loopy/check.py index d1ee125df..83e4fd0af 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -33,8 +33,6 @@ from loopy.type_inference import TypeInferenceMapper from loopy.kernel.instruction import (MultiAssignmentBase, CallInstruction, CInstruction, _DataObliviousInstruction) -from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, - _DataObliviousInstruction) from functools import reduce import logging @@ -145,9 +143,9 @@ class SubscriptIndicesIsIntChecker(TypeInferenceMapper): return self.rec(expr.aggregate) -def check_for_integer_subscript_indices(kernel): +def check_for_integer_subscript_indices(kernel, callables_table): from pymbolic.primitives import Subscript - idx_int_checker = SubscriptIndicesIsIntChecker(kernel) + idx_int_checker = SubscriptIndicesIsIntChecker(kernel, callables_table) for insn in kernel.instructions: if isinstance(insn, MultiAssignmentBase): idx_int_checker(insn.expression, return_tuple=isinstance(insn, @@ -763,7 +761,7 @@ def pre_schedule_checks(kernel, callables_table): try: logger.debug("%s: pre-schedule check: start" % kernel.name) - check_for_integer_subscript_indices(kernel) + check_for_integer_subscript_indices(kernel, callables_table) check_for_duplicate_insn_ids(kernel) check_for_orphaned_user_hardware_axes(kernel) check_for_double_use_of_hw_axes(kernel, callables_table) diff --git a/loopy/frontend/fortran/__init__.py b/loopy/frontend/fortran/__init__.py index 3516ca29a..74c1ebf54 100644 --- a/loopy/frontend/fortran/__init__.py +++ b/loopy/frontend/fortran/__init__.py @@ -241,10 +241,54 @@ def parse_transformed_fortran(source, free_form=True, strict=True, return proc_dict["RESULT"] +def _add_assignees_to_calls(knl, all_kernels): + new_insns = [] + subroutine_dict = dict((kernel.name, kernel) for kernel in all_kernels) + from loopy.kernel.instruction import (Assignment, CallInstruction, + CInstruction, _DataObliviousInstruction, + modify_assignee_for_array_call) + from pymbolic.primitives import Call, Variable + + for insn in knl.instructions: + if isinstance(insn, CallInstruction): + if isinstance(insn.expression, Call) and ( + 
insn.expression.function.name in subroutine_dict): + assignees = [] + new_params = [] + subroutine = subroutine_dict[insn.expression.function.name] + for par, arg in zip(insn.expression.parameters, subroutine.args): + if arg.name in subroutine.get_written_variables(): + par = modify_assignee_for_array_call(par) + assignees.append(par) + if arg.name in subroutine.get_read_variables(): + new_params.append(par) + if arg.name not in (subroutine.get_written_variables() | + subroutine.get_read_variables()): + new_params.append(par) + + new_insns.append( + insn.copy( + assignees=tuple(assignees), + expression=Variable( + insn.expression.function.name)(*new_params))) + else: + new_insns.append(insn) + pass + elif isinstance(insn, (Assignment, CInstruction, + _DataObliviousInstruction)): + new_insns.append(insn) + else: + raise NotImplementedError(type(insn).__name__) + + return knl.copy(instructions=new_insns) + + def parse_fortran(source, filename="", free_form=None, strict=None, - seq_dependencies=None, auto_dependencies=None, target=None): + seq_dependencies=None, auto_dependencies=None, target=None, + return_list_of_knls=False): """ - :returns: a :class:`loopy.Program` + :returns: an instance of :class:`list` of :class:`loopy.LoopKernel`s if + *return_list_of_knls* is True else a :class:`loopy.Program`. """ parse_plog = ProcessLogger(logger, "parsing fortran file '%s'" % filename) @@ -286,6 +330,11 @@ def parse_fortran(source, filename="", free_form=None, strict=None, kernels = f2loopy.make_kernels(seq_dependencies=seq_dependencies) + if return_list_of_knls: + return kernels + + kernels = [_add_assignees_to_calls(knl, kernels) for knl in kernels] + from loopy.kernel.tools import identify_root_kernel from loopy.program import make_program from loopy.transform.callable import register_callable_kernel diff --git a/loopy/ipython_ext.py b/loopy/ipython_ext.py index ec1b10f1f..e44b183ed 100644 --- a/loopy/ipython_ext.py +++ b/loopy/ipython_ext.py @@ -9,7 +9,7 @@ import loopy as lp class LoopyMagics(Magics): @cell_magic def fortran_kernel(self, line, cell): - result = lp.parse_fortran(cell) + result = lp.parse_fortran(cell, return_list_of_knls=True) for knl in result: self.shell.user_ns[knl.name] = knl diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 1f896bb97..f36a90575 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -37,6 +37,7 @@ from loopy.kernel.data import ( SubstitutionRule, AddressSpace, ValueArg) from loopy.kernel.instruction import (CInstruction, _DataObliviousInstruction, CallInstruction) +from loopy.program import iterate_over_kernels_if_given_program from loopy.diagnostic import LoopyError, warn_with_kernel import islpy as isl from islpy import dim_type @@ -1753,6 +1754,7 @@ def add_inferred_inames(knl): # {{{ apply single-writer heuristic +@iterate_over_kernels_if_given_program def apply_single_writer_depencency_heuristic(kernel, warn_if_used=True): logger.debug("%s: default deps" % kernel.name) @@ -2175,56 +2177,55 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): # {{{ handle kernel language version - if not is_callee_kernel: - from loopy.version import LANGUAGE_VERSION_SYMBOLS + from loopy.version import LANGUAGE_VERSION_SYMBOLS - version_to_symbol = dict( - (getattr(loopy.version, lvs), lvs) - for lvs in LANGUAGE_VERSION_SYMBOLS) + version_to_symbol = dict( + (getattr(loopy.version, lvs), lvs) + for lvs in LANGUAGE_VERSION_SYMBOLS) - lang_version = kwargs.pop("lang_version", None) - if lang_version is None: - # 
{{{ peek into caller's module to look for LOOPY_KERNEL_LANGUAGE_VERSION + lang_version = kwargs.pop("lang_version", None) + if lang_version is None: + # {{{ peek into caller's module to look for LOOPY_KERNEL_LANGUAGE_VERSION - # This *is* gross. But it seems like the right thing interface-wise. - import inspect - caller_globals = inspect.currentframe().f_back.f_globals + # This *is* gross. But it seems like the right thing interface-wise. + import inspect + caller_globals = inspect.currentframe().f_back.f_globals - for ver_sym in LANGUAGE_VERSION_SYMBOLS: - try: - lang_version = caller_globals[ver_sym] - break - except KeyError: - pass + for ver_sym in LANGUAGE_VERSION_SYMBOLS: + try: + lang_version = caller_globals[ver_sym] + break + except KeyError: + pass - # }}} + # }}} - if lang_version is None: - from warnings import warn - from loopy.diagnostic import LoopyWarning - from loopy.version import ( - MOST_RECENT_LANGUAGE_VERSION, - FALLBACK_LANGUAGE_VERSION) - warn("'lang_version' was not passed to make_kernel(). " - "To avoid this warning, pass " - "lang_version={ver} in this invocation. " - "(Or say 'from loopy.version import " - "{sym_ver}' in " - "the global scope of the calling frame.)" - .format( - ver=MOST_RECENT_LANGUAGE_VERSION, - sym_ver=version_to_symbol[MOST_RECENT_LANGUAGE_VERSION] - ), - LoopyWarning, stacklevel=2) - - lang_version = FALLBACK_LANGUAGE_VERSION - - if lang_version not in version_to_symbol: - raise LoopyError("Language version '%s' is not known." % (lang_version,)) - if lang_version >= (2018, 1): - options = options.copy(enforce_variable_access_ordered=True) - if lang_version >= (2018, 2): - options = options.copy(ignore_boostable_into=True) + if lang_version is None: + from warnings import warn + from loopy.diagnostic import LoopyWarning + from loopy.version import ( + MOST_RECENT_LANGUAGE_VERSION, + FALLBACK_LANGUAGE_VERSION) + warn("'lang_version' was not passed to make_kernel(). " + "To avoid this warning, pass " + "lang_version={ver} in this invocation. " + "(Or say 'from loopy.version import " + "{sym_ver}' in " + "the global scope of the calling frame.)" + .format( + ver=MOST_RECENT_LANGUAGE_VERSION, + sym_ver=version_to_symbol[MOST_RECENT_LANGUAGE_VERSION] + ), + LoopyWarning, stacklevel=2) + + lang_version = FALLBACK_LANGUAGE_VERSION + + if lang_version not in version_to_symbol: + raise LoopyError("Language version '%s' is not known." % (lang_version,)) + if lang_version >= (2018, 1): + options = options.copy(enforce_variable_access_ordered=True) + if lang_version >= (2018, 2): + options = options.copy(ignore_boostable_into=True) # }}} @@ -2382,11 +2383,6 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): def make_function(*args, **kwargs): - lang_version = kwargs.pop('lang_version', None) - if lang_version: - raise LoopyError("lang_version should be set for program, not " - "functions.") - kwargs['is_callee_kernel'] = True return make_kernel(*args, **kwargs) diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index 9d85f5e84..1ba0dc7ec 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -1208,7 +1208,7 @@ def is_array_call(assignees, expression): return False -def modify_assignee_assignee_for_array_call(assignee): +def modify_assignee_for_array_call(assignee): """ Converts the assignee subscript or variable as a SubArrayRef. """ @@ -1258,7 +1258,7 @@ def make_assignment(assignees, expression, temp_var_types=None, **kwargs): # assignee as an instance of SubArrayRef. 
If not given as a # SubArrayRef return CallInstruction( - assignees=tuple(modify_assignee_assignee_for_array_call( + assignees=tuple(modify_assignee_for_array_call( assignee) for assignee in assignees), expression=expression, temp_var_types=temp_var_types, diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 6c6a0dd9b..504493f4d 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -51,7 +51,7 @@ class ReductionOperation(object): def arg_count(self): raise NotImplementedError - def neutral_element(self, *dtypes): + def neutral_element(self, dtypes, callables_table, target): raise NotImplementedError def __hash__(self): @@ -84,9 +84,6 @@ class ReductionOperation(object): raise LoopyError("unable to parse reduction type: '%s'" % op_type) - def get_scalar_callables(self): - return frozenset() - class ScalarReductionOperation(ReductionOperation): def __init__(self, forced_result_type=None): @@ -128,29 +125,43 @@ class ScalarReductionOperation(ReductionOperation): class SumReductionOperation(ScalarReductionOperation): - def neutral_element(self, dtype): + def neutral_element(self, dtype, callables_table, target): # FIXME: Document that we always use an int here. - return 0 + from loopy import auto + if dtype not in [None, auto] and dtype.numpy_dtype.kind == 'f': + return 0.0, callables_table - def __call__(self, dtype, operand1, operand2): - return operand1 + operand2 + return 0, callables_table + + def __call__(self, dtype, operand1, operand2, callables_table, target): + return operand1 + operand2, callables_table class ProductReductionOperation(ScalarReductionOperation): - def neutral_element(self, dtype): + def neutral_element(self, dtype, callables_table, target): # FIXME: Document that we always use an int here. 
- return 1 + from loopy import auto + if dtype not in [None, auto] and dtype.numpy_dtype.kind == 'f': + return 1.0, callables_table - def __call__(self, dtype, operand1, operand2): - return operand1 * operand2 + return 1, callables_table + + def __call__(self, dtype, operand1, operand2, callables_table, target): + return operand1 * operand2, callables_table def get_le_neutral(dtype): """Return a number y that satisfies (x <= y) for all y.""" if dtype.numpy_dtype.kind == "f": - # OpenCL 1.1, section 6.11.2 - return var("INFINITY") + # OpenCL 1.2, section 6.12.2 + if dtype.numpy_dtype.itemsize == 4: + #float + return var("INFINITY") + elif dtype.numpy_dtype.itemsize == 8: + #double + return var("HUGE_VAL") + elif dtype.numpy_dtype.kind == "i": # OpenCL 1.1, section 6.11.3 if dtype.numpy_dtype.itemsize == 4: @@ -167,8 +178,13 @@ def get_ge_neutral(dtype): """Return a number y that satisfies (x >= y) for all y.""" if dtype.numpy_dtype.kind == "f": - # OpenCL 1.1, section 6.11.2 - return -var("INFINITY") + # OpenCL 1.2, section 6.12.2 + if dtype.numpy_dtype.itemsize == 4: + #float + return -var("INFINITY") + elif dtype.numpy_dtype.itemsize == 8: + #double + return -var("HUGE_VAL") elif dtype.numpy_dtype.kind == "i": # OpenCL 1.1, section 6.11.3 if dtype.numpy_dtype.itemsize == 4: @@ -182,25 +198,53 @@ def get_ge_neutral(dtype): class MaxReductionOperation(ScalarReductionOperation): - def neutral_element(self, dtype): - return get_ge_neutral(dtype) + def neutral_element(self, dtype, callables_table, target): + return get_ge_neutral(dtype), callables_table - def __call__(self, dtype, operand1, operand2): - return ResolvedFunction("max")(operand1, operand2) + def __call__(self, dtype, operand1, operand2, callables_table, target): + dtype, = dtype + + # getting the callable 'max' from target + from loopy.program import (find_in_knl_callable_from_identifier, + _default_func_id_to_kernel_callable_mappers) + max_scalar_callable = find_in_knl_callable_from_identifier( + _default_func_id_to_kernel_callable_mappers(target), + target, "max") + + # type specialize the callable + max_scalar_callable, callables_table = max_scalar_callable.with_types( + {0: dtype, 1: dtype}, None, callables_table) - def get_scalar_callables(self): - return frozenset(["max"]) + # populate callables_table + callables_table, func_id = callables_table.with_added_callable( + 'max', max_scalar_callable) + + return ResolvedFunction(func_id)(operand1, operand2), callables_table class MinReductionOperation(ScalarReductionOperation): - def neutral_element(self, dtype): - return get_le_neutral(dtype) + def neutral_element(self, dtype, callables_table, target): + return get_le_neutral(dtype), callables_table - def __call__(self, dtype, operand1, operand2): - return ResolvedFunction("min")(operand1, operand2) + def __call__(self, dtype, operand1, operand2, callables_table, target): + dtype, = dtype + + # getting the callable 'max' from target + from loopy.program import (find_in_knl_callable_from_identifier, + _default_func_id_to_kernel_callable_mappers) + min_scalar_callable = find_in_knl_callable_from_identifier( + _default_func_id_to_kernel_callable_mappers(target), + target, "min") + + # type specialize the callable + min_scalar_callable, callables_table = min_scalar_callable.with_types( + {0: dtype, 1: dtype}, None, callables_table) - def get_scalar_callables(self): - return frozenset(["min"]) + # populate callables_table + callables_table, func_id = callables_table.with_added_callable( + 'min', min_scalar_callable) + + return 
ResolvedFunction(func_id)(operand1, operand2), callables_table # {{{ base class for symbolic reduction ops @@ -259,10 +303,26 @@ class _SegmentedScalarReductionOperation(ReductionOperation): scalar_dtype.numpy_dtype.type.__name__, segment_flag_dtype.numpy_dtype.type.__name__) - def neutral_element(self, scalar_dtype, segment_flag_dtype): - scalar_neutral_element = self.inner_reduction.neutral_element(scalar_dtype) - return ResolvedFunction("make_tuple")(scalar_neutral_element, - segment_flag_dtype.numpy_dtype.type(0)) + def neutral_element(self, scalar_dtype, segment_flag_dtype, + callables_table, target): + scalar_neutral_element, calables_table = ( + self.inner_reduction.neutral_element( + scalar_dtype, callables_table, target)) + + from loopy.program import (find_in_knl_callable_from_identifier, + _default_func_id_to_kernel_callable_mappers) + make_tuple_scalar_callable = find_in_knl_callable_from_identifier( + _default_func_id_to_kernel_callable_mappers(target), + target, "make_tuple") + make_tuple_scalar_callable, _ = ( + make_tuple_scalar_callable.with_types( + dict(enumerate([scalar_dtype, segment_flag_dtype])), None, + None)) + callables_table, func_id = callables_table.with_added_callable( + "make_tuple", make_tuple_scalar_callable) + + return ResolvedFunction(func_id)(scalar_neutral_element, + segment_flag_dtype.numpy_dtype.type(0)), callables_table def result_dtypes(self, kernel, scalar_dtype, segment_flag_dtype): return (self.inner_reduction.result_dtypes(kernel, scalar_dtype) @@ -277,11 +337,27 @@ class _SegmentedScalarReductionOperation(ReductionOperation): def __eq__(self, other): return type(self) == type(other) - def __call__(self, dtypes, operand1, operand2): - return ResolvedFunction(SegmentedOp(self))(*(operand1 + operand2)) + def __call__(self, dtypes, operand1, operand2, callables_table, target): + # getting the callable 'max' from target + + from loopy.program import (find_in_knl_callable_from_identifier, + _default_func_id_to_kernel_callable_mappers) + segmented_scalar_callable = find_in_knl_callable_from_identifier( + _default_func_id_to_kernel_callable_mappers(target), + target, SegmentedOp(self)) + + # type specialize the callable + segmented_scalar_callable, callables_table = ( + segmented_scalar_callable.with_types( + {0: dtypes[0], 1: dtypes[1], 2: dtypes[0], 3: dtypes[1]}, + None, callables_table)) - def get_scalar_callables(self): - return frozenset(["make_tuple", SegmentedOp(self)]) + # populate callables_table + callables_table, func_id = callables_table.with_added_callable( + SegmentedOp(self), segmented_scalar_callable) + + return (ResolvedFunction(func_id)(*(operand1 + operand2)), + callables_table) class SegmentedSumReductionOperation(_SegmentedScalarReductionOperation): @@ -335,12 +411,27 @@ class _ArgExtremumReductionOperation(ReductionOperation): def result_dtypes(self, kernel, scalar_dtype, index_dtype): return (scalar_dtype, index_dtype) - def neutral_element(self, scalar_dtype, index_dtype): + def neutral_element(self, scalar_dtype, index_dtype, callables_table, + target): scalar_neutral_func = ( get_ge_neutral if self.neutral_sign < 0 else get_le_neutral) scalar_neutral_element = scalar_neutral_func(scalar_dtype) - return ResolvedFunction("make_tuple")(scalar_neutral_element, - index_dtype.numpy_dtype.type(-1)) + + from loopy.program import (find_in_knl_callable_from_identifier, + _default_func_id_to_kernel_callable_mappers) + + make_tuple_scalar_callable = find_in_knl_callable_from_identifier( + 
_default_func_id_to_kernel_callable_mappers(target), + target, "make_tuple") + make_tuple_scalar_callable, _ = ( + make_tuple_scalar_callable.with_types( + dict(enumerate([scalar_dtype, index_dtype])), None, + None)) + callables_table, func_id = callables_table.with_added_callable( + "make_tuple", make_tuple_scalar_callable) + + return ResolvedFunction(func_id)(scalar_neutral_element, + index_dtype.numpy_dtype.type(-1)), callables_table def __str__(self): return self.which @@ -355,11 +446,27 @@ class _ArgExtremumReductionOperation(ReductionOperation): def arg_count(self): return 2 - def __call__(self, dtypes, operand1, operand2): - return ResolvedFunction(ArgExtOp(self))(*(operand1 + operand2)) + def __call__(self, dtypes, operand1, operand2, callables_table, target): + # getting the callable 'max' from target + + from loopy.program import (find_in_knl_callable_from_identifier, + _default_func_id_to_kernel_callable_mappers) + arg_ext_scalar_callable = find_in_knl_callable_from_identifier( + _default_func_id_to_kernel_callable_mappers(target), + target, ArgExtOp(self)) + + # type specialize the callable + arg_ext_scalar_callable, callables_table = ( + arg_ext_scalar_callable.with_types( + {0: dtypes[0], 1: dtypes[1], 2: dtypes[0], 3: dtypes[1]}, + None, callables_table)) + + # populate callables_table + callables_table, func_id = callables_table.with_added_callable( + ArgExtOp(self), arg_ext_scalar_callable) - def get_scalar_callables(self): - return frozenset([self.which, "make_tuple", ArgExtOp(self)]) + return (ResolvedFunction(func_id)(*(operand1 + operand2)), + callables_table) class ArgMaxReductionOperation(_ArgExtremumReductionOperation): diff --git a/loopy/preprocess.py b/loopy/preprocess.py index de620ef9a..c6b69da83 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -38,8 +38,7 @@ from loopy.version import DATA_MODEL_VERSION from loopy.kernel.data import make_assignment, filter_iname_tags_by_type # for the benefit of loopy.statistics, for now from loopy.type_inference import infer_unknown_types -from loopy.symbolic import RuleAwareIdentityMapper - +from loopy.symbolic import RuleAwareIdentityMapper, ReductionCallbackMapper from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, CallInstruction, _DataObliviousInstruction) from loopy.program import Program, iterate_over_kernels_if_given_program @@ -899,6 +898,18 @@ def _insert_subdomain_into_domain_tree(kernel, domains, subdomain): # }}} +class RealizeReductionCallbackMapper(ReductionCallbackMapper): + def __init__(self, callback, callables_table): + super(RealizeReductionCallbackMapper, self).__init__( + callback) + self.callables_table = callables_table + + def map_reduction(self, expr, **kwargs): + result, self.callables_table = self.callback(expr, self.rec, + **kwargs) + return result + + def realize_reduction_for_single_kernel(kernel, callables_table, insn_id_filter=None, unknown_types_ok=True, automagic_scans_ok=False, force_scan=False, force_outer_iname_for_scan=None): @@ -1046,13 +1057,16 @@ def realize_reduction_for_single_kernel(kernel, callables_table, init_id = insn_id_gen( "%s_%s_init" % (insn.id, "_".join(expr.inames))) + expression, callables_table = expr.operation.neutral_element( + *arg_dtypes, callables_table=callables_table, target=kernel.target) + init_insn = make_assignment( id=init_id, assignees=acc_vars, within_inames=outer_insn_inames - frozenset(expr.inames), within_inames_is_final=insn.within_inames_is_final, depends_on=init_insn_depends_on, - 
expression=expr.operation.neutral_element(*arg_dtypes), + expression=expression, predicates=insn.predicates,) generated_insns.append(init_insn) @@ -1087,13 +1101,17 @@ def realize_reduction_for_single_kernel(kernel, callables_table, else: reduction_expr = expr.expr + expression, callables_table = expr.operation( + arg_dtypes, + _strip_if_scalar(acc_vars, acc_vars), + reduction_expr, + callables_table, + kernel.target) + reduction_insn = make_assignment( id=update_id, assignees=acc_vars, - expression=expr.operation( - arg_dtypes, - _strip_if_scalar(acc_vars, acc_vars), - reduction_expr), + expression=expression, depends_on=frozenset(reduction_insn_depends_on) | insn.depends_on, within_inames=update_insn_iname_deps, within_inames_is_final=insn.within_inames_is_final, @@ -1105,9 +1123,9 @@ def realize_reduction_for_single_kernel(kernel, callables_table, if nresults == 1: assert len(acc_vars) == 1 - return acc_vars[0] + return acc_vars[0], callables_table else: - return acc_vars + return acc_vars, callables_table # }}} @@ -1190,7 +1208,8 @@ def realize_reduction_for_single_kernel(kernel, callables_table, base_iname_deps = outer_insn_inames - frozenset(expr.inames) - neutral = expr.operation.neutral_element(*arg_dtypes) + neutral, callables_table = expr.operation.neutral_element(*arg_dtypes, + callables_table=callables_table, target=kernel.target) init_id = insn_id_gen("%s_%s_init" % (insn.id, red_iname)) init_insn = make_assignment( id=init_id, @@ -1243,17 +1262,20 @@ def realize_reduction_for_single_kernel(kernel, callables_table, reduction_expr = expr.expr transfer_id = insn_id_gen("%s_%s_transfer" % (insn.id, red_iname)) + expression, callables_table = expr.operation( + arg_dtypes, + _strip_if_scalar( + neutral_var_names, + tuple(var(nvn) for nvn in neutral_var_names)), + reduction_expr, + callables_table, + kernel.target) transfer_insn = make_assignment( id=transfer_id, assignees=tuple( acc_var[outer_local_iname_vars + (var(red_iname),)] for acc_var in acc_vars), - expression=expr.operation( - arg_dtypes, - _strip_if_scalar( - neutral_var_names, - tuple(var(nvn) for nvn in neutral_var_names)), - reduction_expr), + expression=expression, within_inames=( (outer_insn_inames - frozenset(expr.inames)) | frozenset([red_iname])), @@ -1282,22 +1304,26 @@ def realize_reduction_for_single_kernel(kernel, callables_table, new_iname_tags[stage_exec_iname] = kernel.iname_tags(red_iname) stage_id = insn_id_gen("red_%s_stage_%d" % (red_iname, istage)) + expression, callables_table = expr.operation( + arg_dtypes, + _strip_if_scalar(acc_vars, tuple( + acc_var[ + outer_local_iname_vars + (var(stage_exec_iname),)] + for acc_var in acc_vars)), + _strip_if_scalar(acc_vars, tuple( + acc_var[ + outer_local_iname_vars + ( + var(stage_exec_iname) + new_size,)] + for acc_var in acc_vars)), + callables_table, + kernel.target) + stage_insn = make_assignment( id=stage_id, assignees=tuple( acc_var[outer_local_iname_vars + (var(stage_exec_iname),)] for acc_var in acc_vars), - expression=expr.operation( - arg_dtypes, - _strip_if_scalar(acc_vars, tuple( - acc_var[ - outer_local_iname_vars + (var(stage_exec_iname),)] - for acc_var in acc_vars)), - _strip_if_scalar(acc_vars, tuple( - acc_var[ - outer_local_iname_vars + ( - var(stage_exec_iname) + new_size,)] - for acc_var in acc_vars))), + expression=expression, within_inames=( base_iname_deps | frozenset([stage_exec_iname])), within_inames_is_final=insn.within_inames_is_final, @@ -1318,9 +1344,10 @@ def realize_reduction_for_single_kernel(kernel, callables_table, if 
nresults == 1: assert len(acc_vars) == 1 - return acc_vars[0][outer_local_iname_vars + (0,)] + return acc_vars[0][outer_local_iname_vars + (0,)], callables_table else: - return [acc_var[outer_local_iname_vars + (0,)] for acc_var in acc_vars] + return [acc_var[outer_local_iname_vars + (0,)] for acc_var in + acc_vars], callables_table # }}} # {{{ utils (stateful) @@ -1414,6 +1441,9 @@ def realize_reduction_for_single_kernel(kernel, callables_table, if global_barrier is not None: init_insn_depends_on |= frozenset([global_barrier]) + expression, callables_table = expr.operation.neutral_element( + *arg_dtypes, callables_table=callables_table, target=kernel.target) + init_insn = make_assignment( id=init_id, assignees=acc_vars, @@ -1421,7 +1451,7 @@ def realize_reduction_for_single_kernel(kernel, callables_table, (sweep_iname,) + expr.inames), within_inames_is_final=insn.within_inames_is_final, depends_on=init_insn_depends_on, - expression=expr.operation.neutral_element(*arg_dtypes), + expression=expression, predicates=insn.predicates, ) @@ -1440,13 +1470,17 @@ def realize_reduction_for_single_kernel(kernel, callables_table, if insn.within_inames_is_final: update_insn_iname_deps = insn.within_inames | set([track_iname]) + expression, callables_table = expr.operation( + arg_dtypes, + _strip_if_scalar(acc_vars, acc_vars), + _strip_if_scalar(acc_vars, updated_inner_exprs), + callables_table, + kernel.target) + scan_insn = make_assignment( id=update_id, assignees=acc_vars, - expression=expr.operation( - arg_dtypes, - _strip_if_scalar(acc_vars, acc_vars), - _strip_if_scalar(acc_vars, updated_inner_exprs)), + expression=expression, depends_on=frozenset(update_insn_depends_on), within_inames=update_insn_iname_deps, no_sync_with=insn.no_sync_with, @@ -1460,9 +1494,9 @@ def realize_reduction_for_single_kernel(kernel, callables_table, if nresults == 1: assert len(acc_vars) == 1 - return acc_vars[0] + return acc_vars[0], callables_table else: - return acc_vars + return acc_vars, callables_table # }}} @@ -1536,7 +1570,8 @@ def realize_reduction_for_single_kernel(kernel, callables_table, base_iname_deps = (outer_insn_inames - frozenset(expr.inames) - frozenset([sweep_iname])) - neutral = expr.operation.neutral_element(*arg_dtypes) + neutral, callables_table = expr.operation.neutral_element( + *arg_dtypes, callables_table=callables_table, target=kernel.target) init_insn_depends_on = insn.depends_on @@ -1635,19 +1670,23 @@ def realize_reduction_for_single_kernel(kernel, callables_table, write_stage_id = insn_id_gen( "scan_%s_write_stage_%d" % (scan_iname, istage)) + + expression, callables_table = expr.operation( + arg_dtypes, + _strip_if_scalar(acc_vars, read_vars), + _strip_if_scalar(acc_vars, tuple( + acc_var[ + outer_local_iname_vars + (var(stage_exec_iname),)] + for acc_var in acc_vars)), + callables_table, + kernel.target) + write_stage_insn = make_assignment( id=write_stage_id, assignees=tuple( acc_var[outer_local_iname_vars + (var(stage_exec_iname),)] for acc_var in acc_vars), - expression=expr.operation( - arg_dtypes, - _strip_if_scalar(acc_vars, read_vars), - _strip_if_scalar(acc_vars, tuple( - acc_var[ - outer_local_iname_vars + (var(stage_exec_iname),)] - for acc_var in acc_vars)) - ), + expression=expression, within_inames=( base_iname_deps | frozenset([stage_exec_iname])), within_inames_is_final=insn.within_inames_is_final, @@ -1668,10 +1707,11 @@ def realize_reduction_for_single_kernel(kernel, callables_table, if nresults == 1: assert len(acc_vars) == 1 - return acc_vars[0][outer_local_iname_vars 
+ (output_idx,)] + return (acc_vars[0][outer_local_iname_vars + (output_idx,)], + callables_table) else: return [acc_var[outer_local_iname_vars + (output_idx,)] - for acc_var in acc_vars] + for acc_var in acc_vars], callables_table # }}} @@ -1765,7 +1805,7 @@ def realize_reduction_for_single_kernel(kernel, callables_table, # to reduce over. It's rather similar to an array with () shape in # numpy.) - return expr.expr + return expr.expr, callables_table # }}} @@ -1833,8 +1873,7 @@ def realize_reduction_for_single_kernel(kernel, callables_table, # }}} - from loopy.symbolic import ReductionCallbackMapper - cb_mapper = ReductionCallbackMapper(map_reduction) + cb_mapper = RealizeReductionCallbackMapper(map_reduction, callables_table) insn_queue = kernel.instructions[:] insn_id_replacements = {} @@ -1862,13 +1901,14 @@ def realize_reduction_for_single_kernel(kernel, callables_table, # Run reduction expansion. from loopy.symbolic import Reduction if isinstance(insn.expression, Reduction) and nresults > 1: + # FIXME[KK]: With the new mapper emitting callables_table + # something should be done. new_expressions = cb_mapper(insn.expression, callables_table=callables_table, nresults=nresults) else: - new_expressions = ( - cb_mapper(insn.expression, - callables_table=callables_table),) + new_expressions = cb_mapper(insn.expression, + callables_table=callables_table), if generated_insns: # An expansion happened, so insert the generated stuff plus @@ -1955,32 +1995,28 @@ def realize_reduction_for_single_kernel(kernel, callables_table, _hackily_ensure_multi_assignment_return_values_are_scoped_private( kernel)) - return kernel + return kernel, cb_mapper.callables_table def realize_reduction(program, *args, **kwargs): assert isinstance(program, Program) - new_resolved_functions = {} - for func_id, in_knl_callable in program.callables_table.items(): - if isinstance(in_knl_callable, CallableKernel): - new_subkernel = realize_reduction_for_single_kernel( - in_knl_callable.subkernel, program.callables_table, - *args, **kwargs) - in_knl_callable = in_knl_callable.copy( - subkernel=new_subkernel) - - elif isinstance(in_knl_callable, ScalarCallable): - pass - else: - raise NotImplementedError("Unknown type of callable %s." % ( - type(in_knl_callable).__name__)) - - new_resolved_functions[func_id] = in_knl_callable + callables_table = program.callables_table.copy() + kernels_to_scan = [in_knl_callable.subkernel for in_knl_callable in + program.callables_table.values() if isinstance(in_knl_callable, + CallableKernel)] + + for knl in kernels_to_scan: + new_knl, callables_table = realize_reduction_for_single_kernel( + knl, callables_table, *args, **kwargs) + in_knl_callable = callables_table[knl.name].copy( + subkernel=new_knl) + resolved_functions = callables_table.resolved_functions.copy() + resolved_functions[knl.name] = in_knl_callable + callables_table = callables_table.copy( + resolved_functions=resolved_functions) - new_callables_table = program.callables_table.copy( - resolved_functions=new_resolved_functions) - return program.copy(callables_table=new_callables_table) + return program.copy(callables_table=callables_table) # }}} @@ -2338,9 +2374,6 @@ def preprocess_single_kernel(kernel, callables_table, device=None): # }}} - from loopy.transform.subst import expand_subst - kernel = expand_subst(kernel) - # Ordering restriction: # Type inference and reduction iname uniqueness don't handle substitutions. # Get them out of the way. 
@@ -2348,20 +2381,6 @@ def preprocess_single_kernel(kernel, callables_table, device=None): check_for_writes_to_predicates(kernel) check_reduction_iname_uniqueness(kernel) - from loopy.kernel.creation import apply_single_writer_depencency_heuristic - kernel = apply_single_writer_depencency_heuristic(kernel) - - # Ordering restrictions: - # - # - realize_reduction must happen after type inference because it needs - # to be able to determine the types of the reduced expressions. - # - # - realize_reduction must happen after default dependencies are added - # because it manipulates the depends_on field, which could prevent - # defaults from being applied. - kernel = realize_reduction_for_single_kernel(kernel, - callables_table, unknown_types_ok=False) - # Ordering restriction: # add_axes_to_temporaries_for_ilp because reduction accumulators # need to be duplicated by this. @@ -2451,6 +2470,23 @@ def preprocess_program(program, device=None): program = infer_unknown_types(program, expect_completion=False) + from loopy.transform.subst import expand_subst + program = expand_subst(program) + + from loopy.kernel.creation import apply_single_writer_depencency_heuristic + program = apply_single_writer_depencency_heuristic(program) + + # Ordering restrictions: + # + # - realize_reduction must happen after type inference because it needs + # to be able to determine the types of the reduced expressions. + # + # - realize_reduction must happen after default dependencies are added + # because it manipulates the depends_on field, which could prevent + # defaults from being applied. + + program = realize_reduction(program, unknown_types_ok=False) + # {{{ preprocess callable kernels # Callable editing restrictions: diff --git a/loopy/program.py b/loopy/program.py index 1fb691531..191a13fa1 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -56,6 +56,25 @@ __doc__ = """ """ +def find_in_knl_callable_from_identifier( + function_id_to_in_knl_callable_mappers, target, identifier): + """ + Returns an instance of + :class:`loopy.kernel.function_interface.InKernelCallable` if the + :arg:`identifier` is known to any kernel function scoper, otherwise returns + *None*. + """ + for func_id_to_in_knl_callable_mapper in ( + function_id_to_in_knl_callable_mappers): + # fixme: do we really need to given target for the function + in_knl_callable = func_id_to_in_knl_callable_mapper( + target, identifier) + if in_knl_callable is not None: + return in_knl_callable + + return None + + class ResolvedFunctionMarker(RuleAwareIdentityMapper): """ Mapper to convert the ``function`` attribute of a @@ -82,23 +101,6 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): self.function_id_to_in_knl_callable_mappers = ( function_id_to_in_knl_callable_mappers) - def find_in_knl_callable_from_identifier(self, identifier): - """ - Returns an instance of - :class:`loopy.kernel.function_interface.InKernelCallable` if the - :arg:`identifier` is known to any kernel function scoper, otherwise returns - *None*. 
- """ - for func_id_to_in_knl_callable_mapper in ( - self.function_id_to_in_knl_callable_mappers): - # fixme: do we really need to given target for the function - in_knl_callable = func_id_to_in_knl_callable_mapper( - self.kernel.target, identifier) - if in_knl_callable is not None: - return in_knl_callable - - return None - def map_call(self, expr, expn_state): from loopy.symbolic import parse_tagged_name @@ -117,7 +119,9 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): if not isinstance(expr.function, ResolvedFunction): # search the kernel for the function. - in_knl_callable = self.find_in_knl_callable_from_identifier( + in_knl_callable = find_in_knl_callable_from_identifier( + self.function_id_to_in_knl_callable_mappers, + self.kernel.target, expr.function.name) if in_knl_callable: @@ -140,16 +144,6 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): return super(ResolvedFunctionMarker, self).map_call_with_kwargs(expr, expn_state) - def map_reduction(self, expr, expn_state): - for func_id in ( - expr.operation.get_scalar_callables()): - in_knl_callable = self.find_in_knl_callable_from_identifier(func_id) - assert in_knl_callable is not None - self.callables_table, _ = ( - self.callables_table.with_added_callable(func_id, - in_knl_callable)) - return super(ResolvedFunctionMarker, self).map_reduction(expr, expn_state) - def _default_func_id_to_kernel_callable_mappers(target): """ @@ -525,8 +519,7 @@ class CallablesCountingMapper(CombineMapper): map_call_with_kwargs = map_call def map_reduction(self, expr): - return Counter(expr.operation.get_scalar_callables()) + ( - super(CallablesCountingMapper, self).map_reduction(expr)) + return super(CallablesCountingMapper, self).map_reduction(expr) def map_constant(self, expr): return Counter() @@ -774,13 +767,18 @@ class CallablesTable(ImmutableRecord): # {{{ non-edit mode if not self.is_being_edited: - if function.name in self.resolved_functions and ( - self.resolved_functions[function.name] == in_kernel_callable): + if isinstance(function, ReductionOpFunction): + function_name = function + else: + function_name = function.name + + if function_name in self.resolved_functions and ( + self.resolved_functions[function_name] == in_kernel_callable): # if not being edited, check that the given function is # equal to the old version of the callable. return self, function else: - print('Old: ', self.resolved_functions[function.name]) + print('Old: ', self.resolved_functions[function_name]) print('New: ', in_kernel_callable) raise LoopyError("Use 'with_enter_edit_callables_mode' first.") diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 6f3c6f2be..870f9fc2c 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -719,7 +719,7 @@ class RuleArgument(LoopyExpressionBase): mapper_method = intern("map_rule_argument") -class ResolvedFunction(p.Expression): +class ResolvedFunction(LoopyExpressionBase): """ A function invocation whose definition is known in a :mod:`loopy` kernel. 
Each instance of :class:`loopy.symbolic.ResolvedFunction` in an expression @@ -758,8 +758,8 @@ class ResolvedFunction(p.Expression): def __getinitargs__(self): return (self.function, ) - def stringifier(self): - return StringifyMapper + def make_stringifier(self, originating_stringifier=None): + return StringifyMapper() mapper_method = intern("map_resolved_function") @@ -807,7 +807,7 @@ class SweptInameStrideCollector(CoefficientCollectorBase): return super(SweptInameStrideCollector, self).map_algebraic_leaf(expr) -class SubArrayRef(p.Expression): +class SubArrayRef(LoopyExpressionBase): """ An algebraic expression to map an affine memory layout pattern (known as sub-arary) as consecutive elements of the sweeping axes which are defined @@ -871,8 +871,8 @@ class SubArrayRef(p.Expression): and other.subscript == self.subscript and other.swept_inames == self.swept_inames) - def stringifier(self): - return StringifyMapper + def make_stringifier(self, originating_stringifier=None): + return StringifyMapper() mapper_method = intern("map_sub_array_ref") diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 10161378b..82478a268 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -25,6 +25,7 @@ THE SOFTWARE. """ import numpy as np +import six from loopy.target.c import CTarget, CASTBuilder from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper @@ -183,14 +184,17 @@ class OpenCLCallable(ScalarCallable): return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), callables_table) - dtype = np.find_common_type( + common_dtype = np.find_common_type( [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items() if (id >= 0 and dtype is not None)]) - if dtype.kind in ['u', 'i', 'f']: - if dtype.kind == 'f': + if common_dtype.kind in ['u', 'i', 'f']: + if common_dtype.kind == 'f': name = 'f'+name - dtype = NumpyType(dtype) + + target = [dtype.target for dtype in six.itervalues(arg_id_to_dtype) + if (id >= 0 and dtype is not None)][0] + dtype = NumpyType(common_dtype, target) return ( self.copy(name_in_target=name, arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}), @@ -198,7 +202,7 @@ class OpenCLCallable(ScalarCallable): else: # Unsupported type. 
raise LoopyError("%s function not supported for the types %s" % - (name, dtype)) + (name, common_dtype)) if name == "dot": for id in arg_id_to_dtype: @@ -319,6 +323,8 @@ def opencl_symbol_mangler(kernel, name): return NumpyType(np.dtype(np.int32)), name elif name.startswith("LONG_"): return NumpyType(np.dtype(np.int64)), name + elif name == "HUGE_VAL": + return NumpyType(np.dtype(np.float64)), name else: return None diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 479843697..7534818d7 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -50,7 +50,7 @@ __doc__ = """ # {{{ register function lookup -def _resolved_callables_from_function_lookup(program, +def _resolve_callables_from_function_lookup(program, func_id_to_in_kernel_callable_mapper): """ Returns a copy of *program* with the expression nodes marked "Resolved" @@ -124,7 +124,7 @@ def register_function_id_to_in_knl_callable_mapper(program, new_func_id_mappers = program.func_id_to_in_knl_callable_mappers + ( [func_id_to_in_knl_callable_mapper]) - program = _resolved_callables_from_function_lookup(program, + program = _resolve_callables_from_function_lookup(program, func_id_to_in_knl_callable_mapper) new_program = program.copy( @@ -173,11 +173,17 @@ def register_callable_kernel(program, callee_kernel): # the number of assigness in the callee kernel intructions. expected_num_assignees = len([arg for arg in callee_kernel.args if arg.name in callee_kernel.get_written_variables()]) - expected_num_parameters = len([arg for arg in callee_kernel.args if + expected_max_num_parameters = len([arg for arg in callee_kernel.args if arg.name in callee_kernel.get_read_variables()]) + len( [arg for arg in callee_kernel.args if arg.name not in (callee_kernel.get_read_variables() | callee_kernel.get_written_variables())]) + expected_min_num_parameters = len([arg for arg in callee_kernel.args if + arg.name in callee_kernel.get_read_variables() and arg.name not in + callee_kernel.get_written_variables()]) + len( + [arg for arg in callee_kernel.args if arg.name not in + (callee_kernel.get_read_variables() | + callee_kernel.get_written_variables())]) for in_knl_callable in program.callables_table.values(): if isinstance(in_knl_callable, CallableKernel): caller_kernel = in_knl_callable.subkernel @@ -195,11 +201,21 @@ def register_callable_kernel(program, callee_kernel): "match." % ( callee_kernel.name, insn.id)) if len(insn.expression.parameters+tuple( - kw_parameters.values())) != expected_num_parameters: - raise LoopyError("The number of expected arguments " - "for the callee kernel %s and the number of " - "parameters in instruction %s do not match." - % (callee_kernel.name, insn.id)) + kw_parameters.values())) > expected_max_num_parameters: + raise LoopyError("The number of" + " parameters in instruction '%s' exceed" + " the max. number of arguments possible" + " for the callee kernel '%s' => arg matching" + " not possible." + % (insn.id, callee_kernel.name)) + if len(insn.expression.parameters+tuple( + kw_parameters.values())) < expected_min_num_parameters: + raise LoopyError("The number of" + " parameters in instruction '%s' is less than" + " the min. number of arguments possible" + " for the callee kernel '%s' => arg matching" + " not possible." 
+ % (insn.id, callee_kernel.name)) elif isinstance(insn, (MultiAssignmentBase, CInstruction, _DataObliviousInstruction)): diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py index 9b83f242b..45e9c0a06 100644 --- a/loopy/transform/fusion.py +++ b/loopy/transform/fusion.py @@ -419,6 +419,11 @@ def fuse_kernels(programs, suffixes=None, data_flow=None): *data_flow* was added in version 2016.2 """ + from loopy.program import make_program + + programs = [make_program(knl) if isinstance(knl, LoopKernel) else knl for + knl in programs] + # all the resolved functions in programs must be registered in # main_callables_table main_prog_callables_info = ( diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 281dcb43d..2101fd2fc 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -998,7 +998,7 @@ def infer_unknown_types_for_a_single_kernel(kernel, callables_table, # functions if _instruction_missed_during_inference(insn): type_inf_mapper(insn.expression, - return_tuple=len(insn.assignees) > 1, + return_tuple=len(insn.assignees) != 1, return_dtype_set=True) elif isinstance(insn, (_DataObliviousInstruction, lp.CInstruction)): diff --git a/test/test_callables.py b/test/test_callables.py index f2f3acbd6..731593ea3 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -63,38 +63,35 @@ def test_register_function_lookup(ctx_factory): def test_register_knl(ctx_factory, inline): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - n = 2 ** 4 + n = 4 x = np.random.rand(n, n, n, n, n) y = np.random.rand(n, n, n, n, n) grandchild_knl = lp.make_function( - "{[i, j]:0<= i, j< 16}", + "{[i, j]:0<= i, j< 4}", """ c[i, j] = 2*a[i, j] + 3*b[i, j] """, name='linear_combo1') child_knl = lp.make_function( - "{[i, j]:0<=i, j < 16}", + "{[i, j]:0<=i, j < 4}", """ [i, j]: g[i, j] = linear_combo1([i, j]: e[i, j], [i, j]: f[i, j]) """, name='linear_combo2') parent_knl = lp.make_kernel( - "{[i, j, k, l, m]: 0<=i, j, k, l, m<16}", + "{[i, j, k, l, m]: 0<=i, j, k, l, m<4}", """ [j, l]: z[i, j, k, l, m] = linear_combo2([j, l]: x[i, j, k, l, m], [j, l]: y[i, j, k, l, m]) """, kernel_data=[ lp.GlobalArg( - name='x', + name='x, y', dtype=np.float64, - shape=(16, 16, 16, 16, 16)), - lp.GlobalArg( - name='y', - dtype=np.float64, - shape=(16, 16, 16, 16, 16)), '...'], + shape=(n, n, n, n, n)), + '...'] ) knl = lp.register_callable_kernel( @@ -115,36 +112,29 @@ def test_register_knl(ctx_factory, inline): def test_slices_with_negative_step(ctx_factory, inline): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - n = 2 ** 4 + n = 4 x = np.random.rand(n, n, n, n, n) y = np.random.rand(n, n, n, n, n) child_knl = lp.make_function( - "{[i, j]:0<=i, j < 16}", + "{[i, j]:0<=i, j < 4}", """ g[i, j] = 2*e[i, j] + 3*f[i, j] """, name="linear_combo") parent_knl = lp.make_kernel( - "{[i, k, m]: 0<=i, k, m<16}", + "{[i, k, m]: 0<=i, k, m<4}", """ - z[i, 15:-1:-1, k, :, m] = linear_combo(x[i, :, k, :, m], + z[i, 3:-1:-1, k, :, m] = linear_combo(x[i, :, k, :, m], y[i, :, k, :, m]) """, kernel_data=[ lp.GlobalArg( - name='x', - dtype=np.float64, - shape=(16, 16, 16, 16, 16)), - lp.GlobalArg( - name='y', - dtype=np.float64, - shape=(16, 16, 16, 16, 16)), - lp.GlobalArg( - name='z', + name='x, y, z', dtype=np.float64, - shape=(16, 16, 16, 16, 16)), '...'], + shape=(n, n, n, n, n)), + '...'] ) knl = lp.register_callable_kernel( @@ -163,7 +153,7 @@ def test_register_knl_with_call_with_kwargs(ctx_factory, inline): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - n = 2 ** 2 + n = 4 a_dev = 
cl.clrandom.rand(queue, (n, n, n, n, n), np.float32) b_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float32) @@ -215,27 +205,27 @@ def test_register_knl_with_hw_axes(ctx_factory, inline): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - n = 2 ** 5 + n = 4 x_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) y_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) callee_knl = lp.make_function( - "{[i, j]:0<=i, j < 32}", + "{[i, j]:0<=i, j < 4}", """ g[i, j] = 2*e[i, j] + 3*f[i, j] """, name='linear_combo') - callee_knl = lp.split_iname(callee_knl, "i", 2, inner_tag="l.0", outer_tag="g.0") + callee_knl = lp.split_iname(callee_knl, "i", 1, inner_tag="l.0", outer_tag="g.0") caller_knl = lp.make_kernel( - "{[i, j, k, l, m]: 0<=i, j, k, l, m<32}", + "{[i, j, k, l, m]: 0<=i, j, k, l, m<4}", """ [j, l]: z[i, j, k, l, m] = linear_combo([j, l]: x[i, j, k, l, m], [j, l]: y[i, j, k, l, m]) """ ) - caller_knl = lp.split_iname(caller_knl, "i", 8, inner_tag="l.1", outer_tag="g.1") + caller_knl = lp.split_iname(caller_knl, "i", 4, inner_tag="l.1", outer_tag="g.1") knl = lp.register_callable_kernel( caller_knl, callee_knl) @@ -252,8 +242,8 @@ def test_register_knl_with_hw_axes(ctx_factory, inline): x_host = x_dev.get() y_host = y_dev.get() - assert gsize == (16, 4) - assert lsize == (2, 8) + assert gsize == (4, 1) + assert lsize == (1, 4) assert np.linalg.norm(2*x_host+3*y_host-out['z'].get())/np.linalg.norm( 2*x_host+3*y_host) < 1e-15 @@ -484,13 +474,13 @@ def test_empty_sub_array_refs(ctx_factory, inline): def test_array_inputs_to_callee_kernels(ctx_factory, inline): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - n = 2 ** 4 + n = 2 ** 3 x = np.random.rand(n, n) y = np.random.rand(n, n) child_knl = lp.make_function( - "{[i, j]:0<=i, j < 16}", + "{[i, j]:0<=i, j < 8}", """ g[i, j] = 2*e[i, j] + 3*f[i, j] """, name="linear_combo") @@ -502,17 +492,10 @@ def test_array_inputs_to_callee_kernels(ctx_factory, inline): """, kernel_data=[ lp.GlobalArg( - name='x', - dtype=np.float64, - shape=(16, 16)), - lp.GlobalArg( - name='y', - dtype=np.float64, - shape=(16, 16)), - lp.GlobalArg( - name='z', + name='x, y, z', dtype=np.float64, - shape=(16, 16)), '...'], + shape=(n, n)), + '...'] ) knl = lp.register_callable_kernel( diff --git a/test/test_fortran.py b/test/test_fortran.py index 437199810..1ab28409b 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -533,9 +533,11 @@ def test_parse_and_fuse_two_kernels(): !$loopy begin ! - ! prg = lp.parse_fortran(SOURCE) - ! fill = prg["fill"] - ! twice = prg["twice"] + ! # FIXME: correct this after the "Module" is done. + ! # prg = lp.parse_fortran(SOURCE) + ! # fill = prg["fill"] + ! # twice = prg["twice"] + ! fill, twice = lp.parse_fortran(SOURCE, return_list_of_knls=True) ! knl = lp.fuse_kernels((fill, twice)) ! print(knl) ! 
RESULT = knl diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py index 1ba44e77e..55a2d2e11 100644 --- a/test/test_numa_diff.py +++ b/test/test_numa_diff.py @@ -60,7 +60,8 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # noqa source = source.replace("datafloat", "real*4") hsv_r, hsv_s = [ - knl for knl in lp.parse_fortran(source, filename, seq_dependencies=False) + knl for knl in lp.parse_fortran(source, filename, + seq_dependencies=False, return_list_of_knls=True) if "KernelR" in knl.name or "KernelS" in knl.name ] hsv_r = lp.tag_instructions(hsv_r, "rknl") @@ -229,6 +230,15 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # noqa hsv = tap_hsv + hsv = lp.set_options(hsv, + ignore_boostable_into=True, + cl_build_options=[ + "-cl-denorms-are-zero", + "-cl-fast-relaxed-math", + "-cl-finite-math-only", + "-cl-mad-enable", + "-cl-no-signed-zeros"]) + if 1: print("OPS") op_map = lp.get_op_map(hsv, subgroup_size=32) @@ -238,14 +248,6 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # noqa gmem_map = lp.get_mem_access_map(hsv, subgroup_size=32).to_bytes() print(lp.stringify_stats_mapping(gmem_map)) - hsv = lp.set_options(hsv, cl_build_options=[ - "-cl-denorms-are-zero", - "-cl-fast-relaxed-math", - "-cl-finite-math-only", - "-cl-mad-enable", - "-cl-no-signed-zeros", - ]) - # FIXME: renaming's a bit tricky in this program model. # add a simple transformation for it # hsv = hsv.copy(name="horizontalStrongVolumeKernel") -- GitLab
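
Usage note (illustrative sketch, not part of the patch itself): the hunks above change ``lp.parse_fortran`` so that it returns a single ``loopy.Program`` by default, which is why the ``.floopy`` examples move from ``dgemm, = lp.parse_fortran(...)`` / ``RESULT = [dgemm]`` to ``dgemm = lp.parse_fortran(...)`` / ``RESULT = dgemm``, while callers that still want a plain list of kernels (the ``%%fortran_kernel`` IPython magic, ``test_numa_diff.py``, ``test_parse_and_fuse_two_kernels``) opt back in via ``return_list_of_knls=True``. A minimal sketch of both call styles, assuming only the API shown in the diff (the Fortran subroutine and the iname ``i`` below are made up for illustration)::

    import loopy as lp

    FORTRAN_SOURCE = """
    subroutine fill(out, a, n)
      implicit none
      real*8 a, out(n)
      integer n, i
      do i = 1, n
        out(i) = a
      end do
    end subroutine
    """

    # Default behaviour after this patch: a single loopy.Program comes back,
    # and transforms such as lp.split_iname accept it directly (matching the
    # updated .floopy examples, which now end with "RESULT = <prog>").
    prog = lp.parse_fortran(FORTRAN_SOURCE)
    prog = lp.split_iname(prog, "i", 128, outer_tag="g.0", inner_tag="l.0")

    # Opt in to the old list-of-LoopKernels behaviour, e.g. to feed
    # lp.fuse_kernels or the %%fortran_kernel IPython magic.
    fill, = lp.parse_fortran(FORTRAN_SOURCE, return_list_of_knls=True)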