diff --git a/doc/tutorial.rst b/doc/tutorial.rst index befa5e30b3456e0439be0163ed5fd075c0a6f100..e6ef54b6679678f7cc592aeb519ab576b33eb2e4 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1157,7 +1157,7 @@ this, :mod:`loopy` will complain that global barrier needs to be inserted: >>> cgr = lp.generate_code_v2(knl) Traceback (most recent call last): ... - loopy.diagnostic.MissingBarrierError: Dependency 'rotate depends on maketmp' (for variable 'arr') requires synchronization by a global barrier (add a 'no_sync_with' instruction option to state that no synchronization is needed) + loopy.diagnostic.MissingBarrierError: rotate_v1: Dependency 'rotate depends on maketmp' (for variable 'arr') requires synchronization by a global barrier (add a 'no_sync_with' instruction option to state that no synchronization is needed) The syntax for a inserting a global barrier instruction is ``... gbarrier``. :mod:`loopy` also supports manually inserting local @@ -1554,7 +1554,7 @@ information provided. Now we will count the operations: >>> op_map = lp.get_op_map(knl, subgroup_size=32) >>> print(lp.stringify_stats_mapping(op_map)) - Op(np:dtype('float32'), add, subgroup) : ... + Op(np:dtype('float32'), add, subgroup, loopy_kernel) : ... Each line of output will look roughly like:: diff --git a/examples/fortran/ipython-integration-demo.ipynb b/examples/fortran/ipython-integration-demo.ipynb index 7a5c8257bf80fdfcc3d3b978a7dca2d401c48271..1b0a9df8d18da1947171eadf744cb3db2ea312da 100644 --- a/examples/fortran/ipython-integration-demo.ipynb +++ b/examples/fortran/ipython-integration-demo.ipynb @@ -62,9 +62,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "split_amount = 128" @@ -91,7 +89,7 @@ "\n", "!$loopy begin\n", "!\n", - "! tr_fill, = lp.parse_fortran(SOURCE)\n", + "! tr_fill = lp.parse_fortran(SOURCE)\n", "! tr_fill = lp.split_iname(tr_fill, \"i\", split_amount,\n", "! outer_tag=\"g.0\", inner_tag=\"l.0\")\n", "! RESULT = [tr_fill]\n", @@ -107,15 +105,6 @@ "source": [ "print(tr_fill)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] } ], "metadata": { @@ -134,7 +123,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.4" + "version": "3.6.8" } }, "nbformat": 4, diff --git a/examples/fortran/matmul.floopy b/examples/fortran/matmul.floopy index 4b35522043bfc32b71c0a063c3efc3b4403a26f2..a8377beddb912a2d6b1d9255694336313089a0f9 100644 --- a/examples/fortran/matmul.floopy +++ b/examples/fortran/matmul.floopy @@ -13,7 +13,7 @@ subroutine dgemm(m,n,l,alpha,a,b,c) end subroutine !$loopy begin -! dgemm, = lp.parse_fortran(SOURCE, FILENAME) +! dgemm = lp.parse_fortran(SOURCE, FILENAME) ! dgemm = lp.split_iname(dgemm, "i", 16, ! outer_tag="g.0", inner_tag="l.1") ! dgemm = lp.split_iname(dgemm, "j", 8, @@ -24,5 +24,5 @@ end subroutine ! dgemm = lp.extract_subst(dgemm, "b_acc", "b[i1,i2]", parameters="i1, i2") ! dgemm = lp.precompute(dgemm, "a_acc", "k_inner,i_inner", default_tag="l.auto") ! dgemm = lp.precompute(dgemm, "b_acc", "j_inner,k_inner", default_tag="l.auto") -! RESULT = [dgemm] +! RESULT = dgemm !$loopy end diff --git a/examples/fortran/sparse.floopy b/examples/fortran/sparse.floopy index 18542e6b0403a7ab475b3e357f18489847367c3d..2b156bdd709e8f4258492d258adb888ad16fbccd 100644 --- a/examples/fortran/sparse.floopy +++ b/examples/fortran/sparse.floopy @@ -23,11 +23,11 @@ subroutine sparse(rowstarts, colindices, values, m, n, nvals, x, y) end !$loopy begin -! sparse, = lp.parse_fortran(SOURCE, FILENAME) +! sparse = lp.parse_fortran(SOURCE, FILENAME) ! sparse = lp.split_iname(sparse, "i", 128) ! sparse = lp.tag_inames(sparse, {"i_outer": "g.0"}) ! sparse = lp.tag_inames(sparse, {"i_inner": "l.0"}) ! sparse = lp.split_iname(sparse, "j", 4) ! sparse = lp.tag_inames(sparse, {"j_inner": "unr"}) -! RESULT = [sparse] +! RESULT = sparse !$loopy end diff --git a/examples/fortran/tagging.floopy b/examples/fortran/tagging.floopy index 87aacba68ae2fc6f3b7052325fcd2378e9880e47..c7ebb75667142a8bb470b32f1d92177e135db9b2 100644 --- a/examples/fortran/tagging.floopy +++ b/examples/fortran/tagging.floopy @@ -23,13 +23,13 @@ end ! "factor 4.0", ! "real_type real*8", ! ]) -! fill, = lp.parse_fortran(SOURCE, FILENAME) +! fill = lp.parse_fortran(SOURCE, FILENAME) ! fill = lp.add_barrier(fill, "tag:init", "tag:mult", "gb1") ! fill = lp.split_iname(fill, "i", 128, ! outer_tag="g.0", inner_tag="l.0") ! fill = lp.split_iname(fill, "i_1", 128, ! outer_tag="g.0", inner_tag="l.0") -! RESULT = [fill] +! RESULT = fill ! !$loopy end diff --git a/examples/fortran/volumeKernel.floopy b/examples/fortran/volumeKernel.floopy index c5784b63492063bfd2a9604c42dbf65b2ecb86bf..211c38049076cbe065ce847f948d724c293a032c 100644 --- a/examples/fortran/volumeKernel.floopy +++ b/examples/fortran/volumeKernel.floopy @@ -67,7 +67,7 @@ end subroutine volumeKernel !$loopy begin ! -! volumeKernel, = lp.parse_fortran(SOURCE, FILENAME) +! volumeKernel = lp.parse_fortran(SOURCE, FILENAME) ! volumeKernel = lp.split_iname(volumeKernel, ! "e", 32, outer_tag="g.1", inner_tag="g.0") ! volumeKernel = lp.fix_parameters(volumeKernel, @@ -76,6 +76,6 @@ end subroutine volumeKernel ! i="l.0", j="l.1", k="l.2", ! i_1="l.0", j_1="l.1", k_1="l.2" ! )) -! RESULT = [volumeKernel] +! RESULT = volumeKernel ! !$loopy end diff --git a/loopy/__init__.py b/loopy/__init__.py index 1439cb1ff9e30bbefa7f34ce7a83f0b010d288c4..058bc93ef69916bfcc609c500b6bb0c0b45d647c 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -130,10 +130,10 @@ from loopy.type_inference import infer_unknown_types from loopy.preprocess import (preprocess_kernel, realize_reduction, preprocess_program) from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel -from loopy.statistics import (ToCountMap, CountGranularity, - Op, MemAccess, get_op_map, get_mem_access_map, - get_synchronization_map, - gather_access_footprints, gather_access_footprint_bytes) +from loopy.statistics import (ToCountMap, ToCountPolynomialMap, + CountGranularity, stringify_stats_mapping, Op, MemAccess, get_op_map, + get_mem_access_map, get_synchronization_map, + gather_access_footprints, gather_access_footprint_bytes, Sync) from loopy.codegen import ( PreambleInfo, generate_code, generate_code_v2, generate_body) @@ -269,9 +269,11 @@ __all__ = [ "PreambleInfo", "generate_code", "generate_code_v2", "generate_body", - "ToCountMap", "CountGranularity", "Op", - "MemAccess", "get_op_map", "get_mem_access_map", "get_synchronization_map", + "ToCountMap", "ToCountPolynomialMap", "CountGranularity", + "stringify_stats_mapping", "Op", "MemAccess", "get_op_map", + "get_mem_access_map", "get_synchronization_map", "gather_access_footprints", "gather_access_footprint_bytes", + "Sync", "CompiledKernel", diff --git a/loopy/frontend/fortran/__init__.py b/loopy/frontend/fortran/__init__.py index 3516ca29a880af18936c558c2a6a457af2e3236c..74c1ebf54a93c11640572ade5dbfc926e1b1e3f8 100644 --- a/loopy/frontend/fortran/__init__.py +++ b/loopy/frontend/fortran/__init__.py @@ -241,10 +241,54 @@ def parse_transformed_fortran(source, free_form=True, strict=True, return proc_dict["RESULT"] +def _add_assignees_to_calls(knl, all_kernels): + new_insns = [] + subroutine_dict = dict((kernel.name, kernel) for kernel in all_kernels) + from loopy.kernel.instruction import (Assignment, CallInstruction, + CInstruction, _DataObliviousInstruction, + modify_assignee_for_array_call) + from pymbolic.primitives import Call, Variable + + for insn in knl.instructions: + if isinstance(insn, CallInstruction): + if isinstance(insn.expression, Call) and ( + insn.expression.function.name in subroutine_dict): + assignees = [] + new_params = [] + subroutine = subroutine_dict[insn.expression.function.name] + for par, arg in zip(insn.expression.parameters, subroutine.args): + if arg.name in subroutine.get_written_variables(): + par = modify_assignee_for_array_call(par) + assignees.append(par) + if arg.name in subroutine.get_read_variables(): + new_params.append(par) + if arg.name not in (subroutine.get_written_variables() | + subroutine.get_read_variables()): + new_params.append(par) + + new_insns.append( + insn.copy( + assignees=tuple(assignees), + expression=Variable( + insn.expression.function.name)(*new_params))) + else: + new_insns.append(insn) + pass + elif isinstance(insn, (Assignment, CInstruction, + _DataObliviousInstruction)): + new_insns.append(insn) + else: + raise NotImplementedError(type(insn).__name__) + + return knl.copy(instructions=new_insns) + + def parse_fortran(source, filename="", free_form=None, strict=None, - seq_dependencies=None, auto_dependencies=None, target=None): + seq_dependencies=None, auto_dependencies=None, target=None, + return_list_of_knls=False): """ - :returns: a :class:`loopy.Program` + :returns: an instance of :class:`list` of :class:`loopy.LoopKernel`s if + *return_list_of_knls* is True else a :class:`loopy.Program`. """ parse_plog = ProcessLogger(logger, "parsing fortran file '%s'" % filename) @@ -286,6 +330,11 @@ def parse_fortran(source, filename="", free_form=None, strict=None, kernels = f2loopy.make_kernels(seq_dependencies=seq_dependencies) + if return_list_of_knls: + return kernels + + kernels = [_add_assignees_to_calls(knl, kernels) for knl in kernels] + from loopy.kernel.tools import identify_root_kernel from loopy.program import make_program from loopy.transform.callable import register_callable_kernel diff --git a/loopy/ipython_ext.py b/loopy/ipython_ext.py index ec1b10f1f512e18079f44b94b298e876776cae35..e44b183ed3a08726d66019f8900fc273d432d613 100644 --- a/loopy/ipython_ext.py +++ b/loopy/ipython_ext.py @@ -9,7 +9,7 @@ import loopy as lp class LoopyMagics(Magics): @cell_magic def fortran_kernel(self, line, cell): - result = lp.parse_fortran(cell) + result = lp.parse_fortran(cell, return_list_of_knls=True) for knl in result: self.shell.user_ns[knl.name] = knl diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 1f896bb973c2ebc17294ffd5031998c6af962ca6..f36a9057580f58f092016f5fb4012638c47ca7e1 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -37,6 +37,7 @@ from loopy.kernel.data import ( SubstitutionRule, AddressSpace, ValueArg) from loopy.kernel.instruction import (CInstruction, _DataObliviousInstruction, CallInstruction) +from loopy.program import iterate_over_kernels_if_given_program from loopy.diagnostic import LoopyError, warn_with_kernel import islpy as isl from islpy import dim_type @@ -1753,6 +1754,7 @@ def add_inferred_inames(knl): # {{{ apply single-writer heuristic +@iterate_over_kernels_if_given_program def apply_single_writer_depencency_heuristic(kernel, warn_if_used=True): logger.debug("%s: default deps" % kernel.name) @@ -2175,56 +2177,55 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): # {{{ handle kernel language version - if not is_callee_kernel: - from loopy.version import LANGUAGE_VERSION_SYMBOLS + from loopy.version import LANGUAGE_VERSION_SYMBOLS - version_to_symbol = dict( - (getattr(loopy.version, lvs), lvs) - for lvs in LANGUAGE_VERSION_SYMBOLS) + version_to_symbol = dict( + (getattr(loopy.version, lvs), lvs) + for lvs in LANGUAGE_VERSION_SYMBOLS) - lang_version = kwargs.pop("lang_version", None) - if lang_version is None: - # {{{ peek into caller's module to look for LOOPY_KERNEL_LANGUAGE_VERSION + lang_version = kwargs.pop("lang_version", None) + if lang_version is None: + # {{{ peek into caller's module to look for LOOPY_KERNEL_LANGUAGE_VERSION - # This *is* gross. But it seems like the right thing interface-wise. - import inspect - caller_globals = inspect.currentframe().f_back.f_globals + # This *is* gross. But it seems like the right thing interface-wise. + import inspect + caller_globals = inspect.currentframe().f_back.f_globals - for ver_sym in LANGUAGE_VERSION_SYMBOLS: - try: - lang_version = caller_globals[ver_sym] - break - except KeyError: - pass + for ver_sym in LANGUAGE_VERSION_SYMBOLS: + try: + lang_version = caller_globals[ver_sym] + break + except KeyError: + pass - # }}} + # }}} - if lang_version is None: - from warnings import warn - from loopy.diagnostic import LoopyWarning - from loopy.version import ( - MOST_RECENT_LANGUAGE_VERSION, - FALLBACK_LANGUAGE_VERSION) - warn("'lang_version' was not passed to make_kernel(). " - "To avoid this warning, pass " - "lang_version={ver} in this invocation. " - "(Or say 'from loopy.version import " - "{sym_ver}' in " - "the global scope of the calling frame.)" - .format( - ver=MOST_RECENT_LANGUAGE_VERSION, - sym_ver=version_to_symbol[MOST_RECENT_LANGUAGE_VERSION] - ), - LoopyWarning, stacklevel=2) - - lang_version = FALLBACK_LANGUAGE_VERSION - - if lang_version not in version_to_symbol: - raise LoopyError("Language version '%s' is not known." % (lang_version,)) - if lang_version >= (2018, 1): - options = options.copy(enforce_variable_access_ordered=True) - if lang_version >= (2018, 2): - options = options.copy(ignore_boostable_into=True) + if lang_version is None: + from warnings import warn + from loopy.diagnostic import LoopyWarning + from loopy.version import ( + MOST_RECENT_LANGUAGE_VERSION, + FALLBACK_LANGUAGE_VERSION) + warn("'lang_version' was not passed to make_kernel(). " + "To avoid this warning, pass " + "lang_version={ver} in this invocation. " + "(Or say 'from loopy.version import " + "{sym_ver}' in " + "the global scope of the calling frame.)" + .format( + ver=MOST_RECENT_LANGUAGE_VERSION, + sym_ver=version_to_symbol[MOST_RECENT_LANGUAGE_VERSION] + ), + LoopyWarning, stacklevel=2) + + lang_version = FALLBACK_LANGUAGE_VERSION + + if lang_version not in version_to_symbol: + raise LoopyError("Language version '%s' is not known." % (lang_version,)) + if lang_version >= (2018, 1): + options = options.copy(enforce_variable_access_ordered=True) + if lang_version >= (2018, 2): + options = options.copy(ignore_boostable_into=True) # }}} @@ -2382,11 +2383,6 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): def make_function(*args, **kwargs): - lang_version = kwargs.pop('lang_version', None) - if lang_version: - raise LoopyError("lang_version should be set for program, not " - "functions.") - kwargs['is_callee_kernel'] = True return make_kernel(*args, **kwargs) diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index 9d85f5e8404c8b5ea407e78a6c64df81facce65f..1ba0dc7ec54fdec8ac32f41ef897f54062f4719c 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -1208,7 +1208,7 @@ def is_array_call(assignees, expression): return False -def modify_assignee_assignee_for_array_call(assignee): +def modify_assignee_for_array_call(assignee): """ Converts the assignee subscript or variable as a SubArrayRef. """ @@ -1258,7 +1258,7 @@ def make_assignment(assignees, expression, temp_var_types=None, **kwargs): # assignee as an instance of SubArrayRef. If not given as a # SubArrayRef return CallInstruction( - assignees=tuple(modify_assignee_assignee_for_array_call( + assignees=tuple(modify_assignee_for_array_call( assignee) for assignee in assignees), expression=expression, temp_var_types=temp_var_types, diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 6f3c6f2be59d3b11862a8235ae0ca23c605bd120..870f9fc2cda731ed2893f634b4e4baa609450db8 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -719,7 +719,7 @@ class RuleArgument(LoopyExpressionBase): mapper_method = intern("map_rule_argument") -class ResolvedFunction(p.Expression): +class ResolvedFunction(LoopyExpressionBase): """ A function invocation whose definition is known in a :mod:`loopy` kernel. Each instance of :class:`loopy.symbolic.ResolvedFunction` in an expression @@ -758,8 +758,8 @@ class ResolvedFunction(p.Expression): def __getinitargs__(self): return (self.function, ) - def stringifier(self): - return StringifyMapper + def make_stringifier(self, originating_stringifier=None): + return StringifyMapper() mapper_method = intern("map_resolved_function") @@ -807,7 +807,7 @@ class SweptInameStrideCollector(CoefficientCollectorBase): return super(SweptInameStrideCollector, self).map_algebraic_leaf(expr) -class SubArrayRef(p.Expression): +class SubArrayRef(LoopyExpressionBase): """ An algebraic expression to map an affine memory layout pattern (known as sub-arary) as consecutive elements of the sweeping axes which are defined @@ -871,8 +871,8 @@ class SubArrayRef(p.Expression): and other.subscript == self.subscript and other.swept_inames == self.swept_inames) - def stringifier(self): - return StringifyMapper + def make_stringifier(self, originating_stringifier=None): + return StringifyMapper() mapper_method = intern("map_sub_array_ref") diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 4798436973557b9f730f7b3382480b2a2e63a095..7534818d72a4bedd5ab7976a54f0b71207f59bcf 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -50,7 +50,7 @@ __doc__ = """ # {{{ register function lookup -def _resolved_callables_from_function_lookup(program, +def _resolve_callables_from_function_lookup(program, func_id_to_in_kernel_callable_mapper): """ Returns a copy of *program* with the expression nodes marked "Resolved" @@ -124,7 +124,7 @@ def register_function_id_to_in_knl_callable_mapper(program, new_func_id_mappers = program.func_id_to_in_knl_callable_mappers + ( [func_id_to_in_knl_callable_mapper]) - program = _resolved_callables_from_function_lookup(program, + program = _resolve_callables_from_function_lookup(program, func_id_to_in_knl_callable_mapper) new_program = program.copy( @@ -173,11 +173,17 @@ def register_callable_kernel(program, callee_kernel): # the number of assigness in the callee kernel intructions. expected_num_assignees = len([arg for arg in callee_kernel.args if arg.name in callee_kernel.get_written_variables()]) - expected_num_parameters = len([arg for arg in callee_kernel.args if + expected_max_num_parameters = len([arg for arg in callee_kernel.args if arg.name in callee_kernel.get_read_variables()]) + len( [arg for arg in callee_kernel.args if arg.name not in (callee_kernel.get_read_variables() | callee_kernel.get_written_variables())]) + expected_min_num_parameters = len([arg for arg in callee_kernel.args if + arg.name in callee_kernel.get_read_variables() and arg.name not in + callee_kernel.get_written_variables()]) + len( + [arg for arg in callee_kernel.args if arg.name not in + (callee_kernel.get_read_variables() | + callee_kernel.get_written_variables())]) for in_knl_callable in program.callables_table.values(): if isinstance(in_knl_callable, CallableKernel): caller_kernel = in_knl_callable.subkernel @@ -195,11 +201,21 @@ def register_callable_kernel(program, callee_kernel): "match." % ( callee_kernel.name, insn.id)) if len(insn.expression.parameters+tuple( - kw_parameters.values())) != expected_num_parameters: - raise LoopyError("The number of expected arguments " - "for the callee kernel %s and the number of " - "parameters in instruction %s do not match." - % (callee_kernel.name, insn.id)) + kw_parameters.values())) > expected_max_num_parameters: + raise LoopyError("The number of" + " parameters in instruction '%s' exceed" + " the max. number of arguments possible" + " for the callee kernel '%s' => arg matching" + " not possible." + % (insn.id, callee_kernel.name)) + if len(insn.expression.parameters+tuple( + kw_parameters.values())) < expected_min_num_parameters: + raise LoopyError("The number of" + " parameters in instruction '%s' is less than" + " the min. number of arguments possible" + " for the callee kernel '%s' => arg matching" + " not possible." + % (insn.id, callee_kernel.name)) elif isinstance(insn, (MultiAssignmentBase, CInstruction, _DataObliviousInstruction)): diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py index 9b83f242bde7923a3932a00b42f442954cf9a7db..45e9c0a06df4739fe5833768c0595b7a075421fa 100644 --- a/loopy/transform/fusion.py +++ b/loopy/transform/fusion.py @@ -419,6 +419,11 @@ def fuse_kernels(programs, suffixes=None, data_flow=None): *data_flow* was added in version 2016.2 """ + from loopy.program import make_program + + programs = [make_program(knl) if isinstance(knl, LoopKernel) else knl for + knl in programs] + # all the resolved functions in programs must be registered in # main_callables_table main_prog_callables_info = ( diff --git a/test/test_callables.py b/test/test_callables.py index f2f3acbd67dc792e6f5c1aa7cd2581896b3ca024..731593ea3e2cd48b0f03c2dbb3899c881bf78338 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -63,38 +63,35 @@ def test_register_function_lookup(ctx_factory): def test_register_knl(ctx_factory, inline): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - n = 2 ** 4 + n = 4 x = np.random.rand(n, n, n, n, n) y = np.random.rand(n, n, n, n, n) grandchild_knl = lp.make_function( - "{[i, j]:0<= i, j< 16}", + "{[i, j]:0<= i, j< 4}", """ c[i, j] = 2*a[i, j] + 3*b[i, j] """, name='linear_combo1') child_knl = lp.make_function( - "{[i, j]:0<=i, j < 16}", + "{[i, j]:0<=i, j < 4}", """ [i, j]: g[i, j] = linear_combo1([i, j]: e[i, j], [i, j]: f[i, j]) """, name='linear_combo2') parent_knl = lp.make_kernel( - "{[i, j, k, l, m]: 0<=i, j, k, l, m<16}", + "{[i, j, k, l, m]: 0<=i, j, k, l, m<4}", """ [j, l]: z[i, j, k, l, m] = linear_combo2([j, l]: x[i, j, k, l, m], [j, l]: y[i, j, k, l, m]) """, kernel_data=[ lp.GlobalArg( - name='x', + name='x, y', dtype=np.float64, - shape=(16, 16, 16, 16, 16)), - lp.GlobalArg( - name='y', - dtype=np.float64, - shape=(16, 16, 16, 16, 16)), '...'], + shape=(n, n, n, n, n)), + '...'] ) knl = lp.register_callable_kernel( @@ -115,36 +112,29 @@ def test_register_knl(ctx_factory, inline): def test_slices_with_negative_step(ctx_factory, inline): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - n = 2 ** 4 + n = 4 x = np.random.rand(n, n, n, n, n) y = np.random.rand(n, n, n, n, n) child_knl = lp.make_function( - "{[i, j]:0<=i, j < 16}", + "{[i, j]:0<=i, j < 4}", """ g[i, j] = 2*e[i, j] + 3*f[i, j] """, name="linear_combo") parent_knl = lp.make_kernel( - "{[i, k, m]: 0<=i, k, m<16}", + "{[i, k, m]: 0<=i, k, m<4}", """ - z[i, 15:-1:-1, k, :, m] = linear_combo(x[i, :, k, :, m], + z[i, 3:-1:-1, k, :, m] = linear_combo(x[i, :, k, :, m], y[i, :, k, :, m]) """, kernel_data=[ lp.GlobalArg( - name='x', - dtype=np.float64, - shape=(16, 16, 16, 16, 16)), - lp.GlobalArg( - name='y', - dtype=np.float64, - shape=(16, 16, 16, 16, 16)), - lp.GlobalArg( - name='z', + name='x, y, z', dtype=np.float64, - shape=(16, 16, 16, 16, 16)), '...'], + shape=(n, n, n, n, n)), + '...'] ) knl = lp.register_callable_kernel( @@ -163,7 +153,7 @@ def test_register_knl_with_call_with_kwargs(ctx_factory, inline): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - n = 2 ** 2 + n = 4 a_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float32) b_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float32) @@ -215,27 +205,27 @@ def test_register_knl_with_hw_axes(ctx_factory, inline): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - n = 2 ** 5 + n = 4 x_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) y_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) callee_knl = lp.make_function( - "{[i, j]:0<=i, j < 32}", + "{[i, j]:0<=i, j < 4}", """ g[i, j] = 2*e[i, j] + 3*f[i, j] """, name='linear_combo') - callee_knl = lp.split_iname(callee_knl, "i", 2, inner_tag="l.0", outer_tag="g.0") + callee_knl = lp.split_iname(callee_knl, "i", 1, inner_tag="l.0", outer_tag="g.0") caller_knl = lp.make_kernel( - "{[i, j, k, l, m]: 0<=i, j, k, l, m<32}", + "{[i, j, k, l, m]: 0<=i, j, k, l, m<4}", """ [j, l]: z[i, j, k, l, m] = linear_combo([j, l]: x[i, j, k, l, m], [j, l]: y[i, j, k, l, m]) """ ) - caller_knl = lp.split_iname(caller_knl, "i", 8, inner_tag="l.1", outer_tag="g.1") + caller_knl = lp.split_iname(caller_knl, "i", 4, inner_tag="l.1", outer_tag="g.1") knl = lp.register_callable_kernel( caller_knl, callee_knl) @@ -252,8 +242,8 @@ def test_register_knl_with_hw_axes(ctx_factory, inline): x_host = x_dev.get() y_host = y_dev.get() - assert gsize == (16, 4) - assert lsize == (2, 8) + assert gsize == (4, 1) + assert lsize == (1, 4) assert np.linalg.norm(2*x_host+3*y_host-out['z'].get())/np.linalg.norm( 2*x_host+3*y_host) < 1e-15 @@ -484,13 +474,13 @@ def test_empty_sub_array_refs(ctx_factory, inline): def test_array_inputs_to_callee_kernels(ctx_factory, inline): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - n = 2 ** 4 + n = 2 ** 3 x = np.random.rand(n, n) y = np.random.rand(n, n) child_knl = lp.make_function( - "{[i, j]:0<=i, j < 16}", + "{[i, j]:0<=i, j < 8}", """ g[i, j] = 2*e[i, j] + 3*f[i, j] """, name="linear_combo") @@ -502,17 +492,10 @@ def test_array_inputs_to_callee_kernels(ctx_factory, inline): """, kernel_data=[ lp.GlobalArg( - name='x', - dtype=np.float64, - shape=(16, 16)), - lp.GlobalArg( - name='y', - dtype=np.float64, - shape=(16, 16)), - lp.GlobalArg( - name='z', + name='x, y, z', dtype=np.float64, - shape=(16, 16)), '...'], + shape=(n, n)), + '...'] ) knl = lp.register_callable_kernel( diff --git a/test/test_fortran.py b/test/test_fortran.py index 437199810bc970a096f6a71e0446cb3ba5a5ab5d..1ab28409bc770bcd327b8fa3800fa80a46c657c7 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -533,9 +533,11 @@ def test_parse_and_fuse_two_kernels(): !$loopy begin ! - ! prg = lp.parse_fortran(SOURCE) - ! fill = prg["fill"] - ! twice = prg["twice"] + ! # FIXME: correct this after the "Module" is done. + ! # prg = lp.parse_fortran(SOURCE) + ! # fill = prg["fill"] + ! # twice = prg["twice"] + ! fill, twice = lp.parse_fortran(SOURCE, return_list_of_knls=True) ! knl = lp.fuse_kernels((fill, twice)) ! print(knl) ! RESULT = knl diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py index 1ba44e77e13a88ecbc05f4eecc6b9c7e397eb656..55a2d2e1122691168b4ac21ac260044edd764c04 100644 --- a/test/test_numa_diff.py +++ b/test/test_numa_diff.py @@ -60,7 +60,8 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # noqa source = source.replace("datafloat", "real*4") hsv_r, hsv_s = [ - knl for knl in lp.parse_fortran(source, filename, seq_dependencies=False) + knl for knl in lp.parse_fortran(source, filename, + seq_dependencies=False, return_list_of_knls=True) if "KernelR" in knl.name or "KernelS" in knl.name ] hsv_r = lp.tag_instructions(hsv_r, "rknl") @@ -229,6 +230,15 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # noqa hsv = tap_hsv + hsv = lp.set_options(hsv, + ignore_boostable_into=True, + cl_build_options=[ + "-cl-denorms-are-zero", + "-cl-fast-relaxed-math", + "-cl-finite-math-only", + "-cl-mad-enable", + "-cl-no-signed-zeros"]) + if 1: print("OPS") op_map = lp.get_op_map(hsv, subgroup_size=32) @@ -238,14 +248,6 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # noqa gmem_map = lp.get_mem_access_map(hsv, subgroup_size=32).to_bytes() print(lp.stringify_stats_mapping(gmem_map)) - hsv = lp.set_options(hsv, cl_build_options=[ - "-cl-denorms-are-zero", - "-cl-fast-relaxed-math", - "-cl-finite-math-only", - "-cl-mad-enable", - "-cl-no-signed-zeros", - ]) - # FIXME: renaming's a bit tricky in this program model. # add a simple transformation for it # hsv = hsv.copy(name="horizontalStrongVolumeKernel")