diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index e71ea2c6d053188c0e2211fdf7868c6a75cc9af0..fb90b51291951d952bf30c24c3fa7c08030a53d5 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -10,6 +10,18 @@ Python 2.7 AMD CPU: - amd-cl-cpu except: - tags +Python 2.6 AMD CPU: + script: + - export PY_EXE=python2.6 + - export PYOPENCL_TEST=amd:pu + - export EXTRA_INSTALL="numpy mako" + - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh + - ". ./build-and-test-py-project.sh" + tags: + - python2.6 + - amd-cl-cpu + except: + - tags Python 3.4 AMD CPU: script: - export PY_EXE=python3.4 @@ -35,6 +47,19 @@ Python 2.7 POCL: - pocl except: - tags +Python 2.7 with legacy PyOpenCL: + script: + - export PY_EXE=python2.7 + - export PYOPENCL_TEST=amd:pu + - export EXTRA_INSTALL="numpy mako" + - export REQUIREMENTS_TXT="requirements-old-pyopencl.txt" + - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh + - ". ./build-and-test-py-project.sh" + tags: + - python2.7 + - pocl + except: + - tags # PyPy AMD CPU: # script: # - export PY_EXE=pypy diff --git a/bin/loopy b/bin/loopy index cb223c31a1eef3244bd21310722f4b84a539d77c..c8cbea1c615c396d3903df24472822cc918d3461 100644 --- a/bin/loopy +++ b/bin/loopy @@ -50,9 +50,10 @@ def main(): parser = ArgumentParser(description="Stand-alone loopy frontend") - parser.add_argument("infile") - parser.add_argument("outfile") - parser.add_argument("--lang", default="loopy") + parser.add_argument("infile", metavar="INPUT_FILE") + parser.add_argument("outfile", default="-", metavar="OUTPUT_FILE", + help="Defaults to stdout ('-').", nargs='?') + parser.add_argument("--lang", metavar="LANGUAGE", help="loopy|fortran") parser.add_argument("--target") parser.add_argument("--name") parser.add_argument("--transform") @@ -65,13 +66,30 @@ def main(): from warnings import warn warn("--target option is deprecated and ignored") + lang = None if args.infile == "-": infile_content = sys.stdin.read() else: + from os.path import splitext + _, ext = splitext(args.infile) + + lang = { + ".loopy": "loopy", + ".floopy": "fortran", + ".f90": "fortran", + ".fpp": "fortran", + }.get(ext) with open(args.infile, "r") as infile_fd: infile_content = infile_fd.read() - if args.lang == "loopy": + if args.lang is not None: + lang = args.lang + + if lang is None: + raise RuntimeError("unable to deduce input language " + "(wrong input file extension? --lang flag?)") + + if lang == "loopy": # {{{ path wrangling from os.path import dirname, abspath @@ -115,7 +133,7 @@ def main(): kernels = [kernel] - elif args.lang in ["fortran", "floopy", "fpp"]: + elif lang in ["fortran", "floopy", "fpp"]: pre_transform_code = None if args.transform: with open(args.transform, "r") as xform_fd: @@ -168,10 +186,15 @@ def main(): code, impl_arg_info = generate_code(kernel) codes.append(code) - if args.outfile == "-": + if args.outfile: + outfile, = args.outfile + else: + outfile = "-" + + if outfile == "-": sys.stdout.write("\n\n".join(codes)) else: - with open(args.outfile, "w") as outfile_fd: + with open(outfile, "w") as outfile_fd: outfile_fd.write("\n\n".join(codes)) diff --git a/build-helpers/loopy.spec b/build-helpers/loopy.spec index 7650624932eb4b6aa95e888410e8a0aa1a6d518a..3fcff944671291c652621afaa9cd82080605d73c 100644 --- a/build-helpers/loopy.spec +++ b/build-helpers/loopy.spec @@ -1,7 +1,13 @@ # -*- mode: python -*- +from os.path import basename, dirname, join +from glob import glob + single_file = True +# This makes the executable spew debug info. +debug = False + from os.path import expanduser a = Analysis(['bin/loopy'], @@ -11,6 +17,19 @@ a = Analysis(['bin/loopy'], runtime_hooks=None, excludes=["hedge", "meshpy", "pyopencl", "PIL"] ) + +import ply.lex +import ply.yacc + + +a.datas += [ + (join("py-src", "ply", "lex", basename(fn)), fn, "DATA") + for fn in glob(join(dirname(ply.lex.__file__), "*.py")) + ] + [ + (join("py-src", "ply", "yacc", basename(fn)), fn, "DATA") + for fn in glob(join(dirname(ply.yacc.__file__), "*.py")) + ] + pyz = PYZ(a.pure) if single_file: @@ -20,7 +39,7 @@ if single_file: a.zipfiles, a.datas, name='loopy', - debug=False, + debug=debug, strip=None, upx=True, console=True) @@ -29,7 +48,7 @@ else: a.scripts, exclude_binaries=True, name='loopy', - debug=False, + debug=debug, strip=None, upx=True, console=True) diff --git a/build-helpers/make-linux-build-docker.sh b/build-helpers/make-linux-build-docker.sh index 2deb2935e429a7d4281d3c09e884fb5df92125fc..90684a267e98f1976a31fd3036b824d2c2cdc1d8 100755 --- a/build-helpers/make-linux-build-docker.sh +++ b/build-helpers/make-linux-build-docker.sh @@ -1,5 +1,7 @@ #! /bin/bash +# should be run in this directory (build-helpers) + set -e set -x diff --git a/doc/reference.rst b/doc/reference.rst index 6a42ed944249aeeeff07677ceb9fc46fd0c45cb4..e79f17554119c5efe9351f46b8c501c2c27d2387 100644 --- a/doc/reference.rst +++ b/doc/reference.rst @@ -408,7 +408,7 @@ Dealing with Substitution Rules .. autofunction:: extract_subst -.. autofunction:: temporary_to_subst +.. autofunction:: assignment_to_subst .. autofunction:: expand_subst @@ -421,6 +421,8 @@ Caching, Precomputation and Prefetching .. autofunction:: buffer_array +.. autofunction:: alias_temporaries + Influencing data access ^^^^^^^^^^^^^^^^^^^^^^^ @@ -470,6 +472,11 @@ Arguments .. autofunction:: add_and_infer_dtypes +Batching +^^^^^^^^ + +.. autofunction:: to_batched + Finishing up ^^^^^^^^^^^^ @@ -528,7 +535,7 @@ Obtaining Kernel Statistics .. autofunction:: get_op_poly -.. autofunction:: get_DRAM_access_poly +.. autofunction:: get_gmem_access_poly .. autofunction:: get_barrier_poly diff --git a/doc/tutorial.rst b/doc/tutorial.rst index ef6867cd8d5729cb490a3fba8ad55e3ccc1ce9ea..c1ce360c88b962ef438809500413d747a3cd4b43 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1256,14 +1256,14 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`: Counting array accesses ~~~~~~~~~~~~~~~~~~~~~~~ -:func:`loopy.get_DRAM_access_poly` provides information on the number and type of +:func:`loopy.get_gmem_access_poly` provides information on the number and type of array loads and stores being performed in a kernel. To demonstrate this, we'll continue using the kernel from the previous example: .. doctest:: - >>> from loopy.statistics import get_DRAM_access_poly - >>> load_store_map = get_DRAM_access_poly(knl) + >>> from loopy.statistics import get_gmem_access_poly + >>> load_store_map = get_gmem_access_poly(knl) >>> print(load_store_map) (dtype('float32'), 'uniform', 'load') : [n, m, l] -> { 3 * n * m * l : n >= 1 and m >= 1 and l >= 1 } (dtype('float32'), 'uniform', 'store') : [n, m, l] -> { n * m * l : n >= 1 and m >= 1 and l >= 1 } @@ -1271,7 +1271,7 @@ continue using the kernel from the previous example: (dtype('float64'), 'uniform', 'store') : [n, m, l] -> { n * m : n >= 1 and m >= 1 and l >= 1 } <BLANKLINE> -:func:`loopy.get_DRAM_access_poly` returns a mapping of **{(** +:func:`loopy.get_gmem_access_poly` returns a mapping of **{(** :class:`numpy.dtype` **,** :class:`string` **,** :class:`string` **)** **:** :class:`islpy.PwQPolynomial` **}**. @@ -1313,7 +1313,7 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`: ~~~~~~~~~~~ Since we have not tagged any of the inames or parallelized the kernel across threads -(which would have produced iname tags), :func:`loopy.get_DRAM_access_poly` considers +(which would have produced iname tags), :func:`loopy.get_gmem_access_poly` considers the array accesses *uniform*. Now we'll parallelize the kernel and count the array accesses again. The resulting :class:`islpy.PwQPolynomial` will be more complicated this time, so we'll print the mapping manually to make it more legible: @@ -1321,7 +1321,7 @@ this time, so we'll print the mapping manually to make it more legible: .. doctest:: >>> knl_consec = lp.split_iname(knl, "k", 128, outer_tag="l.1", inner_tag="l.0") - >>> load_store_map = get_DRAM_access_poly(knl_consec) + >>> load_store_map = get_gmem_access_poly(knl_consec) >>> for key in sorted(load_store_map.dict.keys(), key=lambda k: str(k)): ... print("%s :\n%s\n" % (key, load_store_map.dict[key])) (dtype('float32'), 'consecutive', 'load') : @@ -1368,7 +1368,7 @@ our parallelization of the kernel: .. doctest:: >>> knl_nonconsec = lp.split_iname(knl, "k", 128, outer_tag="l.0", inner_tag="l.1") - >>> load_store_map = get_DRAM_access_poly(knl_nonconsec) + >>> load_store_map = get_gmem_access_poly(knl_nonconsec) >>> for key in sorted(load_store_map.dict.keys(), key=lambda k: str(k)): ... print("%s :\n%s\n" % (key, load_store_map.dict[key])) (dtype('float32'), 'nonconsecutive', 'load') : diff --git a/examples/fortran/run-floopy.sh b/examples/fortran/run-floopy.sh deleted file mode 100755 index fcea2c8b5ed58eed8738ad263df62cdf687b3d0f..0000000000000000000000000000000000000000 --- a/examples/fortran/run-floopy.sh +++ /dev/null @@ -1,6 +0,0 @@ -#! /bin/sh - -NAME="$1" -shift - -python $(which loopy) --lang=fpp "$NAME" - "$@" diff --git a/examples/fortran/run-loopy.sh b/examples/fortran/run-loopy.sh deleted file mode 100755 index f22f78424bc654e352fff2806120701d096d7068..0000000000000000000000000000000000000000 --- a/examples/fortran/run-loopy.sh +++ /dev/null @@ -1,3 +0,0 @@ -#! /bin/sh - -python $(which loopy) --lang=loopy "$NAME" - "$@" diff --git a/loopy/__init__.py b/loopy/__init__.py index a161e54783b5246cf6a9d7c587cc0ffc7da19425..8956856d4735a9554ed8c34741790ef1286d9e54 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -54,7 +54,7 @@ from loopy.kernel.tools import ( add_and_infer_dtypes) from loopy.kernel.creation import make_kernel, UniqueName from loopy.library.reduction import register_reduction_parser -from loopy.subst import extract_subst, expand_subst, temporary_to_subst +from loopy.subst import extract_subst, expand_subst, assignment_to_subst from loopy.precompute import precompute from loopy.buffer import buffer_array from loopy.fusion import fuse_kernels @@ -63,7 +63,8 @@ from loopy.padding import (split_arg_axis, find_padding_multiple, from loopy.preprocess import (preprocess_kernel, realize_reduction, infer_unknown_types) from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel -from loopy.statistics import get_op_poly, get_DRAM_access_poly, get_barrier_poly +from loopy.statistics import (get_op_poly, get_gmem_access_poly, + get_DRAM_access_poly, get_barrier_poly) from loopy.codegen import generate_code, generate_body from loopy.compiled import CompiledKernel from loopy.options import Options @@ -89,7 +90,7 @@ __all__ = [ "register_reduction_parser", - "extract_subst", "expand_subst", "temporary_to_subst", + "extract_subst", "expand_subst", "assignment_to_subst", "precompute", "buffer_array", "fuse_kernels", "split_arg_axis", "find_padding_multiple", "add_padding", @@ -103,7 +104,8 @@ __all__ = [ "generate_loop_schedules", "get_one_scheduled_kernel", "generate_code", "generate_body", - "get_op_poly", "get_DRAM_access_poly", "get_barrier_poly", + "get_op_poly", "get_gmem_access_poly", "get_DRAM_access_poly", + "get_barrier_poly", "CompiledKernel", @@ -660,23 +662,93 @@ def duplicate_inames(knl, inames, within, new_inames=None, suffix=None, # }}} -def rename_iname(knl, old_iname, new_iname, within): +# {{{ rename_inames + +def rename_iname(knl, old_iname, new_iname, existing_ok=False, within=None): """ :arg within: a stack match as understood by :func:`loopy.context_matching.parse_stack_match`. + :arg existing_ok: execute even if *new_iname* already exists """ var_name_gen = knl.get_var_name_generator() - if var_name_gen.is_name_conflicting(new_iname): + does_exist = var_name_gen.is_name_conflicting(new_iname) + + if does_exist and not existing_ok: raise ValueError("iname '%s' conflicts with an existing identifier" "--cannot rename" % new_iname) - knl = duplicate_inames(knl, [old_iname], within=within, new_inames=[new_iname]) + if does_exist: + # {{{ check that the domains match up + + dom = knl.get_inames_domain(frozenset((old_iname, new_iname))) + + var_dict = dom.get_var_dict() + _, old_idx = var_dict[old_iname] + _, new_idx = var_dict[new_iname] + + par_idx = dom.dim(dim_type.param) + dom_old = dom.move_dims( + dim_type.param, par_idx, dim_type.set, old_idx, 1) + dom_old = dom_old.move_dims( + dim_type.set, dom_old.dim(dim_type.set), dim_type.param, par_idx, 1) + dom_old = dom_old.project_out( + dim_type.set, new_idx if new_idx < old_idx else new_idx - 1, 1) + + par_idx = dom.dim(dim_type.param) + dom_new = dom.move_dims( + dim_type.param, par_idx, dim_type.set, new_idx, 1) + dom_new = dom_new.move_dims( + dim_type.set, dom_new.dim(dim_type.set), dim_type.param, par_idx, 1) + dom_new = dom_new.project_out( + dim_type.set, old_idx if old_idx < new_idx else old_idx - 1, 1) + + if not (dom_old <= dom_new and dom_new <= dom_old): + raise LoopyError( + "inames {old} and {new} do not iterate over the same domain" + .format(old=old_iname, new=new_iname)) + + # }}} + + from pymbolic import var + subst_dict = {old_iname: var(new_iname)} + + from loopy.context_matching import parse_stack_match + within = parse_stack_match(within) + + from pymbolic.mapper.substitutor import make_subst_func + rule_mapping_context = SubstitutionRuleMappingContext( + knl.substitutions, var_name_gen) + ijoin = RuleAwareSubstitutionMapper(rule_mapping_context, + make_subst_func(subst_dict), within) + + knl = rule_mapping_context.finish_kernel( + ijoin.map_kernel(knl)) + + new_instructions = [] + for insn in knl.instructions: + if (old_iname in insn.forced_iname_deps + and within(knl, insn, ())): + insn = insn.copy( + forced_iname_deps=( + (insn.forced_iname_deps - frozenset([old_iname])) + | frozenset([new_iname]))) + + new_instructions.append(insn) + + knl = knl.copy(instructions=new_instructions) + + else: + knl = duplicate_inames( + knl, [old_iname], within=within, new_inames=[new_iname]) + knl = remove_unused_inames(knl, [old_iname]) return knl +# }}} + # {{{ link inames @@ -1845,4 +1917,176 @@ def tag_instructions(kernel, new_tag, within=None): # }}} + +# {{{ alias_temporaries + +def alias_temporaries(knl, names, base_name_prefix=None): + """Sets all temporaries given by *names* to be backed by a single piece of + storage. Also introduces ordering structures ("groups") to prevent the + usage of each temporary to interfere with another. + + :arg base_name_prefix: an identifier to be used for the common storage + area + """ + gng = knl.get_group_name_generator() + group_names = [gng("tmpgrp_"+name) for name in names] + + if base_name_prefix is None: + base_name_prefix = "temp_storage" + + vng = knl.get_var_name_generator() + base_name = vng(base_name_prefix) + + names_set = set(names) + + new_insns = [] + for insn in knl.instructions: + temp_deps = insn.dependency_names() & names_set + + if not temp_deps: + new_insns.append(insn) + continue + + if len(temp_deps) > 1: + raise LoopyError("Instruction {insn} refers to multiple of the " + "temporaries being aliased, namely '{temps}'. Cannot alias." + .format( + insn=insn.id, + temps=", ".join(temp_deps))) + + temp_name, = temp_deps + temp_idx = names.index(temp_name) + group_name = group_names[temp_idx] + other_group_names = ( + frozenset(group_names[:temp_idx]) + | frozenset(group_names[temp_idx+1:])) + + new_insns.append( + insn.copy( + groups=insn.groups | frozenset([group_name]), + conflicts_with_groups=( + insn.conflicts_with_groups | other_group_names))) + + new_temporary_variables = {} + for tv in six.itervalues(knl.temporary_variables): + if tv.name in names_set: + if tv.base_storage is not None: + raise LoopyError("temporary variable '{tv}' already has " + "a defined storage array -- cannot alias" + .format(tv=tv.name)) + + new_temporary_variables[tv.name] = \ + tv.copy(base_storage=base_name) + else: + new_temporary_variables[tv.name] = tv + + return knl.copy( + instructions=new_insns, + temporary_variables=new_temporary_variables) + +# }}} + + +# {{{ to_batched + +class _BatchVariableChanger(RuleAwareIdentityMapper): + def __init__(self, rule_mapping_context, kernel, batch_varying_args, + batch_iname_expr): + super(_BatchVariableChanger, self).__init__(rule_mapping_context) + + self.kernel = kernel + self.batch_varying_args = batch_varying_args + self.batch_iname_expr = batch_iname_expr + + def needs_batch_subscript(self, name): + return ( + name in self.kernel.temporary_variables + or + name in self.batch_varying_args) + + def map_subscript(self, expr, expn_state): + if not self.needs_batch_subscript(expr.aggregate.name): + return super(_BatchVariableChanger, self).map_subscript(expr, expn_state) + + idx = expr.index + if not isinstance(idx, tuple): + idx = (idx,) + + return type(expr)(expr.aggregate, (self.batch_iname_expr,) + idx) + + def map_variable(self, expr, expn_state): + if not self.needs_batch_subscript(expr.name): + return super(_BatchVariableChanger, self).map_variable(expr, expn_state) + + return expr.aggregate[self.batch_iname_expr] + + +def to_batched(knl, nbatches, batch_varying_args, batch_iname_prefix="ibatch"): + """Takes in a kernel that carries out an operation and returns a kernel + that carries out a batch of these operations. + + :arg nbatches: the number of batches. May be a constant non-negative + integer or a string, which will be added as an integer argument. + :arg batch_varying_args: a list of argument names that depend vary per-batch. + Each such variable will have a batch index added. + """ + + from pymbolic import var + + vng = knl.get_var_name_generator() + batch_iname = vng(batch_iname_prefix) + batch_iname_expr = var(batch_iname) + + new_args = [] + + batch_dom_str = "{[%(iname)s]: 0 <= %(iname)s < %(nbatches)s}" % { + "iname": batch_iname, + "nbatches": nbatches, + } + + if not isinstance(nbatches, int): + batch_dom_str = "[%s] -> " % nbatches + batch_dom_str + new_args.append(ValueArg(nbatches, dtype=knl.index_dtype)) + + nbatches_expr = var(nbatches) + else: + nbatches_expr = nbatches + + batch_domain = isl.BasicSet(batch_dom_str) + new_domains = [batch_domain] + knl.domains + + for arg in knl.args: + if arg.name in batch_varying_args: + if isinstance(arg, ValueArg): + arg = GlobalArg(arg.name, arg.dtype, shape=(nbatches_expr,), + dim_tags="c") + else: + arg = arg.copy( + shape=(nbatches_expr,) + arg.shape, + dim_tags=("c",) * (len(arg.shape) + 1)) + + new_args.append(arg) + + new_temps = {} + + for temp in six.itervalues(knl.temporary_variables): + new_temps[temp.name] = temp.copy( + shape=(nbatches_expr,) + temp.shape, + dim_tags=("c",) * (len(arg.shape) + 1)) + + knl = knl.copy( + domains=new_domains, + args=new_args, + temporary_variables=new_temps) + + rule_mapping_context = SubstitutionRuleMappingContext( + knl.substitutions, vng) + bvc = _BatchVariableChanger(rule_mapping_context, + knl, batch_varying_args, batch_iname_expr) + return rule_mapping_context.finish_kernel( + bvc.map_kernel(knl)) + + +# }}} + # vim: foldmethod=marker diff --git a/loopy/auto_test.py b/loopy/auto_test.py index c264428279795d2d30ab488031758d7bb7468226..e5f88417df6ca555d6475c21257534f8e995e812 100644 --- a/loopy/auto_test.py +++ b/loopy/auto_test.py @@ -353,14 +353,16 @@ def auto_test_vs_ref( ref_knl, ctx, test_knl, op_count=[], op_label=[], parameters={}, print_ref_code=False, print_code=True, warmup_rounds=2, dump_binary=False, - fills_entire_output=None, do_check=True, check_result=None - ): + fills_entire_output=None, do_check=True, check_result=None, + max_test_kernel_count=1, + quiet=False): """Compare results of `ref_knl` to the kernels generated by scheduling *test_knl*. :arg check_result: a callable with :class:`numpy.ndarray` arguments *(result, reference_result)* returning a a tuple (class:`bool`, message) indicating correctness/acceptability of the result + :arg max_test_kernel_count: Stop testing after this many *test_knl* """ import pyopencl as cl @@ -416,7 +418,7 @@ def auto_test_vs_ref( break ref_compiled = CompiledKernel(ref_ctx, ref_sched_kernel) - if print_ref_code: + if not quiet and print_ref_code: print(75*"-") print("Reference Code:") print(75*"-") @@ -469,7 +471,7 @@ def auto_test_vs_ref( logger.info("%s (ref): run done" % ref_knl.name) ref_evt.wait() - ref_elapsed = 1e-9*(ref_evt.profile.END-ref_evt.profile.SUBMIT) + ref_elapsed_event = 1e-9*(ref_evt.profile.END-ref_evt.profile.START) break @@ -488,28 +490,25 @@ def auto_test_vs_ref( properties=cl.command_queue_properties.PROFILING_ENABLE) args = None - from loopy.kernel import LoopKernel - if not isinstance(test_knl, LoopKernel): - warn("Passing an iterable of kernels to auto_test_vs_ref " - "is deprecated--just pass the kernel instead. " - "Scheduling will be performed in auto_test_vs_ref.", - DeprecationWarning, stacklevel=2) - - test_kernels = test_knl + from loopy.kernel import kernel_state + if test_knl.state not in [ + kernel_state.PREPROCESSED, + kernel_state.SCHEDULED]: + test_knl = lp.preprocess_kernel(test_knl) + + if not test_knl.schedule: + test_kernels = lp.generate_loop_schedules(test_knl) else: - from loopy.kernel import kernel_state - if test_knl.state not in [ - kernel_state.PREPROCESSED, - kernel_state.SCHEDULED]: - test_knl = lp.preprocess_kernel(test_knl) - - if not test_knl.schedule: - test_kernels = lp.generate_loop_schedules(test_knl) - else: - test_kernels = [test_knl] + test_kernels = [test_knl] + + test_kernel_count = 0 from loopy.preprocess import infer_unknown_types for i, kernel in enumerate(test_kernels): + test_kernel_count += 1 + if test_kernel_count > max_test_kernel_count: + break + kernel = infer_unknown_types(kernel, expect_completion=True) compiled = CompiledKernel(ctx, kernel) @@ -521,16 +520,17 @@ def auto_test_vs_ref( queue, ref_arg_data, parameters) args["out_host"] = False - print(75*"-") - print("Kernel #%d:" % i) - print(75*"-") - if print_code: - print(compiled.get_highlighted_code()) + if not quiet: print(75*"-") - if dump_binary: - print(type(compiled.cl_program)) - print(compiled.cl_program.binaries[0]) + print("Kernel #%d:" % i) print(75*"-") + if print_code: + print(compiled.get_highlighted_code()) + print(75*"-") + if dump_binary: + print(type(compiled.cl_program)) + print(compiled.cl_program.binaries[0]) + print(75*"-") logger.info("%s: run warmup" % (knl.name)) @@ -596,16 +596,15 @@ def auto_test_vs_ref( evt_start.wait() evt_end.wait() - elapsed = (1e-9*events[-1].profile.END - - 1e-9*events[0].profile.SUBMIT) \ + elapsed_event = (1e-9*events[-1].profile.END + - 1e-9*events[0].profile.START) \ / timing_rounds try: - elapsed_evt_2 = "%g" % \ - ((1e-9*evt_end.profile.START + elapsed_event_marker = ((1e-9*evt_end.profile.START - 1e-9*evt_start.profile.START) / timing_rounds) except cl.RuntimeError: - elapsed_evt_2 = "<unavailable>" + elapsed_event_marker = None elapsed_wall = (stop_time-start_time)/timing_rounds @@ -620,28 +619,36 @@ def auto_test_vs_ref( for cnt, lbl in zip(op_count, op_label): rates += " %g %s/s" % (cnt/elapsed_wall, lbl) - print("elapsed: %g s event, %s s marker-event %g s wall " - "(%d rounds)%s" % ( - elapsed, elapsed_evt_2, elapsed_wall, timing_rounds, rates)) + if not quiet: + def format_float_or_none(v): + if v is None: + return "<unavailable>" + else: + return "%g" % v + + print("elapsed: %s s event, %s s marker-event %s s wall " + "(%d rounds)%s" % ( + format_float_or_none(elapsed_event), + format_float_or_none(elapsed_event_marker), + format_float_or_none(elapsed_wall), timing_rounds, rates)) if do_check: ref_rates = "" for cnt, lbl in zip(op_count, op_label): - ref_rates += " %g %s/s" % (cnt/ref_elapsed, lbl) - print("ref: elapsed: %g s event, %g s wall%s" % ( - ref_elapsed, ref_elapsed_wall, ref_rates)) + ref_rates += " %g %s/s" % (cnt/ref_elapsed_event, lbl) + if not quiet: + print("ref: elapsed: %g s event, %g s wall%s" % ( + ref_elapsed_event, ref_elapsed_wall, ref_rates)) # }}} result_dict = {} - result_dict["elapsed"] = elapsed - result_dict["elapsed_evt_2"] = elapsed_evt_2 + result_dict["elapsed_event"] = elapsed_event + result_dict["elapsed_event_marker"] = elapsed_event_marker result_dict["elapsed_wall"] = elapsed_wall result_dict["timing_rounds"] = timing_rounds - result_dict["rates"] = rates - result_dict["ref_elapsed"] = elapsed - result_dict["ref_elapsed_wall"] = elapsed_wall - result_dict["ref_rates"] = ref_rates + result_dict["ref_elapsed_event"] = ref_elapsed_event + result_dict["ref_elapsed_wall"] = ref_elapsed_wall return result_dict diff --git a/loopy/buffer.py b/loopy/buffer.py index fdc3774b29f64ba5ae8c465076f48b805836d40b..1e6a137b551645a25145ddaaeb8eea40eea554af 100644 --- a/loopy/buffer.py +++ b/loopy/buffer.py @@ -29,9 +29,15 @@ from loopy.symbolic import (get_dependencies, RuleAwareIdentityMapper, SubstitutionRuleMappingContext, SubstitutionMapper) from pymbolic.mapper.substitutor import make_subst_func +from pytools.persistent_dict import PersistentDict +from loopy.tools import LoopyKeyBuilder, PymbolicExpressionHashWrapper +from loopy.version import DATA_MODEL_VERSION from pymbolic import var +import logging +logger = logging.getLogger(__name__) + # {{{ replace array access @@ -117,6 +123,11 @@ class ArrayAccessReplacer(RuleAwareIdentityMapper): # }}} +buffer_array_cache = PersistentDict("loopy-buffer-array-cachee"+DATA_MODEL_VERSION, + key_builder=LoopyKeyBuilder()) + + +# Adding an argument? also add something to the cache_key below. def buffer_array(kernel, var_name, buffer_inames, init_expression=None, store_expression=None, within=None, default_tag="l.auto", temporary_is_local=None, fetch_bounding_box=False): @@ -173,6 +184,25 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, # }}} + # {{{ caching + + from loopy import CACHING_ENABLED + + cache_key = (kernel, var_name, tuple(buffer_inames), + PymbolicExpressionHashWrapper(init_expression), + PymbolicExpressionHashWrapper(store_expression), within, + default_tag, temporary_is_local, fetch_bounding_box) + + if CACHING_ENABLED: + try: + result = buffer_array_cache[cache_key] + logger.info("%s: buffer_array cache hit" % kernel.name) + return result + except KeyError: + pass + + # }}} + var_name_gen = kernel.get_var_name_generator() within_inames = set() @@ -413,6 +443,10 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, from loopy import tag_inames kernel = tag_inames(kernel, new_iname_to_tag) + if CACHING_ENABLED: + from loopy.preprocess import prepare_for_caching + buffer_array_cache[cache_key] = prepare_for_caching(kernel) + return kernel # vim: foldmethod=marker diff --git a/loopy/compiled.py b/loopy/compiled.py index d8d127c0bc89f439569b8a016e485054c05f2bc2..6d4396b5a11ef99bcf7e8e0b03b5d9d7d8fb6d88 100644 --- a/loopy/compiled.py +++ b/loopy/compiled.py @@ -35,6 +35,9 @@ from pytools.py_codegen import ( Indentation, PythonFunctionGenerator) from loopy.diagnostic import LoopyError +import logging +logger = logging.getLogger(__name__) + # {{{ object array argument packing @@ -716,11 +719,15 @@ def generate_invoker(kernel, cl_kernel, impl_arg_info, options): if not lsize_expr: lsize_expr = (1,) + def strify_tuple(t): + return "(%s,)" % ( + ", ".join("int(%s)" % strify(t_i) for t_i in t)) + gen("_lpy_evt = _lpy_cl.enqueue_nd_range_kernel(queue, cl_kernel, " "%(gsize)s, %(lsize)s, wait_for=wait_for, g_times_l=True)" % dict( - gsize=strify(gsize_expr), - lsize=strify(lsize_expr))) + gsize=strify_tuple(gsize_expr), + lsize=strify_tuple(lsize_expr))) gen("") # }}} @@ -858,10 +865,13 @@ class CompiledKernel: code = invoke_editor(code, "code.cl") import pyopencl as cl + + logger.info("%s: opencl compilation start" % self.kernel.name) cl_program = cl.Program(self.context, code) cl_kernel = getattr( cl_program.build(options=kernel.options.cl_build_options), kernel.name) + logger.info("%s: opencl compilation done" % self.kernel.name) return _CLKernelInfo( kernel=kernel, diff --git a/loopy/context_matching.py b/loopy/context_matching.py index 61203ece2c38ae7beb385bd8b4758c3ce5eeeea8..a88e207002220a1be840114d71948869f566863d 100644 --- a/loopy/context_matching.py +++ b/loopy/context_matching.py @@ -94,11 +94,21 @@ class MatchExpressionBase(object): def __call__(self, kernel, matchable): raise NotImplementedError + def __ne__(self, other): + return not self.__eq__(other) + + class AllMatchExpression(MatchExpressionBase): def __call__(self, kernel, matchable): return True + def update_persistent_hash(self, key_hash, key_builder): + key_builder.rec(key_hash, "all_match_expr") + + def __eq__(self, other): + return (type(self) == type(other)) + class AndMatchExpression(MatchExpressionBase): def __init__(self, children): @@ -110,6 +120,14 @@ class AndMatchExpression(MatchExpressionBase): def __str__(self): return "(%s)" % (" and ".join(str(ch) for ch in self.children)) + def update_persistent_hash(self, key_hash, key_builder): + key_builder.rec(key_hash, "and_match_expr") + key_builder.rec(key_hash, self.children) + + def __eq__(self, other): + return (type(self) == type(other) + and self.children == other.children) + class OrMatchExpression(MatchExpressionBase): def __init__(self, children): @@ -121,6 +139,14 @@ class OrMatchExpression(MatchExpressionBase): def __str__(self): return "(%s)" % (" or ".join(str(ch) for ch in self.children)) + def update_persistent_hash(self, key_hash, key_builder): + key_builder.rec(key_hash, "or_match_expr") + key_builder.rec(key_hash, self.children) + + def __eq__(self, other): + return (type(self) == type(other) + and self.children == other.children) + class NotMatchExpression(MatchExpressionBase): def __init__(self, child): @@ -132,6 +158,14 @@ class NotMatchExpression(MatchExpressionBase): def __str__(self): return "(not %s)" % str(self.child) + def update_persistent_hash(self, key_hash, key_builder): + key_builder.rec(key_hash, "not_match_expr") + key_builder.rec(key_hash, self.child) + + def __eq__(self, other): + return (type(self) == type(other) + and self.child == other.child) + class GlobMatchExpressionBase(MatchExpressionBase): def __init__(self, glob): @@ -146,6 +180,14 @@ class GlobMatchExpressionBase(MatchExpressionBase): descr = descr[:descr.find("Match")] return descr.lower() + ":" + self.glob + def update_persistent_hash(self, key_hash, key_builder): + key_builder.rec(key_hash, type(self).__name__) + key_builder.rec(key_hash, self.glob) + + def __eq__(self, other): + return (type(self) == type(other) + and self.glob == other.glob) + class IdMatchExpression(GlobMatchExpressionBase): def __call__(self, kernel, matchable): @@ -284,18 +326,31 @@ def parse_match(expr_str): # {{{ stack match objects class StackMatchComponent(object): - pass + def __ne__(self, other): + return not self.__eq__(other) class StackAllMatchComponent(StackMatchComponent): def __call__(self, kernel, stack): return True + def update_persistent_hash(self, key_hash, key_builder): + key_builder.rec(key_hash, "all_match") + + def __eq__(self, other): + return (type(self) == type(other)) + class StackBottomMatchComponent(StackMatchComponent): def __call__(self, kernel, stack): return not stack + def update_persistent_hash(self, key_hash, key_builder): + key_builder.rec(key_hash, "bottom_match") + + def __eq__(self, other): + return (type(self) == type(other)) + class StackItemMatchComponent(StackMatchComponent): def __init__(self, match_expr, inner_match): @@ -312,6 +367,16 @@ class StackItemMatchComponent(StackMatchComponent): return self.inner_match(kernel, stack[1:]) + def update_persistent_hash(self, key_hash, key_builder): + key_builder.rec(key_hash, "item_match") + key_builder.rec(key_hash, self.match_expr) + key_builder.rec(key_hash, self.inner_match) + + def __eq__(self, other): + return (type(self) == type(other) + and self.match_expr == other.match_expr + and self.inner_match == other.inner_match) + class StackWildcardMatchComponent(StackMatchComponent): def __init__(self, inner_match): @@ -348,6 +413,18 @@ class StackMatch(object): def __init__(self, root_component): self.root_component = root_component + def update_persistent_hash(self, key_hash, key_builder): + key_builder.rec(key_hash, self.root_component) + + def __eq__(self, other): + return ( + type(self) == type(other) + and + self.root_component == other.root_component) + + def __ne__(self, other): + return not self.__eq__(other) + def __call__(self, kernel, insn, rule_stack): """ :arg rule_stack: a tuple of (name, tags) rule invocation, outermost first diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index 97005cab607f1c1db5341ca3e29bd80ec7e761c1..8293035a9e9ae951c069136b8705673a7f4f7d93 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -25,6 +25,7 @@ THE SOFTWARE. import re import six +from six.moves import intern import loopy as lp import numpy as np @@ -221,7 +222,7 @@ class F2LoopyTranslator(FTreeWalkerBase): def add_expression_instruction(self, lhs, rhs): scope = self.scope_stack[-1] - new_id = "insn%d" % self.insn_id_counter + new_id = intern("insn%d" % self.insn_id_counter) self.insn_id_counter += 1 if self.auto_dependencies and scope.previous_instruction_id: @@ -447,7 +448,7 @@ class F2LoopyTranslator(FTreeWalkerBase): def map_IfThen(self, node): scope = self.scope_stack[-1] - cond_name = "loopy_cond%d" % self.condition_id_counter + cond_name = intern("loopy_cond%d" % self.condition_id_counter) self.condition_id_counter += 1 assert cond_name not in scope.type_map @@ -543,6 +544,8 @@ class F2LoopyTranslator(FTreeWalkerBase): loop_var_suffix += 1 loopy_loop_var = loop_var + "_%d" % loop_var_suffix + loopy_loop_var = intern(loopy_loop_var) + # }}} space = isl.Space.create_from_names(isl.DEFAULT_CONTEXT, diff --git a/loopy/fusion.py b/loopy/fusion.py index c14d936afb4ff063bad9e9ff7e1189daadf15a5c..8845951ea293d5a0e66d457a4bcb8680db57623c 100644 --- a/loopy/fusion.py +++ b/loopy/fusion.py @@ -143,8 +143,12 @@ def _fuse_two_kernels(knla, knlb): else: if b_arg != knla.arg_dict[b_arg.name]: raise LoopyError( - "argument '%s' has inconsistent definition between " - "the two kernels being merged" % b_arg.name) + "argument '{arg_name}' has inconsistent definition between " + "the two kernels being merged ({arg_a} <-> {arg_b})" + .format( + arg_name=b_arg.name, + arg_a=str(knla.arg_dict[b_arg.name]), + arg_b=str(b_arg))) # }}} @@ -214,9 +218,9 @@ def _fuse_two_kernels(knla, knlb): assump_a, assump_b = isl.align_two(assump_a, assump_b) shared_param_names = list( - set(dom_a.get_var_dict(dim_type.set)) + set(assump_a.get_var_dict(dim_type.set)) & - set(dom_b.get_var_dict(dim_type.set))) + set(assump_b.get_var_dict(dim_type.set))) assump_a_s = assump_a.project_out_except(shared_param_names, [dim_type.param]) assump_b_s = assump_a.project_out_except(shared_param_names, [dim_type.param]) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 16695e0ad26d3ec9d4f4b855aea15585117ff227..4e31db993517fc4426766a6e881fb7f56853915c 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -25,7 +25,7 @@ THE SOFTWARE. """ import six -from six.moves import range, zip +from six.moves import range, zip, intern import numpy as np from pytools import RecordWithoutPickling, Record, memoize_method @@ -335,6 +335,9 @@ class LoopKernel(RecordWithoutPickling): def all_variable_names(self): return ( set(six.iterkeys(self.temporary_variables)) + | set(tv.base_storage + for tv in six.itervalues(self.temporary_variables) + if tv.base_storage is not None) | set(six.iterkeys(self.substitutions)) | set(arg.name for arg in self.args) | set(self.all_inames())) @@ -351,7 +354,18 @@ class LoopKernel(RecordWithoutPickling): for id_str in generate_unique_names(based_on): if id_str not in used_ids: - return id_str + return intern(id_str) + + def all_group_names(self): + result = set() + for insn in self.instructions: + result.update(insn.groups) + result.update(insn.conflicts_with_groups) + + return frozenset(result) + + def get_group_name_generator(self): + return _UniqueVarNameGenerator(set(self.all_group_names())) def get_var_descriptor(self, name): try: @@ -577,7 +591,8 @@ class LoopKernel(RecordWithoutPickling): def all_inames(self): result = set() for dom in self.domains: - result.update(dom.get_var_names(dim_type.set)) + result.update( + intern(n) for n in dom.get_var_names(dim_type.set)) return frozenset(result) @memoize_method @@ -588,7 +603,8 @@ class LoopKernel(RecordWithoutPickling): for dom in self.domains: result.update(set(dom.get_var_names(dim_type.param)) - all_inames) - return frozenset(result) + from loopy.tools import intern_frozenset_of_ids + return intern_frozenset_of_ids(result) def outer_params(self, domains=None): if domains is None: @@ -600,7 +616,8 @@ class LoopKernel(RecordWithoutPickling): all_inames.update(dom.get_var_names(dim_type.set)) all_params.update(dom.get_var_names(dim_type.param)) - return all_params-all_inames + from loopy.tools import intern_frozenset_of_ids + return intern_frozenset_of_ids(all_params-all_inames) @memoize_method def all_insn_inames(self): @@ -747,6 +764,15 @@ class LoopKernel(RecordWithoutPickling): for insn in self.instructions for var_name, _ in insn.assignees_and_indices()) + @memoize_method + def get_temporary_to_base_storage_map(self): + result = {} + for tv in six.itervalues(self.temporary_variables): + if tv.base_storage: + result[tv.name] = tv.base_storage + + return result + # }}} # {{{ argument wrangling diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py index 2923c945c08c6a5fc2d19831efcdf235ea949a72..9a3c9c0cfbc79e9aba934b7a7c051665b19c19c5 100644 --- a/loopy/kernel/array.py +++ b/loopy/kernel/array.py @@ -766,11 +766,9 @@ class ArrayBase(Record): info_entries.append("type: %s" % type_str) if self.shape is None: - pass + info_entries.append("shape: unknown") elif self.shape is lp.auto: info_entries.append("shape: auto") - elif self.shape == (): - pass else: info_entries.append("shape: (%s)" % ", ".join(str(i) for i in self.shape)) @@ -874,11 +872,15 @@ class ArrayBase(Record): return 1 - def decl_info(self, target, is_written, index_dtype): + def decl_info(self, target, is_written, index_dtype, shape_override=None): """Return a list of :class:`loopy.codegen.ImplementedDataInfo` instances corresponding to the array. """ + array_shape = self.shape + if shape_override is not None: + array_shape = shape_override + from loopy.codegen import ImplementedDataInfo from loopy.kernel.data import ValueArg @@ -978,10 +980,10 @@ class ArrayBase(Record): dim_tag = self.dim_tags[user_axis] if isinstance(dim_tag, FixedStrideArrayDimTag): - if self.shape is None: + if array_shape is None: new_shape_axis = None else: - new_shape_axis = self.shape[user_axis] + new_shape_axis = array_shape[user_axis] import loopy as lp if dim_tag.stride is lp.auto: @@ -1004,7 +1006,7 @@ class ArrayBase(Record): yield res elif isinstance(dim_tag, SeparateArrayArrayDimTag): - shape_i = self.shape[user_axis] + shape_i = array_shape[user_axis] if not is_integer(shape_i): raise LoopyError("shape of '%s' has non-constant " "integer axis %d (0-based)" % ( @@ -1018,7 +1020,7 @@ class ArrayBase(Record): yield res elif isinstance(dim_tag, VectorArrayDimTag): - shape_i = self.shape[user_axis] + shape_i = array_shape[user_axis] if not is_integer(shape_i): raise LoopyError("shape of '%s' has non-constant " "integer axis %d (0-based)" % ( diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index e891f06261c27d1dff446e7ec567dbe88c0647da..4683ca905c32cdcade15e32c9745ec353d886375 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -26,6 +26,7 @@ THE SOFTWARE. import numpy as np +from loopy.tools import intern_frozenset_of_ids from loopy.symbolic import IdentityMapper, WalkMapper from loopy.kernel.data import ( InstructionBase, ExpressionInstruction, SubstitutionRule) @@ -33,7 +34,7 @@ import islpy as isl from islpy import dim_type import six -from six.moves import range, zip +from six.moves import range, zip, intern import re import sys @@ -216,7 +217,7 @@ def parse_insn(insn): opt_value = option[equal_idx+1:].strip() if opt_key == "id": - insn_id = opt_value + insn_id = intern(opt_value) elif opt_key == "id_prefix": insn_id = UniqueName(opt_value) elif opt_key == "priority": @@ -235,17 +236,18 @@ def parse_insn(insn): insn_deps_is_final = True opt_value = (opt_value[1:]).strip() - insn_deps = frozenset(dep.strip() for dep in opt_value.split(":") + insn_deps = frozenset( + intern(dep.strip()) for dep in opt_value.split(":") if dep.strip()) elif opt_key == "groups": insn_groups = frozenset( - grp.strip() for grp in opt_value.split(":") + intern(grp.strip()) for grp in opt_value.split(":") if grp.strip()) elif opt_key == "conflicts": conflicts_with_groups = frozenset( - grp.strip() for grp in opt_value.split(":") + intern(grp.strip()) for grp in opt_value.split(":") if grp.strip()) elif opt_key == "inames": @@ -255,10 +257,10 @@ def parse_insn(insn): else: forced_iname_deps_is_final = True - forced_iname_deps = frozenset(opt_value.split(":")) + forced_iname_deps = intern_frozenset_of_ids(opt_value.split(":")) elif opt_key == "if": - predicates = frozenset(opt_value.split(":")) + predicates = intern_frozenset_of_ids(opt_value.split(":")) elif opt_key == "tags": tags = tuple( @@ -284,7 +286,10 @@ def parse_insn(insn): "be variable or subscript" % lhs) return ExpressionInstruction( - id=insn_id, + id=( + intern(insn_id) + if isinstance(insn_id, str) + else insn_id), insn_deps=insn_deps, insn_deps_is_final=insn_deps_is_final, groups=insn_groups, @@ -326,7 +331,17 @@ def parse_insn(insn): def parse_if_necessary(insn, defines): if isinstance(insn, InstructionBase): - yield insn, [] + yield insn.copy( + id=intern(insn.id) if isinstance(insn.id, str) else insn.id, + insn_deps=frozenset(intern(dep) for dep in insn.insn_deps), + groups=frozenset(intern(grp) for grp in insn.groups), + conflicts_with_groups=frozenset( + intern(grp) for grp in insn.conflicts_with_groups), + forced_iname_deps=frozenset( + intern(iname) for iname in insn.forced_iname_deps), + predicates=frozenset( + intern(pred) for pred in insn.predicates), + ), [] return elif not isinstance(insn, str): raise TypeError("Instructions must be either an Instruction " @@ -692,7 +707,7 @@ class CSEToAssignmentMapper(IdentityMapper): return var -def expand_cses(knl): +def expand_cses(instructions, cse_prefix="cse_expr"): def add_assignment(base_name, expr, dtype): if base_name is None: base_name = "var" @@ -706,16 +721,15 @@ def expand_cses(knl): dtype = np.dtype(dtype) from loopy.kernel.data import TemporaryVariable - new_temp_vars[new_var_name] = TemporaryVariable( + new_temp_vars.append(TemporaryVariable( name=new_var_name, dtype=dtype, is_local=lp.auto, - shape=()) + shape=())) from pymbolic.primitives import Variable new_insn = ExpressionInstruction( - id=knl.make_unique_instruction_id( - extra_used_ids=newly_created_insn_ids), + id=None, assignee=Variable(new_var_name), expression=expr, predicates=insn.predicates) newly_created_insn_ids.add(new_insn.id) @@ -727,20 +741,19 @@ def expand_cses(knl): new_insns = [] - var_name_gen = knl.get_var_name_generator() + from pytools import UniqueNameGenerator + var_name_gen = UniqueNameGenerator(forced_prefix=cse_prefix) newly_created_insn_ids = set() - new_temp_vars = knl.temporary_variables.copy() + new_temp_vars = [] - for insn in knl.instructions: + for insn in instructions: if isinstance(insn, ExpressionInstruction): new_insns.append(insn.copy(expression=cseam(insn.expression))) else: new_insns.append(insn) - return knl.copy( - instructions=new_insns, - temporary_variables=new_temp_vars) + return (new_insns, new_temp_vars) # }}} @@ -1169,6 +1182,11 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): # }}} + instructions, cse_temp_vars = expand_cses(instructions) + for tv in cse_temp_vars: + temporary_variables[tv.name] = tv + del cse_temp_vars + domains = parse_domains(domains, defines) arg_guesser = ArgumentGuesser(domains, instructions, @@ -1194,10 +1212,9 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): check_for_nonexistent_iname_deps(knl) - knl = tag_reduction_inames_as_sequential(knl) knl = create_temporaries(knl, default_order) knl = determine_shapes_of_temporaries(knl) - knl = expand_cses(knl) + knl = tag_reduction_inames_as_sequential(knl) knl = expand_defines_in_shapes(knl, defines) knl = guess_arg_shape_if_requested(knl, default_order) knl = apply_default_order_to_args(knl, default_order) diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index b88929358afe99c1c9b5450c53df0f4cec7473fa..c5cecfde2fa4005669d1fca5f3439ca282f2c3c0 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -25,6 +25,7 @@ THE SOFTWARE. """ +from six.moves import intern import numpy as np from pytools import Record, memoize_method from loopy.kernel.array import ArrayBase @@ -185,6 +186,8 @@ def parse_tag(tag): class KernelArgument(Record): def __init__(self, **kwargs): + kwargs["name"] = intern(kwargs.pop("name")) + dtype = kwargs.pop("dtype", None) if isinstance(dtype, np.dtype): @@ -312,6 +315,11 @@ class TemporaryVariable(ArrayBase): Whether this is temporary lives in ``local`` memory. May be *True*, *False*, or :class:`loopy.auto` if this is to be automatically determined. + + .. attribute:: base_storage + + The name of a storage array that is to be used to actually + hold the data in this temporary. """ min_target_axes = 0 @@ -320,12 +328,14 @@ class TemporaryVariable(ArrayBase): allowed_extra_kwargs = [ "storage_shape", "base_indices", - "is_local" + "is_local", + "base_storage" ] def __init__(self, name, dtype=None, shape=(), is_local=auto, dim_tags=None, offset=0, strides=None, order=None, - base_indices=None, storage_shape=None): + base_indices=None, storage_shape=None, + base_storage=None): """ :arg dtype: :class:`loopy.auto` or a :class:`numpy.dtype` :arg shape: :class:`loopy.auto` or a shape tuple @@ -339,35 +349,29 @@ class TemporaryVariable(ArrayBase): if base_indices is None: base_indices = (0,) * len(shape) - ArrayBase.__init__(self, name=name, + ArrayBase.__init__(self, name=intern(name), dtype=dtype, shape=shape, dim_tags=dim_tags, order="C", base_indices=base_indices, is_local=is_local, - storage_shape=storage_shape) + storage_shape=storage_shape, + base_storage=base_storage) @property def nbytes(self): - from pytools import product - return product(si for si in self.shape)*self.dtype.itemsize - - def get_arg_decl(self, target, name_suffix, shape, dtype, is_written): - from cgen import ArrayOf - from loopy.codegen import POD # uses the correct complex type - from cgen.opencl import CLLocal - - temp_var_decl = POD(target, dtype, self.name) - - # FIXME take into account storage_shape, or something like it - storage_shape = shape + shape = self.shape + if self.storage_shape is not None: + shape = self.storage_shape - if storage_shape: - temp_var_decl = ArrayOf(temp_var_decl, - " * ".join(str(s) for s in storage_shape)) + from pytools import product + return product(si for si in shape)*self.dtype.itemsize - if self.is_local: - temp_var_decl = CLLocal(temp_var_decl) + def decl_info(self, target, index_dtype): + return super(TemporaryVariable, self).decl_info( + target, is_written=True, index_dtype=index_dtype, + shape_override=self.storage_shape) - return temp_var_decl + def get_arg_decl(self, target, name_suffix, shape, dtype, is_written): + return None def __str__(self): return self.stringify(include_typename=False) @@ -512,6 +516,9 @@ class InstructionBase(Record): forced_iname_deps_is_final, forced_iname_deps, priority, boostable, boostable_into, predicates, tags): + if insn_deps is None: + insn_deps = frozenset() + if groups is None: groups = frozenset() @@ -531,6 +538,17 @@ class InstructionBase(Record): if tags is None: tags = () + # Periodically reenable these and run the tests to ensure all + # performance-relevant identifiers are interned. + # + # from loopy.tools import is_interned + # assert is_interned(id) + # assert all(is_interned(dep) for dep in insn_deps) + # assert all(is_interned(grp) for grp in groups) + # assert all(is_interned(grp) for grp in conflicts_with_groups) + # assert all(is_interned(iname) for iname in forced_iname_deps) + # assert all(is_interned(pred) for pred in predicates) + assert isinstance(forced_iname_deps, frozenset) assert isinstance(insn_deps, frozenset) or insn_deps is None assert isinstance(groups, frozenset) @@ -650,6 +668,21 @@ class InstructionBase(Record): # }}} + def __setstate__(self, val): + super(InstructionBase, self).__setstate__(val) + + from loopy.tools import intern_frozenset_of_ids + + self.id = intern(self.id) + self.insn_deps = intern_frozenset_of_ids(self.insn_deps) + self.groups = intern_frozenset_of_ids(self.groups) + self.conflicts_with_groups = ( + intern_frozenset_of_ids(self.conflicts_with_groups)) + self.forced_iname_deps = ( + intern_frozenset_of_ids(self.forced_iname_deps)) + self.predicates = ( + intern_frozenset_of_ids(self.predicates)) + # }}} diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index b59c40731d91d1689d2cd9c00884069d35f7856a..be6f32a9bf78fab306cb4acd3f45a1a4f2e66f34 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -1,8 +1,6 @@ """Operations on the kernel object.""" -from __future__ import division -from __future__ import absolute_import -import six +from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -26,6 +24,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +import six +from six.moves import intern import numpy as np from islpy import dim_type @@ -204,7 +204,7 @@ def find_all_insn_inames(kernel): # current inames refer to. if par in kernel.all_inames(): - inames_new.add(par) + inames_new.add(intern(par)) # If something writes the bounds of a loop in which I'm # sitting, I had better be in the inames that the writer is diff --git a/loopy/precompute.py b/loopy/precompute.py index b1df5f6786bfee3abf9d1d58dfb30b5bb3d72bd3..ee7f815cf90cd2e870af4b153435083f264503e3 100644 --- a/loopy/precompute.py +++ b/loopy/precompute.py @@ -136,7 +136,7 @@ class RuleInvocationReplacer(RuleAwareIdentityMapper): access_descriptors, array_base_map, storage_axis_names, storage_axis_sources, non1_storage_axis_names, - temporary_name): + temporary_name, compute_insn_id): super(RuleInvocationReplacer, self).__init__(rule_mapping_context) self.subst_name = subst_name @@ -151,6 +151,7 @@ class RuleInvocationReplacer(RuleAwareIdentityMapper): self.non1_storage_axis_names = non1_storage_axis_names self.temporary_name = temporary_name + self.compute_insn_id = compute_insn_id def map_substitution(self, name, tag, arguments, expn_state): if not ( @@ -211,8 +212,26 @@ class RuleInvocationReplacer(RuleAwareIdentityMapper): # further as compute expression has already been seen # by rule_mapping_context. + self.replaced_something = True + return new_outer_expr + def map_kernel(self, kernel): + new_insns = [] + + for insn in kernel.instructions: + self.replaced_something = False + + insn = insn.with_transformed_expressions(self, kernel, insn) + + if self.replaced_something: + insn = insn.copy( + insn_deps=insn.insn_deps | frozenset([self.compute_insn_id])) + + new_insns.append(insn) + + return kernel.copy(instructions=new_insns) + # }}} @@ -220,7 +239,7 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, storage_axes=None, temporary_name=None, precompute_inames=None, storage_axis_to_tag={}, default_tag="l.auto", dtype=None, fetch_bounding_box=False, temporary_is_local=None, - insn_id=None): + compute_insn_id=None): """Precompute the expression described in the substitution rule determined by *subst_use* and store it in a temporary array. A precomputation needs two things to operate, a list of *sweep_inames* (order irrelevant) and an @@ -280,7 +299,7 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, If the specified inames do not already exist, they will be created. If they do already exist, their loop domain is verified against the one required for this precomputation. - :arg insn_id: The ID of the instruction performing the precomputation. + :arg compute_insn_id: The ID of the instruction performing the precomputation. If `storage_axes` is not specified, it defaults to the arrangement `<direct sweep axes><arguments>` with the direct sweep axes being the @@ -686,11 +705,11 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, # }}} from loopy.kernel.data import ExpressionInstruction - if insn_id is None: - insn_id = kernel.make_unique_instruction_id(based_on=c_subst_name) + if compute_insn_id is None: + compute_insn_id = kernel.make_unique_instruction_id(based_on=c_subst_name) compute_insn = ExpressionInstruction( - id=insn_id, + id=compute_insn_id, assignee=assignee, expression=compute_expression) @@ -703,7 +722,7 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, access_descriptors, abm, storage_axis_names, storage_axis_sources, non1_storage_axis_names, - temporary_name) + temporary_name, compute_insn_id) kernel = invr.map_kernel(kernel) kernel = kernel.copy( diff --git a/loopy/schedule.py b/loopy/schedule.py index b44569b97bf45038da949b3128c992ea87afba89..1d0dc1221a8280d73cdd03858bb72404c7c79afb 100644 --- a/loopy/schedule.py +++ b/loopy/schedule.py @@ -171,7 +171,7 @@ def find_used_inames_within(kernel, sched_index): return result -def loop_nest_map(kernel): +def find_loop_nest_map(kernel): """Returns a dictionary mapping inames to other inames that are always nested around them. """ @@ -212,6 +212,49 @@ def loop_nest_map(kernel): return result +def find_loop_insn_dep_map(kernel, loop_nest_map): + """Returns a dictionary mapping inames to other instruction ids that need to + be scheduled before the iname should be eligible for scheduling. + """ + + result = {} + + from loopy.kernel.data import ParallelTag + for insn in kernel.instructions: + for iname in kernel.insn_inames(insn): + if isinstance(kernel.iname_to_tag.get(iname), ParallelTag): + continue + + for dep_insn_id in insn.insn_deps: + dep_insn = kernel.id_to_insn[dep_insn_id] + dep_insn_inames = kernel.insn_inames(dep_insn) + + if iname in dep_insn_inames: + # Nothing to be learned, dependency is in loop over iname + # already. + continue + + # To make sure dep_insn belongs outside of iname, we must prove + # (via loop_nest_map) that all inames that dep_insn will be + # executed inside are nested *around* iname. + if not dep_insn_inames <= loop_nest_map[iname]: + continue + + iname_dep = result.setdefault(iname, set()) + if dep_insn_id not in iname_dep: + logger.debug("{knl}: loop dependency map: iname '{iname}' " + "depends on '{dep_insn}' via '{insn}'" + .format( + knl=kernel.name, + iname=iname, + dep_insn=dep_insn_id, + insn=insn.id)) + + iname_dep.add(dep_insn_id) + + return result + + def group_insn_counts(kernel): result = {} @@ -422,10 +465,14 @@ def generate_loop_schedules_internal( reachable_insn_ids = set() active_groups = frozenset(sched_state.active_group_counts) - for insn_id in sorted(sched_state.unscheduled_insn_ids, - key=lambda insn_id: kernel.id_to_insn[insn_id].priority, - reverse=True): + def insn_sort_key(insn_id): + insn = kernel.id_to_insn[insn_id] + return (insn.priority, len(active_groups & insn.groups)) + + insn_ids_to_try = sorted(sched_state.unscheduled_insn_ids, + key=insn_sort_key, reverse=True) + for insn_id in insn_ids_to_try: insn = kernel.id_to_insn[insn_id] is_ready = insn.insn_deps <= sched_state.scheduled_insn_ids @@ -497,7 +544,7 @@ def generate_loop_schedules_internal( else: new_active_group_counts[grp] = ( - sched_state.group_insn_counts[grp]) + sched_state.group_insn_counts[grp] - 1) else: new_active_group_counts = sched_state.active_group_counts @@ -522,7 +569,10 @@ def generate_loop_schedules_internal( allow_insn=True): yield sub_sched - return + if not sched_state.group_insn_counts: + # No groups: We won't need to backtrack on scheduling + # instructions. + return # }}} @@ -603,6 +653,9 @@ def generate_loop_schedules_internal( print("active inames :", ",".join(sched_state.active_inames)) print("inames entered so far :", ",".join(sched_state.entered_inames)) print("reachable insns:", ",".join(reachable_insn_ids)) + print("active groups (with insn counts):", ",".join( + "%s: %d" % (grp, c) + for grp, c in six.iteritems(sched_state.active_group_counts))) print(75*"-") if needed_inames: @@ -619,6 +672,22 @@ def generate_loop_schedules_internal( print("scheduling %s prohibited by loop nest map" % iname) continue + if ( + not sched_state.loop_insn_dep_map.get(iname, set()) + <= sched_state.scheduled_insn_ids): + if debug_mode: + print( + "scheduling {iname} prohibited by loop dependency map " + "(needs '{needed_insns})'" + .format( + iname=iname, + needed_insns=", ".join( + sched_state.loop_insn_dep_map.get(iname, set()) + - + sched_state.scheduled_insn_ids))) + + continue + iname_home_domain = kernel.domains[kernel.get_home_domain_index(iname)] from islpy import dim_type iname_home_domain_params = set( @@ -795,6 +864,11 @@ class DependencyRecord(Record): A :class:`loopy.InstructionBase` instance. + .. attribute:: dep_descr + + A string containing a phrase describing the dependency. The variables + '{src}' and '{tgt}' will be replaced by their respective instruction IDs. + .. attribute:: variable A string, the name of the variable that caused the dependency to arise. @@ -802,23 +876,15 @@ class DependencyRecord(Record): .. attribute:: var_kind "global" or "local" - - .. attribute:: is_forward - - A :class:`bool` indicating whether this is a forward or reverse - dependency. - - In a 'forward' dependency, the target depends on the source. - In a 'reverse' dependency, the source depends on the target. """ - def __init__(self, source, target, variable, var_kind, is_forward): + def __init__(self, source, target, dep_descr, variable, var_kind): Record.__init__(self, source=source, target=target, + dep_descr=dep_descr, variable=variable, - var_kind=var_kind, - is_forward=is_forward) + var_kind=var_kind) def get_barrier_needing_dependency(kernel, target, source, reverse, var_kind): @@ -827,7 +893,7 @@ def get_barrier_needing_dependency(kernel, target, source, reverse, var_kind): at least one write), then the function will return a tuple ``(target, source, var_name)``. Otherwise, it will return *None*. - This function finds direct or indirect instruction dependencies, but does + This function finds direct or indirect instruction dependencies, but does not attempt to guess dependencies that exist based on common access to variables. @@ -847,11 +913,30 @@ def get_barrier_needing_dependency(kernel, target, source, reverse, var_kind): if reverse: source, target = target, source - # Check that a dependency exists. + # {{{ check that a dependency exists + + dep_descr = None + target_deps = kernel.recursive_insn_dep_map()[target.id] - if source.id not in target_deps: + if source.id in target_deps: + if reverse: + dep_descr = "{src} rev-depends on {tgt}" + else: + dep_descr = "{tgt} depends on {src}" + + grps = source.groups & target.conflicts_with_groups + if grps: + dep_descr = "{src} conflicts with {tgt} (via '%s')" % ", ".join(grps) + + grps = target.groups & source.conflicts_with_groups + if grps: + dep_descr = "{src} conflicts with {tgt} (via '%s')" % ", ".join(grps) + + if not dep_descr: return None + # }}} + if var_kind == "local": relevant_vars = kernel.local_var_names() elif var_kind == "global": @@ -859,11 +944,27 @@ def get_barrier_needing_dependency(kernel, target, source, reverse, var_kind): else: raise ValueError("unknown 'var_kind': %s" % var_kind) - tgt_write = set(target.assignee_var_names()) & relevant_vars - tgt_read = target.read_dependency_names() & relevant_vars + temp_to_base_storage = kernel.get_temporary_to_base_storage_map() + + def map_to_base_storage(var_names): + result = set(var_names) - src_write = set(source.assignee_var_names()) & relevant_vars - src_read = source.read_dependency_names() & relevant_vars + for name in var_names: + bs = temp_to_base_storage.get(name) + if bs is not None: + result.add(bs) + + return result + + tgt_write = map_to_base_storage( + set(target.assignee_var_names()) & relevant_vars) + tgt_read = map_to_base_storage( + target.read_dependency_names() & relevant_vars) + + src_write = map_to_base_storage( + set(source.assignee_var_names()) & relevant_vars) + src_read = map_to_base_storage( + source.read_dependency_names() & relevant_vars) waw = tgt_write & src_write raw = tgt_read & src_write @@ -873,9 +974,9 @@ def get_barrier_needing_dependency(kernel, target, source, reverse, var_kind): return DependencyRecord( source=source, target=target, + dep_descr=dep_descr, variable=var_name, - var_kind=var_kind, - is_forward=not reverse) + var_kind=var_kind) if source is target: return None @@ -884,9 +985,9 @@ def get_barrier_needing_dependency(kernel, target, source, reverse, var_kind): return DependencyRecord( source=source, target=target, + dep_descr=dep_descr, variable=var_name, - var_kind=var_kind, - is_forward=not reverse) + var_kind=var_kind) return None @@ -998,12 +1099,9 @@ def insert_barriers(kernel, schedule, reverse, kind, level=0): comment = None if dep is not None: - if dep.is_forward: - comment = "for %s (%s depends on %s)" % ( - dep.variable, dep.target.id, dep.source.id) - else: - comment = "for %s (%s rev-depends on %s)" % ( - dep.variable, dep.source.id, dep.target.id) + comment = "for %s (%s)" % ( + dep.variable, dep.dep_descr.format( + tgt=dep.target.id, src=dep.source.id)) result.append(Barrier(comment=comment, kind=dep.var_kind)) @@ -1047,10 +1145,6 @@ def insert_barriers(kernel, schedule, reverse, kind, level=0): # (for leading (before-first-barrier) bit of loop body) for insn_id in insn_ids_from_schedule(subresult[:first_barrier_index]): search_set = candidates - if not reverse: - # can limit search set in case of forward dep - search_set = search_set \ - & kernel.recursive_insn_dep_map()[insn_id] for dep_src_insn_id in search_set: dep = get_barrier_needing_dependency( @@ -1090,10 +1184,6 @@ def insert_barriers(kernel, schedule, reverse, kind, level=0): i += 1 search_set = candidates - if not reverse: - # can limit search set in case of forward dep - search_set = search_set \ - & kernel.recursive_insn_dep_map()[sched_item.insn_id] for dep_src_insn_id in search_set: dep = get_barrier_needing_dependency( @@ -1153,9 +1243,11 @@ def generate_loop_schedules(kernel, debug_args={}): iname for iname in kernel.all_inames() if isinstance(kernel.iname_to_tag.get(iname), ParallelTag)) + loop_nest_map = find_loop_nest_map(kernel) sched_state = SchedulerState( kernel=kernel, - loop_nest_map=loop_nest_map(kernel), + loop_nest_map=loop_nest_map, + loop_insn_dep_map=find_loop_insn_dep_map(kernel, loop_nest_map), breakable_inames=ilp_inames, ilp_inames=ilp_inames, vec_inames=vec_inames, @@ -1180,38 +1272,9 @@ def generate_loop_schedules(kernel, debug_args={}): debug=debug, allow_boost=None), generate_loop_schedules_internal(sched_state, debug=debug)] - for gen in generators: - for gen_sched in gen: - # gen_sched = insert_barriers(kernel, gen_sched, - # reverse=False, kind="global") - - # for sched_item in gen_sched: - # if isinstance(sched_item, Barrier) and sched_item.kind == "global": - # raise LoopyError("kernel requires a global barrier %s" - # % sched_item.comment) - - gen_sched = insert_barriers(kernel, gen_sched, - reverse=False, kind="local") - - debug.stop() - yield kernel.copy( - schedule=gen_sched, - state=kernel_state.SCHEDULED) - debug.start() - - schedule_count += 1 - # if no-boost mode yielded a viable schedule, stop now - if schedule_count: - break - - debug.done_scheduling() - - if not schedule_count: + def print_longest_dead_end(): if debug.interactive: - print(75*"-") - print("ERROR: Sorry--loo.py did not find a schedule for your kernel.") - print(75*"-") print("Loo.py will now show you the scheduler state at the point") print("where the longest (dead-end) schedule was generated, in the") print("the hope that some of this makes sense and helps you find") @@ -1230,6 +1293,52 @@ def generate_loop_schedules(kernel, debug_args={}): debug=debug): pass + try: + for gen in generators: + for gen_sched in gen: + # gen_sched = insert_barriers(kernel, gen_sched, + # reverse=False, kind="global") + + # for sched_item in gen_sched: + # if ( + # isinstance(sched_item, Barrier) + # and sched_item.kind == "global"): + # raise LoopyError("kernel requires a global barrier %s" + # % sched_item.comment) + + debug.stop() + + logger.info("%s: barrier insertion: start" % kernel.name) + + gen_sched = insert_barriers(kernel, gen_sched, + reverse=False, kind="local") + + logger.info("%s: barrier insertion: done" % kernel.name) + + yield kernel.copy( + schedule=gen_sched, + state=kernel_state.SCHEDULED) + debug.start() + + schedule_count += 1 + + # if no-boost mode yielded a viable schedule, stop now + if schedule_count: + break + + except KeyboardInterrupt: + print(75*"-") + print("Interrupted during scheduling") + print(75*"-") + print_longest_dead_end() + raise + + debug.done_scheduling() + if not schedule_count: + print(75*"-") + print("ERROR: Sorry--loo.py did not find a schedule for your kernel.") + print(75*"-") + print_longest_dead_end() raise RuntimeError("no valid schedules found") logger.info("%s: schedule done" % kernel.name) diff --git a/loopy/statistics.py b/loopy/statistics.py index 7281cd2b2cc06f3b9d578769609037aafc880e0f..d25ea3eaca44187e8ad7020b6ddfd9e5bedc95bc 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -55,13 +55,14 @@ class ToCountMap: "to {} {}. ToCountMap may only be added to " "0 and other ToCountMap objects." .format(type(other), other)) - return + return self def __mul__(self, other): if isinstance(other, isl.PwQPolynomial): - return ToCountMap({index: self.dict[index]*other - for index in self.dict.keys()}) + return ToCountMap(dict( + (index, self.dict[index]*other) + for index in self.dict.keys())) else: raise ValueError("ToCountMap: Attempted to multiply " "ToCountMap by {} {}." @@ -206,7 +207,7 @@ class ExpressionOpCounter(CombineMapper): "map_slice not implemented.") -class ExpressionSubscriptCounter(CombineMapper): +class GlobalSubscriptCounter(CombineMapper): def __init__(self, knl): self.knl = knl @@ -344,12 +345,12 @@ class ExpressionSubscriptCounter(CombineMapper): map_logical_and = map_logical_or def map_if(self, expr): - warnings.warn("ExpressionSubscriptCounter counting DRAM accesses as " + warnings.warn("GlobalSubscriptCounter counting DRAM accesses as " "sum of if-statement branches.") return self.rec(expr.condition) + self.rec(expr.then) + self.rec(expr.else_) def map_if_positive(self, expr): - warnings.warn("ExpressionSubscriptCounter counting DRAM accesses as " + warnings.warn("GlobalSubscriptCounter counting DRAM accesses as " "sum of if_pos-statement branches.") return self.rec(expr.criterion) + self.rec(expr.then) + self.rec(expr.else_) @@ -357,22 +358,22 @@ class ExpressionSubscriptCounter(CombineMapper): map_max = map_min def map_common_subexpression(self, expr): - raise NotImplementedError("ExpressionSubscriptCounter encountered " + raise NotImplementedError("GlobalSubscriptCounter encountered " "common_subexpression, " "map_common_subexpression not implemented.") def map_substitution(self, expr): - raise NotImplementedError("ExpressionSubscriptCounter encountered " + raise NotImplementedError("GlobalSubscriptCounter encountered " "substitution, " "map_substitution not implemented.") def map_derivative(self, expr): - raise NotImplementedError("ExpressionSubscriptCounter encountered " + raise NotImplementedError("GlobalSubscriptCounter encountered " "derivative, " "map_derivative not implemented.") def map_slice(self, expr): - raise NotImplementedError("ExpressionSubscriptCounter encountered slice, " + raise NotImplementedError("GlobalSubscriptCounter encountered slice, " "map_slice not implemented.") @@ -449,9 +450,8 @@ def get_op_poly(knl): return op_poly -def get_DRAM_access_poly(knl): # for now just counting subscripts - - """Count the number of DRAM accesses in a loopy kernel. +def get_gmem_access_poly(knl): # for now just counting subscripts + """Count the number of global memory accesses in a loopy kernel. :parameter knl: A :class:`loopy.LoopKernel` whose DRAM accesses are to be counted. @@ -476,7 +476,7 @@ def get_DRAM_access_poly(knl): # for now just counting subscripts # (first create loopy kernel and specify array data types) - subscript_map = get_DRAM_access_poly(knl) + subscript_map = get_gmem_access_poly(knl) params = {'n': 512, 'm': 256, 'l': 128} f32_uncoalesced_load = subscript_map.dict[ @@ -498,7 +498,7 @@ def get_DRAM_access_poly(knl): # for now just counting subscripts knl = preprocess_kernel(knl) subs_poly = 0 - subscript_counter = ExpressionSubscriptCounter(knl) + subscript_counter = GlobalSubscriptCounter(knl) for insn in knl.instructions: insn_inames = knl.insn_inames(insn) inames_domain = knl.get_inames_domain(insn_inames) @@ -517,6 +517,13 @@ def get_DRAM_access_poly(knl): # for now just counting subscripts return subs_poly +def get_DRAM_access_poly(knl): + from warnings import warn + warn("get_DRAM_access_poly is deprecated. Use get_gmem_access_poly instead", + DeprecationWarning, stacklevel=2) + return get_gmem_access_poly(knl) + + def get_barrier_poly(knl): """Count the number of barriers each thread encounters in a loopy kernel. diff --git a/loopy/subst.py b/loopy/subst.py index a0a031718962df3053b80058818b2f2a4b88d2c8..a29e950a1f32d660eb10147c8638612078e816aa 100644 --- a/loopy/subst.py +++ b/loopy/subst.py @@ -198,16 +198,16 @@ def extract_subst(kernel, subst_name, template, parameters=()): substitutions=new_substs) -# {{{ temporary_to_subst +# {{{ assignment_to_subst -class TemporaryToSubstChanger(RuleAwareIdentityMapper): - def __init__(self, rule_mapping_context, temp_name, definition_insn_ids, +class AssignmentToSubstChanger(RuleAwareIdentityMapper): + def __init__(self, rule_mapping_context, lhs_name, definition_insn_ids, usage_to_definition, extra_arguments, within): self.var_name_gen = rule_mapping_context.make_unique_var_name - super(TemporaryToSubstChanger, self).__init__(rule_mapping_context) + super(AssignmentToSubstChanger, self).__init__(rule_mapping_context) - self.temp_name = temp_name + self.lhs_name = lhs_name self.definition_insn_ids = definition_insn_ids self.usage_to_definition = usage_to_definition @@ -226,28 +226,28 @@ class TemporaryToSubstChanger(RuleAwareIdentityMapper): try: return self.definition_insn_id_to_subst_name[def_insn_id] except KeyError: - subst_name = self.var_name_gen(self.temp_name+"_subst") + subst_name = self.var_name_gen(self.lhs_name+"_subst") self.definition_insn_id_to_subst_name[def_insn_id] = subst_name return subst_name def map_variable(self, expr, expn_state): - if (expr.name == self.temp_name + if (expr.name == self.lhs_name and expr.name not in expn_state.arg_context): result = self.transform_access(None, expn_state) if result is not None: return result - return super(TemporaryToSubstChanger, self).map_variable( + return super(AssignmentToSubstChanger, self).map_variable( expr, expn_state) def map_subscript(self, expr, expn_state): - if (expr.aggregate.name == self.temp_name + if (expr.aggregate.name == self.lhs_name and expr.aggregate.name not in expn_state.arg_context): result = self.transform_access(expr.index, expn_state) if result is not None: return result - return super(TemporaryToSubstChanger, self).map_subscript( + return super(AssignmentToSubstChanger, self).map_subscript( expr, expn_state) def transform_access(self, index, expn_state): @@ -280,26 +280,29 @@ class TemporaryToSubstChanger(RuleAwareIdentityMapper): return var(subst_name)(*index) -def temporary_to_subst(kernel, temp_name, extra_arguments=(), within=None): - """Extract an assignment to a temporary variable +def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None, + force_retain_argument=False): + """Extract an assignment (to a temporary variable or an argument) as a :ref:`substituiton-rule`. The temporary may be an array, in which case the array indices will become arguments to the substitution rule. :arg within: a stack match as understood by :func:`loopy.context_matching.parse_stack_match`. + :arg force_retain_argument: If True and if *lhs_name* is an argument, it is + kept even if it is no longer referenced. This operation will change all usage sites - of *temp_name* matched by *within*. If there - are further usage sites of *temp_name*, then - the original assignment to *temp_name* as well + of *lhs_name* matched by *within*. If there + are further usage sites of *lhs_name*, then + the original assignment to *lhs_name* as well as the temporary variable is left in place. """ if isinstance(extra_arguments, str): extra_arguments = tuple(s.strip() for s in extra_arguments.split(",")) - # {{{ establish the relevant definition of temp_name for each usage site + # {{{ establish the relevant definition of lhs_name for each usage site dep_kernel = expand_subst(kernel) from loopy.preprocess import add_default_dependencies @@ -313,11 +316,11 @@ def temporary_to_subst(kernel, temp_name, extra_arguments=(), within=None): def_id = set() for dep_id in insn.insn_deps: dep_insn = id_to_insn[dep_id] - if temp_name in dep_insn.write_dependency_names(): - if temp_name in dep_insn.read_dependency_names(): + if lhs_name in dep_insn.write_dependency_names(): + if lhs_name in dep_insn.read_dependency_names(): raise LoopyError("instruction '%s' both reads *and* " "writes '%s'--cannot transcribe to substitution " - "rule" % (dep_id, temp_name)) + "rule" % (dep_id, lhs_name)) def_id.add(dep_id) else: @@ -329,7 +332,7 @@ def temporary_to_subst(kernel, temp_name, extra_arguments=(), within=None): raise LoopyError("more than one write to '%s' found in " "depdendencies of '%s'--definition cannot be resolved " "(writer instructions ids: %s)" - % (temp_name, usage_insn_id, ", ".join(def_id))) + % (lhs_name, usage_insn_id, ", ".join(def_id))) if not def_id: return None @@ -341,20 +344,20 @@ def temporary_to_subst(kernel, temp_name, extra_arguments=(), within=None): usage_to_definition = {} for insn in kernel.instructions: - if temp_name not in insn.read_dependency_names(): + if lhs_name not in insn.read_dependency_names(): continue def_id = get_relevant_definition_insn_id(insn.id) if def_id is None: raise LoopyError("no write to '%s' found in dependency tree " "of '%s'--definition cannot be resolved" - % (temp_name, insn.id)) + % (lhs_name, insn.id)) usage_to_definition[insn.id] = def_id definition_insn_ids = set() for insn in kernel.instructions: - if temp_name in insn.write_dependency_names(): + if lhs_name in insn.write_dependency_names(): definition_insn_ids.add(insn.id) # }}} @@ -364,8 +367,8 @@ def temporary_to_subst(kernel, temp_name, extra_arguments=(), within=None): rule_mapping_context = SubstitutionRuleMappingContext( kernel.substitutions, kernel.get_var_name_generator()) - tts = TemporaryToSubstChanger(rule_mapping_context, - temp_name, definition_insn_ids, + tts = AssignmentToSubstChanger(rule_mapping_context, + lhs_name, definition_insn_ids, usage_to_definition, extra_arguments, within) kernel = rule_mapping_context.finish_kernel(tts.map_kernel(kernel)) @@ -401,13 +404,28 @@ def temporary_to_subst(kernel, temp_name, extra_arguments=(), within=None): # {{{ delete temporary variable if possible + # (copied below if modified) new_temp_vars = kernel.temporary_variables - if not any(six.itervalues(tts.saw_unmatched_usage_sites)): - # All usage sites matched--they're now substitution rules. - # We can get rid of the variable. + new_args = kernel.args - new_temp_vars = new_temp_vars.copy() - del new_temp_vars[temp_name] + if lhs_name in kernel.temporary_variables: + if not any(six.itervalues(tts.saw_unmatched_usage_sites)): + # All usage sites matched--they're now substitution rules. + # We can get rid of the variable. + + new_temp_vars = new_temp_vars.copy() + del new_temp_vars[lhs_name] + + if lhs_name in kernel.arg_dict and not force_retain_argument: + if not any(six.itervalues(tts.saw_unmatched_usage_sites)): + # All usage sites matched--they're now substitution rules. + # We can get rid of the argument + + new_args = new_args[:] + for i in range(len(new_args)): + if new_args[i].name == lhs_name: + del new_args[i] + break # }}} @@ -423,6 +441,7 @@ def temporary_to_subst(kernel, temp_name, extra_arguments=(), within=None): return kernel.copy( substitutions=new_substs, temporary_variables=new_temp_vars, + args=new_args, ) # }}} diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 28943644ec9fcf540257f519c6e338bc5e7d4806..ca71c21269add662dc1ef19a4437c9f297ec6477 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -36,9 +36,9 @@ class CTarget(TargetBase): @memoize_method def get_dtype_registry(self): from loopy.target.c.compyte.dtypes import ( - DTypeRegistry, fill_with_registry_with_c_types) + DTypeRegistry, fill_registry_with_c_types) result = DTypeRegistry() - fill_with_registry_with_c_types(result, respect_windows=False, + fill_registry_with_c_types(result, respect_windows=False, include_bool=True) return result @@ -85,14 +85,100 @@ class CTarget(TargetBase): from cgen import Block body = Block() + temp_decls = [] + # {{{ declare temporaries - body.extend( - idi.cgen_declarator - for tv in six.itervalues(kernel.temporary_variables) - for idi in tv.decl_info( - kernel.target, - is_written=True, index_dtype=kernel.index_dtype)) + base_storage_sizes = {} + base_storage_to_is_local = {} + base_storage_to_align_bytes = {} + + from cgen import ArrayOf, Pointer, Initializer, AlignedAttribute + from loopy.codegen import POD # uses the correct complex type + from cgen.opencl import CLLocal + + class ConstRestrictPointer(Pointer): + def get_decl_pair(self): + sub_tp, sub_decl = self.subdecl.get_decl_pair() + return sub_tp, ("*const restrict %s" % sub_decl) + + for tv in six.itervalues(kernel.temporary_variables): + decl_info = tv.decl_info(self, index_dtype=kernel.index_dtype) + + if not tv.base_storage: + for idi in decl_info: + temp_var_decl = POD(self, idi.dtype, idi.name) + + if idi.shape: + temp_var_decl = ArrayOf(temp_var_decl, + " * ".join(str(s) for s in idi.shape)) + + if tv.is_local: + temp_var_decl = CLLocal(temp_var_decl) + + temp_decls.append(temp_var_decl) + + else: + offset = 0 + base_storage_sizes.setdefault(tv.base_storage, []).append( + tv.nbytes) + base_storage_to_is_local.setdefault(tv.base_storage, []).append( + tv.is_local) + + align_size = tv.dtype.itemsize + + from loopy.kernel.array import VectorArrayDimTag + for dim_tag, axis_len in zip(tv.dim_tags, tv.shape): + if isinstance(dim_tag, VectorArrayDimTag): + align_size *= axis_len + + base_storage_to_align_bytes.setdefault(tv.base_storage, []).append( + align_size) + + for idi in decl_info: + cast_decl = POD(self, idi.dtype, "") + temp_var_decl = POD(self, idi.dtype, idi.name) + + if tv.is_local: + cast_decl = CLLocal(cast_decl) + temp_var_decl = CLLocal(temp_var_decl) + + # The 'restrict' part of this is a complete lie--of course + # all these temporaries are aliased. But we're promising to + # not use them to shovel data from one representation to the + # other. That counts, right? + + cast_decl = ConstRestrictPointer(cast_decl) + temp_var_decl = ConstRestrictPointer(temp_var_decl) + + cast_tp, cast_d = cast_decl.get_decl_pair() + temp_var_decl = Initializer( + temp_var_decl, + "(%s %s) (%s + %s)" % ( + " ".join(cast_tp), cast_d, + tv.base_storage, + offset)) + + temp_decls.append(temp_var_decl) + + from pytools import product + offset += ( + idi.dtype.itemsize + * product(si for si in idi.shape)) + + for bs_name, bs_sizes in six.iteritems(base_storage_sizes): + bs_var_decl = POD(self, np.int8, bs_name) + if base_storage_to_is_local[bs_name]: + bs_var_decl = CLLocal(bs_var_decl) + + bs_var_decl = ArrayOf(bs_var_decl, max(bs_sizes)) + + alignment = max(base_storage_to_align_bytes[bs_name]) + bs_var_decl = AlignedAttribute(alignment, bs_var_decl) + + body.append(bs_var_decl) + + body.extend(temp_decls) # }}} diff --git a/loopy/target/c/compyte b/loopy/target/c/compyte index fb6ba114d9d906403d47b0aaf69e2fe4cef382f2..ac1c71d46428c14aa1bd1c09d7da19cd0298d5cc 160000 --- a/loopy/target/c/compyte +++ b/loopy/target/c/compyte @@ -1 +1 @@ -Subproject commit fb6ba114d9d906403d47b0aaf69e2fe4cef382f2 +Subproject commit ac1c71d46428c14aa1bd1c09d7da19cd0298d5cc diff --git a/loopy/target/opencl/__init__.py b/loopy/target/opencl/__init__.py index e4533b86dd24a8dca973ac9c8ffd022a4bed204b..eebe6f5da0b81fa9b4c1ac7b4cda0ba8b1ac283e 100644 --- a/loopy/target/opencl/__init__.py +++ b/loopy/target/opencl/__init__.py @@ -214,17 +214,13 @@ class OpenCLTarget(CTarget): @memoize_method def get_dtype_registry(self): - from loopy.target.c.compyte.dtypes import DTypeRegistry, fill_with_registry_with_c_types - result = DTypeRegistry() - fill_with_registry_with_c_types(result, respect_windows=False) + from loopy.target.c.compyte.dtypes import (DTypeRegistry, + fill_registry_with_opencl_c_types) - # complex number support left out + result = DTypeRegistry() + fill_registry_with_opencl_c_types(result) - # CL defines 'long' as 64-bit - result.get_or_register_dtype( - ["unsigned long", "unsigned long int"], np.uint64) - result.get_or_register_dtype( - ["signed long", "signed long int", "long int"], np.int64) + # no complex number support--needs PyOpenCLTarget _register_vector_types(result) diff --git a/loopy/target/pyopencl/__init__.py b/loopy/target/pyopencl/__init__.py index ee936680016b6808723076034c8486a49544e2bc..d13384534c70df602785d4189739a7bc86ed37db 100644 --- a/loopy/target/pyopencl/__init__.py +++ b/loopy/target/pyopencl/__init__.py @@ -233,6 +233,18 @@ def pyopencl_preamble_generator(target, seen_dtypes, seen_functions): # {{{ pyopencl tools +class _LegacyTypeRegistryStub(object): + """Adapts legacy PyOpenCL type registry to be usable with PyOpenCLTarget.""" + + def get_or_register_dtype(self, names, dtype=None): + from pyopencl.compyte.dtypes import get_or_register_dtype + return get_or_register_dtype(names, dtype) + + def dtype_to_ctype(self, dtype): + from pyopencl.compyte.dtypes import dtype_to_ctype + return dtype_to_ctype(dtype) + + class PyOpenCLTarget(OpenCLTarget): def __init__(self, device=None): super(PyOpenCLTarget, self).__init__() @@ -260,8 +272,12 @@ class PyOpenCLTarget(OpenCLTarget): check_sizes(kernel, self.device) def get_dtype_registry(self): - from pyopencl.compyte.dtypes import TYPE_REGISTRY - return TYPE_REGISTRY + try: + from pyopencl.compyte.dtypes import TYPE_REGISTRY + except ImportError: + return _LegacyTypeRegistryStub() + else: + return TYPE_REGISTRY def is_vector_dtype(self, dtype): from pyopencl.array import vec diff --git a/loopy/tools.py b/loopy/tools.py index e734417d6095da768085fa8861c870114f071ec8..55b177bda4e6be03a985286fd4faf6322e257824 100644 --- a/loopy/tools.py +++ b/loopy/tools.py @@ -30,6 +30,7 @@ from loopy.symbolic import WalkMapper as LoopyWalkMapper from pymbolic.mapper.persistent_hash import ( PersistentHashWalkMapper as PersistentHashWalkMapperBase) import six # noqa +from six.moves import intern if six.PY2: @@ -95,6 +96,21 @@ class LoopyKeyBuilder(KeyBuilderBase): else: PersistentHashWalkMapper(key_hash)(key) + +class PymbolicExpressionHashWrapper(object): + def __init__(self, expression): + self.expression = expression + + def __eq__(self, other): + return (type(self) == type(other) + and self.expression == other.expression) + + def __ne__(self, other): + return not self.__eq__(other) + + def update_persistent_hash(self, key_hash, key_builder): + key_builder.update_for_pymbolic_expression(key_hash, self.expression) + # }}} @@ -216,4 +232,13 @@ def remove_common_indentation(code, require_leading_newline=True, # }}} + +def is_interned(s): + return s is None or intern(s) is s + + +def intern_frozenset_of_ids(fs): + return frozenset(intern(s) for s in fs) + + # vim: foldmethod=marker diff --git a/loopy/version.py b/loopy/version.py index 9f1378f162788c307c34b862e8fa0824929b9c6f..9598697b09afc091741cea5d8da37917dd88ce9d 100644 --- a/loopy/version.py +++ b/loopy/version.py @@ -32,4 +32,4 @@ except ImportError: else: _islpy_version = islpy.version.VERSION_TEXT -DATA_MODEL_VERSION = "v10-islpy%s" % _islpy_version +DATA_MODEL_VERSION = "v11-islpy%s" % _islpy_version diff --git a/requirements-old-pyopencl.txt b/requirements-old-pyopencl.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d21187d16ec962816dd691d8d11813e9b0ff700 --- /dev/null +++ b/requirements-old-pyopencl.txt @@ -0,0 +1,10 @@ +git+git://github.com/inducer/pytools +git+git://github.com/inducer/islpy +cgen +pyopencl==2015.1 +git+git://github.com/inducer/pymbolic + +hg+https://bitbucket.org/inducer/f2py + +# Optional, needed for using the C preprocessor on Fortran +ply>=3.6 diff --git a/setup.py b/setup.py index b3cefdac0ad16fa7b4ffdf9969a47ac509f7a257..1f1ea68769f71663dc719d274dc38b01b1f602ed 100644 --- a/setup.py +++ b/setup.py @@ -37,8 +37,8 @@ setup(name="loo.py", ], install_requires=[ - "pytools>=2014.2", - "pymbolic>=2014.1.1", + "pytools>=2015.1.3", + "pymbolic>=2015.2.1", "cgen>=2013.1.2", "islpy>=2014.2", "six", diff --git a/test/test_fortran.py b/test/test_fortran.py index c31c370076b681cb0593f38b6a4d92479541b872..a5b1b830bc8834637d5f4c609fff8232ef7449e6 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -123,7 +123,7 @@ def test_asterisk_in_shape(ctx_factory): knl(queue, inp=np.array([1, 2, 3.]), n=3) -def test_temporary_to_subst(ctx_factory): +def test_assignment_to_subst(ctx_factory): fortran_src = """ subroutine fill(out, out2, inp, n) implicit none @@ -143,13 +143,13 @@ def test_temporary_to_subst(ctx_factory): ref_knl = knl - knl = lp.temporary_to_subst(knl, "a", "i") + knl = lp.assignment_to_subst(knl, "a", "i") ctx = ctx_factory() lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5)) -def test_temporary_to_subst_two_defs(ctx_factory): +def test_assignment_to_subst_two_defs(ctx_factory): fortran_src = """ subroutine fill(out, out2, inp, n) implicit none @@ -170,13 +170,13 @@ def test_temporary_to_subst_two_defs(ctx_factory): ref_knl = knl - knl = lp.temporary_to_subst(knl, "a") + knl = lp.assignment_to_subst(knl, "a") ctx = ctx_factory() lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5)) -def test_temporary_to_subst_indices(ctx_factory): +def test_assignment_to_subst_indices(ctx_factory): fortran_src = """ subroutine fill(out, out2, inp, n) implicit none @@ -201,7 +201,7 @@ def test_temporary_to_subst_indices(ctx_factory): ref_knl = knl assert "a" in knl.temporary_variables - knl = lp.temporary_to_subst(knl, "a") + knl = lp.assignment_to_subst(knl, "a") assert "a" not in knl.temporary_variables ctx = ctx_factory() @@ -235,7 +235,7 @@ def test_if(ctx_factory): ref_knl = knl - knl = lp.temporary_to_subst(knl, "a") + knl = lp.assignment_to_subst(knl, "a") ctx = ctx_factory() lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5)) @@ -275,6 +275,8 @@ def test_tagged(ctx_factory): "i_inner,j_inner", ]) def test_matmul(ctx_factory, buffer_inames): + logging.basicConfig(level=logging.INFO) + fortran_src = """ subroutine dgemm(m,n,l,a,b,c) implicit none @@ -409,8 +411,8 @@ def test_fuse_kernels(ctx_factory): assert len(knl.temporary_variables) == 2 # This is needed for correctness, otherwise ordering could foul things up. - knl = lp.temporary_to_subst(knl, "prev") - knl = lp.temporary_to_subst(knl, "prev_0") + knl = lp.assignment_to_subst(knl, "prev") + knl = lp.assignment_to_subst(knl, "prev_0") ctx = ctx_factory() lp.auto_test_vs_ref(xyderiv, ctx, knl, parameters=dict(nelements=20, ndofs=4)) diff --git a/test/test_linalg.py b/test/test_linalg.py index c019eb67fbaba5e6d8983665b67002837225d9ad..c61d963903ff738b62924e31428a928a18afad60 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -554,40 +554,6 @@ def test_image_matrix_mul_ilp(ctx_factory): parameters={}) -@pytest.mark.skipif("sys.version_info < (2,6)") -def test_ilp_race_matmul(ctx_factory): - dtype = np.float32 - order = "C" - - n = 9 - - knl = lp.make_kernel( - "{[i,j,k]: 0<=i,j,k<%d}" % n, - [ - "c[i, j] = sum(k, a[i, k]*b[k, j])" - ], - [ - lp.ImageArg("a", dtype, shape=(n, n)), - lp.ImageArg("b", dtype, shape=(n, n)), - lp.GlobalArg("c", dtype, shape=(n, n), order=order), - ], - name="matmul") - - knl = lp.split_iname(knl, "j", 2, outer_tag="ilp", inner_tag="l.0") - knl = lp.split_iname(knl, "k", 2) - knl = lp.add_prefetch(knl, 'b', ["k_inner"]) - - with lp.CacheMode(False): - from loopy.diagnostic import WriteRaceConditionWarning - from warnings import catch_warnings - with catch_warnings(record=True) as warn_list: - knl = lp.preprocess_kernel(knl) - list(lp.generate_loop_schedules(knl)) - - assert any(isinstance(w.message, WriteRaceConditionWarning) - for w in warn_list) - - def test_fancy_matrix_mul(ctx_factory): dtype = np.float32 ctx = ctx_factory() diff --git a/test/test_loopy.py b/test/test_loopy.py index 17e0cc54359d2c5ae5a19042063b0c5a0603ca22..7cad3504859d199c0581c8d3248ebafe50a34c4a 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2094,6 +2094,86 @@ def test_vectorize(ctx_factory): ref_knl, ctx, knl, parameters=dict(n=30)) + +def test_alias_temporaries(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[i]: 0<=i<n}", + """ + times2(i) := 2*a[i] + times3(i) := 3*a[i] + times4(i) := 4*a[i] + + x[i] = times2(i) + y[i] = times3(i) + z[i] = times4(i) + """) + + knl = lp.add_and_infer_dtypes(knl, {"a": np.float32}) + + ref_knl = knl + + knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0") + + knl = lp.precompute(knl, "times2", "i_inner") + knl = lp.precompute(knl, "times3", "i_inner") + knl = lp.precompute(knl, "times4", "i_inner") + + knl = lp.alias_temporaries(knl, ["times2_0", "times3_0", "times4_0"]) + + lp.auto_test_vs_ref( + ref_knl, ctx, knl, + parameters=dict(n=30)) + + +def test_fusion(): + exp_kernel = lp.make_kernel( + ''' { [i]: 0<=i<n } ''', + ''' exp[i] = pow(E, z[i])''', + assumptions="n>0") + + sum_kernel = lp.make_kernel( + '{ [j]: 0<=j<n }', + 'out2 = sum(j, exp[j])', + assumptions='n>0') + + knl = lp.fuse_kernels([exp_kernel, sum_kernel]) + + print(knl) + + +def test_sci_notation_literal(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + set_kernel = lp.make_kernel( + ''' { [i]: 0<=i<12 } ''', + ''' out[i] = 1e-12''') + + set_kernel = lp.set_options(set_kernel, write_cl=True) + + evt, (out,) = set_kernel(queue) + + assert (np.abs(out.get() - 1e-12) < 1e-20).all() + + +def test_to_batched(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + ''' { [i,j]: 0<=i,j<n } ''', + ''' out[i] = sum(j, a[i,j]*x[j])''') + + bknl = lp.to_batched(knl, "nbatches", "out,x") + + a = np.random.randn(5, 5) + x = np.random.randn(7, 5) + + bknl(queue, a=a, x=x) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) diff --git a/test/test_statistics.py b/test/test_statistics.py index 87ed797e74fd709c29ad9d763e195ff46985ed96..a58ce6d582a8d03d622028156adff35c61009bc0 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -27,7 +27,7 @@ from pyopencl.tools import ( # noqa pytest_generate_tests_for_pyopencl as pytest_generate_tests) import loopy as lp -from loopy.statistics import get_op_poly, get_DRAM_access_poly, get_barrier_poly +from loopy.statistics import get_op_poly, get_gmem_access_poly, get_barrier_poly import numpy as np @@ -185,7 +185,7 @@ def test_op_counter_triangular_domain(): assert flops == 78 -def test_DRAM_access_counter_basic(): +def test_gmem_access_counter_basic(): knl = lp.make_kernel( "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", @@ -199,7 +199,7 @@ def test_DRAM_access_counter_basic(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - poly = get_DRAM_access_poly(knl) + poly = get_gmem_access_poly(knl) n = 512 m = 256 l = 128 @@ -222,7 +222,7 @@ def test_DRAM_access_counter_basic(): assert f64 == n*m -def test_DRAM_access_counter_reduction(): +def test_gmem_access_counter_reduction(): knl = lp.make_kernel( "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", @@ -232,7 +232,7 @@ def test_DRAM_access_counter_reduction(): name="matmul", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32)) - poly = get_DRAM_access_poly(knl) + poly = get_gmem_access_poly(knl) n = 512 m = 256 l = 128 @@ -247,7 +247,7 @@ def test_DRAM_access_counter_reduction(): assert f32 == n*l -def test_DRAM_access_counter_logic(): +def test_gmem_access_counter_logic(): knl = lp.make_kernel( "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", @@ -259,7 +259,7 @@ def test_DRAM_access_counter_logic(): name="logic", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64)) - poly = get_DRAM_access_poly(knl) + poly = get_gmem_access_poly(knl) n = 512 m = 256 l = 128 @@ -278,7 +278,7 @@ def test_DRAM_access_counter_logic(): assert f64 == n*m -def test_DRAM_access_counter_specialops(): +def test_gmem_access_counter_specialops(): knl = lp.make_kernel( "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", @@ -292,7 +292,7 @@ def test_DRAM_access_counter_specialops(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - poly = get_DRAM_access_poly(knl) + poly = get_gmem_access_poly(knl) n = 512 m = 256 l = 128 @@ -315,7 +315,7 @@ def test_DRAM_access_counter_specialops(): assert f64 == n*m -def test_DRAM_access_counter_bitwise(): +def test_gmem_access_counter_bitwise(): knl = lp.make_kernel( "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", @@ -332,7 +332,7 @@ def test_DRAM_access_counter_bitwise(): a=np.int32, b=np.int32, g=np.int32, h=np.int32)) - poly = get_DRAM_access_poly(knl) + poly = get_gmem_access_poly(knl) n = 512 m = 256 l = 128 @@ -347,7 +347,7 @@ def test_DRAM_access_counter_bitwise(): assert i32 == n*m+n*m*l -def test_DRAM_access_counter_mixed(): +def test_gmem_access_counter_mixed(): knl = lp.make_kernel( "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", @@ -363,7 +363,7 @@ def test_DRAM_access_counter_mixed(): knl = lp.split_iname(knl, "j", 16) knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"}) - poly = get_DRAM_access_poly(knl) # noqa + poly = get_gmem_access_poly(knl) # noqa n = 512 m = 256 l = 128 @@ -386,7 +386,7 @@ def test_DRAM_access_counter_mixed(): assert f32nonconsec == n*m*l -def test_DRAM_access_counter_nonconsec(): +def test_gmem_access_counter_nonconsec(): knl = lp.make_kernel( "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", @@ -402,7 +402,7 @@ def test_DRAM_access_counter_nonconsec(): knl = lp.split_iname(knl, "i", 16) knl = lp.tag_inames(knl, {"i_inner": "l.0", "i_outer": "g.0"}) - poly = get_DRAM_access_poly(knl) # noqa + poly = get_gmem_access_poly(knl) # noqa n = 512 m = 256 l = 128 @@ -425,7 +425,7 @@ def test_DRAM_access_counter_nonconsec(): assert f32nonconsec == n*m*l -def test_DRAM_access_counter_consec(): +def test_gmem_access_counter_consec(): knl = lp.make_kernel( "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", @@ -440,7 +440,7 @@ def test_DRAM_access_counter_consec(): a=np.float32, b=np.float32, g=np.float64, h=np.float64)) knl = lp.tag_inames(knl, {"k": "l.0", "i": "g.0", "j": "g.1"}) - poly = get_DRAM_access_poly(knl) + poly = get_gmem_access_poly(knl) n = 512 m = 256 l = 128 @@ -541,7 +541,7 @@ def test_all_counters_parallel_matmul(): assert f32ops == n*m*l*2 assert i32ops == n*m*l*4 + l*n*4 - subscript_map = get_DRAM_access_poly(knl) + subscript_map = get_gmem_access_poly(knl) f32uncoal = subscript_map.dict[ (np.dtype(np.float32), 'nonconsecutive', 'load') ].eval_with_dict({'n': n, 'm': m, 'l': l})