diff --git a/doc/conf.py b/doc/conf.py index 301604607ef9543b559aa7cfdaf875e9140f1299..74e1aec303972378a28dea0b4186720d6a61019e 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -46,7 +46,7 @@ master_doc = 'index' # General information about the project. project = u'loopy' -copyright = u'2011, Andreas Klöckner' +copyright = u'2016, Andreas Klöckner' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -54,7 +54,10 @@ copyright = u'2011, Andreas Klöckner' # # The short X.Y version. ver_dic = {} -exec(compile(open("../loopy/version.py").read(), "../loopy/version.py", 'exec'), ver_dic) +with open("../loopy/version.py") as vpy_file: + version_py = vpy_file.read() + +exec(compile(version_py, "../loopy/version.py", 'exec'), ver_dic) version = ".".join(str(x) for x in ver_dic["VERSION"]) # The full version, including alpha/beta/rc tags. release = ver_dic["VERSION_TEXT"] @@ -249,4 +252,4 @@ intersphinx_mapping = { 'http://docs.scipy.org/doc/numpy/': None, } -autoclass_content = "both" +autoclass_content = "class" diff --git a/doc/index.rst b/doc/index.rst index 19bbe87727e8d3ffa5aa0cb1c31e4e206bd97b3e..a0bad2898be4aab74dead90aae825e4e0a460c87 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -63,7 +63,10 @@ Please check :ref:`installation` to get started. :maxdepth: 2 tutorial - reference + ref_creation + ref_kernel + ref_transform + ref_other misc Indices and tables diff --git a/doc/misc.rst b/doc/misc.rst index 2f8fac090455eea1be87c3b4eae7bfd72eba24ee..89279d9744d2f1cf4f080618d0d2b2e0f078a723 100644 --- a/doc/misc.rst +++ b/doc/misc.rst @@ -5,7 +5,9 @@ Installation This command should install :mod:`loopy`:: - pip install loopy + pip install loo.py + +(Note the extra "."!) You may need to run this with :command:`sudo`. 
If you don't already have `pip `_, diff --git a/doc/ref_creation.rst b/doc/ref_creation.rst new file mode 100644 index 0000000000000000000000000000000000000000..92eff09c9e3ecacfd8bb9030a9e4b9f002fefc71 --- /dev/null +++ b/doc/ref_creation.rst @@ -0,0 +1,33 @@ +.. module:: loopy +.. moduleauthor:: Andreas Kloeckner + +.. _creating-kernels: + +Reference: Creating Kernels +=========================== + +From Loop Domains and Instructions +---------------------------------- + +.. autofunction:: make_kernel + +From Fortran +------------ + +.. autofunction:: parse_fortran + +.. autofunction:: parse_transformed_fortran + +.. autofunction:: c_preprocess + +From Other Kernels +------------------ + +.. autofunction:: fuse_kernels + +To Copy between Data Formats +---------------------------- + +.. autofunction:: make_copy_kernel + +.. vim: tw=75:spell:fdm=marker diff --git a/doc/reference.rst b/doc/ref_kernel.rst similarity index 71% rename from doc/reference.rst rename to doc/ref_kernel.rst index 351a2374a1245bf58296f9d2ee50551050c15f76..33d40385b529f72e54da65238304e87bdb2cddab 100644 --- a/doc/reference.rst +++ b/doc/ref_kernel.rst @@ -1,21 +1,39 @@ -.. _reference: +.. currentmodule:: loopy -Reference Guide -=============== +Reference: Loopy's Model of a Kernel +==================================== -.. module:: loopy -.. moduleauthor:: Andreas Kloeckner +.. _domain-tree: -This guide defines all functionality exposed by loopy. If you would like -a more gentle introduction, you may consider reading the example-based -:ref:`tutorial` instead. +Loop Domain Tree +---------------- -.. _inames: +.. {{{ + +Example:: + + { [i]: 0<=i>> knl = lp.set_options(knl, allow_terminal_colors=False) >>> print(knl) --------------------------------------------------------------------------- KERNEL: loopy_kernel @@ -245,8 +246,6 @@ call :func:`loopy.generate_code`: .. 
doctest:: >>> typed_knl = lp.add_dtypes(knl, dict(a=np.float32)) - >>> typed_knl = lp.preprocess_kernel(typed_knl, device=ctx.devices[0]) - >>> typed_knl = lp.get_one_scheduled_kernel(typed_knl) >>> code, _ = lp.generate_code(typed_knl) >>> print(code) #define lid(N) ((int) get_local_id(N)) @@ -1186,8 +1185,12 @@ across the remaining axis of the workgroup would emerge. TODO -Obtaining Kernel Statistics ---------------------------- +.. }}} + +Obtaining Performance Statistics +-------------------------------- + +.. {{{ Operations, array access, and barriers can all be counted, which may facilitate performance prediction and optimization of a :mod:`loopy` kernel. diff --git a/examples/python/ispc-harness.py b/examples/python/ispc-harness.py deleted file mode 100644 index 637fd41fcdcdbcc7a464f8209680931adfd014f5..0000000000000000000000000000000000000000 --- a/examples/python/ispc-harness.py +++ /dev/null @@ -1,183 +0,0 @@ -import loopy as lp -import numpy as np -import numpy.linalg as la -import ctypes -import os -from time import time - - -# {{{ temporary directory - -class TemporaryDirectory(object): - """Create and return a temporary directory. This has the same - behavior as mkdtemp but can be used as a context manager. For - example: - - with TemporaryDirectory() as tmpdir: - ... - - Upon exiting the context, the directory and everything contained - in it are removed. 
- """ - - # Yanked from - # https://hg.python.org/cpython/file/3.3/Lib/tempfile.py - - # Handle mkdtemp raising an exception - name = None - _closed = False - - def __init__(self, suffix="", prefix="tmp", dir=None): - from tempfile import mkdtemp - self.name = mkdtemp(suffix, prefix, dir) - - def __repr__(self): - return "<{} {!r}>".format(self.__class__.__name__, self.name) - - def __enter__(self): - return self.name - - def cleanup(self, _warn=False): - import warnings - if self.name and not self._closed: - from shutil import rmtree - try: - rmtree(self.name) - except (TypeError, AttributeError) as ex: - if "None" not in '%s' % (ex,): - raise - self._rmtree(self.name) - self._closed = True - if _warn and warnings.warn: - warnings.warn("Implicitly cleaning up {!r}".format(self)) - - def __exit__(self, exc, value, tb): - self.cleanup() - - def __del__(self): - # Issue a ResourceWarning if implicit cleanup needed - self.cleanup(_warn=True) - -# }}} - - -# {{{ build_ispc_shared_lib - -def build_ispc_shared_lib( - cwd, ispc_sources, cxx_sources, - ispc_options=[], cxx_options=[]): - from os.path import join - - ispc_source_names = [] - for name, contents in ispc_sources: - ispc_source_names.append(name) - - with open(join(cwd, name), "w") as srcf: - srcf.write(contents) - - cxx_source_names = [] - for name, contents in cxx_sources: - cxx_source_names.append(name) - - with open(join(cwd, name), "w") as srcf: - srcf.write(contents) - - from subprocess import check_call - - check_call( - ["ispc", - "--pic", - "--opt=force-aligned-memory", - "--target=avx2-i32x8", - "-o", "ispc.o"] - + ispc_options - + list(ispc_source_names), - cwd=cwd) - - check_call( - [ - "g++", - "-shared", "-fopenmp", "-Wl,--export-dynamic", - "-fPIC", - "-oshared.so", - "ispc.o", - ] - + cxx_options - + list(cxx_source_names), - cwd=cwd) - -# }}} - - -def cptr_from_numpy(obj): - ary_intf = getattr(obj, "__array_interface__", None) - if ary_intf is None: - raise RuntimeError("no array interface") - 
- buf_base, is_read_only = ary_intf["data"] - return ctypes.c_void_p(buf_base + ary_intf.get("offset", 0)) - - -def main(): - with open("tasksys.cpp", "r") as ts_file: - tasksys_source = ts_file.read() - - from loopy.target.ispc import ISPCTarget - stream_knl = lp.make_kernel( - "{[i]: 0<=i0") - stream_knl = lp.split_iname(stream_knl, "i", 8, inner_tag="l.0") - stream_knl = lp.split_iname(stream_knl, - "i_outer", 2**22, outer_tag="g.0") - stream_knl = lp.preprocess_kernel(stream_knl) - stream_knl = lp.get_one_scheduled_kernel(stream_knl) - stream_knl = lp.set_argument_order(stream_knl, "n,a,x,y,z") - ispc_code, arg_info = lp.generate_code(stream_knl) - - with TemporaryDirectory() as tmpdir: - build_ispc_shared_lib( - tmpdir, - [("stream.ispc", ispc_code)], - [("tasksys.cpp", tasksys_source)]) - - print(ispc_code) - knl_lib = ctypes.cdll.LoadLibrary(os.path.join(tmpdir, "shared.so")) - - n = 2**28 - a = 5 - x = np.empty(n, dtype=stream_dtype) - y = np.empty(n, dtype=stream_dtype) - z = np.empty(n, dtype=stream_dtype) - - nruns = 30 - start_time = time() - for irun in range(nruns): - knl_lib.loopy_kernel( - ctypes.c_int(n), stream_ctype(a), - cptr_from_numpy(x), - cptr_from_numpy(y), - cptr_from_numpy(z)) - elapsed = time() - start_time - - print(1e-9*3*x.nbytes*nruns/elapsed, "GB/s") - - print(la.norm(z-a*x+y)) - - - -if __name__ == "__main__": - main() - -# vim: foldmethod=marker diff --git a/examples/python/ispc-stream-harness.py b/examples/python/ispc-stream-harness.py new file mode 100644 index 0000000000000000000000000000000000000000..a402896c85baa49bc23bb5d770607cf73641c273 --- /dev/null +++ b/examples/python/ispc-stream-harness.py @@ -0,0 +1,156 @@ +import loopy as lp +import numpy as np +import numpy.linalg as la +import ctypes +import ctypes.util +import os +from time import time +from tempfile import TemporaryDirectory + +from loopy.tools import (empty_aligned, address_from_numpy, + build_ispc_shared_lib, cptr_from_numpy) + + +def transform(knl, vars, 
stream_dtype): + vars = [v.strip() for v in vars.split(",")] + knl = lp.assume(knl, "n>0") + knl = lp.split_iname( + knl, "i", 2**18, outer_tag="g.0", slabs=(0, 1)) + knl = lp.split_iname(knl, "i_inner", 8, inner_tag="l.0") + + knl = lp.add_and_infer_dtypes(knl, { + var: stream_dtype + for var in vars + }) + + knl = lp.set_argument_order(knl, vars + ["n"]) + + return knl + + +def gen_code(knl): + knl = lp.preprocess_kernel(knl) + knl = lp.get_one_scheduled_kernel(knl) + ispc_code, arg_info = lp.generate_code(knl) + + return ispc_code + + +NRUNS = 10 +ALIGN_TO = 4096 +ARRAY_SIZE = 2**28 + +if 0: + STREAM_DTYPE = np.float64 + STREAM_CTYPE = ctypes.c_double +else: + STREAM_DTYPE = np.float32 + STREAM_CTYPE = ctypes.c_float + +if 1: + INDEX_DTYPE = np.int32 + INDEX_CTYPE = ctypes.c_int +else: + INDEX_DTYPE = np.int64 + INDEX_CTYPE = ctypes.c_longlong + + +def main(): + with open("tasksys.cpp", "r") as ts_file: + tasksys_source = ts_file.read() + + def make_knl(name, insn, vars): + knl = lp.make_kernel( + "{[i]: 0<=i loop_list_width: lines.append("[%s]" % loop_list) lines.append("%s%s <- %s # %s" % ( - (loop_list_width+2)*" ", lhs, - rhs, ", ".join(options))) + (loop_list_width+2)*" ", Fore.BLUE+lhs+Style.RESET_ALL, + Fore.MAGENTA+rhs+Style.RESET_ALL, + ", ".join(options))) else: lines.append("[%s]%s%s <- %s # %s" % ( loop_list, " "*(loop_list_width-len(loop_list)), - lhs, rhs, ",".join(options))) + Fore.BLUE + lhs + Style.RESET_ALL, + Fore.MAGENTA+rhs+Style.RESET_ALL, + ",".join(options))) lines.extend(trailing) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index f7c478dff296f26bdd342670b319104e7c3ef694..6a5d523e61c27c068a380b7de6c206147d7bb271 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1011,8 +1011,22 @@ def resolve_wildcard_deps(knl): def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): """User-facing kernel creation entrypoint. 
- :arg domains: :class:`islpy.BasicSet` + :arg domains: + + A list of :class:`islpy.BasicSet` (i.e. convex set) instances + representing the :ref:`domain-tree`. May also be a list of strings + which will be parsed into such instances according to :ref:`isl-syntax`. + :arg instructions: + + A list of :class:`Assignment` (or other :class:`InstructionBase` + subclasses), possibly intermixed with instances of + :class:`SubstitutionRule`. This same list may also contain strings + which will be parsed into such objects using the + :ref:`assignment-syntax` and the :ref:`subst-rule-syntax`. May also be + a single multi-line string which will be split into lines and then + parsed. + :arg kernel_data: A list of :class:`ValueArg`, :class:`GlobalArg`, ... (etc.) instances. @@ -1054,7 +1068,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): strides. They are expanded only upon kernel creation. :arg default_order: "C" (default) or "F" :arg default_offset: 0 or :class:`loopy.auto`. The default value of - *offset* in :attr:`loopy.kernel.data.GlobalArg` for guessed arguments. + *offset* in :attr:`GlobalArg` for guessed arguments. Defaults to 0. :arg function_manglers: list of functions of signature (name, arg_dtypes) returning a tuple (result_dtype, c_name) @@ -1074,8 +1088,8 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): to silence :arg options: an instance of :class:`loopy.Options` or an equivalent string representation - :arg target: an instance of :class:`loopy.target.TargetBase`, or *None*, - to use the default target. (see :func:`loopy.set_default_target`) + :arg target: an instance of :class:`loopy.TargetBase`, or *None*, + to use the default target. 
""" defines = kwargs.pop("defines", {}) diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 9d0848788a09b7a6b914a1ce5569faf3102f18b2..5b0cf57e50298febe0580da5fb0efe5756557527 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -185,6 +185,8 @@ def parse_tag(tag): # {{{ arguments class KernelArgument(Record): + """Base class for all argument types""" + def __init__(self, **kwargs): kwargs["name"] = intern(kwargs.pop("name")) @@ -418,7 +420,9 @@ class SubstitutionRule(Record): # {{{ base class class InstructionBase(Record): - """ + """A base class for all types of instruction that can occur in + a kernel. + .. attribute:: id An (otherwise meaningless) identifier that is unique within @@ -427,7 +431,7 @@ class InstructionBase(Record): .. attribute:: depends_on a :class:`frozenset` of :attr:`id` values of :class:`Instruction` instances - that *must* be executed before this one. Note that + that *must* be executed before this one. Note that :func:`loopy.preprocess_kernel` (usually invoked automatically) augments this by adding dependencies on any writes to temporaries read by this instruction. @@ -493,6 +497,14 @@ class InstructionBase(Record): A tuple of string identifiers that can be used to identify groups of instructions. + + .. automethod:: __init__ + .. automethod:: assignees_and_indices + .. automethod:: with_transformed_expressions + .. automethod:: write_dependency_names + .. automethod:: dependency_names + .. automethod:: assignee_var_names + .. automethod:: copy """ fields = set("id depends_on depends_on_is_final " @@ -568,10 +580,12 @@ class InstructionBase(Record): predicates=predicates, tags=tags) + # legacy @property def insn_deps(self): return self.depends_on + # legacy @property def insn_deps_is_final(self): return self.depends_on_is_final @@ -740,6 +754,8 @@ class Assignment(InstructionBase): if not *None*, a type that will be assigned to the new temporary variable created from the assignee + + .. 
automethod:: __init__ """ fields = InstructionBase.fields | \ diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 46b301b47c45430078f554a178448687eb490e94..cb72f283e2151fd8d097f19eb4e4876cc78ccd43 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -770,7 +770,8 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): If *axis* is None, find a suitable axis automatically. """ try: - desired_length = kernel.get_constant_iname_length(iname) + with isl.SuppressedWarnings(kernel.isl_context): + desired_length = kernel.get_constant_iname_length(iname) except isl.Error: # Likely unbounded, automatic assignment is not # going to happen for this iname. @@ -882,7 +883,8 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): def get_iname_length(iname): try: - return kernel.get_constant_iname_length(iname) + with isl.SuppressedWarnings(kernel.isl_context): + return kernel.get_constant_iname_length(iname) except isl.Error: return -1 # assign longest auto axis inames first diff --git a/loopy/options.py b/loopy/options.py index 049d21affd284d94f27416d018d38bdbd5b4bddc..9b27f111e2fb93b09090b35278a3753f9f1a3f82 100644 --- a/loopy/options.py +++ b/loopy/options.py @@ -27,6 +27,11 @@ from pytools import Record import re +class _ColoramaStub(object): + def __getattribute__(self, name): + return "" + + class Options(Record): """ Unless otherwise specified, these options are Boolean-valued @@ -105,6 +110,11 @@ class Options(Record): Options to pass to the OpenCL compiler when building the kernel. A list of strings. + + .. attribute:: allow_terminal_colors + + A :class:`bool`. 
Whether to allow colors in terminal output + """ def __init__( @@ -124,6 +134,7 @@ class Options(Record): write_wrapper=False, highlight_wrapper=False, write_cl=False, highlight_cl=False, edit_cl=False, cl_build_options=[], + allow_terminal_colors=True ): Record.__init__( self, @@ -137,6 +148,7 @@ class Options(Record): write_wrapper=write_wrapper, highlight_wrapper=highlight_wrapper, write_cl=write_cl, highlight_cl=highlight_cl, edit_cl=edit_cl, cl_build_options=cl_build_options, + allow_terminal_colors=allow_terminal_colors, ) def update(self, other): @@ -150,6 +162,30 @@ class Options(Record): for field_name in sorted(self.__class__.fields): key_builder.rec(key_hash, getattr(self, field_name)) + @property + def _fore(self): + if self.allow_terminal_colors: + import colorama + return colorama.Fore + else: + return _ColoramaStub() + + @property + def _back(self): + if self.allow_terminal_colors: + import colorama + return colorama.Back + else: + return _ColoramaStub() + + @property + def _style(self): + if self.allow_terminal_colors: + import colorama + return colorama.Style + else: + return _ColoramaStub() + KEY_VAL_RE = re.compile("^([a-zA-Z0-9]+)=(.*)$") diff --git a/loopy/preprocess.py b/loopy/preprocess.py index fe88118e018d5829e9ff2104b70940a39cb95ade..4c75cfd250807c6959c1e5167465d34b029d762e 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -66,6 +66,45 @@ def prepare_for_caching(kernel): # }}} +# {{{ check reduction iname uniqueness + +def check_reduction_iname_uniqueness(kernel): + iname_to_reduction_count = {} + iname_to_nonsimultaneous_reduction_count = {} + + def map_reduction(expr, rec): + rec(expr.expr) + for iname in expr.inames: + iname_to_reduction_count[iname] = ( + iname_to_reduction_count.get(iname, 0) + 1) + if not expr.allow_simultaneous: + iname_to_nonsimultaneous_reduction_count[iname] = ( + iname_to_nonsimultaneous_reduction_count.get(iname, 0) + 1) + + return expr + + from loopy.symbolic import ReductionCallbackMapper + 
cb_mapper = ReductionCallbackMapper(map_reduction) + + for insn in kernel.instructions: + insn.with_transformed_expressions(cb_mapper) + + for iname, count in six.iteritems(iname_to_reduction_count): + nonsimul_count = iname_to_nonsimultaneous_reduction_count.get(iname, 0) + + if nonsimul_count and count > 1: + raise LoopyError("iname '%s' used in more than one reduction. " + "(%d of them, to be precise.) " + "Since this usage can easily cause loop scheduling " + "problems, this is prohibited by default. " + "Use loopy.make_reduction_inames_unique() to fix this. " + "If you are sure that this is OK, write the reduction " + "as 'simul_reduce(...)' instead of 'reduce(...)'" + % (iname, count)) + +# }}} + + # {{{ infer types def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): @@ -677,11 +716,13 @@ def preprocess_kernel(kernel, device=None): kernel = expand_subst(kernel) # Ordering restriction: - # Type inference doesn't handle substitutions. Get them out of the - # way. + # Type inference and reduction iname uniqueness don't handle substitutions. + # Get them out of the way. 
kernel = infer_unknown_types(kernel, expect_completion=False) + check_reduction_iname_uniqueness(kernel) + kernel = add_default_dependencies(kernel) # Ordering restrictions: diff --git a/loopy/schedule.py b/loopy/schedule.py index 17c1ab3ce0bad0170f8f7b03e243862de33af21d..de71ffaed0d457f96ee6ddbdeeafac58c0959aa9 100644 --- a/loopy/schedule.py +++ b/loopy/schedule.py @@ -317,11 +317,40 @@ def group_insn_counts(kernel): return result + +def gen_dependencies_except(kernel, insn_id, except_insn_ids): + insn = kernel.id_to_insn[insn_id] + for dep_id in insn.depends_on: + + if dep_id in except_insn_ids: + continue + + yield dep_id + + for sub_dep_id in gen_dependencies_except(kernel, dep_id, except_insn_ids): + yield sub_dep_id + # }}} # {{{ debug help +def format_insn_id(kernel, insn_id): + Fore = kernel.options._fore + Style = kernel.options._style + return Fore.GREEN + insn_id + Style.RESET_ALL + + +def format_insn(kernel, insn_id): + insn = kernel.id_to_insn[insn_id] + Fore = kernel.options._fore + Style = kernel.options._style + return "[%s] %s%s%s <- %s%s%s" % ( + format_insn_id(kernel, insn_id), + Fore.BLUE, str(insn.assignee), Style.RESET_ALL, + Fore.MAGENTA, str(insn.expression), Style.RESET_ALL) + + def dump_schedule(kernel, schedule): lines = [] indent = "" @@ -337,8 +366,7 @@ def dump_schedule(kernel, schedule): elif isinstance(sched_item, RunInstruction): insn = kernel.id_to_insn[sched_item.insn_id] if isinstance(insn, Assignment): - insn_str = "[%s] %s <- %s" % ( - insn.id, str(insn.assignee), str(insn.expression)) + insn_str = format_insn(kernel, sched_item.insn_id) else: insn_str = sched_item.insn_id lines.append(indent + insn_str) @@ -482,6 +510,8 @@ def generate_loop_schedules_internal( # to give loops containing high-priority instructions a chance. 
kernel = sched_state.kernel + Fore = kernel.options._fore + Style = kernel.options._style if allow_boost is None: rec_allow_boost = None @@ -510,7 +540,7 @@ def generate_loop_schedules_internal( print(dump_schedule(sched_state.kernel, sched_state.schedule)) #print("boost allowed:", allow_boost) print(75*"=") - print("LOOP NEST MAP:") + print("LOOP NEST MAP (inner: outer):") for iname, val in six.iteritems(sched_state.loop_nest_around_map): print("%s : %s" % (iname, ", ".join(val))) print(75*"=") @@ -549,7 +579,7 @@ def generate_loop_schedules_internal( if not is_ready: if debug_mode: print("instruction '%s' is missing insn depedencies '%s'" % ( - insn.id, ",".join( + format_insn(kernel, insn.id), ",".join( insn.depends_on - sched_state.scheduled_insn_ids))) continue @@ -570,10 +600,10 @@ def generate_loop_schedules_internal( if debug_mode: if want-have: print("instruction '%s' is missing inames '%s'" - % (insn.id, ",".join(want-have))) + % (format_insn(kernel, insn.id), ",".join(want-have))) if have-want: print("instruction '%s' won't work under inames '%s'" - % (insn.id, ",".join(have-want))) + % (format_insn(kernel, insn.id), ",".join(have-want))) # {{{ determine group-based readiness @@ -595,7 +625,7 @@ def generate_loop_schedules_internal( # }}} if is_ready and debug_mode: - print("ready to schedule '%s'" % insn.id) + print("ready to schedule '%s'" % format_insn(kernel, insn.id)) if is_ready and not debug_mode: iid_set = frozenset([insn.id]) @@ -660,7 +690,38 @@ def generate_loop_schedules_internal( if last_entered_loop in kernel.insn_inames(insn): if debug_mode: print("cannot leave '%s' because '%s' still depends on it" - % (last_entered_loop, insn.id)) + % (last_entered_loop, format_insn(kernel, insn.id))) + + # check if there's a dependency of insn that needs to be + # outside of last_entered_loop. 
+ for subdep_id in gen_dependencies_except(kernel, insn_id, + sched_state.unscheduled_insn_ids): + subdep = kernel.id_to_insn[insn_id] + want = (kernel.insn_inames(subdep_id) + - sched_state.parallel_inames) + if ( + last_entered_loop not in want and + last_entered_loop not in subdep.boostable_into): + print( + "%(warn)swarning:%(reset_all)s '%(iname)s', " + "which the schedule is " + "currently stuck inside of, seems mis-nested. " + "'%(subdep)s' must occur " "before '%(dep)s', " + "but '%(subdep)s must be outside " + "'%(iname)s', whereas '%(dep)s' must be back " + "in it.%(reset_all)s\n" + " %(subdep_i)s\n" + " %(dep_i)s" + % { + "warn": Fore.RED + Style.BRIGHT, + "reset_all": Style.RESET_ALL, + "iname": last_entered_loop, + "subdep": format_insn_id(kernel, subdep_id), + "dep": format_insn_id(kernel, insn_id), + "subdep_i": format_insn(kernel, subdep_id), + "dep_i": format_insn(kernel, insn_id), + }) + can_leave = False break diff --git a/loopy/statistics.py b/loopy/statistics.py index d39f2f8556ad62642794ab2ef9b4c9504fdb27b1..5aa251dc9318bae6fe49b69642722ecc88d79887 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -35,6 +35,21 @@ from loopy.kernel.data import Assignment from loopy.diagnostic import warn +__doc__ = """ + +.. currentmodule:: loopy + +.. autofunction:: get_op_poly + +.. autofunction:: get_gmem_access_poly + +.. autofunction:: sum_mem_access_to_bytes + +.. 
autofunction:: get_barrier_poly + +""" + + # {{{ ToCountMap class ToCountMap: diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 7adab80c68c38f900976eb1adcd90226f40a7d9b..b887c703420d092d7f3c0fc9c729dd1d1f942a76 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -79,7 +79,20 @@ class IdentityMapperMixin(object): return expr def map_reduction(self, expr, *args): - return Reduction(expr.operation, expr.inames, self.rec(expr.expr, *args)) + mapped_inames = [self.rec(Variable(iname), *args) for iname in expr.inames] + + new_inames = [] + for iname, new_sym_iname in zip(expr.inames, mapped_inames): + if not isinstance(new_sym_iname, Variable): + from loopy.diagnostic import LoopyError + raise LoopyError("%s did not map iname '%s' to a variable" + % (type(self).__name__, iname)) + + new_inames.append(new_sym_iname.name) + + return Reduction( + expr.operation, tuple(new_inames), self.rec(expr.expr, *args), + allow_simultaneous=expr.allow_simultaneous) def map_tagged_variable(self, expr, *args): # leaf, doesn't change @@ -146,7 +159,8 @@ class StringifyMapper(StringifyMapperBase): return "loc.%d" % expr.index def map_reduction(self, expr, prec): - return "reduce(%s, [%s], %s)" % ( + return "%sreduce(%s, [%s], %s)" % ( + "simul_" if expr.allow_simultaneous else "", expr.operation, ", ".join(expr.inames), expr.expr) def map_tagged_variable(self, expr, prec): @@ -346,11 +360,16 @@ class Reduction(AlgebraicLeaf): The expression (as a :class:`pymbolic.primitives.Expression`) on which reduction is performed. + + .. attribute:: allow_simultaneous + + A :class:`bool`. If not *True*, an iname is allowed to be used + in precisely one reduction, to avoid mis-nesting errors. 
""" - init_arg_names = ("operation", "inames", "expr") + init_arg_names = ("operation", "inames", "expr", "allow_simultaneous") - def __init__(self, operation, inames, expr): + def __init__(self, operation, inames, expr, allow_simultaneous=False): if isinstance(inames, str): inames = tuple(iname.strip() for iname in inames.split(",")) @@ -378,9 +397,10 @@ class Reduction(AlgebraicLeaf): self.operation = operation self.inames = inames self.expr = expr + self.allow_simultaneous = allow_simultaneous def __getinitargs__(self): - return (self.operation, self.inames, self.expr) + return (self.operation, self.inames, self.expr, self.allow_simultaneous) def get_hash(self): return hash((self.__class__, self.operation, self.inames, @@ -779,7 +799,8 @@ class FunctionToPrimitiveMapper(IdentityMapper): turns those into the actual pymbolic primitives used for that. """ - def _parse_reduction(self, operation, inames, red_expr): + def _parse_reduction(self, operation, inames, red_expr, + allow_simultaneous=False): if isinstance(inames, Variable): inames = (inames,) @@ -795,7 +816,8 @@ class FunctionToPrimitiveMapper(IdentityMapper): processed_inames.append(iname.name) - return Reduction(operation, tuple(processed_inames), red_expr) + return Reduction(operation, tuple(processed_inames), red_expr, + allow_simultaneous=allow_simultaneous) def map_call(self, expr): from loopy.library.reduction import parse_reduction_op @@ -820,7 +842,7 @@ class FunctionToPrimitiveMapper(IdentityMapper): else: raise TypeError("cse takes two arguments") - elif name == "reduce": + elif name in ["reduce", "simul_reduce"]: if len(expr.parameters) == 3: operation, inames, red_expr = expr.parameters @@ -829,7 +851,8 @@ class FunctionToPrimitiveMapper(IdentityMapper): "must be a symbol") operation = parse_reduction_op(operation.name) - return self._parse_reduction(operation, inames, self.rec(red_expr)) + return self._parse_reduction(operation, inames, self.rec(red_expr), + allow_simultaneous=(name == 
"simul_reduce")) else: raise TypeError("invalid 'reduce' calling sequence") diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index ee28594a50ce8486702e13f0ee9bf01debb4f859..85e58a809e51ff8067c9effb59240fb5125b46db 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -24,9 +24,26 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +__doc__ = """ + +.. currentmodule:: loopy + +.. autoclass:: TargetBase +.. autoclass:: CTarget +.. autoclass:: CudaTarget +.. autoclass:: OpenCLTarget +.. autoclass:: PyOpenCLTarget +.. autoclass:: ISPCTarget + +""" + class TargetBase(object): - """Objects of this type must be picklable.""" + """Base class for all targets, i.e. different types of code that + loopy can generate. + + Objects of this type must be picklable. + """ # {{{ persistent hashing diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 73310c4db7ad590396377d1d757ebf8eb068a8e6..5d6a856d14f57b277e6c083717346bbeb11af7b6 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -56,6 +56,9 @@ def _preamble_generator(kernel, seen_dtypes, seen_functions): class CTarget(TargetBase): + """A target for plain "C", without any parallel extensions. 
+ """ + hash_fields = TargetBase.hash_fields + ("fortran_abi",) comparison_fields = TargetBase.comparison_fields + ("fortran_abi",) diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 94a144cb972b10cd7a1d797adbd8d259e7eb64dc..55f8da4d608d7a1b3a1ca8ba960c498586f4d76d 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -162,6 +162,8 @@ class LoopyCudaCCodeMapper(LoopyCCodeMapper): # {{{ target class CudaTarget(CTarget): + """A target for Nvidia's CUDA GPU programming language.""" + def __init__(self, extern_c=True): """ :arg extern_c: If *True*, declare kernels using "extern C" to diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 737ee0d998ce5b330697d56228183e0abe5e0137..b0b1c247ec78a29b75890b0e9f04448116b97c4e 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -30,6 +30,8 @@ from loopy.target.c import CTarget from loopy.target.c.codegen.expression import LoopyCCodeMapper from loopy.diagnostic import LoopyError +from pytools import memoize_method + # {{{ expression mapper @@ -54,7 +56,35 @@ class LoopyISPCCodeMapper(LoopyCCodeMapper): # }}} +# {{{ type registry + +def fill_registry_with_ispc_types(reg, respect_windows, include_bool=True): + reg.get_or_register_dtype("bool", np.bool) + + reg.get_or_register_dtype(["int8", "signed char", "char"], np.int8) + reg.get_or_register_dtype(["uint8", "unsigned char"], np.uint8) + reg.get_or_register_dtype(["int16", "short", "signed short", + "signed short int", "short signed int"], np.int16) + reg.get_or_register_dtype(["uint16", "unsigned short", + "unsigned short int", "short unsigned int"], np.uint16) + reg.get_or_register_dtype(["int32", "int", "signed int"], np.int32) + reg.get_or_register_dtype(["uint32", "unsigned", "unsigned int"], np.uint32) + + reg.get_or_register_dtype(["int64"], np.int64) + reg.get_or_register_dtype(["uint64"], np.uint64) + + reg.get_or_register_dtype("float", np.float32) + reg.get_or_register_dtype("double", np.float64) + +# }}} + + class 
ISPCTarget(CTarget): + """A code generation target for Intel's `ISPC `_ + SPMD programming language, to target Intel's Knight's hardware and modern + Intel CPUs with wide vector units. + """ + def __init__(self, occa_mode=False): """ :arg occa_mode: Whether to modify the generated call signature to @@ -64,6 +94,18 @@ class ISPCTarget(CTarget): super(ISPCTarget, self).__init__() + # {{{ types + + @memoize_method + def get_dtype_registry(self): + from loopy.target.c.compyte.dtypes import DTypeRegistry + result = DTypeRegistry() + fill_registry_with_ispc_types(result, respect_windows=False, + include_bool=True) + return result + + # }}} + # {{{ top-level codegen def generate_code(self, kernel, codegen_state, impl_arg_info): @@ -116,18 +158,26 @@ class ISPCTarget(CTarget): from pymbolic.mapper.stringifier import PREC_COMPARISON, PREC_NONE ccm = self.get_expression_to_code_mapper(codegen_state) - wrapper_body.extend([ - S("assert(programCount == %s)" - % ccm(lsize[0], PREC_COMPARISON)), - S("launch[%s] %s(%s)" + if lsize: + wrapper_body.append( + S("assert(programCount == %s)" + % ccm(lsize[0], PREC_COMPARISON))) + + if gsize: + launch_spec = "[%s]" % ", ".join( + ccm(gs_i, PREC_NONE) + for gs_i in gsize), + else: + launch_spec = "" + + wrapper_body.append( + S("launch%s %s(%s)" % ( - ", ".join( - ccm(gs_i, PREC_NONE) - for gs_i in gsize), + launch_spec, inner_name, ", ".join(arg_names) )) - ]) + ) wrapper_fbody = FunctionBody( ISPCExport( @@ -176,11 +226,8 @@ class ISPCTarget(CTarget): raise LoopyError("unknown barrier kind") def wrap_temporary_decl(self, decl, is_local): - from cgen.ispc import ISPCUniform, ISPCVarying - if is_local: - return ISPCUniform(decl) - else: - return ISPCVarying(decl) + from cgen.ispc import ISPCUniform + return ISPCUniform(decl) def get_global_arg_decl(self, name, shape, dtype, is_written): from loopy.codegen import POD # uses the correct complex type diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 
a009e93360016128f69c85fee2a555cd016a5ab1..cfdc8620bb4a383e4d48f004a2c682ba8e495a05 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -189,6 +189,9 @@ class LoopyOpenCLCCodeMapper(LoopyCCodeMapper): # {{{ target class OpenCLTarget(CTarget): + """A target for the OpenCL C heterogeneous compute programming language. + """ + # {{{ library def function_manglers(self): diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index d13384534c70df602785d4189739a7bc86ed37db..2947fdc6c5d17e919bdcc329f6243376b00a9969 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -31,12 +31,6 @@ import numpy as np from loopy.target.opencl import OpenCLTarget -import pyopencl as cl -import pyopencl.characterize as cl_char - -# This ensures the dtype registry is populated. -import pyopencl.tools # noqa - import logging logger = logging.getLogger(__name__) @@ -44,6 +38,9 @@ logger = logging.getLogger(__name__) # {{{ temp storage adjust for bank conflict def adjust_local_temp_var_storage(kernel, device): + import pyopencl as cl + import pyopencl.characterize as cl_char + logger.debug("%s: adjust temp var storage" % kernel.name) new_temp_vars = {} @@ -244,9 +241,21 @@ class _LegacyTypeRegistryStub(object): from pyopencl.compyte.dtypes import dtype_to_ctype return dtype_to_ctype(dtype) +# }}} + + +# {{{ target class PyOpenCLTarget(OpenCLTarget): + """A code generation target that takes special advantage of :mod:`pyopencl` + features such as run-time knowledge of the target device (to generate + warnings) and support for complex numbers. + """ + def __init__(self, device=None): + # This ensures the dtype registry is populated. 
+ import pyopencl.tools # noqa + super(PyOpenCLTarget, self).__init__() self.device = device diff --git a/loopy/tools.py b/loopy/tools.py index 55b177bda4e6be03a985286fd4faf6322e257824..777532e7af92bc62e6878f564c8c5545f4cb2c4a 100644 --- a/loopy/tools.py +++ b/loopy/tools.py @@ -233,6 +233,119 @@ def remove_common_indentation(code, require_leading_newline=True, # }}} +# {{{ build_ispc_shared_lib + +# DO NOT RELY ON THESE: THEY WILL GO AWAY + +def build_ispc_shared_lib( + cwd, ispc_sources, cxx_sources, + ispc_options=[], cxx_options=[], + ispc_bin="ispc", + cxx_bin="g++", + quiet=True): + from os.path import join + + ispc_source_names = [] + for name, contents in ispc_sources: + ispc_source_names.append(name) + + with open(join(cwd, name), "w") as srcf: + srcf.write(contents) + + cxx_source_names = [] + for name, contents in cxx_sources: + cxx_source_names.append(name) + + with open(join(cwd, name), "w") as srcf: + srcf.write(contents) + + from subprocess import check_call + + ispc_cmd = ([ispc_bin, + "--pic", + "-o", "ispc.o"] + + ispc_options + + list(ispc_source_names)) + if not quiet: + print(" ".join(ispc_cmd)) + + check_call(ispc_cmd, cwd=cwd) + + cxx_cmd = ([ + cxx_bin, + "-shared", "-Wl,--export-dynamic", + "-fPIC", + "-oshared.so", + "ispc.o", + ] + + cxx_options + + list(cxx_source_names)) + + check_call(cxx_cmd, cwd=cwd) + + if not quiet: + print(" ".join(cxx_cmd)) + +# }}} + + +# {{{ numpy address munging + +# DO NOT RELY ON THESE: THEY WILL GO AWAY + +def address_from_numpy(obj): + ary_intf = getattr(obj, "__array_interface__", None) + if ary_intf is None: + raise RuntimeError("no array interface") + + buf_base, is_read_only = ary_intf["data"] + return buf_base + ary_intf.get("offset", 0) + + +def cptr_from_numpy(obj): + import ctypes + return ctypes.c_void_p(address_from_numpy(obj)) + + +# https://github.com/hgomersall/pyFFTW/blob/master/pyfftw/utils.pxi#L172 +def empty_aligned(shape, dtype, order='C', n=64): + '''empty_aligned(shape, dtype='float64', 
order='C', n=64) + Function that returns an empty numpy array that is n-byte aligned. + The alignment is given by the final optional argument, ``n``, + which defaults to 64 bytes. + Unlike the pyFFTW function this is adapted from, this version does + not inspect the CPU to determine the alignment. + The rest of the arguments are as per + :func:`numpy.empty`. + ''' + itemsize = np.dtype(dtype).itemsize + + # Apparently there is an issue with numpy.prod wrapping around on 32-bits + # on Windows 64-bit. This shouldn't happen, but the following code + # alleviates the problem. + if not isinstance(shape, (int, np.integer)): + array_length = 1 + for each_dimension in shape: + array_length *= each_dimension + + else: + array_length = shape + + base_ary = np.empty(array_length*itemsize+n, dtype=np.int8) + + # We now need to know how to offset base_ary + # so it is correctly aligned + _array_aligned_offset = (n-address_from_numpy(base_ary)) % n + + array = np.frombuffer( + base_ary[_array_aligned_offset:_array_aligned_offset-n].data, + dtype=dtype).reshape(shape, order=order) + + return array + +# }}} + + def is_interned(s): return s is None or intern(s) is s diff --git a/loopy/transform/arithmetic.py b/loopy/transform/arithmetic.py index a6830092910dcc89eefcc337961c3b215831fb41..d41222c26056300729b0c4005200ba6ea904010d 100644 --- a/loopy/transform/arithmetic.py +++ b/loopy/transform/arithmetic.py @@ -25,108 +25,9 @@ THE SOFTWARE. 
import six -from loopy.symbolic import (RuleAwareIdentityMapper, - SubstitutionRuleMappingContext) from loopy.diagnostic import LoopyError -# {{{ split_reduction - -class _ReductionSplitter(RuleAwareIdentityMapper): - def __init__(self, rule_mapping_context, within, inames, direction): - super(_ReductionSplitter, self).__init__( - rule_mapping_context) - - self.within = within - self.inames = inames - self.direction = direction - - def map_reduction(self, expr, expn_state): - if set(expr.inames) & set(expn_state.arg_context): - # FIXME - raise NotImplementedError() - - if (self.inames <= set(expr.inames) - and self.within( - expn_state.kernel, - expn_state.instruction, - expn_state.stack)): - leftover_inames = set(expr.inames) - self.inames - - from loopy.symbolic import Reduction - if self.direction == "in": - return Reduction(expr.operation, tuple(leftover_inames), - Reduction(expr.operation, tuple(self.inames), - self.rec(expr.expr, expn_state))) - elif self.direction == "out": - return Reduction(expr.operation, tuple(self.inames), - Reduction(expr.operation, tuple(leftover_inames), - self.rec(expr.expr, expn_state))) - else: - assert False - else: - return super(_ReductionSplitter, self).map_reduction(expr, expn_state) - - -def _split_reduction(kernel, inames, direction, within=None): - if direction not in ["in", "out"]: - raise ValueError("invalid value for 'direction': %s" % direction) - - if isinstance(inames, str): - inames = inames.split(",") - inames = set(inames) - - from loopy.context_matching import parse_stack_match - within = parse_stack_match(within) - - rule_mapping_context = SubstitutionRuleMappingContext( - kernel.substitutions, kernel.get_var_name_generator()) - rsplit = _ReductionSplitter(rule_mapping_context, - within, inames, direction) - return rule_mapping_context.finish_kernel( - rsplit.map_kernel(kernel)) - - -def split_reduction_inward(kernel, inames, within=None): - """Takes a reduction of the form:: - - sum([i,j,k], ...) 
- - and splits it into two nested reductions:: - - sum([j,k], sum([i], ...)) - - In this case, *inames* would have been ``"i"`` indicating that - the iname ``i`` should be made the iname governing the inner reduction. - - :arg inames: A list of inames, or a comma-separated string that can - be parsed into those - """ - - return _split_reduction(kernel, inames, "in", within) - - -def split_reduction_outward(kernel, inames, within=None): - """Takes a reduction of the form:: - - sum([i,j,k], ...) - - and splits it into two nested reductions:: - - sum([i], sum([j,k], ...)) - - In this case, *inames* would have been ``"i"`` indicating that - the iname ``i`` should be made the iname governing the outer reduction. - - :arg inames: A list of inames, or a comma-separated string that can - be parsed into those - """ - - return _split_reduction(kernel, inames, "out", within) - -# }}} - - # {{{ fold constants def fold_constants(kernel): diff --git a/loopy/transform/batch.py b/loopy/transform/batch.py index 59239104e43725c6ea5bf572eda7e79d080427a7..967e14de692ee96b0c01d2ea5bcf8b411890038b 100644 --- a/loopy/transform/batch.py +++ b/loopy/transform/batch.py @@ -29,21 +29,27 @@ from loopy.symbolic import (RuleAwareIdentityMapper, SubstitutionRuleMappingCont from loopy.kernel.data import ValueArg, GlobalArg import islpy as isl +__doc__ = """ +.. 
autofunction:: to_batched +""" + # {{{ to_batched class _BatchVariableChanger(RuleAwareIdentityMapper): def __init__(self, rule_mapping_context, kernel, batch_varying_args, - batch_iname_expr): + batch_iname_expr, sequential): super(_BatchVariableChanger, self).__init__(rule_mapping_context) self.kernel = kernel self.batch_varying_args = batch_varying_args self.batch_iname_expr = batch_iname_expr + self.sequential = sequential def needs_batch_subscript(self, name): return ( - name in self.kernel.temporary_variables + (not self.sequential + and name in self.kernel.temporary_variables) or name in self.batch_varying_args) @@ -64,14 +70,18 @@ class _BatchVariableChanger(RuleAwareIdentityMapper): return expr.aggregate[self.batch_iname_expr] -def to_batched(knl, nbatches, batch_varying_args, batch_iname_prefix="ibatch"): +def to_batched(knl, nbatches, batch_varying_args, batch_iname_prefix="ibatch", + sequential=False): """Takes in a kernel that carries out an operation and returns a kernel that carries out a batch of these operations. :arg nbatches: the number of batches. May be a constant non-negative integer or a string, which will be added as an integer argument. - :arg batch_varying_args: a list of argument names that depend vary per-batch. + :arg batch_varying_args: a list of argument names that vary per-batch. Each such variable will have a batch index added. + :arg sequential: A :class:`bool`. If *True*, do not duplicate + temporary variables for each batch. This automatically tags the batch + iname for sequential execution. 
""" from pymbolic import var @@ -110,26 +120,32 @@ def to_batched(knl, nbatches, batch_varying_args, batch_iname_prefix="ibatch"): new_args.append(arg) - new_temps = {} - - for temp in six.itervalues(knl.temporary_variables): - new_temps[temp.name] = temp.copy( - shape=(nbatches_expr,) + temp.shape, - dim_tags=("c",) * (len(arg.shape) + 1)) - knl = knl.copy( domains=new_domains, - args=new_args, - temporary_variables=new_temps) + args=new_args) + + if not sequential: + new_temps = {} + + for temp in six.itervalues(knl.temporary_variables): + new_temps[temp.name] = temp.copy( + shape=(nbatches_expr,) + temp.shape, + dim_tags=("c",) * (len(temp.shape) + 1)) + + knl = knl.copy(temporary_variables=new_temps) + else: + import loopy as lp + from loopy.kernel.data import ForceSequentialTag + knl = lp.tag_inames(knl, [(batch_iname, ForceSequentialTag())]) rule_mapping_context = SubstitutionRuleMappingContext( knl.substitutions, vng) bvc = _BatchVariableChanger(rule_mapping_context, - knl, batch_varying_args, batch_iname_expr) + knl, batch_varying_args, batch_iname_expr, + sequential=sequential) return rule_mapping_context.finish_kernel( bvc.map_kernel(knl)) - # }}} # vim: foldmethod=marker diff --git a/loopy/transform/data.py b/loopy/transform/data.py index 64332637910340d68cb035d64ad6f4f643c0b5c9..7b1deb7951392e2e0c46360f8fd979ebf5aedb37 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -353,6 +353,26 @@ def remove_unused_arguments(knl): for insn in exp_knl.instructions: refd_vars.update(insn.dependency_names()) + from loopy.kernel.array import ArrayBase, FixedStrideArrayDimTag + from loopy.symbolic import get_dependencies + from itertools import chain + + def tolerant_get_deps(expr): + if expr is None or expr is lp.auto: + return set() + return get_dependencies(expr) + + for ary in chain(knl.args, six.itervalues(knl.temporary_variables)): + if isinstance(ary, ArrayBase): + refd_vars.update( + tolerant_get_deps(ary.shape) + | 
tolerant_get_deps(ary.offset)) + + for dim_tag in ary.dim_tags: + if isinstance(dim_tag, FixedStrideArrayDimTag): + refd_vars.update( + tolerant_get_deps(dim_tag.stride)) + for arg in knl.args: if arg.name in refd_vars: new_args.append(arg) diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py index bf435d3fe08d022790bf31a4d583d4923f0bfeff..e44f8abe227d451e8e940708530f6c20566685e8 100644 --- a/loopy/transform/fusion.py +++ b/loopy/transform/fusion.py @@ -210,8 +210,9 @@ def _fuse_two_kernels(knla, knlb): from pymbolic.imperative.transform import \ fuse_instruction_streams_with_unique_ids - new_instructions, _ = fuse_instruction_streams_with_unique_ids( - knla.instructions, knlb.instructions) + new_instructions, old_b_id_to_new_b_id = \ + fuse_instruction_streams_with_unique_ids( + knla.instructions, knlb.instructions) # {{{ fuse assumptions @@ -283,12 +284,12 @@ def _fuse_two_kernels(knla, knlb): "target", knla.target, knlb.target), - options=knla.options) + options=knla.options), old_b_id_to_new_b_id # }}} -def fuse_kernels(kernels, suffixes=None): +def fuse_kernels(kernels, suffixes=None, data_flow=None): """Return a kernel that performs all the operations in all entries of *kernels*. @@ -296,6 +297,11 @@ def fuse_kernels(kernels, suffixes=None): :arg suffixes: If given, must be a list of strings of a length matching that of *kernels*. This will be used to disambiguate the names of temporaries, as described below. + :arg data_flow: A list of data dependencies + ``[(var_name, from_kernel, to_kernel), ...]``. + Based on this, the fuser will create dependencies between all + writers of *var_name* in ``kernels[from_kernel]`` to + readers of *var_name* in ``kernels[to_kernel]``. The components of the kernels are fused as follows: @@ -321,9 +327,16 @@ def fuse_kernels(kernels, suffixes=None): * The resulting kernel will contain all instructions from each entry of *kernels*. Clashing instruction IDs will be renamed to ensure uniqueness. + + .. 
versionchanged:: 2016.2 + + *data_flow* was added in version 2016.2 """ kernels = list(kernels) + if data_flow is None: + data_flow = [] + if suffixes: suffixes = list(suffixes) if len(suffixes) != len(kernels): @@ -356,9 +369,46 @@ def fuse_kernels(kernels, suffixes=None): # }}} - result = kernels.pop(0) - while kernels: - result = _fuse_two_kernels(result, kernels.pop(0)) + kernel_insn_ids = [] + result = None + + for knlb in kernels: + if result is None: + result = knlb + kernel_insn_ids.append([ + insn.id for insn in knlb.instructions]) + else: + result, old_b_id_to_new_b_id = _fuse_two_kernels( + knla=result, + knlb=knlb) + + kernel_insn_ids.append([ + old_b_id_to_new_b_id[insn.id] + for insn in knlb.instructions]) + + # {{{ realize data_flow dependencies + + id_to_insn = result.id_to_insn.copy() + + for var_name, from_kernel, to_kernel in data_flow: + from_writer_ids = frozenset( + insn_id + for insn_id in kernel_insn_ids[from_kernel] + if var_name in id_to_insn[insn_id].assignee_var_names()) + + for insn_id in kernel_insn_ids[to_kernel]: + insn = id_to_insn[insn_id] + if var_name in insn.read_dependency_names(): + insn = insn.copy(depends_on=insn.depends_on | from_writer_ids) + + id_to_insn[insn_id] = insn + + result = result.copy(instructions=[ + id_to_insn[insn_id] + for insn_ids in kernel_insn_ids + for insn_id in insn_ids]) + + # }}} return result diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 8cf16bfd330b3e425023573788bc68bc8a7275a2..c317f2a4fbd186ccf069da74118504249956b4bc 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -35,25 +35,40 @@ from loopy.symbolic import ( from loopy.diagnostic import LoopyError -# {{{ assume +__doc__ = """ +.. 
currentmodule:: loopy -def assume(kernel, assumptions): - if isinstance(assumptions, str): - assumptions_set_str = "[%s] -> { : %s}" \ - % (",".join(s for s in kernel.outer_params()), - assumptions) - assumptions = isl.BasicSet.read_from_str(kernel.domains[0].get_ctx(), - assumptions_set_str) +.. autofunction:: split_iname - if not isinstance(assumptions, isl.BasicSet): - raise TypeError("'assumptions' must be a BasicSet or a string") +.. autofunction:: chunk_iname - old_assumptions, new_assumptions = isl.align_two(kernel.assumptions, assumptions) +.. autofunction:: join_inames - return kernel.copy( - assumptions=old_assumptions.params() & new_assumptions.params()) +.. autofunction:: tag_inames -# }}} +.. autofunction:: duplicate_inames + +.. undocumented .. autofunction:: link_inames + +.. autofunction:: rename_iname + +.. autofunction:: remove_unused_inames + +.. autofunction:: set_loop_priority + +.. autofunction:: split_reduction_inward + +.. autofunction:: split_reduction_outward + +.. autofunction:: affine_map_inames + +.. autofunction:: realize_ilp + +.. autofunction:: find_unused_axis_tag + +.. autofunction:: make_reduction_inames_unique + +""" # {{{ set loop priority @@ -106,7 +121,8 @@ class _InameSplitter(RuleAwareIdentityMapper): from loopy.symbolic import Reduction return Reduction(expr.operation, tuple(new_inames), - self.rec(expr.expr, expn_state)) + self.rec(expr.expr, expn_state), + expr.allow_simultaneous) else: return super(_InameSplitter, self).map_reduction(expr, expn_state) @@ -269,8 +285,21 @@ def split_iname(kernel, split_iname, inner_length, within=None): """Split *split_iname* into two inames (an 'inner' one and an 'outer' one) so that ``split_iname == inner + outer*inner_length`` and *inner* is of - fixed length *inner_length*. - + constant length *inner_length*. + + :arg outer_iname: The new iname to use for the 'outer' + loop. 
Defaults to a name derived from ``split_iname + "_outer"`` + :arg inner_iname: The new iname to use for the 'inner' (fixed-length) + loop. Defaults to a name derived from ``split_iname + "_inner"`` + :arg inner_length: a positive integer + :arg slabs: + A tuple ``(head_it_count, tail_it_count)`` indicating the + number of leading/trailing iterations of *outer_iname* + for which separate code should be generated. + :arg outer_tag: The iname tag (see :ref:`iname-tags`) to apply to + *outer_iname*. + :arg inner_tag: The iname tag (see :ref:`iname-tags`) to apply to + *inner_iname*. :arg within: a stack match as understood by :func:`loopy.context_matching.parse_stack_match`. """ @@ -418,7 +447,8 @@ class _InameJoiner(RuleAwareSubstitutionMapper): from loopy.symbolic import Reduction return Reduction(expr.operation, tuple(new_inames), - self.rec(expr.expr, expn_state)) + self.rec(expr.expr, expn_state), + expr.allow_simultaneous) else: return super(_InameJoiner, self).map_reduction(expr, expn_state) @@ -544,7 +574,7 @@ def join_inames(kernel, inames, new_iname=None, tag=None, within=None): def tag_inames(kernel, iname_to_tag, force=False, ignore_nonexistent=False): """Tag an iname - :arg iname_to_tag: a list of tuple ``(iname, new_tag)``. *new_tag* is given + :arg iname_to_tag: a list of tuples ``(iname, new_tag)``. *new_tag* is given as an instance of a subclass of :class:`loopy.kernel.data.IndexTag` or as a string as shown in :ref:`iname-tags`. May also be a dictionary for backwards compatibility. 
@@ -650,7 +680,8 @@ class _InameDuplicator(RuleAwareIdentityMapper): from loopy.symbolic import Reduction return Reduction(expr.operation, new_inames, - self.rec(expr.expr, expn_state)) + self.rec(expr.expr, expn_state), + expr.allow_simultaneous) else: return super(_InameDuplicator, self).map_reduction(expr, expn_state) @@ -1021,6 +1052,106 @@ def remove_unused_inames(knl, inames=None): # }}} +# {{{ split_reduction + +class _ReductionSplitter(RuleAwareIdentityMapper): + def __init__(self, rule_mapping_context, within, inames, direction): + super(_ReductionSplitter, self).__init__( + rule_mapping_context) + + self.within = within + self.inames = inames + self.direction = direction + + def map_reduction(self, expr, expn_state): + if set(expr.inames) & set(expn_state.arg_context): + # FIXME + raise NotImplementedError() + + if (self.inames <= set(expr.inames) + and self.within( + expn_state.kernel, + expn_state.instruction, + expn_state.stack)): + leftover_inames = set(expr.inames) - self.inames + + from loopy.symbolic import Reduction + if self.direction == "in": + return Reduction(expr.operation, tuple(leftover_inames), + Reduction(expr.operation, tuple(self.inames), + self.rec(expr.expr, expn_state), + expr.allow_simultaneous), + expr.allow_simultaneous) + elif self.direction == "out": + return Reduction(expr.operation, tuple(self.inames), + Reduction(expr.operation, tuple(leftover_inames), + self.rec(expr.expr, expn_state), + expr.allow_simultaneous), expr.allow_simultaneous) + else: + assert False + else: + return super(_ReductionSplitter, self).map_reduction(expr, expn_state) + + +def _split_reduction(kernel, inames, direction, within=None): + if direction not in ["in", "out"]: + raise ValueError("invalid value for 'direction': %s" % direction) + + if isinstance(inames, str): + inames = inames.split(",") + inames = set(inames) + + from loopy.context_matching import parse_stack_match + within = parse_stack_match(within) + + rule_mapping_context = 
SubstitutionRuleMappingContext( + 
kernel.substitutions, kernel.get_var_name_generator()) + rsplit = _ReductionSplitter(rule_mapping_context, + within, inames, direction) + return rule_mapping_context.finish_kernel( + rsplit.map_kernel(kernel)) + + +def split_reduction_inward(kernel, inames, within=None): + """Takes a reduction of the form:: + + sum([i,j,k], ...) + + and splits it into two nested reductions:: + + sum([j,k], sum([i], ...)) + + In this case, *inames* would have been ``"i"`` indicating that + the iname ``i`` should be made the iname governing the inner reduction. + + :arg inames: A list of inames, or a comma-separated string that can + be parsed into those + """ + + return _split_reduction(kernel, inames, "in", within) + + +def split_reduction_outward(kernel, inames, within=None): + """Takes a reduction of the form:: + + sum([i,j,k], ...) + + and splits it into two nested reductions:: + + sum([i], sum([j,k], ...)) + + In this case, *inames* would have been ``"i"`` indicating that + the iname ``i`` should be made the iname governing the outer reduction. + + :arg inames: A list of inames, or a comma-separated string that can + be parsed into those + """ + + return _split_reduction(kernel, inames, "out", within) + +# }}} + + # {{{ affine map inames def affine_map_inames(kernel, old_inames, new_inames, equations): @@ -1259,4 +1390,125 @@ def find_unused_axis_tag(kernel, kind, insn_match=None): # }}} + +# {{{ separate_loop_head_tail_slab + +# undocumented, because not super-useful +def separate_loop_head_tail_slab(kernel, iname, head_it_count, tail_it_count): + """Mark *iname* so that the separate code is generated for + the lower *head_it_count* and the upper *tail_it_count* + iterations of the loop on *iname*. 
+ """ + + iname_slab_increments = kernel.iname_slab_increments.copy() + iname_slab_increments[iname] = (head_it_count, tail_it_count) + + return kernel.copy(iname_slab_increments=iname_slab_increments) + +# }}} + + +# {{{ make_reduction_inames_unique + +class _ReductionInameUniquifier(RuleAwareIdentityMapper): + def __init__(self, rule_mapping_context, inames, within): + super(_ReductionInameUniquifier, self).__init__(rule_mapping_context) + + self.inames = inames + self.old_to_new = [] + self.within = within + + self.iname_to_red_count = {} + self.iname_to_nonsimultaneous_red_count = {} + + def map_reduction(self, expr, expn_state): + within = self.within( + expn_state.kernel, + expn_state.instruction, + expn_state.stack) + + for iname in expr.inames: + self.iname_to_red_count[iname] = ( + self.iname_to_red_count.get(iname, 0) + 1) + if not expr.allow_simultaneous: + self.iname_to_nonsimultaneous_red_count[iname] = ( + self.iname_to_nonsimultaneous_red_count.get(iname, 0) + 1) + + if within and not expr.allow_simultaneous: + subst_dict = {} + + from pymbolic import var + + new_inames = [] + for iname in expr.inames: + if ( + not (self.inames is None or iname in self.inames) + or + self.iname_to_red_count[iname] <= 1): + new_inames.append(iname) + continue + + new_iname = self.rule_mapping_context.make_unique_var_name(iname) + subst_dict[iname] = var(new_iname) + self.old_to_new.append((iname, new_iname)) + new_inames.append(new_iname) + + from loopy.symbolic import SubstitutionMapper + from pymbolic.mapper.substitutor import make_subst_func + + from loopy.symbolic import Reduction + return Reduction(expr.operation, tuple(new_inames), + self.rec( + SubstitutionMapper(make_subst_func(subst_dict))( + expr.expr), + expn_state), + expr.allow_simultaneous) + else: + return super(_ReductionInameUniquifier, self).map_reduction( + expr, expn_state) + + +def make_reduction_inames_unique(kernel, inames=None, within=None): + """ + :arg inames: if not *None*, only apply to 
these inames + :arg within: a stack match as understood by + :func:`loopy.context_matching.parse_stack_match`. + + .. versionadded:: 2016.2 + """ + + name_gen = kernel.get_var_name_generator() + + from loopy.context_matching import parse_stack_match + within = parse_stack_match(within) + + # {{{ change kernel + + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, name_gen) + r_uniq = _ReductionInameUniquifier(rule_mapping_context, + inames, within=within) + + kernel = rule_mapping_context.finish_kernel( + r_uniq.map_kernel(kernel)) + + # }}} + + # {{{ duplicate the inames + + for old_iname, new_iname in r_uniq.old_to_new: + from loopy.kernel.tools import DomainChanger + domch = DomainChanger(kernel, frozenset([old_iname])) + + from loopy.isl_helpers import duplicate_axes + kernel = kernel.copy( + domains=domch.get_domains_with( + duplicate_axes(domch.domain, [old_iname], [new_iname]))) + + # }}} + + return kernel + +# }}} + # vim: foldmethod=marker diff --git a/loopy/transform/parameter.py b/loopy/transform/parameter.py index 1567263cddc8e199f93652ef4c39ef737b86fb3f..f7600b212cbf4db6b58c91bb3603f5a310c6b2a6 100644 --- a/loopy/transform/parameter.py +++ b/loopy/transform/parameter.py @@ -28,6 +28,42 @@ from loopy.symbolic import (RuleAwareSubstitutionMapper, SubstitutionRuleMappingContext) import islpy as isl +__doc__ = """ + +.. currentmodule:: loopy + +.. autofunction:: fix_parameters + +.. autofunction:: assume +""" + + +# {{{ assume + +def assume(kernel, assumptions): + """Include an assumption about :ref:`domain-parameters` in the kernel, e.g. + `n mod 4 = 0`. + + :arg assumptions: a :class:`islpy.BasicSet` or a string representation of + the assumptions in :ref:`isl-syntax`. 
+ """ + if isinstance(assumptions, str): + assumptions_set_str = "[%s] -> { : %s}" \ + % (",".join(s for s in kernel.outer_params()), + assumptions) + assumptions = isl.BasicSet.read_from_str(kernel.domains[0].get_ctx(), + assumptions_set_str) + + if not isinstance(assumptions, isl.BasicSet): + raise TypeError("'assumptions' must be a BasicSet or a string") + + old_assumptions, new_assumptions = isl.align_two(kernel.assumptions, assumptions) + + return kernel.copy( + assumptions=old_assumptions.params() & new_assumptions.params()) + +# }}} + # {{{ fix_parameter @@ -99,6 +135,13 @@ def _fix_parameter(kernel, name, value): def fix_parameters(kernel, **value_dict): + """Fix the values of the arguments to specific constants. + + *value_dict* consists of *name*/*value* pairs, where *name* will be fixed + to be *value*. *name* may refer to :ref:`domain-parameters` or + :ref:`arguments`. + """ + for name, value in six.iteritems(value_dict): kernel = _fix_parameter(kernel, name, value) diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py index 7623fb8911e2a50b08309c0c6cb45a0dd372bfbb..e599c902227faf8d1292ece2307d097bc8fd7c19 100644 --- a/loopy/transform/subst.py +++ b/loopy/transform/subst.py @@ -289,7 +289,7 @@ class AssignmentToSubstChanger(RuleAwareIdentityMapper): def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None, force_retain_argument=False): """Extract an assignment (to a temporary variable or an argument) - as a :ref:`substituiton-rule`. The temporary may be an array, in + as a :ref:`substitution-rule`. The temporary may be an array, in which case the array indices will become arguments to the substitution rule. 
@@ -349,7 +349,7 @@ def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None, usage_to_definition = {} - for insn in kernel.instructions: + for insn in dep_kernel.instructions: if lhs_name not in insn.read_dependency_names(): continue diff --git a/loopy/version.py b/loopy/version.py index 9ad8ac19bebff7a712e91900815057155205ae57..adc069663503b200bcdd1638c05ae0ffae5f14df 100644 --- a/loopy/version.py +++ b/loopy/version.py @@ -32,4 +32,4 @@ except ImportError: else: _islpy_version = islpy.version.VERSION_TEXT -DATA_MODEL_VERSION = "v18-islpy%s" % _islpy_version +DATA_MODEL_VERSION = "v19-islpy%s" % _islpy_version diff --git a/setup.py b/setup.py index 4fbb4e142b0680e28e88282409655c5a644a6f4e..5ed095315234339709309a1c55ec88c7fdab6bfa 100644 --- a/setup.py +++ b/setup.py @@ -40,8 +40,9 @@ setup(name="loo.py", "pytools>=2016.1", "pymbolic>=2016.2", "cgen>=2016.1", - "islpy>=2016.1", + "islpy>=2016.1.2", "six>=1.8.0", + "colorama", ], extras_require={ diff --git a/test/test_dg.py b/test/test_dg.py index 63a961423d2f750a4c9a25fdcb5fb56a479d8a35..fafef86c35211183ebdaeb75acf2b664a36586a0 100644 --- a/test/test_dg.py +++ b/test/test_dg.py @@ -52,10 +52,10 @@ def test_dg_volume(ctx_factory): "{[n,m,k]: 0<= n,m < Np and 0<= k < K}", ], """ - <> du_drst = sum(m, DrDsDt[n,m]*u[k,m]) - <> dv_drst = sum(m, DrDsDt[n,m]*v[k,m]) - <> dw_drst = sum(m, DrDsDt[n,m]*w[k,m]) - <> dp_drst = sum(m, DrDsDt[n,m]*p[k,m]) + <> du_drst = simul_reduce(sum, m, DrDsDt[n,m]*u[k,m]) + <> dv_drst = simul_reduce(sum, m, DrDsDt[n,m]*v[k,m]) + <> dw_drst = simul_reduce(sum, m, DrDsDt[n,m]*w[k,m]) + <> dp_drst = simul_reduce(sum, m, DrDsDt[n,m]*p[k,m]) # volume flux rhsu[k,n] = dot(drst_dx[k],dp_drst) diff --git a/test/test_linalg.py b/test/test_linalg.py index 9c6803e93f2e53a0c071e0372bf71256854de38a..6aeec63c49a72c784ad5cccf1ee7acc1fcae0f2a 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -30,6 +30,8 @@ import pyopencl as cl import pyopencl.array as cl_array import loopy 
as lp +import logging + from pyopencl.tools import ( # noqa pytest_generate_tests_for_pyopencl as pytest_generate_tests) @@ -61,12 +63,16 @@ def check_float4(result, ref_result): def test_axpy(ctx_factory): + logging.basicConfig(level="INFO") ctx = ctx_factory() n = 3145182 vec = cl_array.vec + if ctx.devices[0].platform.vendor.startswith("Advanced Micro"): + pytest.skip("crashes on AMD 15.12") + for dtype, check, a, b in [ (np.complex64, None, 5, 7), (vec.float4, check_float4, @@ -111,7 +117,8 @@ def test_axpy(ctx_factory): lp.auto_test_vs_ref(seq_knl, ctx, variant(knl), op_count=[np.dtype(dtype).itemsize*n*3/1e9], op_label=["GBytes"], - parameters={"a": a, "b": b, "n": n}, check_result=check) + parameters={"a": a, "b": b, "n": n}, check_result=check, + blacklist_ref_vendors=["Advanced Micro"]) def test_transpose(ctx_factory): @@ -463,7 +470,7 @@ def test_magma_fermi_matrix_mul(ctx_factory): lp.auto_test_vs_ref(seq_knl, ctx, knl, op_count=[2*n**3/1e9], op_label=["GFlops"], - parameters={}) + parameters={}, blacklist_ref_vendors="pocl") def test_image_matrix_mul(ctx_factory): diff --git a/test/test_loopy.py b/test/test_loopy.py index 667d7365d412d9be6d9a51c8d63c9470ffede2f9..606eec7667ca3d76a215c3b487e8c93bc371c36e 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -529,6 +529,9 @@ def test_fuzz_code_generator(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) + if ctx.devices[0].platform.vendor.startswith("Advanced Micro"): + pytest.skip("crashes on AMD 15.12") + #from expr_fuzz import get_fuzz_examples #for expr, var_values in get_fuzz_examples(): for expr, var_values in generate_random_fuzz_examples(50): @@ -941,6 +944,27 @@ def test_double_sum(ctx_factory): n = 20 + knl = lp.make_kernel( + "{[i,j]: 0<=i,j {[i,j,e,m,o,gi]: 0<=i,j,m,o<%d and 0<=e