diff --git a/doc/conf.py b/doc/conf.py index 301604607ef9543b559aa7cfdaf875e9140f1299..74e1aec303972378a28dea0b4186720d6a61019e 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -46,7 +46,7 @@ master_doc = 'index' # General information about the project. project = u'loopy' -copyright = u'2011, Andreas Klöckner' +copyright = u'2016, Andreas Klöckner' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -54,7 +54,10 @@ copyright = u'2011, Andreas Klöckner' # # The short X.Y version. ver_dic = {} -exec(compile(open("../loopy/version.py").read(), "../loopy/version.py", 'exec'), ver_dic) +with open("../loopy/version.py") as vpy_file: + version_py = vpy_file.read() + +exec(compile(version_py, "../loopy/version.py", 'exec'), ver_dic) version = ".".join(str(x) for x in ver_dic["VERSION"]) # The full version, including alpha/beta/rc tags. release = ver_dic["VERSION_TEXT"] @@ -249,4 +252,4 @@ intersphinx_mapping = { 'http://docs.scipy.org/doc/numpy/': None, } -autoclass_content = "both" +autoclass_content = "class" diff --git a/doc/index.rst b/doc/index.rst index 19bbe87727e8d3ffa5aa0cb1c31e4e206bd97b3e..a0bad2898be4aab74dead90aae825e4e0a460c87 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -63,7 +63,10 @@ Please check :ref:`installation` to get started. :maxdepth: 2 tutorial - reference + ref_creation + ref_kernel + ref_transform + ref_other misc Indices and tables diff --git a/doc/misc.rst b/doc/misc.rst index 2f8fac090455eea1be87c3b4eae7bfd72eba24ee..89279d9744d2f1cf4f080618d0d2b2e0f078a723 100644 --- a/doc/misc.rst +++ b/doc/misc.rst @@ -5,7 +5,9 @@ Installation This command should install :mod:`loopy`:: - pip install loopy + pip install loo.py + +(Note the extra "."!) You may need to run this with :command:`sudo`. If you don't already have `pip <https://pypi.python.org/pypi/pip>`_, diff --git a/doc/ref_creation.rst b/doc/ref_creation.rst new file mode 100644 index 0000000000000000000000000000000000000000..92eff09c9e3ecacfd8bb9030a9e4b9f002fefc71 --- /dev/null +++ b/doc/ref_creation.rst @@ -0,0 +1,33 @@ +.. module:: loopy +.. moduleauthor:: Andreas Kloeckner <inform@tiker.net> + +.. _creating-kernels: + +Reference: Creating Kernels +=========================== + +From Loop Domains and Instructions +---------------------------------- + +.. autofunction:: make_kernel + +From Fortran +------------ + +.. autofunction:: parse_fortran + +.. autofunction:: parse_transformed_fortran + +.. autofunction:: c_preprocess + +From Other Kernels +------------------ + +.. autofunction:: fuse_kernels + +To Copy between Data Formats +---------------------------- + +.. autofunction:: make_copy_kernel + +.. vim: tw=75:spell:fdm=marker diff --git a/doc/reference.rst b/doc/ref_kernel.rst similarity index 71% rename from doc/reference.rst rename to doc/ref_kernel.rst index 351a2374a1245bf58296f9d2ee50551050c15f76..33d40385b529f72e54da65238304e87bdb2cddab 100644 --- a/doc/reference.rst +++ b/doc/ref_kernel.rst @@ -1,21 +1,39 @@ -.. _reference: +.. currentmodule:: loopy -Reference Guide -=============== +Reference: Loopy's Model of a Kernel +==================================== -.. module:: loopy -.. moduleauthor:: Andreas Kloeckner <inform@tiker.net> +.. _domain-tree: -This guide defines all functionality exposed by loopy. If you would like -a more gentle introduction, you may consider reading the example-based -:ref:`tutorial` instead. +Loop Domain Tree +---------------- -.. _inames: +.. {{{ + +Example:: + + { [i]: 0<=i<n } + +A kernel's iteration domain is given by a list of :class:`islpy.BasicSet` +instances (which parametrically represent multi-dimensional sets of +tuples of integers). They define the integer values of the loop variables +for which instructions (see below) will be executed. +It is written in :ref:`isl-syntax`. :mod:`loopy` calls the loop variables +*inames*. In this case, *i* is the sole iname. The loop +domain is given as a conjunction of affine equality +and inequality constraints. Integer divisibility constraints (resulting +in strides) are also allowed. In the absence of divisibility +constraints, the loop domain is convex. -Domain Tree ------------ +Note that *n* in the example is not an iname. It is a +:ref:`domain-parameter` that is passed to the kernel by the user. +To accommodate some data-dependent control flow, there is not actually +a single loop domain, but rather a *tree of loop domains*, +allowing more deeply nested domains to depend on inames +introduced by domains closer to the root. +.. _inames: Inames ^^^^^^ @@ -25,161 +43,10 @@ dependency semantics--otherwise e.g. a fetch could happen inside one loop nest, and then the instruction using that fetch could be inside a wholly different loop nest. -Instructions ------------- - -Expressions -^^^^^^^^^^^ - -Loopy's expressions are a slight superset of the expressions supported by -:mod:`pymbolic`. - -* `if` -* `reductions` - * duplication of reduction inames -* complex-valued arithmetic -* tagging of array access and substitution rule use ("$") -* ``indexof``, ``indexof_vec`` - -.. _types: - -Specifying Types ----------------- - -:mod:`loopy` uses the same type system as :mod:`numpy`. (See -:class:`numpy.dtype`) It also uses :mod:`pyopencl` for a registry of -user-defined types and their C equivalents. See :func:`pyopencl.tools.get_or_register_dtype` -and related functions. - -For a string representation of types, all numpy types (e.g. ``float32`` etc.) -are accepted, in addition to what is registered in :mod:`pyopencl`. - -.. _iname-tags: - -Iname Implementation Tags -------------------------- - -=============================== ==================================================== -Tag Meaning -=============================== ==================================================== -``None`` | ``"for"`` Sequential loop -``"l.N"`` Local (intra-group) axis N ("local") -``"g.N"`` Group-number axis N ("group") -``"unr"`` Unroll -``"ilp"`` | ``"ilp.unr"`` Unroll using instruction-level parallelism -``"ilp.seq"`` Realize parallel iname as innermost loop -``"like.INAME"`` Can be used when tagging inames to tag like another -``"unused.g"`` | ``"unused.l"`` Can be to tag as the next unused group/local axis -=============================== ==================================================== - -(Throughout this table, `N` must be replaced by an actual, zero-based number.) - -"ILP" does three things: - -* Restricts loops to be innermost -* Duplicates reduction storage for any reductions nested around ILP usage -* Causes a loop (unrolled or not) to be opened/generated for each - involved instruction - -.. _data-dim-tags: - -Data Axis Tags --------------- - -Data axis tags specify how a multi-dimensional array (which is loopy's -main way of storing data) is represented in (linear, 1D) computer -memory. This storage format is given as a number of "tags", as listed -in the table below. Each axis of an array has a tag corresponding to it. -In the user interface, array dim tags are specified as a tuple of these -tags or a comma-separated string containing them, such as the following:: - - c,vec,sep,c - -The interpretation of these tags is order-dependent, they are read -from left to right. - -===================== ==================================================== -Tag Meaning -===================== ==================================================== -``c`` Nest current axis around the ones that follow -``f`` Nest current axis inside the ones that follow -``N0`` ... ``N9`` Specify an explicit nesting level for this axis -``stride:EXPR`` A fixed stride -``sep`` Implement this axis by mapping to separate arrays -``vec`` Implement this axis as entries in a vector -===================== ==================================================== - -``sep`` and ``vec`` obviously require the number of entries -in the array along their respective axis to be known at code -generation time. - -When the above speaks about 'nesting levels', this means that axes -"nested inside" others are "faster-moving" when viewed from linear -memory. - -In addition, each tag may be followed by a question mark (``?``), -which indicates that if there are more dimension tags specified -than array axes present, that this axis should be omitted. Axes -with question marks are omitted in a left-first manner until the correct -number of dimension tags is achieved. - -Some examples follow, all of which use a three-dimensional array of shape -*(3, M, 4)*. For simplicity, we assume that array entries have size one. - -* ``c,c,c``: The axes will have strides *(M*4, 4, 1)*, - leading to a C-like / row-major layout. - -* ``f,f,f``: The axes will have strides *(1, 3, 3*M)*, - leading to a Fortran-like / row-major layout. - -* ``sep,c,c``: The array will be mapped to three arrays of - shape *(M, 4)*, each with strides *(4, 1)*. - -* ``c,c,vec``: The array will be mapped to an array of - ``float4`` vectors, with (``float4``-based) strides of - *(M, 1)*. - -* ``N1,N0,N2``: The axes will have strides *(M, 1, 3*M)*. - -.. _creating-kernels: - -Creating Kernels ----------------- - -.. autoclass:: auto - -.. _arguments: - -Arguments -^^^^^^^^^ - -.. autoclass:: ValueArg - :members: - :undoc-members: - -.. autoclass:: GlobalArg - :members: - :undoc-members: - -.. autoclass:: ConstantArg - :members: - :undoc-members: - -.. autoclass:: ImageArg - :members: - :undoc-members: - -.. _temporaries: - -Loop domains -^^^^^^^^^^^^ - -TODO: Explain the domain tree - .. _isl-syntax: ISL syntax -~~~~~~~~~~ +^^^^^^^^^^ The general syntax of an ISL set is the following:: @@ -217,27 +84,73 @@ Examples of constructs that are **not** allowed: (**Note:** This may be added in a future version of loopy. For now, loop domains have to be convex.) -Temporary Variables -^^^^^^^^^^^^^^^^^^^ +.. _domain-parameters: -Temporary variables model OpenCL's ``private`` and ``local`` address spaces. Both -have the lifetime of a kernel invocation. +Domain parameters +^^^^^^^^^^^^^^^^^ -.. autoclass:: TemporaryVariable - :members: - :undoc-members: +Domain parameters are identifiers being used in loop domains that are not +*inames*, i.e. they do not define loop variables. In the following domain +specification, *n* is a domain parameter:: + + {[i,j]: 0 <= i,j < n} + +Values of domain parameters arise from + +* being passed to the kernel as :ref:`arguments` + +* being assigned to :ref:`temporaries` to feed into domains + lower in the :ref:`domain-tree`. + +.. _iname-tags: + +Iname Implementation Tags +^^^^^^^^^^^^^^^^^^^^^^^^^ + +=============================== ==================================================== +Tag Meaning +=============================== ==================================================== +``None`` | ``"for"`` Sequential loop +``"l.N"`` Local (intra-group) axis N ("local") +``"g.N"`` Group-number axis N ("group") +``"unr"`` Unroll +``"ilp"`` | ``"ilp.unr"`` Unroll using instruction-level parallelism +``"ilp.seq"`` Realize parallel iname as innermost loop +``"like.INAME"`` Can be used when tagging inames to tag like another +``"unused.g"`` | ``"unused.l"`` Can be to tag as the next unused group/local axis +=============================== ==================================================== + +(Throughout this table, `N` must be replaced by an actual, zero-based number.) + +"ILP" does three things: + +* Restricts loops to be innermost +* Duplicates reduction storage for any reductions nested around ILP usage +* Causes a loop (unrolled or not) to be opened/generated for each + involved instruction + +.. }}} + +.. _instructions: Instructions -^^^^^^^^^^^^ +------------ -.. autoclass:: UniqueName +.. {{{ .. autoclass:: InstructionBase .. _assignments: -Assignments -~~~~~~~~~~~ +Assignment objects +^^^^^^^^^^^^^^^^^^ + +.. autoclass:: Assignment + +.. _assignment-syntax: + +Textual Assignment Syntax +^^^^^^^^^^^^^^^^^^^^^^^^^ The general syntax of an instruction is a simple assignment:: @@ -349,254 +262,206 @@ These are usually key-value pairs. The following attributes are recognized: given instruction groups. See :class:`InstructionBase.conflicts_with_groups`. -Assignment instructions are expressed as instances of the following class: +.. _expression-syntax: -.. autoclass:: ExpressionInstruction +Expressions +^^^^^^^^^^^ -.. _expression-syntax: +Loopy's expressions are a slight superset of the expressions supported by +:mod:`pymbolic`. -Expression Syntax -~~~~~~~~~~~~~~~~~ +* ``if`` +* ``reductions`` + * duplication of reduction inames + * ``reduce`` vs ``simul_reduce`` +* complex-valued arithmetic +* tagging of array access and substitution rule use ("$") +* ``indexof``, ``indexof_vec`` TODO: Functions TODO: Reductions C Block Instructions -~~~~~~~~~~~~~~~~~~~~ +^^^^^^^^^^^^^^^^^^^^ .. autoclass:: CInstruction -.. _substitution-rule: - -Substitution Rules -^^^^^^^^^^^^^^^^^^ - -Syntax of a substitution rule:: - - rule_name(arg1, arg2) := EXPRESSION - -Kernels -^^^^^^^ - -.. class:: LoopKernel +.. }}} -Do not create :class:`LoopKernel` objects directly. Instead, use the following -function, which is responsible for creating kernels: +Data: Arguments and Temporaries +------------------------------- -.. autofunction:: make_kernel +.. {{{ -.. autofunction:: parse_fortran +Kernels operate on two types of data: 'arguments' carrying data into and out of a kernel, +and temporaries with lifetimes tied to the runtime of the kernel. -.. autofunction:: parse_transformed_fortran +.. _arguments: -.. autofunction:: make_copy_kernel +Arguments +^^^^^^^^^ -.. autofunction:: fuse_kernels +.. autoclass:: KernelArgument + :members: + :undoc-members: -.. autofunction:: c_preprocess +.. autoclass:: ValueArg + :members: + :undoc-members: -Transforming Kernels --------------------- +.. autoclass:: GlobalArg + :members: + :undoc-members: -.. _context-matching: +.. autoclass:: ConstantArg + :members: + :undoc-members: -Matching contexts -^^^^^^^^^^^^^^^^^ +.. autoclass:: ImageArg + :members: + :undoc-members: -TODO: Matching instruction tags +.. _temporaries: -.. automodule:: loopy.context_matching +Temporary Variables +^^^^^^^^^^^^^^^^^^^ -.. autofunction:: parse_match +Temporary variables model OpenCL's ``private`` and ``local`` address spaces. Both +have the lifetime of a kernel invocation. -.. autofunction:: parse_stack_match +.. autoclass:: TemporaryVariable + :members: + :undoc-members: -.. currentmodule:: loopy +.. _types: -Wrangling inames +Specifying Types ^^^^^^^^^^^^^^^^ -.. autofunction:: split_iname - -.. autofunction:: chunk_iname - -.. autofunction:: join_inames - -.. autofunction:: tag_inames - -.. autofunction:: duplicate_inames - -.. undocumented .. autofunction:: link_inames - -.. autofunction:: rename_iname - -.. autofunction:: remove_unused_inames - -.. autofunction:: set_loop_priority - -.. autofunction:: split_reduction_inward - -.. autofunction:: split_reduction_outward - -.. autofunction:: affine_map_inames - -.. autofunction:: realize_ilp - -.. autofunction:: find_unused_axis_tag - -Dealing with Parameters -^^^^^^^^^^^^^^^^^^^^^^^ - -.. autofunction:: fix_parameters - -.. autofunction:: assume +:mod:`loopy` uses the same type system as :mod:`numpy`. (See +:class:`numpy.dtype`) It also uses :mod:`pyopencl` for a registry of +user-defined types and their C equivalents. See :func:`pyopencl.tools.get_or_register_dtype` +and related functions. -Dealing with Substitution Rules -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +For a string representation of types, all numpy types (e.g. ``float32`` etc.) +are accepted, in addition to what is registered in :mod:`pyopencl`. -.. autofunction:: extract_subst +.. _data-dim-tags: -.. autofunction:: assignment_to_subst +Data Axis Tags +^^^^^^^^^^^^^^ -.. autofunction:: expand_subst +Data axis tags specify how a multi-dimensional array (which is loopy's +main way of storing data) is represented in (linear, 1D) computer +memory. This storage format is given as a number of "tags", as listed +in the table below. Each axis of an array has a tag corresponding to it. +In the user interface, array dim tags are specified as a tuple of these +tags or a comma-separated string containing them, such as the following:: -.. autofunction:: find_rules_matching + c,vec,sep,c -.. autofunction:: find_one_rule_matching +The interpretation of these tags is order-dependent, they are read +from left to right. -Caching, Precomputation and Prefetching -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +===================== ==================================================== +Tag Meaning +===================== ==================================================== +``c`` Nest current axis around the ones that follow +``f`` Nest current axis inside the ones that follow +``N0`` ... ``N9`` Specify an explicit nesting level for this axis +``stride:EXPR`` A fixed stride +``sep`` Implement this axis by mapping to separate arrays +``vec`` Implement this axis as entries in a vector +===================== ==================================================== -.. autofunction:: precompute +``sep`` and ``vec`` obviously require the number of entries +in the array along their respective axis to be known at code +generation time. -.. autofunction:: add_prefetch +When the above speaks about 'nesting levels', this means that axes +"nested inside" others are "faster-moving" when viewed from linear +memory. -.. autofunction:: buffer_array +In addition, each tag may be followed by a question mark (``?``), +which indicates that if there are more dimension tags specified +than array axes present, that this axis should be omitted. Axes +with question marks are omitted in a left-first manner until the correct +number of dimension tags is achieved. -.. autofunction:: alias_temporaries +Some examples follow, all of which use a three-dimensional array of shape +*(3, M, 4)*. For simplicity, we assume that array entries have size one. -Influencing data access -^^^^^^^^^^^^^^^^^^^^^^^ +* ``c,c,c``: The axes will have strides *(M*4, 4, 1)*, + leading to a C-like / row-major layout. -.. autofunction:: change_arg_to_image +* ``f,f,f``: The axes will have strides *(1, 3, 3*M)*, + leading to a Fortran-like / row-major layout. -.. autofunction:: tag_data_axes +* ``sep,c,c``: The array will be mapped to three arrays of + shape *(M, 4)*, each with strides *(4, 1)*. -.. autofunction:: remove_unused_arguments +* ``c,c,vec``: The array will be mapped to an array of + ``float4`` vectors, with (``float4``-based) strides of + *(M, 1)*. -.. autofunction:: set_array_dim_names +* ``N1,N0,N2``: The axes will have strides *(M, 1, 3*M)*. -Padding -^^^^^^^ +.. }}} -.. autofunction:: split_array_dim +.. _substitution-rule: -.. autofunction:: find_padding_multiple +Substitution Rules +------------------ -.. autofunction:: add_padding +.. {{{ -Manipulating Instructions +Substitution Rule Objects ^^^^^^^^^^^^^^^^^^^^^^^^^ -.. autofunction:: set_instruction_priority - -.. autofunction:: add_dependency - -.. autofunction:: remove_instructions - -.. autofunction:: tag_instructions - -Library interface -^^^^^^^^^^^^^^^^^ - -.. autofunction:: register_reduction_parser - -.. autofunction:: register_preamble_generators - -.. autofunction:: register_symbol_manglers - -.. autofunction:: register_function_manglers - -Arguments -^^^^^^^^^ - -.. autofunction:: set_argument_order - -.. autofunction:: add_dtypes - -.. autofunction:: infer_unknown_types - -.. autofunction:: add_and_infer_dtypes - -.. autofunction:: rename_argument - -Batching -^^^^^^^^ +.. autoclass:: SubstitutionRule -.. autofunction:: to_batched +.. _subst-rule-syntax: -Finishing up -^^^^^^^^^^^^ +Textual Syntax for Substitution Rules +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. autofunction:: generate_loop_schedules - -.. autofunction:: get_one_scheduled_kernel - -.. autofunction:: generate_code - -Running -------- - -.. autoclass:: CompiledKernel - -Automatic Testing ------------------ - -.. autofunction:: auto_test_vs_ref - -Troubleshooting ---------------- - -Printing :class:`LoopKernel` objects -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -If you're confused about things loopy is referring to in an error message or -about the current state of the :class:`LoopKernel` you are transforming, the -following always works:: - - print kernel +Syntax of a substitution rule:: -(And it yields a human-readable--albeit terse--representation of *kernel*.) + rule_name(arg1, arg2) := EXPRESSION -.. autofunction:: preprocess_kernel +.. }}} -.. autofunction:: get_dot_dependency_graph +Kernel Options +-------------- -.. autofunction:: show_dependency_graph +.. autoclass:: Options -Options +Targets ------- -.. autoclass:: Options +.. automodule:: loopy.target -.. autofunction:: set_options +Helper values +------------- -Controlling caching -------------------- +.. {{{ -.. autofunction:: set_caching_enabled +.. autoclass:: auto -.. autoclass:: CacheMode +.. autoclass:: UniqueName -Obtaining Kernel Statistics ---------------------------- +.. }}} -.. autofunction:: get_op_poly +The Kernel Object +----------------- -.. autofunction:: get_gmem_access_poly +Do not create :class:`LoopKernel` objects directly. Instead, refer to +:ref:`creating-kernels`. -.. autofunction:: sum_mem_access_to_bytes +.. autoclass:: LoopKernel -.. autofunction:: get_barrier_poly +.. autoclass:: kernel_state + :members: + :undoc-members: -.. vim: tw=75:spell +.. vim: tw=75:spell:fdm=marker diff --git a/doc/ref_other.rst b/doc/ref_other.rst new file mode 100644 index 0000000000000000000000000000000000000000..71d6c54b11dcd15977bdb375cea2207d881b5696 --- /dev/null +++ b/doc/ref_other.rst @@ -0,0 +1,46 @@ +Reference: Other Functionality +============================== + +Obtaining Kernel Performance Statistics +--------------------------------------- + +.. automodule:: loopy.statistics + +Controlling caching +------------------- + +.. autofunction:: set_caching_enabled + +.. autoclass:: CacheMode + +Running Kernels +--------------- + +In addition to simply calling kernels using :class:`LoopKernel.__call__`, +the following underlying functionality may be used: + +.. autoclass:: CompiledKernel + +Automatic Testing +----------------- + +.. autofunction:: auto_test_vs_ref + +Troubleshooting +--------------- + +Printing :class:`LoopKernel` objects +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If you're confused about things loopy is referring to in an error message or +about the current state of the :class:`LoopKernel` you are transforming, the +following always works:: + + print(kernel) + +(And it yields a human-readable--albeit terse--representation of *kernel*.) + +.. autofunction:: get_dot_dependency_graph + +.. autofunction:: show_dependency_graph + diff --git a/doc/ref_transform.rst b/doc/ref_transform.rst new file mode 100644 index 0000000000000000000000000000000000000000..4c3c24873945dc0d87ec414ff7870fafd0a5bda1 --- /dev/null +++ b/doc/ref_transform.rst @@ -0,0 +1,135 @@ +.. _reference-transform: + +Reference: Transforming Kernels +=============================== + +Dealing with Parameters +----------------------- + +.. automodule:: loopy.transform.parameter + +Wrangling inames +---------------- + +.. automodule:: loopy.transform.iname + +Dealing with Substitution Rules +------------------------------- + +.. currentmodule:: loopy + +.. autofunction:: extract_subst + +.. autofunction:: assignment_to_subst + +.. autofunction:: expand_subst + +.. autofunction:: find_rules_matching + +.. autofunction:: find_one_rule_matching + +Caching, Precomputation and Prefetching +--------------------------------------- + +.. autofunction:: precompute + +.. autofunction:: add_prefetch + +.. autofunction:: buffer_array + +.. autofunction:: alias_temporaries + +Influencing data access +----------------------- + +.. autofunction:: change_arg_to_image + +.. autofunction:: tag_data_axes + +.. autofunction:: remove_unused_arguments + +.. autofunction:: set_array_dim_names + +Padding Data +------------ + +.. autofunction:: split_array_dim + +.. autofunction:: find_padding_multiple + +.. autofunction:: add_padding + +Manipulating Instructions +------------------------- + +.. autofunction:: set_instruction_priority + +.. autofunction:: add_dependency + +.. autofunction:: remove_instructions + +.. autofunction:: tag_instructions + +Registering Library Routines +---------------------------- + +.. autofunction:: register_reduction_parser + +.. autofunction:: register_preamble_generators + +.. autofunction:: register_symbol_manglers + +.. autofunction:: register_function_manglers + +Modifying Arguments +------------------- + +.. autofunction:: set_argument_order + +.. autofunction:: add_dtypes + +.. autofunction:: infer_unknown_types + +.. autofunction:: add_and_infer_dtypes + +.. autofunction:: rename_argument + +Creating Batches of Operations +------------------------------ + +.. automodule:: loopy.transform.batch + +Finishing up +------------ + +.. currentmodule:: loopy + +.. autofunction:: preprocess_kernel + +.. autofunction:: generate_loop_schedules + +.. autofunction:: get_one_scheduled_kernel + +.. autofunction:: generate_code + +Setting options +--------------- + +.. autofunction:: set_options + +.. _context-matching: + +Matching contexts +----------------- + +TODO: Matching instruction tags + +.. automodule:: loopy.context_matching + +.. autofunction:: parse_match + +.. autofunction:: parse_stack_match + + +.. vim: tw=75:spell + diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 24ec0ce4fb7a140fb70e65a8793edcf7e03b2042..4275474d9ce74e04aa3c7ae69356f0672db2128c 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -96,6 +96,7 @@ always see loopy's view of a kernel by printing it. .. doctest:: + >>> knl = lp.set_options(knl, allow_terminal_colors=False) >>> print(knl) --------------------------------------------------------------------------- KERNEL: loopy_kernel @@ -245,8 +246,6 @@ call :func:`loopy.generate_code`: .. doctest:: >>> typed_knl = lp.add_dtypes(knl, dict(a=np.float32)) - >>> typed_knl = lp.preprocess_kernel(typed_knl, device=ctx.devices[0]) - >>> typed_knl = lp.get_one_scheduled_kernel(typed_knl) >>> code, _ = lp.generate_code(typed_knl) >>> print(code) #define lid(N) ((int) get_local_id(N)) @@ -1186,8 +1185,12 @@ across the remaining axis of the workgroup would emerge. TODO -Obtaining Kernel Statistics ---------------------------- +.. }}} + +Obtaining Performance Statistics +-------------------------------- + +.. {{{ Operations, array access, and barriers can all be counted, which may facilitate performance prediction and optimization of a :mod:`loopy` kernel. diff --git a/examples/python/ispc-harness.py b/examples/python/ispc-harness.py deleted file mode 100644 index 637fd41fcdcdbcc7a464f8209680931adfd014f5..0000000000000000000000000000000000000000 --- a/examples/python/ispc-harness.py +++ /dev/null @@ -1,183 +0,0 @@ -import loopy as lp -import numpy as np -import numpy.linalg as la -import ctypes -import os -from time import time - - -# {{{ temporary directory - -class TemporaryDirectory(object): - """Create and return a temporary directory. This has the same - behavior as mkdtemp but can be used as a context manager. For - example: - - with TemporaryDirectory() as tmpdir: - ... - - Upon exiting the context, the directory and everything contained - in it are removed. - """ - - # Yanked from - # https://hg.python.org/cpython/file/3.3/Lib/tempfile.py - - # Handle mkdtemp raising an exception - name = None - _closed = False - - def __init__(self, suffix="", prefix="tmp", dir=None): - from tempfile import mkdtemp - self.name = mkdtemp(suffix, prefix, dir) - - def __repr__(self): - return "<{} {!r}>".format(self.__class__.__name__, self.name) - - def __enter__(self): - return self.name - - def cleanup(self, _warn=False): - import warnings - if self.name and not self._closed: - from shutil import rmtree - try: - rmtree(self.name) - except (TypeError, AttributeError) as ex: - if "None" not in '%s' % (ex,): - raise - self._rmtree(self.name) - self._closed = True - if _warn and warnings.warn: - warnings.warn("Implicitly cleaning up {!r}".format(self)) - - def __exit__(self, exc, value, tb): - self.cleanup() - - def __del__(self): - # Issue a ResourceWarning if implicit cleanup needed - self.cleanup(_warn=True) - -# }}} - - -# {{{ build_ispc_shared_lib - -def build_ispc_shared_lib( - cwd, ispc_sources, cxx_sources, - ispc_options=[], cxx_options=[]): - from os.path import join - - ispc_source_names = [] - for name, contents in ispc_sources: - ispc_source_names.append(name) - - with open(join(cwd, name), "w") as srcf: - srcf.write(contents) - - cxx_source_names = [] - for name, contents in cxx_sources: - cxx_source_names.append(name) - - with open(join(cwd, name), "w") as srcf: - srcf.write(contents) - - from subprocess import check_call - - check_call( - ["ispc", - "--pic", - "--opt=force-aligned-memory", - "--target=avx2-i32x8", - "-o", "ispc.o"] - + ispc_options - + list(ispc_source_names), - cwd=cwd) - - check_call( - [ - "g++", - "-shared", "-fopenmp", "-Wl,--export-dynamic", - "-fPIC", - "-oshared.so", - "ispc.o", - ] - + cxx_options - + list(cxx_source_names), - cwd=cwd) - -# }}} - - -def cptr_from_numpy(obj): - ary_intf = getattr(obj, "__array_interface__", None) - if ary_intf is None: - raise RuntimeError("no array interface") - - buf_base, is_read_only = ary_intf["data"] - return ctypes.c_void_p(buf_base + ary_intf.get("offset", 0)) - - -def main(): - with open("tasksys.cpp", "r") as ts_file: - tasksys_source = ts_file.read() - - from loopy.target.ispc import ISPCTarget - stream_knl = lp.make_kernel( - "{[i]: 0<=i<n}", - "z[i] = a*x[i] + y[i]", - target=ISPCTarget()) - - stream_dtype = np.float64 - stream_ctype = ctypes.c_double - - stream_knl = lp.add_and_infer_dtypes(stream_knl, { - "a": stream_dtype, - "x": stream_dtype, - "y": stream_dtype - }) - - stream_knl = lp.assume(stream_knl, "n>0") - stream_knl = lp.split_iname(stream_knl, "i", 8, inner_tag="l.0") - stream_knl = lp.split_iname(stream_knl, - "i_outer", 2**22, outer_tag="g.0") - stream_knl = lp.preprocess_kernel(stream_knl) - stream_knl = lp.get_one_scheduled_kernel(stream_knl) - stream_knl = lp.set_argument_order(stream_knl, "n,a,x,y,z") - ispc_code, arg_info = lp.generate_code(stream_knl) - - with TemporaryDirectory() as tmpdir: - build_ispc_shared_lib( - tmpdir, - [("stream.ispc", ispc_code)], - [("tasksys.cpp", tasksys_source)]) - - print(ispc_code) - knl_lib = ctypes.cdll.LoadLibrary(os.path.join(tmpdir, "shared.so")) - - n = 2**28 - a = 5 - x = np.empty(n, dtype=stream_dtype) - y = np.empty(n, dtype=stream_dtype) - z = np.empty(n, dtype=stream_dtype) - - nruns = 30 - start_time = time() - for irun in range(nruns): - knl_lib.loopy_kernel( - ctypes.c_int(n), stream_ctype(a), - cptr_from_numpy(x), - cptr_from_numpy(y), - cptr_from_numpy(z)) - elapsed = time() - start_time - - print(1e-9*3*x.nbytes*nruns/elapsed, "GB/s") - - print(la.norm(z-a*x+y)) - - - -if __name__ == "__main__": - main() - -# vim: foldmethod=marker diff --git a/examples/python/ispc-stream-harness.py b/examples/python/ispc-stream-harness.py new file mode 100644 index 0000000000000000000000000000000000000000..a402896c85baa49bc23bb5d770607cf73641c273 --- /dev/null +++ b/examples/python/ispc-stream-harness.py @@ -0,0 +1,156 @@ +import loopy as lp +import numpy as np +import numpy.linalg as la +import ctypes +import ctypes.util +import os +from time import time +from tempfile import TemporaryDirectory + +from loopy.tools import (empty_aligned, address_from_numpy, + build_ispc_shared_lib, cptr_from_numpy) + + +def transform(knl, vars, stream_dtype): + vars = [v.strip() for v in vars.split(",")] + knl = lp.assume(knl, "n>0") + knl = lp.split_iname( + knl, "i", 2**18, outer_tag="g.0", slabs=(0, 1)) + knl = lp.split_iname(knl, "i_inner", 8, inner_tag="l.0") + + knl = lp.add_and_infer_dtypes(knl, { + var: stream_dtype + for var in vars + }) + + knl = lp.set_argument_order(knl, vars + ["n"]) + + return knl + + +def gen_code(knl): + knl = lp.preprocess_kernel(knl) + knl = lp.get_one_scheduled_kernel(knl) + ispc_code, arg_info = lp.generate_code(knl) + + return ispc_code + + +NRUNS = 10 +ALIGN_TO = 4096 +ARRAY_SIZE = 2**28 + +if 0: + STREAM_DTYPE = np.float64 + STREAM_CTYPE = ctypes.c_double +else: + STREAM_DTYPE = np.float32 + STREAM_CTYPE = ctypes.c_float + +if 1: + INDEX_DTYPE = np.int32 + INDEX_CTYPE = ctypes.c_int +else: + INDEX_DTYPE = np.int64 + INDEX_CTYPE = ctypes.c_longlong + + +def main(): + with open("tasksys.cpp", "r") as ts_file: + tasksys_source = ts_file.read() + + def make_knl(name, insn, vars): + knl = lp.make_kernel( + "{[i]: 0<=i<n}", + insn, + target=lp.ISPCTarget(), index_dtype=INDEX_DTYPE, + name="stream_"+name+"_tasks") + + knl = transform(knl, vars, STREAM_DTYPE) + return knl + + init_knl = make_knl("init", """ + a[i] = 1 + b[i] = 2 + c[i] = 0 + """, "a,b,c") + triad_knl = make_knl("triad", """ + a[i] = b[i] + scalar * c[i] + """, "a,b,c,scalar") + + with TemporaryDirectory() as tmpdir: + ispc_code = gen_code(init_knl) + gen_code(triad_knl) + print(ispc_code) + + build_ispc_shared_lib( + tmpdir, + [("stream.ispc", ispc_code)], + [("tasksys.cpp", tasksys_source)], + cxx_options=["-g", "-fopenmp", "-DISPC_USE_OMP"], + ispc_options=([ + #"-g", "--no-omit-frame-pointer", + "--target=avx2-i32x8", + "--opt=force-aligned-memory", + "--opt=disable-loop-unroll", + #"--opt=fast-math", + #"--opt=disable-fma", + ] + + (["--addressing=64"] if INDEX_DTYPE == np.int64 else []) + ), + #ispc_bin="/home/andreask/pack/ispc-v1.9.0-linux/ispc", + quiet=False, + ) + + knl_lib = ctypes.cdll.LoadLibrary(os.path.join(tmpdir, "shared.so")) + + scalar = 5 + + a = empty_aligned(ARRAY_SIZE, dtype=STREAM_DTYPE, n=ALIGN_TO) + b = empty_aligned(ARRAY_SIZE, dtype=STREAM_DTYPE, n=ALIGN_TO) + c = empty_aligned(ARRAY_SIZE, dtype=STREAM_DTYPE, n=ALIGN_TO) + + print( + hex(address_from_numpy(a)), + hex(address_from_numpy(b)), + hex(address_from_numpy(c))) + assert address_from_numpy(a) % ALIGN_TO == 0 + assert address_from_numpy(b) % ALIGN_TO == 0 + assert address_from_numpy(c) % ALIGN_TO == 0 + + knl_lib.stream_init_tasks( + cptr_from_numpy(a), + cptr_from_numpy(b), + cptr_from_numpy(c), + INDEX_CTYPE(ARRAY_SIZE), + ) + + def call_kernel(): + knl_lib.stream_triad_tasks( + cptr_from_numpy(a), + cptr_from_numpy(b), + cptr_from_numpy(c), + STREAM_CTYPE(scalar), + INDEX_CTYPE(ARRAY_SIZE), + ) + + call_kernel() + call_kernel() + + start_time = time() + + for irun in range(NRUNS): + call_kernel() + + elapsed = time() - start_time + + print(elapsed/NRUNS) + + print(1e-9*3*a.nbytes*NRUNS/elapsed, "GB/s") + + assert la.norm(a-b+scalar*c, np.inf) < np.finfo(STREAM_DTYPE).eps * 10 + + +if __name__ == "__main__": + main() + +# vim: foldmethod=marker diff --git a/examples/python/run-ispc-harness.sh b/examples/python/run-ispc-harness.sh new file mode 100755 index 0000000000000000000000000000000000000000..f39a51fefee478b8023ded65a2ce3c8de018c852 --- /dev/null +++ b/examples/python/run-ispc-harness.sh @@ -0,0 +1,3 @@ +#! /bin/bash + +OMP_PLACES=cores OMP_DISPLAY_ENV=true OMP_SCHEDULE=static python "$@" diff --git a/loopy/__init__.py b/loopy/__init__.py index bf0a2be1bae22f93bad784ea59874fb305f898f8..5e3ad508596a20cce8a084e4d70604cdafa47c79 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -38,11 +38,13 @@ from loopy.library.function import ( from loopy.kernel.data import ( auto, + KernelArgument, ValueArg, GlobalArg, ConstantArg, ImageArg, InstructionBase, Assignment, ExpressionInstruction, CInstruction, - TemporaryVariable) + TemporaryVariable, + SubstitutionRule) -from loopy.kernel import LoopKernel +from loopy.kernel import LoopKernel, kernel_state from loopy.kernel.tools import ( get_dot_dependency_graph, show_dependency_graph, @@ -54,10 +56,12 @@ from loopy.library.reduction import register_reduction_parser # {{{ import transforms from loopy.transform.iname import ( - assume, set_loop_priority, + set_loop_priority, split_iname, chunk_iname, join_inames, tag_inames, duplicate_inames, rename_iname, link_inames, remove_unused_inames, - affine_map_inames, find_unused_axis_tag) + split_reduction_inward, split_reduction_outward, + affine_map_inames, find_unused_axis_tag, + make_reduction_inames_unique) from loopy.transform.instruction import ( find_instructions, map_instructions, @@ -79,8 +83,7 @@ from loopy.transform.buffer import buffer_array from loopy.transform.fusion import fuse_kernels from loopy.transform.arithmetic import ( - split_reduction_inward, - split_reduction_outward, fold_constants, + fold_constants, collect_common_factors_on_increment) from loopy.transform.padding import ( @@ -89,7 +92,7 @@ from loopy.transform.padding import ( from loopy.transform.ilp import realize_ilp from loopy.transform.batch import to_batched -from loopy.transform.parameter import fix_parameters +from loopy.transform.parameter import assume, fix_parameters # }}} @@ -107,15 +110,24 @@ from loopy.auto_test import auto_test_vs_ref from loopy.frontend.fortran import (c_preprocess, parse_transformed_fortran, parse_fortran) +from loopy.target import TargetBase +from loopy.target.c import CTarget +from loopy.target.cuda import CudaTarget +from loopy.target.opencl import OpenCLTarget +from loopy.target.pyopencl import PyOpenCLTarget +from loopy.target.ispc import ISPCTarget + __all__ = [ "TaggedVariable", "Reduction", "LinearSubscript", "auto", - "LoopKernel", + "LoopKernel", "kernel_state", + "KernelArgument", "ValueArg", "ScalarArg", "GlobalArg", "ArrayArg", "ConstantArg", "ImageArg", "TemporaryVariable", + "SubstitutionRule", "InstructionBase", "Assignment", "ExpressionInstruction", "CInstruction", @@ -127,11 +139,13 @@ __all__ = [ # {{{ transforms - "assume", "set_loop_priority", + "set_loop_priority", "split_iname", "chunk_iname", "join_inames", "tag_inames", "duplicate_inames", "rename_iname", "link_inames", "remove_unused_inames", + "split_reduction_inward", "split_reduction_outward", "affine_map_inames", "find_unused_axis_tag", + "make_reduction_inames_unique", "add_prefetch", "change_arg_to_image", "tag_data_axes", "set_array_dim_names", "remove_unused_arguments", @@ -148,7 +162,6 @@ __all__ = [ "precompute", "buffer_array", "fuse_kernels", - "split_reduction_inward", "split_reduction_outward", "fold_constants", "collect_common_factors_on_increment", "split_array_dim", "split_arg_axis", "find_padding_multiple", @@ -158,7 +171,7 @@ __all__ = [ "to_batched", - "fix_parameters", + "assume", "fix_parameters", # }}} @@ -186,6 +199,9 @@ __all__ = [ "LoopyError", "LoopyWarning", + "TargetBase", "CTarget", "CudaTarget", "OpenCLTarget", + "PyOpenCLTarget", "ISPCTarget", + # {{{ from this file "register_preamble_generators", diff --git a/loopy/auto_test.py b/loopy/auto_test.py index d51f669ca1e9d0afb53d8c39a7a784e772075767..607d091b98b7a1919dacbe10f0280ea35c666a43 100644 --- a/loopy/auto_test.py +++ b/loopy/auto_test.py @@ -1,7 +1,4 @@ -from __future__ import division -from __future__ import absolute_import -from six.moves import range -from six.moves import zip +from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -25,6 +22,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +from six.moves import range, zip from pytools import Record from warnings import warn @@ -313,15 +311,28 @@ def _default_check_result(result, ref_result): # {{{ find device for reference test -def _enumerate_cl_devices_for_ref_test(): +def _enumerate_cl_devices_for_ref_test(blacklist_ref_vendors): import pyopencl as cl noncpu_devs = [] cpu_devs = [] + if isinstance(blacklist_ref_vendors, str): + blacklist_ref_vendors = blacklist_ref_vendors.split(",") + for pf in cl.get_platforms(): for dev in pf.get_devices(): + if any(bl in dev.platform.vendor + for bl in blacklist_ref_vendors): + print("blacklist", dev, blacklist_ref_vendors) + continue + if dev.type & cl.device_type.CPU: + if "Intel" in dev.platform.vendor: + # Sorry, Intel, your CPU CL has gotten too crashy of late. + # (Feb 2016) + continue + cpu_devs.append(dev) else: noncpu_devs.append(dev) @@ -351,7 +362,7 @@ def auto_test_vs_ref( dump_binary=False, fills_entire_output=None, do_check=True, check_result=None, max_test_kernel_count=1, - quiet=False): + quiet=False, blacklist_ref_vendors=[]): """Compare results of `ref_knl` to the kernels generated by scheduling *test_knl*. @@ -406,7 +417,7 @@ def auto_test_vs_ref( ref_errors = [] - for dev in _enumerate_cl_devices_for_ref_test(): + for dev in _enumerate_cl_devices_for_ref_test(blacklist_ref_vendors): ref_ctx = cl.Context([dev]) ref_queue = cl.CommandQueue(ref_ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) @@ -417,6 +428,9 @@ def auto_test_vs_ref( ref_sched_kernel = knl break + logger.info("%s (ref): trying %s for the reference calculation" % ( + ref_knl.name, dev)) + ref_compiled = CompiledKernel(ref_ctx, ref_sched_kernel) if not quiet and print_ref_code: print(75*"-") diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 2e136d7bb5264238f024775b4b9b9d31479bb914..91af52cde0000ce811172be53bf21e13ce5fde97 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -464,10 +464,15 @@ def generate_code(kernel, device=None): warn("passing 'device' to generate_code() is deprecated", DeprecationWarning, stacklevel=2) + from loopy.kernel import kernel_state + if kernel.state == kernel_state.INITIAL: + from loopy.preprocess import preprocess_kernel + kernel = preprocess_kernel(kernel) + if kernel.schedule is None: from loopy.schedule import get_one_scheduled_kernel kernel = get_one_scheduled_kernel(kernel) - from loopy.kernel import kernel_state + if kernel.state != kernel_state.SCHEDULED: raise LoopyError("cannot generate code for a kernel that has not been " "scheduled") diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index 6d0b2ca60504060e677d1e3bb941b6c3ccbb3fc2..70530d3a90cbdd5a48395034dd05000f6781b473 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -424,13 +424,15 @@ def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state): from cgen import Comment result.append(Comment(cmt)) - from cgen import Initializer, POD, Const, Line + from loopy.codegen import POD + from cgen import Initializer, Const, Line from loopy.symbolic import aff_to_expr if (static_ubound - static_lbound).plain_is_zero(): # single-trip, generate just a variable assignment, not a loop result.append(gen_code_block([ - Initializer(Const(POD(kernel.index_dtype, loop_iname)), + Initializer( + Const(POD(kernel.target, kernel.index_dtype, loop_iname)), ecm(aff_to_expr(static_lbound), PREC_NONE, "i")), Line(), inner, diff --git a/loopy/compiled.py b/loopy/compiled.py index 6d4396b5a11ef99bcf7e8e0b03b5d9d7d8fb6d88..32536021dcfb3a1460f23af7e066aff9bf772f46 100644 --- a/loopy/compiled.py +++ b/loopy/compiled.py @@ -791,8 +791,16 @@ class _CLKernelInfo(Record): class CompiledKernel: + """An object connecting a kernel to a :class:`pyopencl.Context` + for execution. + + .. automethod:: __init__ + .. automethod:: __call__ + """ + def __init__(self, context, kernel): """ + :arg context: a :class:`pyopencl.Context` :arg kernel: may be a loopy.LoopKernel, a generator returning kernels (a warning will be issued if more than one is returned). If the kernel has not yet been loop-scheduled, that is done, too, with no @@ -908,10 +916,12 @@ class CompiledKernel: def __call__(self, queue, **kwargs): """ - :arg allocator: - :arg wait_for: - :arg out_host: - + :arg allocator: a callable passed a byte count and returning + a :class:`pyopencl.Buffer`. A :class:`pyopencl` allocator + maybe. + :arg wait_for: A list of :class:`pyopencl.Event` instances + for which to wait. + :arg out_host: :class:`bool` Decides whether output arguments (i.e. arguments written by the kernel) are to be returned as :mod:`numpy` arrays. *True* for yes, *False* for no. diff --git a/loopy/frontend/fortran/__init__.py b/loopy/frontend/fortran/__init__.py index f2bbb288249332dc6a333f95fa2600fa48967e60..561a29cc816ba25372fb957311420b8c18291ac7 100644 --- a/loopy/frontend/fortran/__init__.py +++ b/loopy/frontend/fortran/__init__.py @@ -237,7 +237,7 @@ def parse_transformed_fortran(source, free_form=True, strict=True, def parse_fortran(source, filename="<floopy code>", free_form=True, strict=True, - auto_dependencies=True): + auto_dependencies=True, target=None): """ :returns: a list of :class:`loopy.LoopKernel` objects """ @@ -257,7 +257,8 @@ def parse_fortran(source, filename="<floopy code>", free_form=True, strict=True, "and returned invalid data (Sorry!)") from loopy.frontend.fortran.translator import F2LoopyTranslator - f2loopy = F2LoopyTranslator(filename, auto_dependencies=auto_dependencies) + f2loopy = F2LoopyTranslator(filename, auto_dependencies=auto_dependencies, + target=target) f2loopy(tree) return f2loopy.make_kernels() diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index 53ea602f1d9b0a5b4fd9b04bc2486d7daae0ca5f..68dd5fa952c2dca519397980288ef56c1b5fdd5f 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -198,10 +198,11 @@ class Scope(object): # {{{ translator class F2LoopyTranslator(FTreeWalkerBase): - def __init__(self, filename, auto_dependencies): + def __init__(self, filename, auto_dependencies, target=None): FTreeWalkerBase.__init__(self) self.auto_dependencies = auto_dependencies + self.target = target self.scope_stack = [] @@ -679,6 +680,7 @@ class F2LoopyTranslator(FTreeWalkerBase): name=sub.subprogram_name, default_order="F", index_dtype=self.index_dtype, + target=self.target, ) from loopy.loop import fuse_loop_domains diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 42f83bfdf45d030a77e3fb045460f064ff58cbc9..fb081a959523114ee5fc823eec59420f4adf3457 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -90,11 +90,16 @@ class LoopKernel(RecordWithoutPickling): .. attribute:: domains a list of :class:`islpy.BasicSet` instances + representing the :ref:`domain-tree`. .. attribute:: instructions + + A list of :class:`InstructionBase` instances, e.g. + :class:`Assignment`. See :ref:`instructions`. + .. attribute:: args - A list of :class:`loopy.kernel.data.KernelArgument` + A list of :class:`loopy.KernelArgument` .. attribute:: schedule @@ -108,7 +113,7 @@ class LoopKernel(RecordWithoutPickling): .. attribute:: temporary_variables A :class:`dict` of mapping variable names to - :class:`loopy.kernel.data.TemporaryVariable` + :class:`loopy.TemporaryVariable` instances. .. attribute:: iname_to_tag @@ -155,7 +160,7 @@ class LoopKernel(RecordWithoutPickling): .. attribute:: target - A subclass of :class:`loopy.target.TargetBase`. + A subclass of :class:`loopy.TargetBase`. """ # {{{ constructor @@ -1007,6 +1012,9 @@ class LoopKernel(RecordWithoutPickling): printed_insn_ids = set() + Fore = self.options._fore + Style = self.options._style + def print_insn(insn): if insn.id in printed_insn_ids: return @@ -1030,7 +1038,7 @@ class LoopKernel(RecordWithoutPickling): loop_list = ",".join(sorted(kernel.insn_inames(insn))) - options = [insn.id] + options = [Fore.GREEN+insn.id+Style.RESET_ALL] if insn.priority: options.append("priority=%d" % insn.priority) if insn.tags: @@ -1043,12 +1051,15 @@ class LoopKernel(RecordWithoutPickling): if len(loop_list) > loop_list_width: lines.append("[%s]" % loop_list) lines.append("%s%s <- %s # %s" % ( - (loop_list_width+2)*" ", lhs, - rhs, ", ".join(options))) + (loop_list_width+2)*" ", Fore.BLUE+lhs+Style.RESET_ALL, + Fore.MAGENTA+rhs+Style.RESET_ALL, + ", ".join(options))) else: lines.append("[%s]%s%s <- %s # %s" % ( loop_list, " "*(loop_list_width-len(loop_list)), - lhs, rhs, ",".join(options))) + Fore.BLUE + lhs + Style.RESET_ALL, + Fore.MAGENTA+rhs+Style.RESET_ALL, + ",".join(options))) lines.extend(trailing) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index f7c478dff296f26bdd342670b319104e7c3ef694..6a5d523e61c27c068a380b7de6c206147d7bb271 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1011,8 +1011,22 @@ def resolve_wildcard_deps(knl): def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): """User-facing kernel creation entrypoint. - :arg domains: :class:`islpy.BasicSet` + :arg domains: + + A list of :class:`islpy.BasicSet` (i.e. convex set) instances + representing the :ref:`domain-tree`. May also be a list of strings + which will be parsed into such instances according to :ref:`isl-syntax`. + :arg instructions: + + A list of :class:`Assignment` (or other :class:`InstructionBase` + subclasses), possibly intermixed with instances of + :class:`SubstitutionRule`. This same list may also contain strings + which will be parsed into such objects using the + :ref:`assignment-syntax` and the :ref:`subst-rule-syntax`. May also be + a single multi-line string which will be split into lines and then + parsed. + :arg kernel_data: A list of :class:`ValueArg`, :class:`GlobalArg`, ... (etc.) instances. @@ -1054,7 +1068,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): strides. They are expanded only upon kernel creation. :arg default_order: "C" (default) or "F" :arg default_offset: 0 or :class:`loopy.auto`. The default value of - *offset* in :attr:`loopy.kernel.data.GlobalArg` for guessed arguments. + *offset* in :attr:`GlobalArg` for guessed arguments. Defaults to 0. :arg function_manglers: list of functions of signature (name, arg_dtypes) returning a tuple (result_dtype, c_name) @@ -1074,8 +1088,8 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): to silence :arg options: an instance of :class:`loopy.Options` or an equivalent string representation - :arg target: an instance of :class:`loopy.target.TargetBase`, or *None*, - to use the default target. (see :func:`loopy.set_default_target`) + :arg target: an instance of :class:`loopy.TargetBase`, or *None*, + to use the default target. """ defines = kwargs.pop("defines", {}) diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 9d0848788a09b7a6b914a1ce5569faf3102f18b2..5b0cf57e50298febe0580da5fb0efe5756557527 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -185,6 +185,8 @@ def parse_tag(tag): # {{{ arguments class KernelArgument(Record): + """Base class for all argument types""" + def __init__(self, **kwargs): kwargs["name"] = intern(kwargs.pop("name")) @@ -418,7 +420,9 @@ class SubstitutionRule(Record): # {{{ base class class InstructionBase(Record): - """ + """A base class for all types of instruction that can occur in + a kernel. + .. attribute:: id An (otherwise meaningless) identifier that is unique within @@ -427,7 +431,7 @@ class InstructionBase(Record): .. attribute:: depends_on a :class:`frozenset` of :attr:`id` values of :class:`Instruction` instances - that *must* be executed before this one. Note that + that *must* be executed before this one. Note that :func:`loopy.preprocess_kernel` (usually invoked automatically) augments this by adding dependencies on any writes to temporaries read by this instruction. @@ -493,6 +497,14 @@ class InstructionBase(Record): A tuple of string identifiers that can be used to identify groups of instructions. + + .. automethod:: __init__ + .. automethod:: assignees_and_indices + .. automethod:: with_transformed_expressions + .. automethod:: write_dependency_names + .. automethod:: dependency_names + .. automethod:: assignee_var_names + .. automethod:: copy """ fields = set("id depends_on depends_on_is_final " @@ -568,10 +580,12 @@ class InstructionBase(Record): predicates=predicates, tags=tags) + # legacy @property def insn_deps(self): return self.depends_on + # legacy @property def insn_deps_is_final(self): return self.depends_on_is_final @@ -740,6 +754,8 @@ class Assignment(InstructionBase): if not *None*, a type that will be assigned to the new temporary variable created from the assignee + + .. automethod:: __init__ """ fields = InstructionBase.fields | \ diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 46b301b47c45430078f554a178448687eb490e94..cb72f283e2151fd8d097f19eb4e4876cc78ccd43 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -770,7 +770,8 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): If *axis* is None, find a suitable axis automatically. """ try: - desired_length = kernel.get_constant_iname_length(iname) + with isl.SuppressedWarnings(kernel.isl_context): + desired_length = kernel.get_constant_iname_length(iname) except isl.Error: # Likely unbounded, automatic assignment is not # going to happen for this iname. @@ -882,7 +883,8 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): def get_iname_length(iname): try: - return kernel.get_constant_iname_length(iname) + with isl.SuppressedWarnings(kernel.isl_context): + return kernel.get_constant_iname_length(iname) except isl.Error: return -1 # assign longest auto axis inames first diff --git a/loopy/options.py b/loopy/options.py index 049d21affd284d94f27416d018d38bdbd5b4bddc..9b27f111e2fb93b09090b35278a3753f9f1a3f82 100644 --- a/loopy/options.py +++ b/loopy/options.py @@ -27,6 +27,11 @@ from pytools import Record import re +class _ColoramaStub(object): + def __getattribute__(self, name): + return "" + + class Options(Record): """ Unless otherwise specified, these options are Boolean-valued @@ -105,6 +110,11 @@ class Options(Record): Options to pass to the OpenCL compiler when building the kernel. A list of strings. + + .. attribute:: allow_terminal_colors + + A :class:`bool`. Whether to allow colors in terminal output + """ def __init__( @@ -124,6 +134,7 @@ class Options(Record): write_wrapper=False, highlight_wrapper=False, write_cl=False, highlight_cl=False, edit_cl=False, cl_build_options=[], + allow_terminal_colors=True ): Record.__init__( self, @@ -137,6 +148,7 @@ class Options(Record): write_wrapper=write_wrapper, highlight_wrapper=highlight_wrapper, write_cl=write_cl, highlight_cl=highlight_cl, edit_cl=edit_cl, cl_build_options=cl_build_options, + allow_terminal_colors=allow_terminal_colors, ) def update(self, other): @@ -150,6 +162,30 @@ class Options(Record): for field_name in sorted(self.__class__.fields): key_builder.rec(key_hash, getattr(self, field_name)) + @property + def _fore(self): + if self.allow_terminal_colors: + import colorama + return colorama.Fore + else: + return _ColoramaStub() + + @property + def _back(self): + if self.allow_terminal_colors: + import colorama + return colorama.Back + else: + return _ColoramaStub() + + @property + def _style(self): + if self.allow_terminal_colors: + import colorama + return colorama.Style + else: + return _ColoramaStub() + KEY_VAL_RE = re.compile("^([a-zA-Z0-9]+)=(.*)$") diff --git a/loopy/preprocess.py b/loopy/preprocess.py index fe88118e018d5829e9ff2104b70940a39cb95ade..4c75cfd250807c6959c1e5167465d34b029d762e 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -66,6 +66,45 @@ def prepare_for_caching(kernel): # }}} +# {{{ check reduction iname uniqueness + +def check_reduction_iname_uniqueness(kernel): + iname_to_reduction_count = {} + iname_to_nonsimultaneous_reduction_count = {} + + def map_reduction(expr, rec): + rec(expr.expr) + for iname in expr.inames: + iname_to_reduction_count[iname] = ( + iname_to_reduction_count.get(iname, 0) + 1) + if not expr.allow_simultaneous: + iname_to_nonsimultaneous_reduction_count[iname] = ( + iname_to_nonsimultaneous_reduction_count.get(iname, 0) + 1) + + return expr + + from loopy.symbolic import ReductionCallbackMapper + cb_mapper = ReductionCallbackMapper(map_reduction) + + for insn in kernel.instructions: + insn.with_transformed_expressions(cb_mapper) + + for iname, count in six.iteritems(iname_to_reduction_count): + nonsimul_count = iname_to_nonsimultaneous_reduction_count.get(iname, 0) + + if nonsimul_count and count > 1: + raise LoopyError("iname '%s' used in more than one reduction. " + "(%d of them, to be precise.) " + "Since this usage can easily cause loop scheduling " + "problems, this is prohibited by default. " + "Use loopy.make_reduction_inames_unique() to fix this. " + "If you are sure that this is OK, write the reduction " + "as 'simul_reduce(...)' instead of 'reduce(...)'" + % (iname, count)) + +# }}} + + # {{{ infer types def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): @@ -677,11 +716,13 @@ def preprocess_kernel(kernel, device=None): kernel = expand_subst(kernel) # Ordering restriction: - # Type inference doesn't handle substitutions. Get them out of the - # way. + # Type inference and reduction iname uniqueness don't handle substitutions. + # Get them out of the way. kernel = infer_unknown_types(kernel, expect_completion=False) + check_reduction_iname_uniqueness(kernel) + kernel = add_default_dependencies(kernel) # Ordering restrictions: diff --git a/loopy/schedule.py b/loopy/schedule.py index 17c1ab3ce0bad0170f8f7b03e243862de33af21d..de71ffaed0d457f96ee6ddbdeeafac58c0959aa9 100644 --- a/loopy/schedule.py +++ b/loopy/schedule.py @@ -317,11 +317,40 @@ def group_insn_counts(kernel): return result + +def gen_dependencies_except(kernel, insn_id, except_insn_ids): + insn = kernel.id_to_insn[insn_id] + for dep_id in insn.depends_on: + + if dep_id in except_insn_ids: + continue + + yield dep_id + + for sub_dep_id in gen_dependencies_except(kernel, dep_id, except_insn_ids): + yield sub_dep_id + # }}} # {{{ debug help +def format_insn_id(kernel, insn_id): + Fore = kernel.options._fore + Style = kernel.options._style + return Fore.GREEN + insn_id + Style.RESET_ALL + + +def format_insn(kernel, insn_id): + insn = kernel.id_to_insn[insn_id] + Fore = kernel.options._fore + Style = kernel.options._style + return "[%s] %s%s%s <- %s%s%s" % ( + format_insn_id(kernel, insn_id), + Fore.BLUE, str(insn.assignee), Style.RESET_ALL, + Fore.MAGENTA, str(insn.expression), Style.RESET_ALL) + + def dump_schedule(kernel, schedule): lines = [] indent = "" @@ -337,8 +366,7 @@ def dump_schedule(kernel, schedule): elif isinstance(sched_item, RunInstruction): insn = kernel.id_to_insn[sched_item.insn_id] if isinstance(insn, Assignment): - insn_str = "[%s] %s <- %s" % ( - insn.id, str(insn.assignee), str(insn.expression)) + insn_str = format_insn(kernel, sched_item.insn_id) else: insn_str = sched_item.insn_id lines.append(indent + insn_str) @@ -482,6 +510,8 @@ def generate_loop_schedules_internal( # to give loops containing high-priority instructions a chance. kernel = sched_state.kernel + Fore = kernel.options._fore + Style = kernel.options._style if allow_boost is None: rec_allow_boost = None @@ -510,7 +540,7 @@ def generate_loop_schedules_internal( print(dump_schedule(sched_state.kernel, sched_state.schedule)) #print("boost allowed:", allow_boost) print(75*"=") - print("LOOP NEST MAP:") + print("LOOP NEST MAP (inner: outer):") for iname, val in six.iteritems(sched_state.loop_nest_around_map): print("%s : %s" % (iname, ", ".join(val))) print(75*"=") @@ -549,7 +579,7 @@ def generate_loop_schedules_internal( if not is_ready: if debug_mode: print("instruction '%s' is missing insn depedencies '%s'" % ( - insn.id, ",".join( + format_insn(kernel, insn.id), ",".join( insn.depends_on - sched_state.scheduled_insn_ids))) continue @@ -570,10 +600,10 @@ def generate_loop_schedules_internal( if debug_mode: if want-have: print("instruction '%s' is missing inames '%s'" - % (insn.id, ",".join(want-have))) + % (format_insn(kernel, insn.id), ",".join(want-have))) if have-want: print("instruction '%s' won't work under inames '%s'" - % (insn.id, ",".join(have-want))) + % (format_insn(kernel, insn.id), ",".join(have-want))) # {{{ determine group-based readiness @@ -595,7 +625,7 @@ def generate_loop_schedules_internal( # }}} if is_ready and debug_mode: - print("ready to schedule '%s'" % insn.id) + print("ready to schedule '%s'" % format_insn(kernel, insn.id)) if is_ready and not debug_mode: iid_set = frozenset([insn.id]) @@ -660,7 +690,38 @@ def generate_loop_schedules_internal( if last_entered_loop in kernel.insn_inames(insn): if debug_mode: print("cannot leave '%s' because '%s' still depends on it" - % (last_entered_loop, insn.id)) + % (last_entered_loop, format_insn(kernel, insn.id))) + + # check if there's a dependency of insn that needs to be + # outside of last_entered_loop. + for subdep_id in gen_dependencies_except(kernel, insn_id, + sched_state.unscheduled_insn_ids): + subdep = kernel.id_to_insn[insn_id] + want = (kernel.insn_inames(subdep_id) + - sched_state.parallel_inames) + if ( + last_entered_loop not in want and + last_entered_loop not in subdep.boostable_into): + print( + "%(warn)swarning:%(reset_all)s '%(iname)s', " + "which the schedule is " + "currently stuck inside of, seems mis-nested. " + "'%(subdep)s' must occur " "before '%(dep)s', " + "but '%(subdep)s must be outside " + "'%(iname)s', whereas '%(dep)s' must be back " + "in it.%(reset_all)s\n" + " %(subdep_i)s\n" + " %(dep_i)s" + % { + "warn": Fore.RED + Style.BRIGHT, + "reset_all": Style.RESET_ALL, + "iname": last_entered_loop, + "subdep": format_insn_id(kernel, subdep_id), + "dep": format_insn_id(kernel, insn_id), + "subdep_i": format_insn(kernel, subdep_id), + "dep_i": format_insn(kernel, insn_id), + }) + can_leave = False break diff --git a/loopy/statistics.py b/loopy/statistics.py index d39f2f8556ad62642794ab2ef9b4c9504fdb27b1..5aa251dc9318bae6fe49b69642722ecc88d79887 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -35,6 +35,21 @@ from loopy.kernel.data import Assignment from loopy.diagnostic import warn +__doc__ = """ + +.. currentmodule:: loopy + +.. autofunction:: get_op_poly + +.. autofunction:: get_gmem_access_poly + +.. autofunction:: sum_mem_access_to_bytes + +.. autofunction:: get_barrier_poly + +""" + + # {{{ ToCountMap class ToCountMap: diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 7adab80c68c38f900976eb1adcd90226f40a7d9b..b887c703420d092d7f3c0fc9c729dd1d1f942a76 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -79,7 +79,20 @@ class IdentityMapperMixin(object): return expr def map_reduction(self, expr, *args): - return Reduction(expr.operation, expr.inames, self.rec(expr.expr, *args)) + mapped_inames = [self.rec(Variable(iname), *args) for iname in expr.inames] + + new_inames = [] + for iname, new_sym_iname in zip(expr.inames, mapped_inames): + if not isinstance(new_sym_iname, Variable): + from loopy.diagnostic import LoopyError + raise LoopyError("%s did not map iname '%s' to a variable" + % (type(self).__name__, iname)) + + new_inames.append(new_sym_iname.name) + + return Reduction( + expr.operation, tuple(new_inames), self.rec(expr.expr, *args), + allow_simultaneous=expr.allow_simultaneous) def map_tagged_variable(self, expr, *args): # leaf, doesn't change @@ -146,7 +159,8 @@ class StringifyMapper(StringifyMapperBase): return "loc.%d" % expr.index def map_reduction(self, expr, prec): - return "reduce(%s, [%s], %s)" % ( + return "%sreduce(%s, [%s], %s)" % ( + "simul_" if expr.allow_simultaneous else "", expr.operation, ", ".join(expr.inames), expr.expr) def map_tagged_variable(self, expr, prec): @@ -346,11 +360,16 @@ class Reduction(AlgebraicLeaf): The expression (as a :class:`pymbolic.primitives.Expression`) on which reduction is performed. + + .. attribute:: allow_simultaneous + + A :class:`bool`. If not *True*, an iname is allowed to be used + in precisely one reduction, to avoid mis-nesting errors. """ - init_arg_names = ("operation", "inames", "expr") + init_arg_names = ("operation", "inames", "expr", "allow_simultaneous") - def __init__(self, operation, inames, expr): + def __init__(self, operation, inames, expr, allow_simultaneous=False): if isinstance(inames, str): inames = tuple(iname.strip() for iname in inames.split(",")) @@ -378,9 +397,10 @@ class Reduction(AlgebraicLeaf): self.operation = operation self.inames = inames self.expr = expr + self.allow_simultaneous = allow_simultaneous def __getinitargs__(self): - return (self.operation, self.inames, self.expr) + return (self.operation, self.inames, self.expr, self.allow_simultaneous) def get_hash(self): return hash((self.__class__, self.operation, self.inames, @@ -779,7 +799,8 @@ class FunctionToPrimitiveMapper(IdentityMapper): turns those into the actual pymbolic primitives used for that. """ - def _parse_reduction(self, operation, inames, red_expr): + def _parse_reduction(self, operation, inames, red_expr, + allow_simultaneous=False): if isinstance(inames, Variable): inames = (inames,) @@ -795,7 +816,8 @@ class FunctionToPrimitiveMapper(IdentityMapper): processed_inames.append(iname.name) - return Reduction(operation, tuple(processed_inames), red_expr) + return Reduction(operation, tuple(processed_inames), red_expr, + allow_simultaneous=allow_simultaneous) def map_call(self, expr): from loopy.library.reduction import parse_reduction_op @@ -820,7 +842,7 @@ class FunctionToPrimitiveMapper(IdentityMapper): else: raise TypeError("cse takes two arguments") - elif name == "reduce": + elif name in ["reduce", "simul_reduce"]: if len(expr.parameters) == 3: operation, inames, red_expr = expr.parameters @@ -829,7 +851,8 @@ class FunctionToPrimitiveMapper(IdentityMapper): "must be a symbol") operation = parse_reduction_op(operation.name) - return self._parse_reduction(operation, inames, self.rec(red_expr)) + return self._parse_reduction(operation, inames, self.rec(red_expr), + allow_simultaneous=(name == "simul_reduce")) else: raise TypeError("invalid 'reduce' calling sequence") diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index ee28594a50ce8486702e13f0ee9bf01debb4f859..85e58a809e51ff8067c9effb59240fb5125b46db 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -24,9 +24,26 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +__doc__ = """ + +.. currentmodule:: loopy + +.. autoclass:: TargetBase +.. autoclass:: CTarget +.. autoclass:: CudaTarget +.. autoclass:: OpenCLTarget +.. autoclass:: PyOpenCLTarget +.. autoclass:: ISPCTarget + +""" + class TargetBase(object): - """Objects of this type must be picklable.""" + """Base class for all targets, i.e. different types of code that + loopy can generate. + + Objects of this type must be picklable. + """ # {{{ persistent hashing diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 73310c4db7ad590396377d1d757ebf8eb068a8e6..5d6a856d14f57b277e6c083717346bbeb11af7b6 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -56,6 +56,9 @@ def _preamble_generator(kernel, seen_dtypes, seen_functions): class CTarget(TargetBase): + """A target for plain "C", without any parallel extensions. + """ + hash_fields = TargetBase.hash_fields + ("fortran_abi",) comparison_fields = TargetBase.comparison_fields + ("fortran_abi",) diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 94a144cb972b10cd7a1d797adbd8d259e7eb64dc..55f8da4d608d7a1b3a1ca8ba960c498586f4d76d 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -162,6 +162,8 @@ class LoopyCudaCCodeMapper(LoopyCCodeMapper): # {{{ target class CudaTarget(CTarget): + """A target for Nvidia's CUDA GPU programming language.""" + def __init__(self, extern_c=True): """ :arg extern_c: If *True*, declare kernels using "extern C" to diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 737ee0d998ce5b330697d56228183e0abe5e0137..b0b1c247ec78a29b75890b0e9f04448116b97c4e 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -30,6 +30,8 @@ from loopy.target.c import CTarget from loopy.target.c.codegen.expression import LoopyCCodeMapper from loopy.diagnostic import LoopyError +from pytools import memoize_method + # {{{ expression mapper @@ -54,7 +56,35 @@ class LoopyISPCCodeMapper(LoopyCCodeMapper): # }}} +# {{{ type registry + +def fill_registry_with_ispc_types(reg, respect_windows, include_bool=True): + reg.get_or_register_dtype("bool", np.bool) + + reg.get_or_register_dtype(["int8", "signed char", "char"], np.int8) + reg.get_or_register_dtype(["uint8", "unsigned char"], np.uint8) + reg.get_or_register_dtype(["int16", "short", "signed short", + "signed short int", "short signed int"], np.int16) + reg.get_or_register_dtype(["uint16", "unsigned short", + "unsigned short int", "short unsigned int"], np.uint16) + reg.get_or_register_dtype(["int32", "int", "signed int"], np.int32) + reg.get_or_register_dtype(["uint32", "unsigned", "unsigned int"], np.uint32) + + reg.get_or_register_dtype(["int64"], np.int64) + reg.get_or_register_dtype(["uint64"], np.uint64) + + reg.get_or_register_dtype("float", np.float32) + reg.get_or_register_dtype("double", np.float64) + +# }}} + + class ISPCTarget(CTarget): + """A code generation target for Intel's `ISPC <https://ispc.github.io/>`_ + SPMD programming language, to target Intel's Knight's hardware and modern + Intel CPUs with wide vector units. + """ + def __init__(self, occa_mode=False): """ :arg occa_mode: Whether to modify the generated call signature to @@ -64,6 +94,18 @@ class ISPCTarget(CTarget): super(ISPCTarget, self).__init__() + # {{{ types + + @memoize_method + def get_dtype_registry(self): + from loopy.target.c.compyte.dtypes import DTypeRegistry + result = DTypeRegistry() + fill_registry_with_ispc_types(result, respect_windows=False, + include_bool=True) + return result + + # }}} + # {{{ top-level codegen def generate_code(self, kernel, codegen_state, impl_arg_info): @@ -116,18 +158,26 @@ class ISPCTarget(CTarget): from pymbolic.mapper.stringifier import PREC_COMPARISON, PREC_NONE ccm = self.get_expression_to_code_mapper(codegen_state) - wrapper_body.extend([ - S("assert(programCount == %s)" - % ccm(lsize[0], PREC_COMPARISON)), - S("launch[%s] %s(%s)" + if lsize: + wrapper_body.append( + S("assert(programCount == %s)" + % ccm(lsize[0], PREC_COMPARISON))) + + if gsize: + launch_spec = "[%s]" % ", ".join( + ccm(gs_i, PREC_NONE) + for gs_i in gsize), + else: + launch_spec = "" + + wrapper_body.append( + S("launch%s %s(%s)" % ( - ", ".join( - ccm(gs_i, PREC_NONE) - for gs_i in gsize), + launch_spec, inner_name, ", ".join(arg_names) )) - ]) + ) wrapper_fbody = FunctionBody( ISPCExport( @@ -176,11 +226,8 @@ class ISPCTarget(CTarget): raise LoopyError("unknown barrier kind") def wrap_temporary_decl(self, decl, is_local): - from cgen.ispc import ISPCUniform, ISPCVarying - if is_local: - return ISPCUniform(decl) - else: - return ISPCVarying(decl) + from cgen.ispc import ISPCUniform + return ISPCUniform(decl) def get_global_arg_decl(self, name, shape, dtype, is_written): from loopy.codegen import POD # uses the correct complex type diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index a009e93360016128f69c85fee2a555cd016a5ab1..cfdc8620bb4a383e4d48f004a2c682ba8e495a05 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -189,6 +189,9 @@ class LoopyOpenCLCCodeMapper(LoopyCCodeMapper): # {{{ target class OpenCLTarget(CTarget): + """A target for the OpenCL C heterogeneous compute programming language. + """ + # {{{ library def function_manglers(self): diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index d13384534c70df602785d4189739a7bc86ed37db..2947fdc6c5d17e919bdcc329f6243376b00a9969 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -31,12 +31,6 @@ import numpy as np from loopy.target.opencl import OpenCLTarget -import pyopencl as cl -import pyopencl.characterize as cl_char - -# This ensures the dtype registry is populated. -import pyopencl.tools # noqa - import logging logger = logging.getLogger(__name__) @@ -44,6 +38,9 @@ logger = logging.getLogger(__name__) # {{{ temp storage adjust for bank conflict def adjust_local_temp_var_storage(kernel, device): + import pyopencl as cl + import pyopencl.characterize as cl_char + logger.debug("%s: adjust temp var storage" % kernel.name) new_temp_vars = {} @@ -244,9 +241,21 @@ class _LegacyTypeRegistryStub(object): from pyopencl.compyte.dtypes import dtype_to_ctype return dtype_to_ctype(dtype) +# }}} + + +# {{{ target class PyOpenCLTarget(OpenCLTarget): + """A code generation target that takes special advantage of :mod:`pyopencl` + features such as run-time knowledge of the target device (to generate + warnings) and support for complex numbers. + """ + def __init__(self, device=None): + # This ensures the dtype registry is populated. + import pyopencl.tools # noqa + super(PyOpenCLTarget, self).__init__() self.device = device diff --git a/loopy/tools.py b/loopy/tools.py index 55b177bda4e6be03a985286fd4faf6322e257824..777532e7af92bc62e6878f564c8c5545f4cb2c4a 100644 --- a/loopy/tools.py +++ b/loopy/tools.py @@ -233,6 +233,119 @@ def remove_common_indentation(code, require_leading_newline=True, # }}} +# {{{ build_ispc_shared_lib + +# DO NOT RELY ON THESE: THEY WILL GO AWAY + +def build_ispc_shared_lib( + cwd, ispc_sources, cxx_sources, + ispc_options=[], cxx_options=[], + ispc_bin="ispc", + cxx_bin="g++", + quiet=True): + from os.path import join + + ispc_source_names = [] + for name, contents in ispc_sources: + ispc_source_names.append(name) + + with open(join(cwd, name), "w") as srcf: + srcf.write(contents) + + cxx_source_names = [] + for name, contents in cxx_sources: + cxx_source_names.append(name) + + with open(join(cwd, name), "w") as srcf: + srcf.write(contents) + + from subprocess import check_call + + ispc_cmd = ([ispc_bin, + "--pic", + "-o", "ispc.o"] + + ispc_options + + list(ispc_source_names)) + if not quiet: + print(" ".join(ispc_cmd)) + + check_call(ispc_cmd, cwd=cwd) + + cxx_cmd = ([ + cxx_bin, + "-shared", "-Wl,--export-dynamic", + "-fPIC", + "-oshared.so", + "ispc.o", + ] + + cxx_options + + list(cxx_source_names)) + + check_call(cxx_cmd, cwd=cwd) + + if not quiet: + print(" ".join(cxx_cmd)) + +# }}} + + +# {{{ numpy address munging + +# DO NOT RELY ON THESE: THEY WILL GO AWAY + +def address_from_numpy(obj): + ary_intf = getattr(obj, "__array_interface__", None) + if ary_intf is None: + raise RuntimeError("no array interface") + + buf_base, is_read_only = ary_intf["data"] + return buf_base + ary_intf.get("offset", 0) + + +def cptr_from_numpy(obj): + import ctypes + return ctypes.c_void_p(address_from_numpy(obj)) + + +# https://github.com/hgomersall/pyFFTW/blob/master/pyfftw/utils.pxi#L172 +def empty_aligned(shape, dtype, order='C', n=64): + '''empty_aligned(shape, dtype='float64', order='C', n=None) + Function that returns an empty numpy array that is n-byte aligned, + where ``n`` is determined by inspecting the CPU if it is not + provided. + The alignment is given by the final optional argument, ``n``. If + ``n`` is not provided then this function will inspect the CPU to + determine alignment. The rest of the arguments are as per + :func:`numpy.empty`. + ''' + itemsize = np.dtype(dtype).itemsize + + # Apparently there is an issue with numpy.prod wrapping around on 32-bits + # on Windows 64-bit. This shouldn't happen, but the following code + # alleviates the problem. + if not isinstance(shape, (int, np.integer)): + array_length = 1 + for each_dimension in shape: + array_length *= each_dimension + + else: + array_length = shape + + base_ary = np.empty(array_length*itemsize+n, dtype=np.int8) + + # We now need to know how to offset base_ary + # so it is correctly aligned + _array_aligned_offset = (n-address_from_numpy(base_ary)) % n + + array = np.frombuffer( + base_ary[_array_aligned_offset:_array_aligned_offset-n].data, + dtype=dtype).reshape(shape, order=order) + + return array + +# }}} + + def is_interned(s): return s is None or intern(s) is s diff --git a/loopy/transform/arithmetic.py b/loopy/transform/arithmetic.py index a6830092910dcc89eefcc337961c3b215831fb41..d41222c26056300729b0c4005200ba6ea904010d 100644 --- a/loopy/transform/arithmetic.py +++ b/loopy/transform/arithmetic.py @@ -25,108 +25,9 @@ THE SOFTWARE. import six -from loopy.symbolic import (RuleAwareIdentityMapper, - SubstitutionRuleMappingContext) from loopy.diagnostic import LoopyError -# {{{ split_reduction - -class _ReductionSplitter(RuleAwareIdentityMapper): - def __init__(self, rule_mapping_context, within, inames, direction): - super(_ReductionSplitter, self).__init__( - rule_mapping_context) - - self.within = within - self.inames = inames - self.direction = direction - - def map_reduction(self, expr, expn_state): - if set(expr.inames) & set(expn_state.arg_context): - # FIXME - raise NotImplementedError() - - if (self.inames <= set(expr.inames) - and self.within( - expn_state.kernel, - expn_state.instruction, - expn_state.stack)): - leftover_inames = set(expr.inames) - self.inames - - from loopy.symbolic import Reduction - if self.direction == "in": - return Reduction(expr.operation, tuple(leftover_inames), - Reduction(expr.operation, tuple(self.inames), - self.rec(expr.expr, expn_state))) - elif self.direction == "out": - return Reduction(expr.operation, tuple(self.inames), - Reduction(expr.operation, tuple(leftover_inames), - self.rec(expr.expr, expn_state))) - else: - assert False - else: - return super(_ReductionSplitter, self).map_reduction(expr, expn_state) - - -def _split_reduction(kernel, inames, direction, within=None): - if direction not in ["in", "out"]: - raise ValueError("invalid value for 'direction': %s" % direction) - - if isinstance(inames, str): - inames = inames.split(",") - inames = set(inames) - - from loopy.context_matching import parse_stack_match - within = parse_stack_match(within) - - rule_mapping_context = SubstitutionRuleMappingContext( - kernel.substitutions, kernel.get_var_name_generator()) - rsplit = _ReductionSplitter(rule_mapping_context, - within, inames, direction) - return rule_mapping_context.finish_kernel( - rsplit.map_kernel(kernel)) - - -def split_reduction_inward(kernel, inames, within=None): - """Takes a reduction of the form:: - - sum([i,j,k], ...) - - and splits it into two nested reductions:: - - sum([j,k], sum([i], ...)) - - In this case, *inames* would have been ``"i"`` indicating that - the iname ``i`` should be made the iname governing the inner reduction. - - :arg inames: A list of inames, or a comma-separated string that can - be parsed into those - """ - - return _split_reduction(kernel, inames, "in", within) - - -def split_reduction_outward(kernel, inames, within=None): - """Takes a reduction of the form:: - - sum([i,j,k], ...) - - and splits it into two nested reductions:: - - sum([i], sum([j,k], ...)) - - In this case, *inames* would have been ``"i"`` indicating that - the iname ``i`` should be made the iname governing the outer reduction. - - :arg inames: A list of inames, or a comma-separated string that can - be parsed into those - """ - - return _split_reduction(kernel, inames, "out", within) - -# }}} - - # {{{ fold constants def fold_constants(kernel): diff --git a/loopy/transform/batch.py b/loopy/transform/batch.py index 59239104e43725c6ea5bf572eda7e79d080427a7..967e14de692ee96b0c01d2ea5bcf8b411890038b 100644 --- a/loopy/transform/batch.py +++ b/loopy/transform/batch.py @@ -29,21 +29,27 @@ from loopy.symbolic import (RuleAwareIdentityMapper, SubstitutionRuleMappingCont from loopy.kernel.data import ValueArg, GlobalArg import islpy as isl +__doc__ = """ +.. autofunction:: to_batched +""" + # {{{ to_batched class _BatchVariableChanger(RuleAwareIdentityMapper): def __init__(self, rule_mapping_context, kernel, batch_varying_args, - batch_iname_expr): + batch_iname_expr, sequential): super(_BatchVariableChanger, self).__init__(rule_mapping_context) self.kernel = kernel self.batch_varying_args = batch_varying_args self.batch_iname_expr = batch_iname_expr + self.sequential = sequential def needs_batch_subscript(self, name): return ( - name in self.kernel.temporary_variables + (not self.sequential + and name in self.kernel.temporary_variables) or name in self.batch_varying_args) @@ -64,14 +70,18 @@ class _BatchVariableChanger(RuleAwareIdentityMapper): return expr.aggregate[self.batch_iname_expr] -def to_batched(knl, nbatches, batch_varying_args, batch_iname_prefix="ibatch"): +def to_batched(knl, nbatches, batch_varying_args, batch_iname_prefix="ibatch", + sequential=False): """Takes in a kernel that carries out an operation and returns a kernel that carries out a batch of these operations. :arg nbatches: the number of batches. May be a constant non-negative integer or a string, which will be added as an integer argument. - :arg batch_varying_args: a list of argument names that depend vary per-batch. + :arg batch_varying_args: a list of argument names that vary per-batch. Each such variable will have a batch index added. + :arg sequential: A :class:`bool`. If *True*, do not duplicate + temporary variables for each batch. This automatically tags the batch + iname for sequential execution. """ from pymbolic import var @@ -110,26 +120,32 @@ def to_batched(knl, nbatches, batch_varying_args, batch_iname_prefix="ibatch"): new_args.append(arg) - new_temps = {} - - for temp in six.itervalues(knl.temporary_variables): - new_temps[temp.name] = temp.copy( - shape=(nbatches_expr,) + temp.shape, - dim_tags=("c",) * (len(arg.shape) + 1)) - knl = knl.copy( domains=new_domains, - args=new_args, - temporary_variables=new_temps) + args=new_args) + + if not sequential: + new_temps = {} + + for temp in six.itervalues(knl.temporary_variables): + new_temps[temp.name] = temp.copy( + shape=(nbatches_expr,) + temp.shape, + dim_tags=("c",) * (len(arg.shape) + 1)) + + knl = knl.copy(temporary_variables=new_temps) + else: + import loopy as lp + from loopy.kernel.data import ForceSequentialTag + knl = lp.tag_inames(knl, [(batch_iname, ForceSequentialTag())]) rule_mapping_context = SubstitutionRuleMappingContext( knl.substitutions, vng) bvc = _BatchVariableChanger(rule_mapping_context, - knl, batch_varying_args, batch_iname_expr) + knl, batch_varying_args, batch_iname_expr, + sequential=sequential) return rule_mapping_context.finish_kernel( bvc.map_kernel(knl)) - # }}} # vim: foldmethod=marker diff --git a/loopy/transform/data.py b/loopy/transform/data.py index 64332637910340d68cb035d64ad6f4f643c0b5c9..7b1deb7951392e2e0c46360f8fd979ebf5aedb37 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -353,6 +353,26 @@ def remove_unused_arguments(knl): for insn in exp_knl.instructions: refd_vars.update(insn.dependency_names()) + from loopy.kernel.array import ArrayBase, FixedStrideArrayDimTag + from loopy.symbolic import get_dependencies + from itertools import chain + + def tolerant_get_deps(expr): + if expr is None or expr is lp.auto: + return set() + return get_dependencies(expr) + + for ary in chain(knl.args, six.itervalues(knl.temporary_variables)): + if isinstance(ary, ArrayBase): + refd_vars.update( + tolerant_get_deps(ary.shape) + | tolerant_get_deps(ary.offset)) + + for dim_tag in ary.dim_tags: + if isinstance(dim_tag, FixedStrideArrayDimTag): + refd_vars.update( + tolerant_get_deps(dim_tag.stride)) + for arg in knl.args: if arg.name in refd_vars: new_args.append(arg) diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py index bf435d3fe08d022790bf31a4d583d4923f0bfeff..e44f8abe227d451e8e940708530f6c20566685e8 100644 --- a/loopy/transform/fusion.py +++ b/loopy/transform/fusion.py @@ -210,8 +210,9 @@ def _fuse_two_kernels(knla, knlb): from pymbolic.imperative.transform import \ fuse_instruction_streams_with_unique_ids - new_instructions, _ = fuse_instruction_streams_with_unique_ids( - knla.instructions, knlb.instructions) + new_instructions, old_b_id_to_new_b_id = \ + fuse_instruction_streams_with_unique_ids( + knla.instructions, knlb.instructions) # {{{ fuse assumptions @@ -283,12 +284,12 @@ def _fuse_two_kernels(knla, knlb): "target", knla.target, knlb.target), - options=knla.options) + options=knla.options), old_b_id_to_new_b_id # }}} -def fuse_kernels(kernels, suffixes=None): +def fuse_kernels(kernels, suffixes=None, data_flow=None): """Return a kernel that performs all the operations in all entries of *kernels*. @@ -296,6 +297,11 @@ def fuse_kernels(kernels, suffixes=None): :arg suffixes: If given, must be a list of strings of a length matching that of *kernels*. This will be used to disambiguate the names of temporaries, as described below. + :arg data_flow: A list of data dependencies + ``[(var_name, from_kernel, to_kernel), ...]``. + Based on this, the fuser will create dependencies between all + writers of *var_name* in ``kernels[from_kernel]`` to + readers of *var_name* in ``kernels[to_kernel]``. The components of the kernels are fused as follows: @@ -321,9 +327,16 @@ def fuse_kernels(kernels, suffixes=None): * The resulting kernel will contain all instructions from each entry of *kernels*. Clashing instruction IDs will be renamed to ensure uniqueness. + + .. versionchanged:: 2016.2 + + *data_flow* was added in version 2016.2 """ kernels = list(kernels) + if data_flow is None: + data_flow = [] + if suffixes: suffixes = list(suffixes) if len(suffixes) != len(kernels): @@ -356,9 +369,46 @@ def fuse_kernels(kernels, suffixes=None): # }}} - result = kernels.pop(0) - while kernels: - result = _fuse_two_kernels(result, kernels.pop(0)) + kernel_insn_ids = [] + result = None + + for knlb in kernels: + if result is None: + result = knlb + kernel_insn_ids.append([ + insn.id for insn in knlb.instructions]) + else: + result, old_b_id_to_new_b_id = _fuse_two_kernels( + knla=result, + knlb=knlb) + + kernel_insn_ids.append([ + old_b_id_to_new_b_id[insn.id] + for insn in knlb.instructions]) + + # {{{ realize data_flow dependencies + + id_to_insn = result.id_to_insn.copy() + + for var_name, from_kernel, to_kernel in data_flow: + from_writer_ids = frozenset( + insn_id + for insn_id in kernel_insn_ids[from_kernel] + if var_name in id_to_insn[insn_id].assignee_var_names()) + + for insn_id in kernel_insn_ids[to_kernel]: + insn = id_to_insn[insn_id] + if var_name in insn.read_dependency_names(): + insn = insn.copy(depends_on=insn.depends_on | from_writer_ids) + + id_to_insn[insn_id] = insn + + result = result.copy(instructions=[ + id_to_insn[insn_id] + for insn_ids in kernel_insn_ids + for insn_id in insn_ids]) + + # }}} return result diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 8cf16bfd330b3e425023573788bc68bc8a7275a2..c317f2a4fbd186ccf069da74118504249956b4bc 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -35,25 +35,40 @@ from loopy.symbolic import ( from loopy.diagnostic import LoopyError -# {{{ assume +__doc__ = """ +.. currentmodule:: loopy -def assume(kernel, assumptions): - if isinstance(assumptions, str): - assumptions_set_str = "[%s] -> { : %s}" \ - % (",".join(s for s in kernel.outer_params()), - assumptions) - assumptions = isl.BasicSet.read_from_str(kernel.domains[0].get_ctx(), - assumptions_set_str) +.. autofunction:: split_iname - if not isinstance(assumptions, isl.BasicSet): - raise TypeError("'assumptions' must be a BasicSet or a string") +.. autofunction:: chunk_iname - old_assumptions, new_assumptions = isl.align_two(kernel.assumptions, assumptions) +.. autofunction:: join_inames - return kernel.copy( - assumptions=old_assumptions.params() & new_assumptions.params()) +.. autofunction:: tag_inames -# }}} +.. autofunction:: duplicate_inames + +.. undocumented .. autofunction:: link_inames + +.. autofunction:: rename_iname + +.. autofunction:: remove_unused_inames + +.. autofunction:: set_loop_priority + +.. autofunction:: split_reduction_inward + +.. autofunction:: split_reduction_outward + +.. autofunction:: affine_map_inames + +.. autofunction:: realize_ilp + +.. autofunction:: find_unused_axis_tag + +.. autofunction:: make_reduction_inames_unique + +""" # {{{ set loop priority @@ -106,7 +121,8 @@ class _InameSplitter(RuleAwareIdentityMapper): from loopy.symbolic import Reduction return Reduction(expr.operation, tuple(new_inames), - self.rec(expr.expr, expn_state)) + self.rec(expr.expr, expn_state), + expr.allow_simultaneous) else: return super(_InameSplitter, self).map_reduction(expr, expn_state) @@ -269,8 +285,21 @@ def split_iname(kernel, split_iname, inner_length, within=None): """Split *split_iname* into two inames (an 'inner' one and an 'outer' one) so that ``split_iname == inner + outer*inner_length`` and *inner* is of - fixed length *inner_length*. - + constant length *inner_length*. + + :arg outer_iname: The new iname to use for the 'inner' (fixed-length) + loop. Defaults to a name derived from ``split_iname + "_outer"`` + :arg inner_iname: The new iname to use for the 'inner' (fixed-length) + loop. Defaults to a name derived from ``split_iname + "_inner"`` + :arg inner_length: a positive integer + :arg slabs: + A tuple ``(head_it_count, tail_it_count)`` indicating the + number of leading/trailing iterations of *outer_iname* + for which separate code should be generated. + :arg outer_tag: The iname tag (see :ref:`iname-tag`) to apply to + *outer_iname*. + :arg inner_tag: The iname tag (see :ref:`iname-tag`) to apply to + *inner_iname*. :arg within: a stack match as understood by :func:`loopy.context_matching.parse_stack_match`. """ @@ -418,7 +447,8 @@ class _InameJoiner(RuleAwareSubstitutionMapper): from loopy.symbolic import Reduction return Reduction(expr.operation, tuple(new_inames), - self.rec(expr.expr, expn_state)) + self.rec(expr.expr, expn_state), + expr.allow_simultaneous) else: return super(_InameJoiner, self).map_reduction(expr, expn_state) @@ -544,7 +574,7 @@ def join_inames(kernel, inames, new_iname=None, tag=None, within=None): def tag_inames(kernel, iname_to_tag, force=False, ignore_nonexistent=False): """Tag an iname - :arg iname_to_tag: a list of tuple ``(iname, new_tag)``. *new_tag* is given + :arg iname_to_tag: a list of tuples ``(iname, new_tag)``. *new_tag* is given as an instance of a subclass of :class:`loopy.kernel.data.IndexTag` or as a string as shown in :ref:`iname-tags`. May also be a dictionary for backwards compatibility. @@ -650,7 +680,8 @@ class _InameDuplicator(RuleAwareIdentityMapper): from loopy.symbolic import Reduction return Reduction(expr.operation, new_inames, - self.rec(expr.expr, expn_state)) + self.rec(expr.expr, expn_state), + expr.allow_simultaneous) else: return super(_InameDuplicator, self).map_reduction(expr, expn_state) @@ -1021,6 +1052,106 @@ def remove_unused_inames(knl, inames=None): # }}} +# {{{ split_reduction + +class _ReductionSplitter(RuleAwareIdentityMapper): + def __init__(self, rule_mapping_context, within, inames, direction): + super(_ReductionSplitter, self).__init__( + rule_mapping_context) + + self.within = within + self.inames = inames + self.direction = direction + + def map_reduction(self, expr, expn_state): + if set(expr.inames) & set(expn_state.arg_context): + # FIXME + raise NotImplementedError() + + if (self.inames <= set(expr.inames) + and self.within( + expn_state.kernel, + expn_state.instruction, + expn_state.stack)): + leftover_inames = set(expr.inames) - self.inames + + from loopy.symbolic import Reduction + if self.direction == "in": + return Reduction(expr.operation, tuple(leftover_inames), + Reduction(expr.operation, tuple(self.inames), + self.rec(expr.expr, expn_state), + expr.allow_simultaneous), + expr.allow_simultaneous) + elif self.direction == "out": + return Reduction(expr.operation, tuple(self.inames), + Reduction(expr.operation, tuple(leftover_inames), + self.rec(expr.expr, expn_state), + expr.allow_simultaneous)) + else: + assert False + else: + return super(_ReductionSplitter, self).map_reduction(expr, expn_state) + + +def _split_reduction(kernel, inames, direction, within=None): + if direction not in ["in", "out"]: + raise ValueError("invalid value for 'direction': %s" % direction) + + if isinstance(inames, str): + inames = inames.split(",") + inames = set(inames) + + from loopy.context_matching import parse_stack_match + within = parse_stack_match(within) + + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + rsplit = _ReductionSplitter(rule_mapping_context, + within, inames, direction) + return rule_mapping_context.finish_kernel( + rsplit.map_kernel(kernel)) + + +def split_reduction_inward(kernel, inames, within=None): + """Takes a reduction of the form:: + + sum([i,j,k], ...) + + and splits it into two nested reductions:: + + sum([j,k], sum([i], ...)) + + In this case, *inames* would have been ``"i"`` indicating that + the iname ``i`` should be made the iname governing the inner reduction. + + :arg inames: A list of inames, or a comma-separated string that can + be parsed into those + """ + + return _split_reduction(kernel, inames, "in", within) + + +def split_reduction_outward(kernel, inames, within=None): + """Takes a reduction of the form:: + + sum([i,j,k], ...) + + and splits it into two nested reductions:: + + sum([i], sum([j,k], ...)) + + In this case, *inames* would have been ``"i"`` indicating that + the iname ``i`` should be made the iname governing the outer reduction. + + :arg inames: A list of inames, or a comma-separated string that can + be parsed into those + """ + + return _split_reduction(kernel, inames, "out", within) + +# }}} + + # {{{ affine map inames def affine_map_inames(kernel, old_inames, new_inames, equations): @@ -1259,4 +1390,125 @@ def find_unused_axis_tag(kernel, kind, insn_match=None): # }}} + +# {{{ separate_loop_head_tail_slab + +# undocumented, because not super-useful +def separate_loop_head_tail_slab(kernel, iname, head_it_count, tail_it_count): + """Mark *iname* so that the separate code is generated for + the lower *head_it_count* and the upper *tail_it_count* + iterations of the loop on *iname*. + """ + + iname_slab_increments = kernel.iname_slab_increments.copy() + iname_slab_increments[iname] = (head_it_count, tail_it_count) + + return kernel.copy(iname_slab_increments=iname_slab_increments) + +# }}} + + +# {{{ make_reduction_inames_unique + +class _ReductionInameUniquifier(RuleAwareIdentityMapper): + def __init__(self, rule_mapping_context, inames, within): + super(_ReductionInameUniquifier, self).__init__(rule_mapping_context) + + self.inames = inames + self.old_to_new = [] + self.within = within + + self.iname_to_red_count = {} + self.iname_to_nonsimultaneous_red_count = {} + + def map_reduction(self, expr, expn_state): + within = self.within( + expn_state.kernel, + expn_state.instruction, + expn_state.stack) + + for iname in expr.inames: + self.iname_to_red_count[iname] = ( + self.iname_to_red_count.get(iname, 0) + 1) + if not expr.allow_simultaneous: + self.iname_to_nonsimultaneous_red_count[iname] = ( + self.iname_to_nonsimultaneous_red_count.get(iname, 0) + 1) + + if within and not expr.allow_simultaneous: + subst_dict = {} + + from pymbolic import var + + new_inames = [] + for iname in expr.inames: + if ( + not (self.inames is None or iname in self.inames) + or + self.iname_to_red_count[iname] <= 1): + new_inames.append(iname) + continue + + new_iname = self.rule_mapping_context.make_unique_var_name(iname) + subst_dict[iname] = var(new_iname) + self.old_to_new.append((iname, new_iname)) + new_inames.append(new_iname) + + from loopy.symbolic import SubstitutionMapper + from pymbolic.mapper.substitutor import make_subst_func + + from loopy.symbolic import Reduction + return Reduction(expr.operation, tuple(new_inames), + self.rec( + SubstitutionMapper(make_subst_func(subst_dict))( + expr.expr), + expn_state), + expr.allow_simultaneous) + else: + return super(_ReductionInameUniquifier, self).map_reduction( + expr, expn_state) + + +def make_reduction_inames_unique(kernel, inames=None, within=None): + """ + :arg inames: if not *None*, only apply to these inames + :arg within: a stack match as understood by + :func:`loopy.context_matching.parse_stack_match`. + + .. versionadded:: 2016.2 + """ + + name_gen = kernel.get_var_name_generator() + + from loopy.context_matching import parse_stack_match + within = parse_stack_match(within) + + # {{{ change kernel + + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, name_gen) + r_uniq = _ReductionInameUniquifier(rule_mapping_context, + inames, within=within) + + kernel = rule_mapping_context.finish_kernel( + r_uniq.map_kernel(kernel)) + + # }}} + + # {{{ duplicate the inames + + for old_iname, new_iname in r_uniq.old_to_new: + from loopy.kernel.tools import DomainChanger + domch = DomainChanger(kernel, frozenset([old_iname])) + + from loopy.isl_helpers import duplicate_axes + kernel = kernel.copy( + domains=domch.get_domains_with( + duplicate_axes(domch.domain, [old_iname], [new_iname]))) + + # }}} + + return kernel + +# }}} + # vim: foldmethod=marker diff --git a/loopy/transform/parameter.py b/loopy/transform/parameter.py index 1567263cddc8e199f93652ef4c39ef737b86fb3f..f7600b212cbf4db6b58c91bb3603f5a310c6b2a6 100644 --- a/loopy/transform/parameter.py +++ b/loopy/transform/parameter.py @@ -28,6 +28,42 @@ from loopy.symbolic import (RuleAwareSubstitutionMapper, SubstitutionRuleMappingContext) import islpy as isl +__doc__ = """ + +.. currentmodule:: loopy + +.. autofunction:: fix_parameters + +.. autofunction:: assume +""" + + +# {{{ assume + +def assume(kernel, assumptions): + """Include an assumption about :ref:`domain-parameters` in the kernel, e.g. + `n mod 4 = 0`. + + :arg assumptions: a :class:`islpy.BasicSet` or a string representation of + the assumptions in :ref:`isl-syntax`. + """ + if isinstance(assumptions, str): + assumptions_set_str = "[%s] -> { : %s}" \ + % (",".join(s for s in kernel.outer_params()), + assumptions) + assumptions = isl.BasicSet.read_from_str(kernel.domains[0].get_ctx(), + assumptions_set_str) + + if not isinstance(assumptions, isl.BasicSet): + raise TypeError("'assumptions' must be a BasicSet or a string") + + old_assumptions, new_assumptions = isl.align_two(kernel.assumptions, assumptions) + + return kernel.copy( + assumptions=old_assumptions.params() & new_assumptions.params()) + +# }}} + # {{{ fix_parameter @@ -99,6 +135,13 @@ def _fix_parameter(kernel, name, value): def fix_parameters(kernel, **value_dict): + """Fix the values of the arguments to specific constants. + + *value_dict* consists of *name*/*value* pairs, where *name* will be fixed + to be *value*. *name* may refer to :ref:`domain-parameters` or + :ref:`arguments`. + """ + for name, value in six.iteritems(value_dict): kernel = _fix_parameter(kernel, name, value) diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py index 7623fb8911e2a50b08309c0c6cb45a0dd372bfbb..e599c902227faf8d1292ece2307d097bc8fd7c19 100644 --- a/loopy/transform/subst.py +++ b/loopy/transform/subst.py @@ -289,7 +289,7 @@ class AssignmentToSubstChanger(RuleAwareIdentityMapper): def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None, force_retain_argument=False): """Extract an assignment (to a temporary variable or an argument) - as a :ref:`substituiton-rule`. The temporary may be an array, in + as a :ref:`substitution-rule`. The temporary may be an array, in which case the array indices will become arguments to the substitution rule. @@ -349,7 +349,7 @@ def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None, usage_to_definition = {} - for insn in kernel.instructions: + for insn in dep_kernel.instructions: if lhs_name not in insn.read_dependency_names(): continue diff --git a/loopy/version.py b/loopy/version.py index 9ad8ac19bebff7a712e91900815057155205ae57..adc069663503b200bcdd1638c05ae0ffae5f14df 100644 --- a/loopy/version.py +++ b/loopy/version.py @@ -32,4 +32,4 @@ except ImportError: else: _islpy_version = islpy.version.VERSION_TEXT -DATA_MODEL_VERSION = "v18-islpy%s" % _islpy_version +DATA_MODEL_VERSION = "v19-islpy%s" % _islpy_version diff --git a/setup.py b/setup.py index 4fbb4e142b0680e28e88282409655c5a644a6f4e..5ed095315234339709309a1c55ec88c7fdab6bfa 100644 --- a/setup.py +++ b/setup.py @@ -40,8 +40,9 @@ setup(name="loo.py", "pytools>=2016.1", "pymbolic>=2016.2", "cgen>=2016.1", - "islpy>=2016.1", + "islpy>=2016.1.2", "six>=1.8.0", + "colorama", ], extras_require={ diff --git a/test/test_dg.py b/test/test_dg.py index 63a961423d2f750a4c9a25fdcb5fb56a479d8a35..fafef86c35211183ebdaeb75acf2b664a36586a0 100644 --- a/test/test_dg.py +++ b/test/test_dg.py @@ -52,10 +52,10 @@ def test_dg_volume(ctx_factory): "{[n,m,k]: 0<= n,m < Np and 0<= k < K}", ], """ - <> du_drst = sum(m, DrDsDt[n,m]*u[k,m]) - <> dv_drst = sum(m, DrDsDt[n,m]*v[k,m]) - <> dw_drst = sum(m, DrDsDt[n,m]*w[k,m]) - <> dp_drst = sum(m, DrDsDt[n,m]*p[k,m]) + <> du_drst = simul_reduce(sum, m, DrDsDt[n,m]*u[k,m]) + <> dv_drst = simul_reduce(sum, m, DrDsDt[n,m]*v[k,m]) + <> dw_drst = simul_reduce(sum, m, DrDsDt[n,m]*w[k,m]) + <> dp_drst = simul_reduce(sum, m, DrDsDt[n,m]*p[k,m]) # volume flux rhsu[k,n] = dot(drst_dx[k],dp_drst) diff --git a/test/test_linalg.py b/test/test_linalg.py index 9c6803e93f2e53a0c071e0372bf71256854de38a..6aeec63c49a72c784ad5cccf1ee7acc1fcae0f2a 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -30,6 +30,8 @@ import pyopencl as cl import pyopencl.array as cl_array import loopy as lp +import logging + from pyopencl.tools import ( # noqa pytest_generate_tests_for_pyopencl as pytest_generate_tests) @@ -61,12 +63,16 @@ def check_float4(result, ref_result): def test_axpy(ctx_factory): + logging.basicConfig(level="INFO") ctx = ctx_factory() n = 3145182 vec = cl_array.vec + if ctx.devices[0].platform.vendor.startswith("Advanced Micro"): + pytest.skip("crashes on AMD 15.12") + for dtype, check, a, b in [ (np.complex64, None, 5, 7), (vec.float4, check_float4, @@ -111,7 +117,8 @@ def test_axpy(ctx_factory): lp.auto_test_vs_ref(seq_knl, ctx, variant(knl), op_count=[np.dtype(dtype).itemsize*n*3/1e9], op_label=["GBytes"], - parameters={"a": a, "b": b, "n": n}, check_result=check) + parameters={"a": a, "b": b, "n": n}, check_result=check, + blacklist_ref_vendors=["Advanced Micro"]) def test_transpose(ctx_factory): @@ -463,7 +470,7 @@ def test_magma_fermi_matrix_mul(ctx_factory): lp.auto_test_vs_ref(seq_knl, ctx, knl, op_count=[2*n**3/1e9], op_label=["GFlops"], - parameters={}) + parameters={}, blacklist_ref_vendors="pocl") def test_image_matrix_mul(ctx_factory): diff --git a/test/test_loopy.py b/test/test_loopy.py index 667d7365d412d9be6d9a51c8d63c9470ffede2f9..606eec7667ca3d76a215c3b487e8c93bc371c36e 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -529,6 +529,9 @@ def test_fuzz_code_generator(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) + if ctx.devices[0].platform.vendor.startswith("Advanced Micro"): + pytest.skip("crashes on AMD 15.12") + #from expr_fuzz import get_fuzz_examples #for expr, var_values in get_fuzz_examples(): for expr, var_values in generate_random_fuzz_examples(50): @@ -941,6 +944,27 @@ def test_double_sum(ctx_factory): n = 20 + knl = lp.make_kernel( + "{[i,j]: 0<=i,j<n }", + [ + "a = simul_reduce(sum, (i,j), i*j)", + "b = simul_reduce(sum, i, simul_reduce(sum, j, i*j))", + ], + assumptions="n>=1") + + evt, (a, b) = knl(queue, n=n) + + ref = sum(i*j for i in range(n) for j in range(n)) + assert a.get() == ref + assert b.get() == ref + + +def test_double_sum_made_unique(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + n = 20 + knl = lp.make_kernel( "{[i,j]: 0<=i,j<n }", [ @@ -949,9 +973,10 @@ def test_double_sum(ctx_factory): ], assumptions="n>=1") - cknl = lp.CompiledKernel(ctx, knl) + knl = lp.make_reduction_inames_unique(knl) + print(knl) - evt, (a, b) = cknl(queue, n=n) + evt, (a, b) = knl(queue, n=n) ref = sum(i*j for i in range(n) for j in range(n)) assert a.get() == ref @@ -1095,8 +1120,8 @@ def test_arg_guessing_with_reduction(ctx_factory): knl = lp.make_kernel( "{[i,j]: 0<=i,j<n }", """ - a = 1.5 + sum((i,j), i*j) - d = 1.5 + sum((i,j), b[i,j]) + a = 1.5 + simul_reduce(sum, (i,j), i*j) + d = 1.5 + simul_reduce(sum, (i,j), b[i,j]) b[i, j] = i*j c[i+j, j] = b[j,i] """, @@ -1892,19 +1917,22 @@ def test_poisson(ctx_factory): sdim = 3 knl = lp.make_kernel( - "{ [c,i,j,k,ell,ell2]: \ + "{ [c,i,j,k,ell,ell2,ell3]: \ 0 <= c < nels and \ 0 <= i < nbf and \ 0 <= j < nbf and \ 0 <= k < nqp and \ - 0 <= ell < sdim and \ - 0 <= ell2 < sdim }", + 0 <= ell,ell2 < sdim}", """ - dpsi(bf,k0,dir) := sum(ell2, DFinv[c,ell2,dir] * DPsi[bf,k0,ell2] ) - Ael[c,i,j] = J[c] * w[k] * sum(ell, dpsi(i,k,ell) * dpsi(j,k,ell)) + dpsi(bf,k0,dir) := \ + simul_reduce(sum, ell2, DFinv[c,ell2,dir] * DPsi[bf,k0,ell2] ) + Ael[c,i,j] = \ + J[c] * w[k] * sum(ell, dpsi(i,k,ell) * dpsi(j,k,ell)) """, assumptions="nels>=1 and nbf >= 1 and nels mod 4 = 0") + print(knl) + knl = lp.fix_parameters(knl, nbf=nbf, sdim=sdim, nqp=nqp) ref_knl = knl @@ -1975,7 +2003,7 @@ def test_generate_c_snippet(): u = var("u") from functools import partial - l_sum = partial(lp.Reduction, "sum") + l_sum = partial(lp.Reduction, "sum", allow_simultaneous=True) Instr = lp.Assignment # noqa @@ -2266,7 +2294,10 @@ def test_finite_difference_expr_subst(ctx_factory): lp.GlobalArg("u", shape="n+2"), ]) - fused_knl = lp.fuse_kernels([fin_diff_knl, flux_knl]) + fused_knl = lp.fuse_kernels([fin_diff_knl, flux_knl], + data_flow=[ + ("f", 1, 0) + ]) fused_knl = lp.set_options(fused_knl, write_cl=True) evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1)) diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py index 4350a3878469cb7ccd16ba5f0dda2d287d7f2136..3eacbaa2850b12ab1130a0f4b02ac5698bc9fab9 100644 --- a/test/test_numa_diff.py +++ b/test/test_numa_diff.py @@ -179,7 +179,7 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): hsv = lp.buffer_array(hsv, "rhsQ", ilp_inames, fetch_bounding_box=True, default_tag="for", - init_expression="0") + init_expression="0", store_expression="base + buffer") if opt_level == 5: tap_hsv = hsv @@ -242,7 +242,7 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): hsv = hsv.copy(name="horizontalStrongVolumeKernel") results = lp.auto_test_vs_ref(ref_hsv, ctx, hsv, parameters=dict(elements=300), - do_check=False, quiet=True) + quiet=True) elapsed = results["elapsed_wall"] diff --git a/test/test_sem_reagan.py b/test/test_sem_reagan.py index a00fce1776dd8d0722e59ae544ce439568a66d5d..f4b91b236a16a8a30f2962d92b83780343fe801a 100644 --- a/test/test_sem_reagan.py +++ b/test/test_sem_reagan.py @@ -47,16 +47,16 @@ def test_tim2d(ctx_factory): knl = lp.make_kernel( "[K] -> {[i,j,e,m,o,gi]: 0<=i,j,m,o<%d and 0<=e<K and 0<=gi<3}" % n, [ - "ur(a,b) := sum(o, D[a,o]*u[e,o,b])", - "us(a,b) := sum(o, D[b,o]*u[e,a,o])", + "ur(a,b) := simul_reduce(sum, o, D[a,o]*u[e,o,b])", + "us(a,b) := simul_reduce(sum, o, D[b,o]*u[e,a,o])", #"Gu(mat_entry,a,b) := G[mat_entry,e,m,j]*ur(m,j)", "Gux(a,b) := G$x[0,e,a,b]*ur(a,b)+G$x[1,e,a,b]*us(a,b)", "Guy(a,b) := G$y[1,e,a,b]*ur(a,b)+G$y[2,e,a,b]*us(a,b)", "lap[e,i,j] = " - " sum(m, D[m,i]*Gux(m,j))" - "+ sum(m, D[m,j]*Guy(i,m))" + " simul_reduce(sum, m, D[m,i]*Gux(m,j))" + "+ simul_reduce(sum, m, D[m,j]*Guy(i,m))" ], [