Compare revisions

37ca145f · 37ca145f · 37ca145f · adf1b10e · adf1b10e · adf1b10e
--- a/build-helpers/make-linux-build-docker.sh
+++ b/build-helpers/make-linux-build-docker.sh
-#! /bin/bash
-
-# should be run in this directory (build-helpers)
-
-if test "$1" = "--nodate"; then
-  TGT_NAME=loopy-centos6
-else
-  TGT_NAME=loopy-centos6-$(date +"%Y-%m-%d")
-fi
-
-echo "Generating $TGT_NAME..."
-
-set -e
-set -x
-
-docker pull centos:6
-
-CNT=$(docker create -t -v $(pwd):/mnt centos:6 /mnt/make-linux-build-docker-inner.sh)
-echo "working in container $CNT"
-
-docker start -i $CNT
-
-docker cp $CNT:/tmp/build/loopy/dist/loopy $(pwd) || true
-
-mv loopy $TGT_NAME
-
-docker rm $CNT
-
--- a/build-helpers/run-pyinstaller.sh
+++ b/build-helpers/run-pyinstaller.sh
-#! /bin/bash
-
-# run this from the loopy root directory
-
-rm -Rf dist build
-
-pyinstaller \
-  --workpath=build/pyinstaller \
-  build-helpers/loopy.spec
--- a/build-helpers/upload.sh
+++ b/build-helpers/upload.sh
-#! /bin/bash
-
-set -e
-
-scp "$1" tiker.net:public_html/pub/loopy-binaries/
--- a/contrib/c-integer-semantics.py
+++ b/contrib/c-integer-semantics.py
+#!/usr/bin/env python
+
+import ctypes
+from os import system
+
+
+# C source that main() writes to disk and compiles into a shared library.
+# It defines plain truncating division/modulo (cdiv, cmod) and, through the
+# LOOPY_CALL_WITH_INTEGER_TYPES macro, floor-division and modulo helpers for
+# each of the int8/int16/int32/int64 suffixes. The *_pos_b variants are the
+# versions that only handle a positive divisor b.
+# NOTE(review): the int8 variants are instantiated with plain `char`, whose
+# signedness is implementation-defined in C -- confirm this is intended.
+C_SRC = """
+#include <stdlib.h>
+#include <stdint.h>
+
+int64_t cdiv(int64_t a, int64_t b)
+{
+    return a/b;
+}
+
+int64_t cmod(int64_t a, int64_t b)
+{
+    return a%b;
+}
+
+#define LOOPY_CALL_WITH_INTEGER_TYPES(MACRO_NAME) \
+    MACRO_NAME(int8, char) \
+    MACRO_NAME(int16, short) \
+    MACRO_NAME(int32, int) \
+    MACRO_NAME(int64, long long)
+
+#define LOOPY_DEFINE_FLOOR_DIV(SUFFIX, TYPE) \
+    TYPE loopy_floor_div_##SUFFIX(TYPE a, TYPE b) \
+    { \
+        if ((a<0) != (b<0)) \
+            a = a - (b + (b<0) - (b>=0)); \
+        return a/b; \
+    }
+
+LOOPY_CALL_WITH_INTEGER_TYPES(LOOPY_DEFINE_FLOOR_DIV)
+#undef LOOPY_DEFINE_FLOOR_DIV
+
+#define LOOPY_DEFINE_FLOOR_DIV_POS_B(SUFFIX, TYPE) \
+    TYPE loopy_floor_div_pos_b_##SUFFIX(TYPE a, TYPE b) \
+    { \
+        if (a<0) \
+            a = a - (b-1); \
+        return a/b; \
+    }
+
+LOOPY_CALL_WITH_INTEGER_TYPES(LOOPY_DEFINE_FLOOR_DIV_POS_B)
+#undef LOOPY_DEFINE_FLOOR_DIV_POS_B
+
+
+#define LOOPY_DEFINE_MOD_POS_B(SUFFIX, TYPE) \
+    TYPE loopy_mod_pos_b_##SUFFIX(TYPE a, TYPE b) \
+    { \
+        TYPE result = a%b; \
+        if (result < 0) \
+            result += b; \
+        return result; \
+    }
+
+LOOPY_CALL_WITH_INTEGER_TYPES(LOOPY_DEFINE_MOD_POS_B)
+#undef LOOPY_DEFINE_MOD_POS_B
+
+#define LOOPY_DEFINE_MOD(SUFFIX, TYPE) \
+    TYPE loopy_mod_##SUFFIX(TYPE a, TYPE b) \
+    { \
+        TYPE result = a%b; \
+        if (result < 0 && b > 0) \
+            result += b; \
+        if (result > 0 && b < 0) \
+            result = result + b; \
+        return result; \
+    }
+
+LOOPY_CALL_WITH_INTEGER_TYPES(LOOPY_DEFINE_MOD)
+#undef LOOPY_DEFINE_MOD
+
+
+"""
+
+
+def main():
+    with open("int-experiments.c", "w") as outf:
+        outf.write(C_SRC)
+
+    system("gcc -Wall -shared int-experiments.c -o int-experiments.so")
+
+    int_exp = ctypes.CDLL("int-experiments.so")
+    for func in [
+            int_exp.cdiv,
+            int_exp.cmod,
+            int_exp.loopy_floor_div_int64,
+            int_exp.loopy_floor_div_pos_b_int64,
+            int_exp.loopy_mod_pos_b_int64,
+            int_exp.loopy_mod_int64,
+            ]:
+        func.argtypes = [ctypes.c_longlong, ctypes.c_longlong]
+        func.restype = ctypes.c_longlong
+
+    cmod = int_exp.cmod
+    int_floor_div = int_exp.loopy_floor_div_int64
+    int_floor_div_pos_b = int_exp.loopy_floor_div_pos_b_int64
+    int_mod_pos_b = int_exp.loopy_mod_pos_b_int64
+    int_mod = int_exp.loopy_mod_int64
+
+    m = 50
+
+    for a in range(-m, m):
+        for b in range(1, m):
+            cresult = int_floor_div_pos_b(a, b)
+            presult = a // b
+            assert cresult == presult
+            if cresult != presult:
+                print(a, b, cresult, presult)
+
+    for a in range(-m, m):
+        for b in range(-m, m):
+            if b == 0:
+                continue
+
+            cresult = int_floor_div(a, b)
+            presult = a // b
+            assert cresult == presult
+            if cresult != presult:
+                print(a, b, cresult, presult)
+
+    for a in range(-m, m):
+        for b in range(1, m):
+            cresult = int_mod_pos_b(a, b)
+            presult = a % b
+            assert cresult == presult
+
+    for a in range(-m, m):
+        for b in range(-m, m):
+            if b == 0:
+                continue
+
+            cresult = int_mod(a, b)
+            presult = a % b
+            assert cresult == presult
+            if cresult != presult:
+                print(a, b, cresult, presult)
+
+    # print(int_mod(552, -918), 552 % -918)
+    print(cmod(23, -11), 23 % -11)
+
+
+if __name__ == "__main__":
+    main()
--- a/contrib/mem-pattern-explorer/pattern_vis.py
+++ b/contrib/mem-pattern-explorer/pattern_vis.py
+import numpy as np
+
+
+# Inspired by a visualization used in the Halide tutorial
+# https://www.youtube.com/watch?v=3uiEyEKji0M
+
+
+def div_ceil(nr, dr):
+    return -(-nr // dr)
+
+
+def product(iterable):
+    from functools import reduce
+    from operator import mul
+    return reduce(mul, iterable, 1)
+
+
+class ArrayAccessPatternContext:
+    """Shared state for visualizing array access patterns.
+
+    Holds the "g" and "l" index extents, a logical timestamp and the list
+    of arrays registered with this context. Index values are handed out as
+    broadcastable NumPy arrays with ``len(gsize) + len(lsize)`` axes: the
+    "g" axes come first, followed by the "l" axes.
+    """
+
+    def __init__(self, gsize, lsize, subgroup_size=32, decay_constant=0.75):
+        """
+        :arg gsize: sequence with the extent of each "g" index axis.
+        :arg lsize: sequence with the extent of each "l" index axis.
+        :arg subgroup_size: divisor used by :meth:`nsubgroups`.
+        :arg decay_constant: stored for use by the registered arrays when
+            they compute their plot data.
+        """
+        self.lsize = lsize
+        self.gsize = gsize
+        self.subgroup_size = subgroup_size
+        self.timestamp = 0
+        self.decay_constant = decay_constant
+
+        # Total number of axes of every index array produced by g()/l().
+        self.ind_length = len(gsize) + len(lsize)
+
+        # Filled in by the arrays themselves when they are constructed.
+        self.arrays = []
+
+    def l(self, index):  # noqa: E743
+        """Return ``arange(lsize[index])`` laid out along "l" axis *index*.
+
+        The result has ``ind_length`` axes; all of them have length 1 except
+        axis ``len(gsize) + index``.
+        """
+        subscript = [np.newaxis] * self.ind_length
+        subscript[len(self.gsize) + index] = slice(None)
+
+        return np.arange(self.lsize[index])[tuple(subscript)]
+
+    def g(self, index):
+        """Return ``arange(gsize[index])`` laid out along "g" axis *index*.
+
+        The result has ``ind_length`` axes; all of them have length 1 except
+        axis *index*.
+        """
+        subscript = [np.newaxis] * self.ind_length
+        subscript[index] = slice(None)
+
+        return np.arange(self.gsize[index])[tuple(subscript)]
+
+    def nsubgroups(self):
+        """Return the product of ``lsize`` divided by ``subgroup_size``,
+        rounded up."""
+        return div_ceil(product(self.lsize), self.subgroup_size)
+
+    def animate(self, f, interval=200):
+        """Create a matplotlib animation with one subplot per registered array.
+
+        :arg f: callable returning an iterable; each item it produces
+            advances the timestamp by one and yields one animation frame.
+        :arg interval: passed through to ``FuncAnimation``.
+        :returns: the ``FuncAnimation`` object, which the caller has to keep
+            referenced until after ``plt.show()``.
+        """
+        import matplotlib.animation as animation
+        import matplotlib.pyplot as plt
+
+        fig = plt.figure()
+
+        plots = []
+        for iary, ary in enumerate(self.arrays):
+            ax = fig.add_subplot(1, len(self.arrays), 1+iary)
+            ax.set_title(ary.name)
+            plots.append(ary.plot(ax))
+
+        def data_gen():
+            # One frame per step of f(): advance time, then refresh every
+            # image from its array's current plot data.
+            for _ in f():
+                self.tick()
+
+                for ary, plot in zip(self.arrays, plots):
+                    plot.set_array(ary.get_plot_data())
+
+                fig.canvas.draw()
+                yield plots
+
+        # must be kept alive until after plt.show()
+        return animation.FuncAnimation(
+                fig, lambda x: x, data_gen,
+                blit=False, interval=interval, repeat=True)
+
+    def tick(self):
+        """Advance the logical timestamp by one."""
+        self.timestamp += 1
+
+
+class Array:
+    def __init__(self, ctx, name, shape, strides, elements_per_row=None):
+        # Each array element stores a tuple:
+        # (timestamp, subgroup, g0, g1, g2, ) of last access
+
+        assert len(shape) == len(strides)
+
+        self.nattributes = 2+len(ctx.gsize)
+
+        if elements_per_row is None:
+            if len(shape) > 1:
+                minstride = min(strides)
+                for sh_i, st_i in zip(shape, strides):
+                    if st_i == minstride:
+                        elements_per_row = sh_i
+                        break
+        else:
+            elements_per_row = 256
+
+        self.array = np.zeros((product(shape), self.nattributes,), dtype=np.int32)
+
+        self.ctx = ctx
+        self.name = name
+        self.shape = shape
+        self.strides = strides
+        self.elements_per_row = elements_per_row
+
+        ctx.arrays.append(self)
+
+    def __getitem__(self, index):
+        if not isinstance(index, tuple):
+            index = (index,)
+
+        assert len(index) == len(self.shape)
+
+        all_subscript = (np.newaxis,) * self.ctx.ind_length
+
+        def reshape_ind(ind):
+            if not isinstance(ind, np.ndarray):
+                return ind[all_subscript]
+
+            else:
+                assert len(ind.shape) == self.ctx.ind_length
+
+        lin_index = sum(
+                ind_i * stride_i
+                for ind_i, stride_i in zip(index, self.strides))
+
+        if not isinstance(lin_index, np.ndarray):
+            subscript = [np.newaxis] * self.ctx.ind_length
+            lin_index = np.array(lin_index)[subscript]
+
+        self.array[lin_index, 0] = self.ctx.timestamp
+        for i, _glength in enumerate(self.ctx.gsize):
+            if lin_index.shape[i] > 1:
+                self.array[lin_index, 2+i] = self.ctx.g(i)
+
+        workitem_index = 0
+        for i in range(len(self.ctx.lsize))[::-1]:
+            workitem_index = (
+                    workitem_index * self.ctx.lsize[i]
+                    + self.ctx.l(i))
+        subgroup = workitem_index//self.ctx.subgroup_size
+        self.array[lin_index, 1] = subgroup
+
+    def __setitem__(self, index, value):
+        self.__getitem__(index)
+
+    def get_plot_data(self):
+        nelements = self.array.shape[0]
+        base_shape = (
+                div_ceil(nelements, self.elements_per_row),
+                self.elements_per_row,)
+        shaped_array = np.zeros(
+                (*base_shape, self.nattributes),
+                dtype=np.float32)
+        shaped_array.reshape(-1, self.nattributes)[:nelements] = self.array
+
+        modulation = np.exp(
+                -self.ctx.decay_constant*(self.ctx.timestamp-shaped_array[:, :, 0]))
+
+        subgroup = shaped_array[:, :, 1]
+        if self.ctx.nsubgroups() > 1:
+            subgroup = subgroup/(self.ctx.nsubgroups()-1)
+        else:
+            subgroup.fill(1)
+
+        rgb_array = np.zeros((*base_shape, 3))
+        if 1:
+            if len(self.ctx.gsize) > 1:
+                # g.0 -> red
+                rgb_array[:, :, 0] = shaped_array[:, :, 2]/(self.ctx.gsize[0]-1)
+            if len(self.ctx.gsize) > 1:
+                # g.1 -> blue
+                rgb_array[:, :, 2] = shaped_array[:, :, 3]/(self.ctx.gsize[1]-1)
+        if 1:
+            rgb_array[:, :, 1] = subgroup
+
+        return rgb_array*modulation[:, :, np.newaxis]
+
+    def plot(self, ax, **kwargs):
+        return ax.imshow(
+                self.get_plot_data(), interpolation="nearest",
+                **kwargs)
+
+
+def show_example():
+    n = 2**7
+    n16 = div_ceil(n, 16)
+    ctx = ArrayAccessPatternContext(gsize=(n16, n16), lsize=(16, 16))
+    in0 = Array(ctx, "in0", (n, n), (n, 1))
+
+    if 0:
+        # knl a
+        i_inner = ctx.l(1)
+        i_outer = ctx.g(1)
+        k_inner = ctx.l(0)
+
+        def f():
+            for k_outer in range(n16):
+                in0[i_inner + i_outer*16, k_inner + k_outer*16]
+                yield
+    elif 0:
+        # knl b
+        j_inner = ctx.l(0)
+        j_outer = ctx.g(0)
+        k_inner = ctx.l(1)
+
+        def f():
+            for k_outer in range(n16):
+                in0[k_inner + k_outer*16, j_inner + j_outer*16]
+                yield
+
+    ani = ctx.animate(f)
+    import matplotlib.pyplot as plt
+    if 1:
+        plt.show()
+    else:
+        ani.save("access.mp4")
+
+
+def show_example_2():
+    bsize = 8
+    blocks = 3
+
+    ctx = ArrayAccessPatternContext(gsize=(1,), lsize=(1,),
+            decay_constant=0.005)
+    in0 = Array(ctx, "in0", (blocks*bsize, blocks*bsize), (blocks*bsize, 1))
+
+    def f():
+        for i_outer in range(blocks):
+            for j_outer in range(blocks):
+                for i_inner in range(bsize):
+                    for j_inner in range(bsize):
+                        in0[i_inner + i_outer*bsize, j_inner + j_outer*bsize]
+                        yield
+
+    ani = ctx.animate(f, interval=10)
+    import matplotlib.pyplot as plt
+    if 1:
+        plt.show()
+    else:
+        ani.save("access.mp4")
+
+
+if __name__ == "__main__":
+    show_example_2()
--- a/doc/Makefile
+++ b/doc/Makefile
@@ -3,7 +3,7 @@

 # You can set these variables from the command line.
 SPHINXOPTS    =
-SPHINXBUILD   = python ` which sphinx-build`
+SPHINXBUILD   = python `which sphinx-build`
 PAPER         =
 BUILDDIR      = _build


--- a/doc/conf.py
+++ b/doc/conf.py
-# -*- coding: utf-8 -*-
-#
-# loopy documentation build configuration file, created by
-# sphinx-quickstart on Tue Aug  9 13:40:49 2011.
-#
-# This file is execfile()d with the current directory set to its containing dir.
-#
-# Note that not all possible configuration values are present in this
-# autogenerated file.
-#
-# All configuration values have a default; values that are commented out
-# serve to show the default.
+import os
+from urllib.request import urlopen

-#import sys, os

-# If extensions (or modules to document with autodoc) are in another directory,
-# add these directories to sys.path here. If the directory is relative to the
-# documentation root, use os.path.abspath to make it absolute, like shown here.
-#sys.path.insert(0, os.path.abspath('.'))
+_conf_url = "https://raw.githubusercontent.com/inducer/sphinxconfig/main/sphinxconfig.py"
+with urlopen(_conf_url) as _inf:
+    exec(compile(_inf.read(), _conf_url, "exec"), globals())

-# -- General configuration -----------------------------------------------------
+copyright = "2016, Andreas Klöckner"

-# If your documentation needs a minimal Sphinx version, state it here.
-#needs_sphinx = '1.0'
-
-# Add any Sphinx extension module names here, as strings. They can be extensions
-# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
-extensions = [
-        'sphinx.ext.autodoc',
-        'sphinx.ext.intersphinx',
-        #'sphinx.ext.viewcode',
-        'sphinx.ext.doctest',
-        ]
-
-# Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
-
-# The suffix of source filenames.
-source_suffix = '.rst'
-
-# The encoding of source files.
-#source_encoding = 'utf-8-sig'
-
-# The master toctree document.
-master_doc = 'index'
-
-# General information about the project.
-project = u'loopy'
-copyright = u'2016, Andreas Klöckner'
-
-# The version info for the project you're documenting, acts as replacement for
-# |version| and |release|, also used in various other places throughout the
-# built documents.
-#
 # The short X.Y version.
 ver_dic = {}
 _version_source = "../loopy/version.py"
 with open(_version_source) as vpy_file:
    version_py = vpy_file.read()

-exec(compile(version_py, _version_source, 'exec'), ver_dic)
+os.environ["AKPYTHON_EXEC_IMPORT_UNAVAILABLE"] = "1"
+exec(compile(version_py, _version_source, "exec"), ver_dic)
 version = ".".join(str(x) for x in ver_dic["VERSION"])
 # The full version, including alpha/beta/rc tags.
 release = ver_dic["VERSION_TEXT"]
+del os.environ["AKPYTHON_EXEC_IMPORT_UNAVAILABLE"]

-# The language for content autogenerated by Sphinx. Refer to documentation
-# for a list of supported languages.
-#language = None
-
-# There are two options for replacing |today|: either, you set today to some
-# non-false value, then it is used:
-#today = ''
-# Else, today_fmt is used as the format for a strftime call.
-#today_fmt = '%B %d, %Y'
-
-# List of patterns, relative to source directory, that match files and
-# directories to ignore when looking for source files.
-exclude_patterns = ['_build']
-
-# The reST default role (used for this markup: `text`) to use for all documents.
-#default_role = None
-
-# If true, '()' will be appended to :func: etc. cross-reference text.
-#add_function_parentheses = True
-
-# If true, the current module name will be prepended to all description
-# unit titles (such as .. function::).
-#add_module_names = True
-
-# If true, sectionauthor and moduleauthor directives will be shown in the
-# output. They are ignored by default.
-#show_authors = False
-
-# The name of the Pygments (syntax highlighting) style to use.
-pygments_style = 'sphinx'
+exclude_patterns = ["_build"]

-# A list of ignored prefixes for module index sorting.
-#modindex_common_prefix = []
-
-
-# -- Options for HTML output ---------------------------------------------------
-
-html_theme = "alabaster"
-
-html_theme_options = {
-        "extra_nav_links": {
-            "🚀 Github": "https://github.com/inducer/loopy",
-            "💾 Download Releases": "https://pypi.python.org/pypi/loo.py",
-            }
+# Example configuration for intersphinx: refer to the Python standard library.
+intersphinx_mapping = {
+        "python": ("https://docs.python.org/3", None),
+        "numpy": ("https://numpy.org/doc/stable/", None),
+        "pytools": ("https://documen.tician.de/pytools", None),
+        "islpy": ("https://documen.tician.de/islpy", None),
+        "pyopencl": ("https://documen.tician.de/pyopencl", None),
+        "cgen": ("https://documen.tician.de/cgen", None),
+        "pymbolic": ("https://documen.tician.de/pymbolic", None),
+        "constantdict": ("https://matthiasdiener.github.io/constantdict/", None),
        }

-html_sidebars = {
-    '**': [
-        'about.html',
-        'navigation.html',
-        'relations.html',
-        'searchbox.html',
-    ]
-}
-
-# Theme options are theme-specific and customize the look and feel of a theme
-# further.  For a list of options available for each theme, see the
-# documentation.
-#html_theme_options = {}
-
-# Add any paths that contain custom themes here, relative to this directory.
-#html_theme_path = []
-
-# The name for this set of Sphinx documents.  If None, it defaults to
-# "<project> v<release> documentation".
-#html_title = None
-
-# A shorter title for the navigation bar.  Default is the same as html_title.
-#html_short_title = None
-
-# The name of an image file (relative to this directory) to place at the top
-# of the sidebar.
-#html_logo = None
-
-# The name of an image file (within the static path) to use as favicon of the
-# docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
-# pixels large.
-#html_favicon = None
-
-# Add any paths that contain custom static files (such as style sheets) here,
-# relative to this directory. They are copied after the builtin static files,
-# so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
-
-# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
-# using the given strftime format.
-#html_last_updated_fmt = '%b %d, %Y'
-
-# If true, SmartyPants will be used to convert quotes and dashes to
-# typographically correct entities.
-#html_use_smartypants = True
+nitpicky = True

-# Custom sidebar templates, maps document names to template names.
-#html_sidebars = {}
+nitpick_ignore_regex = [
+        ["py:class", r"typing_extensions\.(.+)"],
+        ["py:class", r"numpy\.u?int[0-9]+"],
+        ["py:class", r"numpy\.float[0-9]+"],
+        ["py:class", r"numpy\.complex[0-9]+"],

-# Additional templates that should be rendered to pages, maps page names to
-# template names.
-#html_additional_pages = {}
+        # Reference not found from "<unknown>"? I'm not even sure where to look.
+        ["py:class", r"ExpressionNode"],

-# If false, no module index is generated.
-#html_domain_indices = True
-
-# If false, no index is generated.
-#html_use_index = True
-
-# If true, the index is split into individual pages for each letter.
-#html_split_index = False
-
-# If true, links to the reST sources are added to the pages.
-html_show_sourcelink = False
-
-# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
-#html_show_sphinx = True
-
-# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
-#html_show_copyright = True
-
-# If true, an OpenSearch description file will be output, and all pages will
-# contain a <link> tag referring to it.  The value of this option must be the
-# base URL from which the finished HTML is served.
-#html_use_opensearch = ''
-
-# This is the file name suffix for HTML files (e.g. ".xhtml").
-#html_file_suffix = None
-
-# Output file base name for HTML help builder.
-htmlhelp_basename = 'loopydoc'
-
-
-# -- Options for LaTeX output --------------------------------------------------
-
-# The paper size ('letter' or 'a4').
-#latex_paper_size = 'letter'
-
-# The font size ('10pt', '11pt' or '12pt').
-#latex_font_size = '10pt'
-
-# Grouping the document tree into LaTeX files. List of tuples
-# (source start file, target name, title, author, documentclass [howto/manual]).
-latex_documents = [
-        ('index', 'loopy.tex', u'loopy Documentation',
-            u'Andreas Kloeckner', 'manual'),
-]
-
-# The name of an image file (relative to this directory) to place at the top of
-# the title page.
-#latex_logo = None
-
-# For "manual" documents, if this is true, then toplevel headings are parts,
-# not chapters.
-#latex_use_parts = False
-
-# If true, show page references after internal links.
-#latex_show_pagerefs = False
-
-# If true, show URL addresses after external links.
-#latex_show_urls = False
-
-# Additional stuff for the LaTeX preamble.
-#latex_preamble = ''
-
-# Documents to append as an appendix to all manuals.
-#latex_appendices = []
-
-# If false, no module index is generated.
-#latex_domain_indices = True
-
-
-# -- Options for manual page output --------------------------------------------
-
-# One entry per manual page. List of tuples
-# (source start file, name, description, authors, manual section).
-man_pages = [
-    ('index', 'loopy', u'loopy Documentation',
-     [u'Andreas Kloeckner'], 1)
-]
-
-
-# Example configuration for intersphinx: refer to the Python standard library.
-intersphinx_mapping = {
-    'https://docs.python.org/3': None,
-    'https://documen.tician.de/islpy': None,
-    'https://documen.tician.de/pyopencl': None,
-    'https://documen.tician.de/cgen': None,
-    'https://docs.scipy.org/doc/numpy/': None,
-    }
-
-autoclass_content = "class"
+        # Type aliases
+        ["py:class", r"InameStr"],
+        ["py:class", r"ConcreteCallablesTable"],
+        ["py:class", r"LoopNestTree"],
+        ["py:class", r"LoopTree"],
+        ["py:class", r"ToLoopyTypeConvertible"],
+        ["py:class", r"ToStackMatchConvertible"],
+        ]
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -18,43 +18,20 @@ When you run this script, the following kernel is generated, compiled, and execu

 (See the full example for how to print the generated code.)

-Want to try out loopy?
----------------------
-
-There's no need to go through :ref:`installation` if you'd just like to get a
-feel for what loopy is.  Instead, you may
-`download a self-contained Linux binary <https://gitlab.tiker.net/inducer/loopy/builds/1989/artifacts/browse/build-helpers/>`_.
-This is purposefully built on an ancient Linux distribution, so it should work
-on most versions of Linux that are currently out there.
-
-Once you have the binary, do the following::
-
-    chmod +x ./loopy-centos6
-    ./loopy-centos6 --target=opencl hello-loopy-lp.py
-    ./loopy-centos6 --target=cuda hello-loopy-lp.py
-    ./loopy-centos6 --target=ispc hello-loopy-lp.py
-
-Grab the example here: :download:`examples/python/hello-loopy.py <../examples/python/hello-loopy-lp.py>`.
-
-You may also donwload the most recent version by going to the `list of builds
-<https://gitlab.tiker.net/inducer/loopy/builds>`_, clicking on the newest one
-of type "CentOS binary", clicking on "Browse" under "Build Artifacts", then
-navigating to "build-helpers", and downloading the binary from there.
+.. _static-binary:

 Places on the web related to Loopy
 ----------------------------------

-* `Python package index <http://pypi.python.org/pypi/loo.py>`_ (download releases) Note the extra '.' in the PyPI identifier!
-
-* `Github <http://github.com/inducer/loopy>`_ (get latest source code, file bugs)
-* `Wiki <http://wiki.tiker.net/Loopy>`_ (read installation tips, get examples, read FAQ)
-* `Homepage <http://mathema.tician.de/software/loopy>`_
+* `Python package index <https://pypi.org/project/loopy>`_ (download releases)
+* `Github <https://github.com/inducer/loopy>`_ (get latest source code, file bugs)
+* `Homepage <https://mathema.tician.de/software/loopy>`_

 Table of Contents
 -----------------

 If you're only just learning about loopy, consider the following `paper
-<http://arxiv.org/abs/1405.7470>`_ on loo.py that may serve as a good
+<https://arxiv.org/abs/1405.7470>`_ on loopy that may serve as a good
 introduction.

 Please check :ref:`installation` to get started.
@@ -65,9 +42,14 @@ Please check :ref:`installation` to get started.
    tutorial
    ref_creation
    ref_kernel
+    ref_translation_unit
    ref_transform
+    ref_call
    ref_other
    misc
+    ref_internals
+    🚀 Github <https://github.com/inducer/loopy>
+    💾 Download Releases <https://pypi.org/project/loopy>

 Indices and tables
 ==================

--- a/doc/misc.rst
+++ b/doc/misc.rst
@@ -3,11 +3,21 @@
 Installation
 ============

-This command should install :mod:`loopy`::
+Option 0: Static Binary
+-----------------------
+
+If you would just like to experiment with :mod:`loopy`'s code transformation
+abilities, the easiest way to get loopy is to download a statically-linked
+Linux binary.

-    pip install loo.py
+See :ref:`static-binary` for details.

-(Note the extra "."!)
+Option 1: From Source, no PyOpenCL integration
+-----------------------------------------------
+
+This command should install :mod:`loopy`::
+
+    pip install loopy

 You may need to run this with :command:`sudo`.
 If you don't already have `pip <https://pypi.python.org/pypi/pip>`_,
@@ -17,19 +27,66 @@ run this beforehand::
    python get-pip.py

 For a more manual installation, `download the source
-<http://pypi.python.org/pypi/loo.py>`_, unpack it, and say::
+<https://pypi.org/project/loopy>`_, unpack it, and say::

    python setup.py install

 You may also clone its git repository::

-    git clone --recursive git://github.com/inducer/loopy
-    git clone --recursive http://git.tiker.net/trees/loopy.git
+    git clone --recursive https://github.com/inducer/loopy.git
+
+Option 2: From Conda Forge, with PyOpenCL integration
+-----------------------------------------------------
+
+This set of instructions is intended for computers running 64-bit Linux
+or macOS:
+
+#.  Make sure your system has the basics to build software.
+
+    On Debian derivatives (Ubuntu and many more),
+    installing ``build-essential`` should do the trick.
+
+    Everywhere else, just making sure you have the ``g++`` package should be
+    enough.
+
+#.  Install `miniforge <https://github.com/conda-forge/miniforge>`_.
+
+#.  ``export CONDA=/WHERE/YOU/INSTALLED/miniforge3``
+
+    If you accepted the default location, this should work:
+
+    ``export CONDA=$HOME/miniforge3``
+
+#.  ``$CONDA/bin/conda create -n dev``
+
+#.  ``source $CONDA/bin/activate dev``
+
+#.  ``conda install git pip pocl islpy pyopencl`` (Linux)
+
+    or
+
+    ``conda install osx-pocl-opencl git pip pocl islpy pyopencl`` (OS X)
+
+#.  Type the following command::
+
+        pip install git+https://github.com/inducer/loopy
+
+Next time you want to use :mod:`loopy`, just run the following command::
+
+    source /WHERE/YOU/INSTALLED/miniforge3/bin/activate dev
+
+You may also like to add this to a startup file (like :file:`$HOME/.bashrc`) or create an alias for it.
+
+See the `PyOpenCL installation instructions
+<https://documen.tician.de/pyopencl/misc.html#installation>`_ for options
+regarding OpenCL drivers.

 User-visible Changes
 ====================

-Version 2016.2
+See also :ref:`language-versioning`.
+
+Version 2018.1
 --------------
 .. note::

@@ -57,7 +114,7 @@ Licensing

 Loopy is licensed to you under the MIT/X Consortium license:

-Copyright (c) 2009-13 Andreas Klöckner and Contributors.
+Copyright (c) 2009-17 Andreas Klöckner and Contributors.

 Permission is hereby granted, free of charge, to any person
 obtaining a copy of this software and associated documentation
@@ -101,7 +158,7 @@ In the meantime, you can generate code simply by saying::
    print(cg_result.host_code())
    print(cg_result.device_code())

-Additionally, for C-based languages, header defintions are available via::
+Additionally, for C-based languages, header definitions are available via::

    loopy.generate_header(knl)

@@ -131,7 +188,7 @@ source of examples. Here are some links:
 Here's a more complicated example of a loopy code:

 .. literalinclude:: ../examples/python/find-centers.py
-    :language: c
+    :language: python

 This example is included in the :mod:`loopy` distribution as
 :download:`examples/python/find-centers.py <../examples/python/find-centers.py>`.
@@ -193,7 +250,7 @@ This list is always growing, but here are a few pointers:
 * Precompute subexpressions:

  Use a :ref:`substitution-rule` to assign a name to a subexpression,
-  using may be :func:`loopy.assignment_to_subst` or :func:`extract_subst`.
+  for example using :func:`loopy.assignment_to_subst` or :func:`loopy.extract_subst`.
  Then use :func:`loopy.precompute` to create an (array or scalar)
  temporary with precomputed values.

@@ -232,12 +289,12 @@ This list is always growing, but here are a few pointers:
  Use :func:`loopy.tag_inames` with the ``"vec"`` iname tag.
  Note that the corresponding axis of an array must
  also be tagged using the ``"vec"`` array axis tag
-  (using :func:`tag_array_axes`) in order for vector code to be
+  (using :func:`loopy.tag_array_axes`) in order for vector code to be
  generated.

  Vectorized loops (and array axes) must have a fixed size. (See either
-  :func:`split_iname` or :func:`fix_parameters` along with
-  :func:`split_array_axis`.)
+  :func:`loopy.split_iname` or :func:`loopy.fix_parameters` along with
+  :func:`loopy.split_array_axis`.)

 * Reuse of Temporary Storage

@@ -246,7 +303,7 @@ This list is always growing, but here are a few pointers:

 * SoA $\leftrightarrow$ AoS

-  Use :func:`tag_array_axes` with the ``"sep"`` array axis tag
+  Use :func:`loopy.tag_array_axes` with the ``"sep"`` array axis tag
  to generate separate arrays for each entry of a short, fixed-length
  array axis.

@@ -257,7 +314,7 @@ This list is always growing, but here are a few pointers:

  Use :func:`loopy.tag_inames` with the ``"ilp"`` tag.
  ILP loops must have a fixed size. (See either
-  :func:`split_iname` or :func:`fix_parameters`.)
+  :func:`loopy.split_iname` or :func:`loopy.fix_parameters`.)

 * Type inference

@@ -275,12 +332,69 @@ This list is always growing, but here are a few pointers:

 * Interface with your own library functions

-  Use :func:`loopy.register_function_manglers`.
+  See :ref:`func-interface` for details.

 * Loop collapse

  Use :func:`loopy.join_inames`.

+In what sense does Loopy support vectorization?
+-----------------------------------------------
+
+There are really two ways in which the OpenCL/CUDA model of computation exposes
+vectorization:
+
+* "SIMT": The user writes scalar program instances and either the compiler or
+  the hardware joins the individual program instances into vectors of a
+  hardware-given length for execution.
+
+* "Short vectors": This type of vectorization is based on vector types,
+  e.g. ``float4``, which support arithmetic with implicit vector semantics
+  as well as a number of 'intrinsic' functions.
+
+Loopy supports both. The first one, SIMT, is accessible by tagging inames with,
+e.g., ``l.0``. Accessing the second one requires using both execution- and
+data-reshaping capabilities in loopy. To start with, you need an array that
+has an axis with the length of the desired vector. If that's not yet available,
+you may use :func:`loopy.split_array_axis` to produce one. Similarly, you need
+an iname whose bounds match those of the desired vector length. Again, if you
+don't already have one, :func:`loopy.split_iname` will easily produce one.
+Lastly, both the array axis and the iname need the implementation tag ``"vec"``.
+Here is an example of this machinery in action:
+
+.. literalinclude:: ../examples/python/vector-types.py
+    :language: python
+
+Note how the example slices off the last 'slab' of iterations to ensure that
+the bulk of the iteration does not require conditionals which would prevent
+successful vectorization. This generates the following code:
+
+.. literalinclude:: ../examples/python/vector-types.cl
+    :language: c
+
+What is the story with language versioning?
+-------------------------------------------
+
+The idea is to keep supporting multiple versions at a time. There's a
+tension in loopy between the need to build code that keeps working
+unchanged for some number of years, and needing the language to
+evolve--not just as a research vehicle, but also to enable it to respond
+to emerging needs in applications and hardware.
+
+The idea is not to support all versions indefinitely, merely to allow
+users to upgrade on their own schedule on the scale of a couple years.
+Warnings about needing to upgrade would get noisier as a version nears
+deprecation. In a way, it is intended to be a version of Python's
+`__future__` flags, which IMO have served the language tremendously
+well.
+
+One can also obtain the current language version programmatically:
+:data:`loopy.MOST_RECENT_LANGUAGE_VERSION`.
+But pinning your code to that would mean choosing to not use the
+potentially valuable guarantee to keep existing code working unchanged
+for a while. Instead, it might be wiser to just grab the version of the
+language current at the time of writing the code.
+
 Uh-oh. I got a scheduling error. Any hints?
 -------------------------------------------

@@ -325,7 +439,7 @@ If you use loopy for your work and find its approach helpful, please
 consider citing the following article.

    A. Klöckner. `Loo.py: transformation-based code generation for GPUs and
-    CPUs <http://arxiv.org/abs/1405.7470>`_. Proceedings of ARRAY '14: ACM
+    CPUs <https://arxiv.org/abs/1405.7470>`_. Proceedings of ARRAY '14: ACM
    SIGPLAN Workshop on Libraries, Languages, and Compilers for Array
    Programming. Edinburgh, Scotland.

@@ -342,14 +456,33 @@ Here's a Bibtex entry for your convenience::
       doi = "{10.1145/2627373.2627387}",
    }

+Getting help
+============
+
+Email the friendly folks on the `loopy mailing list <https://lists.tiker.net/listinfo/loopy>`_.
+
 Acknowledgments
 ===============

-Andreas Klöckner's work on :mod:`loopy` was supported in part by
+Work on loopy was supported in part by
+
+- the Department of Energy, National Nuclear Security Administration, under Award Number DE-NA0003963,
+- the US Navy ONR, under grant number N00014-14-1-0117, and
+- the US National Science Foundation under grant numbers DMS-1418961, CCF-1524433, DMS-1654756, SHF-1911019, and OAC-1931577.
+
+AK also gratefully acknowledges a hardware gift from Nvidia Corporation.
+
+The views and opinions expressed herein do not necessarily reflect those of the funding agencies.
+
+Cross-References to Other Documentation
+=======================================
+
+.. currentmodule:: numpy
+
+.. class:: int16
+
+    See :class:`numpy.generic`.

-* US Navy ONR grant number N00014-14-1-0117
-* the US National Science Foundation under grant numbers DMS-1418961 and CCF-1524433.
+.. class:: complex128

-AK also gratefully acknowledges a hardware gift from Nvidia Corporation.  The
-views and opinions expressed herein do not necessarily reflect those of the
-funding agencies.
+    See :class:`numpy.generic`.
--- a/doc/ref_call.rst
+++ b/doc/ref_call.rst
+.. currentmodule:: loopy
+
+
+.. _func-interface:
+
+Function Interface
+==================
+
+
+Resolving and specialization
+----------------------------
+
+In :mod:`loopy`, a :class:`loopy.TranslationUnit` is a collection of callables
+and entrypoints. Callables are of type
+:class:`loopy.kernel.function_interface.InKernelCallable`. Functions start life
+as simple :class:`pymbolic.primitives.Call` nodes. Call resolution turns the function
+identifiers in those calls into :class:`~loopy.symbolic.ResolvedFunction` objects.
+Each resolved function has an entry in :attr:`TranslationUnit.callables_table`.
+The process of realizing a function as a
+:class:`~loopy.kernel.function_interface.InKernelCallable` is referred to as
+resolving.
+
+During code generation for a :class:`~loopy.TranslationUnit`, a (resolved) callable
+is *specialized* depending on the types and shapes of the arguments passed at a
+call site. For example, a call to ``sin(x)`` in :mod:`loopy` is type-generic to
+begin with, but it is later specialized to either ``sinf``, ``sin`` or ``sinl``
+depending on the type of its argument ``x``. A callable's behavior during type
+or shape specialization is encoded via
+:meth:`~loopy.InKernelCallable.with_types` and
+:meth:`~loopy.InKernelCallable.with_descrs`.
+
+
+Registering callables
+---------------------
+
+A user can *register* callables within a :class:`~loopy.TranslationUnit` to
+allow loopy to resolve calls not pre-defined in :mod:`loopy`. In :mod:`loopy`,
+we typically aim to expose all the standard math functions defined for
+a :class:`~loopy.target.TargetBase`. Other foreign functions could be invoked by
+*registering* them.
+
+An example demonstrating registering a ``CBlasGemv`` as a loopy callable:
+
+.. literalinclude:: ../examples/python/call-external.py
+
+Call Instruction for a kernel call
+----------------------------------
+
+At a call-site involving a call to a :class:`loopy.LoopKernel`, the arguments to
+the call must be ordered by the order of input arguments of the callee kernel.
+Similarly, the assignees must be ordered by the order of callee kernel's output
+arguments. Since a :class:`~loopy.kernel.data.KernelArgument` can be both an
+input and an output, such arguments would be a part of the call instruction's
+assignees as well as the call expression node's parameters.
+
+Entry points
+------------
+
+Only callables in :attr:`loopy.TranslationUnit.entrypoints` can be called from
+the outside. All other callables are only visible from within the translation
+unit, similar to C's ``static`` functions.
+
+Reference
+---------
+
+.. automodule:: loopy.kernel.function_interface
--- a/doc/ref_creation.rst
+++ b/doc/ref_creation.rst
-.. module:: loopy
-.. moduleauthor:: Andreas Kloeckner <inform@tiker.net>
-
+.. currentmodule:: loopy
 .. _creating-kernels:

 Reference: Creating Kernels
@@ -30,4 +28,11 @@ To Copy between Data Formats

 .. autofunction:: make_copy_kernel

+Einstein summation convention kernels
+-------------------------------------
+
+.. autofunction:: make_einsum
+
+.. automodule:: loopy.version
+
 .. vim: tw=75:spell:fdm=marker
--- a/doc/ref_internals.rst
+++ b/doc/ref_internals.rst
+Reference: Documentation for Internal API
+=========================================
+
+Targets
+-------
+
+See also :ref:`targets`.
+
+.. automodule:: loopy.target.c
+
+Symbolic
+--------
+
+See also :ref:`expression-syntax`.
+
+.. automodule:: loopy.symbolic
+
+Types
+-----
+
+DTypes of variables in a :class:`loopy.LoopKernel` must be picklable, so in
+the codegen pipeline user-provided types are converted to
+:class:`loopy.types.LoopyType`.
+
+.. automodule:: loopy.types
+
+Type inference
+^^^^^^^^^^^^^^
+
+.. automodule:: loopy.type_inference
+
+Codegen
+-------
+
+.. automodule:: loopy.codegen
+
+Reduction Operation
+-------------------
+
+.. automodule:: loopy.library.reduction
+
+Iname Tags
+----------
+
+.. automodule:: loopy.kernel.data
+
+Array
+-----
+
+.. automodule:: loopy.kernel.array
+
+Checks
+------
+
+.. automodule:: loopy.check
+
+Schedule
+--------
+
+.. automodule:: loopy.schedule
+.. automodule:: loopy.schedule.tools
+.. automodule:: loopy.schedule.tree
+
+
--- a/doc/ref_kernel.rst
+++ b/doc/ref_kernel.rst
@@ -3,6 +3,72 @@
 Reference: Loopy's Model of a Kernel
 ====================================

+What Types of Computation can a Loopy Program Express?
+------------------------------------------------------
+
+Loopy programs consist of an a-priori unordered set of statements, operating
+on :math:`n`-dimensional array variables.
+
+Arrays consist of "plain old data" and structures thereof, as describable
+by a :class:`numpy.dtype`.  The n-dimensional shape of these arrays is
+given by a tuple of expressions at most affine in parameters that are
+fixed for the duration of program execution.
+Each array variable in the program is either an argument or a temporary
+variable.  A temporary variable is only live within the program, while
+argument variables are accessible outside the program and constitute the
+program's inputs and outputs.
+
+A statement (still called 'instruction' in some places, cf.
+:class:`loopy.InstructionBase`) encodes an assignment to an entry of an array.
+The right-hand side of an assignment consists of an expression that may
+consist of arithmetic operations and calls to functions.
+If the outermost operation of the RHS expression is a function call,
+the RHS value may be a tuple, and multiple (still scalar) arrays appear
+as LHS values. (This is the only sense in which tuple types are supported.)
+Each statement is parameterized by zero or more loop variables ("inames").
+A statement is executed once for each integer point defined by the domain
+forest for the iname tuple given for that statement
+(:attr:`loopy.InstructionBase.within_inames`). Each execution of a
+statement (with specific values of the inames) is called a *statement
+instance*.  Dependencies between these instances as well as instances of
+other statements are encoded in the program representation and specify permissible
+execution orderings.  (The semantics of the dependencies are `being
+sharpened <https://github.com/inducer/loopy/pull/168>`__.) Assignments
+(comprising the evaluation of the RHS and the assignment to the LHS) may
+be specified to be atomic.
+
+The basic building blocks of the domain forest are sets given as
+conjunctions of equalities and inequalities of quasi-affine expressions on
+integer tuples, called domains, and represented as instances of
+:class:`islpy.BasicSet`. The entries of each integer tuple are
+either *parameters* or *inames*. Each domain may optionally have a *parent
+domain*. Parameters of parent-less domains are given by value arguments
+supplied to the program that will remain unchanged during program
+execution. Parameters of domains with parents may be
+
+- run-time-constant value arguments to the program, or
+- inames from parent domains, or
+- scalar, integer temporary variables that are written by statements
+  with iteration domains controlled by a parent domain.
+
+For each tuple of concrete parameter values, the set of iname tuples must be
+finite. Each iname is defined by exactly one domain.
+
+For a tuple of inames, the domain forest defines an iteration domain
+by finding all the domains defining the inames involved, along with their
+parent domains. The resulting tree of domains may contain multiple roots,
+but no branches. The iteration domain is then constructed by intersecting
+these domains and constructing the projection of that set onto the space
+given by the required iname tuple. Observe that, via the parent-child
+domain mechanism, imperfectly-nested and data-dependent loops become
+expressible.
+
+The set of functions callable from the language is predefined by the system.
+Additional functions may be defined by the user by registering them. It is
+not currently possible to define functions from within Loopy, however work
+is progressing on permitting this. Even once this is allowed, recursion
+will not be permitted.
+
 .. _domain-tree:

 Loop Domain Forest
@@ -130,9 +196,12 @@ Iname Implementation Tags
 Tag                             Meaning
 =============================== ====================================================
 ``None`` | ``"for"``            Sequential loop
+``"ord"``                       Forced-order sequential loop
 ``"l.N"``                       Local (intra-group) axis N ("local")
 ``"g.N"``                       Group-number axis N ("group")
 ``"unr"``                       Unroll
+``"unr_hint"``                  Unroll using compiler directives
+``"unr_hint.N"``                Unroll at most N times using compiler directives
 ``"ilp"`` | ``"ilp.unr"``       Unroll using instruction-level parallelism
 ``"ilp.seq"``                   Realize parallel iname as innermost loop
 ``"like.INAME"``                Can be used when tagging inames to tag like another
@@ -150,6 +219,42 @@ Tag                             Meaning

 .. }}}

+Identifiers
+-----------
+
+Reserved Identifiers
+^^^^^^^^^^^^^^^^^^^^
+
+The identifier prefix ``_lp_`` is reserved for internal usage; when creating
+*inames*, *argument names*, *temporary variable names*, *substitution rule
+names*, *instruction IDs*, and other identifiers, users should *not* use names
+beginning with ``_lp_``.  This prefix is used for identifiers created
+internally when operating on Loopy's kernel IR. For Loopy developers, further
+information on name prefixes used within submodules is below.
+
+Identifier Registry
+^^^^^^^^^^^^^^^^^^^
+
+Functionality in :mod:`loopy` *must* use identifiers beginning with ``_lp_`` for
+all internally-created identifiers. Additionally, each name beginning with
+``_lp_`` must start with one of the reserved prefixes below. New prefixes may
+be registered by adding them to the table below. New prefixes may not themselves
+be the prefix of an existing prefix.
+
+**Reserved Identifier Prefixes**
+
+======================= ==================================
+Reserved Prefix         Usage (module or purpose)
+======================= ==================================
+``_lp_linchk_``         ``loopy.linearization.checker``
+======================= ==================================
+
+.. note::
+
+    Existing Loopy code may not yet fully satisfy these naming requirements.
+    Name changes are in progress, and prefixes will be added to this registry
+    as they are created.
+
 .. _instructions:

 Instructions
@@ -157,6 +262,7 @@ Instructions

 .. {{{

+.. autoclass:: HappensAfter
 .. autoclass:: InstructionBase

 .. _assignments:
@@ -326,15 +432,30 @@ Expressions
 Loopy's expressions are a slight superset of the expressions supported by
 :mod:`pymbolic`.

-* ``if``
-* ``elif`` (following an ``if``)
-* ``else`` (following an ``if`` / ``elif``)
+* ``if(cond, then, else_)``
+
+* ``a[[ 8*i + j ]]``: Linear subscripts.
+  See :class:`loopy.symbolic.LinearSubscript`.
+
 * ``reductions``
-    * duplication of reduction inames
+  See :class:`loopy.symbolic.Reduction`.
+
    * ``reduce`` vs ``simul_reduce``
+
 * complex-valued arithmetic
+
 * tagging of array access and substitution rule use ("$")
+  See :class:`loopy.symbolic.TaggedVariable`.
+
 * ``indexof``, ``indexof_vec``
+* ``cast(type, value)``: No parse syntax currently.
+  See :class:`loopy.symbolic.TypeCast`.
+
+* If constants in expressions are subclasses of :class:`numpy.generic`,
+  generated code will contain literals of exactly that type, making them
+  *explicitly typed*. Constants given as Python types such as :class:`int`,
+  :class:`float` or :class:`complex` are called *implicitly* typed and
+  adapt to the type of the expected result.

 TODO: Functions
 TODO: Reductions
@@ -342,6 +463,7 @@ TODO: Reductions
 Function Call Instructions
 ^^^^^^^^^^^^^^^^^^^^^^^^^^

+.. automodule:: loopy
 .. autoclass:: CallInstruction

 C Block Instructions
@@ -352,12 +474,14 @@ C Block Instructions
 Atomic Operations
 ^^^^^^^^^^^^^^^^^

-.. autoclass:: memory_ordering
+.. autoclass:: MemoryOrdering

-.. autoclass:: memory_scope
+.. autoclass:: MemoryScope

 .. autoclass:: VarAtomicity

+.. autoclass:: OrderedAtomic
+
 .. autoclass:: AtomicInit

 .. autoclass:: AtomicUpdate
@@ -372,6 +496,12 @@ Barrier Instructions

 .. autoclass:: BarrierInstruction

+Instruction Tags
+^^^^^^^^^^^^^^^^
+
+.. autoclass:: LegacyStringInstructionTag
+.. autoclass:: UseStreamingStoreTag
+
 .. }}}

 Data: Arguments and Temporaries
@@ -388,24 +518,14 @@ Arguments
 ^^^^^^^^^

 .. autoclass:: KernelArgument
-    :members:
-    :undoc-members:

 .. autoclass:: ValueArg
-    :members:
-    :undoc-members:

-.. autoclass:: GlobalArg
-    :members:
-    :undoc-members:
+.. autoclass:: ArrayArg

 .. autoclass:: ConstantArg
-    :members:
-    :undoc-members:

 .. autoclass:: ImageArg
-    :members:
-    :undoc-members:

 .. _temporaries:

@@ -415,11 +535,9 @@ Temporary Variables
 Temporary variables model OpenCL's ``private`` and ``local`` address spaces. Both
 have the lifetime of a kernel invocation.

-.. autoclass:: temp_var_scope
+.. autoclass:: AddressSpace

 .. autoclass:: TemporaryVariable
-    :members:
-    :undoc-members:

 .. _types:

@@ -538,10 +656,10 @@ Helper values

 .. {{{

-.. autoclass:: auto
-
 .. autoclass:: UniqueName

+.. autoclass:: Optional
+
 .. }}}

 Libraries: Extending and Interfacing with External Functionality
@@ -573,10 +691,18 @@ The Kernel Object
 Do not create :class:`LoopKernel` objects directly. Instead, refer to
 :ref:`creating-kernels`.

-.. autoclass:: LoopKernel
+.. automodule:: loopy.kernel
+
+Implementation Details: The Base Array
+--------------------------------------
+
+All array-like data in :mod:`loopy` (such as :class:`ArrayArg` and
+:class:`TemporaryVariable`) derive from a single, shared base array type,
+described next.
+
+.. currentmodule:: loopy.kernel.array
+
+.. autoclass:: ArrayBase

-.. autoclass:: kernel_state
-    :members:
-    :undoc-members:

 .. vim: tw=75:spell:fdm=marker
--- a/doc/ref_other.rst
+++ b/doc/ref_other.rst
 Reference: Other Functionality
 ==============================

+Auxiliary Data Types
+--------------------
+
+.. automodule:: loopy.typing
+
 Obtaining Kernel Performance Statistics
 ---------------------------------------

@@ -9,6 +14,24 @@ Obtaining Kernel Performance Statistics
 Controlling caching
 -------------------

+.. envvar:: LOOPY_NO_CACHE
+.. envvar:: CG_NO_CACHE
+
+    By default, loopy will cache (on disk) the result of various stages
+    of code generation to speed up future code generation of the same kernel.
+    By setting the environment variables :envvar:`LOOPY_NO_CACHE` or
+    :envvar:`CG_NO_CACHE` to any
+    string that :func:`pytools.strtobool` evaluates as ``True``, this caching
+    is suppressed.
+
+
+.. envvar:: LOOPY_ABORT_ON_CACHE_MISS
+
+    If set to a string that :func:`pytools.strtobool` evaluates as ``True``,
+    loopy will raise an exception if a cache miss occurs. This can be useful
+    for debugging cache-related issues. For example, it can be used to automatically test whether caching is successful for a particular code, by setting this variable to ``True`` and re-running the code.
+
+
 .. autofunction:: set_caching_enabled

 .. autoclass:: CacheMode
@@ -16,10 +39,11 @@ Controlling caching
 Running Kernels
 ---------------

-In addition to simply calling kernels using :class:`LoopKernel.__call__`,
-the following underlying functionality may be used:
+Use :class:`TranslationUnit.executor` to bind a translation unit
+to execution resources, and then use :class:`ExecutorBase.__call__`
+to invoke the kernel.

-.. autoclass:: CompiledKernel
+.. autoclass:: ExecutorBase

 Automatic Testing
 -----------------
@@ -44,3 +68,4 @@ following always works::

 .. autofunction:: show_dependency_graph

+.. autofunction:: t_unit_to_python
--- a/doc/ref_transform.rst
+++ b/doc/ref_transform.rst
@@ -50,6 +50,10 @@ Influencing data access

 .. autofunction:: set_array_axis_names

+.. automodule:: loopy.transform.privatize
+
+.. autofunction:: allocate_temporaries_for_base_storage
+
 Padding Data
 ------------

@@ -74,6 +78,8 @@ Manipulating Instructions

 .. autofunction:: add_nosync

+.. autofunction:: add_barrier
+
 Registering Library Routines
 ----------------------------

@@ -83,8 +89,6 @@ Registering Library Routines

 .. autofunction:: register_symbol_manglers

-.. autofunction:: register_function_manglers
-
 Modifying Arguments
 -------------------

@@ -98,7 +102,7 @@ Modifying Arguments

 .. autofunction:: rename_argument

-.. autofunction:: set_temporary_scope
+.. autofunction:: set_temporary_address_space

 Creating Batches of Operations
 ------------------------------
@@ -114,7 +118,7 @@ Finishing up

 .. autofunction:: generate_loop_schedules

-.. autofunction:: get_one_scheduled_kernel
+.. autofunction:: get_one_linearized_kernel

 .. autofunction:: save_and_reload_temporaries

@@ -140,4 +144,3 @@ TODO: Matching instruction tags
 .. automodule:: loopy.match

 .. vim: tw=75:spell
-
--- a/doc/ref_translation_unit.rst
+++ b/doc/ref_translation_unit.rst
+.. currentmodule:: loopy
+
+Translation Units
+=================
+
+.. automodule:: loopy.translation_unit
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
--- a/examples/fortran/foo.floopy
+++ b/examples/fortran/foo.floopy
-subroutine fill(out, a, n)
-  implicit none
-
-  real_type a, out(n)
-  integer n, i
-
-  do i = 1, n
-    out(i) = a
-  end do
-  do i = 1, n
-    out(i) = out(i) * factor
-  end do
-end
-
-!$loopy begin
-!
-! SOURCE = lp.c_preprocess(SOURCE, [
-!       "factor 4.0",
-!       "real_type real*8",
-!       ])
-! fill, = lp.parse_fortran(SOURCE, FILENAME)
-! fill = lp.split_iname(fill, "i", 128,
-!     outer_tag="g.0", inner_tag="l.0")
-! fill = lp.split_iname(fill, "i_1", 128,
-!     outer_tag="g.0", inner_tag="l.0")
-! RESULT = [fill]
-!
-!$loopy end
-
-! vim:filetype=floopy
--- a/examples/fortran/ipython-integration-demo.ipynb
+++ b/examples/fortran/ipython-integration-demo.ipynb
 {
- "metadata": {
-  "name": "",
-  "signature": "sha256:c9f8334aa7aa4a5ad1437fa5871aafa52bbc9131271d9e90e7be47d22725cc94"
- },
- "nbformat": 3,
- "nbformat_minor": 0,
- "worksheets": [
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Loopy IPython Integration Demo"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%load_ext loopy.ipython_ext"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Without transform code"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%fortran_kernel\n",
+    "\n",
+    "subroutine fill(out, a, n)\n",
+    "  implicit none\n",
+    "\n",
+    "  real*8 a, out(n)\n",
+    "  integer n, i\n",
+    "\n",
+    "  do i = 1, n\n",
+    "    out(i) = a\n",
+    "  end do\n",
+    "end"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(prog)  # noqa: F821"
+   ]
+  },
  {
-   "cells": [
-    {
-     "cell_type": "markdown",
-     "metadata": {},
-     "source": [
-      "# Loopy IPython Integration Demo"
-     ]
-    },
-    {
-     "cell_type": "code",
-     "collapsed": false,
-     "input": [
-      "%load_ext loopy.ipython_ext"
-     ],
-     "language": "python",
-     "metadata": {},
-     "outputs": [],
-     "prompt_number": 1
-    },
-    {
-     "cell_type": "markdown",
-     "metadata": {},
-     "source": [
-      "## Without transform code"
-     ]
-    },
-    {
-     "cell_type": "code",
-     "collapsed": false,
-     "input": [
-      "%%fortran_kernel\n",
-      "\n",
-      "subroutine fill(out, a, n)\n",
-      "  implicit none\n",
-      "\n",
-      "  real*8 a, out(n)\n",
-      "  integer n, i\n",
-      "\n",
-      "  do i = 1, n\n",
-      "    out(i) = a\n",
-      "  end do\n",
-      "end"
-     ],
-     "language": "python",
-     "metadata": {},
-     "outputs": [],
-     "prompt_number": 2
-    },
-    {
-     "cell_type": "code",
-     "collapsed": false,
-     "input": [
-      "print(fill)"
-     ],
-     "language": "python",
-     "metadata": {},
-     "outputs": [
-      {
-       "output_type": "stream",
-       "stream": "stdout",
-       "text": [
-        "---------------------------------------------------------------------------\n",
-        "KERNEL: fill\n",
-        "---------------------------------------------------------------------------\n",
-        "ARGUMENTS:\n",
-        "a: ValueArg, type: float64\n",
-        "n: ValueArg, type: int32\n",
-        "out: GlobalArg, type: float64, shape: (n), dim_tags: (N0:stride:1)\n",
-        "---------------------------------------------------------------------------\n",
-        "DOMAINS:\n",
-        "[n] -> { [i] : i >= 0 and i <= -1 + n }\n",
-        "---------------------------------------------------------------------------\n",
-        "INAME IMPLEMENTATION TAGS:\n",
-        "i: None\n",
-        "---------------------------------------------------------------------------\n",
-        "INSTRUCTIONS:\n",
-        "[i]                                  out[i] <- a   # insn0\n",
-        "---------------------------------------------------------------------------\n"
-       ]
-      }
-     ],
-     "prompt_number": 3
-    },
-    {
-     "cell_type": "markdown",
-     "metadata": {},
-     "source": [
-      "## With transform code"
-     ]
-    },
-    {
-     "cell_type": "code",
-     "collapsed": false,
-     "input": [
-      "split_amount = 128"
-     ],
-     "language": "python",
-     "metadata": {},
-     "outputs": [],
-     "prompt_number": 4
-    },
-    {
-     "cell_type": "code",
-     "collapsed": false,
-     "input": [
-      "%%transformed_fortran_kernel\n",
-      "\n",
-      "subroutine tr_fill(out, a, n)\n",
-      "  implicit none\n",
-      "\n",
-      "  real*8 a, out(n)\n",
-      "  integer n, i\n",
-      "\n",
-      "  do i = 1, n\n",
-      "    out(i) = a\n",
-      "  end do\n",
-      "end\n",
-      "\n",
-      "!$loopy begin\n",
-      "!\n",
-      "! tr_fill, = lp.parse_fortran(SOURCE)\n",
-      "! tr_fill = lp.split_iname(tr_fill, \"i\", split_amount,\n",
-      "!     outer_tag=\"g.0\", inner_tag=\"l.0\")\n",
-      "! RESULT = [tr_fill]\n",
-      "!\n",
-      "!$loopy end"
-     ],
-     "language": "python",
-     "metadata": {},
-     "outputs": [],
-     "prompt_number": 5
-    },
-    {
-     "cell_type": "code",
-     "collapsed": false,
-     "input": [
-      "print(tr_fill)"
-     ],
-     "language": "python",
-     "metadata": {},
-     "outputs": [
-      {
-       "output_type": "stream",
-       "stream": "stdout",
-       "text": [
-        "---------------------------------------------------------------------------\n",
-        "KERNEL: tr_fill\n",
-        "---------------------------------------------------------------------------\n",
-        "ARGUMENTS:\n",
-        "a: ValueArg, type: float64\n",
-        "n: ValueArg, type: int32\n",
-        "out: GlobalArg, type: float64, shape: (n), dim_tags: (N0:stride:1)\n",
-        "---------------------------------------------------------------------------\n",
-        "DOMAINS:\n",
-        "[n] -> { [i_outer, i_inner] : i_inner >= -128i_outer and i_inner <= -1 + n - 128i_outer and i_inner >= 0 and i_inner <= 127 }\n",
-        "---------------------------------------------------------------------------\n",
-        "INAME IMPLEMENTATION TAGS:\n",
-        "i_inner: l.0\n",
-        "i_outer: g.0\n",
-        "---------------------------------------------------------------------------\n",
-        "INSTRUCTIONS:\n",
-        "[i_inner,i_outer]                    out[i_inner + i_outer*128] <- a   # insn0\n",
-        "---------------------------------------------------------------------------\n"
-       ]
-      }
-     ],
-     "prompt_number": 6
-    },
-    {
-     "cell_type": "code",
-     "collapsed": false,
-     "input": [],
-     "language": "python",
-     "metadata": {},
-     "outputs": []
-    }
-   ],
-   "metadata": {}
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## With transform code"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "split_amount = 128"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%transformed_fortran_kernel\n",
+    "\n",
+    "subroutine tr_fill(out, a, n)\n",
+    "  implicit none\n",
+    "\n",
+    "  real*8 a, out(n)\n",
+    "  integer n, i\n",
+    "\n",
+    "  do i = 1, n\n",
+    "    out(i) = a\n",
+    "  end do\n",
+    "end\n",
+    "\n",
+    "!$loopy begin\n",
+    "!\n",
+    "! tr_fill = lp.parse_fortran(SOURCE)\n",
+    "! tr_fill = lp.split_iname(tr_fill, \"i\", split_amount,\n",
+    "!     outer_tag=\"g.0\", inner_tag=\"l.0\")\n",
+    "! RESULT = tr_fill\n",
+    "!\n",
+    "!$loopy end"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(prog)  # noqa: F821"
+   ]
  }
- ]
-}
\ No newline at end of file
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
--- a/examples/fortran/matmul-driver.py
+++ b/examples/fortran/matmul-driver.py
+import numpy as np
+import numpy.linalg as la
+
+import pyopencl as cl
+import pyopencl.array
+import pyopencl.clrandom
+
+import loopy as lp
+
+
+def main():
+    import pathlib
+    fn = pathlib.Path(__file__).parent / "matmul.floopy"
+
+    with open(fn) as inf:
+        source = inf.read()
+
+    dgemm = lp.parse_transformed_fortran(source, filename=fn)
+
+    ctx = cl.create_some_context()
+    queue = cl.CommandQueue(ctx)
+
+    n = 2048
+    a = cl.array.empty(queue, (n, n), dtype=np.float64, order="F")
+    b = cl.array.empty(queue, (n, n), dtype=np.float64, order="F")
+    c = cl.array.zeros(queue, (n, n), dtype=np.float64, order="F")
+    cl.clrandom.fill_rand(a)
+    cl.clrandom.fill_rand(b)
+
+    dgemm = lp.set_options(dgemm, write_code=True)
+
+    dgemm(queue, a=a, b=b, alpha=1, c=c)
+
+    c_ref = (a.get() @ b.get())
+    assert la.norm(c_ref - c.get())/la.norm(c_ref) < 1e-10
+
+
+if __name__ == "__main__":
+    main()
No results found