Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • tasmith4/loopy
  • ben_sepanski/loopy
  • arghdos/loopy
  • inducer/loopy
  • wence-/loopy
  • isuruf/loopy
  • fikl2/loopy
  • xywei/loopy
  • kaushikcfd/loopy
  • zweiner2/loopy
10 results
Show changes
Showing
with 1410 additions and 835 deletions
#! /bin/bash
# Build a self-contained loopy binary inside a CentOS 6 Docker container.
# Should be run in this directory (build-helpers).
#
# Usage: ./make-linux-build-docker.sh [--nodate]
#   --nodate  name the output "loopy-centos6" with no date suffix

if test "$1" = "--nodate"; then
  TGT_NAME=loopy-centos6
else
  TGT_NAME=loopy-centos6-$(date +"%Y-%m-%d")
fi

echo "Generating $TGT_NAME..."

set -e
set -x

docker pull centos:6

# Quote $(pwd) and $CNT so paths containing spaces survive word splitting.
CNT=$(docker create -t -v "$(pwd)":/mnt centos:6 /mnt/make-linux-build-docker-inner.sh)
echo "working in container $CNT"

docker start -i "$CNT"

# The copy is best-effort: the inner build may have failed, and we still
# want to remove the container below.
docker cp "$CNT":/tmp/build/loopy/dist/loopy "$(pwd)" || true

mv loopy "$TGT_NAME"

docker rm "$CNT"
#! /bin/bash
# Build a standalone loopy executable with PyInstaller.
# run this from the loopy root directory

# Start from a clean slate so no stale build products leak in.
rm -Rf dist build

pyinstaller --workpath=build/pyinstaller build-helpers/loopy.spec
#! /bin/bash
# Upload the binary given as $1 to the public loopy-binaries download area.
set -e

scp "$1" tiker.net:public_html/pub/loopy-binaries/
#!/usr/bin/env python
import ctypes
from os import system
# C source for the integer-division helpers exercised by main() below:
#   cdiv / cmod                -- C's native truncating / and % on int64
#   loopy_floor_div_*          -- floor division (rounds toward -inf), any-sign b
#   loopy_floor_div_pos_b_*    -- floor division assuming b > 0
#   loopy_mod_pos_b_*          -- modulo with result in [0, b), assuming b > 0
#   loopy_mod_*                -- modulo whose result takes the sign of b
# The X-macro LOOPY_CALL_WITH_INTEGER_TYPES instantiates each family for
# int8/int16/int32/int64.
C_SRC = """
#include <stdlib.h>
#include <stdint.h>
int64_t cdiv(int64_t a, int64_t b)
{
return a/b;
}
int64_t cmod(int64_t a, int64_t b)
{
return a%b;
}
#define LOOPY_CALL_WITH_INTEGER_TYPES(MACRO_NAME) \
MACRO_NAME(int8, char) \
MACRO_NAME(int16, short) \
MACRO_NAME(int32, int) \
MACRO_NAME(int64, long long)
#define LOOPY_DEFINE_FLOOR_DIV(SUFFIX, TYPE) \
TYPE loopy_floor_div_##SUFFIX(TYPE a, TYPE b) \
{ \
if ((a<0) != (b<0)) \
a = a - (b + (b<0) - (b>=0)); \
return a/b; \
}
LOOPY_CALL_WITH_INTEGER_TYPES(LOOPY_DEFINE_FLOOR_DIV)
#undef LOOPY_DEFINE_FLOOR_DIV
#define LOOPY_DEFINE_FLOOR_DIV_POS_B(SUFFIX, TYPE) \
TYPE loopy_floor_div_pos_b_##SUFFIX(TYPE a, TYPE b) \
{ \
if (a<0) \
a = a - (b-1); \
return a/b; \
}
LOOPY_CALL_WITH_INTEGER_TYPES(LOOPY_DEFINE_FLOOR_DIV_POS_B)
#undef LOOPY_DEFINE_FLOOR_DIV_POS_B
#define LOOPY_DEFINE_MOD_POS_B(SUFFIX, TYPE) \
TYPE loopy_mod_pos_b_##SUFFIX(TYPE a, TYPE b) \
{ \
TYPE result = a%b; \
if (result < 0) \
result += b; \
return result; \
}
LOOPY_CALL_WITH_INTEGER_TYPES(LOOPY_DEFINE_MOD_POS_B)
#undef LOOPY_DEFINE_MOD_POS_B
#define LOOPY_DEFINE_MOD(SUFFIX, TYPE) \
TYPE loopy_mod_##SUFFIX(TYPE a, TYPE b) \
{ \
TYPE result = a%b; \
if (result < 0 && b > 0) \
result += b; \
if (result > 0 && b < 0) \
result = result + b; \
return result; \
}
LOOPY_CALL_WITH_INTEGER_TYPES(LOOPY_DEFINE_MOD)
#undef LOOPY_DEFINE_MOD
"""
def main():
    """Compile the C helper library, then cross-check each C routine against
    Python's floor-division and modulo semantics over a grid of operands.
    """
    with open("int-experiments.c", "w") as outf:
        outf.write(C_SRC)

    # Abort loudly if compilation fails instead of loading a stale .so.
    if system("gcc -Wall -shared int-experiments.c -o int-experiments.so") != 0:
        raise RuntimeError("compiling int-experiments.c failed")

    int_exp = ctypes.CDLL("int-experiments.so")
    for func in [
            int_exp.cdiv,
            int_exp.cmod,
            int_exp.loopy_floor_div_int64,
            int_exp.loopy_floor_div_pos_b_int64,
            int_exp.loopy_mod_pos_b_int64,
            int_exp.loopy_mod_int64,
            ]:
        func.argtypes = [ctypes.c_longlong, ctypes.c_longlong]
        func.restype = ctypes.c_longlong

    m = 50

    def check_against_python(c_func, py_func, b_values):
        # Compare c_func with the reference py_func on a grid of operands.
        # The failing operands go into the assertion message; the original
        # printed them *after* the assert fired, i.e. never.
        for a in range(-m, m):
            for b in b_values:
                if b == 0:
                    continue
                cresult = c_func(a, b)
                presult = py_func(a, b)
                assert cresult == presult, (a, b, cresult, presult)

    check_against_python(
        int_exp.loopy_floor_div_pos_b_int64, lambda a, b: a // b, range(1, m))
    check_against_python(
        int_exp.loopy_floor_div_int64, lambda a, b: a // b, range(-m, m))
    check_against_python(
        int_exp.loopy_mod_pos_b_int64, lambda a, b: a % b, range(1, m))
    check_against_python(
        int_exp.loopy_mod_int64, lambda a, b: a % b, range(-m, m))

    # Demonstrate that C's % differs from Python's for a negative divisor.
    print(int_exp.cmod(23, -11), 23 % -11)


if __name__ == "__main__":
    main()
import numpy as np
# Inspired by a visualization used in the Halide tutorial
# https://www.youtube.com/watch?v=3uiEyEKji0M
def div_ceil(nr, dr):
    """Return ceil(nr/dr) using exact integer arithmetic.

    Equivalent to -(-nr // dr) for any signs of *nr* and nonzero *dr*.
    """
    quotient, remainder = divmod(nr, dr)
    return quotient + 1 if remainder else quotient
def product(iterable):
    """Multiply the items of *iterable* together; returns 1 when empty."""
    result = 1
    for factor in iterable:
        result = result * factor
    return result
class ArrayAccessPatternContext:
    """Simulated OpenCL-style execution grid for visualizing array accesses.

    Holds the group sizes (*gsize*), local sizes (*lsize*), a logical clock
    (*timestamp*) used to fade old accesses, and the list of tracked arrays.
    """

    def __init__(self, gsize, lsize, subgroup_size=32, decay_constant=0.75):
        self.lsize = lsize
        self.gsize = gsize
        self.subgroup_size = subgroup_size
        self.timestamp = 0
        self.decay_constant = decay_constant

        # Broadcast rank: one axis per group dimension, then one per
        # local dimension.
        self.ind_length = len(gsize) + len(lsize)

        self.arrays = []

    def l(self, index):  # noqa: E743
        """Local-id values along local axis *index*, shaped for broadcasting."""
        subscript = [np.newaxis] * self.ind_length
        subscript[len(self.gsize) + index] = slice(None)
        return np.arange(self.lsize[index])[tuple(subscript)]

    def g(self, index):
        """Group-id values along group axis *index*, shaped for broadcasting."""
        subscript = [np.newaxis] * self.ind_length
        subscript[index] = slice(None)
        return np.arange(self.gsize[index])[tuple(subscript)]

    def nsubgroups(self):
        """Number of subgroups needed to cover one workgroup."""
        return div_ceil(product(self.lsize), self.subgroup_size)

    def animate(self, f, interval=200):
        """Animate the access pattern driven by generator-returning *f*.

        Each value yielded by f() advances the clock by one tick and
        redraws every tracked array.
        """
        import matplotlib.animation as animation
        import matplotlib.pyplot as plt

        fig = plt.figure()

        plots = []
        for iplot, ary in enumerate(self.arrays):
            axes = fig.add_subplot(1, len(self.arrays), 1 + iplot)
            axes.set_title(ary.name)
            plots.append(ary.plot(axes))

        def frame_source():
            for _ in f():
                self.tick()
                for ary, plot in zip(self.arrays, plots):
                    plot.set_array(ary.get_plot_data())
                fig.canvas.draw()
                yield plots

        # must be kept alive until after plt.show()
        return animation.FuncAnimation(
            fig, lambda x: x, frame_source,
            blit=False, interval=interval, repeat=True)

    def tick(self):
        """Advance the logical clock by one step."""
        self.timestamp += 1
class Array:
    """Records accesses to a (virtual) array so they can be plotted.

    Each array element stores an attribute tuple
    (timestamp, subgroup, g0, g1, ...) describing its last access.

    NOTE(review): this block was reconstructed from a source whose
    indentation had been stripped; the nesting below is the most plausible
    reading -- confirm against the upstream file.
    """

    def __init__(self, ctx, name, shape, strides, elements_per_row=None):
        assert len(shape) == len(strides)

        # timestamp + subgroup + one slot per group axis
        self.nattributes = 2+len(ctx.gsize)

        if elements_per_row is None:
            if len(shape) > 1:
                # Use the extent of the contiguous (minimum-stride) axis
                # as the plot row length.
                minstride = min(strides)
                for sh_i, st_i in zip(shape, strides):
                    if st_i == minstride:
                        elements_per_row = sh_i
                        break
            else:
                elements_per_row = 256

        self.array = np.zeros((product(shape), self.nattributes,), dtype=np.int32)

        self.ctx = ctx
        self.name = name
        self.shape = shape
        self.strides = strides
        self.elements_per_row = elements_per_row

        ctx.arrays.append(self)

    def __getitem__(self, index):
        """Record an access at *index* (scalars and/or broadcastable arrays)."""
        if not isinstance(index, tuple):
            index = (index,)

        assert len(index) == len(self.shape)

        # (An unused local helper "reshape_ind" existed here in the
        # original; it was never called and has been removed.)

        lin_index = sum(
            ind_i * stride_i
            for ind_i, stride_i in zip(index, self.strides))

        if not isinstance(lin_index, np.ndarray):
            # Scalar index: broadcast it to the full index rank. Index with
            # a tuple (not a list) to avoid numpy's deprecated
            # list-as-index behavior, matching ctx.l()/ctx.g().
            subscript = [np.newaxis] * self.ctx.ind_length
            lin_index = np.array(lin_index)[tuple(subscript)]

        self.array[lin_index, 0] = self.ctx.timestamp
        for i, _glength in enumerate(self.ctx.gsize):
            if lin_index.shape[i] > 1:
                self.array[lin_index, 2+i] = self.ctx.g(i)

        # Linearize the local IDs (last axis fastest) into a flat workitem
        # index, then derive the subgroup number.
        workitem_index = 0
        for i in range(len(self.ctx.lsize))[::-1]:
            workitem_index = (
                workitem_index * self.ctx.lsize[i]
                + self.ctx.l(i))
        subgroup = workitem_index//self.ctx.subgroup_size

        self.array[lin_index, 1] = subgroup

    def __setitem__(self, index, value):
        # Stores are recorded exactly like loads; the value is ignored.
        self.__getitem__(index)

    def get_plot_data(self):
        """Turn the recorded access attributes into an RGB image array."""
        nelements = self.array.shape[0]
        base_shape = (
            div_ceil(nelements, self.elements_per_row),
            self.elements_per_row,)
        shaped_array = np.zeros(
            (*base_shape, self.nattributes),
            dtype=np.float32)
        shaped_array.reshape(-1, self.nattributes)[:nelements] = self.array

        # Fade old accesses exponentially with age.
        modulation = np.exp(
            -self.ctx.decay_constant*(self.ctx.timestamp-shaped_array[:, :, 0]))

        subgroup = shaped_array[:, :, 1]
        if self.ctx.nsubgroups() > 1:
            subgroup = subgroup/(self.ctx.nsubgroups()-1)
        else:
            subgroup.fill(1)

        rgb_array = np.zeros((*base_shape, 3))
        # NOTE(review): both group-axis channels are guarded by
        # len(gsize) > 1; the red (g.0) channel looks like it was meant to
        # use >= 1. Left as in the original -- confirm before changing.
        if len(self.ctx.gsize) > 1:
            # g.0 -> red
            rgb_array[:, :, 0] = shaped_array[:, :, 2]/(self.ctx.gsize[0]-1)
        if len(self.ctx.gsize) > 1:
            # g.1 -> blue
            rgb_array[:, :, 2] = shaped_array[:, :, 3]/(self.ctx.gsize[1]-1)
        # subgroup -> green
        rgb_array[:, :, 1] = subgroup

        return rgb_array*modulation[:, :, np.newaxis]

    def plot(self, ax, **kwargs):
        """Draw the current plot data onto matplotlib axes *ax*."""
        return ax.imshow(
            self.get_plot_data(), interpolation="nearest",
            **kwargs)
def show_example():
    """Animate one of two example kernel access patterns.

    Exactly one of the `if`/`elif` toggles below must be switched to 1 to
    pick a kernel; with both at 0 (as committed) a clear error is raised
    instead of the opaque NameError the original produced.
    """
    n = 2**7
    n16 = div_ceil(n, 16)
    ctx = ArrayAccessPatternContext(gsize=(n16, n16), lsize=(16, 16))
    in0 = Array(ctx, "in0", (n, n), (n, 1))

    if 0:
        # knl a
        i_inner = ctx.l(1)
        i_outer = ctx.g(1)
        k_inner = ctx.l(0)

        def f():
            for k_outer in range(n16):
                in0[i_inner + i_outer*16, k_inner + k_outer*16]
                yield

    elif 0:
        # knl b
        j_inner = ctx.l(0)
        j_outer = ctx.g(0)
        k_inner = ctx.l(1)

        def f():
            for k_outer in range(n16):
                in0[k_inner + k_outer*16, j_inner + j_outer*16]
                yield

    else:
        raise ValueError(
            "no example kernel enabled: flip one of the toggles "
            "('knl a'/'knl b') in show_example to 1")

    ani = ctx.animate(f)

    import matplotlib.pyplot as plt
    if 1:
        plt.show()
    else:
        ani.save("access.mp4")
def show_example_2():
    """Animate a blocked (tiled) row-major traversal of a square array."""
    bsize = 8
    blocks = 3
    ctx = ArrayAccessPatternContext(gsize=(1,), lsize=(1,),
            decay_constant=0.005)
    in0 = Array(ctx, "in0", (blocks*bsize, blocks*bsize), (blocks*bsize, 1))

    def f():
        # Visit the array one bsize x bsize tile at a time, one element
        # (frame) per yield.
        for i_outer in range(blocks):
            for j_outer in range(blocks):
                for i_inner in range(bsize):
                    for j_inner in range(bsize):
                        in0[i_inner + i_outer*bsize, j_inner + j_outer*bsize]
                        yield

    ani = ctx.animate(f, interval=10)

    import matplotlib.pyplot as plt
    if 1:
        plt.show()
    else:
        ani.save("access.mp4")


if __name__ == "__main__":
    show_example_2()
......@@ -3,7 +3,7 @@
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = python ` which sphinx-build`
SPHINXBUILD = python `which sphinx-build`
PAPER =
BUILDDIR = _build
......
# -*- coding: utf-8 -*-
#
# loopy documentation build configuration file, created by
# sphinx-quickstart on Tue Aug 9 13:40:49 2011.
#
# This file is execfile()d with the current directory set to its containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.
import os
from urllib.request import urlopen
#import sys, os
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#sys.path.insert(0, os.path.abspath('.'))
# Shared Sphinx settings maintained in a central repository; fetched and
# exec'd into this module's namespace at docs build time.
# NOTE(review): this downloads and executes remote code -- acceptable for a
# trusted docs build, but worth confirming this is intended.
_conf_url = "https://raw.githubusercontent.com/inducer/sphinxconfig/main/sphinxconfig.py"
with urlopen(_conf_url) as _inf:
exec(compile(_inf.read(), _conf_url, "exec"), globals())
# -- General configuration -----------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#needs_sphinx = '1.0'

# Add any Sphinx extension module names here, as strings. They can be extensions
# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.intersphinx',
    #'sphinx.ext.viewcode',
    'sphinx.ext.doctest',
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# The suffix of source filenames.
source_suffix = '.rst'

# The encoding of source files.
#source_encoding = 'utf-8-sig'

# The master toctree document.
master_doc = 'index'

# General information about the project.
project = 'loopy'
# The merged revisions assigned "copyright" twice with the same value;
# keep a single assignment.
copyright = '2016, Andreas Klöckner'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
# The short X.Y version is read out of loopy's version.py so the docs
# always match the package.
ver_dic = {}
_version_source = "../loopy/version.py"
with open(_version_source) as vpy_file:
    version_py = vpy_file.read()

# Signal version.py (via the environment) that it is being exec'd rather
# than imported, then run it once to populate ver_dic. (The merged
# revisions exec'd the file twice, once without the guard set.)
os.environ["AKPYTHON_EXEC_IMPORT_UNAVAILABLE"] = "1"
exec(compile(version_py, _version_source, "exec"), ver_dic)
version = ".".join(str(x) for x in ver_dic["VERSION"])
# The full version, including alpha/beta/rc tags.
release = ver_dic["VERSION_TEXT"]
del os.environ["AKPYTHON_EXEC_IMPORT_UNAVAILABLE"]
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#language = None

# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
#today = ''
# Else, today_fmt is used as the format for a strftime call.
#today_fmt = '%B %d, %Y'

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# (Previously assigned twice with the same value; consolidated.)
exclude_patterns = ["_build"]

# The reST default role (used for this markup: `text`) to use for all documents.
#default_role = None

# If true, '()' will be appended to :func: etc. cross-reference text.
#add_function_parentheses = True

# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
#add_module_names = True

# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
#show_authors = False

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'

# A list of ignored prefixes for module index sorting.
#modindex_common_prefix = []

# -- Options for HTML output ---------------------------------------------------

html_theme = "alabaster"
# Extra links shown in the alabaster sidebar navigation.
# (The closing brace of the outer dict was missing in the merged source.)
html_theme_options = {
    "extra_nav_links": {
        "🚀 Github": "https://github.com/inducer/loopy",
        "💾 Download Releases": "https://pypi.python.org/pypi/loo.py",
    },
}
# Example configuration for intersphinx: refer to the Python standard library.
# NOTE(review): a second, legacy-format intersphinx_mapping assignment
# appears later in this (diff-merged) file and would clobber this one if
# left in place -- confirm only one survives.
intersphinx_mapping = {
"python": ("https://docs.python.org/3", None),
"numpy": ("https://numpy.org/doc/stable/", None),
"pytools": ("https://documen.tician.de/pytools", None),
"islpy": ("https://documen.tician.de/islpy", None),
"pyopencl": ("https://documen.tician.de/pyopencl", None),
"cgen": ("https://documen.tician.de/cgen", None),
"pymbolic": ("https://documen.tician.de/pymbolic", None),
"constantdict": ("https://matthiasdiener.github.io/constantdict/", None),
}
# Sidebar templates rendered on every page (alabaster defaults).
html_sidebars = {
'**': [
'about.html',
'navigation.html',
'relations.html',
'searchbox.html',
]
}
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#html_theme_options = {}
# Add any paths that contain custom themes here, relative to this directory.
#html_theme_path = []
# The name for this set of Sphinx documents. If None, it defaults to
# "<project> v<release> documentation".
#html_title = None
# A shorter title for the navigation bar. Default is the same as html_title.
#html_short_title = None
# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
#html_logo = None
# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
#html_favicon = None
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
#html_last_updated_fmt = '%b %d, %Y'
# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
#html_use_smartypants = True
# Fail the build on broken cross-references (see nitpick_ignore_regex below
# in the original file for the known exceptions).
nitpicky = True
# Custom sidebar templates, maps document names to template names.
#html_sidebars = {}
# Cross-reference targets known not to resolve; listed so the nitpicky
# build does not fail on them. (In the merged source, unrelated statements
# had been spliced into the middle of this list, leaving it unclosed.)
nitpick_ignore_regex = [
    ["py:class", r"typing_extensions\.(.+)"],
    ["py:class", r"numpy\.u?int[0-9]+"],
    ["py:class", r"numpy\.float[0-9]+"],
    ["py:class", r"numpy\.complex[0-9]+"],
    # Reference not found from "<unknown>"? I'm not even sure where to look.
    ["py:class", r"ExpressionNode"],
    # Type aliases
    ["py:class", r"InameStr"],
    ["py:class", r"ConcreteCallablesTable"],
    ["py:class", r"LoopNestTree"],
    ["py:class", r"LoopTree"],
    ["py:class", r"ToLoopyTypeConvertible"],
    ["py:class", r"ToStackMatchConvertible"],
]

# Additional templates that should be rendered to pages, maps page names to
# template names.
#html_additional_pages = {}

# If false, no module index is generated.
#html_domain_indices = True

# If false, no index is generated.
#html_use_index = True

# If true, the index is split into individual pages for each letter.
#html_split_index = False

# If true, links to the reST sources are added to the pages.
html_show_sourcelink = False

# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
#html_show_sphinx = True

# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
#html_show_copyright = True

# If true, an OpenSearch description file will be output, and all pages will
# contain a <link> tag referring to it. The value of this option must be the
# base URL from which the finished HTML is served.
#html_use_opensearch = ''

# This is the file name suffix for HTML files (e.g. ".xhtml").
#html_file_suffix = None

# Output file base name for HTML help builder.
htmlhelp_basename = 'loopydoc'

# -- Options for LaTeX output --------------------------------------------------

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title, author, documentclass [howto/manual]).
latex_documents = [
    ('index', 'loopy.tex', 'loopy Documentation',
     'Andreas Kloeckner', 'manual'),
]

# -- Options for manual page output --------------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
    ('index', 'loopy', 'loopy Documentation',
     ['Andreas Kloeckner'], 1)
]

# NOTE(review): a legacy URL-keyed intersphinx_mapping assignment appeared
# here in the old revision; it is omitted so it does not clobber the modern
# mapping defined earlier in this file.

autoclass_content = "class"
......@@ -18,43 +18,20 @@ When you run this script, the following kernel is generated, compiled, and execu
(See the full example for how to print the generated code.)
Want to try out loopy?
----------------------
There's no need to go through :ref:`installation` if you'd just like to get a
feel for what loopy is. Instead, you may
`download a self-contained Linux binary <https://gitlab.tiker.net/inducer/loopy/builds/1989/artifacts/browse/build-helpers/>`_.
This is purposefully built on an ancient Linux distribution, so it should work
on most versions of Linux that are currently out there.
Once you have the binary, do the following::
chmod +x ./loopy-centos6
./loopy-centos6 --target=opencl hello-loopy-lp.py
./loopy-centos6 --target=cuda hello-loopy-lp.py
./loopy-centos6 --target=ispc hello-loopy-lp.py
Grab the example here: :download:`examples/python/hello-loopy.py <../examples/python/hello-loopy-lp.py>`.
You may also download the most recent version by going to the `list of builds
<https://gitlab.tiker.net/inducer/loopy/builds>`_, clicking on the newest one
of type "CentOS binary", clicking on "Browse" under "Build Artifacts", then
navigating to "build-helpers", and downloading the binary from there.
.. _static-binary:
Places on the web related to Loopy
----------------------------------
* `Python package index <http://pypi.python.org/pypi/loo.py>`_ (download releases) Note the extra '.' in the PyPI identifier!
* `Github <http://github.com/inducer/loopy>`_ (get latest source code, file bugs)
* `Wiki <http://wiki.tiker.net/Loopy>`_ (read installation tips, get examples, read FAQ)
* `Homepage <http://mathema.tician.de/software/loopy>`_
* `Python package index <https://pypi.org/project/loopy>`_ (download releases)
* `Github <https://github.com/inducer/loopy>`_ (get latest source code, file bugs)
* `Homepage <https://mathema.tician.de/software/loopy>`_
Table of Contents
-----------------
If you're only just learning about loopy, consider the following `paper
<http://arxiv.org/abs/1405.7470>`_ on loo.py that may serve as a good
<https://arxiv.org/abs/1405.7470>`_ on loopy that may serve as a good
introduction.
Please check :ref:`installation` to get started.
......@@ -65,9 +42,14 @@ Please check :ref:`installation` to get started.
tutorial
ref_creation
ref_kernel
ref_translation_unit
ref_transform
ref_call
ref_other
misc
ref_internals
🚀 Github <https://github.com/inducer/loopy>
💾 Download Releases <https://pypi.org/project/loopy>
Indices and tables
==================
......
......@@ -3,11 +3,21 @@
Installation
============
This command should install :mod:`loopy`::
Option 0: Static Binary
-----------------------
If you would just like to experiment with :mod:`loopy`'s code transformation
abilities, the easiest way to get loopy is to download a statically-linked
Linux binary.
pip install loo.py
See :ref:`static-binary` for details.
(Note the extra "."!)
Option 1: From Source, no PyOpenCL integration
-----------------------------------------------
This command should install :mod:`loopy`::
pip install loopy
You may need to run this with :command:`sudo`.
If you don't already have `pip <https://pypi.python.org/pypi/pip>`_,
......@@ -17,19 +27,66 @@ run this beforehand::
python get-pip.py
For a more manual installation, `download the source
<http://pypi.python.org/pypi/loo.py>`_, unpack it, and say::
<https://pypi.org/project/loopy>`_, unpack it, and say::
python setup.py install
You may also clone its git repository::
git clone --recursive git://github.com/inducer/loopy
git clone --recursive http://git.tiker.net/trees/loopy.git
git clone --recursive https://github.com/inducer/loopy.git
Option 2: From Conda Forge, with PyOpenCL integration
-----------------------------------------------------
This set of instructions is intended for computers running
64-bit Linux or macOS:
#. Make sure your system has the basics to build software.
On Debian derivatives (Ubuntu and many more),
installing ``build-essential`` should do the trick.
Everywhere else, just making sure you have the ``g++`` package should be
enough.
#. Install `miniforge <https://github.com/conda-forge/miniforge>`_.
#. ``export CONDA=/WHERE/YOU/INSTALLED/miniforge3``
If you accepted the default location, this should work:
``export CONDA=$HOME/miniforge3``
#. ``$CONDA/bin/conda create -n dev``
#. ``source $CONDA/bin/activate dev``
#. ``conda install git pip pocl islpy pyopencl`` (Linux)
or
``conda install osx-pocl-opencl git pip pocl islpy pyopencl`` (OS X)
#. Type the following command::
pip install git+https://github.com/inducer/loopy
Next time you want to use :mod:`loopy`, just run the following command::
source /WHERE/YOU/INSTALLED/miniforge3/bin/activate dev
You may also like to add this to a startup file (like :file:`$HOME/.bashrc`) or create an alias for it.
See the `PyOpenCL installation instructions
<https://documen.tician.de/pyopencl/misc.html#installation>`_ for options
regarding OpenCL drivers.
User-visible Changes
====================
Version 2016.2
See also :ref:`language-versioning`.
Version 2018.1
--------------
.. note::
......@@ -57,7 +114,7 @@ Licensing
Loopy is licensed to you under the MIT/X Consortium license:
Copyright (c) 2009-13 Andreas Klöckner and Contributors.
Copyright (c) 2009-17 Andreas Klöckner and Contributors.
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
......@@ -101,7 +158,7 @@ In the meantime, you can generate code simply by saying::
print(cg_result.host_code())
print(cg_result.device_code())
Additionally, for C-based languages, header defintions are available via::
Additionally, for C-based languages, header definitions are available via::
loopy.generate_header(knl)
......@@ -131,7 +188,7 @@ source of examples. Here are some links:
Here's a more complicated example of a loopy code:
.. literalinclude:: ../examples/python/find-centers.py
:language: c
:language: python
This example is included in the :mod:`loopy` distribution as
:download:`examples/python/find-centers.py <../examples/python/find-centers.py>`.
......@@ -193,7 +250,7 @@ This list is always growing, but here are a few pointers:
* Precompute subexpressions:
Use a :ref:`substitution-rule` to assign a name to a subexpression,
using may be :func:`loopy.assignment_to_subst` or :func:`extract_subst`.
using, for example, :func:`loopy.assignment_to_subst` or :func:`loopy.extract_subst`.
Then use :func:`loopy.precompute` to create an (array or scalar)
temporary with precomputed values.
......@@ -232,12 +289,12 @@ This list is always growing, but here are a few pointers:
Use :func:`loopy.tag_inames` with the ``"vec"`` iname tag.
Note that the corresponding axis of an array must
also be tagged using the ``"vec"`` array axis tag
(using :func:`tag_array_axes`) in order for vector code to be
(using :func:`loopy.tag_array_axes`) in order for vector code to be
generated.
Vectorized loops (and array axes) must have a fixed size. (See either
:func:`split_iname` or :func:`fix_parameters` along with
:func:`split_array_axis`.)
:func:`loopy.split_iname` or :func:`loopy.fix_parameters` along with
:func:`loopy.split_array_axis`.)
* Reuse of Temporary Storage
......@@ -246,7 +303,7 @@ This list is always growing, but here are a few pointers:
* SoA $\leftrightarrow$ AoS
Use :func:`tag_array_axes` with the ``"sep"`` array axis tag
Use :func:`loopy.tag_array_axes` with the ``"sep"`` array axis tag
to generate separate arrays for each entry of a short, fixed-length
array axis.
......@@ -257,7 +314,7 @@ This list is always growing, but here are a few pointers:
Use :func:`loopy.tag_inames` with the ``"ilp"`` tag.
ILP loops must have a fixed size. (See either
:func:`split_iname` or :func:`fix_parameters`.)
:func:`loopy.split_iname` or :func:`loopy.fix_parameters`.)
* Type inference
......@@ -275,12 +332,69 @@ This list is always growing, but here are a few pointers:
* Interface with your own library functions
Use :func:`loopy.register_function_manglers`.
See :ref:`func-interface` for details.
* Loop collapse
Use :func:`loopy.join_inames`.
In what sense does Loopy support vectorization?
-----------------------------------------------
There are really two ways in which the OpenCL/CUDA model of computation exposes
vectorization:
* "SIMT": The user writes scalar program instances and either the compiler or
the hardware joins the individual program instances into vectors of a
hardware-given length for execution.
* "Short vectors": This type of vectorization is based on vector types,
e.g. ``float4``, which support arithmetic with implicit vector semantics
as well as a number of 'intrinsic' functions.
Loopy supports both. The first one, SIMT, is accessible by tagging inames with,
e.g., ``l.0``. Accessing the second one requires using both execution- and
data-reshaping capabilities in loopy. To start with, you need an array that
has an axis with the length of the desired vector. If that's not yet available,
you may use :func:`loopy.split_array_axis` to produce one. Similarly, you need
an iname whose bounds match those of the desired vector length. Again, if you
don't already have one, :func:`loopy.split_iname` will easily produce one.
Lastly, both the array axis and the iname need the implementation tag ``"vec"``.
Here is an example of this machinery in action:
.. literalinclude:: ../examples/python/vector-types.py
:language: python
Note how the example slices off the last 'slab' of iterations to ensure that
the bulk of the iteration does not require conditionals which would prevent
successful vectorization. This generates the following code:
.. literalinclude:: ../examples/python/vector-types.cl
:language: c
What is the story with language versioning?
-------------------------------------------
The idea is to keep supporting multiple versions at a time. There's a
tension in loopy between the need to build code that keeps working
unchanged for some number of years, and needing the language to
evolve--not just as a research vehicle, but also to enable to respond
to emerging needs in applications and hardware.
The idea is not to support all versions indefinitely, merely to allow
users to upgrade on their own schedule on the scale of a couple years.
Warnings about needing to upgrade would get noisier as a version nears
deprecation. In a way, it is intended to be a version of Python's
`__future__` flags, which IMO have served the language tremendously
well.
One can also obtain the current language version programmatically:
:data:`loopy.MOST_RECENT_LANGUAGE_VERSION`.
But pinning your code to that would mean choosing to not use the
potentially valuable guarantee to keep existing code working unchanged
for a while. Instead, it might be wiser to just grab the version of the
language current at the time of writing the code.
Uh-oh. I got a scheduling error. Any hints?
-------------------------------------------
......@@ -325,7 +439,7 @@ If you use loopy for your work and find its approach helpful, please
consider citing the following article.
A. Klöckner. `Loo.py: transformation-based code generation for GPUs and
CPUs <http://arxiv.org/abs/1405.7470>`_. Proceedings of ARRAY '14: ACM
CPUs <https://arxiv.org/abs/1405.7470>`_. Proceedings of ARRAY '14: ACM
SIGPLAN Workshop on Libraries, Languages, and Compilers for Array
Programming. Edinburgh, Scotland.
......@@ -342,14 +456,33 @@ Here's a Bibtex entry for your convenience::
doi = "{10.1145/2627373.2627387}",
}
Getting help
============
Email the friendly folks on the `loopy mailing list <https://lists.tiker.net/listinfo/loopy>`_.
Acknowledgments
===============
Andreas Klöckner's work on :mod:`loopy` was supported in part by
Work on loopy was supported in part by
- the Department of Energy, National Nuclear Security Administration, under Award Number DE-NA0003963,
- the US Navy ONR, under grant number N00014-14-1-0117, and
- the US National Science Foundation under grant numbers DMS-1418961, CCF-1524433, DMS-1654756, SHF-1911019, and OAC-1931577.
AK also gratefully acknowledges a hardware gift from Nvidia Corporation.
The views and opinions expressed herein do not necessarily reflect those of the funding agencies.
Cross-References to Other Documentation
=======================================
.. currentmodule:: numpy
.. class:: int16
See :class:`numpy.generic`.
* US Navy ONR grant number N00014-14-1-0117
* the US National Science Foundation under grant numbers DMS-1418961 and CCF-1524433.
.. class:: complex128
AK also gratefully acknowledges a hardware gift from Nvidia Corporation. The
views and opinions expressed herein do not necessarily reflect those of the
funding agencies.
See :class:`numpy.generic`.
.. currentmodule:: loopy
.. _func-interface:
Function Interface
==================
Resolving and specialization
----------------------------
In :mod:`loopy`, a :class:`loopy.TranslationUnit` is a collection of callables
and entrypoints. Callables are of type
:class:`loopy.kernel.function_interface.InKernelCallable`. Functions start life
as simple :class:`pymbolic.primitives.Call` nodes. Call resolution turns the function
identifiers in those calls into :class:`~loopy.symbolic.ResolvedFunction` objects.
Each resolved function has an entry in :attr:`TranslationUnit.callables_table`.
The process of realizing a function as a
:class:`~loopy.kernel.function_interface.InKernelCallable` is referred to as
resolving.
During code generation for a :class:`~loopy.TranslationUnit`, a (resolved) callable
is *specialized* depending on the types and shapes of the arguments passed at a
call site. For example, a call to ``sin(x)`` in :mod:`loopy` is type-generic to
begin with, but it later specialized to either ``sinf``, ``sin`` or ``sinl``
depending on the type of its argument ``x``. A callable's behavior during type
or shape specialization is encoded via
:meth:`~loopy.InKernelCallable.with_types` and
:meth:`~loopy.InKernelCallable.with_descrs`.
Registering callables
---------------------
A user can *register* callables within a :class:`~loopy.TranslationUnit` to
allow loopy to resolve calls not pre-defined in :mod:`loopy`. In :mod:`loopy`,
we typically aim to expose all the standard math functions defined for
a :class:`~loopy.target.TargetBase`. Other foreign functions could be invoked by
*registering* them.
An example demonstrating registering a ``CBlasGemv`` as a loopy callable:
.. literalinclude:: ../examples/python/call-external.py
Call Instruction for a kernel call
----------------------------------
At a call-site involving a call to a :class:`loopy.LoopKernel`, the arguments to
the call must be ordered by the order of input arguments of the callee kernel.
Similarly, the assignees must be ordered by the order of callee kernel's output
arguments. Since a :class:`~loopy.kernel.data.KernelArgument` can be both an
input and an output, such arguments would be a part of the call instruction's
assignees as well as the call expression node's parameters.
Entry points
------------
Only callables in :attr:`loopy.TranslationUnit.entrypoints` can be called from
the outside. All other callables are only visible from within the translation
unit, similar to C's ``static`` functions.
Reference
---------
.. automodule:: loopy.kernel.function_interface
.. module:: loopy
.. moduleauthor:: Andreas Kloeckner <inform@tiker.net>
.. currentmodule:: loopy
.. _creating-kernels:
Reference: Creating Kernels
......@@ -30,4 +28,11 @@ To Copy between Data Formats
.. autofunction:: make_copy_kernel
Einstein summation convention kernels
-------------------------------------
.. autofunction:: make_einsum
.. automodule:: loopy.version
.. vim: tw=75:spell:fdm=marker
Reference: Documentation for Internal API
=========================================
Targets
-------
See also :ref:`targets`.
.. automodule:: loopy.target.c
Symbolic
--------
See also :ref:`expression-syntax`.
.. automodule:: loopy.symbolic
Types
-----
DTypes of variables in a :class:`loopy.LoopKernel` must be picklable, so in
the codegen pipeline user-provided types are converted to
:class:`loopy.types.LoopyType`.
.. automodule:: loopy.types
Type inference
^^^^^^^^^^^^^^
.. automodule:: loopy.type_inference
Codegen
-------
.. automodule:: loopy.codegen
Reduction Operation
-------------------
.. automodule:: loopy.library.reduction
Iname Tags
----------
.. automodule:: loopy.kernel.data
Array
-----
.. automodule:: loopy.kernel.array
Checks
------
.. automodule:: loopy.check
Schedule
--------
.. automodule:: loopy.schedule
.. automodule:: loopy.schedule.tools
.. automodule:: loopy.schedule.tree
......@@ -3,6 +3,72 @@
Reference: Loopy's Model of a Kernel
====================================
What Types of Computation can a Loopy Program Express?
------------------------------------------------------
Loopy programs consist of an a-priori unordered set of statements, operating
on :math:`n`-dimensional array variables.
Arrays consist of "plain old data" and structures thereof, as describable
by a :class:`numpy.dtype`. The n-dimensional shape of these arrays is
given by a tuple of expressions at most affine in parameters that are
fixed for the duration of program execution.
Each array variable in the program is either an argument or a temporary
variable. A temporary variable is only live within the program, while
argument variables are accessible outside the program and constitute the
program's inputs and outputs.
A statement (still called 'instruction' in some places, cf.
:class:`loopy.InstructionBase`) encodes an assignment to an entry of an array.
The right-hand side of an assignment consists of an expression that may
consist of arithmetic operations and calls to functions.
If the outermost operation of the RHS expression is a function call,
the RHS value may be a tuple, and multiple (still scalar) arrays appear
as LHS values. (This is the only sense in which tuple types are supported.)
Each statement is parameterized by zero or more loop variables ("inames").
A statement is executed once for each integer point defined by the domain
forest for the iname tuple given for that statement
(:attr:`loopy.InstructionBase.within_inames`). Each execution of a
statement (with specific values of the inames) is called a *statement
instance*. Dependencies between these instances as well as instances of
other statements are encoded in the program representation and specify permissible
execution orderings. (The semantics of the dependencies are `being
sharpened <https://github.com/inducer/loopy/pull/168>`__.) Assignments
(comprising the evaluation of the RHS and the assignment to the LHS) may
be specified to be atomic.
The basic building blocks of the domain forest are sets given as
conjunctions of equalities and inequalities of quasi-affine expressions on
integer tuples, called domains, and represented as instances of
:class:`islpy.BasicSet`. The entries of each integer tuple are
either *parameters* or *inames*. Each domain may optionally have a *parent
domain*. Parameters of parent-less domains are given by value arguments
supplied to the program that will remain unchanged during program
execution. Parameters of domains with parents may be
- run-time-constant value arguments to the program, or
- inames from parent domains, or
- scalar, integer temporary variables that are written by statements
with iteration domains controlled by a parent domain.
For each tuple of concrete parameter values, the set of iname tuples must be
finite. Each iname is defined by exactly one domain.
For a tuple of inames, the domain forest defines an iteration domain
by finding all the domains defining the inames involved, along with their
parent domains. The resulting tree of domains may contain multiple roots,
but no branches. The iteration domain is then constructed by intersecting
these domains and constructing the projection of that set onto the space
given by the required iname tuple. Observe that, via the parent-child
domain mechanism, imperfectly-nested and data-dependent loops become
expressible.
The set of functions callable from the language is predefined by the system.
Additional functions may be defined by the user by registering them. It is
not currently possible to define functions from within Loopy, however work
is progressing on permitting this. Even once this is allowed, recursion
will not be permitted.
.. _domain-tree:
Loop Domain Forest
......@@ -130,9 +196,12 @@ Iname Implementation Tags
Tag Meaning
=============================== ====================================================
``None`` | ``"for"`` Sequential loop
``"ord"`` Forced-order sequential loop
``"l.N"`` Local (intra-group) axis N ("local")
``"g.N"`` Group-number axis N ("group")
``"unr"`` Unroll
``"unr_hint"`` Unroll using compiler directives
``"unr_hint.N"`` Unroll at most N times using compiler directives
``"ilp"`` | ``"ilp.unr"`` Unroll using instruction-level parallelism
``"ilp.seq"`` Realize parallel iname as innermost loop
``"like.INAME"`` Can be used when tagging inames to tag like another
......@@ -150,6 +219,42 @@ Tag Meaning
.. }}}
Identifiers
-----------
Reserved Identifiers
^^^^^^^^^^^^^^^^^^^^
The identifier prefix ``_lp_`` is reserved for internal usage; when creating
*inames*, *argument names*, *temporary variable names*, *substitution rule
names*, *instruction IDs*, and other identifiers, users should *not* use names
beginning with ``_lp_``. This prefix is used for identifiers created
internally when operating on Loopy's kernel IR. For Loopy developers, further
information on name prefixes used within submodules is below.
Identifier Registry
^^^^^^^^^^^^^^^^^^^
Functionality in :mod:`loopy` *must* use identifiers beginning with ``_lp_`` for
all internally-created identifiers. Additionally, each name beginning with
``_lp_`` must start with one of the reserved prefixes below. New prefixes may
be registered by adding them to the table below. New prefixes may not themselves
be the prefix of an existing prefix.
**Reserved Identifier Prefixes**
======================= ==================================
Reserved Prefix Usage (module or purpose)
======================= ==================================
``_lp_linchk_`` ``loopy.linearization.checker``
======================= ==================================
.. note::
Existing Loopy code may not yet fully satisfy these naming requirements.
Name changes are in progress, and prefixes will be added to this registry
as they are created.
.. _instructions:
Instructions
......@@ -157,6 +262,7 @@ Instructions
.. {{{
.. autoclass:: HappensAfter
.. autoclass:: InstructionBase
.. _assignments:
......@@ -326,15 +432,30 @@ Expressions
Loopy's expressions are a slight superset of the expressions supported by
:mod:`pymbolic`.
* ``if``
* ``elif`` (following an ``if``)
* ``else`` (following an ``if`` / ``elif``)
* ``if(cond, then, else_)``
* ``a[[ 8*i + j ]]``: Linear subscripts.
See :class:`loopy.symbolic.LinearSubscript`.
* ``reductions``
* duplication of reduction inames
See :class:`loopy.symbolic.Reduction`.
* ``reduce`` vs ``simul_reduce``
* complex-valued arithmetic
* tagging of array access and substitution rule use ("$")
See :class:`loopy.symbolic.TaggedVariable`.
* ``indexof``, ``indexof_vec``
* ``cast(type, value)``: No parse syntax currently.
See :class:`loopy.symbolic.TypeCast`.
* If constants in expressions are subclasses of :class:`numpy.generic`,
generated code will contain literals of exactly that type, making them
*explicitly typed*. Constants given as Python types such as :class:`int`,
:class:`float` or :class:`complex` are called *implicitly* typed and
adapt to the type of the expected result.
TODO: Functions
TODO: Reductions
......@@ -342,6 +463,7 @@ TODO: Reductions
Function Call Instructions
^^^^^^^^^^^^^^^^^^^^^^^^^^
.. automodule:: loopy
.. autoclass:: CallInstruction
C Block Instructions
......@@ -352,12 +474,14 @@ C Block Instructions
Atomic Operations
^^^^^^^^^^^^^^^^^
.. autoclass:: memory_ordering
.. autoclass:: MemoryOrdering
.. autoclass:: memory_scope
.. autoclass:: MemoryScope
.. autoclass:: VarAtomicity
.. autoclass:: OrderedAtomic
.. autoclass:: AtomicInit
.. autoclass:: AtomicUpdate
......@@ -372,6 +496,12 @@ Barrier Instructions
.. autoclass:: BarrierInstruction
Instruction Tags
^^^^^^^^^^^^^^^^
.. autoclass:: LegacyStringInstructionTag
.. autoclass:: UseStreamingStoreTag
.. }}}
Data: Arguments and Temporaries
......@@ -388,24 +518,14 @@ Arguments
^^^^^^^^^
.. autoclass:: KernelArgument
:members:
:undoc-members:
.. autoclass:: ValueArg
:members:
:undoc-members:
.. autoclass:: GlobalArg
:members:
:undoc-members:
.. autoclass:: ArrayArg
.. autoclass:: ConstantArg
:members:
:undoc-members:
.. autoclass:: ImageArg
:members:
:undoc-members:
.. _temporaries:
......@@ -415,11 +535,9 @@ Temporary Variables
Temporary variables model OpenCL's ``private`` and ``local`` address spaces. Both
have the lifetime of a kernel invocation.
.. autoclass:: temp_var_scope
.. autoclass:: AddressSpace
.. autoclass:: TemporaryVariable
:members:
:undoc-members:
.. _types:
......@@ -538,10 +656,10 @@ Helper values
.. {{{
.. autoclass:: auto
.. autoclass:: UniqueName
.. autoclass:: Optional
.. }}}
Libraries: Extending and Interfacing with External Functionality
......@@ -573,10 +691,18 @@ The Kernel Object
Do not create :class:`LoopKernel` objects directly. Instead, refer to
:ref:`creating-kernels`.
.. autoclass:: LoopKernel
.. automodule:: loopy.kernel
Implementation Details: The Base Array
--------------------------------------
All array-like data in :mod:`loopy` (such as :class:`ArrayArg` and
:class:`TemporaryVariable`) derive from single, shared base array type,
described next.
.. currentmodule:: loopy.kernel.array
.. autoclass:: ArrayBase
.. autoclass:: kernel_state
:members:
:undoc-members:
.. vim: tw=75:spell:fdm=marker
Reference: Other Functionality
==============================
Auxiliary Data Types
--------------------
.. automodule:: loopy.typing
Obtaining Kernel Performance Statistics
---------------------------------------
......@@ -9,6 +14,24 @@ Obtaining Kernel Performance Statistics
Controlling caching
-------------------
.. envvar:: LOOPY_NO_CACHE
.. envvar:: CG_NO_CACHE
By default, loopy will cache (on disk) the result of various stages
of code generation to speed up future code generation of the same kernel.
By setting the environment variables :envvar:`LOOPY_NO_CACHE` or
:envvar:`CG_NO_CACHE` to any
string that :func:`pytools.strtobool` evaluates as ``True``, this caching
is suppressed.
.. envvar:: LOOPY_ABORT_ON_CACHE_MISS
If set to a string that :func:`pytools.strtobool` evaluates as ``True``,
loopy will raise an exception if a cache miss occurs. This can be useful
for debugging cache-related issues. For example, it can be used to automatically test whether caching is successful for a particular code, by setting this variable to ``True`` and re-running the code.
.. autofunction:: set_caching_enabled
.. autoclass:: CacheMode
......@@ -16,10 +39,11 @@ Controlling caching
Running Kernels
---------------
In addition to simply calling kernels using :class:`LoopKernel.__call__`,
the following underlying functionality may be used:
Use :class:`TranslationUnit.executor` to bind a translation unit
to execution resources, and then use :class:`ExecutorBase.__call__`
to invoke the kernel.
.. autoclass:: CompiledKernel
.. autoclass:: ExecutorBase
Automatic Testing
-----------------
......@@ -44,3 +68,4 @@ following always works::
.. autofunction:: show_dependency_graph
.. autofunction:: t_unit_to_python
......@@ -50,6 +50,10 @@ Influencing data access
.. autofunction:: set_array_axis_names
.. automodule:: loopy.transform.privatize
.. autofunction:: allocate_temporaries_for_base_storage
Padding Data
------------
......@@ -74,6 +78,8 @@ Manipulating Instructions
.. autofunction:: add_nosync
.. autofunction:: add_barrier
Registering Library Routines
----------------------------
......@@ -83,8 +89,6 @@ Registering Library Routines
.. autofunction:: register_symbol_manglers
.. autofunction:: register_function_manglers
Modifying Arguments
-------------------
......@@ -98,7 +102,7 @@ Modifying Arguments
.. autofunction:: rename_argument
.. autofunction:: set_temporary_scope
.. autofunction:: set_temporary_address_space
Creating Batches of Operations
------------------------------
......@@ -114,7 +118,7 @@ Finishing up
.. autofunction:: generate_loop_schedules
.. autofunction:: get_one_scheduled_kernel
.. autofunction:: get_one_linearized_kernel
.. autofunction:: save_and_reload_temporaries
......@@ -140,4 +144,3 @@ TODO: Matching instruction tags
.. automodule:: loopy.match
.. vim: tw=75:spell
.. currentmodule:: loopy
Translation Units
=================
.. automodule:: loopy.translation_unit
......@@ -25,9 +25,12 @@ import a few modules and set up a :class:`pyopencl.Context` and a
>>> import loopy as lp
>>> lp.set_caching_enabled(False)
>>> from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2
>>> from warnings import filterwarnings, catch_warnings
>>> filterwarnings('error', category=lp.LoopyWarning)
>>> from loopy.diagnostic import DirectCallUncachedWarning
>>> filterwarnings('ignore', category=DirectCallUncachedWarning)
>>> ctx = cl.create_some_context(interactive=False)
>>> queue = cl.CommandQueue(ctx)
......@@ -111,18 +114,20 @@ always see loopy's view of a kernel by printing it.
KERNEL: loopy_kernel
---------------------------------------------------------------------------
ARGUMENTS:
a: GlobalArg, type: <runtime>, shape: (n), dim_tags: (N0:stride:1)
n: ValueArg, type: <runtime>
out: GlobalArg, type: <runtime>, shape: (n), dim_tags: (N0:stride:1)
a: type: <auto/runtime>, shape: (n), dim_tags: (N0:stride:1) in aspace: global
n: ValueArg, type: <auto/runtime>
out: type: <auto/runtime>, shape: (n), dim_tags: (N0:stride:1) out aspace: global
---------------------------------------------------------------------------
DOMAINS:
[n] -> { [i] : 0 <= i < n }
---------------------------------------------------------------------------
INAME IMPLEMENTATION TAGS:
INAME TAGS:
i: None
---------------------------------------------------------------------------
INSTRUCTIONS:
[i] out[i] <- 2*a[i] # insn
for i
out[i] = 2*a[i] {id=insn}
end i
---------------------------------------------------------------------------
You'll likely have noticed that there's quite a bit more information here
......@@ -152,7 +157,7 @@ following:
See :ref:`specifying-arguments`.
* Loopy has not determined the type of ``a`` and ``out``. The data type is
given as ``<runtime>``, which means that these types will be determined
given as ``<auto/runtime>``, which means that these types will be determined
by the data passed in when the kernel is invoked. Loopy generates (and
caches!) a copy of the kernel for each combination of types passed in.
......@@ -176,11 +181,11 @@ for good measure.
>>> assert (out.get() == (2*x_vec_dev).get()).all()
We can have loopy print the OpenCL kernel it generated
by passing :attr:`loopy.Options.write_cl`.
by passing :attr:`loopy.Options.write_code`.
.. doctest::
>>> knl = lp.set_options(knl, "write_cl")
>>> knl = lp.set_options(knl, write_code=True)
>>> evt, (out,) = knl(queue, a=x_vec_dev)
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))
......@@ -205,6 +210,7 @@ For convenience, loopy kernels also directly accept :mod:`numpy` arrays:
.. doctest::
>>> knl = lp.set_options(knl, write_code=False)
>>> evt, (out,) = knl(queue, a=x_vec_host)
>>> assert (out == (2*x_vec_host)).all()
......@@ -224,37 +230,46 @@ inspect that code, too, using :attr:`loopy.Options.write_wrapper`:
.. doctest::
>>> knl = lp.set_options(knl, write_wrapper=True, write_cl=False)
>>> knl = lp.set_options(knl, write_wrapper=True, write_code=False)
>>> evt, (out,) = knl(queue, a=x_vec_host)
from __future__ import division
import numpy as _lpy_np
...
def invoke_loopy_kernel_loopy_kernel(_lpy_cl_kernels, queue, allocator=None, wait_for=None, out_host=None, a=None, n=None, out=None):
if allocator is None:
allocator = _lpy_cl_tools.DeferredAllocator(queue.context)
<BLANKLINE>
# {{{ find integer arguments from shapes
# {{{ find integer arguments from array data
<BLANKLINE>
if n is None:
if a is not None:
n = a.shape[0]
<BLANKLINE>
elif out is not None:
n = out.shape[0]
<BLANKLINE>
# }}}
...
You can also pass options to the OpenCL implementation
by passing :attr:`loopy.Options.build_options`.
.. doctest::
>>> knl = lp.set_options(knl, build_options=["-cl-mad-enable"])
Generating code
~~~~~~~~~~~~~~~
Instead of using loopy to run the code it generates, you can also just use
loopy as a code generator and take care of executing the generated kernels
yourself. In this case, make sure loopy knows about all types, and then
call :func:`loopy.generate_code`:
call :func:`loopy.generate_code_v2`:
.. doctest::
>>> typed_knl = lp.add_dtypes(knl, dict(a=np.float32))
>>> code, _ = lp.generate_code(typed_knl)
>>> code = lp.generate_code_v2(typed_knl).device_code()
>>> print(code)
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))
......@@ -295,7 +310,8 @@ argument:
... """
... out[j,i] = a[i,j]
... out[i,j] = 2*out[i,j]
... """)
... """,
... [lp.GlobalArg("out", shape=lp.auto, is_input=False), ...])
loopy's programming model is completely *unordered* by default. This means
that:
......@@ -322,7 +338,9 @@ an explicit dependency:
... """
... out[j,i] = a[i,j] {id=transpose}
... out[i,j] = 2*out[i,j] {dep=transpose}
... """)
... """,
... [lp.GlobalArg("out", shape=lp.auto, is_input=False), ...],
... name="transpose_and_dbl")
``{id=transpose}`` assigns the identifier *transpose* to the first
instruction, and ``{dep=transpose}`` declares a dependency of the second
......@@ -331,9 +349,9 @@ that these dependencies show up there, too:
.. doctest::
>>> print(knl.stringify(with_dependencies=True))
>>> print(knl["transpose_and_dbl"].stringify(with_dependencies=True))
---------------------------------------------------------------------------
KERNEL: loopy_kernel
KERNEL: transpose_and_dbl
---------------------------------------------------------------------------
...
---------------------------------------------------------------------------
......@@ -352,7 +370,7 @@ loopy can also show an instruction dependency graph, using
Dependencies are shown as arrows from prerequisite to dependent in the
graph. This functionality requires the open-source `graphviz
<http://graphviz.org>`_ graph drawing tools to be installed. The generated
<https://graphviz.org>`_ graph drawing tools to be installed. The generated
graph will open in a browser window.
Since manually notating lots of dependencies is cumbersome, loopy has
......@@ -377,13 +395,13 @@ Let us take a look at the generated code for the above kernel:
.. doctest::
>>> knl = lp.set_options(knl, "write_cl")
>>> knl = lp.set_options(knl, write_code=True)
>>> knl = lp.prioritize_loops(knl, "i,j")
>>> evt, (out,) = knl(queue, a=a_mat_dev)
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))
<BLANKLINE>
__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out)
__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) transpose_and_dbl(__global float *__restrict__ out, __global float const *__restrict__ a, int const n)
{
for (int i = 0; i <= -1 + n; ++i)
for (int j = 0; j <= -1 + n; ++j)
......@@ -419,20 +437,22 @@ with identical bounds, for the use of the transpose:
... """
... out[j,i] = a[i,j] {id=transpose}
... out[ii,jj] = 2*out[ii,jj] {dep=transpose}
... """)
>>> knl = lp.prioritize_loops(knl, "i,j,ii,jj")
... """,
... [lp.GlobalArg("out", shape=lp.auto, is_input=False), ...])
>>> knl = lp.prioritize_loops(knl, "i,j")
>>> knl = lp.prioritize_loops(knl, "ii,jj")
:func:`loopy.duplicate_inames` can be used to achieve the same goal.
Now the intended code is generated and our test passes.
.. doctest::
>>> knl = lp.set_options(knl, "write_cl")
>>> knl = lp.set_options(knl, write_code=True)
>>> evt, (out,) = knl(queue, a=a_mat_dev)
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))
<BLANKLINE>
__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out)
__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float *__restrict__ out, __global float const *__restrict__ a, int const n)
{
for (int i = 0; i <= -1 + n; ++i)
for (int j = 0; j <= -1 + n; ++j)
......@@ -482,13 +502,13 @@ ambiguous.
.. doctest::
>>> knl = lp.set_options(knl, "write_cl")
>>> knl = lp.set_options(knl, write_code=True)
>>> evt, (out,) = knl(queue, a=a_mat_dev)
#define lid(N) ((int) get_local_id(N))
...
for (int j = 0; j <= -1 + n; ++j)
for (int i = 0; i <= -1 + n; ++i)
a[n * i + j] = 0.0f;
a[n * i + j] = (float) (0.0f);
...
No more warnings! Loop nesting is also reflected in the dependency graph:
......@@ -520,7 +540,7 @@ is overwritten with the new kernel::
knl = lp.do_something(knl, arguments...)
We've already seen an example of a transformation above:
For instance, :func:`prioritize_loops` fit the pattern.
For instance, :func:`loopy.prioritize_loops` fit the pattern.
:func:`loopy.split_iname` is another fundamental (and useful) transformation. It
turns one existing iname (recall that this is loopy's word for a 'loop
......@@ -540,13 +560,13 @@ Consider this example:
... "a[i] = 0", assumptions="n>=1")
>>> knl = lp.split_iname(knl, "i", 16)
>>> knl = lp.prioritize_loops(knl, "i_outer,i_inner")
>>> knl = lp.set_options(knl, "write_cl")
>>> knl = lp.set_options(knl, write_code=True)
>>> evt, (out,) = knl(queue, a=x_vec_dev)
#define lid(N) ((int) get_local_id(N))
...
for (int i_outer = 0; i_outer <= -1 + ((15 + n) / 16); ++i_outer)
for (int i_inner = 0; i_inner <= (-16 + n + -16 * i_outer >= 0 ? 15 : -1 + n + -16 * i_outer); ++i_inner)
a[16 * i_outer + i_inner] = 0.0f;
for (int i_outer = 0; i_outer <= -1 + (15 + n) / 16; ++i_outer)
for (int i_inner = 0; i_inner <= ((-17 + n + -16 * i_outer >= 0) ? 15 : -1 + n + -16 * i_outer); ++i_inner)
a[16 * i_outer + i_inner] = (float) (0.0f);
...
By default, the new, split inames are named *OLD_outer* and *OLD_inner*,
......@@ -571,13 +591,13 @@ relation to loop nesting. For example, it's perfectly possible to request
... "a[i] = 0", assumptions="n>=1")
>>> knl = lp.split_iname(knl, "i", 16)
>>> knl = lp.prioritize_loops(knl, "i_inner,i_outer")
>>> knl = lp.set_options(knl, "write_cl")
>>> knl = lp.set_options(knl, write_code=True)
>>> evt, (out,) = knl(queue, a=x_vec_dev)
#define lid(N) ((int) get_local_id(N))
...
for (int i_inner = 0; i_inner <= (-17 + n >= 0 ? 15 : -1 + n); ++i_inner)
for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + ((15 + n + 15 * i_inner) / 16); ++i_outer)
a[16 * i_outer + i_inner] = 0.0f;
for (int i_inner = 0; i_inner <= ((-17 + n >= 0) ? 15 : -1 + n); ++i_inner)
for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + (15 + n + 15 * i_inner) / 16; ++i_outer)
a[16 * i_outer + i_inner] = (float) (0.0f);
...
Notice how loopy has automatically generated guard conditionals to make
......@@ -595,13 +615,13 @@ commonly called 'loop tiling':
... assumptions="n mod 16 = 0 and n >= 1")
>>> knl = lp.split_iname(knl, "i", 16)
>>> knl = lp.split_iname(knl, "j", 16)
>>> knl = lp.prioritize_loops(knl, "i_outer,j_outer,i_inner")
>>> knl = lp.set_options(knl, "write_cl")
>>> knl = lp.prioritize_loops(knl, "i_outer,j_outer,i_inner,j_inner")
>>> knl = lp.set_options(knl, write_code=True)
>>> evt, (out,) = knl(queue, a=a_mat_dev)
#define lid(N) ((int) get_local_id(N))
...
for (int i_outer = 0; i_outer <= ((-16 + n) / 16); ++i_outer)
for (int j_outer = 0; j_outer <= ((-16 + n) / 16); ++j_outer)
for (int i_outer = 0; i_outer <= (-16 + n) / 16; ++i_outer)
for (int j_outer = 0; j_outer <= (-16 + n) / 16; ++j_outer)
for (int i_inner = 0; i_inner <= 15; ++i_inner)
for (int j_inner = 0; j_inner <= 15; ++j_inner)
out[n * (16 * i_outer + i_inner) + 16 * j_outer + j_inner] = a[n * (16 * j_outer + j_inner) + 16 * i_outer + i_inner];
......@@ -638,18 +658,17 @@ loop's tag to ``"unr"``:
>>> knl = lp.split_iname(knl, "i", 4)
>>> knl = lp.tag_inames(knl, dict(i_inner="unr"))
>>> knl = lp.prioritize_loops(knl, "i_outer,i_inner")
>>> knl = lp.set_options(knl, "write_cl")
>>> knl = lp.set_options(knl, write_code=True)
>>> evt, (out,) = knl(queue, a=x_vec_dev)
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))
#define int_floor_div_pos_b(a,b) ( ( (a) - ( ((a)<0) ? ((b)-1) : 0 ) ) / (b) )
...
for (int i_outer = 0; i_outer <= int_floor_div_pos_b(-4 + n, 4); ++i_outer)
for (int i_outer = 0; i_outer <= loopy_floor_div_pos_b_int32(-4 + n, 4); ++i_outer)
{
a[4 * i_outer] = 0.0f;
a[4 * i_outer + 1] = 0.0f;
a[4 * i_outer + 2] = 0.0f;
a[4 * i_outer + 3] = 0.0f;
a[4 * i_outer] = (float) (0.0f);
a[1 + 4 * i_outer] = (float) (0.0f);
a[2 + 4 * i_outer] = (float) (0.0f);
a[3 + 4 * i_outer] = (float) (0.0f);
}
...
......@@ -688,7 +707,7 @@ Iname implementation tags are also printed along with the entire kernel:
>>> print(knl)
---------------------------------------------------------------------------
...
INAME IMPLEMENTATION TAGS:
INAME TAGS:
i_inner: unr
i_outer: None
---------------------------------------------------------------------------
......@@ -714,14 +733,14 @@ Let's try this out on our vector fill kernel by creating workgroups of size
... "a[i] = 0", assumptions="n>=0")
>>> knl = lp.split_iname(knl, "i", 128,
... outer_tag="g.0", inner_tag="l.0")
>>> knl = lp.set_options(knl, "write_cl")
>>> knl = lp.set_options(knl, write_code=True)
>>> evt, (out,) = knl(queue, a=x_vec_dev)
#define lid(N) ((int) get_local_id(N))
...
__kernel void __attribute__ ((reqd_work_group_size(128, 1, 1))) loopy_kernel(__global float *__restrict__ a, int const n)
{
if (-1 + -128 * gid(0) + -1 * lid(0) + n >= 0)
a[128 * gid(0) + lid(0)] = 0.0f;
a[128 * gid(0) + lid(0)] = (float) (0.0f);
}
Loopy requires that workgroup sizes are fixed and constant at compile time.
......@@ -733,11 +752,11 @@ those for us:
.. doctest::
>>> glob, loc = knl.get_grid_size_upper_bounds()
>>> glob, loc = knl["loopy_kernel"].get_grid_size_upper_bounds(knl.callables_table)
>>> print(glob)
(Aff("[n] -> { [(floor((127 + n)/128))] }"),)
(PwAff("[n] -> { [(floor((127 + n)/128))] }"),)
>>> print(loc)
(Aff("[n] -> { [(128)] }"),)
(PwAff("[n] -> { [(128)] }"),)
Note that this functionality returns internal objects and is not really
intended for end users.
......@@ -760,26 +779,26 @@ assumption:
>>> knl = lp.split_iname(knl, "i", 4)
>>> knl = lp.tag_inames(knl, dict(i_inner="unr"))
>>> knl = lp.prioritize_loops(knl, "i_outer,i_inner")
>>> knl = lp.set_options(knl, "write_cl")
>>> knl = lp.set_options(knl, write_code=True)
>>> evt, (out,) = knl(queue, a=x_vec_dev)
#define lid(N) ((int) get_local_id(N))
...
for (int i_outer = 0; i_outer <= -1 + ((3 + n) / 4); ++i_outer)
for (int i_outer = 0; i_outer <= -1 + (3 + n) / 4; ++i_outer)
{
a[4 * i_outer] = 0.0f;
a[4 * i_outer] = (float) (0.0f);
if (-2 + -4 * i_outer + n >= 0)
a[4 * i_outer + 1] = 0.0f;
a[1 + 4 * i_outer] = (float) (0.0f);
if (-3 + -4 * i_outer + n >= 0)
a[4 * i_outer + 2] = 0.0f;
a[2 + 4 * i_outer] = (float) (0.0f);
if (-4 + -4 * i_outer + n >= 0)
a[4 * i_outer + 3] = 0.0f;
a[3 + 4 * i_outer] = (float) (0.0f);
}
...
While these conditionals enable the generated code to deal with arbitrary
*n*, they come at a performance cost. Loopy allows generating separate code
for the last iteration of the *i_outer* loop, by using the *slabs* keyword
argument to :func:`split_iname`. Since this last iteration of *i_outer* is
argument to :func:`loopy.split_iname`. Since this last iteration of *i_outer* is
the only iteration for which ``i_inner + 4*i_outer`` can become larger than
*n*, only the (now separate) code for that iteration contains conditionals,
enabling some cost savings:
......@@ -788,32 +807,32 @@ enabling some cost savings:
>>> knl = orig_knl
>>> knl = lp.split_iname(knl, "i", 4, slabs=(0, 1), inner_tag="unr")
>>> knl = lp.set_options(knl, "write_cl")
>>> knl = lp.set_options(knl, write_code=True)
>>> knl = lp.prioritize_loops(knl, "i_outer,i_inner")
>>> evt, (out,) = knl(queue, a=x_vec_dev)
#define lid(N) ((int) get_local_id(N))
...
/* bulk slab for 'i_outer' */
for (int i_outer = 0; i_outer <= -2 + ((3 + n) / 4); ++i_outer)
for (int i_outer = 0; i_outer <= -2 + (3 + n) / 4; ++i_outer)
{
a[4 * i_outer] = 0.0f;
a[4 * i_outer + 1] = 0.0f;
a[4 * i_outer + 2] = 0.0f;
a[4 * i_outer + 3] = 0.0f;
a[4 * i_outer] = (float) (0.0f);
a[1 + 4 * i_outer] = (float) (0.0f);
a[2 + 4 * i_outer] = (float) (0.0f);
a[3 + 4 * i_outer] = (float) (0.0f);
}
/* final slab for 'i_outer' */
{
int const i_outer = -1 + n + -1 * (3 * n / 4);
int const i_outer = -1 + n + -1 * ((3 * n) / 4);
<BLANKLINE>
if (-1 + n >= 0)
if (i_outer >= 0)
{
a[4 * i_outer] = 0.0f;
a[4 * i_outer] = (float) (0.0f);
if (-2 + -4 * i_outer + n >= 0)
a[4 * i_outer + 1] = 0.0f;
a[1 + 4 * i_outer] = (float) (0.0f);
if (-3 + -4 * i_outer + n >= 0)
a[4 * i_outer + 2] = 0.0f;
a[2 + 4 * i_outer] = (float) (0.0f);
if (4 + 4 * i_outer + -1 * n == 0)
a[4 * i_outer + 3] = 0.0f;
a[3 + 4 * i_outer] = (float) (0.0f);
}
}
...
......@@ -884,7 +903,7 @@ memory, local to each work item.
.. doctest::
>>> knl = lp.set_options(knl, "write_cl")
>>> knl = lp.set_options(knl, write_code=True)
>>> evt, (out1, out2) = knl(queue, a=x_vec_dev)
#define lid(N) ((int) get_local_id(N))
...
......@@ -940,12 +959,12 @@ Consider the following example:
... "{ [i_outer,i_inner, k]: "
... "0<= 16*i_outer + i_inner <n and 0<= i_inner,k <16}",
... """
... <> a_temp[i_inner] = a[16*i_outer + i_inner] {priority=10}
... <> a_temp[i_inner] = a[16*i_outer + i_inner]
... out[16*i_outer + i_inner] = sum(k, a_temp[k])
... """)
>>> knl = lp.tag_inames(knl, dict(i_outer="g.0", i_inner="l.0"))
>>> knl = lp.set_temporary_scope(knl, "a_temp", "local")
>>> knl = lp.set_options(knl, "write_cl")
>>> knl = lp.set_temporary_address_space(knl, "a_temp", "local")
>>> knl = lp.set_options(knl, write_code=True)
>>> evt, (out,) = knl(queue, a=x_vec_dev)
#define lid(N) ((int) get_local_id(N))
...
......@@ -1010,13 +1029,13 @@ transformation exists in :func:`loopy.add_prefetch`:
... out[16*i_outer + i_inner] = sum(k, a[16*i_outer + i_inner])
... """)
>>> knl = lp.tag_inames(knl, dict(i_outer="g.0", i_inner="l.0"))
>>> knl = lp.set_options(knl, "write_cl")
>>> knl = lp.set_options(knl, write_code=True)
>>> knl_pf = lp.add_prefetch(knl, "a")
>>> evt, (out,) = knl_pf(queue, a=x_vec_dev)
#define lid(N) ((int) get_local_id(N))
...
acc_k = 0.0f;
a_fetch = a[16 * gid(0) + lid(0)];
acc_k = 0.0f;
for (int k = 0; k <= 15; ++k)
acc_k = acc_k + a_fetch;
out[16 * gid(0) + lid(0)] = acc_k;
......@@ -1035,17 +1054,15 @@ earlier:
.. doctest::
>>> knl_pf = lp.add_prefetch(knl, "a", ["i_inner"])
>>> knl_pf = lp.add_prefetch(knl, "a", ["i_inner"], default_tag="l.0")
>>> evt, (out,) = knl_pf(queue, a=x_vec_dev)
#define lid(N) ((int) get_local_id(N))
...
if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
acc_k = 0.0f;
if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
a_fetch[lid(0)] = a[16 * gid(0) + lid(0)];
barrier(CLK_LOCAL_MEM_FENCE) /* for a_fetch (insn_k_update depends on a_fetch_rule) */;
if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
{
acc_k = 0.0f;
for (int k = 0; k <= 15; ++k)
acc_k = acc_k + a_fetch[lid(0)];
out[16 * gid(0) + lid(0)] = acc_k;
......@@ -1064,7 +1081,7 @@ Temporaries in global memory
local and private temporaries, the runtime allocates storage for global
temporaries when the kernel gets executed. The user must explicitly specify that
a temporary is global. To specify that a temporary is global, use
:func:`loopy.set_temporary_scope`.
:func:`loopy.set_temporary_address_space`.
Substitution rules
~~~~~~~~~~~~~~~~~~
......@@ -1105,11 +1122,12 @@ work item:
:mod:`loopy` supports two kinds of barriers:
* *Local barriers* ensure consistency of local memory accesses to items within
* *Local barriers* ensure consistency of memory accesses to items within
*the same* work group. This synchronizes with all instructions in the work
group.
group. The type of memory (local or global) may be specified by the
:attr:`loopy.BarrierInstruction.mem_kind`
* *Global barriers* ensure consistency of global memory accesses
* *Global barriers* ensure consistency of memory accesses
across *all* work groups, i.e. it synchronizes with every work item
executing the kernel. Note that there is no exact equivalent for
this kind of barrier in OpenCL. [#global-barrier-note]_
......@@ -1118,14 +1136,17 @@ Once a work item has reached a barrier, it waits for everyone that it
synchronizes with to reach the barrier before continuing. This means that unless
all work items reach the same barrier, the kernel will hang during execution.
Barrier insertion
~~~~~~~~~~~~~~~~~
By default, :mod:`loopy` inserts local barriers between two instructions when it
detects that a dependency involving local memory may occur across work items. To
see this in action, take a look at the section on :ref:`local_temporaries`.
In contrast, :mod:`loopy` will *not* insert global barriers automatically.
Global barriers require manual intervention along with some special
post-processing which we describe below. Consider the following kernel, which
attempts to rotate its input to the right by 1 in parallel:
In contrast, :mod:`loopy` will *not* insert global barriers automatically and
instead will report an error if it detects the need for a global barrier. As an
example, consider the following kernel, which attempts to rotate its input to
the right by 1 in parallel:
.. doctest::
......@@ -1151,14 +1172,28 @@ this, :mod:`loopy` will complain that global barrier needs to be inserted:
>>> cgr = lp.generate_code_v2(knl)
Traceback (most recent call last):
...
MissingBarrierError: Dependency 'rotate depends on maketmp' (for variable 'arr') requires synchronization by a global barrier (add a 'no_sync_with' instruction option to state that no synchronization is needed)
loopy.diagnostic.MissingBarrierError: rotate_v1: Dependency 'rotate depends on maketmp' (for variable 'arr') requires synchronization by a global barrier (add a 'no_sync_with' instruction option to state that no synchronization is needed)
The syntax for inserting a global barrier instruction is
``... gbarrier``. :mod:`loopy` also supports manually inserting local
barriers. The syntax for a local barrier instruction is ``... lbarrier``.
Saving temporaries across global barriers
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The syntax for a global barrier instruction is ``... gbarrier``. This needs to
be added between the pair of offending instructions.
For some platforms (currently only PyOpenCL), :mod:`loopy` implements global
barriers by splitting the kernel into a host side kernel and multiple
device-side kernels. On such platforms, it will be necessary to save non-global
temporaries that are live across kernel calls. This section presents an example
of how to use :func:`loopy.save_and_reload_temporaries` which is helpful for
that purpose.
Let us start with an example. Consider the kernel from above with a
``... gbarrier`` instruction that has already been inserted.
.. doctest::
>>> knl = lp.make_kernel(
>>> prog = lp.make_kernel(
... "[n] -> {[i] : 0<=i<n}",
... """
... for i
......@@ -1173,34 +1208,41 @@ be added between the pair of offending instructions.
... ],
... name="rotate_v2",
... assumptions="n mod 16 = 0")
>>> knl = lp.split_iname(knl, "i", 16, inner_tag="l.0", outer_tag="g.0")
>>> prog = lp.split_iname(prog, "i", 16, inner_tag="l.0", outer_tag="g.0")
When we try to generate code for this, it will still not work.
.. testsetup::
>>> cgr = lp.generate_code_v2(knl)
>>> prog = prog.with_kernel(
... prog.default_entrypoint.copy(
... silenced_warnings=["v1_scheduler_fallback"]))
Here is what happens when we try to generate code for the kernel:
>>> cgr = lp.generate_code_v2(prog)
Traceback (most recent call last):
...
MissingDefinitionError: temporary variable 'tmp' gets used in subkernel 'rotate_v2_0' without a definition (maybe you forgot to call loopy.save_and_reload_temporaries?)
loopy.diagnostic.MissingDefinitionError: temporary variable 'tmp' gets used in subkernel 'rotate_v2_0' without a definition (maybe you forgot to call loopy.save_and_reload_temporaries?)
To understand what is going on, you need to know that :mod:`loopy` implements
global barriers by splitting the kernel into multiple device-side kernels. The
splitting happens when the instruction schedule is generated. To see the
schedule, we must first call :func:`loopy.get_one_scheduled_kernel`:
This happens due to the kernel splitting done by :mod:`loopy`. The splitting
happens when the instruction schedule is generated. To see the schedule, we
should call :func:`loopy.get_one_linearized_kernel`:
>>> knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl))
>>> print(knl)
>>> prog = lp.preprocess_kernel(prog)
>>> knl = lp.get_one_linearized_kernel(prog["rotate_v2"], prog.callables_table)
>>> prog = prog.with_kernel(knl)
>>> print(prog)
---------------------------------------------------------------------------
KERNEL: rotate_v2
---------------------------------------------------------------------------
...
---------------------------------------------------------------------------
SCHEDULE:
0: CALL KERNEL rotate_v2(extra_args=[], extra_inames=[])
1: [maketmp] tmp <- arr[i_inner + i_outer*16]
LINEARIZATION:
0: CALL KERNEL rotate_v2
1: tmp = arr[i_inner + i_outer*16] {id=maketmp}
2: RETURN FROM KERNEL rotate_v2
3: ---BARRIER:global---
4: CALL KERNEL rotate_v2_0(extra_args=[], extra_inames=[])
5: [rotate] arr[((1 + i_inner + i_outer*16) % n)] <- tmp
3: ... gbarrier
4: CALL KERNEL rotate_v2_0
5: arr[(i_inner + i_outer*16 + 1) % n] = tmp {id=rotate}
6: RETURN FROM KERNEL rotate_v2_0
---------------------------------------------------------------------------
......@@ -1214,33 +1256,33 @@ goes for local temporaries).
:func:`loopy.save_and_reload_temporaries` for the purpose of handling the
task of saving and restoring temporary values across global barriers. This
function adds instructions to the kernel without scheduling them. That means
that :func:`loopy.get_one_scheduled_kernel` needs to be called one more time to
that :func:`loopy.get_one_linearized_kernel` needs to be called one more time to
put those instructions into the schedule.
>>> knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl))
>>> knl = lp.save_and_reload_temporaries(knl)
>>> knl = lp.get_one_scheduled_kernel(knl) # Schedule added instructions
>>> print(knl)
>>> prog = lp.save_and_reload_temporaries(prog)
>>> knl = lp.get_one_linearized_kernel(prog["rotate_v2"], prog.callables_table) # Schedule added instructions
>>> prog = prog.with_kernel(knl)
>>> print(prog)
---------------------------------------------------------------------------
KERNEL: rotate_v2
---------------------------------------------------------------------------
...
---------------------------------------------------------------------------
TEMPORARIES:
tmp: type: np:dtype('int32'), shape: () scope:private
tmp_save_slot: type: np:dtype('int32'), shape: (n // 16, 16), dim_tags: (N1:stride:16, N0:stride:1) scope:global
tmp: type: np:dtype('int32'), shape: () aspace: private
tmp_save_slot: type: np:dtype('int32'), shape: (n // 16, 16), dim_tags: (N1:stride:16, N0:stride:1) aspace: global
---------------------------------------------------------------------------
...
---------------------------------------------------------------------------
SCHEDULE:
0: CALL KERNEL rotate_v2(extra_args=['tmp_save_slot'], extra_inames=[])
1: [maketmp] tmp <- arr[i_inner + i_outer*16]
2: [tmp.save] tmp_save_slot[tmp_save_hw_dim_0_rotate_v2, tmp_save_hw_dim_1_rotate_v2] <- tmp
LINEARIZATION:
0: CALL KERNEL rotate_v2
1: tmp = arr[i_inner + i_outer*16] {id=maketmp}
2: tmp_save_slot[tmp_save_hw_dim_0_rotate_v2, tmp_save_hw_dim_1_rotate_v2] = tmp {id=tmp.save}
3: RETURN FROM KERNEL rotate_v2
4: ---BARRIER:global---
5: CALL KERNEL rotate_v2_0(extra_args=['tmp_save_slot'], extra_inames=[])
6: [tmp.reload] tmp <- tmp_save_slot[tmp_reload_hw_dim_0_rotate_v2_0, tmp_reload_hw_dim_1_rotate_v2_0]
7: [rotate] arr[((1 + i_inner + i_outer*16) % n)] <- tmp
4: ... gbarrier
5: CALL KERNEL rotate_v2_0
6: tmp = tmp_save_slot[tmp_reload_hw_dim_0_rotate_v2_0, tmp_reload_hw_dim_1_rotate_v2_0] {id=tmp.reload}
7: arr[(i_inner + i_outer*16 + 1) % n] = tmp {id=rotate}
8: RETURN FROM KERNEL rotate_v2_0
---------------------------------------------------------------------------
......@@ -1259,12 +1301,12 @@ does in more detail:
The kernel translates into two OpenCL kernels.
>>> cgr = lp.generate_code_v2(knl)
>>> cgr = lp.generate_code_v2(prog)
>>> print(cgr.device_code())
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))
<BLANKLINE>
__kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2(__global int *__restrict__ arr, int const n, __global int *__restrict__ tmp_save_slot)
__kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2(__global int const *__restrict__ arr, int const n, __global int *__restrict__ tmp_save_slot)
{
int tmp;
<BLANKLINE>
......@@ -1272,20 +1314,20 @@ The kernel translates into two OpenCL kernels.
tmp_save_slot[16 * gid(0) + lid(0)] = tmp;
}
<BLANKLINE>
__kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2_0(__global int *__restrict__ arr, int const n, __global int *__restrict__ tmp_save_slot)
__kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2_0(__global int *__restrict__ arr, int const n, __global int const *__restrict__ tmp_save_slot)
{
int tmp;
<BLANKLINE>
tmp = tmp_save_slot[16 * gid(0) + lid(0)];
arr[((1 + lid(0) + gid(0) * 16) % n)] = tmp;
arr[(1 + lid(0) + gid(0) * 16) % n] = tmp;
}
Executing the kernel does what we expect.
Now we can execute the kernel.
>>> arr = cl.array.arange(queue, 16, dtype=np.int32)
>>> print(arr)
[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
>>> evt, (out,) = knl(queue, arr=arr)
>>> evt, (out,) = prog(queue, arr=arr)
>>> print(arr)
[15 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14]
......@@ -1339,9 +1381,9 @@ a loopy kernel by simply calling them, e.g.::
Additionally, all functions of one variable are currently recognized during
code generation; however, additional implementation may be required for custom
functions. The full list of available functions may be found in the
:class:`TargetBase` implementation (e.g. :class:`CudaTarget`)
:class:`loopy.TargetBase` implementation (e.g. :class:`loopy.CudaTarget`)
Custom user functions may be represented using the method described in :ref:`_functions`
Custom user functions may be represented using the method described in :ref:`functions`
Data-dependent control flow
......@@ -1377,7 +1419,7 @@ Attempting to create this kernel results in an error:
... # While trying to find shape axis 0 of argument 'out', the following exception occurred:
Traceback (most recent call last):
...
StaticValueFindingError: a static maximum was not found for PwAff '[n] -> { [(1)] : n <= 1; [(n)] : n >= 2 }'
loopy.diagnostic.StaticValueFindingError: a static maximum was not found for PwAff '[n] -> { [(1)] : n <= 1; [(n)] : n >= 2 }'
The problem is that loopy cannot find a simple, universally valid expression
for the length of *out* in this case. Notice how the kernel accesses both the
......@@ -1433,7 +1475,7 @@ We'll also request a prefetch--but suppose we only do so across the
.. doctest::
>>> knl = lp.add_prefetch(knl, "a", "i_inner")
>>> knl = lp.add_prefetch(knl, "a", "i_inner", default_tag="l.auto")
When we try to run our code, we get the following warning from loopy as a first
sign that something is amiss:
......@@ -1443,13 +1485,13 @@ sign that something is amiss:
>>> evt, (out,) = knl(queue, a=a_mat_dev)
Traceback (most recent call last):
...
WriteRaceConditionWarning: in kernel transpose: instruction 'a_fetch_rule' looks invalid: it assigns to indices based on local IDs, but its temporary 'a_fetch' cannot be made local because a write race across the iname(s) 'j_inner' would emerge. (Do you need to add an extra iname to your prefetch?) (add 'write_race_local(a_fetch_rule)' to silenced_warnings kernel attribute to disable)
loopy.diagnostic.WriteRaceConditionWarning: in kernel transpose: instruction 'a_fetch_rule' looks invalid: it assigns to indices based on local IDs, but its temporary 'a_fetch' cannot be made local because a write race across the iname(s) 'j_inner' would emerge. (Do you need to add an extra iname to your prefetch?) (add 'write_race_local(a_fetch_rule)' to silenced_warnings kernel attribute to disable)
When we ask to see the code, the issue becomes apparent:
.. doctest::
>>> knl = lp.set_options(knl, "write_cl")
>>> knl = lp.set_options(knl, write_code=True)
>>> from warnings import catch_warnings
>>> with catch_warnings():
... filterwarnings("always", category=lp.LoopyWarning)
......@@ -1522,24 +1564,22 @@ containing different types of data:
... """
... c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
... e[i, k] = g[i,k]*(2+h[i,k+1])
... """)
... """, name="stats_knl")
>>> knl = lp.add_and_infer_dtypes(knl,
... dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
Note that loopy will infer the data types for arrays c and e from the
Note that loopy will infer the data types for arrays ``c`` and ``e`` from the
information provided. Now we will count the operations:
.. doctest::
>>> op_map = lp.get_op_map(knl)
>>> print(lp.stringify_stats_mapping(op_map))
Op(np:dtype('float32'), add) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
Op(np:dtype('float32'), div) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
Op(np:dtype('float32'), mul) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
Op(np:dtype('float64'), add) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
Op(np:dtype('float64'), mul) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
Op(np:dtype('int32'), add) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
<BLANKLINE>
>>> op_map = lp.get_op_map(knl, subgroup_size=32)
>>> print(op_map)
Op(np:dtype('float32'), add, subgroup, "stats_knl"): ...
Each line of output will look roughly like::
Op(np:dtype('float32'), add, subgroup, "kernel_name") : [l, m, n] -> { l * m * n : l > 0 and m > 0 and n > 0 }
:func:`loopy.get_op_map` returns a :class:`loopy.ToCountMap` of **{**
:class:`loopy.Op` **:** :class:`islpy.PwQPolynomial` **}**. A
......@@ -1549,23 +1589,24 @@ number of operations matching the characteristics of the :class:`loopy.Op`
specified in the key (in terms of the :class:`loopy.LoopKernel`
*inames*). :class:`loopy.Op` attributes include:
- dtype: A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the
- dtype: A :class:`loopy.types.LoopyType` or :class:`numpy.dtype` that specifies the
data type operated on.
- name: A :class:`str` that specifies the kind of arithmetic operation as
*add*, *sub*, *mul*, *div*, *pow*, *shift*, *bw* (bitwise), etc.
One way to evaluate these polynomials is with :func:`islpy.eval_with_dict`:
One way to evaluate these polynomials is with :meth:`islpy.PwQPolynomial.eval_with_dict`:
.. doctest::
>>> param_dict = {'n': 256, 'm': 256, 'l': 8}
>>> f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(param_dict)
>>> f32div = op_map[lp.Op(np.float32, 'div')].eval_with_dict(param_dict)
>>> f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(param_dict)
>>> f64add = op_map[lp.Op(np.float64, 'add')].eval_with_dict(param_dict)
>>> f64mul = op_map[lp.Op(np.float64, 'mul')].eval_with_dict(param_dict)
>>> i32add = op_map[lp.Op(np.int32, 'add')].eval_with_dict(param_dict)
>>> from loopy.statistics import CountGranularity as CG
>>> f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict)
>>> f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict)
>>> f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict)
>>> f64add = op_map[lp.Op(np.float64, 'add', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict)
>>> f64mul = op_map[lp.Op(np.float64, 'mul', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict)
>>> i32add = op_map[lp.Op(np.int32, 'add', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict)
>>> print("%i\n%i\n%i\n%i\n%i\n%i" %
... (f32add, f32div, f32mul, f64add, f64mul, i32add))
524288
......@@ -1579,7 +1620,7 @@ One way to evaluate these polynomials is with :func:`islpy.eval_with_dict`:
grouping, and evaluating subsets of the counts. Suppose we want to know the
total number of 32-bit operations of any kind. We can easily count these
using functions :func:`loopy.ToCountMap.filter_by` and
:func:`loopy.ToCountMap.eval_and_sum`:
:func:`loopy.ToCountPolynomialMap.eval_and_sum`:
.. doctest::
......@@ -1595,16 +1636,18 @@ together into keys containing only the specified fields:
.. doctest::
>>> op_map_dtype = op_map.group_by('dtype')
>>> print(lp.stringify_stats_mapping(op_map_dtype))
Op(np:dtype('float32'), None) : [m, l, n] -> { 3 * m * l * n : m > 0 and l > 0 and n > 0 }
Op(np:dtype('float64'), None) : [m, l, n] -> { 2 * m * n : m > 0 and l > 0 and n > 0 }
Op(np:dtype('int32'), None) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
<BLANKLINE>
>>> print(op_map_dtype)
Op(np:dtype('float32'), None, None): ...
>>> f32op_count = op_map_dtype[lp.Op(dtype=np.float32)
... ].eval_with_dict(param_dict)
>>> print(f32op_count)
1572864
The lines of output above might look like::
Op(np:dtype('float32'), None, None) : [m, l, n] -> { 3 * m * l * n : m > 0 and l > 0 and n > 0 }
Op(np:dtype('float64'), None, None) : [m, l, n] -> { 2 * m * n : m > 0 and l > 0 and n > 0 }
See the reference page for :class:`loopy.ToCountMap` and :class:`loopy.Op` for
more information on these functions.
......@@ -1617,15 +1660,16 @@ we'll continue using the kernel from the previous example:
.. doctest::
>>> mem_map = lp.get_mem_access_map(knl)
>>> print(lp.stringify_stats_mapping(mem_map))
MemAccess(global, np:dtype('float32'), 0, load, a) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 }
MemAccess(global, np:dtype('float32'), 0, load, b) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
MemAccess(global, np:dtype('float32'), 0, store, c) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
MemAccess(global, np:dtype('float64'), 0, load, g) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
MemAccess(global, np:dtype('float64'), 0, load, h) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
MemAccess(global, np:dtype('float64'), 0, store, e) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
<BLANKLINE>
>>> mem_map = lp.get_mem_access_map(knl, subgroup_size=32)
>>> print(mem_map)
MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, 'stats_knl'): ...
Each line of output will look roughly like::
MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, 'stats_knl') : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 }
MemAccess(global, np:dtype('float32'), {}, {}, load, b, None, subgroup, 'stats_knl') : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
MemAccess(global, np:dtype('float32'), {}, {}, store, c, None, subgroup, 'stats_knl') : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
:func:`loopy.get_mem_access_map` returns a :class:`loopy.ToCountMap` of **{**
:class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**.
......@@ -1634,11 +1678,21 @@ we'll continue using the kernel from the previous example:
- mtype: A :class:`str` that specifies the memory type accessed as **global**
or **local**
- dtype: A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the
- dtype: A :class:`loopy.types.LoopyType` or :class:`numpy.dtype` that specifies the
data type accessed.
- stride: An :class:`int` that specifies stride of the memory access. A stride
of 0 indicates a uniform access (i.e. all threads access the same item).
- lid_strides: A :class:`dict` of **{** :class:`int` **:**
:data:`~pymbolic.typing.Expression` or :class:`int` **}** that specifies
local strides for each local id in the memory access index. Local ids not
found will not be present in ``lid_strides.keys()``. Uniform access (i.e.
work-items within a sub-group access the same item) is indicated by setting
``lid_strides[0]=0``, but may also occur when no local id 0 is found, in
which case the 0 key will not be present in lid_strides.
- gid_strides: A :class:`dict` of **{** :class:`int` **:**
:data:`~pymbolic.typing.Expression` or :class:`int` **}** that specifies
global strides for each global id in the memory access index. Global ids not
found will not be present in ``gid_strides.keys()``.
- direction: A :class:`str` that specifies the direction of memory access as
**load** or **store**.
......@@ -1646,17 +1700,21 @@ we'll continue using the kernel from the previous example:
- variable: A :class:`str` that specifies the variable name of the data
accessed.
We can evaluate these polynomials using :func:`islpy.eval_with_dict`:
We can evaluate these polynomials using :meth:`islpy.PwQPolynomial.eval_with_dict`:
.. doctest::
>>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 0, 'load', 'g')
>>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'load', 'g',
... variable_tags=None, count_granularity=CG.SUBGROUP, kernel_name="stats_knl")
... ].eval_with_dict(param_dict)
>>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 0, 'store', 'e')
>>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'store', 'e',
... variable_tags=None, count_granularity=CG.SUBGROUP, kernel_name="stats_knl")
... ].eval_with_dict(param_dict)
>>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 0, 'load', 'a')
>>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'load', 'a',
... variable_tags=None, count_granularity=CG.SUBGROUP, kernel_name="stats_knl")
... ].eval_with_dict(param_dict)
>>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 0, 'store', 'c')
>>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'store', 'c',
... variable_tags=None, count_granularity=CG.SUBGROUP, kernel_name="stats_knl")
... ].eval_with_dict(param_dict)
>>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" %
... (f32ld_a, f32st_c, f64ld_g, f64st_e))
......@@ -1673,20 +1731,13 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`:
.. doctest::
>>> bytes_map = mem_map.to_bytes()
>>> print(lp.stringify_stats_mapping(bytes_map))
MemAccess(global, np:dtype('float32'), 0, load, a) : [m, l, n] -> { 8 * m * l * n : m > 0 and l > 0 and n > 0 }
MemAccess(global, np:dtype('float32'), 0, load, b) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 }
MemAccess(global, np:dtype('float32'), 0, store, c) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 }
MemAccess(global, np:dtype('float64'), 0, load, g) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
MemAccess(global, np:dtype('float64'), 0, load, h) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
MemAccess(global, np:dtype('float64'), 0, store, e) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
<BLANKLINE>
>>> print(bytes_map)
MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, 'stats_knl'): ...
>>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global']
... ).group_by('direction')
>>> print(lp.stringify_stats_mapping(global_ld_st_bytes))
MemAccess(None, None, None, load, None) : [m, l, n] -> { (16 * m + 12 * m * l) * n : m > 0 and l > 0 and n > 0 }
MemAccess(None, None, None, store, None) : [m, l, n] -> { (8 * m + 4 * m * l) * n : m > 0 and l > 0 and n > 0 }
<BLANKLINE>
>>> print(global_ld_st_bytes)
MemAccess(None, None, None, None, load, None, None, None, None): ...
MemAccess(None, None, None, None, store, None, None, None, None): ...
>>> loaded = global_ld_st_bytes[lp.MemAccess(direction='load')
... ].eval_with_dict(param_dict)
>>> stored = global_ld_st_bytes[lp.MemAccess(direction='store')
......@@ -1695,45 +1746,58 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`:
bytes loaded: 7340032
bytes stored: 2621440
The lines of output above might look like::
MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup): [m, l, n] -> { 8 * m * l * n : m > 0 and l > 0 and n > 0 }
MemAccess(global, np:dtype('float32'), {}, {}, load, b, None, subgroup): [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 }
MemAccess(global, np:dtype('float32'), {}, {}, store, c, None, subgroup): [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 }
MemAccess(global, np:dtype('float64'), {}, {}, load, g, None, subgroup): [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
MemAccess(global, np:dtype('float64'), {}, {}, load, h, None, subgroup): [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
MemAccess(global, np:dtype('float64'), {}, {}, store, e, None, subgroup): [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
One can see how these functions might be useful in computing, for example,
achieved memory bandwidth in byte/sec or performance in FLOP/sec.
~~~~~~~~~~~
Since we have not tagged any of the inames or parallelized the kernel across
threads (which would have produced iname tags), :func:`loopy.get_mem_access_map`
considers the memory accesses *uniform*, so the *stride* of each access is 0.
Now we'll parallelize the kernel and count the array accesses again. The
resulting :class:`islpy.PwQPolynomial` will be more complicated this time.
work-items (which would have produced iname tags), :func:`loopy.get_mem_access_map`
finds no local or global id strides, leaving ``lid_strides`` and ``gid_strides``
empty for each memory access. Now we'll parallelize the kernel and count the array
accesses again. The resulting :class:`islpy.PwQPolynomial` will be more complicated
this time.
.. doctest::
>>> knl_consec = lp.split_iname(knl, "k", 128,
... outer_tag="l.1", inner_tag="l.0")
>>> mem_map = lp.get_mem_access_map(knl_consec)
>>> print(lp.stringify_stats_mapping(mem_map))
MemAccess(global, np:dtype('float32'), 1, load, a) : [m, l, n] -> { ... }
MemAccess(global, np:dtype('float32'), 1, load, b) : [m, l, n] -> { ... }
MemAccess(global, np:dtype('float32'), 1, store, c) : [m, l, n] -> { ... }
MemAccess(global, np:dtype('float64'), 1, load, g) : [m, l, n] -> { ... }
MemAccess(global, np:dtype('float64'), 1, load, h) : [m, l, n] -> { ... }
MemAccess(global, np:dtype('float64'), 1, store, e) : [m, l, n] -> { ... }
<BLANKLINE>
With this parallelization, consecutive threads will access consecutive array
>>> mem_map = lp.get_mem_access_map(knl_consec, subgroup_size=32)
>>> print(mem_map)
MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, a, None, workitem, 'stats_knl'): ...
MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, b, None, workitem, 'stats_knl'): ...
MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, store, c, None, workitem, 'stats_knl'): ...
MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, g, None, workitem, 'stats_knl'): ...
MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, h, None, workitem, 'stats_knl'): ...
MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, store, e, None, workitem, 'stats_knl'): ...
With this parallelization, consecutive work-items will access consecutive array
elements in memory. The polynomials are a bit more complicated now due to the
parallelization, but when we evaluate them, we see that the total number of
array accesses has not changed:
.. doctest::
>>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 1, 'load', 'g')
>>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'load', 'g',
... variable_tags=None, count_granularity=CG.WORKITEM, kernel_name="stats_knl")
... ].eval_with_dict(param_dict)
>>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 1, 'store', 'e')
>>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'store', 'e',
... variable_tags=None, count_granularity=CG.WORKITEM, kernel_name="stats_knl")
... ].eval_with_dict(param_dict)
>>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 1, 'load', 'a')
>>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'load', 'a',
... variable_tags=None, count_granularity=CG.WORKITEM, kernel_name="stats_knl")
... ].eval_with_dict(param_dict)
>>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 1, 'store', 'c')
>>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'store', 'c',
... variable_tags=None, count_granularity=CG.WORKITEM, kernel_name="stats_knl")
... ].eval_with_dict(param_dict)
>>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" %
... (f32ld_a, f32st_c, f64ld_g, f64st_e))
......@@ -1744,36 +1808,39 @@ array accesses has not changed:
~~~~~~~~~~~
To produce *nonconsecutive* array accesses with stride greater than 1, we'll
switch the inner and outer tags in our parallelization of the kernel:
To produce *nonconsecutive* array accesses with local id 0 stride greater than 1,
we'll switch the inner and outer tags in our parallelization of the kernel:
.. doctest::
>>> knl_nonconsec = lp.split_iname(knl, "k", 128,
... outer_tag="l.0", inner_tag="l.1")
>>> mem_map = lp.get_mem_access_map(knl_nonconsec)
>>> print(lp.stringify_stats_mapping(mem_map))
MemAccess(global, np:dtype('float32'), 128, load, a) : [m, l, n] -> { ... }
MemAccess(global, np:dtype('float32'), 128, load, b) : [m, l, n] -> { ... }
MemAccess(global, np:dtype('float32'), 128, store, c) : [m, l, n] -> { ... }
MemAccess(global, np:dtype('float64'), 128, load, g) : [m, l, n] -> { ... }
MemAccess(global, np:dtype('float64'), 128, load, h) : [m, l, n] -> { ... }
MemAccess(global, np:dtype('float64'), 128, store, e) : [m, l, n] -> { ... }
<BLANKLINE>
With this parallelization, consecutive threads will access *nonconsecutive*
>>> mem_map = lp.get_mem_access_map(knl_nonconsec, subgroup_size=32)
>>> print(mem_map)
MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, a, None, workitem, 'stats_knl'): ...
MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, b, None, workitem, 'stats_knl'): ...
MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, store, c, None, workitem, 'stats_knl'): ...
MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, g, None, workitem, 'stats_knl'): ...
MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, h, None, workitem, 'stats_knl'): ...
MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, store, e, None, workitem, 'stats_knl'): ...
With this parallelization, consecutive work-items will access *nonconsecutive*
array elements in memory. The total number of array accesses still has not
changed:
.. doctest::
>>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 128, 'load', 'g')
>>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'load', 'g',
... variable_tags=None, count_granularity=CG.WORKITEM, kernel_name="stats_knl")
... ].eval_with_dict(param_dict)
>>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 128, 'store', 'e')
>>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'store', 'e',
... variable_tags=None, count_granularity=CG.WORKITEM, kernel_name="stats_knl")
... ].eval_with_dict(param_dict)
>>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 128, 'load', 'a')
>>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'load', 'a',
... variable_tags=None, count_granularity=CG.WORKITEM, kernel_name="stats_knl")
... ].eval_with_dict(param_dict)
>>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 128, 'store', 'c')
>>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'store', 'c',
... variable_tags=None, count_granularity=CG.WORKITEM, kernel_name="stats_knl")
... ].eval_with_dict(param_dict)
>>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" %
... (f32ld_a, f32st_c, f64ld_g, f64st_e))
......@@ -1791,7 +1858,7 @@ criteria are more complicated than a simple list of allowable values:
>>> def f(key):
... from loopy.types import to_loopy_type
... return key.dtype == to_loopy_type(np.float32) and \
... key.stride > 1
... key.lid_strides[0] > 1
>>> count = mem_map.filter_by_func(f).eval_and_sum(param_dict)
>>> print(count)
2097152
......@@ -1800,21 +1867,20 @@ Counting synchronization events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
:func:`loopy.get_synchronization_map` counts the number of synchronization
events per **thread** in a kernel. First, we'll call this function on the
events per **work-item** in a kernel. First, we'll call this function on the
kernel from the previous example:
.. doctest::
>>> sync_map = lp.get_synchronization_map(knl)
>>> print(lp.stringify_stats_mapping(sync_map))
kernel_launch : { 1 }
<BLANKLINE>
>>> print(sync_map)
Sync(kernel_launch, stats_knl): [l, m, n] -> { 1 }
We can evaluate this polynomial using :func:`islpy.eval_with_dict`:
We can evaluate this polynomial using :meth:`islpy.PwQPolynomial.eval_with_dict`:
.. doctest::
>>> launch_count = sync_map["kernel_launch"].eval_with_dict(param_dict)
>>> launch_count = sync_map[lp.Sync("kernel_launch", "stats_knl")].eval_with_dict(param_dict)
>>> print("Kernel launch count: %s" % launch_count)
Kernel launch count: 1
......@@ -1830,7 +1896,7 @@ Now to make things more interesting, we'll create a kernel with barriers:
... e[i,j,k] = c[i,j,k+1]+c[i,j,k-1]
... """
... ], [
... lp.TemporaryVariable("c", lp.auto, shape=(50, 10, 99)),
... lp.TemporaryVariable("c", dtype=None, shape=(50, 10, 99)),
... "..."
... ])
>>> knl = lp.add_and_infer_dtypes(knl, dict(a=np.int32))
......@@ -1844,38 +1910,35 @@ Now to make things more interesting, we'll create a kernel with barriers:
{
__local int c[50 * 10 * 99];
<BLANKLINE>
{
int const k_outer = 0;
<BLANKLINE>
for (int i = 0; i <= 49; ++i)
for (int j = 0; j <= 9; ++j)
for (int i = 0; i <= 49; ++i)
{
barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn rev-depends on insn_0) */;
c[990 * i + 99 * j + lid(0) + 1] = 2 * a[980 * i + 98 * j + lid(0) + 1];
barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn_0 depends on insn) */;
e[980 * i + 98 * j + lid(0) + 1] = c[990 * i + 99 * j + 1 + lid(0) + 1] + c[990 * i + 99 * j + -1 + lid(0) + 1];
}
}
{
int const k_outer = 0;
<BLANKLINE>
barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn rev-depends on insn_0) */;
c[990 * i + 99 * j + lid(0) + 1] = 2 * a[980 * i + 98 * j + lid(0) + 1];
barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn_0 depends on insn) */;
e[980 * i + 98 * j + lid(0) + 1] = c[990 * i + 99 * j + 1 + lid(0) + 1] + c[990 * i + 99 * j + -1 + lid(0) + 1];
}
}
In this kernel, when a thread performs the second instruction it uses data
produced by *different* threads during the first instruction. Because of this,
In this kernel, when a work-item performs the second instruction it uses data
produced by *different* work-items during the first instruction. Because of this,
barriers are required for correct execution, so loopy inserts them. Now we'll
count the barriers using :func:`loopy.get_synchronization_map`:
.. doctest::
>>> sync_map = lp.get_synchronization_map(knl)
>>> print(lp.stringify_stats_mapping(sync_map))
barrier_local : { 1000 }
kernel_launch : { 1 }
<BLANKLINE>
>>> print(sync_map)
Sync(barrier_local, loopy_kernel): { 1000 }
Sync(kernel_launch, loopy_kernel): { 1 }
Based on the kernel code printed above, we would expect each thread to
Based on the kernel code printed above, we would expect each work-item to
encounter 50x10x2 barriers, which matches the result from
:func:`loopy.get_synchronization_map`. In this case, the number of barriers
does not depend on any inames, so we can pass an empty dictionary to
:func:`islpy.eval_with_dict`.
:meth:`islpy.PwQPolynomial.eval_with_dict`.
.. }}}
......
! Fill the array `out` with the scalar `a`, then multiply every entry
! by `factor`. Both `real_type` and `factor` are C-preprocessor-style
! macros; they are substituted by the lp.c_preprocess call in the
! transform block below before the Fortran source is parsed.
subroutine fill(out, a, n)
  implicit none

  real_type a, out(n)
  integer n, i

  do i = 1, n
    out(i) = a
  end do

  do i = 1, n
    out(i) = out(i) * factor
  end do
end

! The lines between the !$loopy markers are loopy transform code
! (Python), executed by lp.parse_transformed_fortran: each of the two
! `i` loops is split into 128-wide chunks mapped to OpenCL group
! (g.0) and local (l.0) axes.
!$loopy begin
!
! SOURCE = lp.c_preprocess(SOURCE, [
!     "factor 4.0",
!     "real_type real*8",
!     ])
! fill, = lp.parse_fortran(SOURCE, FILENAME)
! fill = lp.split_iname(fill, "i", 128,
!     outer_tag="g.0", inner_tag="l.0")
! fill = lp.split_iname(fill, "i_1", 128,
!     outer_tag="g.0", inner_tag="l.0")
! RESULT = [fill]
!
!$loopy end

! vim:filetype=floopy
{
"metadata": {
"name": "",
"signature": "sha256:c9f8334aa7aa4a5ad1437fa5871aafa52bbc9131271d9e90e7be47d22725cc94"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Loopy IPython Integration Demo"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%load_ext loopy.ipython_ext"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Without transform code"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%fortran_kernel\n",
"\n",
"subroutine fill(out, a, n)\n",
" implicit none\n",
"\n",
" real*8 a, out(n)\n",
" integer n, i\n",
"\n",
" do i = 1, n\n",
" out(i) = a\n",
" end do\n",
"end"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(prog) # noqa: F821"
]
},
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Loopy IPython Integration Demo"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"%load_ext loopy.ipython_ext"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Without transform code"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"%%fortran_kernel\n",
"\n",
"subroutine fill(out, a, n)\n",
" implicit none\n",
"\n",
" real*8 a, out(n)\n",
" integer n, i\n",
"\n",
" do i = 1, n\n",
" out(i) = a\n",
" end do\n",
"end"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print(fill)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"---------------------------------------------------------------------------\n",
"KERNEL: fill\n",
"---------------------------------------------------------------------------\n",
"ARGUMENTS:\n",
"a: ValueArg, type: float64\n",
"n: ValueArg, type: int32\n",
"out: GlobalArg, type: float64, shape: (n), dim_tags: (N0:stride:1)\n",
"---------------------------------------------------------------------------\n",
"DOMAINS:\n",
"[n] -> { [i] : i >= 0 and i <= -1 + n }\n",
"---------------------------------------------------------------------------\n",
"INAME IMPLEMENTATION TAGS:\n",
"i: None\n",
"---------------------------------------------------------------------------\n",
"INSTRUCTIONS:\n",
"[i] out[i] <- a # insn0\n",
"---------------------------------------------------------------------------\n"
]
}
],
"prompt_number": 3
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## With transform code"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"split_amount = 128"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"%%transformed_fortran_kernel\n",
"\n",
"subroutine tr_fill(out, a, n)\n",
" implicit none\n",
"\n",
" real*8 a, out(n)\n",
" integer n, i\n",
"\n",
" do i = 1, n\n",
" out(i) = a\n",
" end do\n",
"end\n",
"\n",
"!$loopy begin\n",
"!\n",
"! tr_fill, = lp.parse_fortran(SOURCE)\n",
"! tr_fill = lp.split_iname(tr_fill, \"i\", split_amount,\n",
"! outer_tag=\"g.0\", inner_tag=\"l.0\")\n",
"! RESULT = [tr_fill]\n",
"!\n",
"!$loopy end"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print(tr_fill)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"---------------------------------------------------------------------------\n",
"KERNEL: tr_fill\n",
"---------------------------------------------------------------------------\n",
"ARGUMENTS:\n",
"a: ValueArg, type: float64\n",
"n: ValueArg, type: int32\n",
"out: GlobalArg, type: float64, shape: (n), dim_tags: (N0:stride:1)\n",
"---------------------------------------------------------------------------\n",
"DOMAINS:\n",
"[n] -> { [i_outer, i_inner] : i_inner >= -128i_outer and i_inner <= -1 + n - 128i_outer and i_inner >= 0 and i_inner <= 127 }\n",
"---------------------------------------------------------------------------\n",
"INAME IMPLEMENTATION TAGS:\n",
"i_inner: l.0\n",
"i_outer: g.0\n",
"---------------------------------------------------------------------------\n",
"INSTRUCTIONS:\n",
"[i_inner,i_outer] out[i_inner + i_outer*128] <- a # insn0\n",
"---------------------------------------------------------------------------\n"
]
}
],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
"cell_type": "markdown",
"metadata": {},
"source": [
"## With transform code"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"split_amount = 128"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%transformed_fortran_kernel\n",
"\n",
"subroutine tr_fill(out, a, n)\n",
" implicit none\n",
"\n",
" real*8 a, out(n)\n",
" integer n, i\n",
"\n",
" do i = 1, n\n",
" out(i) = a\n",
" end do\n",
"end\n",
"\n",
"!$loopy begin\n",
"!\n",
"! tr_fill = lp.parse_fortran(SOURCE)\n",
"! tr_fill = lp.split_iname(tr_fill, \"i\", split_amount,\n",
"! outer_tag=\"g.0\", inner_tag=\"l.0\")\n",
"! RESULT = tr_fill\n",
"!\n",
"!$loopy end"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(prog) # noqa: F821"
]
}
]
}
\ No newline at end of file
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
import numpy as np
import numpy.linalg as la
import pyopencl as cl
import pyopencl.array
import pyopencl.clrandom
import loopy as lp
def main():
    """Run the transformed-Fortran DGEMM demo.

    Reads the kernel from ``matmul.floopy`` next to this script, executes
    its embedded loopy transform code, runs the kernel through PyOpenCL on
    random 2048x2048 inputs, and checks the result against a NumPy
    reference product.
    """
    import pathlib

    kernel_path = pathlib.Path(__file__).parent / "matmul.floopy"
    with open(kernel_path) as src_file:
        kernel_source = src_file.read()

    # parse_transformed_fortran also runs the !$loopy transform block
    # embedded in the source.
    dgemm = lp.parse_transformed_fortran(kernel_source, filename=kernel_path)

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    size = 2048
    # Fortran (column-major) layout, matching the Fortran kernel.
    mat_a = cl.array.empty(queue, (size, size), dtype=np.float64, order="F")
    mat_b = cl.array.empty(queue, (size, size), dtype=np.float64, order="F")
    mat_c = cl.array.zeros(queue, (size, size), dtype=np.float64, order="F")
    cl.clrandom.fill_rand(mat_a)
    cl.clrandom.fill_rand(mat_b)

    dgemm = lp.set_options(dgemm, write_code=True)
    dgemm(queue, a=mat_a, b=mat_b, alpha=1, c=mat_c)

    reference = mat_a.get() @ mat_b.get()
    rel_error = la.norm(reference - mat_c.get()) / la.norm(reference)
    assert rel_error < 1e-10
# Run the demo only when executed as a script (not on import).
if __name__ == "__main__":
    main()