Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • tasmith4/loopy
  • ben_sepanski/loopy
  • arghdos/loopy
  • inducer/loopy
  • wence-/loopy
  • isuruf/loopy
  • fikl2/loopy
  • xywei/loopy
  • kaushikcfd/loopy
  • zweiner2/loopy
10 results
Show changes
Showing
with 2091 additions and 897 deletions
#! /bin/bash
# Build a self-contained loopy binary inside a CentOS 6 Docker container.
# Should be run in this directory (build-helpers).
#
# Usage: ./make-linux-build-docker.sh [--nodate]
#   --nodate: name the output "loopy-centos6" instead of appending today's date.
if test "$1" = "--nodate"; then
    TGT_NAME=loopy-centos6
else
    TGT_NAME=loopy-centos6-$(date +"%Y-%m-%d")
fi
echo "Generating $TGT_NAME..."
set -e
set -x
docker pull centos:6
# Mount this directory into the container and run the inner build script there.
# Quote all expansions so paths containing spaces don't word-split.
CNT=$(docker create -t -v "$(pwd)":/mnt centos:6 /mnt/make-linux-build-docker-inner.sh)
echo "working in container $CNT"
docker start -i "$CNT"
# Copy the built binary out; tolerate failure so the container still gets removed.
docker cp "$CNT":/tmp/build/loopy/dist/loopy "$(pwd)" || true
mv loopy "$TGT_NAME"
docker rm "$CNT"
#! /bin/bash
# Build a standalone loopy executable with PyInstaller.
# run this from the loopy root directory
# Start from a clean slate so stale build products don't leak into the bundle.
rm -Rf dist build
# Use the spec file checked into build-helpers; keep scratch files in build/.
pyinstaller \
--workpath=build/pyinstaller \
build-helpers/loopy.spec
#! /bin/bash
# Upload a built binary (passed as $1) to the public loopy-binaries directory.
set -e
scp "$1" tiker.net:public_html/pub/loopy-binaries/
#!/usr/bin/env python
import ctypes
from os import system
# C source for small integer division/modulo helpers.  ``cdiv``/``cmod``
# expose C's native truncating semantics; the LOOPY_* macros stamp out
# floor-division and positive-modulus variants for each integer width.
C_SRC = """
#include <stdlib.h>
#include <stdint.h>
int64_t cdiv(int64_t a, int64_t b)
{
return a/b;
}
int64_t cmod(int64_t a, int64_t b)
{
return a%b;
}
#define LOOPY_CALL_WITH_INTEGER_TYPES(MACRO_NAME) \
MACRO_NAME(int8, char) \
MACRO_NAME(int16, short) \
MACRO_NAME(int32, int) \
MACRO_NAME(int64, long long)
#define LOOPY_DEFINE_FLOOR_DIV(SUFFIX, TYPE) \
TYPE loopy_floor_div_##SUFFIX(TYPE a, TYPE b) \
{ \
if ((a<0) != (b<0)) \
a = a - (b + (b<0) - (b>=0)); \
return a/b; \
}
LOOPY_CALL_WITH_INTEGER_TYPES(LOOPY_DEFINE_FLOOR_DIV)
#undef LOOPY_DEFINE_FLOOR_DIV
#define LOOPY_DEFINE_FLOOR_DIV_POS_B(SUFFIX, TYPE) \
TYPE loopy_floor_div_pos_b_##SUFFIX(TYPE a, TYPE b) \
{ \
if (a<0) \
a = a - (b-1); \
return a/b; \
}
LOOPY_CALL_WITH_INTEGER_TYPES(LOOPY_DEFINE_FLOOR_DIV_POS_B)
#undef LOOPY_DEFINE_FLOOR_DIV_POS_B
#define LOOPY_DEFINE_MOD_POS_B(SUFFIX, TYPE) \
TYPE loopy_mod_pos_b_##SUFFIX(TYPE a, TYPE b) \
{ \
TYPE result = a%b; \
if (result < 0) \
result += b; \
return result; \
}
LOOPY_CALL_WITH_INTEGER_TYPES(LOOPY_DEFINE_MOD_POS_B)
#undef LOOPY_DEFINE_MOD_POS_B
#define LOOPY_DEFINE_MOD(SUFFIX, TYPE) \
TYPE loopy_mod_##SUFFIX(TYPE a, TYPE b) \
{ \
TYPE result = a%b; \
if (result < 0 && b > 0) \
result += b; \
if (result > 0 && b < 0) \
result = result + b; \
return result; \
}
LOOPY_CALL_WITH_INTEGER_TYPES(LOOPY_DEFINE_MOD)
#undef LOOPY_DEFINE_MOD
"""
def main():
    """Compile small C integer div/mod helpers and check them against Python.

    Writes ``int-experiments.c``, builds it into ``int-experiments.so`` with
    gcc, then exhaustively compares the C implementations with Python's
    floor-division (``//``) and modulo (``%``) over a small operand range.
    """
    with open("int-experiments.c", "w") as outf:
        outf.write(C_SRC)

    system("gcc -Wall -shared int-experiments.c -o int-experiments.so")

    int_exp = ctypes.CDLL("int-experiments.so")
    for func in [
            int_exp.cdiv,
            int_exp.cmod,
            int_exp.loopy_floor_div_int64,
            int_exp.loopy_floor_div_pos_b_int64,
            int_exp.loopy_mod_pos_b_int64,
            int_exp.loopy_mod_int64,
            ]:
        func.argtypes = [ctypes.c_longlong, ctypes.c_longlong]
        func.restype = ctypes.c_longlong

    cmod = int_exp.cmod
    int_floor_div = int_exp.loopy_floor_div_int64
    int_floor_div_pos_b = int_exp.loopy_floor_div_pos_b_int64
    int_mod_pos_b = int_exp.loopy_mod_pos_b_int64
    int_mod = int_exp.loopy_mod_int64

    m = 50

    # loopy_floor_div_pos_b: floor division, b restricted to positive values.
    # The operands go into the assert message; the original printed them in
    # an unreachable branch after the assert had already fired.
    for a in range(-m, m):
        for b in range(1, m):
            cresult = int_floor_div_pos_b(a, b)
            presult = a // b
            assert cresult == presult, (a, b, cresult, presult)

    # loopy_floor_div: floor division for any nonzero b.
    for a in range(-m, m):
        for b in range(-m, m):
            if b == 0:
                continue
            cresult = int_floor_div(a, b)
            presult = a // b
            assert cresult == presult, (a, b, cresult, presult)

    # loopy_mod_pos_b: Python-style modulus, b restricted to positive values.
    for a in range(-m, m):
        for b in range(1, m):
            cresult = int_mod_pos_b(a, b)
            presult = a % b
            assert cresult == presult, (a, b, cresult, presult)

    # loopy_mod: Python-style modulus for any nonzero b.
    for a in range(-m, m):
        for b in range(-m, m):
            if b == 0:
                continue
            cresult = int_mod(a, b)
            presult = a % b
            assert cresult == presult, (a, b, cresult, presult)

    # Demonstrate that C's % is a remainder whose sign follows the dividend,
    # unlike Python's %, whose sign follows the divisor.
    print(cmod(23, -11), 23 % -11)


if __name__ == "__main__":
    main()
import numpy as np
# Inspired by a visualization used in the Halide tutorial
# https://www.youtube.com/watch?v=3uiEyEKji0M
def div_ceil(nr, dr):
    """Return the ceiling of ``nr / dr`` using integer arithmetic only."""
    quot, rem = divmod(nr, dr)
    return quot + 1 if rem else quot
def product(iterable):
    """Multiply all elements of *iterable* together; 1 for an empty iterable."""
    result = 1
    for factor in iterable:
        result *= factor
    return result
class ArrayAccessPatternContext:
    """Simulated ND-range execution context for visualizing array accesses.

    Holds a global/local size, a simulated clock, and the list of tracked
    :class:`Array` instances, and can animate their access history with
    matplotlib.
    """

    def __init__(self, gsize, lsize, subgroup_size=32, decay_constant=0.75):
        self.lsize = lsize
        self.gsize = gsize
        self.subgroup_size = subgroup_size
        self.timestamp = 0
        self.decay_constant = decay_constant
        # One broadcast axis per group dimension, then one per local dimension.
        self.ind_length = len(gsize) + len(lsize)
        self.arrays = []

    def l(self, index):  # noqa: E743
        """Local ids along axis *index*, shaped to broadcast over all axes."""
        shape_spec = [np.newaxis] * self.ind_length
        shape_spec[len(self.gsize) + index] = slice(None)
        return np.arange(self.lsize[index])[tuple(shape_spec)]

    def g(self, index):
        """Group ids along axis *index*, shaped to broadcast over all axes."""
        shape_spec = [np.newaxis] * self.ind_length
        shape_spec[index] = slice(None)
        return np.arange(self.gsize[index])[tuple(shape_spec)]

    def nsubgroups(self):
        """Number of subgroups needed to cover one work group."""
        return div_ceil(product(self.lsize), self.subgroup_size)

    def animate(self, f, interval=200):
        """Animate accesses produced by generator function *f*, one tick per yield."""
        import matplotlib.animation as animation
        import matplotlib.pyplot as plt

        fig = plt.figure()

        plot_artists = []
        for nplot, ary in enumerate(self.arrays, start=1):
            ax = fig.add_subplot(1, len(self.arrays), nplot)
            ax.set_title(ary.name)
            plot_artists.append(ary.plot(ax))

        def data_gen():
            for _ in f():
                self.tick()
                for ary, artist in zip(self.arrays, plot_artists):
                    artist.set_array(ary.get_plot_data())
                fig.canvas.draw()
                yield plot_artists

        # must be kept alive until after plt.show()
        return animation.FuncAnimation(
                fig, lambda x: x, data_gen,
                blit=False, interval=interval, repeat=True)

    def tick(self):
        """Advance the simulated clock by one step."""
        self.timestamp += 1
class Array:
    """An array whose element accesses are recorded into a context for plotting."""

    def __init__(self, ctx, name, shape, strides, elements_per_row=None):
        # Each array element stores a tuple:
        # (timestamp, subgroup, g0, g1, g2, ) of last access
        assert len(shape) == len(strides)

        self.nattributes = 2 + len(ctx.gsize)

        if elements_per_row is None:
            if len(shape) > 1:
                # Plot rows follow the fastest-moving (smallest-stride) axis.
                smallest = min(strides)
                for extent, stride in zip(shape, strides):
                    if stride == smallest:
                        elements_per_row = extent
                        break
            else:
                elements_per_row = 256

        self.array = np.zeros((product(shape), self.nattributes,), dtype=np.int32)
        self.ctx = ctx
        self.name = name
        self.shape = shape
        self.strides = strides
        self.elements_per_row = elements_per_row

        ctx.arrays.append(self)

    def __getitem__(self, index):
        """Record an access at *index* (a scalar or tuple of broadcastable indices)."""
        if not isinstance(index, tuple):
            index = (index,)

        assert len(index) == len(self.shape)

        all_subscript = (np.newaxis,) * self.ctx.ind_length

        def reshape_ind(ind):
            # NOTE(review): not called in the visible code; kept for parity.
            if not isinstance(ind, np.ndarray):
                return ind[all_subscript]
            else:
                assert len(ind.shape) == self.ctx.ind_length

        # Linearize the multi-dimensional index using the array's strides.
        lin_index = sum(
                idx * stride
                for idx, stride in zip(index, self.strides))

        if not isinstance(lin_index, np.ndarray):
            # Scalar access: promote to a broadcastable 0-extent array.
            shape_spec = [np.newaxis] * self.ctx.ind_length
            lin_index = np.array(lin_index)[shape_spec]

        # Attribute 0: simulated time of last access.
        self.array[lin_index, 0] = self.ctx.timestamp

        # Attributes 2..: group id along each group axis that varies here.
        for axis, _glength in enumerate(self.ctx.gsize):
            if lin_index.shape[axis] > 1:
                self.array[lin_index, 2 + axis] = self.ctx.g(axis)

        # Attribute 1: subgroup of the accessing work item (row-major
        # linearization of the local ids, divided by the subgroup size).
        workitem_index = 0
        for axis in range(len(self.ctx.lsize))[::-1]:
            workitem_index = (
                    workitem_index * self.ctx.lsize[axis]
                    + self.ctx.l(axis))
        self.array[lin_index, 1] = workitem_index // self.ctx.subgroup_size

    def __setitem__(self, index, value):
        # Writes are recorded exactly like reads; the value is irrelevant.
        self.__getitem__(index)

    def get_plot_data(self):
        """Return an RGB image of the last-access data, dimmed by access age."""
        nelements = self.array.shape[0]
        base_shape = (
                div_ceil(nelements, self.elements_per_row),
                self.elements_per_row,)
        shaped_array = np.zeros(
                (*base_shape, self.nattributes),
                dtype=np.float32)
        shaped_array.reshape(-1, self.nattributes)[:nelements] = self.array

        # Older accesses fade out exponentially.
        modulation = np.exp(
                -self.ctx.decay_constant*(self.ctx.timestamp-shaped_array[:, :, 0]))

        subgroup = shaped_array[:, :, 1]
        if self.ctx.nsubgroups() > 1:
            subgroup = subgroup/(self.ctx.nsubgroups()-1)
        else:
            subgroup.fill(1)

        rgb_array = np.zeros((*base_shape, 3))
        if 1:
            if len(self.ctx.gsize) > 1:
                # g.0 -> red
                rgb_array[:, :, 0] = shaped_array[:, :, 2]/(self.ctx.gsize[0]-1)
            if len(self.ctx.gsize) > 1:
                # g.1 -> blue
                rgb_array[:, :, 2] = shaped_array[:, :, 3]/(self.ctx.gsize[1]-1)
        if 1:
            # subgroup -> green
            rgb_array[:, :, 1] = subgroup

        return rgb_array*modulation[:, :, np.newaxis]

    def plot(self, ax, **kwargs):
        """Draw the current plot data onto matplotlib axes *ax*."""
        return ax.imshow(
                self.get_plot_data(), interpolation="nearest",
                **kwargs)
def show_example():
    """Demo: animate two tilings of a 2D array access (flip the 0/1 flags)."""
    n = 2**7
    n16 = div_ceil(n, 16)
    ctx = ArrayAccessPatternContext(gsize=(n16, n16), lsize=(16, 16))
    in0 = Array(ctx, "in0", (n, n), (n, 1))

    if 0:
        # knl a
        i_inner = ctx.l(1)
        i_outer = ctx.g(1)
        k_inner = ctx.l(0)

        def f():
            for k_outer in range(n16):
                in0[i_inner + i_outer*16, k_inner + k_outer*16]
                yield

    elif 0:
        # knl b
        j_inner = ctx.l(0)
        j_outer = ctx.g(0)
        k_inner = ctx.l(1)

        def f():
            for k_outer in range(n16):
                in0[k_inner + k_outer*16, j_inner + j_outer*16]
                yield

    # NOTE(review): with both flags left at 0 above, f is never defined and
    # the next line raises NameError -- enable exactly one branch to run this.
    ani = ctx.animate(f)
    import matplotlib.pyplot as plt
    if 1:
        plt.show()
    else:
        ani.save("access.mp4")
def show_example_2():
    """Demo: animate a blocked (tiled) traversal of a square array."""
    bsize = 8
    blocks = 3
    ctx = ArrayAccessPatternContext(gsize=(1,), lsize=(1,),
            decay_constant=0.005)
    in0 = Array(ctx, "in0", (blocks*bsize, blocks*bsize), (blocks*bsize, 1))

    def f():
        # Visit the array one bsize x bsize tile at a time.
        for i_outer in range(blocks):
            for j_outer in range(blocks):
                for i_inner in range(bsize):
                    for j_inner in range(bsize):
                        in0[i_inner + i_outer*bsize, j_inner + j_outer*bsize]
                        yield

    ani = ctx.animate(f, interval=10)
    import matplotlib.pyplot as plt
    if 1:
        plt.show()
    else:
        ani.save("access.mp4")


if __name__ == "__main__":
    show_example_2()
......@@ -3,7 +3,7 @@
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = python `which sphinx-build`
PAPER =
BUILDDIR = _build
......
# -*- coding: utf-8 -*-
#
# loopy documentation build configuration file, created by
# sphinx-quickstart on Tue Aug 9 13:40:49 2011.
#
# This file is execfile()d with the current directory set to its containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.
import os
from urllib.request import urlopen

#import sys, os

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#sys.path.insert(0, os.path.abspath('.'))

# Pull in the shared Sphinx configuration used across inducer projects.
# NOTE: this executes code fetched over the network at docs-build time.
_conf_url = "https://raw.githubusercontent.com/inducer/sphinxconfig/main/sphinxconfig.py"
with urlopen(_conf_url) as _inf:
    exec(compile(_inf.read(), _conf_url, "exec"), globals())
# -- General configuration -----------------------------------------------------
copyright = "2016, Andreas Klöckner"
# If your documentation needs a minimal Sphinx version, state it here.
#needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be extensions
# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.intersphinx',
#'sphinx.ext.viewcode',
'sphinx.ext.doctest',
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# The suffix of source filenames.
source_suffix = '.rst'
# The encoding of source files.
#source_encoding = 'utf-8-sig'
# The master toctree document.
master_doc = 'index'
# General information about the project.
project = u'loopy'
# NOTE(review): ``copyright`` is assigned twice in this file (see above);
# the later assignment wins. This looks like revision-merge residue --
# consider keeping only one of the two.
copyright = u'2016, Andreas Klöckner'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
# Read the version number from loopy/version.py without importing loopy
# (which would pull in its runtime dependencies).
# NOTE(review): the original span contained two interleaved revisions of this
# code (duplicate ``with open`` and ``exec`` lines); the newer one is kept.
ver_dic = {}
_version_source = "../loopy/version.py"
with open(_version_source) as vpy_file:
    version_py = vpy_file.read()

# Presumably signals version.py to skip optional imports -- TODO confirm.
os.environ["AKPYTHON_EXEC_IMPORT_UNAVAILABLE"] = "1"
exec(compile(version_py, _version_source, "exec"), ver_dic)

# The short X.Y version.
version = ".".join(str(x) for x in ver_dic["VERSION"])
# The full version, including alpha/beta/rc tags.
release = ver_dic["VERSION_TEXT"]

del os.environ["AKPYTHON_EXEC_IMPORT_UNAVAILABLE"]
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#language = None
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
#today = ''
# Else, today_fmt is used as the format for a strftime call.
#today_fmt = '%B %d, %Y'
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = ['_build']
# The reST default role (used for this markup: `text`) to use for all documents.
#default_role = None
# If true, '()' will be appended to :func: etc. cross-reference text.
#add_function_parentheses = True
# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
#add_module_names = True
# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
#show_authors = False
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
# NOTE(review): ``exclude_patterns`` is assigned twice in this file (see a
# few lines above) with the same value -- likely revision-merge residue.
exclude_patterns = ["_build"]
# A list of ignored prefixes for module index sorting.
#modindex_common_prefix = []
# -- Options for HTML output ---------------------------------------------------

html_theme = "alabaster"

# FIX(review): the outer dict was missing its closing brace in the original,
# which made everything after it a syntax error.
html_theme_options = {
        "extra_nav_links": {
            "🚀 Github": "https://github.com/inducer/loopy",
            "💾 Download Releases": "https://pypi.python.org/pypi/loo.py",
            }
        }

# Example configuration for intersphinx: refer to the Python standard library.
intersphinx_mapping = {
        "python": ("https://docs.python.org/3", None),
        "numpy": ("https://numpy.org/doc/stable/", None),
        "pytools": ("https://documen.tician.de/pytools", None),
        "islpy": ("https://documen.tician.de/islpy", None),
        "pyopencl": ("https://documen.tician.de/pyopencl", None),
        "cgen": ("https://documen.tician.de/cgen", None),
        "pymbolic": ("https://documen.tician.de/pymbolic", None),
        "constantdict": ("https://matthiasdiener.github.io/constantdict/", None),
        }

# Custom sidebar templates, maps document names to template names.
html_sidebars = {
        "**": [
            "about.html",
            "navigation.html",
            "relations.html",
            "searchbox.html",
            ]
        }
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#html_theme_options = {}
# Add any paths that contain custom themes here, relative to this directory.
#html_theme_path = []
# The name for this set of Sphinx documents. If None, it defaults to
# "<project> v<release> documentation".
#html_title = None
# A shorter title for the navigation bar. Default is the same as html_title.
#html_short_title = None
# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
#html_logo = None
# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
#html_favicon = None
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
#html_last_updated_fmt = '%b %d, %Y'
# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
#html_use_smartypants = True
# Fail the docs build on any unresolved cross-reference.
nitpicky = True
# Custom sidebar templates, maps document names to template names.
#html_sidebars = {}
# Cross-reference targets Sphinx should not complain about under ``nitpicky``.
# FIX(review): in the original, unrelated statements (latex_documents,
# man_pages, a second intersphinx_mapping, ...) were interleaved *inside*
# this open list literal -- revision-merge residue that made the file a
# syntax error. The list is reconstructed here and the statements moved
# after it; the duplicate legacy-format intersphinx_mapping is dropped in
# favor of the one defined earlier in this file.
nitpick_ignore_regex = [
        ["py:class", r"typing_extensions\.(.+)"],
        ["py:class", r"numpy\.u?int[0-9]+"],
        ["py:class", r"numpy\.float[0-9]+"],
        ["py:class", r"numpy\.complex[0-9]+"],

        # Reference not found from "<unknown>"? I'm not even sure where to look.
        ["py:class", r"ExpressionNode"],

        # Type aliases
        ["py:class", r"InameStr"],
        ["py:class", r"ConcreteCallablesTable"],
        ["py:class", r"LoopNestTree"],
        ["py:class", r"LoopTree"],
        ["py:class", r"ToLoopyTypeConvertible"],
        ["py:class", r"ToStackMatchConvertible"],
        ]

# If true, links to the reST sources are added to the pages.
html_show_sourcelink = False

# Output file base name for HTML help builder.
htmlhelp_basename = "loopydoc"

# -- Options for LaTeX output --------------------------------------------------

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title, author, documentclass [howto/manual]).
latex_documents = [
        ("index", "loopy.tex", "loopy Documentation",
            "Andreas Kloeckner", "manual"),
        ]

# -- Options for manual page output --------------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
        ("index", "loopy", "loopy Documentation",
            ["Andreas Kloeckner"], 1)
        ]

# Only the class docstring (not __init__'s) is inserted by autoclass.
autoclass_content = "class"
......@@ -18,43 +18,20 @@ When you run this script, the following kernel is generated, compiled, and execu
(See the full example for how to print the generated code.)
Want to try out loopy?
----------------------
There's no need to go through :ref:`installation` if you'd just like to get a
feel for what loopy is. Instead, you may
`download a self-contained Linux binary <https://gitlab.tiker.net/inducer/loopy/builds/1989/artifacts/browse/build-helpers/>`_.
This is purposefully built on an ancient Linux distribution, so it should work
on most versions of Linux that are currently out there.
Once you have the binary, do the following::
chmod +x ./loopy-centos6
./loopy-centos6 --target=opencl hello-loopy-lp.py
./loopy-centos6 --target=cuda hello-loopy-lp.py
./loopy-centos6 --target=ispc hello-loopy-lp.py
Grab the example here: :download:`examples/python/hello-loopy.py <../examples/python/hello-loopy-lp.py>`.
You may also download the most recent version by going to the `list of builds
<https://gitlab.tiker.net/inducer/loopy/builds>`_, clicking on the newest one
of type "CentOS binary", clicking on "Browse" under "Build Artifacts", then
navigating to "build-helpers", and downloading the binary from there.
.. _static-binary:
Places on the web related to Loopy
----------------------------------
* `Python package index <http://pypi.python.org/pypi/loo.py>`_ (download releases) Note the extra '.' in the PyPI identifier!
* `Github <http://github.com/inducer/loopy>`_ (get latest source code, file bugs)
* `Wiki <http://wiki.tiker.net/Loopy>`_ (read installation tips, get examples, read FAQ)
* `Homepage <http://mathema.tician.de/software/loopy>`_
* `Python package index <https://pypi.org/project/loopy>`_ (download releases)
* `Github <https://github.com/inducer/loopy>`_ (get latest source code, file bugs)
* `Homepage <https://mathema.tician.de/software/loopy>`_
Table of Contents
-----------------
If you're only just learning about loopy, consider the following `paper
<http://arxiv.org/abs/1405.7470>`_ on loo.py that may serve as a good
<https://arxiv.org/abs/1405.7470>`_ on loopy that may serve as a good
introduction.
Please check :ref:`installation` to get started.
......@@ -65,9 +42,14 @@ Please check :ref:`installation` to get started.
tutorial
ref_creation
ref_kernel
ref_translation_unit
ref_transform
ref_call
ref_other
misc
ref_internals
🚀 Github <https://github.com/inducer/loopy>
💾 Download Releases <https://pypi.org/project/loopy>
Indices and tables
==================
......
......@@ -3,11 +3,21 @@
Installation
============
This command should install :mod:`loopy`::
Option 0: Static Binary
-----------------------
If you would just like to experiment with :mod:`loopy`'s code transformation
abilities, the easiest way to get loopy is to download a statically-linked
Linux binary.
See :ref:`static-binary` for details.
pip install loo.py
Option 1: From Source, no PyOpenCL integration
-----------------------------------------------
This command should install :mod:`loopy`::

    pip install loopy
You may need to run this with :command:`sudo`.
If you don't already have `pip <https://pypi.python.org/pypi/pip>`_,
......@@ -17,19 +27,66 @@ run this beforehand::
python get-pip.py
For a more manual installation, `download the source
<http://pypi.python.org/pypi/islpy>`_, unpack it, and say::
<https://pypi.org/project/loopy>`_, unpack it, and say::
python setup.py install
You may also clone its git repository::
git clone --recursive git://github.com/inducer/loopy
git clone --recursive http://git.tiker.net/trees/loopy.git
git clone --recursive https://github.com/inducer/loopy.git
Option 2: From Conda Forge, with PyOpenCL integration
-----------------------------------------------------
This set of instructions is intended for 64-bit Linux and
macOS computers:
#. Make sure your system has the basics to build software.
On Debian derivatives (Ubuntu and many more),
installing ``build-essential`` should do the trick.
Everywhere else, just making sure you have the ``g++`` package should be
enough.
#. Install `miniforge <https://github.com/conda-forge/miniforge>`_.
#. ``export CONDA=/WHERE/YOU/INSTALLED/miniforge3``
If you accepted the default location, this should work:
``export CONDA=$HOME/miniforge3``
#. ``$CONDA/bin/conda create -n dev``
#. ``source $CONDA/bin/activate dev``
#. ``conda install git pip pocl islpy pyopencl`` (Linux)
or
``conda install osx-pocl-opencl git pip pocl islpy pyopencl`` (OS X)
#. Type the following command::
pip install git+https://github.com/inducer/loopy
Next time you want to use :mod:`loopy`, just run the following command::
source /WHERE/YOU/INSTALLED/miniforge3/bin/activate dev
You may also like to add this to a startup file (like :file:`$HOME/.bashrc`) or create an alias for it.
See the `PyOpenCL installation instructions
<https://documen.tician.de/pyopencl/misc.html#installation>`_ for options
regarding OpenCL drivers.
User-visible Changes
====================
Version 2016.2
See also :ref:`language-versioning`.
Version 2018.1
--------------
.. note::
......@@ -57,7 +114,7 @@ Licensing
Loopy is licensed to you under the MIT/X Consortium license:
Copyright (c) 2009-13 Andreas Klöckner and Contributors.
Copyright (c) 2009-17 Andreas Klöckner and Contributors.
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
......@@ -83,8 +140,297 @@ OTHER DEALINGS IN THE SOFTWARE.
Frequently Asked Questions
==========================
The FAQ is maintained collaboratively on the
`Wiki FAQ page <http://wiki.tiker.net/Loopy/FrequentlyAskedQuestions>`_.
Is Loopy specific to OpenCL?
----------------------------
No, absolutely not. You can switch to a different code generation target
(subclasses of :class:`loopy.TargetBase`) by using (say)::
knl = knl.copy(target=loopy.CudaTarget())
Also see :ref:`targets`. (Py)OpenCL right now has the best support for
running kernels directly out of the box, but that could easily be expanded.
Open an issue to discuss what you need.
In the meantime, you can generate code simply by saying::
cg_result = loopy.generate_code_v2(knl)
print(cg_result.host_code())
print(cg_result.device_code())
Additionally, for C-based languages, header definitions are available via::
loopy.generate_header(knl)
For what types of codes does :mod:`loopy` work well?
----------------------------------------------------
Any array-based/number-crunching code whose control flow is not *too*
data dependent should be expressible. For example:
* Sparse matrix-vector multiplies, despite data-dependent control
flow (varying row lengths, say), is easy and natural to express.
* Looping until convergence on the other hand is an example
of something that can't be expressed easily. Such checks
would have to be performed outside of :mod:`loopy` code.
Can I see some examples?
------------------------
Loopy has a ton of tests, and right now, those are probably the best
source of examples. Here are some links:
* `Tests directory <https://github.com/inducer/loopy/tree/master/test>`_
* `Applications tests <https://github.com/inducer/loopy/blob/master/test/test_apps.py>`_
* `Feature tests <https://github.com/inducer/loopy/blob/master/test/test_loopy.py>`_
Here's a more complicated example of a loopy code:
.. literalinclude:: ../examples/python/find-centers.py
:language: python
This example is included in the :mod:`loopy` distribution as
:download:`examples/python/find-centers.py <../examples/python/find-centers.py>`.
What this does is find nearby "centers" satisfying some criteria
for an array of points ("targets").
Specifying dependencies for groups of instructions is cumbersome. Help?
-----------------------------------------------------------------------
You can now specify instruction ID prefixes and dependencies for groups
of instructions, like this::
with {id_prefix=init_m}
<> m[0] = ...
m[1] = ...
m[2] = ...
end
with {id_prefix=update_m,dep=init_m*}
m[0] = m[0] + ...
m[1] = m[1] + ...
m[2] = m[2] * ...
end
with {dep=update_m*}
output[i, j, 0] = 0.25*m[0]
output[i, j, 1] = 0.25*m[1]
output[i, j, 2] = 0.25*m[2]
end
.. versionadded:: 2016.2.1
(There was a bug in prior versions that kept this from working.)
What types of transformations can I do?
---------------------------------------
This list is always growing, but here are a few pointers:
* Unroll
Use :func:`loopy.tag_inames` with the ``"unr"`` tag.
Unrolled loops must have a fixed size. (See either
:func:`loopy.split_iname` or :func:`loopy.fix_parameters`.)
* Stride changes (Row/column/something major)
Use :func:`loopy.tag_array_axes` with (e.g.) ``stride:17`` or
``N1,N2,N0`` to determine how each axis of an array is realized.
* Prefetch
Use :func:`loopy.add_prefetch`.
* Reorder loops
Use :func:`loopy.prioritize_loops`.
* Precompute subexpressions:
Use a :ref:`substitution-rule` to assign a name to a subexpression,
using perhaps :func:`loopy.assignment_to_subst` or :func:`loopy.extract_subst`.
Then use :func:`loopy.precompute` to create an (array or scalar)
temporary with precomputed values.
* Tile:
Use :func:`loopy.split_iname` to produce enough loops, then use
:func:`loopy.prioritize_loops` to set the ordering.
* Fix constants
Use :func:`loopy.fix_parameters`.
* Parallelize (across cores)
Use :func:`loopy.tag_inames` with the ``"g.0"``, ``"g.1"`` (and so on) tags.
* Parallelize (across vector lanes)
Use :func:`loopy.tag_inames` with the ``"l.0"``, ``"l.1"`` (and so on) tags.
* Affinely map loop domains
Use :func:`loopy.affine_map_inames`.
* Texture-based data access
Use :func:`loopy.change_arg_to_image` to use texture memory
for an argument.
* Kernel Fusion
Use :func:`loopy.fuse_kernels`.
* Explicit-SIMD Vectorization
Use :func:`loopy.tag_inames` with the ``"vec"`` iname tag.
Note that the corresponding axis of an array must
also be tagged using the ``"vec"`` array axis tag
(using :func:`loopy.tag_array_axes`) in order for vector code to be
generated.
Vectorized loops (and array axes) must have a fixed size. (See either
:func:`loopy.split_iname` or :func:`loopy.fix_parameters` along with
:func:`loopy.split_array_axis`.)
* Reuse of Temporary Storage
Use :func:`loopy.alias_temporaries` to reduce the size of intermediate
storage.
* SoA $\leftrightarrow$ AoS
Use :func:`loopy.tag_array_axes` with the ``"sep"`` array axis tag
to generate separate arrays for each entry of a short, fixed-length
array axis.
Separated array axes must have a fixed size. (See
:func:`loopy.split_array_axis`.)
* Realization of Instruction-level parallelism
Use :func:`loopy.tag_inames` with the ``"ilp"`` tag.
ILP loops must have a fixed size. (See either
:func:`loopy.split_iname` or :func:`loopy.fix_parameters`.)
* Type inference
Use :func:`loopy.add_and_infer_dtypes`.
* Convey assumptions:
Use :func:`loopy.assume` to say, e.g.
``loopy.assume(knl, "N mod 4 = 0")`` or
``loopy.assume(knl, "N > 0")``.
* Perform batch computations
Use :func:`loopy.to_batched`.
* Interface with your own library functions
See :ref:`func-interface` for details.
* Loop collapse
Use :func:`loopy.join_inames`.
In what sense does Loopy support vectorization?
-----------------------------------------------
There are really two ways in which the OpenCL/CUDA model of computation exposes
vectorization:
* "SIMT": The user writes scalar program instances and either the compiler or
the hardware joins the individual program instances into vectors of a
hardware-given length for execution.
* "Short vectors": This type of vectorization is based on vector types,
e.g. ``float4``, which support arithmetic with implicit vector semantics
as well as a number of 'intrinsic' functions.
Loopy supports both. The first one, SIMT, is accessible by tagging inames with,
e.g., ``l.0``. Accessing the second one requires using both execution- and
data-reshaping capabilities in loopy. To start with, you need an array that
has an axis with the length of the desired vector. If that's not yet available,
you may use :func:`loopy.split_array_axis` to produce one. Similarly, you need
an iname whose bounds match those of the desired vector length. Again, if you
don't already have one, :func:`loopy.split_iname` will easily produce one.
Lastly, both the array axis and the iname need the implementation tag ``"vec"``.
Here is an example of this machinery in action:
.. literalinclude:: ../examples/python/vector-types.py
:language: python
Note how the example slices off the last 'slab' of iterations to ensure that
the bulk of the iteration does not require conditionals which would prevent
successful vectorization. This generates the following code:
.. literalinclude:: ../examples/python/vector-types.cl
:language: c
What is the story with language versioning?
-------------------------------------------
The idea is to keep supporting multiple versions at a time. There's a
tension in loopy between the need to build code that keeps working
unchanged for some number of years, and needing the language to
evolve--not just as a research vehicle, but also to respond
to emerging needs in applications and hardware.
The idea is not to support all versions indefinitely, merely to allow
users to upgrade on their own schedule on the scale of a couple years.
Warnings about needing to upgrade would get noisier as a version nears
deprecation. In a way, it is intended to be a version of Python's
`__future__` flags, which IMO have served the language tremendously
well.
One can also obtain the current language version programmatically:
:data:`loopy.MOST_RECENT_LANGUAGE_VERSION`.
But pinning your code to that would mean choosing to not use the
potentially valuable guarantee to keep existing code working unchanged
for a while. Instead, it might be wiser to just grab the version of the
language current at the time of writing the code.
Uh-oh. I got a scheduling error. Any hints?
-------------------------------------------
* Make sure that dependencies between instructions are as
you intend.
Use :func:`loopy.show_dependency_graph` to check.
There's a heuristic that tries to help find dependencies. If there's
only a single write to a variable, then it adds dependencies from all
readers to the writer. In your case, that's actually counterproductive,
because it creates a circular dependency, hence the scheduling issue.
So you'll have to turn that off, like so::
knl = lp.make_kernel(
"{ [t]: 0 <= t < T}",
"""
<> xt = x[t] {id=fetch,dep=*}
x[t + 1] = xt * 0.1 {dep=fetch}
""")
* Make sure that your loops are correctly nested.
Print the kernel to make sure all instructions are within
the set of inames you intend them to be in.
* One iname is one for loop.
For sequential loops, one iname corresponds to exactly one
``for`` loop in generated code. Loopy will not generate multiple
loops from one iname.
* Make sure that your loops are correctly nested.
The scheduler will try to be as helpful as it can in telling
you where it got stuck.
Citing Loopy
============
......@@ -93,7 +439,7 @@ If you use loopy for your work and find its approach helpful, please
consider citing the following article.
A. Klöckner. `Loo.py: transformation-based code generation for GPUs and
CPUs <http://arxiv.org/abs/1405.7470>`_. Proceedings of ARRAY '14: ACM
CPUs <https://arxiv.org/abs/1405.7470>`_. Proceedings of ARRAY '14: ACM
SIGPLAN Workshop on Libraries, Languages, and Compilers for Array
Programming. Edinburgh, Scotland.
......@@ -110,5 +456,33 @@ Here's a Bibtex entry for your convenience::
doi = "{10.1145/2627373.2627387}",
}
Getting help
============
Email the friendly folks on the `loopy mailing list <https://lists.tiker.net/listinfo/loopy>`_.
Acknowledgments
===============
Work on loopy was supported in part by
- the Department of Energy, National Nuclear Security Administration, under Award Number DE-NA0003963,
- the US Navy ONR, under grant number N00014-14-1-0117, and
- the US National Science Foundation under grant numbers DMS-1418961, CCF-1524433, DMS-1654756, SHF-1911019, and OAC-1931577.
AK also gratefully acknowledges a hardware gift from Nvidia Corporation.
The views and opinions expressed herein do not necessarily reflect those of the funding agencies.
Cross-References to Other Documentation
=======================================
.. currentmodule:: numpy
.. class:: int16
See :class:`numpy.generic`.
.. class:: complex128
See :class:`numpy.generic`.
.. currentmodule:: loopy
.. _func-interface:
Function Interface
==================
Resolving and specialization
----------------------------
In :mod:`loopy`, a :class:`loopy.TranslationUnit` is a collection of callables
and entrypoints. Callables are of type
:class:`loopy.kernel.function_interface.InKernelCallable`. Functions start life
as simple :class:`pymbolic.primitives.Call` nodes. Call resolution turns the function
identifiers in those calls into :class:`~loopy.symbolic.ResolvedFunction` objects.
Each resolved function has an entry in :attr:`TranslationUnit.callables_table`.
The process of realizing a function as a
:class:`~loopy.kernel.function_interface.InKernelCallable` is referred to as
resolving.
During code generation for a :class:`~loopy.TranslationUnit`, a (resolved) callable
is *specialized* depending on the types and shapes of the arguments passed at a
call site. For example, a call to ``sin(x)`` in :mod:`loopy` is type-generic to
begin with, but it later specialized to either ``sinf``, ``sin`` or ``sinl``
depending on the type of its argument ``x``. A callable's behavior during type
or shape specialization is encoded via
:meth:`~loopy.InKernelCallable.with_types` and
:meth:`~loopy.InKernelCallable.with_descrs`.
Registering callables
---------------------
A user can *register* callables within a :class:`~loopy.TranslationUnit` to
allow loopy to resolve calls not pre-defined in :mod:`loopy`. In :mod:`loopy`,
we typically aim to expose all the standard math functions defined for
a :class:`~loopy.target.TargetBase`. Other foreign functions could be invoked by
*registering* them.
An example demonstrating registering a ``CBlasGemv`` as a loopy callable:
.. literalinclude:: ../examples/python/call-external.py
Call Instruction for a kernel call
----------------------------------
At a call-site involving a call to a :class:`loopy.LoopKernel`, the arguments to
the call must be ordered by the order of input arguments of the callee kernel.
Similarly, the assignees must be ordered by the order of callee kernel's output
arguments. Since a :class:`~loopy.kernel.data.KernelArgument` can be both an
input and an output, such arguments would be a part of the call instruction's
assignees as well as the call expression node's parameters.
Entry points
------------
Only callables in :attr:`loopy.TranslationUnit.entrypoints` can be called from
the outside. All other callables are only visible from within the translation
unit, similar to C's ``static`` functions.
Reference
---------
.. automodule:: loopy.kernel.function_interface
.. module:: loopy
.. moduleauthor:: Andreas Kloeckner <inform@tiker.net>
.. currentmodule:: loopy
.. _creating-kernels:
Reference: Creating Kernels
......@@ -30,4 +28,11 @@ To Copy between Data Formats
.. autofunction:: make_copy_kernel
Einstein summation convention kernels
-------------------------------------
.. autofunction:: make_einsum
.. automodule:: loopy.version
.. vim: tw=75:spell:fdm=marker
Reference: Documentation for Internal API
=========================================
Targets
-------
See also :ref:`targets`.
.. automodule:: loopy.target.c
Symbolic
--------
See also :ref:`expression-syntax`.
.. automodule:: loopy.symbolic
Types
-----
DTypes of variables in a :class:`loopy.LoopKernel` must be picklable, so in
the codegen pipeline user-provided types are converted to
:class:`loopy.types.LoopyType`.
.. automodule:: loopy.types
Type inference
^^^^^^^^^^^^^^
.. automodule:: loopy.type_inference
Codegen
-------
.. automodule:: loopy.codegen
Reduction Operation
-------------------
.. automodule:: loopy.library.reduction
Iname Tags
----------
.. automodule:: loopy.kernel.data
Array
-----
.. automodule:: loopy.kernel.array
Checks
------
.. automodule:: loopy.check
Schedule
--------
.. automodule:: loopy.schedule
.. automodule:: loopy.schedule.tools
.. automodule:: loopy.schedule.tree
......@@ -3,10 +3,76 @@
Reference: Loopy's Model of a Kernel
====================================
What Types of Computation can a Loopy Program Express?
------------------------------------------------------
Loopy programs consist of an a-priori unordered set of statements, operating
on :math:`n`-dimensional array variables.
Arrays consist of "plain old data" and structures thereof, as describable
by a :class:`numpy.dtype`. The n-dimensional shape of these arrays is
given by a tuple of expressions at most affine in parameters that are
fixed for the duration of program execution.
Each array variable in the program is either an argument or a temporary
variable. A temporary variable is only live within the program, while
argument variables are accessible outside the program and constitute the
program's inputs and outputs.
A statement (still called 'instruction' in some places, cf.
:class:`loopy.InstructionBase`) encodes an assignment to an entry of an array.
The right-hand side of an assignment consists of an expression that may
consist of arithmetic operations and calls to functions.
If the outermost operation of the RHS expression is a function call,
the RHS value may be a tuple, and multiple (still scalar) arrays appear
as LHS values. (This is the only sense in which tuple types are supported.)
Each statement is parameterized by zero or more loop variables ("inames").
A statement is executed once for each integer point defined by the domain
forest for the iname tuple given for that statement
(:attr:`loopy.InstructionBase.within_inames`). Each execution of a
statement (with specific values of the inames) is called a *statement
instance*. Dependencies between these instances as well as instances of
other statements are encoded in the program representation and specify permissible
execution orderings. (The semantics of the dependencies are `being
sharpened <https://github.com/inducer/loopy/pull/168>`__.) Assignments
(comprising the evaluation of the RHS and the assignment to the LHS) may
be specified to be atomic.
The basic building blocks of the domain forest are sets given as
conjunctions of equalities and inequalities of quasi-affine expressions on
integer tuples, called domains, and represented as instances of
:class:`islpy.BasicSet`. The entries of each integer tuple are
either *parameters* or *inames*. Each domain may optionally have a *parent
domain*. Parameters of parent-less domains are given by value arguments
supplied to the program that will remain unchanged during program
execution. Parameters of domains with parents may be
- run-time-constant value arguments to the program, or
- inames from parent domains, or
- scalar, integer temporary variables that are written by statements
with iteration domains controlled by a parent domain.
For each tuple of concrete parameter values, the set of iname tuples must be
finite. Each iname is defined by exactly one domain.
For a tuple of inames, the domain forest defines an iteration domain
by finding all the domains defining the inames involved, along with their
parent domains. The resulting tree of domains may contain multiple roots,
but no branches. The iteration domain is then constructed by intersecting
these domains and constructing the projection of that set onto the space
given by the required iname tuple. Observe that, via the parent-child
domain mechanism, imperfectly-nested and data-dependent loops become
expressible.
The set of functions callable from the language is predefined by the system.
Additional functions may be defined by the user by registering them. It is
not currently possible to define functions from within Loopy; however, work
is progressing on permitting this. Even once this is allowed, recursion
will not be permitted.
.. _domain-tree:
Loop Domain Tree
----------------
Loop Domain Forest
------------------
.. {{{
......@@ -29,10 +95,29 @@ Note that *n* in the example is not an iname. It is a
:ref:`domain-parameters` that is passed to the kernel by the user.
To accommodate some data-dependent control flow, there is not actually
a single loop domain, but rather a *tree of loop domains*,
allowing more deeply nested domains to depend on inames
a single loop domain, but rather a *forest of loop domains* (a collection
of trees) allowing more deeply nested domains to depend on inames
introduced by domains closer to the root.
Here is an example::
{ [l] : 0 <= l <= 2 }
{ [i] : start <= i < end }
{ [j] : start <= j < end }
The i and j domains are "children" of the l domain (visible from indentation).
This is also how :mod:`loopy` prints the domain forest, to make the parent/child
relationship visible. In the example, the parameters start/end might be read
inside of the 'l' loop.
The idea is that domains form a forest (a collection of trees), and a
"sub-forest" is extracted that covers all the inames for each
instruction. Each individual sub-tree is then checked for branching,
which is ill-formed. It is declared ill-formed because intersecting, in
the above case, the l, i, and j domains could result in restrictions from the
i domain affecting the j domain by way of how i affects l--which would
be counterintuitive to say the least.
.. _inames:
Inames
......@@ -111,9 +196,12 @@ Iname Implementation Tags
Tag Meaning
=============================== ====================================================
``None`` | ``"for"`` Sequential loop
``"ord"`` Forced-order sequential loop
``"l.N"`` Local (intra-group) axis N ("local")
``"g.N"`` Group-number axis N ("group")
``"unr"`` Unroll
``"unr_hint"`` Unroll using compiler directives
``"unr_hint.N"`` Unroll at most N times using compiler directives
``"ilp"`` | ``"ilp.unr"`` Unroll using instruction-level parallelism
``"ilp.seq"`` Realize parallel iname as innermost loop
``"like.INAME"`` Can be used when tagging inames to tag like another
......@@ -131,6 +219,42 @@ Tag Meaning
.. }}}
Identifiers
-----------
Reserved Identifiers
^^^^^^^^^^^^^^^^^^^^
The identifier prefix ``_lp_`` is reserved for internal usage; when creating
*inames*, *argument names*, *temporary variable names*, *substitution rule
names*, *instruction IDs*, and other identifiers, users should *not* use names
beginning with ``_lp_``. This prefix is used for identifiers created
internally when operating on Loopy's kernel IR. For Loopy developers, further
information on name prefixes used within submodules is below.
Identifier Registry
^^^^^^^^^^^^^^^^^^^
Functionality in :mod:`loopy` *must* use identifiers beginning with ``_lp_`` for
all internally-created identifiers. Additionally, each name beginning with
``_lp_`` must start with one of the reserved prefixes below. New prefixes may
be registered by adding them to the table below. New prefixes may not themselves
be the prefix of an existing prefix.
**Reserved Identifier Prefixes**
======================= ==================================
Reserved Prefix Usage (module or purpose)
======================= ==================================
``_lp_linchk_`` ``loopy.linearization.checker``
======================= ==================================
.. note::
Existing Loopy code may not yet fully satisfy these naming requirements.
Name changes are in progress, and prefixes will be added to this registry
as they are created.
.. _instructions:
Instructions
......@@ -138,6 +262,7 @@ Instructions
.. {{{
.. autoclass:: HappensAfter
.. autoclass:: InstructionBase
.. _assignments:
......@@ -242,15 +367,40 @@ These are usually key-value pairs. The following attributes are recognized:
heuristic and indicate that the specified list of dependencies is
exhaustive.
* ``dep_query=...`` provides an alternative way of specifying instruction
dependencies. The given string is parsed as a match expression object by
:func:`loopy.match.parse_match`. Upon kernel generation, this match
expression is used to match instructions in the kernel and add them as
dependencies.
* ``nosync=id1:id2`` prescribes that no barrier synchronization is necessary
the instructions with identifiers ``id1`` and ``id2`` to the, even if
a dependency chain exists and variables are accessed in an apparently
racy way.
for the instructions with identifiers ``id1`` and ``id2``, even if a
dependency chain exists and variables are accessed in an apparently racy
way.
Identifiers here are allowed to be wildcards as defined by the Python
function :func:`fnmatch.fnmatchcase`. This is helpful in conjunction with
``id_prefix``.
Identifiers (including wildcards) accept an optional `@scope` suffix,
which prescribes that no synchronization at level `scope` is needed.
This does not preclude barriers at levels different from `scope`.
Allowable `scope` values are:
* `local`
* `global`
* `any`
As an example, ``nosync=id1@local:id2@global`` prescribes that no local
synchronization is needed with instruction ``id1`` and no global
synchronization is needed with instruction ``id2``.
``nosync=id1@any`` has the same effect as ``nosync=id1``.
* ``nosync_query=...`` provides an alternative way of specifying ``nosync``,
just like ``dep_query`` and ``dep``. As with ``nosync``, ``nosync_query``
accepts an optional `@scope` suffix.
* ``priority=integer`` sets the instruction's priority to the value
``integer``. Instructions with higher priority will be scheduled sooner,
if possible. Note that the scheduler may still schedule a lower-priority
......@@ -282,13 +432,30 @@ Expressions
Loopy's expressions are a slight superset of the expressions supported by
:mod:`pymbolic`.
* ``if``
* ``if(cond, then, else_)``
* ``a[[ 8*i + j ]]``: Linear subscripts.
See :class:`loopy.symbolic.LinearSubscript`.
* ``reductions``
* duplication of reduction inames
See :class:`loopy.symbolic.Reduction`.
* ``reduce`` vs ``simul_reduce``
* complex-valued arithmetic
* tagging of array access and substitution rule use ("$")
See :class:`loopy.symbolic.TaggedVariable`.
* ``indexof``, ``indexof_vec``
* ``cast(type, value)``: No parse syntax currently.
See :class:`loopy.symbolic.TypeCast`.
* If constants in expressions are subclasses of :class:`numpy.generic`,
generated code will contain literals of exactly that type, making them
*explicitly typed*. Constants given as Python types such as :class:`int`,
:class:`float` or :class:`complex` are called *implicitly* typed and
adapt to the type of the expected result.
TODO: Functions
TODO: Reductions
......@@ -296,6 +463,7 @@ TODO: Reductions
Function Call Instructions
^^^^^^^^^^^^^^^^^^^^^^^^^^
.. automodule:: loopy
.. autoclass:: CallInstruction
C Block Instructions
......@@ -306,16 +474,34 @@ C Block Instructions
Atomic Operations
^^^^^^^^^^^^^^^^^
.. autoclass:: memory_ordering
.. autoclass:: MemoryOrdering
.. autoclass:: memory_scope
.. autoclass:: MemoryScope
.. autoclass:: VarAtomicity
.. autoclass:: OrderedAtomic
.. autoclass:: AtomicInit
.. autoclass:: AtomicUpdate
No-Op Instruction
^^^^^^^^^^^^^^^^^
.. autoclass:: NoOpInstruction
Barrier Instructions
^^^^^^^^^^^^^^^^^^^^
.. autoclass:: BarrierInstruction
Instruction Tags
^^^^^^^^^^^^^^^^
.. autoclass:: LegacyStringInstructionTag
.. autoclass:: UseStreamingStoreTag
.. }}}
Data: Arguments and Temporaries
......@@ -332,24 +518,14 @@ Arguments
^^^^^^^^^
.. autoclass:: KernelArgument
:members:
:undoc-members:
.. autoclass:: ValueArg
:members:
:undoc-members:
.. autoclass:: GlobalArg
:members:
:undoc-members:
.. autoclass:: ArrayArg
.. autoclass:: ConstantArg
:members:
:undoc-members:
.. autoclass:: ImageArg
:members:
:undoc-members:
.. _temporaries:
......@@ -359,11 +535,9 @@ Temporary Variables
Temporary variables model OpenCL's ``private`` and ``local`` address spaces. Both
have the lifetime of a kernel invocation.
.. autoclass:: temp_var_scope
.. autoclass:: AddressSpace
.. autoclass:: TemporaryVariable
:members:
:undoc-members:
.. _types:
......@@ -468,6 +642,8 @@ Kernel Options
.. autoclass:: Options
.. _targets:
Targets
-------
......@@ -480,10 +656,10 @@ Helper values
.. {{{
.. autoclass:: auto
.. autoclass:: UniqueName
.. autoclass:: Optional
.. }}}
Libraries: Extending and Interfacing with External Functionality
......@@ -499,6 +675,8 @@ Symbols
Functions
^^^^^^^^^
.. autoclass:: PreambleInfo
.. autoclass:: CallMangleInfo
.. _reductions:
......@@ -513,10 +691,18 @@ The Kernel Object
Do not create :class:`LoopKernel` objects directly. Instead, refer to
:ref:`creating-kernels`.
.. autoclass:: LoopKernel
.. automodule:: loopy.kernel
Implementation Details: The Base Array
--------------------------------------
All array-like data in :mod:`loopy` (such as :class:`ArrayArg` and
:class:`TemporaryVariable`) derive from a single, shared base array type,
described next.
.. currentmodule:: loopy.kernel.array
.. autoclass:: ArrayBase
.. autoclass:: kernel_state
:members:
:undoc-members:
.. vim: tw=75:spell:fdm=marker
Reference: Other Functionality
==============================
Auxiliary Data Types
--------------------
.. automodule:: loopy.typing
Obtaining Kernel Performance Statistics
---------------------------------------
......@@ -9,6 +14,24 @@ Obtaining Kernel Performance Statistics
Controlling caching
-------------------
.. envvar:: LOOPY_NO_CACHE
.. envvar:: CG_NO_CACHE
By default, loopy will cache (on disk) the result of various stages
of code generation to speed up future code generation of the same kernel.
By setting the environment variables :envvar:`LOOPY_NO_CACHE` or
:envvar:`CG_NO_CACHE` to any
string that :func:`pytools.strtobool` evaluates as ``True``, this caching
is suppressed.
.. envvar:: LOOPY_ABORT_ON_CACHE_MISS
If set to a string that :func:`pytools.strtobool` evaluates as ``True``,
loopy will raise an exception if a cache miss occurs. This can be useful
for debugging cache-related issues. For example, it can be used to automatically test whether caching is successful for a particular code, by setting this variable to ``True`` and re-running the code.
.. autofunction:: set_caching_enabled
.. autoclass:: CacheMode
......@@ -16,10 +39,11 @@ Controlling caching
Running Kernels
---------------
In addition to simply calling kernels using :class:`LoopKernel.__call__`,
the following underlying functionality may be used:
Use :class:`TranslationUnit.executor` to bind a translation unit
to execution resources, and then use :class:`ExecutorBase.__call__`
to invoke the kernel.
.. autoclass:: CompiledKernel
.. autoclass:: ExecutorBase
Automatic Testing
-----------------
......@@ -44,3 +68,4 @@ following always works::
.. autofunction:: show_dependency_graph
.. autofunction:: t_unit_to_python
......@@ -44,16 +44,20 @@ Influencing data access
.. autofunction:: change_arg_to_image
.. autofunction:: tag_data_axes
.. autofunction:: tag_array_axes
.. autofunction:: remove_unused_arguments
.. autofunction:: set_array_dim_names
.. autofunction:: set_array_axis_names
.. automodule:: loopy.transform.privatize
.. autofunction:: allocate_temporaries_for_base_storage
Padding Data
------------
.. autofunction:: split_array_dim
.. autofunction:: split_array_axis
.. autofunction:: find_padding_multiple
......@@ -72,6 +76,10 @@ Manipulating Instructions
.. autofunction:: tag_instructions
.. autofunction:: add_nosync
.. autofunction:: add_barrier
Registering Library Routines
----------------------------
......@@ -81,8 +89,6 @@ Registering Library Routines
.. autofunction:: register_symbol_manglers
.. autofunction:: register_function_manglers
Modifying Arguments
-------------------
......@@ -96,7 +102,7 @@ Modifying Arguments
.. autofunction:: rename_argument
.. autofunction:: set_temporary_scope
.. autofunction:: set_temporary_address_space
Creating Batches of Operations
------------------------------
......@@ -112,13 +118,17 @@ Finishing up
.. autofunction:: generate_loop_schedules
.. autofunction:: get_one_scheduled_kernel
.. autofunction:: get_one_linearized_kernel
.. autofunction:: save_and_reload_temporaries
.. autoclass:: GeneratedProgram
.. autoclass:: CodeGenerationResult
.. autofunction:: generate_code_v2
.. autofunction:: generate_header
Setting options
---------------
......@@ -134,4 +144,3 @@ TODO: Matching instruction tags
.. automodule:: loopy.match
.. vim: tw=75:spell
.. currentmodule:: loopy
Translation Units
=================
.. automodule:: loopy.translation_unit
......@@ -25,9 +25,12 @@ import a few modules and set up a :class:`pyopencl.Context` and a
>>> import loopy as lp
>>> lp.set_caching_enabled(False)
>>> from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2
>>> from warnings import filterwarnings, catch_warnings
>>> filterwarnings('error', category=lp.LoopyWarning)
>>> from loopy.diagnostic import DirectCallUncachedWarning
>>> filterwarnings('ignore', category=DirectCallUncachedWarning)
>>> ctx = cl.create_some_context(interactive=False)
>>> queue = cl.CommandQueue(ctx)
......@@ -53,6 +56,15 @@ And some data on the host:
.. }}}
We'll also disable console syntax highlighting because it confuses
doctest:
.. doctest::
>>> # not a documented interface
>>> import loopy.options
>>> loopy.options.ALLOW_TERMINAL_COLORS = False
Getting started
---------------
......@@ -102,18 +114,20 @@ always see loopy's view of a kernel by printing it.
KERNEL: loopy_kernel
---------------------------------------------------------------------------
ARGUMENTS:
a: GlobalArg, type: <runtime>, shape: (n), dim_tags: (N0:stride:1)
n: ValueArg, type: <runtime>
out: GlobalArg, type: <runtime>, shape: (n), dim_tags: (N0:stride:1)
a: type: <auto/runtime>, shape: (n), dim_tags: (N0:stride:1) in aspace: global
n: ValueArg, type: <auto/runtime>
out: type: <auto/runtime>, shape: (n), dim_tags: (N0:stride:1) out aspace: global
---------------------------------------------------------------------------
DOMAINS:
[n] -> { [i] : 0 <= i < n }
---------------------------------------------------------------------------
INAME IMPLEMENTATION TAGS:
INAME TAGS:
i: None
---------------------------------------------------------------------------
INSTRUCTIONS:
[i] out[i] <- 2*a[i] # insn
for i
out[i] = 2*a[i] {id=insn}
end i
---------------------------------------------------------------------------
You'll likely have noticed that there's quite a bit more information here
......@@ -143,7 +157,7 @@ following:
See :ref:`specifying-arguments`.
* Loopy has not determined the type of ``a`` and ``out``. The data type is
given as ``<runtime>``, which means that these types will be determined
given as ``<auto/runtime>``, which means that these types will be determined
by the data passed in when the kernel is invoked. Loopy generates (and
caches!) a copy of the kernel for each combination of types passed in.
......@@ -167,16 +181,16 @@ for good measure.
>>> assert (out.get() == (2*x_vec_dev).get()).all()
We can have loopy print the OpenCL kernel it generated
by passing :attr:`loopy.Options.write_cl`.
by passing :attr:`loopy.Options.write_code`.
.. doctest::
>>> knl = lp.set_options(knl, "write_cl")
>>> knl = lp.set_options(knl, write_code=True)
>>> evt, (out,) = knl(queue, a=x_vec_dev)
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))
<BLANKLINE>
__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out)
__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out)
{
for (int i = 0; i <= -1 + n; ++i)
out[i] = 2.0f * a[i];
......@@ -196,6 +210,7 @@ For convenience, loopy kernels also directly accept :mod:`numpy` arrays:
.. doctest::
>>> knl = lp.set_options(knl, write_code=False)
>>> evt, (out,) = knl(queue, a=x_vec_host)
>>> assert (out == (2*x_vec_host)).all()
......@@ -215,47 +230,65 @@ inspect that code, too, using :attr:`loopy.Options.write_wrapper`:
.. doctest::
>>> knl = lp.set_options(knl, write_wrapper=True, write_cl=False)
>>> knl = lp.set_options(knl, write_wrapper=True, write_code=False)
>>> evt, (out,) = knl(queue, a=x_vec_host)
from __future__ import division
import numpy as _lpy_np
...
def invoke_loopy_kernel_loopy_kernel(_lpy_cl_kernels, queue, allocator=None, wait_for=None, out_host=None, a=None, n=None, out=None):
if allocator is None:
allocator = _lpy_cl_tools.DeferredAllocator(queue.context)
<BLANKLINE>
# {{{ find integer arguments from shapes
# {{{ find integer arguments from array data
<BLANKLINE>
if n is None:
if a is not None:
n = a.shape[0]
<BLANKLINE>
elif out is not None:
n = out.shape[0]
<BLANKLINE>
# }}}
...
You can also pass options to the OpenCL implementation
by passing :attr:`loopy.Options.build_options`.
.. doctest::
>>> knl = lp.set_options(knl, build_options=["-cl-mad-enable"])
Generating code
~~~~~~~~~~~~~~~
Instead of using loopy to run the code it generates, you can also just use
loopy as a code generator and take care of executing the generated kernels
yourself. In this case, make sure loopy knows about all types, and then
call :func:`loopy.generate_code`:
call :func:`loopy.generate_code_v2`:
.. doctest::
>>> typed_knl = lp.add_dtypes(knl, dict(a=np.float32))
>>> code, _ = lp.generate_code(typed_knl)
>>> code = lp.generate_code_v2(typed_knl).device_code()
>>> print(code)
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))
<BLANKLINE>
__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out)
__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out)
{
for (int i = 0; i <= -1 + n; ++i)
out[i] = 2.0f * a[i];
}
Additionally, for C-based languages, header definitions can be obtained via
:func:`loopy.generate_header`:
.. doctest::
>>> header = str(lp.generate_header(typed_knl)[0])
>>> print(header)
__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out);
.. }}}
.. _ordering:
......@@ -277,7 +310,8 @@ argument:
... """
... out[j,i] = a[i,j]
... out[i,j] = 2*out[i,j]
... """)
... """,
... [lp.GlobalArg("out", shape=lp.auto, is_input=False), ...])
loopy's programming model is completely *unordered* by default. This means
that:
......@@ -304,7 +338,9 @@ an explicit dependency:
... """
... out[j,i] = a[i,j] {id=transpose}
... out[i,j] = 2*out[i,j] {dep=transpose}
... """)
... """,
... [lp.GlobalArg("out", shape=lp.auto, is_input=False), ...],
... name="transpose_and_dbl")
``{id=transpose}`` assigns the identifier *transpose* to the first
instruction, and ``{dep=transpose}`` declares a dependency of the second
......@@ -313,9 +349,9 @@ that these dependencies show up there, too:
.. doctest::
>>> print(knl.stringify(with_dependencies=True))
>>> print(knl["transpose_and_dbl"].stringify(with_dependencies=True))
---------------------------------------------------------------------------
KERNEL: loopy_kernel
KERNEL: transpose_and_dbl
---------------------------------------------------------------------------
...
---------------------------------------------------------------------------
......@@ -334,7 +370,7 @@ loopy can also show an instruction dependency graph, using
Dependencies are shown as arrows from prerequisite to dependent in the
graph. This functionality requires the open-source `graphviz
<http://graphviz.org>`_ graph drawing tools to be installed. The generated
<https://graphviz.org>`_ graph drawing tools to be installed. The generated
graph will open in a browser window.
Since manually notating lots of dependencies is cumbersome, loopy has
......@@ -359,13 +395,13 @@ Let us take a look at the generated code for the above kernel:
.. doctest::
>>> knl = lp.set_options(knl, "write_cl")
>>> knl = lp.set_loop_priority(knl, "i,j")
>>> knl = lp.set_options(knl, write_code=True)
>>> knl = lp.prioritize_loops(knl, "i,j")
>>> evt, (out,) = knl(queue, a=a_mat_dev)
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))
<BLANKLINE>
__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out)
__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) transpose_and_dbl(__global float *__restrict__ out, __global float const *__restrict__ a, int const n)
{
for (int i = 0; i <= -1 + n; ++i)
for (int j = 0; j <= -1 + n; ++j)
......@@ -401,20 +437,22 @@ with identical bounds, for the use of the transpose:
... """
... out[j,i] = a[i,j] {id=transpose}
... out[ii,jj] = 2*out[ii,jj] {dep=transpose}
... """)
>>> knl = lp.set_loop_priority(knl, "i,j,ii,jj")
... """,
... [lp.GlobalArg("out", shape=lp.auto, is_input=False), ...])
>>> knl = lp.prioritize_loops(knl, "i,j")
>>> knl = lp.prioritize_loops(knl, "ii,jj")
:func:`loopy.duplicate_inames` can be used to achieve the same goal.
Now the intended code is generated and our test passes.
.. doctest::
>>> knl = lp.set_options(knl, "write_cl")
>>> knl = lp.set_options(knl, write_code=True)
>>> evt, (out,) = knl(queue, a=a_mat_dev)
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))
<BLANKLINE>
__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out)
__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float *__restrict__ out, __global float const *__restrict__ a, int const n)
{
for (int i = 0; i <= -1 + n; ++i)
for (int j = 0; j <= -1 + n; ++j)
......@@ -438,8 +476,8 @@ control is the nesting of loops. For example, should the *i* loop be nested
around the *j* loop, or the other way around, in the following simple
zero-fill kernel?
It turns out that Loopy will typically choose a loop nesting for us, but it
does not like doing so. Loo.py will react to the following code
It turns out that Loopy will choose a loop nesting for us, but it might be
ambiguous. Consider the following code:
.. doctest::
......@@ -449,19 +487,14 @@ does not like doing so. Loo.py will react to the following code
... a[i,j] = 0
... """)
By saying::
LoopyWarning: kernel scheduling was ambiguous--more than one schedule found, ignoring
And by picking one of the possible loop orderings at random.
The warning (and the nondeterminism it warns about) is easily resolved:
Both nestings of the inames `i` and `j` result in a correct kernel.
This ambiguity can be resolved:
.. doctest::
>>> knl = lp.set_loop_priority(knl, "j,i")
>>> knl = lp.prioritize_loops(knl, "j,i")
:func:`loopy.set_loop_priority` indicates the textual order in which loops
:func:`loopy.prioritize_loops` indicates the textual order in which loops
should be entered in the kernel code. Note that this priority has an
advisory role only. If the kernel logically requires a different nesting,
loop priority is ignored. Priority is only considered if loop nesting is
......@@ -469,13 +502,13 @@ ambiguous.
.. doctest::
>>> knl = lp.set_options(knl, "write_cl")
>>> knl = lp.set_options(knl, write_code=True)
>>> evt, (out,) = knl(queue, a=a_mat_dev)
#define lid(N) ((int) get_local_id(N))
...
for (int j = 0; j <= -1 + n; ++j)
for (int i = 0; i <= -1 + n; ++i)
a[n * i + j] = 0.0f;
a[n * i + j] = (float) (0.0f);
...
No more warnings! Loop nesting is also reflected in the dependency graph:
......@@ -507,7 +540,7 @@ is overwritten with the new kernel::
knl = lp.do_something(knl, arguments...)
We've already seen an example of a transformation above:
For instance, :func:`set_loop_priority` fit the pattern.
For instance, :func:`loopy.prioritize_loops` fit the pattern.
:func:`loopy.split_iname` is another fundamental (and useful) transformation. It
turns one existing iname (recall that this is loopy's word for a 'loop
......@@ -526,15 +559,14 @@ Consider this example:
... "{ [i]: 0<=i<n }",
... "a[i] = 0", assumptions="n>=1")
>>> knl = lp.split_iname(knl, "i", 16)
>>> knl = lp.set_loop_priority(knl, "i_outer,i_inner")
>>> knl = lp.set_options(knl, "write_cl")
>>> knl = lp.prioritize_loops(knl, "i_outer,i_inner")
>>> knl = lp.set_options(knl, write_code=True)
>>> evt, (out,) = knl(queue, a=x_vec_dev)
#define lid(N) ((int) get_local_id(N))
...
for (int i_outer = 0; i_outer <= -1 + ((15 + n) / 16); ++i_outer)
for (int i_inner = 0; i_inner <= 15; ++i_inner)
if (-1 + -1 * i_inner + -16 * i_outer + n >= 0)
a[i_inner + i_outer * 16] = 0.0f;
for (int i_outer = 0; i_outer <= -1 + (15 + n) / 16; ++i_outer)
for (int i_inner = 0; i_inner <= ((-17 + n + -16 * i_outer >= 0) ? 15 : -1 + n + -16 * i_outer); ++i_inner)
a[16 * i_outer + i_inner] = (float) (0.0f);
...
By default, the new, split inames are named *OLD_outer* and *OLD_inner*,
......@@ -554,21 +586,25 @@ relation to loop nesting. For example, it's perfectly possible to request
.. doctest::
>>> knl = lp.set_loop_priority(knl, "i_inner,i_outer")
>>> knl = lp.make_kernel(
... "{ [i]: 0<=i<n }",
... "a[i] = 0", assumptions="n>=1")
>>> knl = lp.split_iname(knl, "i", 16)
>>> knl = lp.prioritize_loops(knl, "i_inner,i_outer")
>>> knl = lp.set_options(knl, write_code=True)
>>> evt, (out,) = knl(queue, a=x_vec_dev)
#define lid(N) ((int) get_local_id(N))
...
for (int i_inner = 0; i_inner <= 15; ++i_inner)
if (-1 + -1 * i_inner + n >= 0)
for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + ((15 + n + 15 * i_inner) / 16); ++i_outer)
a[i_inner + i_outer * 16] = 0.0f;
for (int i_inner = 0; i_inner <= ((-17 + n >= 0) ? 15 : -1 + n); ++i_inner)
for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + (15 + n + 15 * i_inner) / 16; ++i_outer)
a[16 * i_outer + i_inner] = (float) (0.0f);
...
Notice how loopy has automatically generated guard conditionals to make
sure the bounds on the old iname are obeyed.
The combination of :func:`loopy.split_iname` and
:func:`loopy.set_loop_priority` is already good enough to implement what is
:func:`loopy.prioritize_loops` is already good enough to implement what is
commonly called 'loop tiling':
.. doctest::
......@@ -579,16 +615,16 @@ commonly called 'loop tiling':
... assumptions="n mod 16 = 0 and n >= 1")
>>> knl = lp.split_iname(knl, "i", 16)
>>> knl = lp.split_iname(knl, "j", 16)
>>> knl = lp.set_loop_priority(knl, "i_outer,j_outer,i_inner")
>>> knl = lp.set_options(knl, "write_cl")
>>> knl = lp.prioritize_loops(knl, "i_outer,j_outer,i_inner,j_inner")
>>> knl = lp.set_options(knl, write_code=True)
>>> evt, (out,) = knl(queue, a=a_mat_dev)
#define lid(N) ((int) get_local_id(N))
...
for (int i_outer = 0; i_outer <= ((-16 + n) / 16); ++i_outer)
for (int j_outer = 0; j_outer <= ((-16 + n) / 16); ++j_outer)
for (int i_outer = 0; i_outer <= (-16 + n) / 16; ++i_outer)
for (int j_outer = 0; j_outer <= (-16 + n) / 16; ++j_outer)
for (int i_inner = 0; i_inner <= 15; ++i_inner)
for (int j_inner = 0; j_inner <= 15; ++j_inner)
out[n * (i_inner + i_outer * 16) + j_inner + j_outer * 16] = a[n * (j_inner + j_outer * 16) + i_inner + i_outer * 16];
out[n * (16 * i_outer + i_inner) + 16 * j_outer + j_inner] = a[n * (16 * j_outer + j_inner) + 16 * i_outer + i_inner];
...
.. }}}
......@@ -621,19 +657,18 @@ loop's tag to ``"unr"``:
>>> orig_knl = knl
>>> knl = lp.split_iname(knl, "i", 4)
>>> knl = lp.tag_inames(knl, dict(i_inner="unr"))
>>> knl = lp.set_loop_priority(knl, "i_outer,i_inner")
>>> knl = lp.set_options(knl, "write_cl")
>>> knl = lp.prioritize_loops(knl, "i_outer,i_inner")
>>> knl = lp.set_options(knl, write_code=True)
>>> evt, (out,) = knl(queue, a=x_vec_dev)
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))
#define int_floor_div_pos_b(a,b) ( ( (a) - ( ((a)<0) ? ((b)-1) : 0 ) ) / (b) )
...
for (int i_outer = 0; i_outer <= int_floor_div_pos_b(-4 + n, 4); ++i_outer)
for (int i_outer = 0; i_outer <= loopy_floor_div_pos_b_int32(-4 + n, 4); ++i_outer)
{
a[0 + i_outer * 4] = 0.0f;
a[1 + i_outer * 4] = 0.0f;
a[2 + i_outer * 4] = 0.0f;
a[3 + i_outer * 4] = 0.0f;
a[4 * i_outer] = (float) (0.0f);
a[1 + 4 * i_outer] = (float) (0.0f);
a[2 + 4 * i_outer] = (float) (0.0f);
a[3 + 4 * i_outer] = (float) (0.0f);
}
...
......@@ -672,7 +707,7 @@ Iname implementation tags are also printed along with the entire kernel:
>>> print(knl)
---------------------------------------------------------------------------
...
INAME IMPLEMENTATION TAGS:
INAME TAGS:
i_inner: unr
i_outer: None
---------------------------------------------------------------------------
......@@ -698,14 +733,14 @@ Let's try this out on our vector fill kernel by creating workgroups of size
... "a[i] = 0", assumptions="n>=0")
>>> knl = lp.split_iname(knl, "i", 128,
... outer_tag="g.0", inner_tag="l.0")
>>> knl = lp.set_options(knl, "write_cl")
>>> knl = lp.set_options(knl, write_code=True)
>>> evt, (out,) = knl(queue, a=x_vec_dev)
#define lid(N) ((int) get_local_id(N))
...
__kernel void __attribute__ ((reqd_work_group_size(128, 1, 1))) loopy_kernel(__global float *restrict a, int const n)
__kernel void __attribute__ ((reqd_work_group_size(128, 1, 1))) loopy_kernel(__global float *__restrict__ a, int const n)
{
if (-1 + -128 * gid(0) + -1 * lid(0) + n >= 0)
a[lid(0) + gid(0) * 128] = 0.0f;
a[128 * gid(0) + lid(0)] = (float) (0.0f);
}
Loopy requires that workgroup sizes are fixed and constant at compile time.
......@@ -717,11 +752,11 @@ those for us:
.. doctest::
>>> glob, loc = knl.get_grid_size_upper_bounds()
>>> glob, loc = knl["loopy_kernel"].get_grid_size_upper_bounds(knl.callables_table)
>>> print(glob)
(Aff("[n] -> { [(floor((127 + n)/128))] }"),)
(PwAff("[n] -> { [(floor((127 + n)/128))] }"),)
>>> print(loc)
(Aff("[n] -> { [(128)] }"),)
(PwAff("[n] -> { [(128)] }"),)
Note that this functionality returns internal objects and is not really
intended for end users.
......@@ -743,27 +778,27 @@ assumption:
>>> orig_knl = knl
>>> knl = lp.split_iname(knl, "i", 4)
>>> knl = lp.tag_inames(knl, dict(i_inner="unr"))
>>> knl = lp.set_loop_priority(knl, "i_outer,i_inner")
>>> knl = lp.set_options(knl, "write_cl")
>>> knl = lp.prioritize_loops(knl, "i_outer,i_inner")
>>> knl = lp.set_options(knl, write_code=True)
>>> evt, (out,) = knl(queue, a=x_vec_dev)
#define lid(N) ((int) get_local_id(N))
...
for (int i_outer = 0; i_outer <= -1 + ((3 + n) / 4); ++i_outer)
for (int i_outer = 0; i_outer <= -1 + (3 + n) / 4; ++i_outer)
{
a[0 + i_outer * 4] = 0.0f;
a[4 * i_outer] = (float) (0.0f);
if (-2 + -4 * i_outer + n >= 0)
a[1 + i_outer * 4] = 0.0f;
a[1 + 4 * i_outer] = (float) (0.0f);
if (-3 + -4 * i_outer + n >= 0)
a[2 + i_outer * 4] = 0.0f;
a[2 + 4 * i_outer] = (float) (0.0f);
if (-4 + -4 * i_outer + n >= 0)
a[3 + i_outer * 4] = 0.0f;
a[3 + 4 * i_outer] = (float) (0.0f);
}
...
While these conditionals enable the generated code to deal with arbitrary
*n*, they come at a performance cost. Loopy allows generating separate code
for the last iteration of the *i_outer* loop, by using the *slabs* keyword
argument to :func:`split_iname`. Since this last iteration of *i_outer* is
argument to :func:`loopy.split_iname`. Since this last iteration of *i_outer* is
the only iteration for which ``i_inner + 4*i_outer`` can become larger than
*n*, only the (now separate) code for that iteration contains conditionals,
enabling some cost savings:
......@@ -772,31 +807,34 @@ enabling some cost savings:
>>> knl = orig_knl
>>> knl = lp.split_iname(knl, "i", 4, slabs=(0, 1), inner_tag="unr")
>>> knl = lp.set_options(knl, "write_cl")
>>> knl = lp.set_loop_priority(knl, "i_outer,i_inner")
>>> knl = lp.set_options(knl, write_code=True)
>>> knl = lp.prioritize_loops(knl, "i_outer,i_inner")
>>> evt, (out,) = knl(queue, a=x_vec_dev)
#define lid(N) ((int) get_local_id(N))
...
/* bulk slab for 'i_outer' */
for (int i_outer = 0; i_outer <= -2 + ((3 + n) / 4); ++i_outer)
for (int i_outer = 0; i_outer <= -2 + (3 + n) / 4; ++i_outer)
{
a[0 + i_outer * 4] = 0.0f;
a[1 + i_outer * 4] = 0.0f;
a[2 + i_outer * 4] = 0.0f;
a[3 + i_outer * 4] = 0.0f;
a[4 * i_outer] = (float) (0.0f);
a[1 + 4 * i_outer] = (float) (0.0f);
a[2 + 4 * i_outer] = (float) (0.0f);
a[3 + 4 * i_outer] = (float) (0.0f);
}
/* final slab for 'i_outer' */
for (int i_outer = -1 + n + -1 * (3 * n / 4); i_outer <= -1 + ((3 + n) / 4); ++i_outer)
if (-1 + n >= 0)
{
int const i_outer = -1 + n + -1 * ((3 * n) / 4);
<BLANKLINE>
if (i_outer >= 0)
{
a[0 + i_outer * 4] = 0.0f;
a[4 * i_outer] = (float) (0.0f);
if (-2 + -4 * i_outer + n >= 0)
a[1 + i_outer * 4] = 0.0f;
a[1 + 4 * i_outer] = (float) (0.0f);
if (-3 + -4 * i_outer + n >= 0)
a[2 + i_outer * 4] = 0.0f;
a[2 + 4 * i_outer] = (float) (0.0f);
if (4 + 4 * i_outer + -1 * n == 0)
a[3 + i_outer * 4] = 0.0f;
a[3 + 4 * i_outer] = (float) (0.0f);
}
}
...
.. }}}
......@@ -865,7 +903,7 @@ memory, local to each work item.
.. doctest::
>>> knl = lp.set_options(knl, "write_cl")
>>> knl = lp.set_options(knl, write_code=True)
>>> evt, (out1, out2) = knl(queue, a=x_vec_dev)
#define lid(N) ((int) get_local_id(N))
...
......@@ -901,6 +939,8 @@ expression being assigned.
... """)
>>> evt, (out1, out2) = knl(queue, a=x_vec_dev)
.. _local_temporaries:
Temporaries in local memory
~~~~~~~~~~~~~~~~~~~~~~~~~~~
......@@ -919,11 +959,12 @@ Consider the following example:
... "{ [i_outer,i_inner, k]: "
... "0<= 16*i_outer + i_inner <n and 0<= i_inner,k <16}",
... """
... <> a_temp[i_inner] = a[16*i_outer + i_inner] {priority=10}
... <> a_temp[i_inner] = a[16*i_outer + i_inner]
... out[16*i_outer + i_inner] = sum(k, a_temp[k])
... """)
>>> knl = lp.tag_inames(knl, dict(i_outer="g.0", i_inner="l.0"))
>>> knl = lp.set_options(knl, "write_cl")
>>> knl = lp.set_temporary_address_space(knl, "a_temp", "local")
>>> knl = lp.set_options(knl, write_code=True)
>>> evt, (out,) = knl(queue, a=x_vec_dev)
#define lid(N) ((int) get_local_id(N))
...
......@@ -988,13 +1029,13 @@ transformation exists in :func:`loopy.add_prefetch`:
... out[16*i_outer + i_inner] = sum(k, a[16*i_outer + i_inner])
... """)
>>> knl = lp.tag_inames(knl, dict(i_outer="g.0", i_inner="l.0"))
>>> knl = lp.set_options(knl, "write_cl")
>>> knl = lp.set_options(knl, write_code=True)
>>> knl_pf = lp.add_prefetch(knl, "a")
>>> evt, (out,) = knl_pf(queue, a=x_vec_dev)
#define lid(N) ((int) get_local_id(N))
...
acc_k = 0.0f;
a_fetch = a[16 * gid(0) + lid(0)];
acc_k = 0.0f;
for (int k = 0; k <= 15; ++k)
acc_k = acc_k + a_fetch;
out[16 * gid(0) + lid(0)] = acc_k;
......@@ -1013,17 +1054,15 @@ earlier:
.. doctest::
>>> knl_pf = lp.add_prefetch(knl, "a", ["i_inner"])
>>> knl_pf = lp.add_prefetch(knl, "a", ["i_inner"], default_tag="l.0")
>>> evt, (out,) = knl_pf(queue, a=x_vec_dev)
#define lid(N) ((int) get_local_id(N))
...
if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
acc_k = 0.0f;
if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
a_fetch[lid(0)] = a[lid(0) + 16 * gid(0)];
barrier(CLK_LOCAL_MEM_FENCE) /* for a_fetch (insn_k_update depends on a_fetch_rule) */;
a_fetch[lid(0)] = a[16 * gid(0) + lid(0)];
if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
{
acc_k = 0.0f;
for (int k = 0; k <= 15; ++k)
acc_k = acc_k + a_fetch[lid(0)];
out[16 * gid(0) + lid(0)] = acc_k;
......@@ -1033,6 +1072,16 @@ earlier:
Tagged prefetching
~~~~~~~~~~~~~~~~~~
.. _global_temporaries:
Temporaries in global memory
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
:mod:`loopy` supports using temporaries with global storage duration. As with
local and private temporaries, the runtime allocates storage for global
temporaries when the kernel gets executed. The user must explicitly specify that
a temporary is global. To specify that a temporary is global, use
:func:`loopy.set_temporary_address_space`.
Substitution rules
~~~~~~~~~~~~~~~~~~
......@@ -1042,30 +1091,252 @@ Generic Precomputation
.. }}}
.. _more-complicated-programs:
More complicated programs
-------------------------
.. _synchronization:
Synchronization
---------------
.. {{{
SCOP
When one work item executing with others writes to a memory location, OpenCL
does not guarantee that other work items will immediately be able to read the
memory location and get back the same thing that was written. In order to ensure
that memory is consistent across work items, some sort of synchronization
operation is used.
Data-dependent control flow
~~~~~~~~~~~~~~~~~~~~~~~~~~~
:mod:`loopy` supports synchronization in the form of *barriers* or *atomic
operations*.
Conditionals
~~~~~~~~~~~~
Barriers
~~~~~~~~
Snippets of C
~~~~~~~~~~~~~
Prior to code generation, :mod:`loopy` performs a check to see that every memory
access is free of dependencies requiring a barrier. The following kinds of
memory access dependencies require a barrier when they involve more than one
work item:
* read-after-write
* write-after-read
* write-after-write.
:mod:`loopy` supports two kinds of barriers:
* *Local barriers* ensure consistency of memory accesses to items within
*the same* work group. This synchronizes with all instructions in the work
group. The type of memory (local or global) may be specified by the
:attr:`loopy.BarrierInstruction.mem_kind`
* *Global barriers* ensure consistency of memory accesses
across *all* work groups, i.e. it synchronizes with every work item
executing the kernel. Note that there is no exact equivalent for
this kind of barrier in OpenCL. [#global-barrier-note]_
Once a work item has reached a barrier, it waits for everyone that it
synchronizes with to reach the barrier before continuing. This means that unless
all work items reach the same barrier, the kernel will hang during execution.
Barrier insertion
~~~~~~~~~~~~~~~~~
By default, :mod:`loopy` inserts local barriers between two instructions when it
detects that a dependency involving local memory may occur across work items. To
see this in action, take a look at the section on :ref:`local_temporaries`.
In contrast, :mod:`loopy` will *not* insert global barriers automatically and
instead will report an error if it detects the need for a global barrier. As an
example, consider the following kernel, which attempts to rotate its input to
the right by 1 in parallel:
.. doctest::
>>> knl = lp.make_kernel(
... "[n] -> {[i] : 0<=i<n}",
... """
... for i
... <>tmp = arr[i] {id=maketmp,dep=*}
... arr[(i + 1) % n] = tmp {id=rotate,dep=*maketmp}
... end
... """,
... [
... lp.GlobalArg("arr", shape=("n",), dtype=np.int32),
... "...",
... ],
... name="rotate_v1",
... assumptions="n mod 16 = 0")
>>> knl = lp.split_iname(knl, "i", 16, inner_tag="l.0", outer_tag="g.0")
Note the presence of the write-after-read dependency in global memory. Due to
this, :mod:`loopy` will complain that global barrier needs to be inserted:
>>> cgr = lp.generate_code_v2(knl)
Traceback (most recent call last):
...
loopy.diagnostic.MissingBarrierError: rotate_v1: Dependency 'rotate depends on maketmp' (for variable 'arr') requires synchronization by a global barrier (add a 'no_sync_with' instruction option to state that no synchronization is needed)
The syntax for inserting a global barrier instruction is
``... gbarrier``. :mod:`loopy` also supports manually inserting local
barriers. The syntax for a local barrier instruction is ``... lbarrier``.
Saving temporaries across global barriers
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
For some platforms (currently only PyOpenCL), :mod:`loopy` implements global
barriers by splitting the kernel into a host side kernel and multiple
device-side kernels. On such platforms, it will be necessary to save non-global
temporaries that are live across kernel calls. This section presents an example
of how to use :func:`loopy.save_and_reload_temporaries` which is helpful for
that purpose.
Let us start with an example. Consider the kernel from above with a
``... gbarrier`` instruction that has already been inserted.
.. doctest::
>>> prog = lp.make_kernel(
... "[n] -> {[i] : 0<=i<n}",
... """
... for i
... <>tmp = arr[i] {id=maketmp,dep=*}
... ... gbarrier {id=bar,dep=*maketmp}
... arr[(i + 1) % n] = tmp {id=rotate,dep=*bar}
... end
... """,
... [
... lp.GlobalArg("arr", shape=("n",), dtype=np.int32),
... "...",
... ],
... name="rotate_v2",
... assumptions="n mod 16 = 0")
>>> prog = lp.split_iname(prog, "i", 16, inner_tag="l.0", outer_tag="g.0")
.. testsetup::
>>> prog = prog.with_kernel(
... prog.default_entrypoint.copy(
... silenced_warnings=["v1_scheduler_fallback"]))
Here is what happens when we try to generate code for the kernel:
>>> cgr = lp.generate_code_v2(prog)
Traceback (most recent call last):
...
loopy.diagnostic.MissingDefinitionError: temporary variable 'tmp' gets used in subkernel 'rotate_v2_0' without a definition (maybe you forgot to call loopy.save_and_reload_temporaries?)
This happens due to the kernel splitting done by :mod:`loopy`. The splitting
happens when the instruction schedule is generated. To see the schedule, we
should call :func:`loopy.get_one_linearized_kernel`:
>>> prog = lp.preprocess_kernel(prog)
>>> knl = lp.get_one_linearized_kernel(prog["rotate_v2"], prog.callables_table)
>>> prog = prog.with_kernel(knl)
>>> print(prog)
---------------------------------------------------------------------------
KERNEL: rotate_v2
---------------------------------------------------------------------------
...
---------------------------------------------------------------------------
LINEARIZATION:
0: CALL KERNEL rotate_v2
1: tmp = arr[i_inner + i_outer*16] {id=maketmp}
2: RETURN FROM KERNEL rotate_v2
3: ... gbarrier
4: CALL KERNEL rotate_v2_0
5: arr[(i_inner + i_outer*16 + 1) % n] = tmp {id=rotate}
6: RETURN FROM KERNEL rotate_v2_0
---------------------------------------------------------------------------
As the error message suggests, taking a look at the generated instruction
schedule will show that while ``tmp`` is assigned in the first kernel, the
assignment to ``tmp`` is not seen by the second kernel. Because the temporary is
in private memory, it does not persist across calls to device kernels (the same
goes for local temporaries).
:mod:`loopy` provides a function called
:func:`loopy.save_and_reload_temporaries` for the purpose of handling the
task of saving and restoring temporary values across global barriers. This
function adds instructions to the kernel without scheduling them. That means
that :func:`loopy.get_one_linearized_kernel` needs to be called one more time to
put those instructions into the schedule.
>>> prog = lp.save_and_reload_temporaries(prog)
>>> knl = lp.get_one_linearized_kernel(prog["rotate_v2"], prog.callables_table) # Schedule added instructions
>>> prog = prog.with_kernel(knl)
>>> print(prog)
---------------------------------------------------------------------------
KERNEL: rotate_v2
---------------------------------------------------------------------------
...
---------------------------------------------------------------------------
TEMPORARIES:
tmp: type: np:dtype('int32'), shape: () aspace: private
tmp_save_slot: type: np:dtype('int32'), shape: (n // 16, 16), dim_tags: (N1:stride:16, N0:stride:1) aspace: global
---------------------------------------------------------------------------
...
---------------------------------------------------------------------------
LINEARIZATION:
0: CALL KERNEL rotate_v2
1: tmp = arr[i_inner + i_outer*16] {id=maketmp}
2: tmp_save_slot[tmp_save_hw_dim_0_rotate_v2, tmp_save_hw_dim_1_rotate_v2] = tmp {id=tmp.save}
3: RETURN FROM KERNEL rotate_v2
4: ... gbarrier
5: CALL KERNEL rotate_v2_0
6: tmp = tmp_save_slot[tmp_reload_hw_dim_0_rotate_v2_0, tmp_reload_hw_dim_1_rotate_v2_0] {id=tmp.reload}
7: arr[(i_inner + i_outer*16 + 1) % n] = tmp {id=rotate}
8: RETURN FROM KERNEL rotate_v2_0
---------------------------------------------------------------------------
Here's an overview of what :func:`loopy.save_and_reload_temporaries` actually
does in more detail:
1. :mod:`loopy` first uses liveness analysis to determine which temporary
variables' live ranges cross a global barrier.
2. For each temporary, :mod:`loopy` creates a storage slot for the temporary in
global memory (see :ref:`global_temporaries`).
3. :mod:`loopy` saves the temporary into its global storage slot whenever it
detects the temporary is live-out from a kernel, and reloads the temporary
from its global storage slot when it detects that it needs to do so.
The kernel translates into two OpenCL kernels.
>>> cgr = lp.generate_code_v2(prog)
>>> print(cgr.device_code())
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))
<BLANKLINE>
__kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2(__global int const *__restrict__ arr, int const n, __global int *__restrict__ tmp_save_slot)
{
int tmp;
<BLANKLINE>
tmp = arr[16 * gid(0) + lid(0)];
tmp_save_slot[16 * gid(0) + lid(0)] = tmp;
}
<BLANKLINE>
__kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2_0(__global int *__restrict__ arr, int const n, __global int const *__restrict__ tmp_save_slot)
{
int tmp;
<BLANKLINE>
tmp = tmp_save_slot[16 * gid(0) + lid(0)];
arr[(1 + lid(0) + gid(0) * 16) % n] = tmp;
}
Now we can execute the kernel.
>>> arr = cl.array.arange(queue, 16, dtype=np.int32)
>>> print(arr)
[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
>>> evt, (out,) = prog(queue, arr=arr)
>>> print(arr)
[15 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14]
Atomic operations
~~~~~~~~~~~~~~~~~
Loopy supports atomic operations. To use them, both the data on which the
atomic operations work as well as the operations themselves must be
suitably tagged, as in the following example::
:mod:`loopy` supports atomic operations. To use them, both the data on which the
atomic operations work as well as the operations themselves must be suitably
tagged, as in the following example::
knl = lp.make_kernel(
......@@ -1078,6 +1349,52 @@ suitably tagged, as in the following example::
],
assumptions="n>0")
.. [#global-barrier-note] In particular, this is *not* the same as a call to
``barrier(CLK_GLOBAL_MEM_FENCE)``.
.. }}}
.. _more-complicated-programs:
More complicated programs
-------------------------
.. {{{
SCOP
External Functions
~~~~~~~~~~~~~~~~~~
Loopy currently supports calls to several commonly used mathematical functions,
e.g. exp/log, min/max, sin/cos/tan, sinh/cosh, abs, etc. They may be used in
a loopy kernel by simply calling them, e.g.::
knl = lp.make_kernel(
"{ [i]: 0<=i<n }",
"""
for i
a[i] = sqrt(i)
end
""")
Additionally, all functions of one variable are currently recognized during
code generation; however, additional implementation may be required for custom
functions. The full list of available functions may be found in the
:class:`loopy.TargetBase` implementation (e.g. :class:`loopy.CudaTarget`).
Custom user functions may be represented using the method described in :ref:`functions`.
Data-dependent control flow
~~~~~~~~~~~~~~~~~~~~~~~~~~~
Conditionals
~~~~~~~~~~~~
Snippets of C
~~~~~~~~~~~~~
.. }}}
Common Problems
......@@ -1102,7 +1419,7 @@ Attempting to create this kernel results in an error:
... # While trying to find shape axis 0 of argument 'out', the following exception occurred:
Traceback (most recent call last):
...
StaticValueFindingError: a static maximum was not found for PwAff '[n] -> { [(1)] : n <= 1; [(n)] : n >= 2 }'
loopy.diagnostic.StaticValueFindingError: a static maximum was not found for PwAff '[n] -> { [(1)] : n <= 1; [(n)] : n >= 2 }'
The problem is that loopy cannot find a simple, universally valid expression
for the length of *out* in this case. Notice how the kernel accesses both the
......@@ -1158,7 +1475,7 @@ We'll also request a prefetch--but suppose we only do so across the
.. doctest::
>>> knl = lp.add_prefetch(knl, "a", "i_inner")
>>> knl = lp.add_prefetch(knl, "a", "i_inner", default_tag="l.auto")
When we try to run our code, we get the following warning from loopy as a first
sign that something is amiss:
......@@ -1168,13 +1485,13 @@ sign that something is amiss:
>>> evt, (out,) = knl(queue, a=a_mat_dev)
Traceback (most recent call last):
...
WriteRaceConditionWarning: instruction 'a_fetch_rule' looks invalid: it assigns to indices based on local IDs, but its temporary 'a_fetch' cannot be made local because a write race across the iname(s) 'j_inner' would emerge. (Do you need to add an extra iname to your prefetch?) (add 'write_race_local(a_fetch_rule)' to silenced_warnings kernel argument to disable)
loopy.diagnostic.WriteRaceConditionWarning: in kernel transpose: instruction 'a_fetch_rule' looks invalid: it assigns to indices based on local IDs, but its temporary 'a_fetch' cannot be made local because a write race across the iname(s) 'j_inner' would emerge. (Do you need to add an extra iname to your prefetch?) (add 'write_race_local(a_fetch_rule)' to silenced_warnings kernel attribute to disable)
When we ask to see the code, the issue becomes apparent:
.. doctest::
>>> knl = lp.set_options(knl, "write_cl")
>>> knl = lp.set_options(knl, write_code=True)
>>> from warnings import catch_warnings
>>> with catch_warnings():
... filterwarnings("always", category=lp.LoopyWarning)
......@@ -1182,14 +1499,14 @@ When we ask to see the code, the issue becomes apparent:
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))
<BLANKLINE>
__kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) transpose(__global float const *restrict a, int const n, __global float *restrict out)
__kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) transpose(__global float const *__restrict__ a, int const n, __global float *__restrict__ out)
{
float a_fetch[16];
<BLANKLINE>
...
a_fetch[lid(0)] = a[n * (lid(0) + 16 * gid(1)) + lid(1) + 16 * gid(0)];
a_fetch[lid(0)] = a[n * (16 * gid(1) + lid(0)) + 16 * gid(0) + lid(1)];
...
out[n * (lid(1) + gid(0) * 16) + lid(0) + gid(1) * 16] = a_fetch[lid(0)];
out[n * (16 * gid(0) + lid(1)) + 16 * gid(1) + lid(0)] = a_fetch[lid(0)];
...
}
......@@ -1198,6 +1515,14 @@ considers making *a_fetch* ``local`` (in the OpenCL memory sense of the word)
to make use of parallelism in prefetching, it discovers that a write race
across the remaining axis of the workgroup would emerge.
Barriers
~~~~~~~~
:mod:`loopy` may infer the need for a barrier when it is not necessary. The
``no_sync_with`` instruction attribute can be used to resolve this.
See also :func:`loopy.add_nosync`.
TODO
.. }}}
......@@ -1207,26 +1532,30 @@ Obtaining Performance Statistics
.. {{{
Operations, array access, and barriers can all be counted, which may facilitate
performance prediction and optimization of a :mod:`loopy` kernel.
Arithmetic operations, array accesses, and synchronization operations can all
be counted, which may facilitate performance prediction and optimization of a
:mod:`loopy` kernel.
.. note::
The functions used in the following examples may produce warnings. If you have
already made the filterwarnings and catch_warnings calls used in the examples
above, you may need to reset these before continuing:
above, you may want to reset these before continuing. We will temporarily
suppress warnings to keep the output clean:
.. doctest::
>>> from warnings import resetwarnings
>>> from warnings import resetwarnings, filterwarnings
>>> resetwarnings()
>>> filterwarnings('ignore', category=Warning)
Counting operations
~~~~~~~~~~~~~~~~~~~
:func:`loopy.get_op_poly` provides information on the number and type of operations
being performed in a kernel. To demonstrate this, we'll create an example kernel
that performs several operations on arrays containing different types of data:
:func:`loopy.get_op_map` provides information on the characteristics and
quantity of arithmetic operations being performed in a kernel. To demonstrate
this, we'll create an example kernel that performs several operations on arrays
containing different types of data:
.. doctest::
......@@ -1235,47 +1564,50 @@ that performs several operations on arrays containing different types of data:
... """
... c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
... e[i, k] = g[i,k]*(2+h[i,k+1])
... """)
... """, name="stats_knl")
>>> knl = lp.add_and_infer_dtypes(knl,
... dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
Note that loopy will infer the data types for arrays c and e from the
Note that loopy will infer the data types for arrays ``c`` and ``e`` from the
information provided. Now we will count the operations:
.. doctest::
>>> from loopy.statistics import get_op_poly
>>> op_map = get_op_poly(knl)
>>> op_map = lp.get_op_map(knl, subgroup_size=32)
>>> print(op_map)
Op(np:dtype('float32'), add, subgroup, "stats_knl"): ...
:func:`loopy.get_op_poly` returns a mapping of **{(** :class:`numpy.dtype` **,**
:class:`string` **)** **:** :class:`islpy.PwQPolynomial` **}**. The
:class:`islpy.PwQPolynomial` holds the number of operations for the type specified
in the key (in terms of the :class:`loopy.LoopKernel` *inames*). We'll print this
map now:
Each line of output will look roughly like::
.. doctest::
Op(np:dtype('float32'), add, subgroup, "kernel_name") : [l, m, n] -> { l * m * n : l > 0 and m > 0 and n > 0 }
>>> print(lp.stringify_stats_mapping(op_map))
(dtype('float32'), 'add') : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
(dtype('float32'), 'div') : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
(dtype('float32'), 'mul') : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
(dtype('float64'), 'add') : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
(dtype('float64'), 'mul') : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
(dtype('int32'), 'add') : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
<BLANKLINE>
:func:`loopy.get_op_map` returns a :class:`loopy.ToCountMap` of **{**
:class:`loopy.Op` **:** :class:`islpy.PwQPolynomial` **}**. A
:class:`loopy.ToCountMap` holds a dictionary mapping any type of key to an
arithmetic type. In this case, the :class:`islpy.PwQPolynomial` holds the
number of operations matching the characteristics of the :class:`loopy.Op`
specified in the key (in terms of the :class:`loopy.LoopKernel`
*inames*). :class:`loopy.Op` attributes include:
We can evaluate these polynomials using :func:`islpy.eval_with_dict`:
- dtype: A :class:`loopy.types.LoopyType` or :class:`numpy.dtype` that specifies the
data type operated on.
- name: A :class:`str` that specifies the kind of arithmetic operation as
*add*, *sub*, *mul*, *div*, *pow*, *shift*, *bw* (bitwise), etc.
One way to evaluate these polynomials is with :meth:`islpy.PwQPolynomial.eval_with_dict`:
.. doctest::
>>> param_dict = {'n': 256, 'm': 256, 'l': 8}
>>> f32add = op_map[(np.dtype(np.float32), 'add')].eval_with_dict(param_dict)
>>> f32div = op_map[(np.dtype(np.float32), 'div')].eval_with_dict(param_dict)
>>> f32mul = op_map[(np.dtype(np.float32), 'mul')].eval_with_dict(param_dict)
>>> f64add = op_map[(np.dtype(np.float64), 'add')].eval_with_dict(param_dict)
>>> f64mul = op_map[(np.dtype(np.float64), 'mul')].eval_with_dict(param_dict)
>>> i32add = op_map[(np.dtype(np.int32), 'add')].eval_with_dict(param_dict)
>>> print("%i\n%i\n%i\n%i\n%i\n%i" %
>>> from loopy.statistics import CountGranularity as CG
>>> f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict)
>>> f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict)
>>> f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict)
>>> f64add = op_map[lp.Op(np.float64, 'add', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict)
>>> f64mul = op_map[lp.Op(np.float64, 'mul', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict)
>>> i32add = op_map[lp.Op(np.int32, 'add', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict)
>>> print("%i\n%i\n%i\n%i\n%i\n%i" %
... (f32add, f32div, f32mul, f64add, f64mul, i32add))
524288
524288
......@@ -1284,182 +1616,271 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`:
65536
65536
Counting array accesses
~~~~~~~~~~~~~~~~~~~~~~~
:class:`loopy.ToCountMap` provides member functions that facilitate filtering,
grouping, and evaluating subsets of the counts. Suppose we want to know the
total number of 32-bit operations of any kind. We can easily count these
using functions :func:`loopy.ToCountMap.filter_by` and
:func:`loopy.ToCountPolynomialMap.eval_and_sum`:
.. doctest::
>>> filtered_op_map = op_map.filter_by(dtype=[np.float32])
>>> f32op_count = filtered_op_map.eval_and_sum(param_dict)
>>> print(f32op_count)
1572864
:func:`loopy.get_gmem_access_poly` provides information on the number and type of
array loads and stores being performed in a kernel. To demonstrate this, we'll
continue using the kernel from the previous example:
We could accomplish the same goal using :func:`loopy.ToCountMap.group_by`,
which produces a :class:`loopy.ToCountMap` that contains the same counts grouped
together into keys containing only the specified fields:
.. doctest::
>>> from loopy.statistics import get_gmem_access_poly
>>> load_store_map = get_gmem_access_poly(knl)
>>> print(lp.stringify_stats_mapping(load_store_map))
(dtype('float32'), 'uniform', 'load') : [n, m, l] -> { 3 * n * m * l : n > 0 and m > 0 and l > 0 }
(dtype('float32'), 'uniform', 'store') : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
(dtype('float64'), 'uniform', 'load') : [n, m, l] -> { 2 * n * m : n > 0 and m > 0 and l > 0 }
(dtype('float64'), 'uniform', 'store') : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
<BLANKLINE>
>>> op_map_dtype = op_map.group_by('dtype')
>>> print(op_map_dtype)
Op(np:dtype('float32'), None, None): ...
>>> f32op_count = op_map_dtype[lp.Op(dtype=np.float32)
... ].eval_with_dict(param_dict)
>>> print(f32op_count)
1572864
The lines of output above might look like::
Op(np:dtype('float32'), None, None) : [m, l, n] -> { 3 * m * l * n : m > 0 and l > 0 and n > 0 }
Op(np:dtype('float64'), None, None) : [m, l, n] -> { 2 * m * n : m > 0 and l > 0 and n > 0 }
See the reference page for :class:`loopy.ToCountMap` and :class:`loopy.Op` for
more information on these functions.
Counting memory accesses
~~~~~~~~~~~~~~~~~~~~~~~~
:func:`loopy.get_mem_access_map` provides information on the number and
characteristics of memory accesses performed in a kernel. To demonstrate this,
we'll continue using the kernel from the previous example:
.. doctest::
>>> mem_map = lp.get_mem_access_map(knl, subgroup_size=32)
>>> print(mem_map)
MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, 'stats_knl'): ...
Each line of output will look roughly like::
MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, 'stats_knl') : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 }
MemAccess(global, np:dtype('float32'), {}, {}, load, b, None, subgroup, 'stats_knl') : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
MemAccess(global, np:dtype('float32'), {}, {}, store, c, None, subgroup, 'stats_knl') : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
:func:`loopy.get_gmem_access_poly` returns a mapping of **{(**
:class:`numpy.dtype` **,** :class:`string` **,** :class:`string` **)**
**:** :class:`islpy.PwQPolynomial` **}**.
:func:`loopy.get_mem_access_map` returns a :class:`loopy.ToCountMap` of **{**
:class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**.
:class:`loopy.MemAccess` attributes include:
- The :class:`numpy.dtype` specifies the type of the data being accessed.
- mtype: A :class:`str` that specifies the memory type accessed as **global**
or **local**
- The first string in the map key specifies the DRAM access type as *consecutive*,
*nonconsecutive*, or *uniform*. *Consecutive* memory accesses occur when
consecutive threads access consecutive array elements in memory, *nonconsecutive*
accesses occur when consecutive threads access nonconsecutive array elements in
memory, and *uniform* accesses occur when consecutive threads access the *same*
element in memory.
- dtype: A :class:`loopy.types.LoopyType` or :class:`numpy.dtype` that specifies the
data type accessed.
- The second string in the map key specifies the DRAM access type as a *load*, or a
*store*.
- lid_strides: A :class:`dict` of **{** :class:`int` **:**
:data:`~pymbolic.typing.Expression` or :class:`int` **}** that specifies
local strides for each local id in the memory access index. Local ids not
found will not be present in ``lid_strides.keys()``. Uniform access (i.e.
work-items within a sub-group access the same item) is indicated by setting
``lid_strides[0]=0``, but may also occur when no local id 0 is found, in
which case the 0 key will not be present in lid_strides.
- The :class:`islpy.PwQPolynomial` holds the number of DRAM accesses with the
characteristics specified in the key (in terms of the :class:`loopy.LoopKernel`
*inames*).
- gid_strides: A :class:`dict` of **{** :class:`int` **:**
:data:`~pymbolic.typing.Expression` or :class:`int` **}** that specifies
global strides for each global id in the memory access index. Global ids not
found will not be present in ``gid_strides.keys()``.
We can evaluate these polynomials using :func:`islpy.eval_with_dict`:
- direction: A :class:`str` that specifies the direction of memory access as
**load** or **store**.
- variable: A :class:`str` that specifies the variable name of the data
accessed.
We can evaluate these polynomials using :meth:`islpy.PwQPolynomial.eval_with_dict`:
.. doctest::
>>> f64ld = load_store_map[(np.dtype(np.float64), "uniform", "load")
... ].eval_with_dict(param_dict)
>>> f64st = load_store_map[(np.dtype(np.float64), "uniform", "store")
... ].eval_with_dict(param_dict)
>>> f32ld = load_store_map[(np.dtype(np.float32), "uniform", "load")
... ].eval_with_dict(param_dict)
>>> f32st = load_store_map[(np.dtype(np.float32), "uniform", "store")
... ].eval_with_dict(param_dict)
>>> print("f32 load: %i\nf32 store: %i\nf64 load: %i\nf64 store: %i" %
... (f32ld, f32st, f64ld, f64st))
f32 load: 1572864
f32 store: 524288
f64 load: 131072
f64 store: 65536
>>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'load', 'g',
... variable_tags=None, count_granularity=CG.SUBGROUP, kernel_name="stats_knl")
... ].eval_with_dict(param_dict)
>>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'store', 'e',
... variable_tags=None, count_granularity=CG.SUBGROUP, kernel_name="stats_knl")
... ].eval_with_dict(param_dict)
>>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'load', 'a',
... variable_tags=None, count_granularity=CG.SUBGROUP, kernel_name="stats_knl")
... ].eval_with_dict(param_dict)
>>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'store', 'c',
... variable_tags=None, count_granularity=CG.SUBGROUP, kernel_name="stats_knl")
... ].eval_with_dict(param_dict)
>>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" %
... (f32ld_a, f32st_c, f64ld_g, f64st_e))
f32 ld a: 1048576
f32 st c: 524288
f64 ld g: 65536
f64 st e: 65536
:class:`loopy.ToCountMap` also makes it easy to determine the total amount
of data moved in bytes. Suppose we want to know the total amount of global
memory data loaded and stored. We can produce a map with just this information
using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`:
.. doctest::
>>> bytes_map = mem_map.to_bytes()
>>> print(bytes_map)
MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, 'stats_knl'): ...
>>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global']
... ).group_by('direction')
>>> print(global_ld_st_bytes)
MemAccess(None, None, None, None, load, None, None, None, None): ...
MemAccess(None, None, None, None, store, None, None, None, None): ...
>>> loaded = global_ld_st_bytes[lp.MemAccess(direction='load')
... ].eval_with_dict(param_dict)
>>> stored = global_ld_st_bytes[lp.MemAccess(direction='store')
... ].eval_with_dict(param_dict)
>>> print("bytes loaded: %s\nbytes stored: %s" % (loaded, stored))
bytes loaded: 7340032
bytes stored: 2621440
The lines of output above might look like::
MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup): [m, l, n] -> { 8 * m * l * n : m > 0 and l > 0 and n > 0 }
MemAccess(global, np:dtype('float32'), {}, {}, load, b, None, subgroup): [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 }
MemAccess(global, np:dtype('float32'), {}, {}, store, c, None, subgroup): [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 }
MemAccess(global, np:dtype('float64'), {}, {}, load, g, None, subgroup): [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
MemAccess(global, np:dtype('float64'), {}, {}, load, h, None, subgroup): [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
MemAccess(global, np:dtype('float64'), {}, {}, store, e, None, subgroup): [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
One can see how these functions might be useful in computing, for example,
achieved memory bandwidth in byte/sec or performance in FLOP/sec.
~~~~~~~~~~~
Since we have not tagged any of the inames or parallelized the kernel across threads
(which would have produced iname tags), :func:`loopy.get_gmem_access_poly` considers
the array accesses *uniform*. Now we'll parallelize the kernel and count the array
Since we have not tagged any of the inames or parallelized the kernel across
work-items (which would have produced iname tags), :func:`loopy.get_mem_access_map`
finds no local or global id strides, leaving ``lid_strides`` and ``gid_strides``
empty for each memory access. Now we'll parallelize the kernel and count the array
accesses again. The resulting :class:`islpy.PwQPolynomial` will be more complicated
this time, so we'll print the mapping manually to make it more legible:
this time.
.. doctest::
>>> knl_consec = lp.split_iname(knl, "k", 128, outer_tag="l.1", inner_tag="l.0")
>>> load_store_map = get_gmem_access_poly(knl_consec)
>>> for key in sorted(load_store_map.keys(), key=lambda k: str(k)):
... print("%s :\n%s\n" % (key, load_store_map[key]))
(dtype('float32'), 'consecutive', 'load') :
[n, m, l] -> { ... }
<BLANKLINE>
(dtype('float32'), 'consecutive', 'store') :
[n, m, l] -> { ... }
<BLANKLINE>
(dtype('float64'), 'consecutive', 'load') :
[n, m, l] -> { ... }
<BLANKLINE>
(dtype('float64'), 'consecutive', 'store') :
[n, m, l] -> { ... }
<BLANKLINE>
>>> knl_consec = lp.split_iname(knl, "k", 128,
... outer_tag="l.1", inner_tag="l.0")
>>> mem_map = lp.get_mem_access_map(knl_consec, subgroup_size=32)
>>> print(mem_map)
MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, a, None, workitem, 'stats_knl'): ...
MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, b, None, workitem, 'stats_knl'): ...
MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, store, c, None, workitem, 'stats_knl'): ...
MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, g, None, workitem, 'stats_knl'): ...
MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, h, None, workitem, 'stats_knl'): ...
MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, store, e, None, workitem, 'stats_knl'): ...
With this parallelization, consecutive work-items will access consecutive array
elements in memory. The polynomials are a bit more complicated now due to the
parallelization, but when we evaluate them, we see that the total number of
array accesses has not changed:
.. doctest::
With this parallelization, consecutive threads will access consecutive array
elements in memory. The polynomials are a bit more complicated now due to the
parallelization, but when we evaluate them, we see that the total number of array
accesses has not changed:
.. doctest::
>>> f64ld = load_store_map[(np.dtype(np.float64), "consecutive", "load")
... ].eval_with_dict(param_dict)
>>> f64st = load_store_map[(np.dtype(np.float64), "consecutive", "store")
... ].eval_with_dict(param_dict)
>>> f32ld = load_store_map[(np.dtype(np.float32), "consecutive", "load")
... ].eval_with_dict(param_dict)
>>> f32st = load_store_map[(np.dtype(np.float32), "consecutive", "store")
... ].eval_with_dict(param_dict)
>>> print("f32 load: %i\nf32 store: %i\nf64 load: %i\nf64 store: %i" %
... (f32ld, f32st, f64ld, f64st))
f32 load: 1572864
f32 store: 524288
f64 load: 131072
f64 store: 65536
>>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'load', 'g',
... variable_tags=None, count_granularity=CG.WORKITEM, kernel_name="stats_knl")
... ].eval_with_dict(param_dict)
>>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'store', 'e',
... variable_tags=None, count_granularity=CG.WORKITEM, kernel_name="stats_knl")
... ].eval_with_dict(param_dict)
>>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'load', 'a',
... variable_tags=None, count_granularity=CG.WORKITEM, kernel_name="stats_knl")
... ].eval_with_dict(param_dict)
>>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'store', 'c',
... variable_tags=None, count_granularity=CG.WORKITEM, kernel_name="stats_knl")
... ].eval_with_dict(param_dict)
>>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" %
... (f32ld_a, f32st_c, f64ld_g, f64st_e))
f32 ld a: 1048576
f32 st c: 524288
f64 ld g: 65536
f64 st e: 65536
~~~~~~~~~~~
To produce *nonconsecutive* array accesses, we'll switch the inner and outer tags in
our parallelization of the kernel:
To produce *nonconsecutive* array accesses with local id 0 stride greater than 1,
we'll switch the inner and outer tags in our parallelization of the kernel:
.. doctest::
>>> knl_nonconsec = lp.split_iname(knl, "k", 128, outer_tag="l.0", inner_tag="l.1")
>>> load_store_map = get_gmem_access_poly(knl_nonconsec)
>>> for key in sorted(load_store_map.keys(), key=lambda k: str(k)):
... print("%s :\n%s\n" % (key, load_store_map[key]))
(dtype('float32'), 'nonconsecutive', 'load') :
[n, m, l] -> { ... }
<BLANKLINE>
(dtype('float32'), 'nonconsecutive', 'store') :
[n, m, l] -> { ... }
<BLANKLINE>
(dtype('float64'), 'nonconsecutive', 'load') :
[n, m, l] -> { ... }
<BLANKLINE>
(dtype('float64'), 'nonconsecutive', 'store') :
[n, m, l] -> { ... }
<BLANKLINE>
>>> knl_nonconsec = lp.split_iname(knl, "k", 128,
... outer_tag="l.0", inner_tag="l.1")
>>> mem_map = lp.get_mem_access_map(knl_nonconsec, subgroup_size=32)
>>> print(mem_map)
MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, a, None, workitem, 'stats_knl'): ...
MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, b, None, workitem, 'stats_knl'): ...
MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, store, c, None, workitem, 'stats_knl'): ...
MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, g, None, workitem, 'stats_knl'): ...
MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, h, None, workitem, 'stats_knl'): ...
MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, store, e, None, workitem, 'stats_knl'): ...
With this parallelization, consecutive work-items will access *nonconsecutive*
array elements in memory. The total number of array accesses still has not
changed:
.. doctest::
With this parallelization, consecutive threads will access *nonconsecutive* array
elements in memory. The total number of array accesses has not changed:
>>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'load', 'g',
... variable_tags=None, count_granularity=CG.WORKITEM, kernel_name="stats_knl")
... ].eval_with_dict(param_dict)
>>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'store', 'e',
... variable_tags=None, count_granularity=CG.WORKITEM, kernel_name="stats_knl")
... ].eval_with_dict(param_dict)
>>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'load', 'a',
... variable_tags=None, count_granularity=CG.WORKITEM, kernel_name="stats_knl")
... ].eval_with_dict(param_dict)
>>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'store', 'c',
... variable_tags=None, count_granularity=CG.WORKITEM, kernel_name="stats_knl")
... ].eval_with_dict(param_dict)
>>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" %
... (f32ld_a, f32st_c, f64ld_g, f64st_e))
f32 ld a: 1048576
f32 st c: 524288
f64 ld g: 65536
f64 st e: 65536
We can also filter using an arbitrary test function using
:func:`loopy.ToCountMap.filter_by_func`. This is useful when the filter
criteria are more complicated than a simple list of allowable values:
.. doctest::
>>> f64ld = load_store_map[
... (np.dtype(np.float64), "nonconsecutive", "load")
... ].eval_with_dict(param_dict)
>>> f64st = load_store_map[
... (np.dtype(np.float64), "nonconsecutive", "store")
... ].eval_with_dict(param_dict)
>>> f32ld = load_store_map[
... (np.dtype(np.float32), "nonconsecutive", "load")
... ].eval_with_dict(param_dict)
>>> f32st = load_store_map[
... (np.dtype(np.float32), "nonconsecutive", "store")
... ].eval_with_dict(param_dict)
>>> print("f32 load: %i\nf32 store: %i\nf64 load: %i\nf64 store: %i" %
... (f32ld, f32st, f64ld, f64st))
f32 load: 1572864
f32 store: 524288
f64 load: 131072
f64 store: 65536
>>> def f(key):
... from loopy.types import to_loopy_type
... return key.dtype == to_loopy_type(np.float32) and \
... key.lid_strides[0] > 1
>>> count = mem_map.filter_by_func(f).eval_and_sum(param_dict)
>>> print(count)
2097152
Counting synchronization events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
:func:`loopy.get_synchronization_poly` counts the number of synchronization
events per **thread** in a kernel. First, we'll call this function on the
:func:`loopy.get_synchronization_map` counts the number of synchronization
events per **work-item** in a kernel. First, we'll call this function on the
kernel from the previous example:
.. doctest::
>>> from loopy.statistics import get_synchronization_poly
>>> barrier_poly = get_synchronization_poly(knl)
>>> print(lp.stringify_stats_mapping(barrier_poly))
kernel_launch : { 1 }
<BLANKLINE>
>>> sync_map = lp.get_synchronization_map(knl)
>>> print(sync_map)
Sync(kernel_launch, stats_knl): [l, m, n] -> { 1 }
We can evaluate this polynomial using :func:`islpy.eval_with_dict`:
We can evaluate this polynomial using :meth:`islpy.PwQPolynomial.eval_with_dict`:
.. doctest::
>>> launch_count = barrier_poly["kernel_launch"].eval_with_dict(param_dict)
>>> launch_count = sync_map[lp.Sync("kernel_launch", "stats_knl")].eval_with_dict(param_dict)
>>> print("Kernel launch count: %s" % launch_count)
Kernel launch count: 1
......@@ -1475,48 +1896,49 @@ Now to make things more interesting, we'll create a kernel with barriers:
... e[i,j,k] = c[i,j,k+1]+c[i,j,k-1]
... """
... ], [
... lp.TemporaryVariable("c", lp.auto, shape=(50, 10, 99)),
... lp.TemporaryVariable("c", dtype=None, shape=(50, 10, 99)),
... "..."
... ])
>>> knl = lp.add_and_infer_dtypes(knl, dict(a=np.int32))
>>> knl = lp.split_iname(knl, "k", 128, outer_tag="g.0", inner_tag="l.0")
>>> knl = lp.split_iname(knl, "k", 128, inner_tag="l.0")
>>> code, _ = lp.generate_code(lp.preprocess_kernel(knl))
>>> print(code)
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))
<BLANKLINE>
__kernel void __attribute__ ((reqd_work_group_size(97, 1, 1))) loopy_kernel(__global int const *restrict a, __global int *restrict e)
__kernel void __attribute__ ((reqd_work_group_size(97, 1, 1))) loopy_kernel(__global int const *__restrict__ a, __global int *__restrict__ e)
{
__local int c[50 * 10 * 99];
<BLANKLINE>
for (int j = 0; j <= 9; ++j)
for (int i = 0; i <= 49; ++i)
for (int i = 0; i <= 49; ++i)
for (int j = 0; j <= 9; ++j)
{
int const k_outer = 0;
<BLANKLINE>
barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn rev-depends on insn_0) */;
c[990 * i + 99 * j + lid(0) + 1 + gid(0) * 128] = 2 * a[980 * i + 98 * j + lid(0) + 1 + gid(0) * 128];
c[990 * i + 99 * j + lid(0) + 1] = 2 * a[980 * i + 98 * j + lid(0) + 1];
barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn_0 depends on insn) */;
e[980 * i + 98 * j + lid(0) + 1 + gid(0) * 128] = c[990 * i + 99 * j + 1 + lid(0) + 1 + gid(0) * 128] + c[990 * i + 99 * j + -1 + lid(0) + 1 + gid(0) * 128];
e[980 * i + 98 * j + lid(0) + 1] = c[990 * i + 99 * j + 1 + lid(0) + 1] + c[990 * i + 99 * j + -1 + lid(0) + 1];
}
}
In this kernel, when a thread performs the second instruction it uses data produced
by *different* threads during the first instruction. Because of this, barriers are
required for correct execution, so loopy inserts them. Now we'll count the barriers
using :func:`loopy.get_barrier_poly`:
In this kernel, when a work-item performs the second instruction it uses data
produced by *different* work-items during the first instruction. Because of this,
barriers are required for correct execution, so loopy inserts them. Now we'll
count the barriers using :func:`loopy.get_synchronization_map`:
.. doctest::
>>> sync_map = lp.get_synchronization_poly(knl)
>>> print(lp.stringify_stats_mapping(sync_map))
barrier_local : { 1000 }
kernel_launch : { 1 }
<BLANKLINE>
>>> sync_map = lp.get_synchronization_map(knl)
>>> print(sync_map)
Sync(barrier_local, loopy_kernel): { 1000 }
Sync(kernel_launch, loopy_kernel): { 1 }
Based on the kernel code printed above, we would expect each thread to encounter
50x10x2 barriers, which matches the result from :func:`loopy.get_barrier_poly`. In
this case, the number of barriers does not depend on any inames, so we can pass an
empty dictionary to :func:`islpy.eval_with_dict`.
Based on the kernel code printed above, we would expect each work-item to
encounter 50x10x2 barriers, which matches the result from
:func:`loopy.get_synchronization_map`. In this case, the number of barriers
does not depend on any inames, so we can pass an empty dictionary to
:meth:`islpy.PwQPolynomial.eval_with_dict`.
.. }}}
......
#! /bin/bash
# Upload the built HTML documentation to the documentation host.
# Step 1: write an .htaccess into the build output so the uploaded docs
# are protected by HTTP basic auth on the server.
cat > _build/html/.htaccess <<EOF
AuthUserFile /home/andreas/htpasswd
AuthGroupFile /dev/null
AuthName "Pre-Release Documentation"
AuthType Basic
require user iliketoast
EOF
# Step 2: mirror the build output (the {.*,*} glob also picks up dotfiles
# such as the .htaccess written above) to the remote host, deleting remote
# files that no longer exist locally.
# NOTE(review): the two rsync invocations below differ only in --progress;
# they look like old/new diff residue of the same line -- confirm that only
# one of them is intended.
rsync --progress --verbose --archive --delete _build/html/{.*,*} doc-upload:doc/loopy
rsync --verbose --archive --delete _build/html/{.*,*} doc-upload:doc/loopy
! Fill the array out(1:n) with the scalar a, then scale every element
! by factor.
!
! Arguments:
!   out - real array of length n (output)
!   a   - scalar fill value
!   n   - number of elements
!
! NOTE: real_type and factor are not Fortran identifiers here; they are
! textual placeholders substituted by lp.c_preprocess in the !$loopy
! block below (real_type -> real*8, factor -> 4.0) before this source
! is handed to lp.parse_fortran, so their spelling must not be changed.
subroutine fill(out, a, n)
implicit none
real_type a, out(n)
integer n, i
! First loop: initialize every element to a.
do i = 1, n
out(i) = a
end do
! Second loop: scale in place by the preprocessor-defined factor.
do i = 1, n
out(i) = out(i) * factor
end do
end
!$loopy begin
!
! SOURCE = lp.c_preprocess(SOURCE, [
! "factor 4.0",
! "real_type real*8",
! ])
! fill, = lp.parse_fortran(SOURCE, FILENAME)
! fill = lp.split_iname(fill, "i", 128,
! outer_tag="g.0", inner_tag="l.0")
! fill = lp.split_iname(fill, "i_1", 128,
! outer_tag="g.0", inner_tag="l.0")
! RESULT = [fill]
!
!$loopy end
! vim:filetype=floopy
{
"metadata": {
"name": "",
"signature": "sha256:c9f8334aa7aa4a5ad1437fa5871aafa52bbc9131271d9e90e7be47d22725cc94"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Loopy IPython Integration Demo"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%load_ext loopy.ipython_ext"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Without transform code"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%fortran_kernel\n",
"\n",
"subroutine fill(out, a, n)\n",
" implicit none\n",
"\n",
" real*8 a, out(n)\n",
" integer n, i\n",
"\n",
" do i = 1, n\n",
" out(i) = a\n",
" end do\n",
"end"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(prog) # noqa: F821"
]
},
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Loopy IPython Integration Demo"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"%load_ext loopy.ipython_ext"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Without transform code"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"%%fortran_kernel\n",
"\n",
"subroutine fill(out, a, n)\n",
" implicit none\n",
"\n",
" real*8 a, out(n)\n",
" integer n, i\n",
"\n",
" do i = 1, n\n",
" out(i) = a\n",
" end do\n",
"end"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print(fill)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"---------------------------------------------------------------------------\n",
"KERNEL: fill\n",
"---------------------------------------------------------------------------\n",
"ARGUMENTS:\n",
"a: ValueArg, type: float64\n",
"n: ValueArg, type: int32\n",
"out: GlobalArg, type: float64, shape: (n), dim_tags: (N0:stride:1)\n",
"---------------------------------------------------------------------------\n",
"DOMAINS:\n",
"[n] -> { [i] : i >= 0 and i <= -1 + n }\n",
"---------------------------------------------------------------------------\n",
"INAME IMPLEMENTATION TAGS:\n",
"i: None\n",
"---------------------------------------------------------------------------\n",
"INSTRUCTIONS:\n",
"[i] out[i] <- a # insn0\n",
"---------------------------------------------------------------------------\n"
]
}
],
"prompt_number": 3
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## With transform code"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"split_amount = 128"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"%%transformed_fortran_kernel\n",
"\n",
"subroutine tr_fill(out, a, n)\n",
" implicit none\n",
"\n",
" real*8 a, out(n)\n",
" integer n, i\n",
"\n",
" do i = 1, n\n",
" out(i) = a\n",
" end do\n",
"end\n",
"\n",
"!$loopy begin\n",
"!\n",
"! tr_fill, = lp.parse_fortran(SOURCE)\n",
"! tr_fill = lp.split_iname(tr_fill, \"i\", split_amount,\n",
"! outer_tag=\"g.0\", inner_tag=\"l.0\")\n",
"! RESULT = [tr_fill]\n",
"!\n",
"!$loopy end"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print(tr_fill)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"---------------------------------------------------------------------------\n",
"KERNEL: tr_fill\n",
"---------------------------------------------------------------------------\n",
"ARGUMENTS:\n",
"a: ValueArg, type: float64\n",
"n: ValueArg, type: int32\n",
"out: GlobalArg, type: float64, shape: (n), dim_tags: (N0:stride:1)\n",
"---------------------------------------------------------------------------\n",
"DOMAINS:\n",
"[n] -> { [i_outer, i_inner] : i_inner >= -128i_outer and i_inner <= -1 + n - 128i_outer and i_inner >= 0 and i_inner <= 127 }\n",
"---------------------------------------------------------------------------\n",
"INAME IMPLEMENTATION TAGS:\n",
"i_inner: l.0\n",
"i_outer: g.0\n",
"---------------------------------------------------------------------------\n",
"INSTRUCTIONS:\n",
"[i_inner,i_outer] out[i_inner + i_outer*128] <- a # insn0\n",
"---------------------------------------------------------------------------\n"
]
}
],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
"cell_type": "markdown",
"metadata": {},
"source": [
"## With transform code"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"split_amount = 128"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%transformed_fortran_kernel\n",
"\n",
"subroutine tr_fill(out, a, n)\n",
" implicit none\n",
"\n",
" real*8 a, out(n)\n",
" integer n, i\n",
"\n",
" do i = 1, n\n",
" out(i) = a\n",
" end do\n",
"end\n",
"\n",
"!$loopy begin\n",
"!\n",
"! tr_fill = lp.parse_fortran(SOURCE)\n",
"! tr_fill = lp.split_iname(tr_fill, \"i\", split_amount,\n",
"! outer_tag=\"g.0\", inner_tag=\"l.0\")\n",
"! RESULT = tr_fill\n",
"!\n",
"!$loopy end"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(prog) # noqa: F821"
]
}
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 1
}