Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • tasmith4/loopy
  • ben_sepanski/loopy
  • arghdos/loopy
  • inducer/loopy
  • wence-/loopy
  • isuruf/loopy
  • fikl2/loopy
  • xywei/loopy
  • kaushikcfd/loopy
  • zweiner2/loopy
10 results
Show changes
Showing
with 1410 additions and 835 deletions
#! /bin/bash
# Build a self-contained loopy binary inside a CentOS 6 Docker container.
# Should be run in this directory (build-helpers).
#
# Usage: ./make-linux-build-docker.sh [--nodate]
#   --nodate  name the output "loopy-centos6" with no date suffix

if test "$1" = "--nodate"; then
  TGT_NAME=loopy-centos6
else
  TGT_NAME=loopy-centos6-$(date +"%Y-%m-%d")
fi

echo "Generating $TGT_NAME..."

set -e
set -x

docker pull centos:6

# Quote $(pwd) and $CNT so paths containing spaces survive word splitting.
CNT=$(docker create -t -v "$(pwd)":/mnt centos:6 /mnt/make-linux-build-docker-inner.sh)
echo "working in container $CNT"

docker start -i "$CNT"

# The copy is best-effort: the inner build may have failed, and we still
# want to remove the container below.
docker cp "$CNT":/tmp/build/loopy/dist/loopy "$(pwd)" || true

mv loopy "$TGT_NAME"

docker rm "$CNT"
#! /bin/bash
# Build a standalone loopy executable with PyInstaller.
# run this from the loopy root directory

# Start from a clean slate so no stale build products leak in.
rm -Rf dist build

pyinstaller --workpath=build/pyinstaller build-helpers/loopy.spec
#! /bin/bash
# Upload the binary given as $1 to the public loopy-binaries download area.
set -e

scp "$1" tiker.net:public_html/pub/loopy-binaries/
#!/usr/bin/env python
import ctypes
from os import system
# C source for the integer-division helpers exercised by main() below:
#   cdiv / cmod                -- C's native truncating / and % on int64
#   loopy_floor_div_*          -- floor division (rounds toward -inf), any-sign b
#   loopy_floor_div_pos_b_*    -- floor division assuming b > 0
#   loopy_mod_pos_b_*          -- modulo with result in [0, b), assuming b > 0
#   loopy_mod_*                -- modulo whose result takes the sign of b
# The X-macro LOOPY_CALL_WITH_INTEGER_TYPES instantiates each family for
# int8/int16/int32/int64.
C_SRC = """
#include <stdlib.h>
#include <stdint.h>
int64_t cdiv(int64_t a, int64_t b)
{
return a/b;
}
int64_t cmod(int64_t a, int64_t b)
{
return a%b;
}
#define LOOPY_CALL_WITH_INTEGER_TYPES(MACRO_NAME) \
MACRO_NAME(int8, char) \
MACRO_NAME(int16, short) \
MACRO_NAME(int32, int) \
MACRO_NAME(int64, long long)
#define LOOPY_DEFINE_FLOOR_DIV(SUFFIX, TYPE) \
TYPE loopy_floor_div_##SUFFIX(TYPE a, TYPE b) \
{ \
if ((a<0) != (b<0)) \
a = a - (b + (b<0) - (b>=0)); \
return a/b; \
}
LOOPY_CALL_WITH_INTEGER_TYPES(LOOPY_DEFINE_FLOOR_DIV)
#undef LOOPY_DEFINE_FLOOR_DIV
#define LOOPY_DEFINE_FLOOR_DIV_POS_B(SUFFIX, TYPE) \
TYPE loopy_floor_div_pos_b_##SUFFIX(TYPE a, TYPE b) \
{ \
if (a<0) \
a = a - (b-1); \
return a/b; \
}
LOOPY_CALL_WITH_INTEGER_TYPES(LOOPY_DEFINE_FLOOR_DIV_POS_B)
#undef LOOPY_DEFINE_FLOOR_DIV_POS_B
#define LOOPY_DEFINE_MOD_POS_B(SUFFIX, TYPE) \
TYPE loopy_mod_pos_b_##SUFFIX(TYPE a, TYPE b) \
{ \
TYPE result = a%b; \
if (result < 0) \
result += b; \
return result; \
}
LOOPY_CALL_WITH_INTEGER_TYPES(LOOPY_DEFINE_MOD_POS_B)
#undef LOOPY_DEFINE_MOD_POS_B
#define LOOPY_DEFINE_MOD(SUFFIX, TYPE) \
TYPE loopy_mod_##SUFFIX(TYPE a, TYPE b) \
{ \
TYPE result = a%b; \
if (result < 0 && b > 0) \
result += b; \
if (result > 0 && b < 0) \
result = result + b; \
return result; \
}
LOOPY_CALL_WITH_INTEGER_TYPES(LOOPY_DEFINE_MOD)
#undef LOOPY_DEFINE_MOD
"""
def main():
    """Compile the C helper library, then cross-check each C routine against
    Python's floor-division and modulo semantics over a grid of operands.
    """
    with open("int-experiments.c", "w") as outf:
        outf.write(C_SRC)

    # Abort loudly if compilation fails instead of loading a stale .so.
    if system("gcc -Wall -shared int-experiments.c -o int-experiments.so") != 0:
        raise RuntimeError("compiling int-experiments.c failed")

    int_exp = ctypes.CDLL("int-experiments.so")
    for func in [
            int_exp.cdiv,
            int_exp.cmod,
            int_exp.loopy_floor_div_int64,
            int_exp.loopy_floor_div_pos_b_int64,
            int_exp.loopy_mod_pos_b_int64,
            int_exp.loopy_mod_int64,
            ]:
        func.argtypes = [ctypes.c_longlong, ctypes.c_longlong]
        func.restype = ctypes.c_longlong

    m = 50

    def check_against_python(c_func, py_func, b_values):
        # Compare c_func with the reference py_func on a grid of operands.
        # The failing operands go into the assertion message; the original
        # printed them *after* the assert fired, i.e. never.
        for a in range(-m, m):
            for b in b_values:
                if b == 0:
                    continue
                cresult = c_func(a, b)
                presult = py_func(a, b)
                assert cresult == presult, (a, b, cresult, presult)

    check_against_python(
        int_exp.loopy_floor_div_pos_b_int64, lambda a, b: a // b, range(1, m))
    check_against_python(
        int_exp.loopy_floor_div_int64, lambda a, b: a // b, range(-m, m))
    check_against_python(
        int_exp.loopy_mod_pos_b_int64, lambda a, b: a % b, range(1, m))
    check_against_python(
        int_exp.loopy_mod_int64, lambda a, b: a % b, range(-m, m))

    # Demonstrate that C's % differs from Python's for a negative divisor.
    print(int_exp.cmod(23, -11), 23 % -11)


if __name__ == "__main__":
    main()
import numpy as np
# Inspired by a visualization used in the Halide tutorial
# https://www.youtube.com/watch?v=3uiEyEKji0M
def div_ceil(nr, dr):
    """Return ceil(nr/dr) using exact integer arithmetic.

    Equivalent to -(-nr // dr) for any signs of *nr* and nonzero *dr*.
    """
    quotient, remainder = divmod(nr, dr)
    return quotient + 1 if remainder else quotient
def product(iterable):
    """Multiply the items of *iterable* together; returns 1 when empty."""
    result = 1
    for factor in iterable:
        result = result * factor
    return result
class ArrayAccessPatternContext:
    """Simulated OpenCL-style execution grid for visualizing array accesses.

    Holds the group sizes (*gsize*), local sizes (*lsize*), a logical clock
    (*timestamp*) used to fade old accesses, and the list of tracked arrays.
    """

    def __init__(self, gsize, lsize, subgroup_size=32, decay_constant=0.75):
        self.lsize = lsize
        self.gsize = gsize
        self.subgroup_size = subgroup_size
        self.timestamp = 0
        self.decay_constant = decay_constant

        # Broadcast rank: one axis per group dimension, then one per
        # local dimension.
        self.ind_length = len(gsize) + len(lsize)

        self.arrays = []

    def l(self, index):  # noqa: E743
        """Local-id values along local axis *index*, shaped for broadcasting."""
        subscript = [np.newaxis] * self.ind_length
        subscript[len(self.gsize) + index] = slice(None)
        return np.arange(self.lsize[index])[tuple(subscript)]

    def g(self, index):
        """Group-id values along group axis *index*, shaped for broadcasting."""
        subscript = [np.newaxis] * self.ind_length
        subscript[index] = slice(None)
        return np.arange(self.gsize[index])[tuple(subscript)]

    def nsubgroups(self):
        """Number of subgroups needed to cover one workgroup."""
        return div_ceil(product(self.lsize), self.subgroup_size)

    def animate(self, f, interval=200):
        """Animate the access pattern driven by generator-returning *f*.

        Each value yielded by f() advances the clock by one tick and
        redraws every tracked array.
        """
        import matplotlib.animation as animation
        import matplotlib.pyplot as plt

        fig = plt.figure()

        plots = []
        for iplot, ary in enumerate(self.arrays):
            axes = fig.add_subplot(1, len(self.arrays), 1 + iplot)
            axes.set_title(ary.name)
            plots.append(ary.plot(axes))

        def frame_source():
            for _ in f():
                self.tick()
                for ary, plot in zip(self.arrays, plots):
                    plot.set_array(ary.get_plot_data())
                fig.canvas.draw()
                yield plots

        # must be kept alive until after plt.show()
        return animation.FuncAnimation(
            fig, lambda x: x, frame_source,
            blit=False, interval=interval, repeat=True)

    def tick(self):
        """Advance the logical clock by one step."""
        self.timestamp += 1
class Array:
    """Records accesses to a (virtual) array so they can be plotted.

    Each array element stores an attribute tuple
    (timestamp, subgroup, g0, g1, ...) describing its last access.

    NOTE(review): this block was reconstructed from a source whose
    indentation had been stripped; the nesting below is the most plausible
    reading -- confirm against the upstream file.
    """

    def __init__(self, ctx, name, shape, strides, elements_per_row=None):
        assert len(shape) == len(strides)

        # timestamp + subgroup + one slot per group axis
        self.nattributes = 2+len(ctx.gsize)

        if elements_per_row is None:
            if len(shape) > 1:
                # Use the extent of the contiguous (minimum-stride) axis
                # as the plot row length.
                minstride = min(strides)
                for sh_i, st_i in zip(shape, strides):
                    if st_i == minstride:
                        elements_per_row = sh_i
                        break
            else:
                elements_per_row = 256

        self.array = np.zeros((product(shape), self.nattributes,), dtype=np.int32)

        self.ctx = ctx
        self.name = name
        self.shape = shape
        self.strides = strides
        self.elements_per_row = elements_per_row

        ctx.arrays.append(self)

    def __getitem__(self, index):
        """Record an access at *index* (scalars and/or broadcastable arrays)."""
        if not isinstance(index, tuple):
            index = (index,)

        assert len(index) == len(self.shape)

        # (An unused local helper "reshape_ind" existed here in the
        # original; it was never called and has been removed.)

        lin_index = sum(
            ind_i * stride_i
            for ind_i, stride_i in zip(index, self.strides))

        if not isinstance(lin_index, np.ndarray):
            # Scalar index: broadcast it to the full index rank. Index with
            # a tuple (not a list) to avoid numpy's deprecated
            # list-as-index behavior, matching ctx.l()/ctx.g().
            subscript = [np.newaxis] * self.ctx.ind_length
            lin_index = np.array(lin_index)[tuple(subscript)]

        self.array[lin_index, 0] = self.ctx.timestamp
        for i, _glength in enumerate(self.ctx.gsize):
            if lin_index.shape[i] > 1:
                self.array[lin_index, 2+i] = self.ctx.g(i)

        # Linearize the local IDs (last axis fastest) into a flat workitem
        # index, then derive the subgroup number.
        workitem_index = 0
        for i in range(len(self.ctx.lsize))[::-1]:
            workitem_index = (
                workitem_index * self.ctx.lsize[i]
                + self.ctx.l(i))
        subgroup = workitem_index//self.ctx.subgroup_size

        self.array[lin_index, 1] = subgroup

    def __setitem__(self, index, value):
        # Stores are recorded exactly like loads; the value is ignored.
        self.__getitem__(index)

    def get_plot_data(self):
        """Turn the recorded access attributes into an RGB image array."""
        nelements = self.array.shape[0]
        base_shape = (
            div_ceil(nelements, self.elements_per_row),
            self.elements_per_row,)
        shaped_array = np.zeros(
            (*base_shape, self.nattributes),
            dtype=np.float32)
        shaped_array.reshape(-1, self.nattributes)[:nelements] = self.array

        # Fade old accesses exponentially with age.
        modulation = np.exp(
            -self.ctx.decay_constant*(self.ctx.timestamp-shaped_array[:, :, 0]))

        subgroup = shaped_array[:, :, 1]
        if self.ctx.nsubgroups() > 1:
            subgroup = subgroup/(self.ctx.nsubgroups()-1)
        else:
            subgroup.fill(1)

        rgb_array = np.zeros((*base_shape, 3))
        # NOTE(review): both group-axis channels are guarded by
        # len(gsize) > 1; the red (g.0) channel looks like it was meant to
        # use >= 1. Left as in the original -- confirm before changing.
        if len(self.ctx.gsize) > 1:
            # g.0 -> red
            rgb_array[:, :, 0] = shaped_array[:, :, 2]/(self.ctx.gsize[0]-1)
        if len(self.ctx.gsize) > 1:
            # g.1 -> blue
            rgb_array[:, :, 2] = shaped_array[:, :, 3]/(self.ctx.gsize[1]-1)
        # subgroup -> green
        rgb_array[:, :, 1] = subgroup

        return rgb_array*modulation[:, :, np.newaxis]

    def plot(self, ax, **kwargs):
        """Draw the current plot data onto matplotlib axes *ax*."""
        return ax.imshow(
            self.get_plot_data(), interpolation="nearest",
            **kwargs)
def show_example():
    """Animate one of two example kernel access patterns.

    Exactly one of the `if`/`elif` toggles below must be switched to 1 to
    pick a kernel; with both at 0 (as committed) a clear error is raised
    instead of the opaque NameError the original produced.
    """
    n = 2**7
    n16 = div_ceil(n, 16)
    ctx = ArrayAccessPatternContext(gsize=(n16, n16), lsize=(16, 16))
    in0 = Array(ctx, "in0", (n, n), (n, 1))

    if 0:
        # knl a
        i_inner = ctx.l(1)
        i_outer = ctx.g(1)
        k_inner = ctx.l(0)

        def f():
            for k_outer in range(n16):
                in0[i_inner + i_outer*16, k_inner + k_outer*16]
                yield

    elif 0:
        # knl b
        j_inner = ctx.l(0)
        j_outer = ctx.g(0)
        k_inner = ctx.l(1)

        def f():
            for k_outer in range(n16):
                in0[k_inner + k_outer*16, j_inner + j_outer*16]
                yield

    else:
        raise ValueError(
            "no example kernel enabled: flip one of the toggles "
            "('knl a'/'knl b') in show_example to 1")

    ani = ctx.animate(f)

    import matplotlib.pyplot as plt
    if 1:
        plt.show()
    else:
        ani.save("access.mp4")
def show_example_2():
    """Animate a blocked (tiled) row-major traversal of a square array."""
    bsize = 8
    blocks = 3
    ctx = ArrayAccessPatternContext(gsize=(1,), lsize=(1,),
            decay_constant=0.005)
    in0 = Array(ctx, "in0", (blocks*bsize, blocks*bsize), (blocks*bsize, 1))

    def f():
        # Visit the array one bsize x bsize tile at a time, one element
        # (frame) per yield.
        for i_outer in range(blocks):
            for j_outer in range(blocks):
                for i_inner in range(bsize):
                    for j_inner in range(bsize):
                        in0[i_inner + i_outer*bsize, j_inner + j_outer*bsize]
                        yield

    ani = ctx.animate(f, interval=10)

    import matplotlib.pyplot as plt
    if 1:
        plt.show()
    else:
        ani.save("access.mp4")


if __name__ == "__main__":
    show_example_2()
......@@ -3,7 +3,7 @@
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = python ` which sphinx-build`
SPHINXBUILD = python `which sphinx-build`
PAPER =
BUILDDIR = _build
......
# -*- coding: utf-8 -*-
#
# loopy documentation build configuration file, created by
# sphinx-quickstart on Tue Aug 9 13:40:49 2011.
#
# This file is execfile()d with the current directory set to its containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.
import os
from urllib.request import urlopen
#import sys, os
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#sys.path.insert(0, os.path.abspath('.'))
# Shared Sphinx settings maintained in a central repository; fetched and
# exec'd into this module's namespace at docs build time.
# NOTE(review): this downloads and executes remote code -- acceptable for a
# trusted docs build, but worth confirming this is intended.
_conf_url = "https://raw.githubusercontent.com/inducer/sphinxconfig/main/sphinxconfig.py"
with urlopen(_conf_url) as _inf:
exec(compile(_inf.read(), _conf_url, "exec"), globals())
# -- General configuration -----------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#needs_sphinx = '1.0'

# Add any Sphinx extension module names here, as strings. They can be extensions
# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.intersphinx',
    #'sphinx.ext.viewcode',
    'sphinx.ext.doctest',
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# The suffix of source filenames.
source_suffix = '.rst'

# The encoding of source files.
#source_encoding = 'utf-8-sig'

# The master toctree document.
master_doc = 'index'

# General information about the project.
project = 'loopy'
# The merged revisions assigned "copyright" twice with the same value;
# keep a single assignment.
copyright = '2016, Andreas Klöckner'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
# The short X.Y version is read out of loopy's version.py so the docs
# always match the package.
ver_dic = {}
_version_source = "../loopy/version.py"
with open(_version_source) as vpy_file:
    version_py = vpy_file.read()

# Signal version.py (via the environment) that it is being exec'd rather
# than imported, then run it once to populate ver_dic. (The merged
# revisions exec'd the file twice, once without the guard set.)
os.environ["AKPYTHON_EXEC_IMPORT_UNAVAILABLE"] = "1"
exec(compile(version_py, _version_source, "exec"), ver_dic)
version = ".".join(str(x) for x in ver_dic["VERSION"])
# The full version, including alpha/beta/rc tags.
release = ver_dic["VERSION_TEXT"]
del os.environ["AKPYTHON_EXEC_IMPORT_UNAVAILABLE"]
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#language = None

# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
#today = ''
# Else, today_fmt is used as the format for a strftime call.
#today_fmt = '%B %d, %Y'

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# (Previously assigned twice with the same value; consolidated.)
exclude_patterns = ["_build"]

# The reST default role (used for this markup: `text`) to use for all documents.
#default_role = None

# If true, '()' will be appended to :func: etc. cross-reference text.
#add_function_parentheses = True

# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
#add_module_names = True

# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
#show_authors = False

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'

# A list of ignored prefixes for module index sorting.
#modindex_common_prefix = []

# -- Options for HTML output ---------------------------------------------------

html_theme = "alabaster"
# Extra links shown in the alabaster sidebar navigation.
# (The closing brace of the outer dict was missing in the merged source.)
html_theme_options = {
    "extra_nav_links": {
        "🚀 Github": "https://github.com/inducer/loopy",
        "💾 Download Releases": "https://pypi.python.org/pypi/loo.py",
    },
}
# Example configuration for intersphinx: refer to the Python standard library.
# NOTE(review): a second, legacy-format intersphinx_mapping assignment
# appears later in this (diff-merged) file and would clobber this one if
# left in place -- confirm only one survives.
intersphinx_mapping = {
"python": ("https://docs.python.org/3", None),
"numpy": ("https://numpy.org/doc/stable/", None),
"pytools": ("https://documen.tician.de/pytools", None),
"islpy": ("https://documen.tician.de/islpy", None),
"pyopencl": ("https://documen.tician.de/pyopencl", None),
"cgen": ("https://documen.tician.de/cgen", None),
"pymbolic": ("https://documen.tician.de/pymbolic", None),
"constantdict": ("https://matthiasdiener.github.io/constantdict/", None),
}
# Sidebar templates rendered on every page (alabaster defaults).
html_sidebars = {
'**': [
'about.html',
'navigation.html',
'relations.html',
'searchbox.html',
]
}
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#html_theme_options = {}
# Add any paths that contain custom themes here, relative to this directory.
#html_theme_path = []
# The name for this set of Sphinx documents. If None, it defaults to
# "<project> v<release> documentation".
#html_title = None
# A shorter title for the navigation bar. Default is the same as html_title.
#html_short_title = None
# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
#html_logo = None
# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
#html_favicon = None
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
#html_last_updated_fmt = '%b %d, %Y'
# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
#html_use_smartypants = True
# Fail the build on broken cross-references (see nitpick_ignore_regex below
# in the original file for the known exceptions).
nitpicky = True
# Custom sidebar templates, maps document names to template names.
#html_sidebars = {}
# Cross-reference targets known not to resolve; listed so the nitpicky
# build does not fail on them. (In the merged source, unrelated statements
# had been spliced into the middle of this list, leaving it unclosed.)
nitpick_ignore_regex = [
    ["py:class", r"typing_extensions\.(.+)"],
    ["py:class", r"numpy\.u?int[0-9]+"],
    ["py:class", r"numpy\.float[0-9]+"],
    ["py:class", r"numpy\.complex[0-9]+"],
    # Reference not found from "<unknown>"? I'm not even sure where to look.
    ["py:class", r"ExpressionNode"],
    # Type aliases
    ["py:class", r"InameStr"],
    ["py:class", r"ConcreteCallablesTable"],
    ["py:class", r"LoopNestTree"],
    ["py:class", r"LoopTree"],
    ["py:class", r"ToLoopyTypeConvertible"],
    ["py:class", r"ToStackMatchConvertible"],
]

# Additional templates that should be rendered to pages, maps page names to
# template names.
#html_additional_pages = {}

# If false, no module index is generated.
#html_domain_indices = True

# If false, no index is generated.
#html_use_index = True

# If true, the index is split into individual pages for each letter.
#html_split_index = False

# If true, links to the reST sources are added to the pages.
html_show_sourcelink = False

# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
#html_show_sphinx = True

# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
#html_show_copyright = True

# If true, an OpenSearch description file will be output, and all pages will
# contain a <link> tag referring to it. The value of this option must be the
# base URL from which the finished HTML is served.
#html_use_opensearch = ''

# This is the file name suffix for HTML files (e.g. ".xhtml").
#html_file_suffix = None

# Output file base name for HTML help builder.
htmlhelp_basename = 'loopydoc'

# -- Options for LaTeX output --------------------------------------------------

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title, author, documentclass [howto/manual]).
latex_documents = [
    ('index', 'loopy.tex', 'loopy Documentation',
     'Andreas Kloeckner', 'manual'),
]

# -- Options for manual page output --------------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
    ('index', 'loopy', 'loopy Documentation',
     ['Andreas Kloeckner'], 1)
]

# NOTE(review): a legacy URL-keyed intersphinx_mapping assignment appeared
# here in the old revision; it is omitted so it does not clobber the modern
# mapping defined earlier in this file.

autoclass_content = "class"
......@@ -18,43 +18,20 @@ When you run this script, the following kernel is generated, compiled, and execu
(See the full example for how to print the generated code.)
Want to try out loopy?
----------------------
There's no need to go through :ref:`installation` if you'd just like to get a
feel for what loopy is. Instead, you may
`download a self-contained Linux binary <https://gitlab.tiker.net/inducer/loopy/builds/1989/artifacts/browse/build-helpers/>`_.
This is purposefully built on an ancient Linux distribution, so it should work
on most versions of Linux that are currently out there.
Once you have the binary, do the following::
chmod +x ./loopy-centos6
./loopy-centos6 --target=opencl hello-loopy-lp.py
./loopy-centos6 --target=cuda hello-loopy-lp.py
./loopy-centos6 --target=ispc hello-loopy-lp.py
Grab the example here: :download:`examples/python/hello-loopy.py <../examples/python/hello-loopy-lp.py>`.
You may also download the most recent version by going to the `list of builds
<https://gitlab.tiker.net/inducer/loopy/builds>`_, clicking on the newest one
of type "CentOS binary", clicking on "Browse" under "Build Artifacts", then
navigating to "build-helpers", and downloading the binary from there.
.. _static-binary:
Places on the web related to Loopy
----------------------------------
* `Python package index <http://pypi.python.org/pypi/loo.py>`_ (download releases) Note the extra '.' in the PyPI identifier!
* `Github <http://github.com/inducer/loopy>`_ (get latest source code, file bugs)
* `Wiki <http://wiki.tiker.net/Loopy>`_ (read installation tips, get examples, read FAQ)
* `Homepage <http://mathema.tician.de/software/loopy>`_
* `Python package index <https://pypi.org/project/loopy>`_ (download releases)
* `Github <https://github.com/inducer/loopy>`_ (get latest source code, file bugs)
* `Homepage <https://mathema.tician.de/software/loopy>`_
Table of Contents
-----------------
If you're only just learning about loopy, consider the following `paper
<http://arxiv.org/abs/1405.7470>`_ on loo.py that may serve as a good
<https://arxiv.org/abs/1405.7470>`_ on loopy that may serve as a good
introduction.
Please check :ref:`installation` to get started.
......@@ -65,9 +42,14 @@ Please check :ref:`installation` to get started.
tutorial
ref_creation
ref_kernel
ref_translation_unit
ref_transform
ref_call
ref_other
misc
ref_internals
🚀 Github <https://github.com/inducer/loopy>
💾 Download Releases <https://pypi.org/project/loopy>
Indices and tables
==================
......
......@@ -3,11 +3,21 @@
Installation
============
This command should install :mod:`loopy`::
Option 0: Static Binary
-----------------------
If you would just like to experiment with :mod:`loopy`'s code transformation
abilities, the easiest way to get loopy is to download a statically-linked
Linux binary.
pip install loo.py
See :ref:`static-binary` for details.
(Note the extra "."!)
Option 1: From Source, no PyOpenCL integration
-----------------------------------------------
This command should install :mod:`loopy`::
pip install loopy
You may need to run this with :command:`sudo`.
If you don't already have `pip <https://pypi.python.org/pypi/pip>`_,
......@@ -17,19 +27,66 @@ run this beforehand::
python get-pip.py
For a more manual installation, `download the source
<http://pypi.python.org/pypi/loo.py>`_, unpack it, and say::
<https://pypi.org/project/loopy>`_, unpack it, and say::
python setup.py install
You may also clone its git repository::
git clone --recursive git://github.com/inducer/loopy
git clone --recursive http://git.tiker.net/trees/loopy.git
git clone --recursive https://github.com/inducer/loopy.git
Option 2: From Conda Forge, with PyOpenCL integration
-----------------------------------------------------
This set of instructions is intended for computers running
64-bit Linux or macOS:
#. Make sure your system has the basics to build software.
On Debian derivatives (Ubuntu and many more),
installing ``build-essential`` should do the trick.
Everywhere else, just making sure you have the ``g++`` package should be
enough.
#. Install `miniforge <https://github.com/conda-forge/miniforge>`_.
#. ``export CONDA=/WHERE/YOU/INSTALLED/miniforge3``
If you accepted the default location, this should work:
``export CONDA=$HOME/miniforge3``
#. ``$CONDA/bin/conda create -n dev``
#. ``source $CONDA/bin/activate dev``
#. ``conda install git pip pocl islpy pyopencl`` (Linux)
or
``conda install osx-pocl-opencl git pip pocl islpy pyopencl`` (OS X)
#. Type the following command::
pip install git+https://github.com/inducer/loopy
Next time you want to use :mod:`loopy`, just run the following command::
source /WHERE/YOU/INSTALLED/miniforge3/bin/activate dev
You may also like to add this to a startup file (like :file:`$HOME/.bashrc`) or create an alias for it.
See the `PyOpenCL installation instructions
<https://documen.tician.de/pyopencl/misc.html#installation>`_ for options
regarding OpenCL drivers.
User-visible Changes
====================
Version 2016.2
See also :ref:`language-versioning`.
Version 2018.1
--------------
.. note::
......@@ -57,7 +114,7 @@ Licensing
Loopy is licensed to you under the MIT/X Consortium license:
Copyright (c) 2009-13 Andreas Klöckner and Contributors.
Copyright (c) 2009-17 Andreas Klöckner and Contributors.
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
......@@ -101,7 +158,7 @@ In the meantime, you can generate code simply by saying::
print(cg_result.host_code())
print(cg_result.device_code())
Additionally, for C-based languages, header defintions are available via::
Additionally, for C-based languages, header definitions are available via::
loopy.generate_header(knl)
......@@ -131,7 +188,7 @@ source of examples. Here are some links:
Here's a more complicated example of a loopy code:
.. literalinclude:: ../examples/python/find-centers.py
:language: c
:language: python
This example is included in the :mod:`loopy` distribution as
:download:`examples/python/find-centers.py <../examples/python/find-centers.py>`.
......@@ -193,7 +250,7 @@ This list is always growing, but here are a few pointers:
* Precompute subexpressions:
Use a :ref:`substitution-rule` to assign a name to a subexpression,
using may be :func:`loopy.assignment_to_subst` or :func:`extract_subst`.
using, for example, :func:`loopy.assignment_to_subst` or :func:`loopy.extract_subst`.
Then use :func:`loopy.precompute` to create an (array or scalar)
temporary with precomputed values.
......@@ -232,12 +289,12 @@ This list is always growing, but here are a few pointers:
Use :func:`loopy.tag_inames` with the ``"vec"`` iname tag.
Note that the corresponding axis of an array must
also be tagged using the ``"vec"`` array axis tag
(using :func:`tag_array_axes`) in order for vector code to be
(using :func:`loopy.tag_array_axes`) in order for vector code to be
generated.
Vectorized loops (and array axes) must have a fixed size. (See either
:func:`split_iname` or :func:`fix_parameters` along with
:func:`split_array_axis`.)
:func:`loopy.split_iname` or :func:`loopy.fix_parameters` along with
:func:`loopy.split_array_axis`.)
* Reuse of Temporary Storage
......@@ -246,7 +303,7 @@ This list is always growing, but here are a few pointers:
* SoA $\leftrightarrow$ AoS
Use :func:`tag_array_axes` with the ``"sep"`` array axis tag
Use :func:`loopy.tag_array_axes` with the ``"sep"`` array axis tag
to generate separate arrays for each entry of a short, fixed-length
array axis.
......@@ -257,7 +314,7 @@ This list is always growing, but here are a few pointers:
Use :func:`loopy.tag_inames` with the ``"ilp"`` tag.
ILP loops must have a fixed size. (See either
:func:`split_iname` or :func:`fix_parameters`.)
:func:`loopy.split_iname` or :func:`loopy.fix_parameters`.)
* Type inference
......@@ -275,12 +332,69 @@ This list is always growing, but here are a few pointers:
* Interface with your own library functions
Use :func:`loopy.register_function_manglers`.
See :ref:`func-interface` for details.
* Loop collapse
Use :func:`loopy.join_inames`.
In what sense does Loopy support vectorization?
-----------------------------------------------
There are really two ways in which the OpenCL/CUDA model of computation exposes
vectorization:
* "SIMT": The user writes scalar program instances and either the compiler or
the hardware joins the individual program instances into vectors of a
hardware-given length for execution.
* "Short vectors": This type of vectorization is based on vector types,
e.g. ``float4``, which support arithmetic with implicit vector semantics
as well as a number of 'intrinsic' functions.
Loopy supports both. The first one, SIMT, is accessible by tagging inames with,
e.g., ``l.0``. Accessing the second one requires using both execution- and
data-reshaping capabilities in loopy. To start with, you need an array that
has an axis with the length of the desired vector. If that's not yet available,
you may use :func:`loopy.split_array_axis` to produce one. Similarly, you need
an iname whose bounds match those of the desired vector length. Again, if you
don't already have one, :func:`loopy.split_iname` will easily produce one.
Lastly, both the array axis and the iname need the implementation tag ``"vec"``.
Here is an example of this machinery in action:
.. literalinclude:: ../examples/python/vector-types.py
:language: python
Note how the example slices off the last 'slab' of iterations to ensure that
the bulk of the iteration does not require conditionals which would prevent
successful vectorization. This generates the following code:
.. literalinclude:: ../examples/python/vector-types.cl
:language: c
What is the story with language versioning?
-------------------------------------------
The idea is to keep supporting multiple versions at a time. There's a
tension in loopy between the need to build code that keeps working
unchanged for some number of years, and needing the language to
evolve--not just as a research vehicle, but also to enable to respond
to emerging needs in applications and hardware.
The idea is not to support all versions indefinitely, merely to allow
users to upgrade on their own schedule on the scale of a couple years.
Warnings about needing to upgrade would get noisier as a version nears
deprecation. In a way, it is intended to be a version of Python's
`__future__` flags, which IMO have served the language tremendously
well.
One can also obtain the current language version programmatically:
:data:`loopy.MOST_RECENT_LANGUAGE_VERSION`.
But pinning your code to that would mean choosing to not use the
potentially valuable guarantee to keep existing code working unchanged
for a while. Instead, it might be wiser to just grab the version of the
language current at the time of writing the code.
Uh-oh. I got a scheduling error. Any hints?
-------------------------------------------
......@@ -325,7 +439,7 @@ If you use loopy for your work and find its approach helpful, please
consider citing the following article.
A. Klöckner. `Loo.py: transformation-based code generation for GPUs and
CPUs <http://arxiv.org/abs/1405.7470>`_. Proceedings of ARRAY '14: ACM
CPUs <https://arxiv.org/abs/1405.7470>`_. Proceedings of ARRAY '14: ACM
SIGPLAN Workshop on Libraries, Languages, and Compilers for Array
Programming. Edinburgh, Scotland.
......@@ -342,14 +456,33 @@ Here's a Bibtex entry for your convenience::
doi = "{10.1145/2627373.2627387}",
}
Getting help
============
Email the friendly folks on the `loopy mailing list <https://lists.tiker.net/listinfo/loopy>`_.
Acknowledgments
===============
Andreas Klöckner's work on :mod:`loopy` was supported in part by
Work on loopy was supported in part by
- the Department of Energy, National Nuclear Security Administration, under Award Number DE-NA0003963,
- the US Navy ONR, under grant number N00014-14-1-0117, and
- the US National Science Foundation under grant numbers DMS-1418961, CCF-1524433, DMS-1654756, SHF-1911019, and OAC-1931577.
AK also gratefully acknowledges a hardware gift from Nvidia Corporation.
The views and opinions expressed herein do not necessarily reflect those of the funding agencies.
Cross-References to Other Documentation
=======================================
.. currentmodule:: numpy
.. class:: int16
See :class:`numpy.generic`.
* US Navy ONR grant number N00014-14-1-0117
* the US National Science Foundation under grant numbers DMS-1418961 and CCF-1524433.
.. class:: complex128
AK also gratefully acknowledges a hardware gift from Nvidia Corporation. The
views and opinions expressed herein do not necessarily reflect those of the
funding agencies.
See :class:`numpy.generic`.
.. currentmodule:: loopy
.. _func-interface:
Function Interface
==================
Resolving and specialization
----------------------------
In :mod:`loopy`, a :class:`loopy.TranslationUnit` is a collection of callables
and entrypoints. Callables are of type
:class:`loopy.kernel.function_interface.InKernelCallable`. Functions start life
as simple :class:`pymbolic.primitives.Call` nodes. Call resolution turns the function
identifiers in those calls into :class:`~loopy.symbolic.ResolvedFunction` objects.
Each resolved function has an entry in :attr:`TranslationUnit.callables_table`.
The process of realizing a function as a
:class:`~loopy.kernel.function_interface.InKernelCallable` is referred to as
resolving.
During code generation for a :class:`~loopy.TranslationUnit`, a (resolved) callable
is *specialized* depending on the types and shapes of the arguments passed at a
call site. For example, a call to ``sin(x)`` in :mod:`loopy` is type-generic to
begin with, but it later specialized to either ``sinf``, ``sin`` or ``sinl``
depending on the type of its argument ``x``. A callable's behavior during type
or shape specialization is encoded via
:meth:`~loopy.InKernelCallable.with_types` and
:meth:`~loopy.InKernelCallable.with_descrs`.
Registering callables
---------------------
A user can *register* callables within a :class:`~loopy.TranslationUnit` to
allow loopy to resolve calls not pre-defined in :mod:`loopy`. In :mod:`loopy`,
we typically aim to expose all the standard math functions defined for
a :class:`~loopy.target.TargetBase`. Other foreign functions could be invoked by
*registering* them.
An example demonstrating registering a ``CBlasGemv`` as a loopy callable:
.. literalinclude:: ../examples/python/call-external.py
Call Instruction for a kernel call
----------------------------------
At a call-site involving a call to a :class:`loopy.LoopKernel`, the arguments to
the call must be ordered by the order of input arguments of the callee kernel.
Similarly, the assignees must be ordered by the order of callee kernel's output
arguments. Since a :class:`~loopy.kernel.data.KernelArgument` can be both an
input and an output, such arguments would be a part of the call instruction's
assignees as well as the call expression node's parameters.
Entry points
------------
Only callables in :attr:`loopy.TranslationUnit.entrypoints` can be called from
the outside. All other callables are only visible from within the translation
unit, similar to C's ``static`` functions.
Reference
---------
.. automodule:: loopy.kernel.function_interface
.. module:: loopy
.. moduleauthor:: Andreas Kloeckner <inform@tiker.net>
.. currentmodule:: loopy
.. _creating-kernels:
Reference: Creating Kernels
......@@ -30,4 +28,11 @@ To Copy between Data Formats
.. autofunction:: make_copy_kernel
Einstein summation convention kernels
-------------------------------------
.. autofunction:: make_einsum
.. automodule:: loopy.version
.. vim: tw=75:spell:fdm=marker
Reference: Documentation for Internal API
=========================================
Targets
-------
See also :ref:`targets`.
.. automodule:: loopy.target.c
Symbolic
--------
See also :ref:`expression-syntax`.
.. automodule:: loopy.symbolic
Types
-----
DTypes of variables in a :class:`loopy.LoopKernel` must be picklable, so in
the codegen pipeline user-provided types are converted to
:class:`loopy.types.LoopyType`.
.. automodule:: loopy.types
Type inference
^^^^^^^^^^^^^^
.. automodule:: loopy.type_inference
Codegen
-------
.. automodule:: loopy.codegen
Reduction Operation
-------------------
.. automodule:: loopy.library.reduction
Iname Tags
----------
.. automodule:: loopy.kernel.data
Array
-----
.. automodule:: loopy.kernel.array
Checks
------
.. automodule:: loopy.check
Schedule
--------
.. automodule:: loopy.schedule
.. automodule:: loopy.schedule.tools
.. automodule:: loopy.schedule.tree
......@@ -3,6 +3,72 @@
Reference: Loopy's Model of a Kernel
====================================
What Types of Computation can a Loopy Program Express?
------------------------------------------------------
Loopy programs consist of an a-priori unordered set of statements, operating
on :math:`n`-dimensional array variables.
Arrays consist of "plain old data" and structures thereof, as describable
by a :class:`numpy.dtype`. The n-dimensional shape of these arrays is
given by a tuple of expressions at most affine in parameters that are
fixed for the duration of program execution.
Each array variable in the program is either an argument or a temporary
variable. A temporary variable is only live within the program, while
argument variables are accessible outside the program and constitute the
program's inputs and outputs.
A statement (still called 'instruction' in some places, cf.
:class:`loopy.InstructionBase`) encodes an assignment to an entry of an array.
The right-hand side of an assignment consists of an expression that may
consist of arithmetic operations and calls to functions.
If the outermost operation of the RHS expression is a function call,
the RHS value may be a tuple, and multiple (still scalar) arrays appear
as LHS values. (This is the only sense in which tuple types are supported.)
Each statement is parameterized by zero or more loop variables ("inames").
A statement is executed once for each integer point defined by the domain
forest for the iname tuple given for that statement
(:attr:`loopy.InstructionBase.within_inames`). Each execution of a
statement (with specific values of the inames) is called a *statement
instance*. Dependencies between these instances as well as instances of
other statements are encoded in the program representation and specify permissible
execution orderings. (The semantics of the dependencies are `being
sharpened <https://github.com/inducer/loopy/pull/168>`__.) Assignments
(comprising the evaluation of the RHS and the assignment to the LHS) may
be specified to be atomic.
The basic building blocks of the domain forest are sets given as
conjunctions of equalities and inequalities of quasi-affine expressions on
integer tuples, called domains, and represented as instances of
:class:`islpy.BasicSet`. The entries of each integer tuple are
either *parameters* or *inames*. Each domain may optionally have a *parent
domain*. Parameters of parent-less domains are given by value arguments
supplied to the program that will remain unchanged during program
execution. Parameters of domains with parents may be
- run-time-constant value arguments to the program, or
- inames from parent domains, or
- scalar, integer temporary variables that are written by statements
with iteration domains controlled by a parent domain.
For each tuple of concrete parameter values, the set of iname tuples must be
finite. Each iname is defined by exactly one domain.
For a tuple of inames, the domain forest defines an iteration domain
by finding all the domains defining the inames involved, along with their
parent domains. The resulting tree of domains may contain multiple roots,
but no branches. The iteration domain is then constructed by intersecting
these domains and constructing the projection of that set onto the space
given by the required iname tuple. Observe that, via the parent-child
domain mechanism, imperfectly-nested and data-dependent loops become
expressible.
The set of functions callable from the language is predefined by the system.
Additional functions may be defined by the user by registering them. It is
not currently possible to define functions from within Loopy, however work
is progressing on permitting this. Even once this is allowed, recursion
will not be permitted.
.. _domain-tree:
Loop Domain Forest
......@@ -130,9 +196,12 @@ Iname Implementation Tags
Tag Meaning
=============================== ====================================================
``None`` | ``"for"`` Sequential loop
``"ord"`` Forced-order sequential loop
``"l.N"`` Local (intra-group) axis N ("local")
``"g.N"`` Group-number axis N ("group")
``"unr"`` Unroll
``"unr_hint"`` Unroll using compiler directives
``"unr_hint.N"`` Unroll at most N times using compiler directives
``"ilp"`` | ``"ilp.unr"`` Unroll using instruction-level parallelism
``"ilp.seq"`` Realize parallel iname as innermost loop
``"like.INAME"`` Can be used when tagging inames to tag like another
......@@ -150,6 +219,42 @@ Tag Meaning
.. }}}
Identifiers
-----------
Reserved Identifiers
^^^^^^^^^^^^^^^^^^^^
The identifier prefix ``_lp_`` is reserved for internal usage; when creating
*inames*, *argument names*, *temporary variable names*, *substitution rule
names*, *instruction IDs*, and other identifiers, users should *not* use names
beginning with ``_lp_``. This prefix is used for identifiers created
internally when operating on Loopy's kernel IR. For Loopy developers, further
information on name prefixes used within submodules is below.
Identifier Registry
^^^^^^^^^^^^^^^^^^^
Functionality in :mod:`loopy` *must* use identifiers beginning with ``_lp_`` for
all internally-created identifiers. Additionally, each name beginning with
``_lp_`` must start with one of the reserved prefixes below. New prefixes may
be registered by adding them to the table below. New prefixes may not themselves
be the prefix of an existing prefix.
**Reserved Identifier Prefixes**
======================= ==================================
Reserved Prefix Usage (module or purpose)
======================= ==================================
``_lp_linchk_`` ``loopy.linearization.checker``
======================= ==================================
.. note::
Existing Loopy code may not yet fully satisfy these naming requirements.
Name changes are in progress, and prefixes will be added to this registry
as they are created.
.. _instructions:
Instructions
......@@ -157,6 +262,7 @@ Instructions
.. {{{
.. autoclass:: HappensAfter
.. autoclass:: InstructionBase
.. _assignments:
......@@ -326,15 +432,30 @@ Expressions
Loopy's expressions are a slight superset of the expressions supported by
:mod:`pymbolic`.
* ``if``
* ``elif`` (following an ``if``)
* ``else`` (following an ``if`` / ``elif``)
* ``if(cond, then, else_)``
* ``a[[ 8*i + j ]]``: Linear subscripts.
See :class:`loopy.symbolic.LinearSubscript`.
* ``reductions``
* duplication of reduction inames
See :class:`loopy.symbolic.Reduction`.
* ``reduce`` vs ``simul_reduce``
* complex-valued arithmetic
* tagging of array access and substitution rule use ("$")
See :class:`loopy.symbolic.TaggedVariable`.
* ``indexof``, ``indexof_vec``
* ``cast(type, value)``: No parse syntax currently.
See :class:`loopy.symbolic.TypeCast`.
* If constants in expressions are subclasses of :class:`numpy.generic`,
generated code will contain literals of exactly that type, making them
*explicitly typed*. Constants given as Python types such as :class:`int`,
:class:`float` or :class:`complex` are called *implicitly* typed and
adapt to the type of the expected result.
TODO: Functions
TODO: Reductions
......@@ -342,6 +463,7 @@ TODO: Reductions
Function Call Instructions
^^^^^^^^^^^^^^^^^^^^^^^^^^
.. automodule:: loopy
.. autoclass:: CallInstruction
C Block Instructions
......@@ -352,12 +474,14 @@ C Block Instructions
Atomic Operations
^^^^^^^^^^^^^^^^^
.. autoclass:: memory_ordering
.. autoclass:: MemoryOrdering
.. autoclass:: memory_scope
.. autoclass:: MemoryScope
.. autoclass:: VarAtomicity
.. autoclass:: OrderedAtomic
.. autoclass:: AtomicInit
.. autoclass:: AtomicUpdate
......@@ -372,6 +496,12 @@ Barrier Instructions
.. autoclass:: BarrierInstruction
Instruction Tags
^^^^^^^^^^^^^^^^
.. autoclass:: LegacyStringInstructionTag
.. autoclass:: UseStreamingStoreTag
.. }}}
Data: Arguments and Temporaries
......@@ -388,24 +518,14 @@ Arguments
^^^^^^^^^
.. autoclass:: KernelArgument
:members:
:undoc-members:
.. autoclass:: ValueArg
:members:
:undoc-members:
.. autoclass:: GlobalArg
:members:
:undoc-members:
.. autoclass:: ArrayArg
.. autoclass:: ConstantArg
:members:
:undoc-members:
.. autoclass:: ImageArg
:members:
:undoc-members:
.. _temporaries:
......@@ -415,11 +535,9 @@ Temporary Variables
Temporary variables model OpenCL's ``private`` and ``local`` address spaces. Both
have the lifetime of a kernel invocation.
.. autoclass:: temp_var_scope
.. autoclass:: AddressSpace
.. autoclass:: TemporaryVariable
:members:
:undoc-members:
.. _types:
......@@ -538,10 +656,10 @@ Helper values
.. {{{
.. autoclass:: auto
.. autoclass:: UniqueName
.. autoclass:: Optional
.. }}}
Libraries: Extending and Interfacing with External Functionality
......@@ -573,10 +691,18 @@ The Kernel Object
Do not create :class:`LoopKernel` objects directly. Instead, refer to
:ref:`creating-kernels`.
.. autoclass:: LoopKernel
.. automodule:: loopy.kernel
Implementation Details: The Base Array
--------------------------------------
All array-like data in :mod:`loopy` (such as :class:`ArrayArg` and
:class:`TemporaryVariable`) derive from single, shared base array type,
described next.
.. currentmodule:: loopy.kernel.array
.. autoclass:: ArrayBase
.. autoclass:: kernel_state
:members:
:undoc-members:
.. vim: tw=75:spell:fdm=marker
Reference: Other Functionality
==============================
Auxiliary Data Types
--------------------
.. automodule:: loopy.typing
Obtaining Kernel Performance Statistics
---------------------------------------
......@@ -9,6 +14,24 @@ Obtaining Kernel Performance Statistics
Controlling caching
-------------------
.. envvar:: LOOPY_NO_CACHE
.. envvar:: CG_NO_CACHE
By default, loopy will cache (on disk) the result of various stages
of code generation to speed up future code generation of the same kernel.
By setting the environment variables :envvar:`LOOPY_NO_CACHE` or
:envvar:`CG_NO_CACHE` to any
string that :func:`pytools.strtobool` evaluates as ``True``, this caching
is suppressed.
.. envvar:: LOOPY_ABORT_ON_CACHE_MISS
If set to a string that :func:`pytools.strtobool` evaluates as ``True``,
loopy will raise an exception if a cache miss occurs. This can be useful
for debugging cache-related issues. For example, it can be used to automatically test whether caching is successful for a particular code, by setting this variable to ``True`` and re-running the code.
.. autofunction:: set_caching_enabled
.. autoclass:: CacheMode
......@@ -16,10 +39,11 @@ Controlling caching
Running Kernels
---------------
In addition to simply calling kernels using :class:`LoopKernel.__call__`,
the following underlying functionality may be used:
Use :class:`TranslationUnit.executor` to bind a translation unit
to execution resources, and then use :class:`ExecutorBase.__call__`
to invoke the kernel.
.. autoclass:: CompiledKernel
.. autoclass:: ExecutorBase
Automatic Testing
-----------------
......@@ -44,3 +68,4 @@ following always works::
.. autofunction:: show_dependency_graph
.. autofunction:: t_unit_to_python
......@@ -50,6 +50,10 @@ Influencing data access
.. autofunction:: set_array_axis_names
.. automodule:: loopy.transform.privatize
.. autofunction:: allocate_temporaries_for_base_storage
Padding Data
------------
......@@ -74,6 +78,8 @@ Manipulating Instructions
.. autofunction:: add_nosync
.. autofunction:: add_barrier
Registering Library Routines
----------------------------
......@@ -83,8 +89,6 @@ Registering Library Routines
.. autofunction:: register_symbol_manglers
.. autofunction:: register_function_manglers
Modifying Arguments
-------------------
......@@ -98,7 +102,7 @@ Modifying Arguments
.. autofunction:: rename_argument
.. autofunction:: set_temporary_scope
.. autofunction:: set_temporary_address_space
Creating Batches of Operations
------------------------------
......@@ -114,7 +118,7 @@ Finishing up
.. autofunction:: generate_loop_schedules
.. autofunction:: get_one_scheduled_kernel
.. autofunction:: get_one_linearized_kernel
.. autofunction:: save_and_reload_temporaries
......@@ -140,4 +144,3 @@ TODO: Matching instruction tags
.. automodule:: loopy.match
.. vim: tw=75:spell
.. currentmodule:: loopy
Translation Units
=================
.. automodule:: loopy.translation_unit
......@@ -25,9 +25,12 @@ import a few modules and set up a :class:`pyopencl.Context` and a
>>> import loopy as lp
>>> lp.set_caching_enabled(False)
>>> from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2
>>> from warnings import filterwarnings, catch_warnings
>>> filterwarnings('error', category=lp.LoopyWarning)
>>> from loopy.diagnostic import DirectCallUncachedWarning
>>> filterwarnings('ignore', category=DirectCallUncachedWarning)
>>> ctx = cl.create_some_context(interactive=False)
>>> queue = cl.CommandQueue(ctx)
......@@ -111,18 +114,20 @@ always see loopy's view of a kernel by printing it.
KERNEL: loopy_kernel
---------------------------------------------------------------------------
ARGUMENTS:
a: GlobalArg, type: <runtime>, shape: (n), dim_tags: (N0:stride:1)
n: ValueArg, type: <runtime>
out: GlobalArg, type: <runtime>, shape: (n), dim_tags: (N0:stride:1)
a: type: <auto/runtime>, shape: (n), dim_tags: (N0:stride:1) in aspace: global
n: ValueArg, type: <auto/runtime>
out: type: <auto/runtime>, shape: (n), dim_tags: (N0:stride:1) out aspace: global
---------------------------------------------------------------------------
DOMAINS:
[n] -> { [i] : 0 <= i < n }
---------------------------------------------------------------------------
INAME IMPLEMENTATION TAGS:
INAME TAGS:
i: None
---------------------------------------------------------------------------
INSTRUCTIONS:
[i] out[i] <- 2*a[i] # insn
for i
out[i] = 2*a[i] {id=insn}
end i
---------------------------------------------------------------------------
You'll likely have noticed that there's quite a bit more information here
......@@ -152,7 +157,7 @@ following:
See :ref:`specifying-arguments`.
* Loopy has not determined the type of ``a`` and ``out``. The data type is
given as ``<runtime>``, which means that these types will be determined
given as ``<auto/runtime>``, which means that these types will be determined
by the data passed in when the kernel is invoked. Loopy generates (and
caches!) a copy of the kernel for each combination of types passed in.
......@@ -176,11 +181,11 @@ for good measure.
>>> assert (out.get() == (2*x_vec_dev).get()).all()
We can have loopy print the OpenCL kernel it generated
by passing :attr:`loopy.Options.write_cl`.
by passing :attr:`loopy.Options.write_code`.
.. doctest::
>>> knl = lp.set_options(knl, "write_cl")
>>> knl = lp.set_options(knl, write_code=True)
>>> evt, (out,) = knl(queue, a=x_vec_dev)
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))
......@@ -205,6 +210,7 @@ For convenience, loopy kernels also directly accept :mod:`numpy` arrays:
.. doctest::
>>> knl = lp.set_options(knl, write_code=False)
>>> evt, (out,) = knl(queue, a=x_vec_host)
>>> assert (out == (2*x_vec_host)).all()
......@@ -224,37 +230,46 @@ inspect that code, too, using :attr:`loopy.Options.write_wrapper`:
.. doctest::
>>> knl = lp.set_options(knl, write_wrapper=True, write_cl=False)
>>> knl = lp.set_options(knl, write_wrapper=True, write_code=False)
>>> evt, (out,) = knl(queue, a=x_vec_host)
from __future__ import division
import numpy as _lpy_np
...
def invoke_loopy_kernel_loopy_kernel(_lpy_cl_kernels, queue, allocator=None, wait_for=None, out_host=None, a=None, n=None, out=None):
if allocator is None:
allocator = _lpy_cl_tools.DeferredAllocator(queue.context)
<BLANKLINE>
# {{{ find integer arguments from shapes
# {{{ find integer arguments from array data
<BLANKLINE>
if n is None:
if a is not None:
n = a.shape[0]
<BLANKLINE>
elif out is not None:
n = out.shape[0]
<BLANKLINE>
# }}}
...
You can also pass options to the OpenCL implementation
by passing :attr:`loopy.Options.build_options`.
.. doctest::
>>> knl = lp.set_options(knl, build_options=["-cl-mad-enable"])
Generating code
~~~~~~~~~~~~~~~
Instead of using loopy to run the code it generates, you can also just use
loopy as a code generator and take care of executing the generated kernels
yourself. In this case, make sure loopy knows about all types, and then
call :func:`loopy.generate_code`:
call :func:`loopy.generate_code_v2`:
.. doctest::
>>> typed_knl = lp.add_dtypes(knl, dict(a=np.float32))
>>> code, _ = lp.generate_code(typed_knl)
>>> code = lp.generate_code_v2(typed_knl).device_code()
>>> print(code)
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))
......@@ -295,7 +310,8 @@ argument:
... """
... out[j,i] = a[i,j]
... out[i,j] = 2*out[i,j]
... """)
... """,
... [lp.GlobalArg("out", shape=lp.auto, is_input=False), ...])
loopy's programming model is completely *unordered* by default. This means
that:
......@@ -322,7 +338,9 @@ an explicit dependency:
... """
... out[j,i] = a[i,j] {id=transpose}
... out[i,j] = 2*out[i,j] {dep=transpose}
... """)
... """,
... [lp.GlobalArg("out", shape=lp.auto, is_input=False), ...],
... name="transpose_and_dbl")
``{id=transpose}`` assigns the identifier *transpose* to the first
instruction, and ``{dep=transpose}`` declares a dependency of the second
......@@ -331,9 +349,9 @@ that these dependencies show up there, too:
.. doctest::
>>> print(knl.stringify(with_dependencies=True))
>>> print(knl["transpose_and_dbl"].stringify(with_dependencies=True))
---------------------------------------------------------------------------
KERNEL: loopy_kernel
KERNEL: transpose_and_dbl
---------------------------------------------------------------------------
...
---------------------------------------------------------------------------
......@@ -352,7 +370,7 @@ loopy can also show an instruction dependency graph, using
Dependencies are shown as arrows from prerequisite to dependent in the
graph. This functionality requires the open-source `graphviz
<http://graphviz.org>`_ graph drawing tools to be installed. The generated
<https://graphviz.org>`_ graph drawing tools to be installed. The generated
graph will open in a browser window.
Since manually notating lots of dependencies is cumbersome, loopy has
......@@ -377,13 +395,13 @@ Let us take a look at the generated code for the above kernel:
.. doctest::
>>> knl = lp.set_options(knl, "write_cl")
>>> knl = lp.set_options(knl, write_code=True)
>>> knl = lp.prioritize_loops(knl, "i,j")
>>> evt, (out,) = knl(queue, a=a_mat_dev)
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))
<BLANKLINE>
__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out)
__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) transpose_and_dbl(__global float *__restrict__ out, __global float const *__restrict__ a, int const n)
{
for (int i = 0; i <= -1 + n; ++i)
for (int j = 0; j <= -1 + n; ++j)
......@@ -419,20 +437,22 @@ with identical bounds, for the use of the transpose:
... """
... out[j,i] = a[i,j] {id=transpose}
... out[ii,jj] = 2*out[ii,jj] {dep=transpose}
... """)
>>> knl = lp.prioritize_loops(knl, "i,j,ii,jj")
... """,
... [lp.GlobalArg("out", shape=lp.auto, is_input=False), ...])
>>> knl = lp.prioritize_loops(knl, "i,j")
>>> knl = lp.prioritize_loops(knl, "ii,jj")
:func:`loopy.duplicate_inames` can be used to achieve the same goal.
Now the intended code is generated and our test passes.
.. doctest::
>>> knl = lp.set_options(knl, "write_cl")
>>> knl = lp.set_options(knl, write_code=True)
>>> evt, (out,) = knl(queue, a=a_mat_dev)
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))
<BLANKLINE>
__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out)
__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float *__restrict__ out, __global float const *__restrict__ a, int const n)
{
for (int i = 0; i <= -1 + n; ++i)
for (int j = 0; j <= -1 + n; ++j)
......@@ -482,13 +502,13 @@ ambiguous.
.. doctest::
>>> knl = lp.set_options(knl, "write_cl")
>>> knl = lp.set_options(knl, write_code=True)
>>> evt, (out,) = knl(queue, a=a_mat_dev)
#define lid(N) ((int) get_local_id(N))
...
for (int j = 0; j <= -1 + n; ++j)
for (int i = 0; i <= -1 + n; ++i)
a[n * i + j] = 0.0f;
a[n * i + j] = (float) (0.0f);
...
No more warnings! Loop nesting is also reflected in the dependency graph:
......@@ -520,7 +540,7 @@ is overwritten with the new kernel::
knl = lp.do_something(knl, arguments...)
We've already seen an example of a transformation above:
For instance, :func:`prioritize_loops` fit the pattern.
For instance, :func:`loopy.prioritize_loops` fit the pattern.
:func:`loopy.split_iname` is another fundamental (and useful) transformation. It
turns one existing iname (recall that this is loopy's word for a 'loop
......@@ -540,13 +560,13 @@ Consider this example:
... "a[i] = 0", assumptions="n>=1")
>>> knl = lp.split_iname(knl, "i", 16)
>>> knl = lp.prioritize_loops(knl, "i_outer,i_inner")
>>> knl = lp.set_options(knl, "write_cl")
>>> knl = lp.set_options(knl, write_code=True)
>>> evt, (out,) = knl(queue, a=x_vec_dev)
#define lid(N) ((int) get_local_id(N))
...
for (int i_outer = 0; i_outer <= -1 + ((15 + n) / 16); ++i_outer)
for (int i_inner = 0; i_inner <= (-16 + n + -16 * i_outer >= 0 ? 15 : -1 + n + -16 * i_outer); ++i_inner)
a[16 * i_outer + i_inner] = 0.0f;
for (int i_outer = 0; i_outer <= -1 + (15 + n) / 16; ++i_outer)
for (int i_inner = 0; i_inner <= ((-17 + n + -16 * i_outer >= 0) ? 15 : -1 + n + -16 * i_outer); ++i_inner)
a[16 * i_outer + i_inner] = (float) (0.0f);
...
By default, the new, split inames are named *OLD_outer* and *OLD_inner*,
......@@ -571,13 +591,13 @@ relation to loop nesting. For example, it's perfectly possible to request
... "a[i] = 0", assumptions="n>=1")
>>> knl = lp.split_iname(knl, "i", 16)
>>> knl = lp.prioritize_loops(knl, "i_inner,i_outer")
>>> knl = lp.set_options(knl, "write_cl")
>>> knl = lp.set_options(knl, write_code=True)
>>> evt, (out,) = knl(queue, a=x_vec_dev)
#define lid(N) ((int) get_local_id(N))
...
for (int i_inner = 0; i_inner <= (-17 + n >= 0 ? 15 : -1 + n); ++i_inner)
for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + ((15 + n + 15 * i_inner) / 16); ++i_outer)
a[16 * i_outer + i_inner] = 0.0f;
for (int i_inner = 0; i_inner <= ((-17 + n >= 0) ? 15 : -1 + n); ++i_inner)
for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + (15 + n + 15 * i_inner) / 16; ++i_outer)
a[16 * i_outer + i_inner] = (float) (0.0f);
...
Notice how loopy has automatically generated guard conditionals to make
......@@ -595,13 +615,13 @@ commonly called 'loop tiling':
... assumptions="n mod 16 = 0 and n >= 1")
>>> knl = lp.split_iname(knl, "i", 16)
>>> knl = lp.split_iname(knl, "j", 16)
>>> knl = lp.prioritize_loops(knl, "i_outer,j_outer,i_inner")
>>> knl = lp.set_options(knl, "write_cl")
>>> knl = lp.prioritize_loops(knl, "i_outer,j_outer,i_inner,j_inner")
>>> knl = lp.set_options(knl, write_code=True)
>>> evt, (out,) = knl(queue, a=a_mat_dev)
#define lid(N) ((int) get_local_id(N))
...
for (int i_outer = 0; i_outer <= ((-16 + n) / 16); ++i_outer)
for (int j_outer = 0; j_outer <= ((-16 + n) / 16); ++j_outer)
for (int i_outer = 0; i_outer <= (-16 + n) / 16; ++i_outer)
for (int j_outer = 0; j_outer <= (-16 + n) / 16; ++j_outer)
for (int i_inner = 0; i_inner <= 15; ++i_inner)
for (int j_inner = 0; j_inner <= 15; ++j_inner)
out[n * (16 * i_outer + i_inner) + 16 * j_outer + j_inner] = a[n * (16 * j_outer + j_inner) + 16 * i_outer + i_inner];
......@@ -638,18 +658,17 @@ loop's tag to ``"unr"``:
>>> knl = lp.split_iname(knl, "i", 4)
>>> knl = lp.tag_inames(knl, dict(i_inner="unr"))
>>> knl = lp.prioritize_loops(knl, "i_outer,i_inner")
>>> knl = lp.set_options(knl, "write_cl")
>>> knl = lp.set_options(knl, write_code=True)
>>> evt, (out,) = knl(queue, a=x_vec_dev)
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))
#define int_floor_div_pos_b(a,b) ( ( (a) - ( ((a)<0) ? ((b)-1) : 0 ) ) / (b) )
...
for (int i_outer = 0; i_outer <= int_floor_div_pos_b(-4 + n, 4); ++i_outer)
for (int i_outer = 0; i_outer <= loopy_floor_div_pos_b_int32(-4 + n, 4); ++i_outer)
{
a[4 * i_outer] = 0.0f;
a[4 * i_outer + 1] = 0.0f;
a[4 * i_outer + 2] = 0.0f;
a[4 * i_outer + 3] = 0.0f;
a[4 * i_outer] = (float) (0.0f);
a[1 + 4 * i_outer] = (float) (0.0f);
a[2 + 4 * i_outer] = (float) (0.0f);
a[3 + 4 * i_outer] = (float) (0.0f);
}
...
......@@ -688,7 +707,7 @@ Iname implementation tags are also printed along with the entire kernel:
>>> print(knl)
---------------------------------------------------------------------------
...
INAME IMPLEMENTATION TAGS:
INAME TAGS:
i_inner: unr
i_outer: None
---------------------------------------------------------------------------
......@@ -714,14 +733,14 @@ Let's try this out on our vector fill kernel by creating workgroups of size
... "a[i] = 0", assumptions="n>=0")
>>> knl = lp.split_iname(knl, "i", 128,
... outer_tag="g.0", inner_tag="l.0")
>>> knl = lp.set_options(knl, "write_cl")
>>> knl = lp.set_options(knl, write_code=True)
>>> evt, (out,) = knl(queue, a=x_vec_dev)
#define lid(N) ((int) get_local_id(N))
...
__kernel void __attribute__ ((reqd_work_group_size(128, 1, 1))) loopy_kernel(__global float *__restrict__ a, int const n)
{
if (-1 + -128 * gid(0) + -1 * lid(0) + n >= 0)
a[128 * gid(0) + lid(0)] = 0.0f;
a[128 * gid(0) + lid(0)] = (float) (0.0f);
}
Loopy requires that workgroup sizes are fixed and constant at compile time.
......@@ -733,11 +752,11 @@ those for us:
.. doctest::
>>> glob, loc = knl.get_grid_size_upper_bounds()
>>> glob, loc = knl["loopy_kernel"].get_grid_size_upper_bounds(knl.callables_table)
>>> print(glob)
(Aff("[n] -> { [(floor((127 + n)/128))] }"),)
(PwAff("[n] -> { [(floor((127 + n)/128))] }"),)
>>> print(loc)
(Aff("[n] -> { [(128)] }"),)
(PwAff("[n] -> { [(128)] }"),)
Note that this functionality returns internal objects and is not really
intended for end users.
......@@ -760,26 +779,26 @@ assumption:
>>> knl = lp.split_iname(knl, "i", 4)
>>> knl = lp.tag_inames(knl, dict(i_inner="unr"))
>>> knl = lp.prioritize_loops(knl, "i_outer,i_inner")
>>> knl = lp.set_options(knl, "write_cl")
>>> knl = lp.set_options(knl, write_code=True)
>>> evt, (out,) = knl(queue, a=x_vec_dev)
#define lid(N) ((int) get_local_id(N))
...
for (int i_outer = 0; i_outer <= -1 + ((3 + n) / 4); ++i_outer)
for (int i_outer = 0; i_outer <= -1 + (3 + n) / 4; ++i_outer)
{
a[4 * i_outer] = 0.0f;
a[4 * i_outer] = (float) (0.0f);
if (-2 + -4 * i_outer + n >= 0)
a[4 * i_outer + 1] = 0.0f;
a[1 + 4 * i_outer] = (float) (0.0f);
if (-3 + -4 * i_outer + n >= 0)
a[4 * i_outer + 2] = 0.0f;
a[2 + 4 * i_outer] = (float) (0.0f);
if (-4 + -4 * i_outer + n >= 0)
a[4 * i_outer + 3] = 0.0f;
a[3 + 4 * i_outer] = (float) (0.0f);
}
...
While these conditionals enable the generated code to deal with arbitrary
*n*, they come at a performance cost. Loopy allows generating separate code
for the last iteration of the *i_outer* loop, by using the *slabs* keyword
argument to :func:`split_iname`. Since this last iteration of *i_outer* is
argument to :func:`loopy.split_iname`. Since this last iteration of *i_outer* is
the only iteration for which ``i_inner + 4*i_outer`` can become larger than
*n*, only the (now separate) code for that iteration contains conditionals,
enabling some cost savings:
......@@ -788,32 +807,32 @@ enabling some cost savings:
>>> knl = orig_knl
>>> knl = lp.split_iname(knl, "i", 4, slabs=(0, 1), inner_tag="unr")
>>> knl = lp.set_options(knl, "write_cl")
>>> knl = lp.set_options(knl, write_code=True)
>>> knl = lp.prioritize_loops(knl, "i_outer,i_inner")
>>> evt, (out,) = knl(queue, a=x_vec_dev)
#define lid(N) ((int) get_local_id(N))
...
/* bulk slab for 'i_outer' */
for (int i_outer = 0; i_outer <= -2 + ((3 + n) / 4); ++i_outer)
for (int i_outer = 0; i_outer <= -2 + (3 + n) / 4; ++i_outer)
{
a[4 * i_outer] = 0.0f;
a[4 * i_outer + 1] = 0.0f;
a[4 * i_outer + 2] = 0.0f;
a[4 * i_outer + 3] = 0.0f;
a[4 * i_outer] = (float) (0.0f);
a[1 + 4 * i_outer] = (float) (0.0f);
a[2 + 4 * i_outer] = (float) (0.0f);
a[3 + 4 * i_outer] = (float) (0.0f);
}
/* final slab for 'i_outer' */
{
int const i_outer = -1 + n + -1 * (3 * n / 4);
int const i_outer = -1 + n + -1 * ((3 * n) / 4);
<BLANKLINE>
if (-1 + n >= 0)
if (i_outer >= 0)
{
a[4 * i_outer] = 0.0f;
a[4 * i_outer] = (float) (0.0f);
if (-2 + -4 * i_outer + n >= 0)
a[4 * i_outer + 1] = 0.0f;
a[1 + 4 * i_outer] = (float) (0.0f);
if (-3 + -4 * i_outer + n >= 0)
a[4 * i_outer + 2] = 0.0f;
a[2 + 4 * i_outer] = (float) (0.0f);
if (4 + 4 * i_outer + -1 * n == 0)
a[4 * i_outer + 3] = 0.0f;
a[3 + 4 * i_outer] = (float) (0.0f);
}
}
...
......@@ -884,7 +903,7 @@ memory, local to each work item.
.. doctest::
>>> knl = lp.set_options(knl, "write_cl")
>>> knl = lp.set_options(knl, write_code=True)
>>> evt, (out1, out2) = knl(queue, a=x_vec_dev)
#define lid(N) ((int) get_local_id(N))
...
......@@ -940,12 +959,12 @@ Consider the following example:
... "{ [i_outer,i_inner, k]: "
... "0<= 16*i_outer + i_inner <n and 0<= i_inner,k <16}",
... """
... <> a_temp[i_inner] = a[16*i_outer + i_inner] {priority=10}
... <> a_temp[i_inner] = a[16*i_outer + i_inner]
... out[16*i_outer + i_inner] = sum(k, a_temp[k])
... """)
>>> knl = lp.tag_inames(knl, dict(i_outer="g.0", i_inner="l.0"))
>>> knl = lp.set_temporary_scope(knl, "a_temp", "local")
>>> knl = lp.set_options(knl, "write_cl")
>>> knl = lp.set_temporary_address_space(knl, "a_temp", "local")
>>> knl = lp.set_options(knl, write_code=True)
>>> evt, (out,) = knl(queue, a=x_vec_dev)
#define lid(N) ((int) get_local_id(N))
...
......@@ -1010,13 +1029,13 @@ transformation exists in :func:`loopy.add_prefetch`:
... out[16*i_outer + i_inner] = sum(k, a[16*i_outer + i_inner])
... """)
>>> knl = lp.tag_inames(knl, dict(i_outer="g.0", i_inner="l.0"))
>>> knl = lp.set_options(knl, "write_cl")
>>> knl = lp.set_options(knl, write_code=True)
>>> knl_pf = lp.add_prefetch(knl, "a")
>>> evt, (out,) = knl_pf(queue, a=x_vec_dev)
#define lid(N) ((int) get_local_id(N))
...
acc_k = 0.0f;
a_fetch = a[16 * gid(0) + lid(0)];
acc_k = 0.0f;
for (int k = 0; k <= 15; ++k)
acc_k = acc_k + a_fetch;
out[16 * gid(0) + lid(0)] = acc_k;
......@@ -1035,17 +1054,15 @@ earlier:
.. doctest::
>>> knl_pf = lp.add_prefetch(knl, "a", ["i_inner"])
>>> knl_pf = lp.add_prefetch(knl, "a", ["i_inner"], default_tag="l.0")
>>> evt, (out,) = knl_pf(queue, a=x_vec_dev)
#define lid(N) ((int) get_local_id(N))
...
if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
acc_k = 0.0f;
if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
a_fetch[lid(0)] = a[16 * gid(0) + lid(0)];
barrier(CLK_LOCAL_MEM_FENCE) /* for a_fetch (insn_k_update depends on a_fetch_rule) */;
if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
{
acc_k = 0.0f;
for (int k = 0; k <= 15; ++k)
acc_k = acc_k + a_fetch[lid(0)];
out[16 * gid(0) + lid(0)] = acc_k;
......@@ -1064,7 +1081,7 @@ Temporaries in global memory
local and private temporaries, the runtime allocates storage for global
temporaries when the kernel gets executed. The user must explicitly specify that
a temporary is global. To specify that a temporary is global, use
:func:`loopy.set_temporary_scope`.
:func:`loopy.set_temporary_address_space`.
Substitution rules
~~~~~~~~~~~~~~~~~~
......@@ -1105,11 +1122,12 @@ work item:
:mod:`loopy` supports two kinds of barriers:
* *Local barriers* ensure consistency of local memory accesses to items within
* *Local barriers* ensure consistency of memory accesses to items within
*the same* work group. This synchronizes with all instructions in the work
group.
group. The type of memory (local or global) may be specified by the
:attr:`loopy.BarrierInstruction.mem_kind`
* *Global barriers* ensure consistency of global memory accesses
* *Global barriers* ensure consistency of memory accesses
across *all* work groups, i.e. it synchronizes with every work item
executing the kernel. Note that there is no exact equivalent for
this kind of barrier in OpenCL. [#global-barrier-note]_
......@@ -1118,14 +1136,17 @@ Once a work item has reached a barrier, it waits for everyone that it
synchronizes with to reach the barrier before continuing. This means that unless
all work items reach the same barrier, the kernel will hang during execution.
Barrier insertion
~~~~~~~~~~~~~~~~~
By default, :mod:`loopy` inserts local barriers between two instructions when it
detects that a dependency involving local memory may occur across work items. To
see this in action, take a look at the section on :ref:`local_temporaries`.
In contrast, :mod:`loopy` will *not* insert global barriers automatically.
Global barriers require manual intervention along with some special
post-processing which we describe below. Consider the following kernel, which
attempts to rotate its input to the right by 1 in parallel:
In contrast, :mod:`loopy` will *not* insert global barriers automatically and
instead will report an error if it detects the need for a global barrier. As an
example, consider the following kernel, which attempts to rotate its input to
the right by 1 in parallel:
.. doctest::
......@@ -1151,14 +1172,28 @@ this, :mod:`loopy` will complain that global barrier needs to be inserted:
>>> cgr = lp.generate_code_v2(knl)
Traceback (most recent call last):
...
MissingBarrierError: Dependency 'rotate depends on maketmp' (for variable 'arr') requires synchronization by a global barrier (add a 'no_sync_with' instruction option to state that no synchronization is needed)
loopy.diagnostic.MissingBarrierError: rotate_v1: Dependency 'rotate depends on maketmp' (for variable 'arr') requires synchronization by a global barrier (add a 'no_sync_with' instruction option to state that no synchronization is needed)
The syntax for inserting a global barrier instruction is
``... gbarrier``. :mod:`loopy` also supports manually inserting local
barriers. The syntax for a local barrier instruction is ``... lbarrier``.
Saving temporaries across global barriers
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The syntax for a global barrier instruction is ``... gbarrier``. This needs to
be added between the pair of offending instructions.
For some platforms (currently only PyOpenCL), :mod:`loopy` implements global
barriers by splitting the kernel into a host side kernel and multiple
device-side kernels. On such platforms, it will be necessary to save non-global
temporaries that are live across kernel calls. This section presents an example
of how to use :func:`loopy.save_and_reload_temporaries` which is helpful for
that purpose.
Let us start with an example. Consider the kernel from above with a
``... gbarrier`` instruction that has already been inserted.
.. doctest::
>>> knl = lp.make_kernel(
>>> prog = lp.make_kernel(
... "[n] -> {[i] : 0<=i<n}",
... """
... for i
......@@ -1173,34 +1208,41 @@ be added between the pair of offending instructions.
... ],
... name="rotate_v2",
... assumptions="n mod 16 = 0")
>>> knl = lp.split_iname(knl, "i", 16, inner_tag="l.0", outer_tag="g.0")
>>> prog = lp.split_iname(prog, "i", 16, inner_tag="l.0", outer_tag="g.0")
When we try to generate code for this, it will still not work.
.. testsetup::
>>> cgr = lp.generate_code_v2(knl)
>>> prog = prog.with_kernel(
... prog.default_entrypoint.copy(
... silenced_warnings=["v1_scheduler_fallback"]))
Here is what happens when we try to generate code for the kernel:
>>> cgr = lp.generate_code_v2(prog)
Traceback (most recent call last):
...
MissingDefinitionError: temporary variable 'tmp' gets used in subkernel 'rotate_v2_0' without a definition (maybe you forgot to call loopy.save_and_reload_temporaries?)
loopy.diagnostic.MissingDefinitionError: temporary variable 'tmp' gets used in subkernel 'rotate_v2_0' without a definition (maybe you forgot to call loopy.save_and_reload_temporaries?)
To understand what is going on, you need to know that :mod:`loopy` implements
global barriers by splitting the kernel into multiple device-side kernels. The
splitting happens when the instruction schedule is generated. To see the
schedule, we must first call :func:`loopy.get_one_scheduled_kernel`:
This happens due to the kernel splitting done by :mod:`loopy`. The splitting
happens when the instruction schedule is generated. To see the schedule, we
should call :func:`loopy.get_one_linearized_kernel`:
>>> knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl))
>>> print(knl)
>>> prog = lp.preprocess_kernel(prog)
>>> knl = lp.get_one_linearized_kernel(prog["rotate_v2"], prog.callables_table)
>>> prog = prog.with_kernel(knl)
>>> print(prog)
---------------------------------------------------------------------------
KERNEL: rotate_v2
---------------------------------------------------------------------------
...
---------------------------------------------------------------------------
SCHEDULE:
0: CALL KERNEL rotate_v2(extra_args=[], extra_inames=[])
1: [maketmp] tmp <- arr[i_inner + i_outer*16]
LINEARIZATION:
0: CALL KERNEL rotate_v2
1: tmp = arr[i_inner + i_outer*16] {id=maketmp}
2: RETURN FROM KERNEL rotate_v2
3: ---BARRIER:global---
4: CALL KERNEL rotate_v2_0(extra_args=[], extra_inames=[])
5: [rotate] arr[((1 + i_inner + i_outer*16) % n)] <- tmp
3: ... gbarrier
4: CALL KERNEL rotate_v2_0
5: arr[(i_inner + i_outer*16 + 1) % n] = tmp {id=rotate}
6: RETURN FROM KERNEL rotate_v2_0
---------------------------------------------------------------------------
......@@ -1214,33 +1256,33 @@ goes for local temporaries).
:func:`loopy.save_and_reload_temporaries` for the purpose of handling the
task of saving and restoring temporary values across global barriers. This
function adds instructions to the kernel without scheduling them. That means
that :func:`loopy.get_one_scheduled_kernel` needs to be called one more time to
that :func:`loopy.get_one_linearized_kernel` needs to be called one more time to
put those instructions into the schedule.
>>> knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl))
>>> knl = lp.save_and_reload_temporaries(knl)
>>> knl = lp.get_one_scheduled_kernel(knl) # Schedule added instructions
>>> print(knl)
>>> prog = lp.save_and_reload_temporaries(prog)
>>> knl = lp.get_one_linearized_kernel(prog["rotate_v2"], prog.callables_table) # Schedule added instructions
>>> prog = prog.with_kernel(knl)
>>> print(prog)
---------------------------------------------------------------------------
KERNEL: rotate_v2
---------------------------------------------------------------------------
...
---------------------------------------------------------------------------
TEMPORARIES:
tmp: type: np:dtype('int32'), shape: () scope:private
tmp_save_slot: type: np:dtype('int32'), shape: (n // 16, 16), dim_tags: (N1:stride:16, N0:stride:1) scope:global
tmp: type: np:dtype('int32'), shape: () aspace: private
tmp_save_slot: type: np:dtype('int32'), shape: (n // 16, 16), dim_tags: (N1:stride:16, N0:stride:1) aspace: global
---------------------------------------------------------------------------
...
---------------------------------------------------------------------------
SCHEDULE:
0: CALL KERNEL rotate_v2(extra_args=['tmp_save_slot'], extra_inames=[])
1: [maketmp] tmp <- arr[i_inner + i_outer*16]
2: [tmp.save] tmp_save_slot[tmp_save_hw_dim_0_rotate_v2, tmp_save_hw_dim_1_rotate_v2] <- tmp
LINEARIZATION:
0: CALL KERNEL rotate_v2
1: tmp = arr[i_inner + i_outer*16] {id=maketmp}
2: tmp_save_slot[tmp_save_hw_dim_0_rotate_v2, tmp_save_hw_dim_1_rotate_v2] = tmp {id=tmp.save}
3: RETURN FROM KERNEL rotate_v2
4: ---BARRIER:global---
5: CALL KERNEL rotate_v2_0(extra_args=['tmp_save_slot'], extra_inames=[])
6: [tmp.reload] tmp <- tmp_save_slot[tmp_reload_hw_dim_0_rotate_v2_0, tmp_reload_hw_dim_1_rotate_v2_0]
7: [rotate] arr[((1 + i_inner + i_outer*16) % n)] <- tmp
4: ... gbarrier
5: CALL KERNEL rotate_v2_0
6: tmp = tmp_save_slot[tmp_reload_hw_dim_0_rotate_v2_0, tmp_reload_hw_dim_1_rotate_v2_0] {id=tmp.reload}
7: arr[(i_inner + i_outer*16 + 1) % n] = tmp {id=rotate}
8: RETURN FROM KERNEL rotate_v2_0
---------------------------------------------------------------------------
......@@ -1259,12 +1301,12 @@ does in more detail:
The kernel translates into two OpenCL kernels.
>>> cgr = lp.generate_code_v2(knl)
>>> cgr = lp.generate_code_v2(prog)
>>> print(cgr.device_code())
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))
<BLANKLINE>
__kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2(__global int *__restrict__ arr, int const n, __global int *__restrict__ tmp_save_slot)
__kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2(__global int const *__restrict__ arr, int const n, __global int *__restrict__ tmp_save_slot)
{
int tmp;
<BLANKLINE>
......@@ -1272,20 +1314,20 @@ The kernel translates into two OpenCL kernels.
tmp_save_slot[16 * gid(0) + lid(0)] = tmp;
}
<BLANKLINE>
__kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2_0(__global int *__restrict__ arr, int const n, __global int *__restrict__ tmp_save_slot)
__kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2_0(__global int *__restrict__ arr, int const n, __global int const *__restrict__ tmp_save_slot)
{
int tmp;
<BLANKLINE>
tmp = tmp_save_slot[16 * gid(0) + lid(0)];
arr[((1 + lid(0) + gid(0) * 16) % n)] = tmp;
arr[(1 + lid(0) + gid(0) * 16) % n] = tmp;
}
Executing the kernel does what we expect.
Now we can execute the kernel.
>>> arr = cl.array.arange(queue, 16, dtype=np.int32)
>>> print(arr)
[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
>>> evt, (out,) = knl(queue, arr=arr)
>>> evt, (out,) = prog(queue, arr=arr)
>>> print(arr)
[15 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14]
......@@ -1339,9 +1381,9 @@ a loopy kernel by simply calling them, e.g.::
Additionally, all functions of one variable are currently recognized during
code generation; however, additional implementation may be required for custom
functions. The full list of available functions may be found in the
:class:`TargetBase` implementation (e.g. :class:`CudaTarget`)
:class:`loopy.TargetBase` implementation (e.g. :class:`loopy.CudaTarget`)
Custom user functions may be represented using the method described in :ref:`_functions`
Custom user functions may be represented using the method described in :ref:`functions`
Data-dependent control flow
......@@ -1377,7 +1419,7 @@ Attempting to create this kernel results in an error:
... # While trying to find shape axis 0 of argument 'out', the following exception occurred:
Traceback (most recent call last):
...
StaticValueFindingError: a static maximum was not found for PwAff '[n] -> { [(1)] : n <= 1; [(n)] : n >= 2 }'
loopy.diagnostic.StaticValueFindingError: a static maximum was not found for PwAff '[n] -> { [(1)] : n <= 1; [(n)] : n >= 2 }'
The problem is that loopy cannot find a simple, universally valid expression
for the length of *out* in this case. Notice how the kernel accesses both the
......@@ -1433,7 +1475,7 @@ We'll also request a prefetch--but suppose we only do so across the
.. doctest::
>>> knl = lp.add_prefetch(knl, "a", "i_inner")
>>> knl = lp.add_prefetch(knl, "a", "i_inner", default_tag="l.auto")
When we try to run our code, we get the following warning from loopy as a first
sign that something is amiss:
......@@ -1443,13 +1485,13 @@ sign that something is amiss:
>>> evt, (out,) = knl(queue, a=a_mat_dev)
Traceback (most recent call last):
...
WriteRaceConditionWarning: in kernel transpose: instruction 'a_fetch_rule' looks invalid: it assigns to indices based on local IDs, but its temporary 'a_fetch' cannot be made local because a write race across the iname(s) 'j_inner' would emerge. (Do you need to add an extra iname to your prefetch?) (add 'write_race_local(a_fetch_rule)' to silenced_warnings kernel attribute to disable)
loopy.diagnostic.WriteRaceConditionWarning: in kernel transpose: instruction 'a_fetch_rule' looks invalid: it assigns to indices based on local IDs, but its temporary 'a_fetch' cannot be made local because a write race across the iname(s) 'j_inner' would emerge. (Do you need to add an extra iname to your prefetch?) (add 'write_race_local(a_fetch_rule)' to silenced_warnings kernel attribute to disable)
When we ask to see the code, the issue becomes apparent:
.. doctest::
>>> knl = lp.set_options(knl, "write_cl")
>>> knl = lp.set_options(knl, write_code=True)
>>> from warnings import catch_warnings
>>> with catch_warnings():
... filterwarnings("always", category=lp.LoopyWarning)
......@@ -1522,24 +1564,22 @@ containing different types of data:
... """
... c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
... e[i, k] = g[i,k]*(2+h[i,k+1])
... """)
... """, name="stats_knl")
>>> knl = lp.add_and_infer_dtypes(knl,
... dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
Note that loopy will infer the data types for arrays c and e from the
Note that loopy will infer the data types for arrays ``c`` and ``e`` from the
information provided. Now we will count the operations:
.. doctest::
>>> op_map = lp.get_op_map(knl)
>>> print(lp.stringify_stats_mapping(op_map))
Op(np:dtype('float32'), add) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
Op(np:dtype('float32'), div) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
Op(np:dtype('float32'), mul) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
Op(np:dtype('float64'), add) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
Op(np:dtype('float64'), mul) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
Op(np:dtype('int32'), add) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
<BLANKLINE>
>>> op_map = lp.get_op_map(knl, subgroup_size=32)
>>> print(op_map)
Op(np:dtype('float32'), add, subgroup, "stats_knl"): ...
Each line of output will look roughly like::
Op(np:dtype('float32'), add, subgroup, "kernel_name") : [l, m, n] -> { l * m * n : l > 0 and m > 0 and n > 0 }
:func:`loopy.get_op_map` returns a :class:`loopy.ToCountMap` of **{**
:class:`loopy.Op` **:** :class:`islpy.PwQPolynomial` **}**. A
......@@ -1549,23 +1589,24 @@ number of operations matching the characteristics of the :class:`loopy.Op`
specified in the key (in terms of the :class:`loopy.LoopKernel`
*inames*). :class:`loopy.Op` attributes include:
- dtype: A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the
- dtype: A :class:`loopy.types.LoopyType` or :class:`numpy.dtype` that specifies the
data type operated on.
- name: A :class:`str` that specifies the kind of arithmetic operation as
*add*, *sub*, *mul*, *div*, *pow*, *shift*, *bw* (bitwise), etc.
One way to evaluate these polynomials is with :func:`islpy.eval_with_dict`:
One way to evaluate these polynomials is with :meth:`islpy.PwQPolynomial.eval_with_dict`:
.. doctest::
>>> param_dict = {'n': 256, 'm': 256, 'l': 8}
>>> f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(param_dict)
>>> f32div = op_map[lp.Op(np.float32, 'div')].eval_with_dict(param_dict)
>>> f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(param_dict)
>>> f64add = op_map[lp.Op(np.float64, 'add')].eval_with_dict(param_dict)
>>> f64mul = op_map[lp.Op(np.float64, 'mul')].eval_with_dict(param_dict)
>>> i32add = op_map[lp.Op(np.int32, 'add')].eval_with_dict(param_dict)
>>> from loopy.statistics import CountGranularity as CG
>>> f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict)
>>> f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict)
>>> f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict)
>>> f64add = op_map[lp.Op(np.float64, 'add', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict)
>>> f64mul = op_map[lp.Op(np.float64, 'mul', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict)
>>> i32add = op_map[lp.Op(np.int32, 'add', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict)
>>> print("%i\n%i\n%i\n%i\n%i\n%i" %
... (f32add, f32div, f32mul, f64add, f64mul, i32add))
524288
......@@ -1579,7 +1620,7 @@ One way to evaluate these polynomials is with :func:`islpy.eval_with_dict`:
grouping, and evaluating subsets of the counts. Suppose we want to know the
total number of 32-bit operations of any kind. We can easily count these
using functions :func:`loopy.ToCountMap.filter_by` and
:func:`loopy.ToCountMap.eval_and_sum`:
:func:`loopy.ToCountPolynomialMap.eval_and_sum`:
.. doctest::
......@@ -1595,16 +1636,18 @@ together into keys containing only the specified fields:
.. doctest::
>>> op_map_dtype = op_map.group_by('dtype')
>>> print(lp.stringify_stats_mapping(op_map_dtype))
Op(np:dtype('float32'), None) : [m, l, n] -> { 3 * m * l * n : m > 0 and l > 0 and n > 0 }
Op(np:dtype('float64'), None) : [m, l, n] -> { 2 * m * n : m > 0 and l > 0 and n > 0 }
Op(np:dtype('int32'), None) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
<BLANKLINE>
>>> print(op_map_dtype)
Op(np:dtype('float32'), None, None): ...
>>> f32op_count = op_map_dtype[lp.Op(dtype=np.float32)
... ].eval_with_dict(param_dict)
>>> print(f32op_count)
1572864
The lines of output above might look like::
Op(np:dtype('float32'), None, None) : [m, l, n] -> { 3 * m * l * n : m > 0 and l > 0 and n > 0 }
Op(np:dtype('float64'), None, None) : [m, l, n] -> { 2 * m * n : m > 0 and l > 0 and n > 0 }
See the reference page for :class:`loopy.ToCountMap` and :class:`loopy.Op` for
more information on these functions.
......@@ -1617,15 +1660,16 @@ we'll continue using the kernel from the previous example:
.. doctest::
>>> mem_map = lp.get_mem_access_map(knl)
>>> print(lp.stringify_stats_mapping(mem_map))
MemAccess(global, np:dtype('float32'), 0, load, a) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 }
MemAccess(global, np:dtype('float32'), 0, load, b) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
MemAccess(global, np:dtype('float32'), 0, store, c) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
MemAccess(global, np:dtype('float64'), 0, load, g) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
MemAccess(global, np:dtype('float64'), 0, load, h) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
MemAccess(global, np:dtype('float64'), 0, store, e) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
<BLANKLINE>
>>> mem_map = lp.get_mem_access_map(knl, subgroup_size=32)
>>> print(mem_map)
MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, 'stats_knl'): ...
Each line of output will look roughly like::
MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, 'stats_knl') : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 }
MemAccess(global, np:dtype('float32'), {}, {}, load, b, None, subgroup, 'stats_knl') : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
MemAccess(global, np:dtype('float32'), {}, {}, store, c, None, subgroup, 'stats_knl') : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
:func:`loopy.get_mem_access_map` returns a :class:`loopy.ToCountMap` of **{**
:class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**.
......@@ -1634,11 +1678,21 @@ we'll continue using the kernel from the previous example:
- mtype: A :class:`str` that specifies the memory type accessed as **global**
or **local**
- dtype: A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the
- dtype: A :class:`loopy.types.LoopyType` or :class:`numpy.dtype` that specifies the
data type accessed.
- stride: An :class:`int` that specifies stride of the memory access. A stride
of 0 indicates a uniform access (i.e. all threads access the same item).
- lid_strides: A :class:`dict` of **{** :class:`int` **:**
:data:`~pymbolic.typing.Expression` or :class:`int` **}** that specifies
local strides for each local id in the memory access index. Local ids not
found will not be present in ``lid_strides.keys()``. Uniform access (i.e.
work-items within a sub-group access the same item) is indicated by setting
``lid_strides[0]=0``, but may also occur when no local id 0 is found, in
which case the 0 key will not be present in lid_strides.
- gid_strides: A :class:`dict` of **{** :class:`int` **:**
:data:`~pymbolic.typing.Expression` or :class:`int` **}** that specifies
global strides for each global id in the memory access index. Global ids not
found will not be present in ``gid_strides.keys()``.
- direction: A :class:`str` that specifies the direction of memory access as
**load** or **store**.
......@@ -1646,17 +1700,21 @@ we'll continue using the kernel from the previous example:
- variable: A :class:`str` that specifies the variable name of the data
accessed.
We can evaluate these polynomials using :func:`islpy.eval_with_dict`:
We can evaluate these polynomials using :meth:`islpy.PwQPolynomial.eval_with_dict`:
.. doctest::
>>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 0, 'load', 'g')
>>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'load', 'g',
... variable_tags=None, count_granularity=CG.SUBGROUP, kernel_name="stats_knl")
... ].eval_with_dict(param_dict)
>>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 0, 'store', 'e')
>>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'store', 'e',
... variable_tags=None, count_granularity=CG.SUBGROUP, kernel_name="stats_knl")
... ].eval_with_dict(param_dict)
>>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 0, 'load', 'a')
>>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'load', 'a',
... variable_tags=None, count_granularity=CG.SUBGROUP, kernel_name="stats_knl")
... ].eval_with_dict(param_dict)
>>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 0, 'store', 'c')
>>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'store', 'c',
... variable_tags=None, count_granularity=CG.SUBGROUP, kernel_name="stats_knl")
... ].eval_with_dict(param_dict)
>>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" %
... (f32ld_a, f32st_c, f64ld_g, f64st_e))
......@@ -1673,20 +1731,13 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`:
.. doctest::
>>> bytes_map = mem_map.to_bytes()
>>> print(lp.stringify_stats_mapping(bytes_map))
MemAccess(global, np:dtype('float32'), 0, load, a) : [m, l, n] -> { 8 * m * l * n : m > 0 and l > 0 and n > 0 }
MemAccess(global, np:dtype('float32'), 0, load, b) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 }
MemAccess(global, np:dtype('float32'), 0, store, c) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 }
MemAccess(global, np:dtype('float64'), 0, load, g) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
MemAccess(global, np:dtype('float64'), 0, load, h) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
MemAccess(global, np:dtype('float64'), 0, store, e) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
<BLANKLINE>
>>> print(bytes_map)
MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, 'stats_knl'): ...
>>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global']
... ).group_by('direction')
>>> print(lp.stringify_stats_mapping(global_ld_st_bytes))
MemAccess(None, None, None, load, None) : [m, l, n] -> { (16 * m + 12 * m * l) * n : m > 0 and l > 0 and n > 0 }
MemAccess(None, None, None, store, None) : [m, l, n] -> { (8 * m + 4 * m * l) * n : m > 0 and l > 0 and n > 0 }
<BLANKLINE>
>>> print(global_ld_st_bytes)
MemAccess(None, None, None, None, load, None, None, None, None): ...
MemAccess(None, None, None, None, store, None, None, None, None): ...
>>> loaded = global_ld_st_bytes[lp.MemAccess(direction='load')
... ].eval_with_dict(param_dict)
>>> stored = global_ld_st_bytes[lp.MemAccess(direction='store')
......@@ -1695,45 +1746,58 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`:
bytes loaded: 7340032
bytes stored: 2621440
The lines of output above might look like::
MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup): [m, l, n] -> { 8 * m * l * n : m > 0 and l > 0 and n > 0 }
MemAccess(global, np:dtype('float32'), {}, {}, load, b, None, subgroup): [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 }
MemAccess(global, np:dtype('float32'), {}, {}, store, c, None, subgroup): [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 }
MemAccess(global, np:dtype('float64'), {}, {}, load, g, None, subgroup): [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
MemAccess(global, np:dtype('float64'), {}, {}, load, h, None, subgroup): [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
MemAccess(global, np:dtype('float64'), {}, {}, store, e, None, subgroup): [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
One can see how these functions might be useful in computing, for example,
achieved memory bandwidth in byte/sec or performance in FLOP/sec.
~~~~~~~~~~~
Since we have not tagged any of the inames or parallelized the kernel across
threads (which would have produced iname tags), :func:`loopy.get_mem_access_map`
considers the memory accesses *uniform*, so the *stride* of each access is 0.
Now we'll parallelize the kernel and count the array accesses again. The
resulting :class:`islpy.PwQPolynomial` will be more complicated this time.
work-items (which would have produced iname tags), :func:`loopy.get_mem_access_map`
finds no local or global id strides, leaving ``lid_strides`` and ``gid_strides``
empty for each memory access. Now we'll parallelize the kernel and count the array
accesses again. The resulting :class:`islpy.PwQPolynomial` will be more complicated
this time.
.. doctest::
>>> knl_consec = lp.split_iname(knl, "k", 128,
... outer_tag="l.1", inner_tag="l.0")
>>> mem_map = lp.get_mem_access_map(knl_consec)
>>> print(lp.stringify_stats_mapping(mem_map))
MemAccess(global, np:dtype('float32'), 1, load, a) : [m, l, n] -> { ... }
MemAccess(global, np:dtype('float32'), 1, load, b) : [m, l, n] -> { ... }
MemAccess(global, np:dtype('float32'), 1, store, c) : [m, l, n] -> { ... }
MemAccess(global, np:dtype('float64'), 1, load, g) : [m, l, n] -> { ... }
MemAccess(global, np:dtype('float64'), 1, load, h) : [m, l, n] -> { ... }
MemAccess(global, np:dtype('float64'), 1, store, e) : [m, l, n] -> { ... }
<BLANKLINE>
With this parallelization, consecutive threads will access consecutive array
>>> mem_map = lp.get_mem_access_map(knl_consec, subgroup_size=32)
>>> print(mem_map)
MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, a, None, workitem, 'stats_knl'): ...
MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, b, None, workitem, 'stats_knl'): ...
MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, store, c, None, workitem, 'stats_knl'): ...
MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, g, None, workitem, 'stats_knl'): ...
MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, h, None, workitem, 'stats_knl'): ...
MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, store, e, None, workitem, 'stats_knl'): ...
With this parallelization, consecutive work-items will access consecutive array
elements in memory. The polynomials are a bit more complicated now due to the
parallelization, but when we evaluate them, we see that the total number of
array accesses has not changed:
.. doctest::
>>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 1, 'load', 'g')
>>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'load', 'g',
... variable_tags=None, count_granularity=CG.WORKITEM, kernel_name="stats_knl")
... ].eval_with_dict(param_dict)
>>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 1, 'store', 'e')
>>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'store', 'e',
... variable_tags=None, count_granularity=CG.WORKITEM, kernel_name="stats_knl")
... ].eval_with_dict(param_dict)
>>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 1, 'load', 'a')
>>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'load', 'a',
... variable_tags=None, count_granularity=CG.WORKITEM, kernel_name="stats_knl")
... ].eval_with_dict(param_dict)
>>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 1, 'store', 'c')
>>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'store', 'c',
... variable_tags=None, count_granularity=CG.WORKITEM, kernel_name="stats_knl")
... ].eval_with_dict(param_dict)
>>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" %
... (f32ld_a, f32st_c, f64ld_g, f64st_e))
......@@ -1744,36 +1808,39 @@ array accesses has not changed:
~~~~~~~~~~~
To produce *nonconsecutive* array accesses with stride greater than 1, we'll
switch the inner and outer tags in our parallelization of the kernel:
To produce *nonconsecutive* array accesses with local id 0 stride greater than 1,
we'll switch the inner and outer tags in our parallelization of the kernel:
.. doctest::
>>> knl_nonconsec = lp.split_iname(knl, "k", 128,
... outer_tag="l.0", inner_tag="l.1")
>>> mem_map = lp.get_mem_access_map(knl_nonconsec)
>>> print(lp.stringify_stats_mapping(mem_map))
MemAccess(global, np:dtype('float32'), 128, load, a) : [m, l, n] -> { ... }
MemAccess(global, np:dtype('float32'), 128, load, b) : [m, l, n] -> { ... }
MemAccess(global, np:dtype('float32'), 128, store, c) : [m, l, n] -> { ... }
MemAccess(global, np:dtype('float64'), 128, load, g) : [m, l, n] -> { ... }
MemAccess(global, np:dtype('float64'), 128, load, h) : [m, l, n] -> { ... }
MemAccess(global, np:dtype('float64'), 128, store, e) : [m, l, n] -> { ... }
<BLANKLINE>
With this parallelization, consecutive threads will access *nonconsecutive*
>>> mem_map = lp.get_mem_access_map(knl_nonconsec, subgroup_size=32)
>>> print(mem_map)
MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, a, None, workitem, 'stats_knl'): ...
MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, b, None, workitem, 'stats_knl'): ...
MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, store, c, None, workitem, 'stats_knl'): ...
MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, g, None, workitem, 'stats_knl'): ...
MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, h, None, workitem, 'stats_knl'): ...
MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, store, e, None, workitem, 'stats_knl'): ...
With this parallelization, consecutive work-items will access *nonconsecutive*
array elements in memory. The total number of array accesses still has not
changed:
.. doctest::
>>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 128, 'load', 'g')
>>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'load', 'g',
... variable_tags=None, count_granularity=CG.WORKITEM, kernel_name="stats_knl")
... ].eval_with_dict(param_dict)
>>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 128, 'store', 'e')
>>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'store', 'e',
... variable_tags=None, count_granularity=CG.WORKITEM, kernel_name="stats_knl")
... ].eval_with_dict(param_dict)
>>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 128, 'load', 'a')
>>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'load', 'a',
... variable_tags=None, count_granularity=CG.WORKITEM, kernel_name="stats_knl")
... ].eval_with_dict(param_dict)
>>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 128, 'store', 'c')
>>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'store', 'c',
... variable_tags=None, count_granularity=CG.WORKITEM, kernel_name="stats_knl")
... ].eval_with_dict(param_dict)
>>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" %
... (f32ld_a, f32st_c, f64ld_g, f64st_e))
......@@ -1791,7 +1858,7 @@ criteria are more complicated than a simple list of allowable values:
>>> def f(key):
... from loopy.types import to_loopy_type
... return key.dtype == to_loopy_type(np.float32) and \
... key.stride > 1
... key.lid_strides[0] > 1
>>> count = mem_map.filter_by_func(f).eval_and_sum(param_dict)
>>> print(count)
2097152
......@@ -1800,21 +1867,20 @@ Counting synchronization events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
:func:`loopy.get_synchronization_map` counts the number of synchronization
events per **thread** in a kernel. First, we'll call this function on the
events per **work-item** in a kernel. First, we'll call this function on the
kernel from the previous example:
.. doctest::
>>> sync_map = lp.get_synchronization_map(knl)
>>> print(lp.stringify_stats_mapping(sync_map))
kernel_launch : { 1 }
<BLANKLINE>
>>> print(sync_map)
Sync(kernel_launch, stats_knl): [l, m, n] -> { 1 }
We can evaluate this polynomial using :func:`islpy.eval_with_dict`:
We can evaluate this polynomial using :meth:`islpy.PwQPolynomial.eval_with_dict`:
.. doctest::
>>> launch_count = sync_map["kernel_launch"].eval_with_dict(param_dict)
>>> launch_count = sync_map[lp.Sync("kernel_launch", "stats_knl")].eval_with_dict(param_dict)
>>> print("Kernel launch count: %s" % launch_count)
Kernel launch count: 1
......@@ -1830,7 +1896,7 @@ Now to make things more interesting, we'll create a kernel with barriers:
... e[i,j,k] = c[i,j,k+1]+c[i,j,k-1]
... """
... ], [
... lp.TemporaryVariable("c", lp.auto, shape=(50, 10, 99)),
... lp.TemporaryVariable("c", dtype=None, shape=(50, 10, 99)),
... "..."
... ])
>>> knl = lp.add_and_infer_dtypes(knl, dict(a=np.int32))
......@@ -1844,38 +1910,35 @@ Now to make things more interesting, we'll create a kernel with barriers:
{
__local int c[50 * 10 * 99];
<BLANKLINE>
{
int const k_outer = 0;
<BLANKLINE>
for (int i = 0; i <= 49; ++i)
for (int j = 0; j <= 9; ++j)
for (int i = 0; i <= 49; ++i)
{
barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn rev-depends on insn_0) */;
c[990 * i + 99 * j + lid(0) + 1] = 2 * a[980 * i + 98 * j + lid(0) + 1];
barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn_0 depends on insn) */;
e[980 * i + 98 * j + lid(0) + 1] = c[990 * i + 99 * j + 1 + lid(0) + 1] + c[990 * i + 99 * j + -1 + lid(0) + 1];
}
}
{
int const k_outer = 0;
<BLANKLINE>
barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn rev-depends on insn_0) */;
c[990 * i + 99 * j + lid(0) + 1] = 2 * a[980 * i + 98 * j + lid(0) + 1];
barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn_0 depends on insn) */;
e[980 * i + 98 * j + lid(0) + 1] = c[990 * i + 99 * j + 1 + lid(0) + 1] + c[990 * i + 99 * j + -1 + lid(0) + 1];
}
}
In this kernel, when a thread performs the second instruction it uses data
produced by *different* threads during the first instruction. Because of this,
In this kernel, when a work-item performs the second instruction it uses data
produced by *different* work-items during the first instruction. Because of this,
barriers are required for correct execution, so loopy inserts them. Now we'll
count the barriers using :func:`loopy.get_synchronization_map`:
.. doctest::
>>> sync_map = lp.get_synchronization_map(knl)
>>> print(lp.stringify_stats_mapping(sync_map))
barrier_local : { 1000 }
kernel_launch : { 1 }
<BLANKLINE>
>>> print(sync_map)
Sync(barrier_local, loopy_kernel): { 1000 }
Sync(kernel_launch, loopy_kernel): { 1 }
Based on the kernel code printed above, we would expect each thread to
Based on the kernel code printed above, we would expect each work-item to
encounter 50x10x2 barriers, which matches the result from
:func:`loopy.get_synchronization_map`. In this case, the number of barriers
does not depend on any inames, so we can pass an empty dictionary to
:func:`islpy.eval_with_dict`.
:meth:`islpy.PwQPolynomial.eval_with_dict`.
.. }}}
......
! Fill the array `out` with the scalar `a`, then multiply every entry
! by `factor`. Both `real_type` and `factor` are C-preprocessor-style
! macros; they are substituted by the lp.c_preprocess call in the
! transform block below before the Fortran source is parsed.
subroutine fill(out, a, n)
  implicit none

  real_type a, out(n)
  integer n, i

  do i = 1, n
    out(i) = a
  end do

  do i = 1, n
    out(i) = out(i) * factor
  end do
end

! The lines between the !$loopy markers are loopy transform code
! (Python), executed by lp.parse_transformed_fortran: each of the two
! `i` loops is split into 128-wide chunks mapped to OpenCL group
! (g.0) and local (l.0) axes.
!$loopy begin
!
! SOURCE = lp.c_preprocess(SOURCE, [
!     "factor 4.0",
!     "real_type real*8",
!     ])
! fill, = lp.parse_fortran(SOURCE, FILENAME)
! fill = lp.split_iname(fill, "i", 128,
!     outer_tag="g.0", inner_tag="l.0")
! fill = lp.split_iname(fill, "i_1", 128,
!     outer_tag="g.0", inner_tag="l.0")
! RESULT = [fill]
!
!$loopy end

! vim:filetype=floopy
{
"metadata": {
"name": "",
"signature": "sha256:c9f8334aa7aa4a5ad1437fa5871aafa52bbc9131271d9e90e7be47d22725cc94"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Loopy IPython Integration Demo"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%load_ext loopy.ipython_ext"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Without transform code"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%fortran_kernel\n",
"\n",
"subroutine fill(out, a, n)\n",
" implicit none\n",
"\n",
" real*8 a, out(n)\n",
" integer n, i\n",
"\n",
" do i = 1, n\n",
" out(i) = a\n",
" end do\n",
"end"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(prog) # noqa: F821"
]
},
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Loopy IPython Integration Demo"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"%load_ext loopy.ipython_ext"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Without transform code"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"%%fortran_kernel\n",
"\n",
"subroutine fill(out, a, n)\n",
" implicit none\n",
"\n",
" real*8 a, out(n)\n",
" integer n, i\n",
"\n",
" do i = 1, n\n",
" out(i) = a\n",
" end do\n",
"end"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print(fill)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"---------------------------------------------------------------------------\n",
"KERNEL: fill\n",
"---------------------------------------------------------------------------\n",
"ARGUMENTS:\n",
"a: ValueArg, type: float64\n",
"n: ValueArg, type: int32\n",
"out: GlobalArg, type: float64, shape: (n), dim_tags: (N0:stride:1)\n",
"---------------------------------------------------------------------------\n",
"DOMAINS:\n",
"[n] -> { [i] : i >= 0 and i <= -1 + n }\n",
"---------------------------------------------------------------------------\n",
"INAME IMPLEMENTATION TAGS:\n",
"i: None\n",
"---------------------------------------------------------------------------\n",
"INSTRUCTIONS:\n",
"[i] out[i] <- a # insn0\n",
"---------------------------------------------------------------------------\n"
]
}
],
"prompt_number": 3
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## With transform code"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"split_amount = 128"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"%%transformed_fortran_kernel\n",
"\n",
"subroutine tr_fill(out, a, n)\n",
" implicit none\n",
"\n",
" real*8 a, out(n)\n",
" integer n, i\n",
"\n",
" do i = 1, n\n",
" out(i) = a\n",
" end do\n",
"end\n",
"\n",
"!$loopy begin\n",
"!\n",
"! tr_fill, = lp.parse_fortran(SOURCE)\n",
"! tr_fill = lp.split_iname(tr_fill, \"i\", split_amount,\n",
"! outer_tag=\"g.0\", inner_tag=\"l.0\")\n",
"! RESULT = [tr_fill]\n",
"!\n",
"!$loopy end"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print(tr_fill)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"---------------------------------------------------------------------------\n",
"KERNEL: tr_fill\n",
"---------------------------------------------------------------------------\n",
"ARGUMENTS:\n",
"a: ValueArg, type: float64\n",
"n: ValueArg, type: int32\n",
"out: GlobalArg, type: float64, shape: (n), dim_tags: (N0:stride:1)\n",
"---------------------------------------------------------------------------\n",
"DOMAINS:\n",
"[n] -> { [i_outer, i_inner] : i_inner >= -128i_outer and i_inner <= -1 + n - 128i_outer and i_inner >= 0 and i_inner <= 127 }\n",
"---------------------------------------------------------------------------\n",
"INAME IMPLEMENTATION TAGS:\n",
"i_inner: l.0\n",
"i_outer: g.0\n",
"---------------------------------------------------------------------------\n",
"INSTRUCTIONS:\n",
"[i_inner,i_outer] out[i_inner + i_outer*128] <- a # insn0\n",
"---------------------------------------------------------------------------\n"
]
}
],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
"cell_type": "markdown",
"metadata": {},
"source": [
"## With transform code"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"split_amount = 128"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%transformed_fortran_kernel\n",
"\n",
"subroutine tr_fill(out, a, n)\n",
" implicit none\n",
"\n",
" real*8 a, out(n)\n",
" integer n, i\n",
"\n",
" do i = 1, n\n",
" out(i) = a\n",
" end do\n",
"end\n",
"\n",
"!$loopy begin\n",
"!\n",
"! tr_fill = lp.parse_fortran(SOURCE)\n",
"! tr_fill = lp.split_iname(tr_fill, \"i\", split_amount,\n",
"! outer_tag=\"g.0\", inner_tag=\"l.0\")\n",
"! RESULT = tr_fill\n",
"!\n",
"!$loopy end"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(prog) # noqa: F821"
]
}
]
}
\ No newline at end of file
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
import numpy as np
import numpy.linalg as la
import pyopencl as cl
import pyopencl.array
import pyopencl.clrandom
import loopy as lp
def main():
    """Run the transformed-Fortran DGEMM demo.

    Reads the kernel from ``matmul.floopy`` next to this script, executes
    its embedded loopy transform code, runs the kernel through PyOpenCL on
    random 2048x2048 inputs, and checks the result against a NumPy
    reference product.
    """
    import pathlib

    kernel_path = pathlib.Path(__file__).parent / "matmul.floopy"
    with open(kernel_path) as src_file:
        kernel_source = src_file.read()

    # parse_transformed_fortran also runs the !$loopy transform block
    # embedded in the source.
    dgemm = lp.parse_transformed_fortran(kernel_source, filename=kernel_path)

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    size = 2048
    # Fortran (column-major) layout, matching the Fortran kernel.
    mat_a = cl.array.empty(queue, (size, size), dtype=np.float64, order="F")
    mat_b = cl.array.empty(queue, (size, size), dtype=np.float64, order="F")
    mat_c = cl.array.zeros(queue, (size, size), dtype=np.float64, order="F")
    cl.clrandom.fill_rand(mat_a)
    cl.clrandom.fill_rand(mat_b)

    dgemm = lp.set_options(dgemm, write_code=True)
    dgemm(queue, a=mat_a, b=mat_b, alpha=1, c=mat_c)

    reference = mat_a.get() @ mat_b.get()
    rel_error = la.norm(reference - mat_c.get()) / la.norm(reference)
    assert rel_error < 1e-10
# Run the demo only when executed as a script (not on import).
if __name__ == "__main__":
    main()