Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.


Select target project
No results found


Select target project
  • tasmith4/loopy
  • ben_sepanski/loopy
  • arghdos/loopy
  • inducer/loopy
  • wence-/loopy
  • isuruf/loopy
  • fikl2/loopy
  • xywei/loopy
  • kaushikcfd/loopy
  • zweiner2/loopy
10 results
Show changes
with 1410 additions and 835 deletions
#! /bin/bash
# should be run in this directory (build-helpers)
if test "$1" = "--nodate"; then
TGT_NAME=loopy-centos6-$(date +"%Y-%m-%d")
echo "Generating $TGT_NAME..."
set -e
set -x
docker pull centos:6
CNT=$(docker create -t -v $(pwd):/mnt centos:6 /mnt/
echo "working in container $CNT"
docker start -i $CNT
docker cp $CNT:/tmp/build/loopy/dist/loopy $(pwd) || true
mv loopy $TGT_NAME
docker rm $CNT
#! /bin/bash
# run this from the loopy root directory
rm -Rf dist build
pyinstaller \
--workpath=build/pyinstaller \
#! /bin/bash
set -e
scp "$1"
#!/usr/bin/env python
import ctypes
from os import system
C_SRC = """
#include <stdlib.h>
#include <stdint.h>
int64_t cdiv(int64_t a, int64_t b)
return a/b;
int64_t cmod(int64_t a, int64_t b)
return a%b;
MACRO_NAME(int8, char) \
MACRO_NAME(int16, short) \
MACRO_NAME(int32, int) \
MACRO_NAME(int64, long long)
TYPE loopy_floor_div_##SUFFIX(TYPE a, TYPE b) \
{ \
if ((a<0) != (b<0)) \
a = a - (b + (b<0) - (b>=0)); \
return a/b; \
TYPE loopy_floor_div_pos_b_##SUFFIX(TYPE a, TYPE b) \
{ \
if (a<0) \
a = a - (b-1); \
return a/b; \
TYPE loopy_mod_pos_b_##SUFFIX(TYPE a, TYPE b) \
{ \
TYPE result = a%b; \
if (result < 0) \
result += b; \
return result; \
TYPE loopy_mod_##SUFFIX(TYPE a, TYPE b) \
{ \
TYPE result = a%b; \
if (result < 0 && b > 0) \
result += b; \
if (result > 0 && b < 0) \
result = result + b; \
return result; \
def main():
with open("int-experiments.c", "w") as outf:
system("gcc -Wall -shared int-experiments.c -o")
int_exp = ctypes.CDLL("")
for func in [
func.argtypes = [ctypes.c_longlong, ctypes.c_longlong]
func.restype = ctypes.c_longlong
cmod = int_exp.cmod
int_floor_div = int_exp.loopy_floor_div_int64
int_floor_div_pos_b = int_exp.loopy_floor_div_pos_b_int64
int_mod_pos_b = int_exp.loopy_mod_pos_b_int64
int_mod = int_exp.loopy_mod_int64
m = 50
for a in range(-m, m):
for b in range(1, m):
cresult = int_floor_div_pos_b(a, b)
presult = a // b
assert cresult == presult
if cresult != presult:
print(a, b, cresult, presult)
for a in range(-m, m):
for b in range(-m, m):
if b == 0:
cresult = int_floor_div(a, b)
presult = a // b
assert cresult == presult
if cresult != presult:
print(a, b, cresult, presult)
for a in range(-m, m):
for b in range(1, m):
cresult = int_mod_pos_b(a, b)
presult = a % b
assert cresult == presult
for a in range(-m, m):
for b in range(-m, m):
if b == 0:
cresult = int_mod(a, b)
presult = a % b
assert cresult == presult
if cresult != presult:
print(a, b, cresult, presult)
# print(int_mod(552, -918), 552 % -918)
print(cmod(23, -11), 23 % -11)
if __name__ == "__main__":
import numpy as np
# Inspired by a visualization used in the Halide tutorial
def div_ceil(nr, dr):
    """Return ``nr`` divided by ``dr``, rounded up, using floor division only."""
    # Floor division rounds toward negative infinity; applying it to the
    # negated numerator and negating the quotient rounds the other way.
    negated_quotient = (-nr) // dr
    return -negated_quotient
def product(iterable):
    """Return the product of all items of *iterable*, or 1 if it is empty."""
    from operator import mul

    # Left-to-right fold seeded with the multiplicative identity.
    result = 1
    for factor in iterable:
        result = mul(result, factor)
    return result
class ArrayAccessPatternContext:
def __init__(self, gsize, lsize, subgroup_size=32, decay_constant=0.75):
self.lsize = lsize
self.gsize = gsize
self.subgroup_size = subgroup_size
self.timestamp = 0
self.decay_constant = decay_constant
self.ind_length = len(gsize) + len(lsize)
self.arrays = []
def l(self, index): # noqa: E743
subscript = [np.newaxis] * self.ind_length
subscript[len(self.gsize) + index] = slice(None)
return np.arange(self.lsize[index])[tuple(subscript)]
def g(self, index):
subscript = [np.newaxis] * self.ind_length
subscript[index] = slice(None)
return np.arange(self.gsize[index])[tuple(subscript)]
def nsubgroups(self):
    """Return the product of ``lsize`` divided by ``subgroup_size``, rounded up."""
    total_items = product(self.lsize)
    return div_ceil(total_items, self.subgroup_size)
def animate(self, f, interval=200):
import matplotlib.animation as animation
import matplotlib.pyplot as plt
fig = plt.figure()
plots = []
for iary, ary in enumerate(self.arrays):
ax = fig.add_subplot(1, len(self.arrays), 1+iary)
def data_gen():
for _ in f():
for ary, plot in zip(self.arrays, plots):
yield plots
# must be kept alive until after
return animation.FuncAnimation(
fig, lambda x: x, data_gen,
blit=False, interval=interval, repeat=True)
def tick(self):
self.timestamp += 1
class Array:
def __init__(self, ctx, name, shape, strides, elements_per_row=None):
# Each array element stores a tuple:
# (timestamp, subgroup, g0, g1, g2, ) of last access
assert len(shape) == len(strides)
self.nattributes = 2+len(ctx.gsize)
if elements_per_row is None:
if len(shape) > 1:
minstride = min(strides)
for sh_i, st_i in zip(shape, strides):
if st_i == minstride:
elements_per_row = sh_i
elements_per_row = 256
self.array = np.zeros((product(shape), self.nattributes,), dtype=np.int32)
self.ctx = ctx = name
self.shape = shape
self.strides = strides
self.elements_per_row = elements_per_row
def __getitem__(self, index):
if not isinstance(index, tuple):
index = (index,)
assert len(index) == len(self.shape)
all_subscript = (np.newaxis,) * self.ctx.ind_length
def reshape_ind(ind):
if not isinstance(ind, np.ndarray):
return ind[all_subscript]
assert len(ind.shape) == self.ctx.ind_length
lin_index = sum(
ind_i * stride_i
for ind_i, stride_i in zip(index, self.strides))
if not isinstance(lin_index, np.ndarray):
subscript = [np.newaxis] * self.ctx.ind_length
lin_index = np.array(lin_index)[subscript]
self.array[lin_index, 0] = self.ctx.timestamp
for i, _glength in enumerate(self.ctx.gsize):
if lin_index.shape[i] > 1:
self.array[lin_index, 2+i] = self.ctx.g(i)
workitem_index = 0
for i in range(len(self.ctx.lsize))[::-1]:
workitem_index = (
workitem_index * self.ctx.lsize[i]
+ self.ctx.l(i))
subgroup = workitem_index//self.ctx.subgroup_size
self.array[lin_index, 1] = subgroup
def __setitem__(self, index, value):
def get_plot_data(self):
nelements = self.array.shape[0]
base_shape = (
div_ceil(nelements, self.elements_per_row),
shaped_array = np.zeros(
(*base_shape, self.nattributes),
shaped_array.reshape(-1, self.nattributes)[:nelements] = self.array
modulation = np.exp(
-self.ctx.decay_constant*(self.ctx.timestamp-shaped_array[:, :, 0]))
subgroup = shaped_array[:, :, 1]
if self.ctx.nsubgroups() > 1:
subgroup = subgroup/(self.ctx.nsubgroups()-1)
rgb_array = np.zeros((*base_shape, 3))
if 1:
if len(self.ctx.gsize) > 1:
# g.0 -> red
rgb_array[:, :, 0] = shaped_array[:, :, 2]/(self.ctx.gsize[0]-1)
if len(self.ctx.gsize) > 1:
# g.1 -> blue
rgb_array[:, :, 2] = shaped_array[:, :, 3]/(self.ctx.gsize[1]-1)
if 1:
rgb_array[:, :, 1] = subgroup
return rgb_array*modulation[:, :, np.newaxis]
def plot(self, ax, **kwargs):
return ax.imshow(
self.get_plot_data(), interpolation="nearest",
def show_example():
n = 2**7
n16 = div_ceil(n, 16)
ctx = ArrayAccessPatternContext(gsize=(n16, n16), lsize=(16, 16))
in0 = Array(ctx, "in0", (n, n), (n, 1))
if 0:
# knl a
i_inner = ctx.l(1)
i_outer = ctx.g(1)
k_inner = ctx.l(0)
def f():
for k_outer in range(n16):
in0[i_inner + i_outer*16, k_inner + k_outer*16]
elif 0:
# knl b
j_inner = ctx.l(0)
j_outer = ctx.g(0)
k_inner = ctx.l(1)
def f():
for k_outer in range(n16):
in0[k_inner + k_outer*16, j_inner + j_outer*16]
ani = ctx.animate(f)
import matplotlib.pyplot as plt
if 1:
def show_example_2():
bsize = 8
blocks = 3
ctx = ArrayAccessPatternContext(gsize=(1,), lsize=(1,),
in0 = Array(ctx, "in0", (blocks*bsize, blocks*bsize), (blocks*bsize, 1))
def f():
for i_outer in range(blocks):
for j_outer in range(blocks):
for i_inner in range(bsize):
for j_inner in range(bsize):
in0[i_inner + i_outer*bsize, j_inner + j_outer*bsize]
ani = ctx.animate(f, interval=10)
import matplotlib.pyplot as plt
if 1:
if __name__ == "__main__":
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
# You can set these variables from the command line. # You can set these variables from the command line.
SPHINXBUILD = python ` which sphinx-build` SPHINXBUILD = python `which sphinx-build`
BUILDDIR = _build BUILDDIR = _build
# -*- coding: utf-8 -*- import os
# from urllib.request import urlopen
# loopy documentation build configuration file, created by
# sphinx-quickstart on Tue Aug 9 13:40:49 2011.
# This file is execfile()d with the current directory set to its containing dir.
# Note that not all possible configuration values are present in this
# autogenerated file.
# All configuration values have a default; values that are commented out
# serve to show the default.
#import sys, os
# If extensions (or modules to document with autodoc) are in another directory, _conf_url = ""
# add these directories to sys.path here. If the directory is relative to the with urlopen(_conf_url) as _inf:
# documentation root, use os.path.abspath to make it absolute, like shown here. exec(compile(, _conf_url, "exec"), globals())
#sys.path.insert(0, os.path.abspath('.'))
# -- General configuration ----------------------------------------------------- copyright = "2016, Andreas Klöckner"
# If your documentation needs a minimal Sphinx version, state it here.
#needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be extensions
# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
extensions = [
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# The suffix of source filenames.
source_suffix = '.rst'
# The encoding of source files.
#source_encoding = 'utf-8-sig'
# The master toctree document.
master_doc = 'index'
# General information about the project.
project = u'loopy'
copyright = u'2016, Andreas Klöckner'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
# The short X.Y version. # The short X.Y version.
ver_dic = {} ver_dic = {}
_version_source = "../loopy/" _version_source = "../loopy/"
with open(_version_source) as vpy_file: with open(_version_source) as vpy_file:
version_py = version_py =
exec(compile(version_py, _version_source, 'exec'), ver_dic) os.environ["AKPYTHON_EXEC_IMPORT_UNAVAILABLE"] = "1"
exec(compile(version_py, _version_source, "exec"), ver_dic)
version = ".".join(str(x) for x in ver_dic["VERSION"]) version = ".".join(str(x) for x in ver_dic["VERSION"])
# The full version, including alpha/beta/rc tags. # The full version, including alpha/beta/rc tags.
release = ver_dic["VERSION_TEXT"] release = ver_dic["VERSION_TEXT"]
# The language for content autogenerated by Sphinx. Refer to documentation exclude_patterns = ["_build"]
# for a list of supported languages.
#language = None
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
#today = ''
# Else, today_fmt is used as the format for a strftime call.
#today_fmt = '%B %d, %Y'
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = ['_build']
# The reST default role (used for this markup: `text`) to use for all documents.
#default_role = None
# If true, '()' will be appended to :func: etc. cross-reference text.
#add_function_parentheses = True
# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
#add_module_names = True
# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
#show_authors = False
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
# A list of ignored prefixes for module index sorting. # Example configuration for intersphinx: refer to the Python standard library.
#modindex_common_prefix = [] intersphinx_mapping = {
"python": ("", None),
"numpy": ("", None),
# -- Options for HTML output --------------------------------------------------- "pytools": ("", None),
"islpy": ("", None),
html_theme = "alabaster" "pyopencl": ("", None),
"cgen": ("", None),
html_theme_options = { "pymbolic": ("", None),
"extra_nav_links": { "constantdict": ("", None),
"🚀 Github": "",
"💾 Download Releases": "",
} }
html_sidebars = { nitpicky = True
'**': [
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#html_theme_options = {}
# Add any paths that contain custom themes here, relative to this directory.
#html_theme_path = []
# The name for this set of Sphinx documents. If None, it defaults to
# "<project> v<release> documentation".
#html_title = None
# A shorter title for the navigation bar. Default is the same as html_title.
#html_short_title = None
# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
#html_logo = None
# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
#html_favicon = None
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
#html_last_updated_fmt = '%b %d, %Y'
# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
#html_use_smartypants = True
# Custom sidebar templates, maps document names to template names. nitpick_ignore_regex = [
#html_sidebars = {} ["py:class", r"typing_extensions\.(.+)"],
["py:class", r"numpy\.u?int[0-9]+"],
["py:class", r"numpy\.float[0-9]+"],
["py:class", r"numpy\.complex[0-9]+"],
# Additional templates that should be rendered to pages, maps page names to # Reference not found from "<unknown>"? I'm not even sure where to look.
# template names. ["py:class", r"ExpressionNode"],
#html_additional_pages = {}
# If false, no module index is generated. # Type aliases
#html_domain_indices = True ["py:class", r"InameStr"],
["py:class", r"ConcreteCallablesTable"],
# If false, no index is generated. ["py:class", r"LoopNestTree"],
#html_use_index = True ["py:class", r"LoopTree"],
["py:class", r"ToLoopyTypeConvertible"],
# If true, the index is split into individual pages for each letter. ["py:class", r"ToStackMatchConvertible"],
#html_split_index = False ]
# If true, links to the reST sources are added to the pages.
html_show_sourcelink = False
# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
#html_show_sphinx = True
# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
#html_show_copyright = True
# If true, an OpenSearch description file will be output, and all pages will
# contain a <link> tag referring to it. The value of this option must be the
# base URL from which the finished HTML is served.
#html_use_opensearch = ''
# This is the file name suffix for HTML files (e.g. ".xhtml").
#html_file_suffix = None
# Output file base name for HTML help builder.
htmlhelp_basename = 'loopydoc'
# -- Options for LaTeX output --------------------------------------------------
# The paper size ('letter' or 'a4').
#latex_paper_size = 'letter'
# The font size ('10pt', '11pt' or '12pt').
#latex_font_size = '10pt'
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title, author, documentclass [howto/manual]).
latex_documents = [
('index', 'loopy.tex', u'loopy Documentation',
u'Andreas Kloeckner', 'manual'),
# The name of an image file (relative to this directory) to place at the top of
# the title page.
#latex_logo = None
# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
#latex_use_parts = False
# If true, show page references after internal links.
#latex_show_pagerefs = False
# If true, show URL addresses after external links.
#latex_show_urls = False
# Additional stuff for the LaTeX preamble.
#latex_preamble = ''
# Documents to append as an appendix to all manuals.
#latex_appendices = []
# If false, no module index is generated.
#latex_domain_indices = True
# -- Options for manual page output --------------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
('index', 'loopy', u'loopy Documentation',
[u'Andreas Kloeckner'], 1)
# Example configuration for intersphinx: refer to the Python standard library.
intersphinx_mapping = {
'': None,
'': None,
'': None,
'': None,
'': None,
autoclass_content = "class"
...@@ -18,43 +18,20 @@ When you run this script, the following kernel is generated, compiled, and execu ...@@ -18,43 +18,20 @@ When you run this script, the following kernel is generated, compiled, and execu
(See the full example for how to print the generated code.) (See the full example for how to print the generated code.)
Want to try out loopy? .. _static-binary:
There's no need to go through :ref:`installation` if you'd just like to get a
feel for what loopy is. Instead, you may
`download a self-contained Linux binary <>`_.
This is purposefully built on an ancient Linux distribution, so it should work
on most versions of Linux that are currently out there.
Once you have the binary, do the following::
chmod +x ./loopy-centos6
./loopy-centos6 --target=opencl
./loopy-centos6 --target=cuda
./loopy-centos6 --target=ispc
Grab the example here: :download:`examples/python/ <../examples/python/>`.
You may also download the most recent version by going to the `list of builds
<>`_, clicking on the newest one
of type "CentOS binary", clicking on "Browse" under "Build Artifacts", then
navigating to "build-helpers", and downloading the binary from there.
Places on the web related to Loopy Places on the web related to Loopy
---------------------------------- ----------------------------------
* `Python package index <>`_ (download releases) Note the extra '.' in the PyPI identifier! * `Python package index <>`_ (download releases)
* `Github <>`_ (get latest source code, file bugs)
* `Github <>`_ (get latest source code, file bugs) * `Homepage <>`_
* `Wiki <>`_ (read installation tips, get examples, read FAQ)
* `Homepage <>`_
Table of Contents Table of Contents
----------------- -----------------
If you're only just learning about loopy, consider the following `paper If you're only just learning about loopy, consider the following `paper
<>`_ on that may serve as a good <>`_ on loopy that may serve as a good
introduction. introduction.
Please check :ref:`installation` to get started. Please check :ref:`installation` to get started.
...@@ -65,9 +42,14 @@ Please check :ref:`installation` to get started. ...@@ -65,9 +42,14 @@ Please check :ref:`installation` to get started.
tutorial tutorial
ref_creation ref_creation
ref_kernel ref_kernel
ref_transform ref_transform
ref_other ref_other
misc misc
🚀 Github <>
💾 Download Releases <>
Indices and tables Indices and tables
================== ==================
...@@ -3,11 +3,21 @@ ...@@ -3,11 +3,21 @@
Installation Installation
============ ============
This command should install :mod:`loopy`:: Option 0: Static Binary
If you would just like to experiment with :mod:`loopy`'s code transformation
abilities, the easiest way to get loopy is to download a statically-linked
Linux binary.
pip install See :ref:`static-binary` for details.
(Note the extra "."!) Option 1: From Source, no PyOpenCL integration
This command should install :mod:`loopy`::
pip install loopy
You may need to run this with :command:`sudo`. You may need to run this with :command:`sudo`.
If you don't already have `pip <>`_, If you don't already have `pip <>`_,
...@@ -17,19 +27,66 @@ run this beforehand:: ...@@ -17,19 +27,66 @@ run this beforehand::
python python
For a more manual installation, `download the source For a more manual installation, `download the source
<>`_, unpack it, and say:: <>`_, unpack it, and say::
python install python install
You may also clone its git repository:: You may also clone its git repository::
git clone --recursive git:// git clone --recursive
git clone --recursive
Option 2: From Conda Forge, with PyOpenCL integration
This set of instructions is intended for computers running 64-bit Linux
or macOS:
#. Make sure your system has the basics to build software.
On Debian derivatives (Ubuntu and many more),
installing ``build-essential`` should do the trick.
Everywhere else, just making sure you have the ``g++`` package should be enough.
#. Install `miniforge <>`_.
#. ``export CONDA=/WHERE/YOU/INSTALLED/miniforge3``
If you accepted the default location, this should work:
``export CONDA=$HOME/miniforge3``
#. ``$CONDA/bin/conda create -n dev``
#. ``source $CONDA/bin/activate dev``
#. ``conda install git pip pocl islpy pyopencl`` (Linux)
``conda install osx-pocl-opencl git pip pocl islpy pyopencl`` (OS X)
#. Type the following command::
pip install git+
Next time you want to use :mod:`loopy`, just run the following command::
source /WHERE/YOU/INSTALLED/miniforge3/bin/activate dev
You may also like to add this to a startup file (like :file:`$HOME/.bashrc`) or create an alias for it.
See the `PyOpenCL installation instructions
<>`_ for options
regarding OpenCL drivers.
User-visible Changes User-visible Changes
==================== ====================
Version 2016.2 See also :ref:`language-versioning`.
Version 2018.1
-------------- --------------
.. note:: .. note::
...@@ -57,7 +114,7 @@ Licensing ...@@ -57,7 +114,7 @@ Licensing
Loopy is licensed to you under the MIT/X Consortium license: Loopy is licensed to you under the MIT/X Consortium license:
Copyright (c) 2009-13 Andreas Klöckner and Contributors. Copyright (c) 2009-17 Andreas Klöckner and Contributors.
Permission is hereby granted, free of charge, to any person Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation obtaining a copy of this software and associated documentation
...@@ -101,7 +158,7 @@ In the meantime, you can generate code simply by saying:: ...@@ -101,7 +158,7 @@ In the meantime, you can generate code simply by saying::
print(cg_result.host_code()) print(cg_result.host_code())
print(cg_result.device_code()) print(cg_result.device_code())
Additionally, for C-based languages, header defintions are available via:: Additionally, for C-based languages, header definitions are available via::
loopy.generate_header(knl) loopy.generate_header(knl)
...@@ -131,7 +188,7 @@ source of examples. Here are some links: ...@@ -131,7 +188,7 @@ source of examples. Here are some links:
Here's a more complicated example of a loopy code: Here's a more complicated example of a loopy code:
.. literalinclude:: ../examples/python/ .. literalinclude:: ../examples/python/
:language: c :language: python
This example is included in the :mod:`loopy` distribution as This example is included in the :mod:`loopy` distribution as
:download:`examples/python/ <../examples/python/>`. :download:`examples/python/ <../examples/python/>`.
...@@ -193,7 +250,7 @@ This list is always growing, but here are a few pointers: ...@@ -193,7 +250,7 @@ This list is always growing, but here are a few pointers:
* Precompute subexpressions: * Precompute subexpressions:
Use a :ref:`substitution-rule` to assign a name to a subexpression, Use a :ref:`substitution-rule` to assign a name to a subexpression,
using may be :func:`loopy.assignment_to_subst` or :func:`extract_subst`. using may be :func:`loopy.assignment_to_subst` or :func:`loopy.extract_subst`.
Then use :func:`loopy.precompute` to create an (array or scalar) Then use :func:`loopy.precompute` to create an (array or scalar)
temporary with precomputed values. temporary with precomputed values.
...@@ -232,12 +289,12 @@ This list is always growing, but here are a few pointers: ...@@ -232,12 +289,12 @@ This list is always growing, but here are a few pointers:
Use :func:`loopy.tag_inames` with the ``"vec"`` iname tag. Use :func:`loopy.tag_inames` with the ``"vec"`` iname tag.
Note that the corresponding axis of an array must Note that the corresponding axis of an array must
also be tagged using the ``"vec"`` array axis tag also be tagged using the ``"vec"`` array axis tag
(using :func:`tag_array_axes`) in order for vector code to be (using :func:`loopy.tag_array_axes`) in order for vector code to be
generated. generated.
Vectorized loops (and array axes) must have a fixed size. (See either Vectorized loops (and array axes) must have a fixed size. (See either
:func:`split_iname` or :func:`fix_parameters` along with :func:`loopy.split_iname` or :func:`loopy.fix_parameters` along with
:func:`split_array_axis`.) :func:`loopy.split_array_axis`.)
* Reuse of Temporary Storage * Reuse of Temporary Storage
...@@ -246,7 +303,7 @@ This list is always growing, but here are a few pointers: ...@@ -246,7 +303,7 @@ This list is always growing, but here are a few pointers:
* SoA $\leftrightarrow$ AoS * SoA $\leftrightarrow$ AoS
Use :func:`tag_array_axes` with the ``"sep"`` array axis tag Use :func:`loopy.tag_array_axes` with the ``"sep"`` array axis tag
to generate separate arrays for each entry of a short, fixed-length to generate separate arrays for each entry of a short, fixed-length
array axis. array axis.
...@@ -257,7 +314,7 @@ This list is always growing, but here are a few pointers: ...@@ -257,7 +314,7 @@ This list is always growing, but here are a few pointers:
Use :func:`loopy.tag_inames` with the ``"ilp"`` tag. Use :func:`loopy.tag_inames` with the ``"ilp"`` tag.
ILP loops must have a fixed size. (See either ILP loops must have a fixed size. (See either
:func:`split_iname` or :func:`fix_parameters`.) :func:`loopy.split_iname` or :func:`loopy.fix_parameters`.)
* Type inference * Type inference
...@@ -275,12 +332,69 @@ This list is always growing, but here are a few pointers: ...@@ -275,12 +332,69 @@ This list is always growing, but here are a few pointers:
* Interface with your own library functions * Interface with your own library functions
Use :func:`loopy.register_function_manglers`. See :ref:`func-interface` for details.
* Loop collapse * Loop collapse
Use :func:`loopy.join_inames`. Use :func:`loopy.join_inames`.
In what sense does Loopy support vectorization?
There are really two ways in which the OpenCL/CUDA model of computation exposes vectorization:
* "SIMT": The user writes scalar program instances and either the compiler or
the hardware joins the individual program instances into vectors of a
hardware-given length for execution.
* "Short vectors": This type of vectorization is based on vector types,
e.g. ``float4``, which support arithmetic with implicit vector semantics
as well as a number of 'intrinsic' functions.
Loopy supports both. The first one, SIMT, is accessible by tagging inames with,
e.g., ``l.0``. Accessing the second one requires using both execution- and
data-reshaping capabilities in loopy. To start with, you need an array that
has an axis with the length of the desired vector. If that's not yet available,
you may use :func:`loopy.split_array_axis` to produce one. Similarly, you need
an iname whose bounds match those of the desired vector length. Again, if you
don't already have one, :func:`loopy.split_iname` will easily produce one.
Lastly, both the array axis and the iname need the implementation tag ``"vec"``.
Here is an example of this machinery in action:
.. literalinclude:: ../examples/python/
:language: python
Note how the example slices off the last 'slab' of iterations to ensure that
the bulk of the iteration does not require conditionals which would prevent
successful vectorization. This generates the following code:
.. literalinclude:: ../examples/python/
:language: c
What is the story with language versioning?
The idea is to keep supporting multiple versions at a time. There's a
tension in loopy between the need to build code that keeps working
unchanged for some number of years, and needing the language to
evolve--not just as a research vehicle, but also to enable it to respond
to emerging needs in applications and hardware.
The idea is not to support all versions indefinitely, merely to allow
users to upgrade on their own schedule on the scale of a couple years.
Warnings about needing to upgrade would get noisier as a version nears
deprecation. In a way, it is intended to be a version of Python's
`__future__` flags, which IMO have served the language tremendously well.
One can also obtain the current language version programmatically:
But pinning your code to that would mean choosing to not use the
potentially valuable guarantee to keep existing code working unchanged
for a while. Instead, it might be wiser to just grab the version of the
language current at the time of writing the code.
Uh-oh. I got a scheduling error. Any hints? Uh-oh. I got a scheduling error. Any hints?
------------------------------------------- -------------------------------------------
...@@ -325,7 +439,7 @@ If you use loopy for your work and find its approach helpful, please ...@@ -325,7 +439,7 @@ If you use loopy for your work and find its approach helpful, please
consider citing the following article. consider citing the following article.
A. Klöckner. ` transformation-based code generation for GPUs and A. Klöckner. ` transformation-based code generation for GPUs and
CPUs <>`_. Proceedings of ARRAY '14: ACM CPUs <>`_. Proceedings of ARRAY '14: ACM
SIGPLAN Workshop on Libraries, Languages, and Compilers for Array SIGPLAN Workshop on Libraries, Languages, and Compilers for Array
Programming. Edinburgh, Scotland. Programming. Edinburgh, Scotland.
...@@ -342,14 +456,33 @@ Here's a Bibtex entry for your convenience:: ...@@ -342,14 +456,33 @@ Here's a Bibtex entry for your convenience::
doi = "{10.1145/2627373.2627387}", doi = "{10.1145/2627373.2627387}",
} }
Getting help
Email the friendly folks on the `loopy mailing list <>`_.
Acknowledgments Acknowledgments
=============== ===============
Andreas Klöckner's work on :mod:`loopy` was supported in part by Work on loopy was supported in part by
- the Department of Energy, National Nuclear Security Administration, under Award Number DE-NA0003963,
- the US Navy ONR, under grant number N00014-14-1-0117, and
- the US National Science Foundation under grant numbers DMS-1418961, CCF-1524433, DMS-1654756, SHF-1911019, and OAC-1931577.
AK also gratefully acknowledges a hardware gift from Nvidia Corporation.
The views and opinions expressed herein do not necessarily reflect those of the funding agencies.
Cross-References to Other Documentation
.. currentmodule:: numpy
.. class:: int16
See :class:`numpy.generic`.
* US Navy ONR grant number N00014-14-1-0117 .. class:: complex128
* the US National Science Foundation under grant numbers DMS-1418961 and CCF-1524433.
AK also gratefully acknowledges a hardware gift from Nvidia Corporation. The See :class:`numpy.generic`.
views and opinions expressed herein do not necessarily reflect those of the
funding agencies.
.. currentmodule:: loopy
.. _func-interface:
Function Interface
Resolving and specialization
In :mod:`loopy`, a :class:`loopy.TranslationUnit` is a collection of callables
and entrypoints. Callables are of type
:class:`loopy.kernel.function_interface.InKernelCallable`. Functions start life
as simple :class:`pymbolic.primitives.Call` nodes. Call resolution turns the function
identifiers in those calls into :class:`~loopy.symbolic.ResolvedFunction` objects.
Each resolved function has an entry in :attr:`TranslationUnit.callables_table`.
The process of realizing a function as an
:class:`~loopy.kernel.function_interface.InKernelCallable` is referred to as *resolving*.
During code generation for a :class:`~loopy.TranslationUnit`, a (resolved) callable
is *specialized* depending on the types and shapes of the arguments passed at a
call site. For example, a call to ``sin(x)`` in :mod:`loopy` is type-generic to
begin with, but it is later specialized to either ``sinf``, ``sin`` or ``sinl``
depending on the type of its argument ``x``. A callable's behavior during type
or shape specialization is encoded via
:meth:`~loopy.InKernelCallable.with_types` and
:meth:`~loopy.InKernelCallable.with_descrs`.
Registering callables
A user can *register* callables within a :class:`~loopy.TranslationUnit` to
allow loopy to resolve calls not pre-defined in :mod:`loopy`. In :mod:`loopy`,
we typically aim to expose all the standard math functions defined for
a :class:`loopy.target.TargetBase`. Other foreign functions could be invoked by
*registering* them.
An example demonstrating registering a ``CBlasGemv`` as a loopy callable:
.. literalinclude:: ../examples/python/
Call Instruction for a kernel call
At a call-site involving a call to a :class:`loopy.LoopKernel`, the arguments to
the call must be ordered by the order of input arguments of the callee kernel.
Similarly, the assignees must be ordered by the order of callee kernel's output
arguments. Since a :class:`loopy.KernelArgument` can be both an
input and an output, such arguments would be a part of the call instruction's
assignees as well as the call expression node's parameters.
Entry points
Only callables in :attr:`loopy.TranslationUnit.entrypoints` can be called from
the outside. All other callables are only visible from within the translation
unit, similar to C's ``static`` functions.
.. automodule:: loopy.kernel.function_interface
.. module:: loopy .. currentmodule:: loopy
.. moduleauthor:: Andreas Kloeckner <>
.. _creating-kernels: .. _creating-kernels:
Reference: Creating Kernels Reference: Creating Kernels
...@@ -30,4 +28,11 @@ To Copy between Data Formats ...@@ -30,4 +28,11 @@ To Copy between Data Formats
.. autofunction:: make_copy_kernel .. autofunction:: make_copy_kernel
Einstein summation convention kernels
.. autofunction:: make_einsum
.. automodule:: loopy.version
.. vim: tw=75:spell:fdm=marker .. vim: tw=75:spell:fdm=marker
Reference: Documentation for Internal API
See also :ref:`targets`.
.. automodule::
See also :ref:`expression-syntax`.
.. automodule:: loopy.symbolic
DTypes of variables in a :class:`loopy.LoopKernel` must be picklable, so in
the codegen pipeline user-provided types are converted to
:class:`loopy.types.LoopyType`.
.. automodule:: loopy.types
Type inference
.. automodule:: loopy.type_inference
.. automodule:: loopy.codegen
Reduction Operation
.. automodule:: loopy.library.reduction
Iname Tags
.. automodule::
.. automodule:: loopy.kernel.array
.. automodule:: loopy.check
.. automodule:: loopy.schedule
.. automodule::
.. automodule:: loopy.schedule.tree
...@@ -3,6 +3,72 @@ ...@@ -3,6 +3,72 @@
Reference: Loopy's Model of a Kernel Reference: Loopy's Model of a Kernel
==================================== ====================================
What Types of Computation can a Loopy Program Express?
Loopy programs consist of an a-priori unordered set of statements, operating
on :math:`n`-dimensional array variables.
Arrays consist of "plain old data" and structures thereof, as describable
by a :class:`numpy.dtype`. The n-dimensional shape of these arrays is
given by a tuple of expressions at most affine in parameters that are
fixed for the duration of program execution.
Each array variable in the program is either an argument or a temporary
variable. A temporary variable is only live within the program, while
argument variables are accessible outside the program and constitute the
program's inputs and outputs.
A statement (still called 'instruction' in some places, cf.
:class:`loopy.InstructionBase`) encodes an assignment to an entry of an array.
The right-hand side of an assignment consists of an expression that may
consist of arithmetic operations and calls to functions.
If the outermost operation of the RHS expression is a function call,
the RHS value may be a tuple, and multiple (still scalar) arrays appear
as LHS values. (This is the only sense in which tuple types are supported.)
Each statement is parameterized by zero or more loop variables ("inames").
A statement is executed once for each integer point defined by the domain
forest for the iname tuple given for that statement
(:attr:`loopy.InstructionBase.within_inames`). Each execution of a
statement (with specific values of the inames) is called a *statement
instance*. Dependencies between these instances as well as instances of
other statements are encoded in the program representation and specify permissible
execution orderings. (The semantics of the dependencies are `being
sharpened <>`__.) Assignments
(comprising the evaluation of the RHS and the assignment to the LHS) may
be specified to be atomic.
The basic building blocks of the domain forest are sets given as
conjunctions of equalities and inequalities of quasi-affine expressions on
integer tuples, called domains, and represented as instances of
:class:`islpy.BasicSet`. The entries of each integer tuple are
either *parameters* or *inames*. Each domain may optionally have a *parent
domain*. Parameters of parent-less domains are given by value arguments
supplied to the program that will remain unchanged during program
execution. Parameters of domains with parents may be
- run-time-constant value arguments to the program, or
- inames from parent domains, or
- scalar, integer temporary variables that are written by statements
with iteration domains controlled by a parent domain.
For each tuple of concrete parameter values, the set of iname tuples must be
finite. Each iname is defined by exactly one domain.
For a tuple of inames, the domain forest defines an iteration domain
by finding all the domains defining the inames involved, along with their
parent domains. The resulting tree of domains may contain multiple roots,
but no branches. The iteration domain is then constructed by intersecting
these domains and constructing the projection of that set onto the space
given by the required iname tuple. Observe that, via the parent-child
domain mechanism, imperfectly-nested and data-dependent loops become expressible.
The set of functions callable from the language is predefined by the system.
Additional functions may be defined by the user by registering them. It is
not currently possible to define functions from within Loopy, however work
is progressing on permitting this. Even once this is allowed, recursion
will not be permitted.
.. _domain-tree: .. _domain-tree:
Loop Domain Forest Loop Domain Forest
...@@ -130,9 +196,12 @@ Iname Implementation Tags ...@@ -130,9 +196,12 @@ Iname Implementation Tags
Tag Meaning Tag Meaning
=============================== ==================================================== =============================== ====================================================
``None`` | ``"for"`` Sequential loop ``None`` | ``"for"`` Sequential loop
``"ord"`` Forced-order sequential loop
``"l.N"`` Local (intra-group) axis N ("local") ``"l.N"`` Local (intra-group) axis N ("local")
``"g.N"`` Group-number axis N ("group") ``"g.N"`` Group-number axis N ("group")
``"unr"`` Unroll ``"unr"`` Unroll
``"unr_hint"`` Unroll using compiler directives
``"unr_hint.N"`` Unroll at most N times using compiler directives
``"ilp"`` | ``"ilp.unr"`` Unroll using instruction-level parallelism ``"ilp"`` | ``"ilp.unr"`` Unroll using instruction-level parallelism
``"ilp.seq"`` Realize parallel iname as innermost loop ``"ilp.seq"`` Realize parallel iname as innermost loop
``"like.INAME"`` Can be used when tagging inames to tag like another ``"like.INAME"`` Can be used when tagging inames to tag like another
...@@ -150,6 +219,42 @@ Tag Meaning ...@@ -150,6 +219,42 @@ Tag Meaning
.. }}} .. }}}
Reserved Identifiers
The identifier prefix ``_lp_`` is reserved for internal usage; when creating
*inames*, *argument names*, *temporary variable names*, *substitution rule
names*, *instruction IDs*, and other identifiers, users should *not* use names
beginning with ``_lp_``. This prefix is used for identifiers created
internally when operating on Loopy's kernel IR. For Loopy developers, further
information on name prefixes used within submodules is below.
Identifier Registry
Functionality in :mod:`loopy` *must* use identifiers beginning with ``_lp_`` for
all internally-created identifiers. Additionally, each name beginning with
``_lp_`` must start with one of the reserved prefixes below. New prefixes may
be registered by adding them to the table below. New prefixes may not themselves
be the prefix of an existing prefix.
**Reserved Identifier Prefixes**
======================= ==================================
Reserved Prefix Usage (module or purpose)
======================= ==================================
``_lp_linchk_`` ``loopy.linearization.checker``
======================= ==================================
.. note::
Existing Loopy code may not yet fully satisfy these naming requirements.
Name changes are in progress, and prefixes will be added to this registry
as they are created.
.. _instructions: .. _instructions:
Instructions Instructions
...@@ -157,6 +262,7 @@ Instructions ...@@ -157,6 +262,7 @@ Instructions
.. {{{ .. {{{
.. autoclass:: HappensAfter
.. autoclass:: InstructionBase .. autoclass:: InstructionBase
.. _assignments: .. _assignments:
...@@ -326,15 +432,30 @@ Expressions ...@@ -326,15 +432,30 @@ Expressions
Loopy's expressions are a slight superset of the expressions supported by Loopy's expressions are a slight superset of the expressions supported by
:mod:`pymbolic`. :mod:`pymbolic`.
* ``if`` * ``if(cond, then, else_)``
* ``elif`` (following an ``if``)
* ``else`` (following an ``if`` / ``elif``) * ``a[[ 8*i + j ]]``: Linear subscripts.
See :class:`loopy.symbolic.LinearSubscript`.
* ``reductions`` * ``reductions``
* duplication of reduction inames See :class:`loopy.symbolic.Reduction`.
* ``reduce`` vs ``simul_reduce`` * ``reduce`` vs ``simul_reduce``
* complex-valued arithmetic * complex-valued arithmetic
* tagging of array access and substitution rule use ("$") * tagging of array access and substitution rule use ("$")
See :class:`loopy.symbolic.TaggedVariable`.
* ``indexof``, ``indexof_vec`` * ``indexof``, ``indexof_vec``
* ``cast(type, value)``: No parse syntax currently.
See :class:`loopy.symbolic.TypeCast`.
* If constants in expressions are subclasses of :class:`numpy.generic`,
generated code will contain literals of exactly that type, making them
*explicitly typed*. Constants given as Python types such as :class:`int`,
:class:`float` or :class:`complex` are called *implicitly* typed and
adapt to the type of the expected result.
TODO: Functions TODO: Functions
TODO: Reductions TODO: Reductions
...@@ -342,6 +463,7 @@ TODO: Reductions ...@@ -342,6 +463,7 @@ TODO: Reductions
Function Call Instructions Function Call Instructions
^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^
.. automodule:: loopy
.. autoclass:: CallInstruction .. autoclass:: CallInstruction
C Block Instructions C Block Instructions
...@@ -352,12 +474,14 @@ C Block Instructions ...@@ -352,12 +474,14 @@ C Block Instructions
Atomic Operations Atomic Operations
^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^
.. autoclass:: memory_ordering .. autoclass:: MemoryOrdering
.. autoclass:: memory_scope .. autoclass:: MemoryScope
.. autoclass:: VarAtomicity .. autoclass:: VarAtomicity
.. autoclass:: OrderedAtomic
.. autoclass:: AtomicInit .. autoclass:: AtomicInit
.. autoclass:: AtomicUpdate .. autoclass:: AtomicUpdate
...@@ -372,6 +496,12 @@ Barrier Instructions ...@@ -372,6 +496,12 @@ Barrier Instructions
.. autoclass:: BarrierInstruction .. autoclass:: BarrierInstruction
Instruction Tags
.. autoclass:: LegacyStringInstructionTag
.. autoclass:: UseStreamingStoreTag
.. }}} .. }}}
Data: Arguments and Temporaries Data: Arguments and Temporaries
...@@ -388,24 +518,14 @@ Arguments ...@@ -388,24 +518,14 @@ Arguments
^^^^^^^^^ ^^^^^^^^^
.. autoclass:: KernelArgument .. autoclass:: KernelArgument
.. autoclass:: ValueArg .. autoclass:: ValueArg
.. autoclass:: GlobalArg .. autoclass:: ArrayArg
.. autoclass:: ConstantArg .. autoclass:: ConstantArg
.. autoclass:: ImageArg .. autoclass:: ImageArg
.. _temporaries: .. _temporaries:
...@@ -415,11 +535,9 @@ Temporary Variables ...@@ -415,11 +535,9 @@ Temporary Variables
Temporary variables model OpenCL's ``private`` and ``local`` address spaces. Both Temporary variables model OpenCL's ``private`` and ``local`` address spaces. Both
have the lifetime of a kernel invocation. have the lifetime of a kernel invocation.
.. autoclass:: temp_var_scope .. autoclass:: AddressSpace
.. autoclass:: TemporaryVariable .. autoclass:: TemporaryVariable
.. _types: .. _types:
...@@ -538,10 +656,10 @@ Helper values ...@@ -538,10 +656,10 @@ Helper values
.. {{{ .. {{{
.. autoclass:: auto
.. autoclass:: UniqueName .. autoclass:: UniqueName
.. autoclass:: Optional
.. }}} .. }}}
Libraries: Extending and Interfacing with External Functionality Libraries: Extending and Interfacing with External Functionality
...@@ -573,10 +691,18 @@ The Kernel Object ...@@ -573,10 +691,18 @@ The Kernel Object
Do not create :class:`LoopKernel` objects directly. Instead, refer to Do not create :class:`LoopKernel` objects directly. Instead, refer to
:ref:`creating-kernels`. :ref:`creating-kernels`.
.. autoclass:: LoopKernel .. automodule:: loopy.kernel
Implementation Details: The Base Array
All array-like data in :mod:`loopy` (such as :class:`ArrayArg` and
:class:`TemporaryVariable`) derive from a single, shared base array type,
described next.
.. currentmodule:: loopy.kernel.array
.. autoclass:: ArrayBase
.. autoclass:: kernel_state
.. vim: tw=75:spell:fdm=marker .. vim: tw=75:spell:fdm=marker
Reference: Other Functionality Reference: Other Functionality
============================== ==============================
Auxiliary Data Types
.. automodule:: loopy.typing
Obtaining Kernel Performance Statistics Obtaining Kernel Performance Statistics
--------------------------------------- ---------------------------------------
...@@ -9,6 +14,24 @@ Obtaining Kernel Performance Statistics ...@@ -9,6 +14,24 @@ Obtaining Kernel Performance Statistics
Controlling caching Controlling caching
------------------- -------------------
.. envvar:: LOOPY_NO_CACHE
.. envvar:: CG_NO_CACHE
By default, loopy will cache (on disk) the result of various stages
of code generation to speed up future code generation of the same kernel.
By setting the environment variables :envvar:`LOOPY_NO_CACHE` or
:envvar:`CG_NO_CACHE` to any
string that :func:`pytools.strtobool` evaluates as ``True``, this caching
is suppressed.
If set to a string that :func:`pytools.strtobool` evaluates as ``True``,
loopy will raise an exception if a cache miss occurs. This can be useful
for debugging cache-related issues. For example, it can be used to automatically test whether caching is successful for a particular code, by setting this variable to ``True`` and re-running the code.
.. autofunction:: set_caching_enabled .. autofunction:: set_caching_enabled
.. autoclass:: CacheMode .. autoclass:: CacheMode
...@@ -16,10 +39,11 @@ Controlling caching ...@@ -16,10 +39,11 @@ Controlling caching
Running Kernels Running Kernels
--------------- ---------------
In addition to simply calling kernels using :class:`LoopKernel.__call__`, Use :class:`TranslationUnit.executor` to bind a translation unit
the following underlying functionality may be used: to execution resources, and then use :class:`ExecutorBase.__call__`
to invoke the kernel.
.. autoclass:: CompiledKernel .. autoclass:: ExecutorBase
Automatic Testing Automatic Testing
----------------- -----------------
...@@ -44,3 +68,4 @@ following always works:: ...@@ -44,3 +68,4 @@ following always works::
.. autofunction:: show_dependency_graph .. autofunction:: show_dependency_graph
.. autofunction:: t_unit_to_python
...@@ -50,6 +50,10 @@ Influencing data access ...@@ -50,6 +50,10 @@ Influencing data access
.. autofunction:: set_array_axis_names .. autofunction:: set_array_axis_names
.. automodule:: loopy.transform.privatize
.. autofunction:: allocate_temporaries_for_base_storage
Padding Data Padding Data
------------ ------------
...@@ -74,6 +78,8 @@ Manipulating Instructions ...@@ -74,6 +78,8 @@ Manipulating Instructions
.. autofunction:: add_nosync .. autofunction:: add_nosync
.. autofunction:: add_barrier
Registering Library Routines Registering Library Routines
---------------------------- ----------------------------
...@@ -83,8 +89,6 @@ Registering Library Routines ...@@ -83,8 +89,6 @@ Registering Library Routines
.. autofunction:: register_symbol_manglers .. autofunction:: register_symbol_manglers
.. autofunction:: register_function_manglers
Modifying Arguments Modifying Arguments
------------------- -------------------
...@@ -98,7 +102,7 @@ Modifying Arguments ...@@ -98,7 +102,7 @@ Modifying Arguments
.. autofunction:: rename_argument .. autofunction:: rename_argument
.. autofunction:: set_temporary_scope .. autofunction:: set_temporary_address_space
Creating Batches of Operations Creating Batches of Operations
------------------------------ ------------------------------
...@@ -114,7 +118,7 @@ Finishing up ...@@ -114,7 +118,7 @@ Finishing up
.. autofunction:: generate_loop_schedules .. autofunction:: generate_loop_schedules
.. autofunction:: get_one_scheduled_kernel .. autofunction:: get_one_linearized_kernel
.. autofunction:: save_and_reload_temporaries .. autofunction:: save_and_reload_temporaries
...@@ -140,4 +144,3 @@ TODO: Matching instruction tags ...@@ -140,4 +144,3 @@ TODO: Matching instruction tags
.. automodule:: loopy.match .. automodule:: loopy.match
.. vim: tw=75:spell .. vim: tw=75:spell
.. currentmodule:: loopy
Translation Units
.. automodule:: loopy.translation_unit
This diff is collapsed.
subroutine fill(out, a, n)
  ! Store a into each of the n entries of out, then multiply every entry
  ! by factor.
  !
  ! real_type and factor are not declared in this subroutine; the loopy
  ! transform section of this file passes definitions for both names to
  ! lp.c_preprocess before the source is parsed.
  implicit none

  real_type a, out(n)
  integer n, i

  ! First pass: initialise every entry.
  do i = 1, n
    out(i) = a
  end do

  ! Second pass: scale every entry in place.
  do i = 1, n
    out(i) = out(i) * factor
  end do
end
!$loopy begin
! SOURCE = lp.c_preprocess(SOURCE, [
! "factor 4.0",
! "real_type real*8",
! ])
! fill, = lp.parse_fortran(SOURCE, FILENAME)
! fill = lp.split_iname(fill, "i", 128,
! outer_tag="g.0", inner_tag="l.0")
! fill = lp.split_iname(fill, "i_1", 128,
! outer_tag="g.0", inner_tag="l.0")
! RESULT = [fill]
!$loopy end
! vim:filetype=floopy
{ {
"metadata": { "cells": [
"name": "", {
"signature": "sha256:c9f8334aa7aa4a5ad1437fa5871aafa52bbc9131271d9e90e7be47d22725cc94" "cell_type": "markdown",
}, "metadata": {},
"nbformat": 3, "source": [
"nbformat_minor": 0, "# Loopy IPython Integration Demo"
"worksheets": [ ]
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%load_ext loopy.ipython_ext"
"cell_type": "markdown",
"metadata": {},
"source": [
"## Without transform code"
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"subroutine fill(out, a, n)\n",
" implicit none\n",
" real*8 a, out(n)\n",
" integer n, i\n",
" do i = 1, n\n",
" out(i) = a\n",
" end do\n",
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(prog) # noqa: F821"
{ {
"cells": [ "cell_type": "markdown",
{ "metadata": {},
"cell_type": "markdown", "source": [
"metadata": {}, "## With transform code"
"source": [ ]
"# Loopy IPython Integration Demo" },
] {
}, "cell_type": "code",
{ "execution_count": null,
"cell_type": "code", "metadata": {
"collapsed": false, "collapsed": true
"input": [ },
"%load_ext loopy.ipython_ext" "outputs": [],
], "source": [
"language": "python", "split_amount = 128"
"metadata": {}, ]
"outputs": [], },
"prompt_number": 1 {
}, "cell_type": "code",
{ "execution_count": null,
"cell_type": "markdown", "metadata": {},
"metadata": {}, "outputs": [],
"source": [ "source": [
"## Without transform code" "%%transformed_fortran_kernel\n",
] "\n",
}, "subroutine tr_fill(out, a, n)\n",
{ " implicit none\n",
"cell_type": "code", "\n",
"collapsed": false, " real*8 a, out(n)\n",
"input": [ " integer n, i\n",
"%%fortran_kernel\n", "\n",
"\n", " do i = 1, n\n",
"subroutine fill(out, a, n)\n", " out(i) = a\n",
" implicit none\n", " end do\n",
"\n", "end\n",
" real*8 a, out(n)\n", "\n",
" integer n, i\n", "!$loopy begin\n",
"\n", "!\n",
" do i = 1, n\n", "! tr_fill = lp.parse_fortran(SOURCE)\n",
" out(i) = a\n", "! tr_fill = lp.split_iname(tr_fill, \"i\", split_amount,\n",
" end do\n", "! outer_tag=\"g.0\", inner_tag=\"l.0\")\n",
"end" "! RESULT = tr_fill\n",
], "!\n",
"language": "python", "!$loopy end"
"metadata": {}, ]
"outputs": [], },
"prompt_number": 2 {
}, "cell_type": "code",
{ "execution_count": null,
"cell_type": "code", "metadata": {},
"collapsed": false, "outputs": [],
"input": [ "source": [
"print(fill)" "print(prog) # noqa: F821"
], ]
"language": "python",
"metadata": {},
"outputs": [
"output_type": "stream",
"stream": "stdout",
"text": [
"KERNEL: fill\n",
"a: ValueArg, type: float64\n",
"n: ValueArg, type: int32\n",
"out: GlobalArg, type: float64, shape: (n), dim_tags: (N0:stride:1)\n",
"[n] -> { [i] : i >= 0 and i <= -1 + n }\n",
"i: None\n",
"[i] out[i] <- a # insn0\n",
"prompt_number": 3
"cell_type": "markdown",
"metadata": {},
"source": [
"## With transform code"
"cell_type": "code",
"collapsed": false,
"input": [
"split_amount = 128"
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
"cell_type": "code",
"collapsed": false,
"input": [
"subroutine tr_fill(out, a, n)\n",
" implicit none\n",
" real*8 a, out(n)\n",
" integer n, i\n",
" do i = 1, n\n",
" out(i) = a\n",
" end do\n",
"!$loopy begin\n",
"! tr_fill, = lp.parse_fortran(SOURCE)\n",
"! tr_fill = lp.split_iname(tr_fill, \"i\", split_amount,\n",
"! outer_tag=\"g.0\", inner_tag=\"l.0\")\n",
"! RESULT = [tr_fill]\n",
"!$loopy end"
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 5
"cell_type": "code",
"collapsed": false,
"input": [
"language": "python",
"metadata": {},
"outputs": [
"output_type": "stream",
"stream": "stdout",
"text": [
"KERNEL: tr_fill\n",
"a: ValueArg, type: float64\n",
"n: ValueArg, type: int32\n",
"out: GlobalArg, type: float64, shape: (n), dim_tags: (N0:stride:1)\n",
"[n] -> { [i_outer, i_inner] : i_inner >= -128i_outer and i_inner <= -1 + n - 128i_outer and i_inner >= 0 and i_inner <= 127 }\n",
"i_inner: l.0\n",
"i_outer: g.0\n",
"[i_inner,i_outer] out[i_inner + i_outer*128] <- a # insn0\n",
"prompt_number": 6
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
"metadata": {}
} }
] ],
} "metadata": {
\ No newline at end of file "kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
"nbformat": 4,
"nbformat_minor": 1
import numpy as np
import numpy.linalg as la
import pyopencl as cl
import pyopencl.array
import pyopencl.clrandom
import loopy as lp
def main():
    """Run the transformed Fortran matrix-multiply kernel and verify it.

    Reads ``matmul.floopy`` from the directory containing this script,
    turns it into a loopy kernel with ``lp.parse_transformed_fortran``,
    executes it on an OpenCL device for two ``n`` by ``n`` float64
    matrices, and compares the result with the NumPy product of the same
    inputs.

    Raises:
        AssertionError: if the relative error of the device result with
            respect to the NumPy reference is not below ``1e-10``.
    """
    import pathlib

    fn = pathlib.Path(__file__).parent / "matmul.floopy"
    with open(fn) as inf:
        source = inf.read()

    dgemm = lp.parse_transformed_fortran(source, filename=fn)

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    n = 2048
    a = cl.array.empty(queue, (n, n), dtype=np.float64, order="F")
    b = cl.array.empty(queue, (n, n), dtype=np.float64, order="F")
    c = cl.array.zeros(queue, (n, n), dtype=np.float64, order="F")

    # ``empty`` leaves the buffers uninitialised; fill the inputs with
    # random values so the comparison below runs on well-defined data
    # rather than whatever (possibly NaN) bits were in device memory.
    cl.clrandom.fill_rand(a)
    cl.clrandom.fill_rand(b)

    # write_code=True asks loopy to emit the generated code when the
    # kernel is invoked.
    dgemm = lp.set_options(dgemm, write_code=True)
    dgemm(queue, a=a, b=b, alpha=1, c=c)

    # Host-side reference product of the same inputs.
    c_ref = a.get() @ b.get()
    assert la.norm(c_ref - c.get())/la.norm(c_ref) < 1e-10
if __name__ == "__main__":