Compare revisions

bf22cc27 · 9a8b5547 · 9a8b5547 · 9a8b5547 · bf22cc27 · bf22cc27
--- a/examples/fortran/foo.floopy
+++ b/examples/fortran/foo.floopy
-subroutine fill(out, a, n)
-  implicit none
-
-  real*8 a, out(n)
-  integer n
-
-  do i = 1, n
-    out(i) = a
-  end do
-  do i = 1, n
-    out(i) = out(i) * 2
-  end do
-end
-
-!$loopy begin transform
-!
-! fill = lp.split_iname(fill, "i", 128,
-!     outer_tag="g.0", inner_tag="l.0")
-! fill = lp.split_iname(fill, "i_1", 128,
-!     outer_tag="g.0", inner_tag="l.0")
-!
-!$loopy end transform
-
-! vim:filetype=floopy
--- a/examples/fortran/ipython-integration-demo.ipynb
+++ b/examples/fortran/ipython-integration-demo.ipynb
+%% Cell type:markdown id: tags:
+
+# Loopy IPython Integration Demo
+
+%% Cell type:code id: tags:
+
+``` python
+%load_ext loopy.ipython_ext
+```
+
+%% Cell type:markdown id: tags:
+
+## Without transform code
+
+%% Cell type:code id: tags:
+
+``` python
+%%fortran_kernel
+
+subroutine fill(out, a, n)
+  implicit none
+
+  real*8 a, out(n)
+  integer n, i
+
+  do i = 1, n
+    out(i) = a
+  end do
+end
+```
+
+%% Cell type:code id: tags:
+
+``` python
+print(prog)  # noqa: F821
+```
+
+%% Cell type:markdown id: tags:
+
+## With transform code
+
+%% Cell type:code id: tags:
+
+``` python
+split_amount = 128
+```
+
+%% Cell type:code id: tags:
+
+``` python
+%%transformed_fortran_kernel
+
+subroutine tr_fill(out, a, n)
+  implicit none
+
+  real*8 a, out(n)
+  integer n, i
+
+  do i = 1, n
+    out(i) = a
+  end do
+end
+
+!$loopy begin
+!
+! tr_fill = lp.parse_fortran(SOURCE)
+! tr_fill = lp.split_iname(tr_fill, "i", split_amount,
+!     outer_tag="g.0", inner_tag="l.0")
+! RESULT = tr_fill
+!
+!$loopy end
+```
+
+%% Cell type:code id: tags:
+
+``` python
+print(prog)  # noqa: F821
+```
+%% Cell type:markdown id: tags:
+
+# Loopy IPython Integration Demo
+
+%% Cell type:code id: tags:
+
+``` python
+%load_ext loopy.ipython_ext
+```
+
+%% Cell type:markdown id: tags:
+
+## Without transform code
+
+%% Cell type:code id: tags:
+
+``` python
+%%fortran_kernel
+
+subroutine fill(out, a, n)
+  implicit none
+
+  real*8 a, out(n)
+  integer n, i
+
+  do i = 1, n
+    out(i) = a
+  end do
+end
+```
+
+%% Cell type:code id: tags:
+
+``` python
+print(prog)  # noqa: F821
+```
+
+%% Cell type:markdown id: tags:
+
+## With transform code
+
+%% Cell type:code id: tags:
+
+``` python
+split_amount = 128
+```
+
+%% Cell type:code id: tags:
+
+``` python
+%%transformed_fortran_kernel
+
+subroutine tr_fill(out, a, n)
+  implicit none
+
+  real*8 a, out(n)
+  integer n, i
+
+  do i = 1, n
+    out(i) = a
+  end do
+end
+
+!$loopy begin
+!
+! tr_fill = lp.parse_fortran(SOURCE)
+! tr_fill = lp.split_iname(tr_fill, "i", split_amount,
+!     outer_tag="g.0", inner_tag="l.0")
+! RESULT = tr_fill
+!
+!$loopy end
+```
+
+%% Cell type:code id: tags:
+
+``` python
+print(prog)  # noqa: F821
+```
--- a/examples/fortran/matmul-driver.py
+++ b/examples/fortran/matmul-driver.py
+import numpy as np
+import numpy.linalg as la
+
+import pyopencl as cl
+import pyopencl.array
+import pyopencl.clrandom
+
+import loopy as lp
+
+
+def main():
+    import pathlib
+    fn = pathlib.Path(__file__).parent / "matmul.floopy"
+
+    with open(fn) as inf:
+        source = inf.read()
+
+    dgemm = lp.parse_transformed_fortran(source, filename=fn)
+
+    ctx = cl.create_some_context()
+    queue = cl.CommandQueue(ctx)
+
+    n = 2048
+    a = cl.array.empty(queue, (n, n), dtype=np.float64, order="F")
+    b = cl.array.empty(queue, (n, n), dtype=np.float64, order="F")
+    c = cl.array.zeros(queue, (n, n), dtype=np.float64, order="F")
+    cl.clrandom.fill_rand(a)
+    cl.clrandom.fill_rand(b)
+
+    dgemm = lp.set_options(dgemm, write_code=True)
+
+    dgemm(queue, a=a, b=b, alpha=1, c=c)
+
+    c_ref = (a.get() @ b.get())
+    assert la.norm(c_ref - c.get())/la.norm(c_ref) < 1e-10
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/fortran/matmul.floopy
+++ b/examples/fortran/matmul.floopy
+subroutine dgemm(m,n,l,alpha,a,b,c)
+  implicit none
+  real*8 a(m,l),b(l,n),c(m,n), alpha
+  integer m,n,k,i,j,l
+
+  do j = 1,n
+    do k = 1,l
+      do i = 1,m
+        c(i,j) = c(i,j) + alpha*b(k,j)*a(i,k)
+      end do
+    end do
+  end do
+end subroutine
+
+!$loopy begin
+! dgemm = lp.parse_fortran(SOURCE, FILENAME)
+! dgemm = lp.split_iname(dgemm, "i", 16,
+!         outer_tag="g.0", inner_tag="l.1")
+! dgemm = lp.split_iname(dgemm, "j", 8,
+!         outer_tag="g.1", inner_tag="l.0")
+! dgemm = lp.split_iname(dgemm, "k", 32)
+!
+! dgemm = lp.extract_subst(dgemm, "a_acc", "a[i1,i2]", parameters="i1, i2")
+! dgemm = lp.extract_subst(dgemm, "b_acc", "b[i1,i2]", parameters="i1, i2")
+! dgemm = lp.precompute(dgemm, "a_acc", "k_inner,i_inner",
+!         precompute_outer_inames="i_outer, j_outer, k_outer",
+!         default_tag="l.auto")
+! dgemm = lp.precompute(dgemm, "b_acc", "j_inner,k_inner",
+!         precompute_outer_inames="i_outer, j_outer, k_outer",
+!         default_tag="l.auto")
+! RESULT = dgemm
+!$loopy end
--- a/examples/fortran/outerprod.py
+++ b/examples/fortran/outerprod.py
-lp_knl = lp.make_kernel(
-	"{[i,j]: 0<=i,j<n}",
-	"c[i,j] = a[i]*b[j]")
-
-lp_knl = lp.add_dtypes(lp_knl, {"a": np.float64, "b": np.float64})
-lp_knl = lp.split_iname(lp_knl, "i", 16, outer_tag="g.0", inner_tag="l.0")
-lp_knl = lp.split_iname(lp_knl, "j", 16, outer_tag="g.1", inner_tag="l.1")
-
--- a/examples/fortran/run-floopy.sh
+++ b/examples/fortran/run-floopy.sh
-#! /bin/sh
-
-NAME="$1"
-shift
-
-python $(which loopy) --lang=floopy "$NAME" - "$@"
--- a/examples/fortran/run-loopy.sh
+++ b/examples/fortran/run-loopy.sh
-#! /bin/sh
-
-python $(which loopy) --lang=loopy "$NAME" - "$@"
--- a/examples/fortran/sparse.floopy
+++ b/examples/fortran/sparse.floopy
@@ -6,6 +6,7 @@ subroutine sparse(rowstarts, colindices, values, m, n, nvals, x, y)
  real*8 x(n), y(n), rowsum

  integer m, n, rowstart, rowend, length, nvals
+  integer i, j

  do i = 1, m
    rowstart = rowstarts(i)
@@ -21,10 +22,12 @@ subroutine sparse(rowstarts, colindices, values, m, n, nvals, x, y)
  end do
 end

-!$loopy begin transform
+!$loopy begin
+! sparse = lp.parse_fortran(SOURCE, FILENAME)
 ! sparse = lp.split_iname(sparse, "i", 128)
 ! sparse = lp.tag_inames(sparse, {"i_outer": "g.0"})
 ! sparse = lp.tag_inames(sparse, {"i_inner": "l.0"})
 ! sparse = lp.split_iname(sparse, "j", 4)
 ! sparse = lp.tag_inames(sparse, {"j_inner": "unr"})
-!$loopy end transform
+! RESULT = sparse
+!$loopy end
--- a/examples/fortran/tagging.floopy
+++ b/examples/fortran/tagging.floopy
 subroutine fill(out, a, n)
  implicit none

-  real*8 a, out(n)
-  integer n
+  real_type a, out(n)
+  integer n, i

 !$loopy begin tagged: init
  do i = 1, n
    out(i) = a
  end do
 !$loopy end tagged: init
+
+!$loopy begin tagged: mult
  do i = 1, n
-    out(i) = out(i) * 2
+    out(i) = out(i) * factor
  end do
+!$loopy end tagged: mult
 end

-!$loopy begin transform
+!$loopy begin
 !
+! SOURCE = lp.c_preprocess(SOURCE, [
+!       "factor 4.0",
+!       "real_type real*8",
+!       ])
+! fill = lp.parse_fortran(SOURCE, FILENAME)
+! fill = lp.add_barrier(fill, "tag:init", "tag:mult", "gb1")
 ! fill = lp.split_iname(fill, "i", 128,
 !     outer_tag="g.0", inner_tag="l.0")
 ! fill = lp.split_iname(fill, "i_1", 128,
 !     outer_tag="g.0", inner_tag="l.0")
-!$loopy end transform
+! RESULT = fill
+!
+!$loopy end
+
 ! vim:filetype=floopy
--- a/examples/fortran/volumeKernel.floopy
+++ b/examples/fortran/volumeKernel.floopy
@@ -65,8 +65,9 @@ subroutine volumeKernel(elements, Nfields, Ngeo, Ndim, Dop, geo, Q, rhsQ  )

 end subroutine volumeKernel

-!$loopy begin transform
+!$loopy begin
 !
+! volumeKernel = lp.parse_fortran(SOURCE, FILENAME)
 ! volumeKernel = lp.split_iname(volumeKernel,
 !     "e", 32, outer_tag="g.1", inner_tag="g.0")
 ! volumeKernel = lp.fix_parameters(volumeKernel,
@@ -75,5 +76,6 @@ end subroutine volumeKernel
 !     i="l.0", j="l.1", k="l.2",
 !     i_1="l.0", j_1="l.1", k_1="l.2"
 !     ))
+! RESULT = volumeKernel
 !
-!$loopy end transform
+!$loopy end
--- a/examples/fortran/volumeKernelSimple.floopy
+++ b/examples/fortran/volumeKernelSimple.floopy
-subroutine volumeKernel(elements, Nfields, Ngeo, Ndim, Dop, geo, Q, rhsQ  )
-
-  implicit none
-
-  integer elements, Nfields, Ngeo, Ndim
-
-  real*4 Dop(Nq,Nq)
-  real*4 Q(Nq,Nq,Nq,Nfields,elements)
-  real*4 geo(Nq,Nq,Nq,Ngeo,elements)
-  real*4 rhsQ(Nq,Nq,Nq,Nfields,elements)
-
-  integer e,i,j,k,d,n,cnt
-
-  real*4 u,v,w,p, dFdr, dFds, dFdt, divF
-  real*4 F(Nq,Ndim)
-
-
-  do e=1,elements
-     do i=1,Nq
-
-        F(i,1) = 5
-        F(i,2) = 7
-
-     end do
-
-  end do
-
-end subroutine volumeKernel
-
-!$loopy begin transform
-!
-! volumeKernel = lp.fix_parameters(volumeKernel,
-!     Nq=5, Ndim=3)
-! volumeKernel = lp.tag_inames(volumeKernel, dict(i="l.0"))
-!
-!$loopy end transform
--- a/examples/python/call-external.py
+++ b/examples/python/call-external.py
+import numpy as np
+from constantdict import constantdict
+
+import loopy as lp
+from loopy.diagnostic import LoopyError
+from loopy.target.c import CTarget
+from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2  # noqa: F401
+
+
+# {{{ blas callable
+
+class CBLASGEMV(lp.ScalarCallable):
+    def with_types(self, arg_id_to_dtype, callables_table):
+        mat_dtype = arg_id_to_dtype.get(0)
+        vec_dtype = arg_id_to_dtype.get(1)
+
+        if mat_dtype is None or vec_dtype is None:
+            # types aren't specialized enough to be resolved
+            return self, callables_table
+
+        if mat_dtype != vec_dtype:
+            raise LoopyError("GEMV requires same dtypes for matrix and "
+                             "vector")
+
+        if vec_dtype.numpy_dtype == np.float32:
+            name_in_target = "cblas_sgemv"
+        elif vec_dtype. numpy_dtype == np.float64:
+            name_in_target = "cblas_dgemv"
+        else:
+            raise LoopyError("GEMV is only supported for float32 and float64 "
+                             "types")
+
+        return (self.copy(name_in_target=name_in_target,
+                          arg_id_to_dtype=constantdict({
+                              0: vec_dtype,
+                              1: vec_dtype,
+                              -1: vec_dtype})),
+                callables_table)
+
+    def with_descrs(self, arg_id_to_descr, callables_table):
+        mat_descr = arg_id_to_descr.get(0)
+        vec_descr = arg_id_to_descr.get(1)
+        res_descr = arg_id_to_descr.get(-1)
+
+        if mat_descr is None or vec_descr is None or res_descr is None:
+            # shapes aren't specialized enough to be resolved
+            return self, callables_table
+
+        assert mat_descr.shape[1] == vec_descr.shape[0]
+        assert mat_descr.shape[0] == res_descr.shape[0]
+        assert len(vec_descr.shape) == len(res_descr.shape) == 1
+        # handling only the easy case when stride == 1
+        assert vec_descr.dim_tags[0].stride == 1
+        assert mat_descr.dim_tags[1].stride == 1
+        assert res_descr.dim_tags[0].stride == 1
+
+        return self.copy(arg_id_to_descr=arg_id_to_descr), callables_table
+
+    def emit_call_insn(self, insn, target, expression_to_code_mapper):
+        from pymbolic import var
+        mat_descr = self.arg_id_to_descr[0]
+        m, n = mat_descr.shape
+        ecm = expression_to_code_mapper
+        mat, vec = insn.expression.parameters
+        result, = insn.assignees
+
+        c_parameters = [var("CblasRowMajor"),
+                        var("CblasNoTrans"),
+                        m, n,
+                        1,
+                        ecm(mat).expr,
+                        1,
+                        ecm(vec).expr,
+                        1,
+                        ecm(result).expr,
+                        1]
+        return (var(self.name_in_target)(*c_parameters),
+                False  # cblas_gemv does not return anything
+                )
+
+    def generate_preambles(self, target):
+        assert isinstance(target, CTarget)
+        yield ("99_cblas", "#include <cblas.h>")
+        return
+
+# }}}
+
+
+n = 10
+
+knl = lp.make_kernel(
+        "{:}",
+        """
+        y[:] = gemv(A[:, :], x[:])
+        """, [
+            lp.GlobalArg("A", dtype=np.float64, shape=(n, n)),
+            lp.GlobalArg("x", dtype=np.float64, shape=(n, )),
+            lp.GlobalArg("y", shape=(n, )), ...],
+        target=CTarget())
+
+knl = lp.register_callable(knl, "gemv", CBLASGEMV(name="gemv"))
+print(lp.generate_code_v2(knl).device_code())
--- a/examples/python/find-centers.py
+++ b/examples/python/find-centers.py
+import numpy as np
+
+import pyopencl as cl
+
+import loopy as lp
+from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2  # noqa: F401
+
+
+cl_ctx = cl.create_some_context()
+
+knl = lp.make_kernel(
+    "{[ictr,itgt,idim]: "
+    "0<=itgt<ntargets "
+    "and 0<=ictr<ncenters "
+    "and 0<=idim<ambient_dim}",
+
+    """
+    for itgt
+        for ictr
+            <> dist_sq = sum(idim,
+                    (tgt[idim,itgt] - center[idim,ictr])**2)
+            <> in_disk = dist_sq < (radius[ictr]*1.05)**2
+            <> matches = (
+                    (in_disk
+                        and qbx_forced_limit == 0)
+                    or (in_disk
+                            and qbx_forced_limit != 0
+                            and qbx_forced_limit * center_side[ictr] > 0)
+                    )
+
+            <> post_dist_sq = dist_sq if matches else HUGE
+        end
+        <> min_dist_sq, <> min_ictr = argmin(ictr, ictr, post_dist_sq)
+
+        tgt_to_qbx_center[itgt] = min_ictr if min_dist_sq < HUGE else -1
+    end
+    """)
+
+knl = lp.fix_parameters(knl, ambient_dim=2)
+knl = lp.add_and_infer_dtypes(knl, {
+        "tgt,center,radius,HUGE": np.float32,
+        "center_side,qbx_forced_limit": np.int32,
+        })
+
+lp.auto_test_vs_ref(knl, cl_ctx, knl, parameters={
+        "HUGE": 1e20, "ncenters": 200, "ntargets": 300,
+        "qbx_forced_limit": 1})
--- a/examples/python/global_barrier_removal.py
+++ b/examples/python/global_barrier_removal.py
+import numpy as np
+
+import loopy as lp
+from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2  # noqa: F401
+
+
+knl = lp.make_kernel(
+        "{ [i,k]: 0<=i<n and 0<=k<3 }",
+        """
+        for i, k
+            ... gbarrier
+            c[k,i] = a[k, i + 1]
+            ... gbarrier
+            out[k,i] = c[k,i]
+        end
+        """, seq_dependencies=True)
+
+# transform
+knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
+knl = lp.add_and_infer_dtypes(knl,
+        {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32})
+
+# schedule
+from loopy.preprocess import preprocess_kernel
+
+
+knl = preprocess_kernel(knl)
+
+from loopy.schedule import get_one_linearized_kernel
+
+
+knl = knl.with_kernel(get_one_linearized_kernel(knl["loopy_kernel"],
+                                                knl.callables_table))
+
+
+# map schedule onto host or device
+print(knl)
+
+cgr = lp.generate_code_v2(knl)
+
+print(cgr.device_code())
+print(cgr.host_code())
--- a/examples/python/hello-loopy.loopy
+++ b/examples/python/hello-loopy.loopy
+# This is a version of hello-loopy.py that can be run through
+# a loopy binary using
+#
+# ./loopy --lang=loopy hello-loopy.loopy -
+
+knl = lp.make_kernel(
+        "{ [i]: 0<=i<n }",
+        "out[i] = 2*a[i]")
+
+knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32))
+lp_knl = lp.split_iname(knl, "i", 8, outer_tag="g.0", inner_tag="l.0")
--- a/examples/python/hello-loopy.py
+++ b/examples/python/hello-loopy.py
 import numpy as np
-import loopy as lp
+
 import pyopencl as cl
 import pyopencl.array

+import loopy as lp
+from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2  # noqa: F401
+
+
 # setup
 # -----
 ctx = cl.create_some_context()
@@ -23,8 +27,12 @@ knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

 # execute
 # -------
+# easy, slower:
 evt, (out,) = knl(queue, a=a)
+# efficient, with caching:
+knl_ex = knl.executor(ctx)
+evt, (out,) = knl_ex(queue, a=a)
 # ENDEXAMPLE

-cknl = lp.CompiledKernel(ctx, knl)
-print(cknl.get_highlighted_code({"a": np.float32}))
+knl = lp.add_and_infer_dtypes(knl, {"a": np.dtype(np.float32)})
+print(lp.generate_code_v2(knl).device_code())
--- a/examples/python/ispc-stream-harness.py
+++ b/examples/python/ispc-stream-harness.py
+import ctypes
+import ctypes.util
+import os
+from tempfile import TemporaryDirectory
+from time import time
+
+import numpy as np
+import numpy.linalg as la
+
+import loopy as lp
+from loopy.tools import (
+    address_from_numpy,
+    build_ispc_shared_lib,
+    cptr_from_numpy,
+    empty_aligned,
+)
+from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2  # noqa: F401
+
+
+def transform(knl, vars, stream_dtype):
+    vars = [v.strip() for v in vars.split(",")]
+    knl = lp.assume(knl, "n>0")
+    knl = lp.split_iname(
+        knl, "i", 2**18, outer_tag="g.0", slabs=(0, 1))
+    knl = lp.split_iname(knl, "i_inner", 8, inner_tag="l.0")
+
+    knl = lp.add_and_infer_dtypes(knl, dict.fromkeys(vars, stream_dtype))
+
+    knl = lp.set_argument_order(knl, [*vars, "n"])
+
+    return knl
+
+
+def gen_code(knl):
+    codegen_result = lp.generate_code_v2(knl)
+
+    return codegen_result.device_code() + "\n" + codegen_result.host_code()
+
+
+NRUNS = 10
+ALIGN_TO = 4096
+ARRAY_SIZE = 2**28
+
+if 0:
+    STREAM_DTYPE = np.float64
+    STREAM_CTYPE = ctypes.c_double
+else:
+    STREAM_DTYPE = np.float32
+    STREAM_CTYPE = ctypes.c_float
+
+if 1:
+    INDEX_DTYPE = np.int32
+    INDEX_CTYPE = ctypes.c_int
+else:
+    INDEX_DTYPE = np.int64
+    INDEX_CTYPE = ctypes.c_longlong
+
+
+def main():
+    this_dir = os.path.dirname(__file__)
+    with open(os.path.join(this_dir, "tasksys.cpp")) as ts_file:
+        tasksys_source = ts_file.read()
+
+    def make_knl(name, insn, vars):
+        knl = lp.make_kernel(
+                "{[i]: 0<=i<n}",
+                insn,
+                target=lp.ISPCTarget(), index_dtype=INDEX_DTYPE,
+                name="stream_"+name+"_tasks")
+
+        knl = transform(knl, vars, STREAM_DTYPE)
+        return knl
+
+    init_knl = make_knl("init", """
+                a[i] = 1
+                b[i] = 2
+                c[i] = 0
+                """, "a,b,c")
+    triad_knl = make_knl("triad", """
+            a[i] = b[i] + scalar * c[i]
+            """, "a,b,c,scalar")
+
+    with TemporaryDirectory() as tmpdir:
+        ispc_code = gen_code(init_knl) + gen_code(triad_knl)
+        print(ispc_code)
+
+        build_ispc_shared_lib(
+                tmpdir,
+                [("stream.ispc", ispc_code)],
+                [("tasksys.cpp", tasksys_source)],
+                cxx_options=["-g", "-fopenmp", "-DISPC_USE_OMP"],
+                ispc_options=([
+                    # "-g", "--no-omit-frame-pointer",
+                    "--target=avx2-i32x8",
+                    "--opt=force-aligned-memory",
+                    "--opt=disable-loop-unroll",
+                    # "--opt=fast-math",
+                    # "--opt=disable-fma",
+                    ]
+                    + (["--addressing=64"] if INDEX_DTYPE == np.int64 else [])
+                    ),
+                # ispc_bin="/home/andreask/pack/ispc-v1.9.0-linux/ispc",
+                quiet=False,
+                )
+
+        knl_lib = ctypes.cdll.LoadLibrary(os.path.join(tmpdir, "shared.so"))
+
+        scalar = 5
+
+        a = empty_aligned(ARRAY_SIZE, dtype=STREAM_DTYPE, n=ALIGN_TO)
+        b = empty_aligned(ARRAY_SIZE, dtype=STREAM_DTYPE, n=ALIGN_TO)
+        c = empty_aligned(ARRAY_SIZE, dtype=STREAM_DTYPE, n=ALIGN_TO)
+
+        print(
+                hex(address_from_numpy(a)),
+                hex(address_from_numpy(b)),
+                hex(address_from_numpy(c)))
+        assert address_from_numpy(a) % ALIGN_TO == 0
+        assert address_from_numpy(b) % ALIGN_TO == 0
+        assert address_from_numpy(c) % ALIGN_TO == 0
+
+        knl_lib.stream_init_tasks(
+                cptr_from_numpy(a),
+                cptr_from_numpy(b),
+                cptr_from_numpy(c),
+                INDEX_CTYPE(ARRAY_SIZE),
+                )
+
+        def call_kernel():
+            knl_lib.stream_triad_tasks(
+                    cptr_from_numpy(a),
+                    cptr_from_numpy(b),
+                    cptr_from_numpy(c),
+                    STREAM_CTYPE(scalar),
+                    INDEX_CTYPE(ARRAY_SIZE),
+                    )
+
+        call_kernel()
+        call_kernel()
+
+        start_time = time()
+
+        for _irun in range(NRUNS):
+            call_kernel()
+
+        elapsed = time() - start_time
+
+        print(elapsed/NRUNS)
+
+        print(1e-9*3*a.nbytes*NRUNS/elapsed, "GB/s")
+
+        assert la.norm(a-b+scalar*c, np.inf) < np.finfo(STREAM_DTYPE).eps * 10
+
+
+if __name__ == "__main__":
+    main()
+
+# vim: foldmethod=marker
--- a/examples/python/rank-one.py
+++ b/examples/python/rank-one.py
 # SETUPBEGIN
 import numpy as np
+
 import pyopencl as cl
+
 import loopy as lp
+from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2  # noqa: F401
+

 ctx = cl.create_some_context()
 queue = cl.CommandQueue(ctx)

-knl = lp.make_kernel(queue.device,
-    "{[i,j]: 0<=i,j<n}",
+knl = lp.make_kernel(
+    "{[i, j]: 0<=i<n and 0<=j<n}",
    "c[i, j] = a[i]*b[j]",
    assumptions="n >= 16")

 a = np.arange(200, dtype=np.float32)
 b = np.arange(200, dtype=np.float32)

-evt, (c,) = knl(queue, a=a, b=b, options="write_cl")
+knl = lp.set_options(knl, write_code=True)
+evt, (c,) = knl(queue, a=a, b=b)
 # SETUPEND

 orig_knl = knl
@@ -26,25 +31,36 @@ knl = lp.split_iname(knl, "j", 16,
        outer_tag="g.1", inner_tag="l.1")
 # SPLITEND

-evt, (c,) = knl(queue, a=a, b=b, options="write_cl")
+knl = lp.set_options(knl, write_code=True)
+evt, (c,) = knl(queue, a=a, b=b)

 split_knl = knl

 # PREFETCH1BEGIN
-knl = lp.add_prefetch(knl, "a")
-knl = lp.add_prefetch(knl, "b")
+knl = lp.add_prefetch(knl, "a",
+        fetch_outer_inames="i_outer, i_inner, j_outer, j_inner")
+knl = lp.add_prefetch(knl, "b",
+        fetch_outer_inames="i_outer, i_inner, j_outer, j_inner")
 # PREFETCH1END

-evt, (c,) = knl(queue, a=a, b=b, options="write_cl")
+knl = lp.set_options(knl, write_code=True)
+evt, (c,) = knl(queue, a=a, b=b)

 knl = split_knl

 # PREFETCH2BEGIN
-knl = lp.add_prefetch(knl, "a", ["i_inner"])
-knl = lp.add_prefetch(knl, "b", ["j_inner"])
+knl = lp.add_prefetch(knl, "a", ["i_inner"],
+        fetch_outer_inames="i_outer, j_outer, j_inner",
+        temporary_address_space=lp.AddressSpace.LOCAL,
+        default_tag="l.0")
+knl = lp.add_prefetch(knl, "b", ["j_inner"],
+        fetch_outer_inames="i_outer, j_outer, j_inner",
+        temporary_address_space=lp.AddressSpace.LOCAL,
+        default_tag="l.0")
 # PREFETCH2END

-evt, (c,) = knl(queue, a=a, b=b, options="write_cl")
+knl = lp.set_options(knl, write_code=True)
+evt, (c,) = knl(queue, a=a, b=b)

 knl = orig_knl

@@ -54,8 +70,10 @@ knl = lp.split_iname(knl, "i", 256,
 knl = lp.split_iname(knl, "j", 256,
        outer_tag="g.1", slabs=(0, 1))

-knl = lp.add_prefetch(knl, "a", ["i_inner"], default_tag=None)
-knl = lp.add_prefetch(knl, "b", ["j_inner"], default_tag=None)
+knl = lp.add_prefetch(knl, "a", ["i_inner"],
+        fetch_outer_inames="i_outer, j_outer", default_tag=None)
+knl = lp.add_prefetch(knl, "b", ["j_inner"],
+        fetch_outer_inames="i_outer, j_outer", default_tag=None)

 knl = lp.split_iname(knl, "i_inner", 16,
        inner_tag="l.0")
@@ -68,4 +86,5 @@ knl = lp.split_iname(knl, "a_dim_0", 16,
        outer_tag="l.1", inner_tag="l.0")
 # PREFETCH3END

-evt, (c,) = knl(queue, a=a, b=b, options="write_cl")
+knl = lp.set_options(knl, write_code=True)
+evt, (c,) = knl(queue, a=a, b=b)
--- a/examples/python/run-ispc-harness.sh
+++ b/examples/python/run-ispc-harness.sh
+#! /bin/bash
+
+OMP_PLACES=cores OMP_DISPLAY_ENV=true OMP_SCHEDULE=static python "$@"
--- a/examples/python/sparse.py
+++ b/examples/python/sparse.py
+import numpy as np
+
+import loopy as lp
+from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2  # noqa: F401
+
+
+k = lp.make_kernel([
+    "{ [i] : 0 <= i < m }",
+    "{ [j] : 0 <= j < length }"],
+    """
+    for i
+        <> rowstart = rowstarts[i]
+        <> rowend = rowstarts[i+1]
+        <> length = rowend - rowstart
+        y[i] = sum(j, values[rowstart+j] * x[colindices[rowstart + j]])
+    end
+    """, name="spmv")
+
+k = lp.add_and_infer_dtypes(k, {
+    "values,x": np.float64, "rowstarts,colindices": k["spmv"].index_dtype
+    })
+print(lp.generate_code_v2(k).device_code())
No results found