Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • tasmith4/loopy
  • ben_sepanski/loopy
  • arghdos/loopy
  • inducer/loopy
  • wence-/loopy
  • isuruf/loopy
  • fikl2/loopy
  • xywei/loopy
  • kaushikcfd/loopy
  • zweiner2/loopy
10 results
Show changes
Showing
with 2515 additions and 871 deletions
subroutine dgemm(m,n,l,alpha,a,b,c)
implicit none
real*8 temp, a(m,l),b(l,n),c(m,n), alpha
real*8 a(m,l),b(l,n),c(m,n), alpha
integer m,n,k,i,j,l
do j = 1,n
......@@ -13,7 +13,7 @@ subroutine dgemm(m,n,l,alpha,a,b,c)
end subroutine
!$loopy begin
! dgemm, = lp.parse_fortran(SOURCE, FILENAME)
! dgemm = lp.parse_fortran(SOURCE, FILENAME)
! dgemm = lp.split_iname(dgemm, "i", 16,
! outer_tag="g.0", inner_tag="l.1")
! dgemm = lp.split_iname(dgemm, "j", 8,
......@@ -22,7 +22,11 @@ end subroutine
!
! dgemm = lp.extract_subst(dgemm, "a_acc", "a[i1,i2]", parameters="i1, i2")
! dgemm = lp.extract_subst(dgemm, "b_acc", "b[i1,i2]", parameters="i1, i2")
! dgemm = lp.precompute(dgemm, "a_acc", "k_inner,i_inner")
! dgemm = lp.precompute(dgemm, "b_acc", "j_inner,k_inner")
! RESULT = [dgemm]
! dgemm = lp.precompute(dgemm, "a_acc", "k_inner,i_inner",
! precompute_outer_inames="i_outer, j_outer, k_outer",
! default_tag="l.auto")
! dgemm = lp.precompute(dgemm, "b_acc", "j_inner,k_inner",
! precompute_outer_inames="i_outer, j_outer, k_outer",
! default_tag="l.auto")
! RESULT = dgemm
!$loopy end
......@@ -23,11 +23,11 @@ subroutine sparse(rowstarts, colindices, values, m, n, nvals, x, y)
end
!$loopy begin
! sparse, = lp.parse_fortran(SOURCE, FILENAME)
! sparse = lp.parse_fortran(SOURCE, FILENAME)
! sparse = lp.split_iname(sparse, "i", 128)
! sparse = lp.tag_inames(sparse, {"i_outer": "g.0"})
! sparse = lp.tag_inames(sparse, {"i_inner": "l.0"})
! sparse = lp.split_iname(sparse, "j", 4)
! sparse = lp.tag_inames(sparse, {"j_inner": "unr"})
! RESULT = [sparse]
! RESULT = sparse
!$loopy end
subroutine fill(out, a, n)
implicit none
real*8 a, out(n)
real_type a, out(n)
integer n, i
!$loopy begin tagged: init
......@@ -9,17 +9,28 @@ subroutine fill(out, a, n)
out(i) = a
end do
!$loopy end tagged: init
!$loopy begin tagged: mult
do i = 1, n
out(i) = out(i) * 2
out(i) = out(i) * factor
end do
!$loopy end tagged: mult
end
!$loopy begin
! fill, = lp.parse_fortran(SOURCE, FILENAME)
!
! SOURCE = lp.c_preprocess(SOURCE, [
! "factor 4.0",
! "real_type real*8",
! ])
! fill = lp.parse_fortran(SOURCE, FILENAME)
! fill = lp.add_barrier(fill, "tag:init", "tag:mult", "gb1")
! fill = lp.split_iname(fill, "i", 128,
! outer_tag="g.0", inner_tag="l.0")
! fill = lp.split_iname(fill, "i_1", 128,
! outer_tag="g.0", inner_tag="l.0")
! RESULT = [fill]
! RESULT = fill
!
!$loopy end
! vim:filetype=floopy
......@@ -67,7 +67,7 @@ end subroutine volumeKernel
!$loopy begin
!
! volumeKernel, = lp.parse_fortran(SOURCE, FILENAME)
! volumeKernel = lp.parse_fortran(SOURCE, FILENAME)
! volumeKernel = lp.split_iname(volumeKernel,
! "e", 32, outer_tag="g.1", inner_tag="g.0")
! volumeKernel = lp.fix_parameters(volumeKernel,
......@@ -76,6 +76,6 @@ end subroutine volumeKernel
! i="l.0", j="l.1", k="l.2",
! i_1="l.0", j_1="l.1", k_1="l.2"
! ))
! RESULT = [volumeKernel]
! RESULT = volumeKernel
!
!$loopy end
! volumeKernel: for every element e in 1..elements and every i in 1..Nq,
! store the constants 5 and 7 into F(i,1) and F(i,2).
!
! Notes for readers:
!  * Nq is used as an array extent and loop bound but is neither a dummy
!    argument nor declared here; the loopy transform block that follows this
!    subroutine supplies it via lp.fix_parameters (Nq=5, Ndim=3).
!  * Dop, Q, geo, rhsQ and the scalar locals declared below are not
!    referenced by the loop body shown here.
subroutine volumeKernel(elements, Nfields, Ngeo, Ndim, Dop, geo, Q, rhsQ )
implicit none
! Extents passed in by the caller.
integer elements, Nfields, Ngeo, Ndim
! Single-precision array arguments (unused in the body below).
real*4 Dop(Nq,Nq)
real*4 Q(Nq,Nq,Nq,Nfields,elements)
real*4 geo(Nq,Nq,Nq,Ngeo,elements)
real*4 rhsQ(Nq,Nq,Nq,Nfields,elements)
! Loop indices and scratch scalars; only e and i are used below.
integer e,i,j,k,d,n,cnt
real*4 u,v,w,p, dFdr, dFds, dFdt, divF
! Local work array; only columns 1 and 2 are written.
real*4 F(Nq,Ndim)
do e=1,elements
do i=1,Nq
! The same values are written for every element e.
F(i,1) = 5
F(i,2) = 7
end do
end do
end subroutine volumeKernel
!$loopy begin
!
! volumeKernel, = lp.parse_fortran(SOURCE, FILENAME)
! volumeKernel = lp.fix_parameters(volumeKernel,
! Nq=5, Ndim=3)
! volumeKernel = lp.tag_inames(volumeKernel, dict(i="l.0"))
! RESULT = [volumeKernel]
!
!$loopy end
import numpy as np
from constantdict import constantdict
import loopy as lp
from loopy.diagnostic import LoopyError
from loopy.target.c import CTarget
from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa: F401
# {{{ blas callable
class CBLASGEMV(lp.ScalarCallable):
    """Callable that lowers ``gemv(A, x)`` to a ``cblas_?gemv`` call.

    Argument id ``0`` is the matrix, ``1`` the input vector and ``-1`` the
    result vector. Only float32 and float64 are supported, and every
    operand must have unit stride along its last axis.
    """

    def with_types(self, arg_id_to_dtype, callables_table):
        """Pick the CBLAS routine matching the operand dtype.

        :arg arg_id_to_dtype: mapping from argument id to dtype; entries for
            ids ``0`` (matrix) and ``1`` (vector) are consulted.
        :arg callables_table: passed through unchanged.
        :returns: ``(callable, callables_table)``. The callable is *self*
            when either dtype is still unknown, otherwise a copy with
            ``name_in_target`` set and the dtypes of ids ``0``, ``1`` and
            ``-1`` all fixed to the vector dtype.
        :raises LoopyError: if matrix and vector dtypes differ, or the dtype
            is neither float32 nor float64.
        """
        mat_dtype = arg_id_to_dtype.get(0)
        vec_dtype = arg_id_to_dtype.get(1)

        if mat_dtype is None or vec_dtype is None:
            # types aren't specialized enough to be resolved
            return self, callables_table

        if mat_dtype != vec_dtype:
            raise LoopyError("GEMV requires same dtypes for matrix and "
                             "vector")

        if vec_dtype.numpy_dtype == np.float32:
            name_in_target = "cblas_sgemv"
        elif vec_dtype.numpy_dtype == np.float64:
            name_in_target = "cblas_dgemv"
        else:
            raise LoopyError("GEMV is only supported for float32 and float64 "
                             "types")

        return (self.copy(name_in_target=name_in_target,
                          arg_id_to_dtype=constantdict({
                              0: vec_dtype,
                              1: vec_dtype,
                              -1: vec_dtype})),
                callables_table)

    def with_descrs(self, arg_id_to_descr, callables_table):
        """Validate operand shapes and strides and record the descriptors.

        :arg arg_id_to_descr: mapping from argument id to array descriptor;
            entries for ids ``0``, ``1`` and ``-1`` are consulted.
        :arg callables_table: passed through unchanged.
        :returns: ``(callable, callables_table)``. The callable is *self*
            when any descriptor is still unknown, otherwise a copy carrying
            *arg_id_to_descr*.
        :raises AssertionError: if the shapes are incompatible or any
            operand is not unit-stride along its last axis.
        """
        mat_descr = arg_id_to_descr.get(0)
        vec_descr = arg_id_to_descr.get(1)
        res_descr = arg_id_to_descr.get(-1)

        if mat_descr is None or vec_descr is None or res_descr is None:
            # shapes aren't specialized enough to be resolved
            return self, callables_table

        # (m x n) matrix times length-n vector gives a length-m result
        assert mat_descr.shape[1] == vec_descr.shape[0]
        assert mat_descr.shape[0] == res_descr.shape[0]
        assert len(vec_descr.shape) == len(res_descr.shape) == 1

        # handling only the easy case when stride == 1
        assert vec_descr.dim_tags[0].stride == 1
        assert mat_descr.dim_tags[1].stride == 1
        assert res_descr.dim_tags[0].stride == 1

        return self.copy(arg_id_to_descr=arg_id_to_descr), callables_table

    def emit_call_insn(self, insn, target, expression_to_code_mapper):
        """Build the ``cblas_?gemv`` call expression for *insn*.

        The emitted argument list follows the CBLAS prototype
        ``(order, trans, M, N, alpha, A, lda, X, incX, beta, Y, incY)``.

        :returns: ``(call_expression, False)``; the second entry is ``False``
            because the CBLAS routine has no return value to assign.
        """
        from pymbolic import var

        mat_descr = self.arg_id_to_descr[0]
        m, n = mat_descr.shape
        # Row-major leading dimension: distance between consecutive rows.
        # with_descrs has already checked that the column stride is 1.
        lda = mat_descr.dim_tags[0].stride

        ecm = expression_to_code_mapper
        mat, vec = insn.expression.parameters
        result, = insn.assignees

        c_parameters = [var("CblasRowMajor"),
                        var("CblasNoTrans"),
                        m, n,
                        1,                  # alpha
                        ecm(mat).expr,
                        lda,
                        ecm(vec).expr,
                        1,                  # incX
                        0,                  # beta: overwrite y, do not accumulate
                        ecm(result).expr,
                        1]                  # incY

        return (var(self.name_in_target)(*c_parameters),
                False  # cblas_gemv does not return anything
                )

    def generate_preambles(self, target):
        """Yield the ``#include <cblas.h>`` preamble; only C targets are accepted."""
        assert isinstance(target, CTarget)
        yield ("99_cblas", "#include <cblas.h>")
# }}}
# Problem size: A is declared (n, n); x and y are declared with length n.
n = 10
# Kernel with a single instruction that calls "gemv" on whole-array slices.
# A and x are float64; y's dtype is left unspecified here.
# NOTE(review): the trailing `...` in the argument list presumably lets loopy
# infer any remaining arguments -- confirm against the loopy documentation.
knl = lp.make_kernel(
"{:}",
"""
y[:] = gemv(A[:, :], x[:])
""", [
lp.GlobalArg("A", dtype=np.float64, shape=(n, n)),
lp.GlobalArg("x", dtype=np.float64, shape=(n, )),
lp.GlobalArg("y", shape=(n, )), ...],
target=CTarget())
# Bind the function name "gemv" used in the instruction to the CBLAS callable.
knl = lp.register_callable(knl, "gemv", CBLASGEMV(name="gemv"))
# Print the generated device code.
print(lp.generate_code_v2(knl).device_code())
import numpy as np
import loopy as lp
import pyopencl as cl
cl_ctx = cl.create_some_context(interactive=True)
import loopy as lp
from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa: F401
cl_ctx = cl.create_some_context()
knl = lp.make_kernel(
"{[ictr,itgt,idim]: "
......@@ -24,11 +28,11 @@ knl = lp.make_kernel(
and qbx_forced_limit * center_side[ictr] > 0)
)
<> post_dist_sq = if(matches, dist_sq, HUGE)
<> post_dist_sq = dist_sq if matches else HUGE
end
<> min_dist_sq, <> min_ictr = argmin(ictr, post_dist_sq)
<> min_dist_sq, <> min_ictr = argmin(ictr, ictr, post_dist_sq)
tgt_to_qbx_center[itgt] = if(min_dist_sq < HUGE, min_ictr, -1)
tgt_to_qbx_center[itgt] = min_ictr if min_dist_sq < HUGE else -1
end
""")
......
import numpy as np
import loopy as lp
import pyopencl as cl
import pyopencl.array
from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa: F401
knl = lp.make_kernel(
"{ [i,k]: 0<=i<n and 0<=k<3 }",
"""
c[k,i] = a[k, i + 1]
out[k,i] = c[k,i]
""")
for i, k
... gbarrier
c[k,i] = a[k, i + 1]
... gbarrier
out[k,i] = c[k,i]
end
""", seq_dependencies=True)
# transform
knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
from loopy.kernel.tools import add_dtypes
knl = add_dtypes(knl,
knl = lp.add_and_infer_dtypes(knl,
{"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32})
# schedule
from loopy.preprocess import preprocess_kernel
knl = preprocess_kernel(knl)
from loopy.schedule import get_one_scheduled_kernel
knl = get_one_scheduled_kernel(knl)
from loopy.schedule import get_one_linearized_kernel
knl = knl.with_kernel(get_one_linearized_kernel(knl["loopy_kernel"],
knl.callables_table))
# map schedule onto host or device
print(knl)
......
# This is a version of hello-loopy.py that can be run through
# a loopy binary using
#
# ./loopy --lang=loopy hello-loopy-lp.py -
# ./loopy --lang=loopy hello-loopy.loopy -
knl = lp.make_kernel(
"{ [i]: 0<=i<n }",
......
import numpy as np
import loopy as lp
import pyopencl as cl
import pyopencl.array
import loopy as lp
from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa: F401
# setup
# -----
ctx = cl.create_some_context()
......@@ -23,7 +27,11 @@ knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
# execute
# -------
# easy, slower:
evt, (out,) = knl(queue, a=a)
# efficient, with caching:
knl_ex = knl.executor(ctx)
evt, (out,) = knl_ex(queue, a=a)
# ENDEXAMPLE
knl = lp.add_and_infer_dtypes(knl, {"a": np.dtype(np.float32)})
......
import loopy as lp
import numpy as np
import numpy.linalg as la
import ctypes
import ctypes.util
import os
from time import time
from tempfile import TemporaryDirectory
from time import time
from loopy.tools import (empty_aligned, address_from_numpy,
build_ispc_shared_lib, cptr_from_numpy)
import numpy as np
import numpy.linalg as la
import loopy as lp
from loopy.tools import (
address_from_numpy,
build_ispc_shared_lib,
cptr_from_numpy,
empty_aligned,
)
from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa: F401
def transform(knl, vars, stream_dtype):
......@@ -18,19 +24,14 @@ def transform(knl, vars, stream_dtype):
knl, "i", 2**18, outer_tag="g.0", slabs=(0, 1))
knl = lp.split_iname(knl, "i_inner", 8, inner_tag="l.0")
knl = lp.add_and_infer_dtypes(knl, {
var: stream_dtype
for var in vars
})
knl = lp.add_and_infer_dtypes(knl, dict.fromkeys(vars, stream_dtype))
knl = lp.set_argument_order(knl, vars + ["n"])
knl = lp.set_argument_order(knl, [*vars, "n"])
return knl
def gen_code(knl):
knl = lp.preprocess_kernel(knl)
knl = lp.get_one_scheduled_kernel(knl)
codegen_result = lp.generate_code_v2(knl)
return codegen_result.device_code() + "\n" + codegen_result.host_code()
......@@ -56,7 +57,8 @@ else:
def main():
with open("tasksys.cpp", "r") as ts_file:
this_dir = os.path.dirname(__file__)
with open(os.path.join(this_dir, "tasksys.cpp")) as ts_file:
tasksys_source = ts_file.read()
def make_knl(name, insn, vars):
......@@ -88,16 +90,16 @@ def main():
[("tasksys.cpp", tasksys_source)],
cxx_options=["-g", "-fopenmp", "-DISPC_USE_OMP"],
ispc_options=([
#"-g", "--no-omit-frame-pointer",
# "-g", "--no-omit-frame-pointer",
"--target=avx2-i32x8",
"--opt=force-aligned-memory",
"--opt=disable-loop-unroll",
#"--opt=fast-math",
#"--opt=disable-fma",
# "--opt=fast-math",
# "--opt=disable-fma",
]
+ (["--addressing=64"] if INDEX_DTYPE == np.int64 else [])
),
#ispc_bin="/home/andreask/pack/ispc-v1.9.0-linux/ispc",
# ispc_bin="/home/andreask/pack/ispc-v1.9.0-linux/ispc",
quiet=False,
)
......@@ -138,7 +140,7 @@ def main():
start_time = time()
for irun in range(NRUNS):
for _irun in range(NRUNS):
call_kernel()
elapsed = time() - start_time
......
# SETUPBEGIN
import numpy as np
import pyopencl as cl
import loopy as lp
from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa: F401
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
knl = lp.make_kernel(queue.device,
"{[i,j]: 0<=i,j<n}",
knl = lp.make_kernel(
"{[i, j]: 0<=i<n and 0<=j<n}",
"c[i, j] = a[i]*b[j]",
assumptions="n >= 16")
a = np.arange(200, dtype=np.float32)
b = np.arange(200, dtype=np.float32)
evt, (c,) = knl(queue, a=a, b=b, options="write_cl")
knl = lp.set_options(knl, write_code=True)
evt, (c,) = knl(queue, a=a, b=b)
# SETUPEND
orig_knl = knl
......@@ -26,25 +31,36 @@ knl = lp.split_iname(knl, "j", 16,
outer_tag="g.1", inner_tag="l.1")
# SPLITEND
evt, (c,) = knl(queue, a=a, b=b, options="write_cl")
knl = lp.set_options(knl, write_code=True)
evt, (c,) = knl(queue, a=a, b=b)
split_knl = knl
# PREFETCH1BEGIN
knl = lp.add_prefetch(knl, "a")
knl = lp.add_prefetch(knl, "b")
knl = lp.add_prefetch(knl, "a",
fetch_outer_inames="i_outer, i_inner, j_outer, j_inner")
knl = lp.add_prefetch(knl, "b",
fetch_outer_inames="i_outer, i_inner, j_outer, j_inner")
# PREFETCH1END
evt, (c,) = knl(queue, a=a, b=b, options="write_cl")
knl = lp.set_options(knl, write_code=True)
evt, (c,) = knl(queue, a=a, b=b)
knl = split_knl
# PREFETCH2BEGIN
knl = lp.add_prefetch(knl, "a", ["i_inner"])
knl = lp.add_prefetch(knl, "b", ["j_inner"])
knl = lp.add_prefetch(knl, "a", ["i_inner"],
fetch_outer_inames="i_outer, j_outer, j_inner",
temporary_address_space=lp.AddressSpace.LOCAL,
default_tag="l.0")
knl = lp.add_prefetch(knl, "b", ["j_inner"],
fetch_outer_inames="i_outer, j_outer, j_inner",
temporary_address_space=lp.AddressSpace.LOCAL,
default_tag="l.0")
# PREFETCH2END
evt, (c,) = knl(queue, a=a, b=b, options="write_cl")
knl = lp.set_options(knl, write_code=True)
evt, (c,) = knl(queue, a=a, b=b)
knl = orig_knl
......@@ -54,8 +70,10 @@ knl = lp.split_iname(knl, "i", 256,
knl = lp.split_iname(knl, "j", 256,
outer_tag="g.1", slabs=(0, 1))
knl = lp.add_prefetch(knl, "a", ["i_inner"], default_tag=None)
knl = lp.add_prefetch(knl, "b", ["j_inner"], default_tag=None)
knl = lp.add_prefetch(knl, "a", ["i_inner"],
fetch_outer_inames="i_outer, j_outer", default_tag=None)
knl = lp.add_prefetch(knl, "b", ["j_inner"],
fetch_outer_inames="i_outer, j_outer", default_tag=None)
knl = lp.split_iname(knl, "i_inner", 16,
inner_tag="l.0")
......@@ -68,4 +86,5 @@ knl = lp.split_iname(knl, "a_dim_0", 16,
outer_tag="l.1", inner_tag="l.0")
# PREFETCH3END
evt, (c,) = knl(queue, a=a, b=b, options="write_cl")
knl = lp.set_options(knl, write_code=True)
evt, (c,) = knl(queue, a=a, b=b)
import loopy as lp
import numpy as np
import loopy as lp
from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa: F401
k = lp.make_kernel([
"{ [i] : 0 <= i < m }",
"{ [j] : 0 <= j < length }"],
"""
<> rowstart = rowstarts[i]
<> rowend = rowstarts[i]
<> length = rowend - rowstart
y[i] = sum(j, values[rowstart+j] * x[colindices[rowstart + j]])
""")
for i
<> rowstart = rowstarts[i]
<> rowend = rowstarts[i+1]
<> length = rowend - rowstart
y[i] = sum(j, values[rowstart+j] * x[colindices[rowstart + j]])
end
""", name="spmv")
k = lp.add_and_infer_dtypes(k, {
"values,x": np.float64, "rowstarts,colindices": k.index_dtype
"values,x": np.float64, "rowstarts,colindices": k["spmv"].index_dtype
})
print(lp.generate_code(k)[0])
print(lp.generate_code_v2(k).device_code())
/* Local and group work-item ids, cast to int. */
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))
/* Division of a by b that subtracts (b-1) from negative a before dividing;
 * per the name, intended for positive b. */
#define int_floor_div_pos_b(a,b) ( ( (a) - ( ((a)<0) ? ((b)-1) : 0 ) ) / (b) )
/*
 * out = 2.0f * a, processed one float4 at a time.
 * The "bulk slab" loop assigns whole float4 values; the "final slab" handles
 * one remaining float4 by writing components s0..s3 individually, each
 * guarded by a bounds check against n.
 */
__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float4 const *__restrict__ a, int const n, __global float4 *__restrict__ out)
{
/* bulk slab for 'i_outer' */
for (int i_outer = 0; i_outer <= -2 + int_floor_div_pos_b(3 + n, 4); ++i_outer)
out[i_outer] = 2.0f * a[i_outer];
/* final slab for 'i_outer' */
{
int const i_outer = -1 + n + -1 * int_floor_div_pos_b(3 * n, 4);
/* nothing to do when n <= 0 */
if (-1 + n >= 0)
{
/* component k is written only if element index 4*i_outer + k is < n */
if (-1 + -4 * i_outer + n >= 0)
out[i_outer].s0 = 2.0f * a[i_outer].s0;
if (-1 + -4 * i_outer + -1 + n >= 0)
out[i_outer].s1 = 2.0f * a[i_outer].s1;
if (-1 + -4 * i_outer + -1 * 2 + n >= 0)
out[i_outer].s2 = 2.0f * a[i_outer].s2;
if (-1 + -4 * i_outer + -1 * 3 + n >= 0)
out[i_outer].s3 = 2.0f * a[i_outer].s3;
}
}
}
import numpy as np
import pyopencl as cl
import pyopencl.array
import loopy as lp
from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa: F401
# Set up an OpenCL context and command queue.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
# Input: n float32 values 0, 1, ..., n-1 created on the device.
n = 15 * 10**6
a = cl.array.arange(queue, n, dtype=np.float32)
# Kernel: out[i] = 2*a[i] for 0 <= i < n.
knl = lp.make_kernel(
"{ [i]: 0<=i<n }",
"out[i] = 2*a[i]")
knl = lp.set_options(knl, write_code=True)
# Split i into chunks of 4 and tag the inner iname "vec".
# NOTE(review): slabs=(0, 1) presumably requests a separate final slab for
# the remainder -- confirm against the split_iname documentation.
knl = lp.split_iname(knl, "i", 4, slabs=(0, 1), inner_tag="vec")
# Split axis 0 of both arrays by 4, then tag the resulting axes "C,vec".
knl = lp.split_array_axis(knl, "a,out", axis_nr=0, count=4)
knl = lp.tag_array_axes(knl, "a,out", "C,vec")
# Pass a as a 2D array with a trailing axis of length 4, matching the split.
knl(queue, a=a.reshape(-1, 4), n=n)
This diff is collapsed.
from __future__ import annotations
import loopy.cli
# Hand control to the `main` function of `loopy.cli`.
loopy.cli.main()
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.