Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • tasmith4/loopy
  • ben_sepanski/loopy
  • arghdos/loopy
  • inducer/loopy
  • wence-/loopy
  • isuruf/loopy
  • fikl2/loopy
  • xywei/loopy
  • kaushikcfd/loopy
  • zweiner2/loopy
10 results
Show changes
Showing
with 603 additions and 102 deletions
subroutine fill(out, a, n)
implicit none
real*8 a, out(n)
integer n
do i = 1, n
out(i) = a
end do
do i = 1, n
out(i) = out(i) * 2
end do
end
!$loopy begin transform
!
! fill = lp.split_iname(fill, "i", 128,
! outer_tag="g.0", inner_tag="l.0")
! fill = lp.split_iname(fill, "i_1", 128,
! outer_tag="g.0", inner_tag="l.0")
!
!$loopy end transform
! vim:filetype=floopy
%% Cell type:markdown id: tags:
# Loopy IPython Integration Demo
%% Cell type:code id: tags:
``` python
%load_ext loopy.ipython_ext
```
%% Cell type:markdown id: tags:
## Without transform code
%% Cell type:code id: tags:
``` python
%%fortran_kernel
subroutine fill(out, a, n)
implicit none
real*8 a, out(n)
integer n, i
do i = 1, n
out(i) = a
end do
end
```
%% Cell type:code id: tags:
``` python
print(prog) # noqa: F821
```
%% Cell type:markdown id: tags:
## With transform code
%% Cell type:code id: tags:
``` python
split_amount = 128
```
%% Cell type:code id: tags:
``` python
%%transformed_fortran_kernel
subroutine tr_fill(out, a, n)
implicit none
real*8 a, out(n)
integer n, i
do i = 1, n
out(i) = a
end do
end
!$loopy begin
!
! tr_fill = lp.parse_fortran(SOURCE)
! tr_fill = lp.split_iname(tr_fill, "i", split_amount,
! outer_tag="g.0", inner_tag="l.0")
! RESULT = tr_fill
!
!$loopy end
```
%% Cell type:code id: tags:
``` python
print(prog) # noqa: F821
```
import numpy as np
import numpy.linalg as la
import pyopencl as cl
import pyopencl.array
import pyopencl.clrandom
import loopy as lp
def main():
import pathlib
fn = pathlib.Path(__file__).parent / "matmul.floopy"
with open(fn) as inf:
source = inf.read()
dgemm = lp.parse_transformed_fortran(source, filename=fn)
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
n = 2048
a = cl.array.empty(queue, (n, n), dtype=np.float64, order="F")
b = cl.array.empty(queue, (n, n), dtype=np.float64, order="F")
c = cl.array.zeros(queue, (n, n), dtype=np.float64, order="F")
cl.clrandom.fill_rand(a)
cl.clrandom.fill_rand(b)
dgemm = lp.set_options(dgemm, write_code=True)
dgemm(queue, a=a, b=b, alpha=1, c=c)
c_ref = (a.get() @ b.get())
assert la.norm(c_ref - c.get())/la.norm(c_ref) < 1e-10
if __name__ == "__main__":
main()
subroutine dgemm(m,n,l,alpha,a,b,c)
implicit none
real*8 a(m,l),b(l,n),c(m,n), alpha
integer m,n,k,i,j,l
do j = 1,n
do k = 1,l
do i = 1,m
c(i,j) = c(i,j) + alpha*b(k,j)*a(i,k)
end do
end do
end do
end subroutine
!$loopy begin
! dgemm = lp.parse_fortran(SOURCE, FILENAME)
! dgemm = lp.split_iname(dgemm, "i", 16,
! outer_tag="g.0", inner_tag="l.1")
! dgemm = lp.split_iname(dgemm, "j", 8,
! outer_tag="g.1", inner_tag="l.0")
! dgemm = lp.split_iname(dgemm, "k", 32)
!
! dgemm = lp.extract_subst(dgemm, "a_acc", "a[i1,i2]", parameters="i1, i2")
! dgemm = lp.extract_subst(dgemm, "b_acc", "b[i1,i2]", parameters="i1, i2")
! dgemm = lp.precompute(dgemm, "a_acc", "k_inner,i_inner",
! precompute_outer_inames="i_outer, j_outer, k_outer",
! default_tag="l.auto")
! dgemm = lp.precompute(dgemm, "b_acc", "j_inner,k_inner",
! precompute_outer_inames="i_outer, j_outer, k_outer",
! default_tag="l.auto")
! RESULT = dgemm
!$loopy end
lp_knl = lp.make_kernel(
"{[i,j]: 0<=i,j<n}",
"c[i,j] = a[i]*b[j]")
lp_knl = lp.add_dtypes(lp_knl, {"a": np.float64, "b": np.float64})
lp_knl = lp.split_iname(lp_knl, "i", 16, outer_tag="g.0", inner_tag="l.0")
lp_knl = lp.split_iname(lp_knl, "j", 16, outer_tag="g.1", inner_tag="l.1")
#! /bin/sh
NAME="$1"
shift
python $(which loopy) --lang=floopy "$NAME" - "$@"
#! /bin/sh
python $(which loopy) --lang=loopy "$NAME" - "$@"
......@@ -6,6 +6,7 @@ subroutine sparse(rowstarts, colindices, values, m, n, nvals, x, y)
real*8 x(n), y(n), rowsum
integer m, n, rowstart, rowend, length, nvals
integer i, j
do i = 1, m
rowstart = rowstarts(i)
......@@ -21,10 +22,12 @@ subroutine sparse(rowstarts, colindices, values, m, n, nvals, x, y)
end do
end
!$loopy begin transform
!$loopy begin
! sparse = lp.parse_fortran(SOURCE, FILENAME)
! sparse = lp.split_iname(sparse, "i", 128)
! sparse = lp.tag_inames(sparse, {"i_outer": "g.0"})
! sparse = lp.tag_inames(sparse, {"i_inner": "l.0"})
! sparse = lp.split_iname(sparse, "j", 4)
! sparse = lp.tag_inames(sparse, {"j_inner": "unr"})
!$loopy end transform
! RESULT = sparse
!$loopy end
subroutine fill(out, a, n)
implicit none
real*8 a, out(n)
integer n
real_type a, out(n)
integer n, i
!$loopy begin tagged: init
do i = 1, n
out(i) = a
end do
!$loopy end tagged: init
!$loopy begin tagged: mult
do i = 1, n
out(i) = out(i) * 2
out(i) = out(i) * factor
end do
!$loopy end tagged: mult
end
!$loopy begin transform
!$loopy begin
!
! SOURCE = lp.c_preprocess(SOURCE, [
! "factor 4.0",
! "real_type real*8",
! ])
! fill = lp.parse_fortran(SOURCE, FILENAME)
! fill = lp.add_barrier(fill, "tag:init", "tag:mult", "gb1")
! fill = lp.split_iname(fill, "i", 128,
! outer_tag="g.0", inner_tag="l.0")
! fill = lp.split_iname(fill, "i_1", 128,
! outer_tag="g.0", inner_tag="l.0")
!$loopy end transform
! RESULT = fill
!
!$loopy end
! vim:filetype=floopy
......@@ -65,8 +65,9 @@ subroutine volumeKernel(elements, Nfields, Ngeo, Ndim, Dop, geo, Q, rhsQ )
end subroutine volumeKernel
!$loopy begin transform
!$loopy begin
!
! volumeKernel = lp.parse_fortran(SOURCE, FILENAME)
! volumeKernel = lp.split_iname(volumeKernel,
! "e", 32, outer_tag="g.1", inner_tag="g.0")
! volumeKernel = lp.fix_parameters(volumeKernel,
......@@ -75,5 +76,6 @@ end subroutine volumeKernel
! i="l.0", j="l.1", k="l.2",
! i_1="l.0", j_1="l.1", k_1="l.2"
! ))
! RESULT = volumeKernel
!
!$loopy end transform
!$loopy end
subroutine volumeKernel(elements, Nfields, Ngeo, Ndim, Dop, geo, Q, rhsQ )
implicit none
integer elements, Nfields, Ngeo, Ndim
real*4 Dop(Nq,Nq)
real*4 Q(Nq,Nq,Nq,Nfields,elements)
real*4 geo(Nq,Nq,Nq,Ngeo,elements)
real*4 rhsQ(Nq,Nq,Nq,Nfields,elements)
integer e,i,j,k,d,n,cnt
real*4 u,v,w,p, dFdr, dFds, dFdt, divF
real*4 F(Nq,Ndim)
do e=1,elements
do i=1,Nq
F(i,1) = 5
F(i,2) = 7
end do
end do
end subroutine volumeKernel
!$loopy begin transform
!
! volumeKernel = lp.fix_parameters(volumeKernel,
! Nq=5, Ndim=3)
! volumeKernel = lp.tag_inames(volumeKernel, dict(i="l.0"))
!
!$loopy end transform
import numpy as np
from constantdict import constantdict
import loopy as lp
from loopy.diagnostic import LoopyError
from loopy.target.c import CTarget
from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa: F401
# {{{ blas callable
class CBLASGEMV(lp.ScalarCallable):
def with_types(self, arg_id_to_dtype, callables_table):
mat_dtype = arg_id_to_dtype.get(0)
vec_dtype = arg_id_to_dtype.get(1)
if mat_dtype is None or vec_dtype is None:
# types aren't specialized enough to be resolved
return self, callables_table
if mat_dtype != vec_dtype:
raise LoopyError("GEMV requires same dtypes for matrix and "
"vector")
if vec_dtype.numpy_dtype == np.float32:
name_in_target = "cblas_sgemv"
elif vec_dtype. numpy_dtype == np.float64:
name_in_target = "cblas_dgemv"
else:
raise LoopyError("GEMV is only supported for float32 and float64 "
"types")
return (self.copy(name_in_target=name_in_target,
arg_id_to_dtype=constantdict({
0: vec_dtype,
1: vec_dtype,
-1: vec_dtype})),
callables_table)
def with_descrs(self, arg_id_to_descr, callables_table):
mat_descr = arg_id_to_descr.get(0)
vec_descr = arg_id_to_descr.get(1)
res_descr = arg_id_to_descr.get(-1)
if mat_descr is None or vec_descr is None or res_descr is None:
# shapes aren't specialized enough to be resolved
return self, callables_table
assert mat_descr.shape[1] == vec_descr.shape[0]
assert mat_descr.shape[0] == res_descr.shape[0]
assert len(vec_descr.shape) == len(res_descr.shape) == 1
# handling only the easy case when stride == 1
assert vec_descr.dim_tags[0].stride == 1
assert mat_descr.dim_tags[1].stride == 1
assert res_descr.dim_tags[0].stride == 1
return self.copy(arg_id_to_descr=arg_id_to_descr), callables_table
def emit_call_insn(self, insn, target, expression_to_code_mapper):
from pymbolic import var
mat_descr = self.arg_id_to_descr[0]
m, n = mat_descr.shape
ecm = expression_to_code_mapper
mat, vec = insn.expression.parameters
result, = insn.assignees
c_parameters = [var("CblasRowMajor"),
var("CblasNoTrans"),
m, n,
1,
ecm(mat).expr,
1,
ecm(vec).expr,
1,
ecm(result).expr,
1]
return (var(self.name_in_target)(*c_parameters),
False # cblas_gemv does not return anything
)
def generate_preambles(self, target):
assert isinstance(target, CTarget)
yield ("99_cblas", "#include <cblas.h>")
return
# }}}
n = 10
knl = lp.make_kernel(
"{:}",
"""
y[:] = gemv(A[:, :], x[:])
""", [
lp.GlobalArg("A", dtype=np.float64, shape=(n, n)),
lp.GlobalArg("x", dtype=np.float64, shape=(n, )),
lp.GlobalArg("y", shape=(n, )), ...],
target=CTarget())
knl = lp.register_callable(knl, "gemv", CBLASGEMV(name="gemv"))
print(lp.generate_code_v2(knl).device_code())
import numpy as np
import pyopencl as cl
import loopy as lp
from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa: F401
cl_ctx = cl.create_some_context()
knl = lp.make_kernel(
"{[ictr,itgt,idim]: "
"0<=itgt<ntargets "
"and 0<=ictr<ncenters "
"and 0<=idim<ambient_dim}",
"""
for itgt
for ictr
<> dist_sq = sum(idim,
(tgt[idim,itgt] - center[idim,ictr])**2)
<> in_disk = dist_sq < (radius[ictr]*1.05)**2
<> matches = (
(in_disk
and qbx_forced_limit == 0)
or (in_disk
and qbx_forced_limit != 0
and qbx_forced_limit * center_side[ictr] > 0)
)
<> post_dist_sq = dist_sq if matches else HUGE
end
<> min_dist_sq, <> min_ictr = argmin(ictr, ictr, post_dist_sq)
tgt_to_qbx_center[itgt] = min_ictr if min_dist_sq < HUGE else -1
end
""")
knl = lp.fix_parameters(knl, ambient_dim=2)
knl = lp.add_and_infer_dtypes(knl, {
"tgt,center,radius,HUGE": np.float32,
"center_side,qbx_forced_limit": np.int32,
})
lp.auto_test_vs_ref(knl, cl_ctx, knl, parameters={
"HUGE": 1e20, "ncenters": 200, "ntargets": 300,
"qbx_forced_limit": 1})
import numpy as np
import loopy as lp
from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa: F401
knl = lp.make_kernel(
"{ [i,k]: 0<=i<n and 0<=k<3 }",
"""
for i, k
... gbarrier
c[k,i] = a[k, i + 1]
... gbarrier
out[k,i] = c[k,i]
end
""", seq_dependencies=True)
# transform
knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
knl = lp.add_and_infer_dtypes(knl,
{"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32})
# schedule
from loopy.preprocess import preprocess_kernel
knl = preprocess_kernel(knl)
from loopy.schedule import get_one_linearized_kernel
knl = knl.with_kernel(get_one_linearized_kernel(knl["loopy_kernel"],
knl.callables_table))
# map schedule onto host or device
print(knl)
cgr = lp.generate_code_v2(knl)
print(cgr.device_code())
print(cgr.host_code())
# This is a version of hello-loopy.py that can be run through
# a loopy binary using
#
# ./loopy --lang=loopy hello-loopy.loopy -
knl = lp.make_kernel(
"{ [i]: 0<=i<n }",
"out[i] = 2*a[i]")
knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32))
lp_knl = lp.split_iname(knl, "i", 8, outer_tag="g.0", inner_tag="l.0")
import numpy as np
import loopy as lp
import pyopencl as cl
import pyopencl.array
import loopy as lp
from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa: F401
# setup
# -----
ctx = cl.create_some_context()
......@@ -23,8 +27,12 @@ knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
# execute
# -------
# easy, slower:
evt, (out,) = knl(queue, a=a)
# efficient, with caching:
knl_ex = knl.executor(ctx)
evt, (out,) = knl_ex(queue, a=a)
# ENDEXAMPLE
cknl = lp.CompiledKernel(ctx, knl)
print(cknl.get_highlighted_code({"a": np.float32}))
knl = lp.add_and_infer_dtypes(knl, {"a": np.dtype(np.float32)})
print(lp.generate_code_v2(knl).device_code())
import ctypes
import ctypes.util
import os
from tempfile import TemporaryDirectory
from time import time
import numpy as np
import numpy.linalg as la
import loopy as lp
from loopy.tools import (
address_from_numpy,
build_ispc_shared_lib,
cptr_from_numpy,
empty_aligned,
)
from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa: F401
def transform(knl, vars, stream_dtype):
vars = [v.strip() for v in vars.split(",")]
knl = lp.assume(knl, "n>0")
knl = lp.split_iname(
knl, "i", 2**18, outer_tag="g.0", slabs=(0, 1))
knl = lp.split_iname(knl, "i_inner", 8, inner_tag="l.0")
knl = lp.add_and_infer_dtypes(knl, dict.fromkeys(vars, stream_dtype))
knl = lp.set_argument_order(knl, [*vars, "n"])
return knl
def gen_code(knl):
codegen_result = lp.generate_code_v2(knl)
return codegen_result.device_code() + "\n" + codegen_result.host_code()
NRUNS = 10
ALIGN_TO = 4096
ARRAY_SIZE = 2**28
if 0:
STREAM_DTYPE = np.float64
STREAM_CTYPE = ctypes.c_double
else:
STREAM_DTYPE = np.float32
STREAM_CTYPE = ctypes.c_float
if 1:
INDEX_DTYPE = np.int32
INDEX_CTYPE = ctypes.c_int
else:
INDEX_DTYPE = np.int64
INDEX_CTYPE = ctypes.c_longlong
def main():
this_dir = os.path.dirname(__file__)
with open(os.path.join(this_dir, "tasksys.cpp")) as ts_file:
tasksys_source = ts_file.read()
def make_knl(name, insn, vars):
knl = lp.make_kernel(
"{[i]: 0<=i<n}",
insn,
target=lp.ISPCTarget(), index_dtype=INDEX_DTYPE,
name="stream_"+name+"_tasks")
knl = transform(knl, vars, STREAM_DTYPE)
return knl
init_knl = make_knl("init", """
a[i] = 1
b[i] = 2
c[i] = 0
""", "a,b,c")
triad_knl = make_knl("triad", """
a[i] = b[i] + scalar * c[i]
""", "a,b,c,scalar")
with TemporaryDirectory() as tmpdir:
ispc_code = gen_code(init_knl) + gen_code(triad_knl)
print(ispc_code)
build_ispc_shared_lib(
tmpdir,
[("stream.ispc", ispc_code)],
[("tasksys.cpp", tasksys_source)],
cxx_options=["-g", "-fopenmp", "-DISPC_USE_OMP"],
ispc_options=([
# "-g", "--no-omit-frame-pointer",
"--target=avx2-i32x8",
"--opt=force-aligned-memory",
"--opt=disable-loop-unroll",
# "--opt=fast-math",
# "--opt=disable-fma",
]
+ (["--addressing=64"] if INDEX_DTYPE == np.int64 else [])
),
# ispc_bin="/home/andreask/pack/ispc-v1.9.0-linux/ispc",
quiet=False,
)
knl_lib = ctypes.cdll.LoadLibrary(os.path.join(tmpdir, "shared.so"))
scalar = 5
a = empty_aligned(ARRAY_SIZE, dtype=STREAM_DTYPE, n=ALIGN_TO)
b = empty_aligned(ARRAY_SIZE, dtype=STREAM_DTYPE, n=ALIGN_TO)
c = empty_aligned(ARRAY_SIZE, dtype=STREAM_DTYPE, n=ALIGN_TO)
print(
hex(address_from_numpy(a)),
hex(address_from_numpy(b)),
hex(address_from_numpy(c)))
assert address_from_numpy(a) % ALIGN_TO == 0
assert address_from_numpy(b) % ALIGN_TO == 0
assert address_from_numpy(c) % ALIGN_TO == 0
knl_lib.stream_init_tasks(
cptr_from_numpy(a),
cptr_from_numpy(b),
cptr_from_numpy(c),
INDEX_CTYPE(ARRAY_SIZE),
)
def call_kernel():
knl_lib.stream_triad_tasks(
cptr_from_numpy(a),
cptr_from_numpy(b),
cptr_from_numpy(c),
STREAM_CTYPE(scalar),
INDEX_CTYPE(ARRAY_SIZE),
)
call_kernel()
call_kernel()
start_time = time()
for _irun in range(NRUNS):
call_kernel()
elapsed = time() - start_time
print(elapsed/NRUNS)
print(1e-9*3*a.nbytes*NRUNS/elapsed, "GB/s")
assert la.norm(a-b+scalar*c, np.inf) < np.finfo(STREAM_DTYPE).eps * 10
if __name__ == "__main__":
main()
# vim: foldmethod=marker
# SETUPBEGIN
import numpy as np
import pyopencl as cl
import loopy as lp
from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa: F401
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
knl = lp.make_kernel(queue.device,
"{[i,j]: 0<=i,j<n}",
knl = lp.make_kernel(
"{[i, j]: 0<=i<n and 0<=j<n}",
"c[i, j] = a[i]*b[j]",
assumptions="n >= 16")
a = np.arange(200, dtype=np.float32)
b = np.arange(200, dtype=np.float32)
evt, (c,) = knl(queue, a=a, b=b, options="write_cl")
knl = lp.set_options(knl, write_code=True)
evt, (c,) = knl(queue, a=a, b=b)
# SETUPEND
orig_knl = knl
......@@ -26,25 +31,36 @@ knl = lp.split_iname(knl, "j", 16,
outer_tag="g.1", inner_tag="l.1")
# SPLITEND
evt, (c,) = knl(queue, a=a, b=b, options="write_cl")
knl = lp.set_options(knl, write_code=True)
evt, (c,) = knl(queue, a=a, b=b)
split_knl = knl
# PREFETCH1BEGIN
knl = lp.add_prefetch(knl, "a")
knl = lp.add_prefetch(knl, "b")
knl = lp.add_prefetch(knl, "a",
fetch_outer_inames="i_outer, i_inner, j_outer, j_inner")
knl = lp.add_prefetch(knl, "b",
fetch_outer_inames="i_outer, i_inner, j_outer, j_inner")
# PREFETCH1END
evt, (c,) = knl(queue, a=a, b=b, options="write_cl")
knl = lp.set_options(knl, write_code=True)
evt, (c,) = knl(queue, a=a, b=b)
knl = split_knl
# PREFETCH2BEGIN
knl = lp.add_prefetch(knl, "a", ["i_inner"])
knl = lp.add_prefetch(knl, "b", ["j_inner"])
knl = lp.add_prefetch(knl, "a", ["i_inner"],
fetch_outer_inames="i_outer, j_outer, j_inner",
temporary_address_space=lp.AddressSpace.LOCAL,
default_tag="l.0")
knl = lp.add_prefetch(knl, "b", ["j_inner"],
fetch_outer_inames="i_outer, j_outer, j_inner",
temporary_address_space=lp.AddressSpace.LOCAL,
default_tag="l.0")
# PREFETCH2END
evt, (c,) = knl(queue, a=a, b=b, options="write_cl")
knl = lp.set_options(knl, write_code=True)
evt, (c,) = knl(queue, a=a, b=b)
knl = orig_knl
......@@ -54,8 +70,10 @@ knl = lp.split_iname(knl, "i", 256,
knl = lp.split_iname(knl, "j", 256,
outer_tag="g.1", slabs=(0, 1))
knl = lp.add_prefetch(knl, "a", ["i_inner"], default_tag=None)
knl = lp.add_prefetch(knl, "b", ["j_inner"], default_tag=None)
knl = lp.add_prefetch(knl, "a", ["i_inner"],
fetch_outer_inames="i_outer, j_outer", default_tag=None)
knl = lp.add_prefetch(knl, "b", ["j_inner"],
fetch_outer_inames="i_outer, j_outer", default_tag=None)
knl = lp.split_iname(knl, "i_inner", 16,
inner_tag="l.0")
......@@ -68,4 +86,5 @@ knl = lp.split_iname(knl, "a_dim_0", 16,
outer_tag="l.1", inner_tag="l.0")
# PREFETCH3END
evt, (c,) = knl(queue, a=a, b=b, options="write_cl")
knl = lp.set_options(knl, write_code=True)
evt, (c,) = knl(queue, a=a, b=b)
#! /bin/bash
OMP_PLACES=cores OMP_DISPLAY_ENV=true OMP_SCHEDULE=static python "$@"
import numpy as np
import loopy as lp
from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa: F401
k = lp.make_kernel([
"{ [i] : 0 <= i < m }",
"{ [j] : 0 <= j < length }"],
"""
for i
<> rowstart = rowstarts[i]
<> rowend = rowstarts[i+1]
<> length = rowend - rowstart
y[i] = sum(j, values[rowstart+j] * x[colindices[rowstart + j]])
end
""", name="spmv")
k = lp.add_and_infer_dtypes(k, {
"values,x": np.float64, "rowstarts,colindices": k["spmv"].index_dtype
})
print(lp.generate_code_v2(k).device_code())