Skip to content
Snippets Groups Projects
Commit fb3a9f2c authored by Andreas Klöckner
Browse files

ISPC stream harness seems to get ICC performance (fingers crossed :)

parent fe922f94
No related branches found
No related tags found
No related merge requests found
...@@ -116,14 +116,16 @@ def main(): ...@@ -116,14 +116,16 @@ def main():
with open("tasksys.cpp", "r") as ts_file: with open("tasksys.cpp", "r") as ts_file:
tasksys_source = ts_file.read() tasksys_source = ts_file.read()
from loopy.target.ispc import ISPCTarget
stream_knl = lp.make_kernel(
"{[i,j]: 0<=i<n and 0<=j<4}",
"z[i] = a*x[i] + y[i] {inames=i:j}",
target=ISPCTarget())
stream_dtype = np.float64 stream_dtype = np.float64
stream_ctype = ctypes.c_double stream_ctype = ctypes.c_double
index_dtype = np.int32
from loopy.target.ispc import ISPCTarget
stream_knl = lp.make_kernel(
"{[i,j]: 0<=i<n}",
"z[i] = a*x[i] + y[i] {inames=i}",
target=ISPCTarget(),
index_dtype=index_dtype)
stream_knl = lp.add_and_infer_dtypes(stream_knl, { stream_knl = lp.add_and_infer_dtypes(stream_knl, {
"a": stream_dtype, "a": stream_dtype,
...@@ -141,39 +143,57 @@ def main(): ...@@ -141,39 +143,57 @@ def main():
ispc_code, arg_info = lp.generate_code(stream_knl) ispc_code, arg_info = lp.generate_code(stream_knl)
with TemporaryDirectory() as tmpdir: with TemporaryDirectory() as tmpdir:
print(ispc_code)
build_ispc_shared_lib( build_ispc_shared_lib(
tmpdir, tmpdir,
[("stream.ispc", ispc_code)], [("stream.ispc", ispc_code)],
[("tasksys.cpp", tasksys_source)], [("tasksys.cpp", tasksys_source)],
cxx_options=["-g", "-fopenmp", "-DISPC_USE_OMP"], cxx_options=["-g", "-fopenmp", "-DISPC_USE_OMP"],
ispc_options=[ ispc_options=([
#"-g", "--no-omit-frame-pointer", "-g", "--no-omit-frame-pointer",
"--target=avx2-i32x8", "--target=avx2-i32x8",
"--opt=force-aligned-memory", "--opt=force-aligned-memory",
], ]
+ ["--addressing=64"] if index_dtype == np.int64 else []
),
ispc_bin="/home/andreask/pack/ispc-v1.9.0-linux/ispc", ispc_bin="/home/andreask/pack/ispc-v1.9.0-linux/ispc",
quiet=False) quiet=False,
)
print(ispc_code)
knl_lib = ctypes.cdll.LoadLibrary(os.path.join(tmpdir, "shared.so")) knl_lib = ctypes.cdll.LoadLibrary(os.path.join(tmpdir, "shared.so"))
n = 2**28 n = 2**27
a = 5 a = 5
x = empty_aligned(n, dtype=stream_dtype) x = empty_aligned(n, dtype=stream_dtype)
y = empty_aligned(n, dtype=stream_dtype) y = empty_aligned(n, dtype=stream_dtype)
z = empty_aligned(n, dtype=stream_dtype) z = empty_aligned(n, dtype=stream_dtype)
nruns = 10 assert address_from_numpy(x) % 64 == 0
assert address_from_numpy(y) % 64 == 0
assert address_from_numpy(z) % 64 == 0
nruns = 20
start_time = time() start_time = time()
for irun in range(nruns):
def call_kernel():
knl_lib.loopy_kernel( knl_lib.loopy_kernel(
ctypes.c_int(n), stream_ctype(a), ctypes.c_int(n), stream_ctype(a),
cptr_from_numpy(x), cptr_from_numpy(x),
cptr_from_numpy(y), cptr_from_numpy(y),
cptr_from_numpy(z)) cptr_from_numpy(z))
call_kernel()
call_kernel()
for irun in range(nruns):
call_kernel()
elapsed = time() - start_time elapsed = time() - start_time
print(1e-9*3*x.nbytes*nruns/elapsed*4, "GB/s") print(elapsed/nruns)
print(1e-9*3*x.nbytes*nruns/elapsed, "GB/s")
assert la.norm(z-a*x+y) < 1e-10 assert la.norm(z-a*x+y) < 1e-10
......
#! /bin/sh
# Launch the ISPC stream harness with its OpenMP runtime settings:
# one place per core, the runtime's settings displayed at startup,
# and static loop scheduling. Run from the directory containing the harness.
OMP_PLACES=cores \
OMP_DISPLAY_ENV=true \
OMP_SCHEDULE=static \
    python ispc-harness.py
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment