From fb3a9f2ce824f615d7383c4e052fac9e467f901b Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner <inform@tiker.net> Date: Thu, 25 Feb 2016 17:38:06 -0600 Subject: [PATCH] ISPC stream harness seems to get ICC performance (fingers crossed :) --- examples/python/ispc-harness.py | 50 ++++++++++++++++++++--------- examples/python/run-ispc-harness.sh | 3 ++ 2 files changed, 38 insertions(+), 15 deletions(-) create mode 100755 examples/python/run-ispc-harness.sh diff --git a/examples/python/ispc-harness.py b/examples/python/ispc-harness.py index da1ecf112..8db7b0f92 100644 --- a/examples/python/ispc-harness.py +++ b/examples/python/ispc-harness.py @@ -116,14 +116,16 @@ def main(): with open("tasksys.cpp", "r") as ts_file: tasksys_source = ts_file.read() - from loopy.target.ispc import ISPCTarget - stream_knl = lp.make_kernel( - "{[i,j]: 0<=i<n and 0<=j<4}", - "z[i] = a*x[i] + y[i] {inames=i:j}", - target=ISPCTarget()) - stream_dtype = np.float64 stream_ctype = ctypes.c_double + index_dtype = np.int32 + + from loopy.target.ispc import ISPCTarget + stream_knl = lp.make_kernel( + "{[i,j]: 0<=i<n}", + "z[i] = a*x[i] + y[i] {inames=i}", + target=ISPCTarget(), + index_dtype=index_dtype) stream_knl = lp.add_and_infer_dtypes(stream_knl, { "a": stream_dtype, @@ -141,39 +143,57 @@ def main(): ispc_code, arg_info = lp.generate_code(stream_knl) with TemporaryDirectory() as tmpdir: + print(ispc_code) + build_ispc_shared_lib( tmpdir, [("stream.ispc", ispc_code)], [("tasksys.cpp", tasksys_source)], cxx_options=["-g", "-fopenmp", "-DISPC_USE_OMP"], - ispc_options=[ - #"-g", "--no-omit-frame-pointer", + ispc_options=([ + "-g", "--no-omit-frame-pointer", "--target=avx2-i32x8", "--opt=force-aligned-memory", - ], + ] + + ["--addressing=64"] if index_dtype == np.int64 else [] + ), ispc_bin="/home/andreask/pack/ispc-v1.9.0-linux/ispc", - quiet=False) + quiet=False, + ) - print(ispc_code) knl_lib = ctypes.cdll.LoadLibrary(os.path.join(tmpdir, "shared.so")) - n = 2**28 + n = 2**27 a = 5 x = empty_aligned(n, dtype=stream_dtype) y = empty_aligned(n, dtype=stream_dtype) z = empty_aligned(n, dtype=stream_dtype) - nruns = 10 + assert address_from_numpy(x) % 64 == 0 + assert address_from_numpy(y) % 64 == 0 + assert address_from_numpy(z) % 64 == 0 + + nruns = 20 start_time = time() - for irun in range(nruns): + + def call_kernel(): knl_lib.loopy_kernel( ctypes.c_int(n), stream_ctype(a), cptr_from_numpy(x), cptr_from_numpy(y), cptr_from_numpy(z)) + + call_kernel() + call_kernel() + + for irun in range(nruns): + call_kernel() + elapsed = time() - start_time - print(1e-9*3*x.nbytes*nruns/elapsed*4, "GB/s") + print(elapsed/nruns) + + print(1e-9*3*x.nbytes*nruns/elapsed, "GB/s") assert la.norm(z-a*x+y) < 1e-10 diff --git a/examples/python/run-ispc-harness.sh b/examples/python/run-ispc-harness.sh new file mode 100755 index 000000000..dfed8c221 --- /dev/null +++ b/examples/python/run-ispc-harness.sh @@ -0,0 +1,3 @@ +#! /bin/sh + +OMP_PLACES=cores OMP_DISPLAY_ENV=true OMP_SCHEDULE=static python ispc-harness.py -- GitLab