diff --git a/examples/python/ispc-harness.py b/examples/python/ispc-harness.py index 7b29340cf2ea73b3afb726da6fb81799ab3c5c2d..f18bc1c4bb9e5cf6f82ed0506a96b0a3924194dc 100644 --- a/examples/python/ispc-harness.py +++ b/examples/python/ispc-harness.py @@ -62,6 +62,8 @@ def build_ispc_shared_lib( # }}} +# {{{ numpy address munging + def address_from_numpy(obj): ary_intf = getattr(obj, "__array_interface__", None) if ary_intf is None: @@ -111,38 +113,101 @@ def empty_aligned(shape, dtype, order='C', n=64): return array +# }}} + + +def transform(knl, vars, stream_dtype): + vars = [v.strip() for v in vars.split(",")] + knl = lp.assume(knl, "n>0") + knl = lp.split_iname( + knl, "i", 2**18, outer_tag="g.0", slabs=(0, 1)) + knl = lp.split_iname(knl, "i_inner", 8, inner_tag="l.0") + + knl = lp.add_and_infer_dtypes(knl, { + var: stream_dtype + for var in vars + }) + + knl = lp.set_argument_order(knl, vars + ["n"]) + + return knl + + +def gen_code(knl): + knl = lp.preprocess_kernel(knl) + knl = lp.get_one_scheduled_kernel(knl) + ispc_code, arg_info = lp.generate_code(knl) + + return ispc_code + + +NRUNS = 10 +ALIGN_TO = 4096 +ARRAY_SIZE = 2**28 + +if 0: + STREAM_DTYPE = np.float64 + STREAM_CTYPE = ctypes.c_double +else: + STREAM_DTYPE = np.float32 + STREAM_CTYPE = ctypes.c_float + +if 1: + INDEX_DTYPE = np.int32 + INDEX_CTYPE = ctypes.c_int +else: + INDEX_DTYPE = np.int64 + INDEX_CTYPE = ctypes.c_longlong + def main(): with open("tasksys.cpp", "r") as ts_file: tasksys_source = ts_file.read() - stream_dtype = np.float64 - stream_ctype = ctypes.c_double - index_dtype = np.int32 - - from loopy.target.ispc import ISPCTarget - stream_knl = lp.make_kernel( - "{[i]: 0<=i<n}", - "z[i] = x[i] + a*y[i]", - target=ISPCTarget(), - index_dtype=index_dtype) - - stream_knl = lp.add_and_infer_dtypes(stream_knl, { - "a": stream_dtype, - "x": stream_dtype, - "y": stream_dtype - }) - - stream_knl = lp.assume(stream_knl, "n>0") - stream_knl = lp.split_iname(stream_knl, - "i", 2**18, outer_tag="g.0", slabs=(0, 1)) - stream_knl = lp.split_iname(stream_knl, "i_inner", 8, inner_tag="l.0") - stream_knl = lp.preprocess_kernel(stream_knl) - stream_knl = lp.get_one_scheduled_kernel(stream_knl) - stream_knl = lp.set_argument_order(stream_knl, "n,a,x,y,z") - ispc_code, arg_info = lp.generate_code(stream_knl) + if 0: + from loopy.target.ispc import ISPCTarget + stream_knl = lp.make_kernel( + "{[i]: 0<=i<n}", + "z[i] = x[i] + a*y[i]", + target=ISPCTarget(), + index_dtype=INDEX_DTYPE) + + stream_knl = lp.add_and_infer_dtypes(stream_knl, { + "a": STREAM_DTYPE, + "x": STREAM_DTYPE, + "y": STREAM_DTYPE + }) + + stream_knl = lp.assume(stream_knl, "n>0") + stream_knl = lp.split_iname(stream_knl, + "i", 2**18, outer_tag="g.0", slabs=(0, 1)) + stream_knl = lp.split_iname(stream_knl, "i_inner", 8, inner_tag="l.0") + stream_knl = lp.preprocess_kernel(stream_knl) + stream_knl = lp.get_one_scheduled_kernel(stream_knl) + stream_knl = lp.set_argument_order(stream_knl, "n,a,x,y,z") + ispc_code, arg_info = lp.generate_code(stream_knl) + + def make_knl(name, insn, vars): + knl = lp.make_kernel( + "{[i]: 0<=i<n}", + insn, + target=lp.ISPCTarget(), index_dtype=INDEX_DTYPE, + name="stream_"+name+"_tasks") + + knl = transform(knl, vars, STREAM_DTYPE) + return knl + + init_knl = make_knl("init", """ + a[i] = 1 + b[i] = 2 + c[i] = 0 + """, "a,b,c") + triad_knl = make_knl("triad", """ + a[i] = b[i] + scalar * c[i] + """, "a,b,c,scalar") with TemporaryDirectory() as tmpdir: + ispc_code = gen_code(init_knl) + gen_code(triad_knl) print(ispc_code) build_ispc_shared_lib( @@ -157,7 +222,7 @@ def main(): #"--opt=fast-math", #"--opt=disable-fma", ] - + (["--addressing=64"] if index_dtype == np.int64 else []) + + (["--addressing=64"] if INDEX_DTYPE == np.int64 else []) ), ispc_bin="/home/andreask/pack/ispc-v1.9.0-linux/ispc", quiet=False, @@ -165,46 +230,51 @@ def main(): knl_lib = ctypes.cdll.LoadLibrary(os.path.join(tmpdir, "shared.so")) - n = 2**28 - a = 5 + scalar = 5 - align_to = 64 - x = empty_aligned(n, dtype=stream_dtype, n=align_to) - y = empty_aligned(n, dtype=stream_dtype, n=align_to) - z = empty_aligned(n, dtype=stream_dtype, n=align_to) + a = empty_aligned(ARRAY_SIZE, dtype=STREAM_DTYPE, n=ALIGN_TO) + b = empty_aligned(ARRAY_SIZE, dtype=STREAM_DTYPE, n=ALIGN_TO) + c = empty_aligned(ARRAY_SIZE, dtype=STREAM_DTYPE, n=ALIGN_TO) print( - hex(address_from_numpy(x)), - hex(address_from_numpy(y)), - hex(address_from_numpy(z))) - assert address_from_numpy(x) % align_to == 0 - assert address_from_numpy(y) % align_to == 0 - assert address_from_numpy(z) % align_to == 0 - - nruns = 10 + hex(address_from_numpy(a)), + hex(address_from_numpy(b)), + hex(address_from_numpy(c))) + assert address_from_numpy(a) % ALIGN_TO == 0 + assert address_from_numpy(b) % ALIGN_TO == 0 + assert address_from_numpy(c) % ALIGN_TO == 0 + + knl_lib.stream_init_tasks( + cptr_from_numpy(a), + cptr_from_numpy(b), + cptr_from_numpy(c), + INDEX_CTYPE(ARRAY_SIZE), + ) def call_kernel(): - knl_lib.loopy_kernel( - ctypes.c_int(n), stream_ctype(a), - cptr_from_numpy(x), - cptr_from_numpy(y), - cptr_from_numpy(z)) + knl_lib.stream_triad_tasks( + cptr_from_numpy(a), + cptr_from_numpy(b), + cptr_from_numpy(c), + STREAM_CTYPE(scalar), + INDEX_CTYPE(ARRAY_SIZE), + ) call_kernel() call_kernel() start_time = time() - for irun in range(nruns): + for irun in range(NRUNS): call_kernel() elapsed = time() - start_time - print(elapsed/nruns) + print(elapsed/NRUNS) - print(1e-9 * 3 * x.nbytes * nruns / elapsed, "GB/s") + print(1e-9*3*a.nbytes*NRUNS/elapsed, "GB/s") - assert la.norm(z-a*x+y) < 1e-10 + assert la.norm(a-b+scalar*c, np.inf) < np.finfo(STREAM_DTYPE).eps * 10 if __name__ == "__main__":