diff --git a/MEMO b/MEMO
index 89b67986351d093a7e7dff01479ce72e1df781ae..694b49a929d7d6e25c522fec3552c5f6abd6d288 100644
--- a/MEMO
+++ b/MEMO
@@ -78,7 +78,7 @@ Future ideas
 
 - Check for unordered (no-dependency) writes to the same location
 
-- String instructions?
+- Vanilla C string instructions?
 
 - Barriers for data exchanged via global vars?
 
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 0dc0fdf76e7ee38c6b47e97912df0fb1f3127396..eb89d65707a60b1be41f5fbbc57945fc3536f1e5 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -353,6 +353,8 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None,
 
     arg = kernel.arg_dict[var_name]
 
+    # {{{ make parameter names and unification template
+
     parameters = []
     for i in range(arg.dimensions):
         based_on = "%s_dim_%d" % (c_name, i)
@@ -371,8 +373,12 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None,
     elif len(parameters) == 1:
         uni_template = uni_template[var(parameters[0])]
 
+    # }}}
+
     kernel = extract_subst(kernel, rule_name, uni_template, parameters)
 
+    # {{{ track applied iname rewrites on footprint_subscripts
+
     if footprint_subscripts is not None:
         if not isinstance(footprint_subscripts, (list, tuple)):
             footprint_subscripts = [footprint_subscripts]
@@ -403,6 +409,8 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None,
     else:
         subst_use = rule_name
 
+    # }}}
+
     new_kernel = precompute(kernel, subst_use, arg.dtype, sweep_inames,
             new_storage_axis_names=dim_arg_names,
             default_tag=default_tag)
diff --git a/loopy/kernel.py b/loopy/kernel.py
index d8d78ece8ce38e8f921cfbe61af27c62b41dc104..f7b2f1590b0ba1e5d434a190a586ee3736dc4039 100644
--- a/loopy/kernel.py
+++ b/loopy/kernel.py
@@ -1690,14 +1690,14 @@ class SetOperationCacheManager:
 
         lower_bound_pw_aff = self.dim_min(set, iname_to_dim[iname][1])
         upper_bound_pw_aff = self.dim_max(set, iname_to_dim[iname][1])
-        from loopy.isl_helpers import static_max_of_pw_aff, static_min_of_pw_aff
+        from loopy.isl_helpers import static_max_of_pw_aff, static_value_of_pw_aff
         from loopy.symbolic import pw_aff_to_expr
 
         size = pw_aff_to_expr(static_max_of_pw_aff(
                 upper_bound_pw_aff - lower_bound_pw_aff + 1, constants_only=True,
                 context=context))
         base_index = pw_aff_to_expr(
-                static_min_of_pw_aff(lower_bound_pw_aff, constants_only=False,
+                static_value_of_pw_aff(lower_bound_pw_aff, constants_only=False,
                     context=context))
 
         return base_index, size
diff --git a/loopy/reduction.py b/loopy/reduction.py
index 9a09b3c0793c431212cb42e55c6a0b9504dae0fd..9f2b83487144f00cb29a4dd08bae356566ec8359 100644
--- a/loopy/reduction.py
+++ b/loopy/reduction.py
@@ -33,7 +33,7 @@ class ScalarReductionOperation(ReductionOperation):
         result = type(self).__name__.replace("ReductionOperation", "").lower()
 
         if self.forced_result_dtype is not None:
-            result = "%s<%s>" % (result, str(self.dtype))
+            result = "%s<%s>" % (result, str(self.forced_result_dtype))
 
         return result
 
diff --git a/test/test_linalg.py b/test/test_linalg.py
index fefdaac15ec87c4c65e89202c951d2b1dbee8f6d..d4db1941ba144f890732510c058fb03e6694279a 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -221,7 +221,7 @@ def test_plain_matrix_mul(ctx_factory):
     kernel_gen = lp.check_kernels(kernel_gen, {})
 
     lp.auto_test_vs_ref(ref_knl, ctx, kernel_gen,
-            op_count=vec_size*2*n**3/1e9, op_label="GFlops/s",
+            op_count=[vec_size*2*n**3/1e9], op_label=["GFlops"],
             parameters={"n": n}, check_result=check)
 
 
@@ -354,7 +354,7 @@ def test_rank_one(ctx_factory):
     kernel_gen = lp.check_kernels(kernel_gen, dict(n=n))
 
     lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
-            op_count=np.dtype(dtype).itemsize*n**2/1e9, op_label="GBytes",
+            op_count=[np.dtype(dtype).itemsize*n**2/1e9], op_label=["GBytes"],
             parameters={"n": n})
 
 
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 2c447e9cffcad0dfb44176934363479075c852d9..e4644bcf1de36cf5bc4416f2de2f6d998409edc9 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -89,39 +89,41 @@ def test_multi_cse(ctx_factory):
 def test_stencil(ctx_factory):
     ctx = ctx_factory()
 
+    # n=32 causes corner case behavior in size calculations for temporary (a
+    # non-unifiable, two-constant-segments PwAff as the base index)
+
+    n = 256
     knl = lp.make_kernel(ctx.devices[0],
-            "{[i,j]: 0<= i,j < 32}",
+            "{[i,j]: 0<= i,j < %d}" % n,
             [
-                "[i] z[i,j] = -2*a[i,j]"
-                " + a[i,j-1]"
-                " + a[i,j+1]"
-                " + a[i-1,j]"
-                " + a[i+1,j]"
+                "a_offset(ii, jj) := a[ii+1, jj+1]",
+                "z[i,j] = -2*a_offset(i,j)"
+                " + a_offset(i,j-1)"
+                " + a_offset(i,j+1)"
+                " + a_offset(i-1,j)"
+                " + a_offset(i+1,j)"
                 ],
             [
-                lp.GlobalArg("a", np.float32, shape=(32,32,)),
-                lp.GlobalArg("z", np.float32, shape=(32,32,))
+                lp.GlobalArg("a", np.float32, shape=(n+2,n+2,)),
+                lp.GlobalArg("z", np.float32, shape=(n+2,n+2,))
                 ])
+    ref_knl = knl
 
     def variant_1(knl):
-        knl = lp.add_prefetch(knl, "a", [0, 1])
-        return knl
-
-    def variant_2(knl):
         knl = lp.split_dimension(knl, "i", 16, outer_tag="g.1", inner_tag="l.1")
         knl = lp.split_dimension(knl, "j", 16, outer_tag="g.0", inner_tag="l.0")
         knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"])
         return knl
 
-    #for variant in [variant_1, variant_2]:
-    for variant in [variant_2]:
+    for variant in [variant_1]:
         kernel_gen = lp.generate_loop_schedules(variant(knl),
                 loop_priority=["i_outer", "i_inner_0", "j_0"])
         kernel_gen = lp.check_kernels(kernel_gen)
 
-        for knl in kernel_gen:
-            print lp.generate_code(knl)
+        lp.auto_test_vs_ref(ref_knl, ctx, kernel_gen,
+                fills_entire_output=False, print_ref_code=True,
+                op_count=[n*n], op_label=["cells"])
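
A quick illustration of the auto_test_vs_ref interface change exercised by the test
updates above: op_count and op_label now take parallel lists, one entry per reported
quantity, instead of a single scalar and string. The sketch below simply restates the
new test_rank_one call with the list form spelled out; seq_knl, ctx, kernel_gen, dtype
and n are assumed to be set up as in that test.

    # sketch only, mirroring the updated call site in test/test_linalg.py
    lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
            op_count=[np.dtype(dtype).itemsize*n**2/1e9],  # one entry per reported quantity
            op_label=["GBytes"],                           # matching label for that entry
            parameters={"n": n})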