From 748f7bffc0fd3162a1bad718cd0d76eeb7bf6915 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Tue, 31 Jul 2018 21:09:08 -0500 Subject: [PATCH 01/80] now counting ops with count-granularity=subgroup --- loopy/statistics.py | 131 ++++++++++++++++-- test/test_statistics.py | 293 ++++++++++++++++++++++------------------ 2 files changed, 281 insertions(+), 143 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index cee28b24..2df3093d 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -715,7 +715,8 @@ class ExpressionOpCounter(CounterBase): return ToCountMap( {Op(dtype=self.type_inf(expr), name='func:'+str(expr.function), - count_granularity=CountGranularity.WORKITEM): 1} + #count_granularity=CountGranularity.WORKITEM): 1} + count_granularity=CountGranularity.SUBGROUP): 1} ) + self.rec(expr.parameters) def map_subscript(self, expr): @@ -726,7 +727,8 @@ class ExpressionOpCounter(CounterBase): return ToCountMap( {Op(dtype=self.type_inf(expr), name='add', - count_granularity=CountGranularity.WORKITEM): + #count_granularity=CountGranularity.WORKITEM): + count_granularity=CountGranularity.SUBGROUP): len(expr.children)-1} ) + sum(self.rec(child) for child in expr.children) @@ -735,18 +737,21 @@ class ExpressionOpCounter(CounterBase): assert expr.children return sum(ToCountMap({Op(dtype=self.type_inf(expr), name='mul', - count_granularity=CountGranularity.WORKITEM): 1}) + #count_granularity=CountGranularity.WORKITEM): 1}) + count_granularity=CountGranularity.SUBGROUP): 1}) + self.rec(child) for child in expr.children if not is_zero(child + 1)) + \ ToCountMap({Op(dtype=self.type_inf(expr), name='mul', - count_granularity=CountGranularity.WORKITEM): -1}) + #count_granularity=CountGranularity.WORKITEM): -1}) + count_granularity=CountGranularity.SUBGROUP): -1}) def map_quotient(self, expr, *args): return ToCountMap({Op(dtype=self.type_inf(expr), name='div', - count_granularity=CountGranularity.WORKITEM): 1}) \ + 
#count_granularity=CountGranularity.WORKITEM): 1}) \ + count_granularity=CountGranularity.SUBGROUP): 1}) \ + self.rec(expr.numerator) \ + self.rec(expr.denominator) @@ -756,14 +761,16 @@ class ExpressionOpCounter(CounterBase): def map_power(self, expr): return ToCountMap({Op(dtype=self.type_inf(expr), name='pow', - count_granularity=CountGranularity.WORKITEM): 1}) \ + #count_granularity=CountGranularity.WORKITEM): 1}) \ + count_granularity=CountGranularity.SUBGROUP): 1}) \ + self.rec(expr.base) \ + self.rec(expr.exponent) def map_left_shift(self, expr): return ToCountMap({Op(dtype=self.type_inf(expr), name='shift', - count_granularity=CountGranularity.WORKITEM): 1}) \ + #count_granularity=CountGranularity.WORKITEM): 1}) \ + count_granularity=CountGranularity.SUBGROUP): 1}) \ + self.rec(expr.shiftee) \ + self.rec(expr.shift) @@ -772,13 +779,15 @@ class ExpressionOpCounter(CounterBase): def map_bitwise_not(self, expr): return ToCountMap({Op(dtype=self.type_inf(expr), name='bw', - count_granularity=CountGranularity.WORKITEM): 1}) \ + #count_granularity=CountGranularity.WORKITEM): 1}) \ + count_granularity=CountGranularity.SUBGROUP): 1}) \ + self.rec(expr.child) def map_bitwise_or(self, expr): return ToCountMap({Op(dtype=self.type_inf(expr), name='bw', - count_granularity=CountGranularity.WORKITEM): + #count_granularity=CountGranularity.WORKITEM): + count_granularity=CountGranularity.SUBGROUP): len(expr.children)-1}) \ + sum(self.rec(child) for child in expr.children) @@ -802,7 +811,8 @@ class ExpressionOpCounter(CounterBase): def map_min(self, expr): return ToCountMap({Op(dtype=self.type_inf(expr), name='maxmin', - count_granularity=CountGranularity.WORKITEM): + #count_granularity=CountGranularity.WORKITEM): + count_granularity=CountGranularity.SUBGROUP): len(expr.children)-1}) \ + sum(self.rec(child) for child in expr.children) @@ -1329,14 +1339,109 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False, knl = infer_unknown_types(knl, 
expect_completion=True) knl = preprocess_kernel(knl) + if not isinstance(subgroup_size, int): + # try to find subgroup_size + subgroup_size_guess = _find_subgroup_size_for_knl(knl) + + if subgroup_size is None: + if subgroup_size_guess is None: + # 'guess' was not passed and either no target device found + # or get_simd_group_size returned None + raise ValueError("No sub-group size passed, no target device found. " + "Either (1) pass integer value for subgroup_size, " + "(2) ensure that kernel.target is PyOpenClTarget " + "and kernel.target.device is set, or (3) pass " + "subgroup_size='guess' and hope for the best.") + else: + subgroup_size = subgroup_size_guess + + elif subgroup_size == 'guess': + if subgroup_size_guess is None: + # unable to get subgroup_size from device, so guess + subgroup_size = 32 + warn_with_kernel(knl, "get_op_map_guessing_subgroup_size", + "get_op_map: 'guess' sub-group size " + "passed, no target device found, wildly guessing " + "that sub-group size is %d." % (subgroup_size)) + else: + subgroup_size = subgroup_size_guess + else: + raise ValueError("Invalid value for subgroup_size: %s. subgroup_size " + "must be integer, 'guess', or, if you're feeling " + "lucky, None." % (subgroup_size)) + + # ------------------------------ + #class CacheHolder(object): + # pass + + #cache_holder = CacheHolder() + #from pytools import memoize_in + + #@memoize_in(cache_holder, "insn_count") + def get_insn_count(knl, insn, count_granularity=CountGranularity.WORKITEM): + + if count_granularity is None: + warn_with_kernel(knl, "get_insn_count_assumes_granularity", + "get_insn_count: No count granularity passed for " + "Op, assuming %s granularity." 
+ % (CountGranularity.WORKITEM)) + count_granularity == CountGranularity.WORKITEM + + if count_granularity == CountGranularity.WORKITEM: + return count_insn_runs( + knl, insn, count_redundant_work=count_redundant_work, + disregard_local_axes=False) + + ct_disregard_local = count_insn_runs( + knl, insn, disregard_local_axes=True, + count_redundant_work=count_redundant_work) + + if count_granularity == CountGranularity.WORKGROUP: + return ct_disregard_local + elif count_granularity == CountGranularity.SUBGROUP: + # get the group size + from loopy.symbolic import aff_to_expr + _, local_size = knl.get_grid_size_upper_bounds() + workgroup_size = 1 + if local_size: + for size in local_size: + s = aff_to_expr(size) + if not isinstance(s, int): + raise LoopyError("Cannot count insn with %s granularity, " + "work-group size is not integer: %s" + % (CountGranularity.SUBGROUP, local_size)) + workgroup_size *= s + + warn_with_kernel(knl, "insn_count_subgroups_upper_bound", + "get_insn_count: when counting instruction %s with " + "count_granularity=%s, using upper bound for work-group size " + "(%d work-items) to compute sub-groups per work-group. When " + "multiple device programs present, actual sub-group count may be" + "lower." % (insn, CountGranularity.SUBGROUP, workgroup_size)) + + from pytools import div_ceil + return ct_disregard_local*div_ceil(workgroup_size, subgroup_size) + else: + # this should not happen since this is enforced in Op + raise ValueError("get_insn_count: count_granularity '%s' is" + "not allowed. 
count_granularity options: %s" + % (count_granularity, CountGranularity.ALL+[None])) + # ------------------------------ + op_map = ToCountMap() op_counter = ExpressionOpCounter(knl) for insn in knl.instructions: if isinstance(insn, (CallInstruction, CInstruction, Assignment)): ops = op_counter(insn.assignee) + op_counter(insn.expression) - op_map = op_map + ops*count_insn_runs( - knl, insn, - count_redundant_work=count_redundant_work) + #op_map = op_map + ops*count_insn_runs( + # knl, insn, + # count_redundant_work=count_redundant_work) + for key, val in six.iteritems(ops): + op_map = ( + op_map + + ToCountMap({key: val}) + * get_insn_count(knl, insn, key.count_granularity)) + elif isinstance(insn, (NoOpInstruction, BarrierInstruction)): pass else: diff --git a/test/test_statistics.py b/test/test_statistics.py index 79c5ec7d..b5b55347 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -39,6 +39,9 @@ from pymbolic.primitives import Variable from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa +SGS = 32 # Subgroup size + + def test_op_counter_basic(): knl = lp.make_kernel( @@ -54,21 +57,26 @@ def test_op_counter_basic(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - op_map = lp.get_op_map(knl, count_redundant_work=True) + op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True) + n_workgroups = 1 + group_size = 1 + subgroups_per_group = div_ceil(group_size, SGS) + n_subgroups = n_workgroups*subgroups_per_group n = 512 m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32add = op_map[lp.Op(np.float32, 'add', CG.WORKITEM)].eval_with_dict(params) - f32mul = op_map[lp.Op(np.float32, 'mul', CG.WORKITEM)].eval_with_dict(params) - f32div = op_map[lp.Op(np.float32, 'div', CG.WORKITEM)].eval_with_dict(params) - f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul', CG.WORKITEM) + f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params) + f32mul = 
op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params) + f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP)].eval_with_dict(params) + f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul', CG.SUBGROUP) ].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.WORKITEM) + i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP) ].eval_with_dict(params) - assert f32add == f32mul == f32div == n*m*ell - assert f64mul == n*m - assert i32add == n*m*2 + # (count-per-sub-group)*n_subgroups + assert f32add == f32mul == f32div == n*m*ell*n_subgroups + assert f64mul == n*m*n_subgroups + assert i32add == n*m*2*n_subgroups def test_op_counter_reduction(): @@ -81,15 +89,20 @@ def test_op_counter_reduction(): name="matmul_serial", assumptions="n,m,ell >= 1") knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32)) - op_map = lp.get_op_map(knl, count_redundant_work=True) + op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True) + n_workgroups = 1 + group_size = 1 + subgroups_per_group = div_ceil(group_size, SGS) + n_subgroups = n_workgroups*subgroups_per_group n = 512 m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32add = op_map[lp.Op(np.float32, 'add', CG.WORKITEM)].eval_with_dict(params) - f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul', CG.WORKITEM) + f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params) + f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul', CG.SUBGROUP) ].eval_with_dict(params) - assert f32add == f32mul == n*m*ell + # (count-per-sub-group)*n_subgroups + assert f32add == f32mul == n*m*ell*n_subgroups op_map_dtype = op_map.group_by('dtype') f32 = op_map_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params) @@ -111,21 +124,26 @@ def test_op_counter_logic(): name="logic", assumptions="n,m,ell >= 1") knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64)) - op_map = lp.get_op_map(knl, count_redundant_work=True) + op_map = lp.get_op_map(knl, 
subgroup_size=SGS, count_redundant_work=True) + n_workgroups = 1 + group_size = 1 + subgroups_per_group = div_ceil(group_size, SGS) + n_subgroups = n_workgroups*subgroups_per_group n = 512 m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32mul = op_map[lp.Op(np.float32, 'mul', CG.WORKITEM)].eval_with_dict(params) - f64add = op_map[lp.Op(np.float64, 'add', CG.WORKITEM)].eval_with_dict(params) - f64div = op_map[lp.Op(np.dtype(np.float64), 'div', CG.WORKITEM) + f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params) + f64add = op_map[lp.Op(np.float64, 'add', CG.SUBGROUP)].eval_with_dict(params) + f64div = op_map[lp.Op(np.dtype(np.float64), 'div', CG.SUBGROUP) ].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.WORKITEM) + i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP) ].eval_with_dict(params) - assert f32mul == n*m - assert f64div == 2*n*m # TODO why? - assert f64add == n*m - assert i32add == n*m + # (count-per-sub-group)*n_subgroups + assert f32mul == n*m*n_subgroups + assert f64div == 2*n*m*n_subgroups # TODO why? 
+ assert f64add == n*m*n_subgroups + assert i32add == n*m*n_subgroups def test_op_counter_specialops(): @@ -143,27 +161,32 @@ def test_op_counter_specialops(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - op_map = lp.get_op_map(knl, count_redundant_work=True) + op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True) + n_workgroups = 1 + group_size = 1 + subgroups_per_group = div_ceil(group_size, SGS) + n_subgroups = n_workgroups*subgroups_per_group n = 512 m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32mul = op_map[lp.Op(np.float32, 'mul', CG.WORKITEM)].eval_with_dict(params) - f32div = op_map[lp.Op(np.float32, 'div', CG.WORKITEM)].eval_with_dict(params) - f32add = op_map[lp.Op(np.float32, 'add', CG.WORKITEM)].eval_with_dict(params) - f64pow = op_map[lp.Op(np.float64, 'pow', CG.WORKITEM)].eval_with_dict(params) - f64add = op_map[lp.Op(np.dtype(np.float64), 'add', CG.WORKITEM) + f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params) + f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP)].eval_with_dict(params) + f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params) + f64pow = op_map[lp.Op(np.float64, 'pow', CG.SUBGROUP)].eval_with_dict(params) + f64add = op_map[lp.Op(np.dtype(np.float64), 'add', CG.SUBGROUP) ].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.WORKITEM) + i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP) ].eval_with_dict(params) - f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt', CG.WORKITEM) + f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt', CG.SUBGROUP) ].eval_with_dict(params) - f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin', CG.WORKITEM) + f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin', CG.SUBGROUP) ].eval_with_dict(params) - assert f32div == 2*n*m*ell - assert f32mul == f32add == n*m*ell - assert f64add == 3*n*m - assert f64pow == i32add == 
f64rsq == f64sin == n*m + # (count-per-sub-group)*n_subgroups + assert f32div == 2*n*m*ell*n_subgroups + assert f32mul == f32add == n*m*ell*n_subgroups + assert f64add == 3*n*m*n_subgroups + assert f64pow == i32add == f64rsq == f64sin == n*m*n_subgroups def test_op_counter_bitwise(): @@ -183,26 +206,31 @@ def test_op_counter_bitwise(): a=np.int32, b=np.int32, g=np.int64, h=np.int64)) - op_map = lp.get_op_map(knl, count_redundant_work=True) + op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True) + n_workgroups = 1 + group_size = 1 + subgroups_per_group = div_ceil(group_size, SGS) + n_subgroups = n_workgroups*subgroups_per_group n = 512 m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - i32add = op_map[lp.Op(np.int32, 'add', CG.WORKITEM)].eval_with_dict(params) - i32bw = op_map[lp.Op(np.int32, 'bw', CG.WORKITEM)].eval_with_dict(params) - i64bw = op_map[lp.Op(np.dtype(np.int64), 'bw', CG.WORKITEM) + i32add = op_map[lp.Op(np.int32, 'add', CG.SUBGROUP)].eval_with_dict(params) + i32bw = op_map[lp.Op(np.int32, 'bw', CG.SUBGROUP)].eval_with_dict(params) + i64bw = op_map[lp.Op(np.dtype(np.int64), 'bw', CG.SUBGROUP) ].eval_with_dict(params) - i64mul = op_map[lp.Op(np.dtype(np.int64), 'mul', CG.WORKITEM) + i64mul = op_map[lp.Op(np.dtype(np.int64), 'mul', CG.SUBGROUP) ].eval_with_dict(params) - i64add = op_map[lp.Op(np.dtype(np.int64), 'add', CG.WORKITEM) + i64add = op_map[lp.Op(np.dtype(np.int64), 'add', CG.SUBGROUP) ].eval_with_dict(params) - i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift', CG.WORKITEM) + i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift', CG.SUBGROUP) ].eval_with_dict(params) - assert i32add == n*m+n*m*ell - assert i32bw == 2*n*m*ell - assert i64bw == 2*n*m - assert i64add == i64mul == n*m - assert i64shift == 2*n*m + # (count-per-sub-group)*n_subgroups + assert i32add == n*m+n*m*ell*n_subgroups + assert i32bw == 2*n*m*ell*n_subgroups + assert i64bw == 2*n*m*n_subgroups + assert i64add == i64mul == n*m*n_subgroups + assert 
i64shift == 2*n*m*n_subgroups def test_op_counter_triangular_domain(): @@ -228,15 +256,21 @@ def test_op_counter_triangular_domain(): op_map = lp.get_op_map( knl, + subgroup_size=SGS, count_redundant_work=True - )[lp.Op(np.float64, 'mul', CG.WORKITEM)] + )[lp.Op(np.float64, 'mul', CG.SUBGROUP)] value_dict = dict(m=13, n=200) flops = op_map.eval_with_dict(value_dict) + n_workgroups = 1 + group_size = 1 + subgroups_per_group = div_ceil(group_size, SGS) + n_subgroups = n_workgroups*subgroups_per_group + if expect_fallback: - assert flops == 144 + assert flops == 144*n_subgroups else: - assert flops == 78 + assert flops == 78*n_subgroups def test_mem_access_counter_basic(): @@ -254,10 +288,8 @@ def test_mem_access_counter_basic(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - subgroup_size = 32 - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, - subgroup_size=subgroup_size) + subgroup_size=SGS) n = 512 m = 256 @@ -266,7 +298,8 @@ def test_mem_access_counter_basic(): n_workgroups = 1 group_size = 1 - subgroups_per_group = div_ceil(group_size, subgroup_size) + subgroups_per_group = div_ceil(group_size, SGS) + n_subgroups = n_workgroups*subgroups_per_group f32l = mem_map[lp.MemAccess('global', np.float32, lid_strides={}, gid_strides={}, @@ -289,9 +322,9 @@ def test_mem_access_counter_basic(): count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group - assert f32l == (3*n*m*ell)*n_workgroups*subgroups_per_group - assert f64l == (2*n*m)*n_workgroups*subgroups_per_group + # uniform: (count-per-sub-group)*n_subgroups + assert f32l == (3*n*m*ell)*n_subgroups + assert f64l == (2*n*m)*n_subgroups f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), lid_strides={}, gid_strides={}, @@ -304,9 +337,9 @@ def test_mem_access_counter_basic(): count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - # uniform: 
(count-per-sub-group)*n_workgroups*subgroups_per_group - assert f32s == (n*m*ell)*n_workgroups*subgroups_per_group - assert f64s == (n*m)*n_workgroups*subgroups_per_group + # uniform: (count-per-sub-group)*n_subgroups + assert f32s == (n*m*ell)*n_subgroups + assert f64s == (n*m)*n_subgroups def test_mem_access_counter_reduction(): @@ -320,10 +353,8 @@ def test_mem_access_counter_reduction(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32)) - subgroup_size = 32 - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, - subgroup_size=subgroup_size) + subgroup_size=SGS) n = 512 m = 256 ell = 128 @@ -331,7 +362,8 @@ def test_mem_access_counter_reduction(): n_workgroups = 1 group_size = 1 - subgroups_per_group = div_ceil(group_size, subgroup_size) + subgroups_per_group = div_ceil(group_size, SGS) + n_subgroups = n_workgroups*subgroups_per_group f32l = mem_map[lp.MemAccess('global', np.float32, lid_strides={}, gid_strides={}, @@ -344,8 +376,8 @@ def test_mem_access_counter_reduction(): count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group - assert f32l == (2*n*m*ell)*n_workgroups*subgroups_per_group + # uniform: (count-per-sub-group)*n_subgroups + assert f32l == (2*n*m*ell)*n_subgroups f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), lid_strides={}, gid_strides={}, @@ -353,8 +385,8 @@ def test_mem_access_counter_reduction(): count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group - assert f32s == (n*ell)*n_workgroups*subgroups_per_group + # uniform: (count-per-sub-group)*n_subgroups + assert f32s == (n*ell)*n_subgroups ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'] ).to_bytes().eval_and_sum(params) @@ -379,10 +411,8 @@ def test_mem_access_counter_logic(): knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64)) - subgroup_size = 32 - mem_map = 
lp.get_mem_access_map(knl, count_redundant_work=True, - subgroup_size=subgroup_size) + subgroup_size=SGS) n = 512 m = 256 ell = 128 @@ -390,7 +420,8 @@ def test_mem_access_counter_logic(): n_workgroups = 1 group_size = 1 - subgroups_per_group = div_ceil(group_size, subgroup_size) + subgroups_per_group = div_ceil(group_size, SGS) + n_subgroups = n_workgroups*subgroups_per_group reduced_map = mem_map.group_by('mtype', 'dtype', 'direction') @@ -404,10 +435,10 @@ def test_mem_access_counter_logic(): direction='store') ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group - assert f32_g_l == (2*n*m)*n_workgroups*subgroups_per_group - assert f64_g_l == (n*m)*n_workgroups*subgroups_per_group - assert f64_g_s == (n*m)*n_workgroups*subgroups_per_group + # uniform: (count-per-sub-group)*n_subgroups + assert f32_g_l == (2*n*m)*n_subgroups + assert f64_g_l == (n*m)*n_subgroups + assert f64_g_s == (n*m)*n_subgroups def test_mem_access_counter_specialops(): @@ -425,10 +456,8 @@ def test_mem_access_counter_specialops(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - subgroup_size = 32 - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, - subgroup_size=subgroup_size) + subgroup_size=SGS) n = 512 m = 256 ell = 128 @@ -436,7 +465,8 @@ def test_mem_access_counter_specialops(): n_workgroups = 1 group_size = 1 - subgroups_per_group = div_ceil(group_size, subgroup_size) + subgroups_per_group = div_ceil(group_size, SGS) + n_subgroups = n_workgroups*subgroups_per_group f32 = mem_map[lp.MemAccess('global', np.float32, lid_strides={}, gid_strides={}, @@ -459,9 +489,9 @@ def test_mem_access_counter_specialops(): count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group - assert f32 == (2*n*m*ell)*n_workgroups*subgroups_per_group - assert f64 == (2*n*m)*n_workgroups*subgroups_per_group + # uniform: 
(count-per-sub-group)*n_subgroups + assert f32 == (2*n*m*ell)*n_subgroups + assert f64 == (2*n*m)*n_subgroups f32 = mem_map[lp.MemAccess('global', np.float32, lid_strides={}, gid_strides={}, @@ -474,16 +504,16 @@ def test_mem_access_counter_specialops(): count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group - assert f32 == (n*m*ell)*n_workgroups*subgroups_per_group - assert f64 == (n*m)*n_workgroups*subgroups_per_group + # uniform: (count-per-sub-group)*n_subgroups + assert f32 == (n*m*ell)*n_subgroups + assert f64 == (n*m)*n_subgroups filtered_map = mem_map.filter_by(direction=['load'], variable=['a', 'g'], count_granularity=CG.SUBGROUP) tot = filtered_map.eval_and_sum(params) - # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group - assert tot == (n*m*ell + n*m)*n_workgroups*subgroups_per_group + # uniform: (count-per-sub-group)*n_subgroups + assert tot == (n*m*ell + n*m)*n_subgroups def test_mem_access_counter_bitwise(): @@ -503,10 +533,8 @@ def test_mem_access_counter_bitwise(): a=np.int32, b=np.int32, g=np.int32, h=np.int32)) - subgroup_size = 32 - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, - subgroup_size=subgroup_size) + subgroup_size=SGS) n = 512 m = 256 ell = 128 @@ -514,7 +542,8 @@ def test_mem_access_counter_bitwise(): n_workgroups = 1 group_size = 1 - subgroups_per_group = div_ceil(group_size, subgroup_size) + subgroups_per_group = div_ceil(group_size, SGS) + n_subgroups = n_workgroups*subgroups_per_group i32 = mem_map[lp.MemAccess('global', np.int32, lid_strides={}, gid_strides={}, @@ -537,8 +566,8 @@ def test_mem_access_counter_bitwise(): count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group - assert i32 == (4*n*m+2*n*m*ell)*n_workgroups*subgroups_per_group + # uniform: (count-per-sub-group)*n_subgroups + assert i32 == (4*n*m+2*n*m*ell)*n_subgroups i32 = 
mem_map[lp.MemAccess('global', np.int32, lid_strides={}, gid_strides={}, @@ -551,8 +580,8 @@ def test_mem_access_counter_bitwise(): count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group - assert i32 == (n*m+n*m*ell)*n_workgroups*subgroups_per_group + # uniform: (count-per-sub-group)*n_subgroups + assert i32 == (n*m+n*m*ell)*n_subgroups def test_mem_access_counter_mixed(): @@ -571,7 +600,6 @@ def test_mem_access_counter_mixed(): x=np.float32)) group_size_0 = 65 - subgroup_size = 32 knl = lp.split_iname(knl, "j", group_size_0) knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"}) @@ -583,10 +611,11 @@ def test_mem_access_counter_mixed(): n_workgroups = div_ceil(ell, group_size_0) group_size = group_size_0 - subgroups_per_group = div_ceil(group_size, subgroup_size) + subgroups_per_group = div_ceil(group_size, SGS) + n_subgroups = n_workgroups*subgroups_per_group mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, - subgroup_size=subgroup_size) + subgroup_size=SGS) f64uniform = mem_map[lp.MemAccess('global', np.float64, lid_strides={}, gid_strides={}, direction='load', variable='g', @@ -617,9 +646,9 @@ def test_mem_access_counter_mixed(): count_granularity=CG.WORKITEM) ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group - assert f64uniform == (2*n*m)*n_workgroups*subgroups_per_group - assert f32uniform == (m*n)*n_workgroups*subgroups_per_group + # uniform: (count-per-sub-group)*n_subgroups + assert f64uniform == (2*n*m)*n_subgroups + assert f32uniform == (m*n)*n_subgroups expect_fallback = False import islpy as isl @@ -651,8 +680,8 @@ def test_mem_access_counter_mixed(): count_granularity=CG.WORKITEM) ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group - assert f64uniform == m*n*n_workgroups*subgroups_per_group + # uniform: (count-per-sub-group)*n_subgroups + assert f64uniform == m*n*n_subgroups if 
expect_fallback: if ell < group_size_0: @@ -681,7 +710,7 @@ def test_mem_access_counter_nonconsec(): knl = lp.tag_inames(knl, {"i_inner": "l.0", "i_outer": "g.0"}) mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, - subgroup_size=32) # noqa + subgroup_size=SGS) # noqa n = 512 m = 256 ell = 128 @@ -939,30 +968,35 @@ def test_all_counters_parallel_matmul(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} + group_size = bsize*bsize + n_workgroups = div_ceil(n, bsize)*div_ceil(ell, bsize) + subgroups_per_group = div_ceil(group_size, SGS) + n_subgroups = n_workgroups*subgroups_per_group sync_map = lp.get_synchronization_map(knl) assert len(sync_map) == 2 assert sync_map["kernel_launch"].eval_with_dict(params) == 1 assert sync_map["barrier_local"].eval_with_dict(params) == 2*m/bsize - op_map = lp.get_op_map(knl, count_redundant_work=True) + op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True) f32mul = op_map[ - lp.Op(np.float32, 'mul', CG.WORKITEM) + lp.Op(np.float32, 'mul', CG.SUBGROUP) ].eval_with_dict(params) f32add = op_map[ - lp.Op(np.float32, 'add', CG.WORKITEM) + lp.Op(np.float32, 'add', CG.SUBGROUP) ].eval_with_dict(params) i32ops = op_map[ - lp.Op(np.int32, 'add', CG.WORKITEM) + lp.Op(np.int32, 'add', CG.SUBGROUP) ].eval_with_dict(params) i32ops += op_map[ - lp.Op(np.dtype(np.int32), 'mul', CG.WORKITEM) + lp.Op(np.dtype(np.int32), 'mul', CG.SUBGROUP) ].eval_with_dict(params) - assert f32mul+f32add == n*m*ell*2 + # (count-per-sub-group)*n_subgroups + assert f32mul+f32add == m*2*n_subgroups mem_access_map = lp.get_mem_access_map(knl, count_redundant_work=True, - subgroup_size=32) + subgroup_size=SGS) f32s1lb = mem_access_map[lp.MemAccess('global', np.float32, lid_strides={0: 1, 1: Variable('ell')}, @@ -991,7 +1025,7 @@ def test_all_counters_parallel_matmul(): local_mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, - subgroup_size=32).filter_by(mtype=['local']) + subgroup_size=SGS).filter_by(mtype=['local']) 
local_mem_l = local_mem_map.filter_by(direction=['load'] ).eval_and_sum(params) @@ -1067,8 +1101,6 @@ def test_summations_and_filters(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - subgroup_size = 32 - n = 512 m = 256 ell = 128 @@ -1076,24 +1108,25 @@ def test_summations_and_filters(): n_workgroups = 1 group_size = 1 - subgroups_per_group = div_ceil(group_size, subgroup_size) + subgroups_per_group = div_ceil(group_size, SGS) + n_subgroups = n_workgroups*subgroups_per_group mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, - subgroup_size=subgroup_size) + subgroup_size=SGS) loads_a = mem_map.filter_by(direction=['load'], variable=['a'], count_granularity=[CG.SUBGROUP] ).eval_and_sum(params) - # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group - assert loads_a == (2*n*m*ell)*n_workgroups*subgroups_per_group + # uniform: (count-per-sub-group)*n_subgroups + assert loads_a == (2*n*m*ell)*n_subgroups global_stores = mem_map.filter_by(mtype=['global'], direction=['store'], count_granularity=[CG.SUBGROUP] ).eval_and_sum(params) - # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group - assert global_stores == (n*m*ell + n*m)*n_workgroups*subgroups_per_group + # uniform: (count-per-sub-group)*n_subgroups + assert global_stores == (n*m*ell + n*m)*n_subgroups ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'], count_granularity=[CG.SUBGROUP] @@ -1102,9 +1135,9 @@ def test_summations_and_filters(): count_granularity=[CG.SUBGROUP] ).to_bytes().eval_and_sum(params) - # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group - assert ld_bytes == (4*n*m*ell*3 + 8*n*m*2)*n_workgroups*subgroups_per_group - assert st_bytes == (4*n*m*ell + 8*n*m)*n_workgroups*subgroups_per_group + # uniform: (count-per-sub-group)*n_subgroups + assert ld_bytes == (4*n*m*ell*3 + 8*n*m*2)*n_subgroups + assert st_bytes == (4*n*m*ell + 8*n*m)*n_subgroups # ignore stride and variable names 
in this map reduced_map = mem_map.group_by('mtype', 'dtype', 'direction') @@ -1113,11 +1146,11 @@ def test_summations_and_filters(): f64lall = reduced_map[lp.MemAccess('global', np.float64, direction='load') ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group - assert f32lall == (3*n*m*ell)*n_workgroups*subgroups_per_group - assert f64lall == (2*n*m)*n_workgroups*subgroups_per_group + # uniform: (count-per-sub-group)*n_subgroups + assert f32lall == (3*n*m*ell)*n_subgroups + assert f64lall == (2*n*m)*n_subgroups - op_map = lp.get_op_map(knl, count_redundant_work=True) + op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True) #for k, v in op_map.items(): # print(type(k), "\n", k.name, k.dtype, type(k.dtype), " :\n", v) @@ -1149,8 +1182,8 @@ def test_summations_and_filters(): key.direction == 'load' f64l = mem_map.filter_by_func(func_filter).eval_and_sum(params) - # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group - assert f64l == (2*n*m)*n_workgroups*subgroups_per_group + # uniform: (count-per-sub-group)*n_subgroups + assert f64l == (2*n*m)*n_subgroups def test_strided_footprint(): -- GitLab From e382422b64be0a67c5f892f5bfb22cae9aa5c846 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Wed, 1 Aug 2018 14:29:19 -0500 Subject: [PATCH 02/80] now counting local access with count_granularity=subgroup --- loopy/statistics.py | 6 ++++-- test/test_statistics.py | 13 ++++++++----- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 2df3093d..f8999367 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -928,7 +928,8 @@ class LocalMemAccessCounter(MemAccessCounter): sub_map[MemAccess( mtype='local', dtype=dtype, - count_granularity=CountGranularity.WORKITEM) + #count_granularity=CountGranularity.WORKITEM) + count_granularity=CountGranularity.SUBGROUP) ] = 1 return sub_map @@ -948,7 +949,8 @@ class LocalMemAccessCounter(MemAccessCounter): 
lid_strides=dict(sorted(six.iteritems(lid_strides))), gid_strides=dict(sorted(six.iteritems(gid_strides))), variable=name, - count_granularity=CountGranularity.WORKITEM)] = 1 + #count_granularity=CountGranularity.WORKITEM)] = 1 + count_granularity=CountGranularity.SUBGROUP)] = 1 return sub_map diff --git a/test/test_statistics.py b/test/test_statistics.py index b5b55347..3f236652 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -1029,29 +1029,32 @@ def test_all_counters_parallel_matmul(): local_mem_l = local_mem_map.filter_by(direction=['load'] ).eval_and_sum(params) - assert local_mem_l == n*m*ell*2 + # (count-per-sub-group)*n_subgroups + assert local_mem_l == m*2*n_subgroups local_mem_l_a = local_mem_map[lp.MemAccess('local', np.dtype(np.float32), direction='load', lid_strides={1: 16}, gid_strides={}, variable='a_fetch', - count_granularity=CG.WORKITEM) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) local_mem_l_b = local_mem_map[lp.MemAccess('local', np.dtype(np.float32), direction='load', lid_strides={0: 1}, gid_strides={}, variable='b_fetch', - count_granularity=CG.WORKITEM) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - assert local_mem_l_a == local_mem_l_b == n*m*ell + # (count-per-sub-group)*n_subgroups + assert local_mem_l_a == local_mem_l_b == m*n_subgroups local_mem_s = local_mem_map.filter_by(direction=['store'] ).eval_and_sum(params) - assert local_mem_s == n*m*ell*2/bsize + # (count-per-sub-group)*n_subgroups + assert local_mem_s == m*2/bsize*n_subgroups def test_gather_access_footprint(): -- GitLab From 906090e11804a78c0c06455f5ea29b7e61657868 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 12 Aug 2018 16:38:04 +0530 Subject: [PATCH 03/80] those were a lot of changes :o --- doc/index.rst | 1 + examples/python/global_barrier_removal.py | 2 +- examples/python/hello-loopy.py | 3 +- examples/python/ispc-stream-harness.py | 2 - examples/python/sparse.py | 4 +- loopy/__init__.py | 36 +- 
loopy/auto_test.py | 289 ++++++-------- loopy/check.py | 137 ++++++- loopy/cli.py | 2 +- loopy/codegen/__init__.py | 90 ++++- loopy/codegen/control.py | 3 +- loopy/codegen/loop.py | 2 +- loopy/codegen/result.py | 2 +- loopy/isl_helpers.py | 2 +- loopy/kernel/__init__.py | 132 ++++--- loopy/kernel/creation.py | 35 +- loopy/kernel/data.py | 6 +- loopy/kernel/instruction.py | 34 +- loopy/kernel/tools.py | 35 +- loopy/library/function.py | 54 +-- loopy/library/random123.py | 108 ++--- loopy/library/reduction.py | 256 ++++++------ loopy/loop.py | 2 + loopy/preprocess.py | 320 +++++++++++++-- loopy/schedule/__init__.py | 21 +- loopy/statistics.py | 462 ++++++++++++++-------- loopy/symbolic.py | 105 ++++- loopy/target/__init__.py | 9 +- loopy/target/c/__init__.py | 245 ++++++------ loopy/target/c/c_execution.py | 39 +- loopy/target/c/codegen/expression.py | 92 ++--- loopy/target/cuda.py | 98 +++-- loopy/target/execution.py | 116 +++--- loopy/target/ispc.py | 5 +- loopy/target/opencl.py | 209 ++++++---- loopy/target/pyopencl.py | 129 ++++-- loopy/target/pyopencl_execution.py | 61 +-- loopy/target/python.py | 57 ++- loopy/tools.py | 3 +- loopy/transform/add_barrier.py | 12 +- loopy/transform/arithmetic.py | 6 + loopy/transform/batch.py | 8 +- loopy/transform/buffer.py | 43 +- loopy/transform/data.py | 54 ++- loopy/transform/diff.py | 3 + loopy/transform/fusion.py | 56 ++- loopy/transform/iname.py | 60 ++- loopy/transform/instruction.py | 37 +- loopy/transform/padding.py | 15 +- loopy/transform/parameter.py | 6 + loopy/transform/precompute.py | 38 +- loopy/transform/save.py | 27 +- loopy/transform/subst.py | 20 +- loopy/type_inference.py | 354 +++++++++++++++-- test/test_apps.py | 19 +- test/test_c_execution.py | 1 + test/test_diff.py | 3 +- test/test_domain.py | 74 ++-- test/test_fortran.py | 12 +- test/test_loopy.py | 393 +++++++++--------- test/test_numa_diff.py | 4 +- test/test_reduction.py | 46 ++- test/test_target.py | 14 +- test/test_transform.py | 116 +++--- 
test/testlib.py | 50 ++- 65 files changed, 3071 insertions(+), 1608 deletions(-) diff --git a/doc/index.rst b/doc/index.rst index d862a8ac..0644b34c 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -68,6 +68,7 @@ Please check :ref:`installation` to get started. ref_creation ref_kernel ref_transform + ref_call ref_other misc diff --git a/examples/python/global_barrier_removal.py b/examples/python/global_barrier_removal.py index 7ab049cd..cc4926fe 100644 --- a/examples/python/global_barrier_removal.py +++ b/examples/python/global_barrier_removal.py @@ -24,7 +24,7 @@ from loopy.preprocess import preprocess_kernel knl = preprocess_kernel(knl) from loopy.schedule import get_one_scheduled_kernel -knl = get_one_scheduled_kernel(knl) +knl = get_one_scheduled_kernel(knl.root_kernel, knl.program_callables_info) # map schedule onto host or device print(knl) diff --git a/examples/python/hello-loopy.py b/examples/python/hello-loopy.py index 9098c544..764cea0e 100644 --- a/examples/python/hello-loopy.py +++ b/examples/python/hello-loopy.py @@ -16,7 +16,8 @@ a = cl.array.arange(queue, n, dtype=np.float32) # ------ knl = lp.make_kernel( "{ [i]: 0<=i max_test_kernel_count: - break + if need_check and not AUTO_TEST_SKIP_RUN: + for arg_desc in ref_arg_data: + if arg_desc is None: + continue + if not arg_desc.needs_checking: + continue - kernel = infer_unknown_types(kernel, expect_completion=True) + from pyopencl.compyte.array import as_strided + ref_ary = as_strided( + arg_desc.ref_storage_array.get(), + shape=arg_desc.ref_shape, + strides=arg_desc.ref_numpy_strides).flatten() + test_ary = as_strided( + arg_desc.test_storage_array.get(), + shape=arg_desc.test_shape, + strides=arg_desc.test_numpy_strides).flatten() + common_len = min(len(ref_ary), len(test_ary)) + ref_ary = ref_ary[:common_len] + test_ary = test_ary[:common_len] - compiled = CompiledKernel(ctx, kernel) + error_is_small, error = check_result(test_ary, ref_ary) + if not error_is_small: + raise 
AutomaticTestFailure(error) - if args is None: - kernel_info = compiled.kernel_info(frozenset()) + need_check = False - args = make_args(kernel, - kernel_info.implemented_data_info, - queue, ref_arg_data, parameters) - args["out_host"] = False + events = [] + queue.finish() - if not quiet: - print(75*"-") - print("Kernel #%d:" % i) - print(75*"-") - if print_code: - print(compiled.get_highlighted_code()) - print(75*"-") - if dump_binary: - print(type(compiled.cl_program)) - print(compiled.cl_program.binaries[0]) - print(75*"-") + logger.info("%s: warmup done" % (test_prog.name)) - logger.info("%s: run warmup" % (knl.name)) + logger.info("%s: timing run" % (test_prog.name)) - for i in range(warmup_rounds): - if not AUTO_TEST_SKIP_RUN: - compiled(queue, **args) - - if need_check and not AUTO_TEST_SKIP_RUN: - for arg_desc in ref_arg_data: - if arg_desc is None: - continue - if not arg_desc.needs_checking: - continue - - from pyopencl.compyte.array import as_strided - ref_ary = as_strided( - arg_desc.ref_storage_array.get(), - shape=arg_desc.ref_shape, - strides=arg_desc.ref_numpy_strides).flatten() - test_ary = as_strided( - arg_desc.test_storage_array.get(), - shape=arg_desc.test_shape, - strides=arg_desc.test_numpy_strides).flatten() - common_len = min(len(ref_ary), len(test_ary)) - ref_ary = ref_ary[:common_len] - test_ary = test_ary[:common_len] - - error_is_small, error = check_result(test_ary, ref_ary) - if not error_is_small: - raise AutomaticTestFailure(error) - - need_check = False - - events = [] - queue.finish() + timing_rounds = warmup_rounds - logger.info("%s: warmup done" % (knl.name)) + while True: + from time import time + start_time = time() - logger.info("%s: timing run" % (knl.name)) + evt_start = cl.enqueue_marker(queue) - timing_rounds = warmup_rounds + for i in range(timing_rounds): + if not AUTO_TEST_SKIP_RUN: + evt, _ = test_prog(queue, **args) + events.append(evt) + else: + events.append(cl.enqueue_marker(queue)) - while True: - from time 
import time - start_time = time() + evt_end = cl.enqueue_marker(queue) - evt_start = cl.enqueue_marker(queue) + queue.finish() + stop_time = time() - for i in range(timing_rounds): - if not AUTO_TEST_SKIP_RUN: - evt, _ = compiled(queue, **args) - events.append(evt) - else: - events.append(cl.enqueue_marker(queue)) + for evt in events: + evt.wait() + evt_start.wait() + evt_end.wait() - evt_end = cl.enqueue_marker(queue) + elapsed_event = (1e-9*events[-1].profile.END + - 1e-9*events[0].profile.START) \ + / timing_rounds + try: + elapsed_event_marker = ((1e-9*evt_end.profile.START + - 1e-9*evt_start.profile.START) + / timing_rounds) + except cl.RuntimeError: + elapsed_event_marker = None - queue.finish() - stop_time = time() + elapsed_wall = (stop_time-start_time)/timing_rounds - for evt in events: - evt.wait() - evt_start.wait() - evt_end.wait() + if elapsed_wall * timing_rounds < 0.3: + timing_rounds *= 4 + else: + break - elapsed_event = (1e-9*events[-1].profile.END - - 1e-9*events[0].profile.START) \ - / timing_rounds - try: - elapsed_event_marker = ((1e-9*evt_end.profile.START - - 1e-9*evt_start.profile.START) - / timing_rounds) - except cl.RuntimeError: - elapsed_event_marker = None + logger.info("%s: timing run done" % (test_prog.name)) - elapsed_wall = (stop_time-start_time)/timing_rounds + rates = "" + for cnt, lbl in zip(op_count, op_label): + rates += " %g %s/s" % (cnt/elapsed_wall, lbl) - if elapsed_wall * timing_rounds < 0.3: - timing_rounds *= 4 + if not quiet: + def format_float_or_none(v): + if v is None: + return "" else: - break + return "%g" % v - logger.info("%s: timing run done" % (knl.name)) + print("elapsed: %s s event, %s s marker-event %s s wall " + "(%d rounds)%s" % ( + format_float_or_none(elapsed_event), + format_float_or_none(elapsed_event_marker), + format_float_or_none(elapsed_wall), timing_rounds, rates)) - rates = "" + if do_check: + ref_rates = "" for cnt, lbl in zip(op_count, op_label): - rates += " %g %s/s" % (cnt/elapsed_wall, lbl) 
- + ref_rates += " %g %s/s" % (cnt/ref_elapsed_event, lbl) if not quiet: - def format_float_or_none(v): - if v is None: - return "" - else: - return "%g" % v - - print("elapsed: %s s event, %s s marker-event %s s wall " - "(%d rounds)%s" % ( - format_float_or_none(elapsed_event), - format_float_or_none(elapsed_event_marker), - format_float_or_none(elapsed_wall), timing_rounds, rates)) - - if do_check: - ref_rates = "" - for cnt, lbl in zip(op_count, op_label): - ref_rates += " %g %s/s" % (cnt/ref_elapsed_event, lbl) - if not quiet: - print("ref: elapsed: %g s event, %g s wall%s" % ( - ref_elapsed_event, ref_elapsed_wall, ref_rates)) + print("ref: elapsed: %g s event, %g s wall%s" % ( + ref_elapsed_event, ref_elapsed_wall, ref_rates)) # }}} diff --git a/loopy/check.py b/loopy/check.py index c31304d8..ae5599bc 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -27,9 +27,13 @@ from six.moves import range from islpy import dim_type import islpy as isl -from loopy.symbolic import WalkMapper +from loopy.symbolic import WalkMapper, CombineMapper, ResolvedFunction from loopy.diagnostic import LoopyError, WriteRaceConditionWarning, warn_with_kernel +from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, + _DataObliviousInstruction) +from functools import reduce + import logging logger = logging.getLogger(__name__) @@ -56,6 +60,73 @@ def check_identifiers_in_subst_rules(knl): % (knl.name, rule.name, ", ".join(deps-rule_allowed_identifiers))) + +class UnscopedCallCollector(CombineMapper): + """ + Collects all the unscoped calls within a kernel. + + :returns: + An :class:`frozenset` of function names that are not scoped in + the kernel. + + .. note:: + :class:`loopy.library.reduction.ArgExtOp` are ignored, as they are + never scoped in the pipeline. 
+ """ + + def combine(self, values): + import operator + return reduce(operator.or_, values, frozenset()) + + def map_call(self, expr): + from pymbolic.primitives import CallWithKwargs + return self.rec(CallWithKwargs( + function=expr.function, parameters=expr.parameters, + kw_parameters={})) + + def map_call_with_kwargs(self, expr): + from loopy.library.reduction import ArgExtOp + if not isinstance(expr.function, (ResolvedFunction, ArgExtOp)): + return (frozenset([expr.function.name]) | + self.combine((self.rec(child) for child in expr.parameters + + tuple(expr.kw_parameters.values())))) + else: + return self.combine((self.rec(child) for child in + expr.parameters+tuple(expr.kw_parameters.values()))) + + def map_constant(self, expr): + return frozenset() + + map_variable = map_constant + map_function_symbol = map_constant + map_tagged_variable = map_constant + map_type_cast = map_constant + + +def check_functions_are_scoped(kernel): + """ Checks if all the calls in the instruction expression have been scoped, + otherwise indicates to what all calls we await signature. Refer + :class:`loopy.symbolic.ResolvedFunction` for a detailed explanation of a + scoped function. + """ + + from loopy.symbolic import SubstitutionRuleExpander + subst_expander = SubstitutionRuleExpander(kernel.substitutions) + + for insn in kernel.instructions: + if isinstance(insn, MultiAssignmentBase): + unscoped_calls = UnscopedCallCollector()(subst_expander( + insn.expression)) + if unscoped_calls: + raise LoopyError("Unknown function '%s' obtained -- register a " + "function or a kernel corresponding to it." 
% + set(unscoped_calls).pop()) + elif isinstance(insn, (CInstruction, _DataObliviousInstruction)): + pass + else: + raise NotImplementedError( + "Unknown type of instruction %s" % type(insn).__name__) + # }}} @@ -114,6 +185,18 @@ def check_loop_priority_inames_known(kernel): raise LoopyError("unknown iname '%s' in loop priorities" % iname) +def _get_all_unique_iname_tags(kernel): + """Returns a set of all the iname tags used in *kernel* that + inherit from :class:`loopy.kernel.data.UniqueTag`. + """ + from loopy.kernel.data import UniqueTag + iname_tags = [kernel.iname_to_tag.get(iname) for iname in + kernel.all_inames()] + return set( + tag for tag in iname_tags if + isinstance(tag, UniqueTag)) + + def check_multiple_tags_allowed(kernel): from loopy.kernel.data import (GroupIndexTag, LocalIndexTag, VectorizeTag, UnrollTag, ForceSequentialTag, IlpBaseTag, filter_iname_tags_by_type) @@ -128,8 +211,10 @@ def check_multiple_tags_allowed(kernel): "tags: {1}".format(iname, tags)) -def check_for_double_use_of_hw_axes(kernel): +def check_for_double_use_of_hw_axes(kernel, program_callables_info): from loopy.kernel.data import UniqueTag + from loopy.kernel.instruction import CallInstruction + from loopy.kernel.function_interface import CallableKernel for insn in kernel.instructions: insn_tag_keys = set() @@ -142,6 +227,21 @@ def check_for_double_use_of_hw_axes(kernel): insn_tag_keys.add(key) + # check usage of iname tags in the callee kernel + if isinstance(insn, CallInstruction): + in_knl_callable = program_callables_info[ + insn.expression.function.name] + if isinstance(in_knl_callable, CallableKernel): + # check for collision in iname_tag keys in the instruction + # due to the callee kernel + common_iname_tags = [tag for tag in + _get_all_unique_iname_tags(in_knl_callable.subkernel) + if tag.key in insn_tag_keys] + if common_iname_tags: + raise LoopyError("instruction '%s' has multiple " + "inames tagged '%s'" % (insn.id, + common_iname_tags.pop())) + def 
check_for_inactive_iname_access(kernel): for insn in kernel.instructions: @@ -387,11 +487,12 @@ def check_write_destinations(kernel): # {{{ check_has_schedulable_iname_nesting def check_has_schedulable_iname_nesting(kernel): - from loopy.transform.iname import (has_schedulable_iname_nesting, - get_iname_duplication_options) - if not has_schedulable_iname_nesting(kernel): + from loopy.transform.iname import ( + has_schedulable_iname_nesting_for_single_kernel, + get_iname_duplication_options_for_single_kernel) + if not has_schedulable_iname_nesting_for_single_kernel(kernel): import itertools as it - opt = get_iname_duplication_options(kernel) + opt = get_iname_duplication_options_for_single_kernel(kernel) opt_str = "\n".join("* Duplicate %s within instructions %s" % (i, w) for i, w in it.islice(opt, 3)) raise LoopyError("Kernel does not have a schedulable iname nesting. " @@ -616,13 +717,13 @@ def check_variable_access_ordered(kernel): # }}} -def pre_schedule_checks(kernel): +def pre_schedule_checks(kernel, program_callables_info): try: logger.debug("%s: pre-schedule check: start" % kernel.name) check_for_duplicate_insn_ids(kernel) check_for_orphaned_user_hardware_axes(kernel) - check_for_double_use_of_hw_axes(kernel) + check_for_double_use_of_hw_axes(kernel, program_callables_info) check_insn_attributes(kernel) check_loop_priority_inames_known(kernel) check_multiple_tags_allowed(kernel) @@ -650,7 +751,8 @@ def pre_schedule_checks(kernel): # {{{ check for unused hw axes -def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None): +def _check_for_unused_hw_axes_in_kernel_chunk(kernel, program_callables_info, + sched_index=None): from loopy.schedule import (CallKernel, RunInstruction, Barrier, EnterLoop, LeaveLoop, ReturnFromKernel, get_insn_ids_for_block_at, gather_schedule_block) @@ -665,7 +767,8 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None): assert isinstance(kernel.schedule[sched_index], CallKernel) _, past_end_i = 
gather_schedule_block(kernel.schedule, sched_index) group_size, local_size = kernel.get_grid_sizes_for_insn_ids_as_exprs( - get_insn_ids_for_block_at(kernel.schedule, sched_index)) + get_insn_ids_for_block_at(kernel.schedule, sched_index), + program_callables_info) group_axes = set(ax for ax, length in enumerate(group_size)) local_axes = set(ax for ax, length in enumerate(local_size)) @@ -682,7 +785,8 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None): while i < loop_end_i: sched_item = kernel.schedule[i] if isinstance(sched_item, CallKernel): - i = _check_for_unused_hw_axes_in_kernel_chunk(kernel, i) + i = _check_for_unused_hw_axes_in_kernel_chunk(kernel, + program_callables_info, i) elif isinstance(sched_item, RunInstruction): insn = kernel.id_to_insn[sched_item.insn_id] @@ -733,9 +837,10 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None): return past_end_i -def check_for_unused_hw_axes_in_insns(kernel): +def check_for_unused_hw_axes_in_insns(kernel, program_callables_info): if kernel.schedule: - _check_for_unused_hw_axes_in_kernel_chunk(kernel) + _check_for_unused_hw_axes_in_kernel_chunk(kernel, + program_callables_info) # }}} @@ -889,15 +994,15 @@ def check_that_shapes_and_strides_are_arguments(kernel): # }}} -def pre_codegen_checks(kernel): +def pre_codegen_checks(kernel, program_callables_info): try: logger.debug("pre-codegen check %s: start" % kernel.name) - check_for_unused_hw_axes_in_insns(kernel) + check_for_unused_hw_axes_in_insns(kernel, program_callables_info) check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel) check_that_temporaries_are_defined_in_subkernels_where_used(kernel) check_that_all_insns_are_scheduled(kernel) - kernel.target.pre_codegen_check(kernel) + kernel.target.pre_codegen_check(kernel, program_callables_info) check_that_shapes_and_strides_are_arguments(kernel) logger.debug("pre-codegen check %s: done" % kernel.name) diff --git a/loopy/cli.py b/loopy/cli.py index 
a92922b1..060340d5 100644 --- a/loopy/cli.py +++ b/loopy/cli.py @@ -205,7 +205,7 @@ def main(): new_kernels = [] for kernel in kernels: new_args = [ - lp.ArrayArg("occa_info", np.int32, shape=None) + lp.GlobalArg("occa_info", np.int32, shape=None) ] + kernel.args new_kernels.append(kernel.copy(args=new_args)) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 11f874e1..3e675db7 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -32,6 +32,10 @@ from pytools.persistent_dict import WriteOncePersistentDict from loopy.tools import LoopyKeyBuilder from loopy.version import DATA_MODEL_VERSION +from loopy.kernel.function_interface import CallableKernel +from cgen import Collection + + import logging logger = logging.getLogger(__name__) @@ -146,6 +150,7 @@ class SeenFunction(ImmutableRecord): class CodeGenerationState(object): """ .. attribute:: kernel + .. attribute:: target .. attribute:: implemented_data_info a list of :class:`ImplementedDataInfo` objects. @@ -187,17 +192,21 @@ class CodeGenerationState(object): generated. .. attribute:: schedule_index_end + + .. 
attribute:: program_callables_info """ - def __init__(self, kernel, + def __init__(self, kernel, target, implemented_data_info, implemented_domain, implemented_predicates, seen_dtypes, seen_functions, seen_atomic_dtypes, var_subst_map, allow_complex, + program_callables_info, vectorization_info=None, var_name_generator=None, is_generating_device_code=None, gen_program_name=None, schedule_index_end=None): self.kernel = kernel + self.target = target self.implemented_data_info = implemented_data_info self.implemented_domain = implemented_domain self.implemented_predicates = implemented_predicates @@ -206,6 +215,7 @@ class CodeGenerationState(object): self.seen_atomic_dtypes = seen_atomic_dtypes self.var_subst_map = var_subst_map.copy() self.allow_complex = allow_complex + self.program_callables_info = program_callables_info self.vectorization_info = vectorization_info self.var_name_generator = var_name_generator self.is_generating_device_code = is_generating_device_code @@ -214,7 +224,7 @@ class CodeGenerationState(object): # {{{ copy helpers - def copy(self, kernel=None, implemented_data_info=None, + def copy(self, kernel=None, target=None, implemented_data_info=None, implemented_domain=None, implemented_predicates=frozenset(), var_subst_map=None, vectorization_info=None, is_generating_device_code=None, @@ -224,6 +234,9 @@ class CodeGenerationState(object): if kernel is None: kernel = self.kernel + if target is None: + target = self.target + if implemented_data_info is None: implemented_data_info = self.implemented_data_info @@ -244,6 +257,7 @@ class CodeGenerationState(object): return CodeGenerationState( kernel=kernel, + target=target, implemented_data_info=implemented_data_info, implemented_domain=implemented_domain or self.implemented_domain, implemented_predicates=( @@ -253,6 +267,7 @@ class CodeGenerationState(object): seen_atomic_dtypes=self.seen_atomic_dtypes, var_subst_map=var_subst_map or self.var_subst_map, allow_complex=self.allow_complex, + 
program_callables_info=self.program_callables_info, vectorization_info=vectorization_info, var_name_generator=self.var_name_generator, is_generating_device_code=is_generating_device_code, @@ -374,19 +389,15 @@ class PreambleInfo(ImmutableRecord): # {{{ main code generation entrypoint -def generate_code_v2(kernel): +def generate_code_for_a_single_kernel(kernel, program_callables_info, target): """ :returns: a :class:`CodeGenerationResult` """ from loopy.kernel import KernelState - if kernel.state == KernelState.INITIAL: - from loopy.preprocess import preprocess_kernel - kernel = preprocess_kernel(kernel) - if kernel.schedule is None: from loopy.schedule import get_one_scheduled_kernel - kernel = get_one_scheduled_kernel(kernel) + kernel = get_one_scheduled_kernel(kernel, program_callables_info) if kernel.state != KernelState.SCHEDULED: raise LoopyError("cannot generate code for a kernel that has not been " @@ -407,11 +418,8 @@ def generate_code_v2(kernel): # }}} - from loopy.type_inference import infer_unknown_types - kernel = infer_unknown_types(kernel, expect_completion=True) - from loopy.check import pre_codegen_checks - pre_codegen_checks(kernel) + pre_codegen_checks(kernel, program_callables_info) logger.info("%s: generate code: start" % kernel.name) @@ -469,10 +477,12 @@ def generate_code_v2(kernel): gen_program_name=( kernel.target.host_program_name_prefix + kernel.name - + kernel.target.host_program_name_suffix), - schedule_index_end=len(kernel.schedule)) + + target.host_program_name_suffix), + schedule_index_end=len(kernel.schedule), + program_callables_info=program_callables_info) from loopy.codegen.result import generate_host_or_device_program + codegen_result = generate_host_or_device_program( codegen_state, schedule_index=0) @@ -502,7 +512,7 @@ def generate_code_v2(kernel): ) preamble_generators = (kernel.preamble_generators - + kernel.target.get_device_ast_builder().preamble_generators()) + + target.get_device_ast_builder().preamble_generators()) for 
prea_gen in preamble_generators: preambles.extend(prea_gen(preamble_info)) @@ -524,6 +534,56 @@ def generate_code_v2(kernel): return codegen_result +def generate_code_v2(program): + from loopy.kernel import LoopKernel + from loopy.program import make_program_from_kernel + + if isinstance(program, LoopKernel): + program = make_program_from_kernel(program) + + from loopy.kernel import KernelState + if program.root_kernel.state == KernelState.INITIAL: + from loopy.preprocess import preprocess_program + program = preprocess_program(program) + + from loopy.type_inference import infer_unknown_types + program = infer_unknown_types(program, expect_completion=True) + + codegen_results = {} + + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + codegen_results[func_id] = ( + generate_code_for_a_single_kernel(in_knl_callable.subkernel, + program.program_callables_info, program.target)) + + device_preambles = set() + for cgr in codegen_results.values(): + device_preambles.update(cgr.device_preambles) + + for in_knl_callable in program.program_callables_info.values(): + for preamble in in_knl_callable.generate_preambles(program.target): + device_preambles.update([preamble]) + + collective_device_program = codegen_results[program.name].device_programs[0] + for func_id, callee_cgr in codegen_results.items(): + if func_id != program.name: + assert len(callee_cgr.device_programs) == 1 + callee_prog_ast = callee_cgr.device_programs[0].ast + collective_device_program = collective_device_program.copy( + ast=Collection([callee_prog_ast, collective_device_program.ast])) + + device_preambles.update([('98_%s' % func_id, + str(callee_prog_ast.fdecl)), ]) + + collective_device_programs = [collective_device_program] + ( + codegen_results[program.name].device_programs[1:]) + + return codegen_results[program.name].copy( + device_programs=collective_device_programs, + device_preambles=device_preambles) + + def 
generate_code(kernel, device=None): if device is not None: from warnings import warn diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 45e2a18c..90bdbda3 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -115,7 +115,8 @@ def generate_code_for_sched_index(codegen_state, sched_index): new_codegen_state, sched_index) glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs( - get_insn_ids_for_block_at(kernel.schedule, sched_index)) + get_insn_ids_for_block_at(kernel.schedule, sched_index), + codegen_state.program_callables_info) return merge_codegen_results(codegen_state, [ codegen_result, diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index ebddf315..39cf20c7 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -248,7 +248,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, return next_func(codegen_state) global_size, local_size = kernel.get_grid_sizes_for_insn_ids( - insn_ids_for_block) + insn_ids_for_block, codegen_state.program_callables_info) hw_inames_left = hw_inames_left[:] iname = hw_inames_left.pop() diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index 4318ad71..00f19d99 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -133,7 +133,7 @@ class CodeGenerationResult(ImmutableRecord): preamble_codes = process_preambles( getattr(self, "host_preambles", []) + - getattr(self, "device_preambles", []) + list(getattr(self, "device_preambles", [])) ) return ( diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index 5a747d07..ef07b7e2 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -27,7 +27,7 @@ THE SOFTWARE. 
from six.moves import range, zip -from loopy.diagnostic import StaticValueFindingError +from loopy.diagnostic import StaticValueFindingError, LoopyError import islpy as isl from islpy import dim_type diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 6b003380..d2723c57 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -37,10 +37,6 @@ import re from pytools import UniqueNameGenerator, generate_unique_names -from loopy.library.function import ( - default_function_mangler, - single_arg_function_mangler) - from loopy.diagnostic import CannotBranchDomainTree, LoopyError from loopy.tools import natsorted from loopy.diagnostic import StaticValueFindingError @@ -224,6 +220,12 @@ class LoopKernel(ImmutableRecordWithoutPickling): .. attribute:: target A subclass of :class:`loopy.TargetBase`. + + .. attribute:: is_called_from_host + + An instance of :class:`bool`. Will be set *False* for the kernel which + would be called from another top level kernels. Default value is + *True*. 
""" # {{{ constructor @@ -252,6 +254,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): state=KernelState.INITIAL, target=None, + is_called_from_host=True, + overridden_get_grid_sizes_for_insn_ids=None, _cached_written_variables=None): """ @@ -277,15 +281,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): if substitutions is None: substitutions = {} if function_manglers is None: - function_manglers = [ - default_function_mangler, - single_arg_function_mangler, - ] - if symbol_manglers is None: - function_manglers = [ - default_function_mangler, - single_arg_function_mangler, - ] + function_manglers = [] if iname_slab_increments is None: iname_slab_increments = {} @@ -372,6 +368,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): options=options, state=state, target=target, + is_called_from_host=is_called_from_host, overridden_get_grid_sizes_for_insn_ids=( overridden_get_grid_sizes_for_insn_ids), _cached_written_variables=_cached_written_variables) @@ -380,7 +377,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): # }}} - # {{{ function mangling + # {{{ function mangling/scoping def mangle_function(self, identifier, arg_dtypes, ast_builder=None): if ast_builder is None: @@ -1039,21 +1036,25 @@ class LoopKernel(ImmutableRecordWithoutPickling): self.get_iname_bounds(iname, constants_only=True).size, constants_only=True))) - @memoize_method - def get_grid_sizes_for_insn_ids(self, insn_ids, ignore_auto=False): + def get_grid_sizes_for_insn_ids_as_dicts(self, insn_ids, + program_callables_info, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of all instructions whose IDs are given in *insn_ids*. :arg insn_ids: a :class:`frozenset` of instruction IDs - *global_size* and *local_size* are :class:`islpy.PwAff` objects. + *global_size* and *local_size* are instances of :class:`dict` with + mapping of the form from ``axis`` to :class:`islpy.PwAff` objects. 
""" - if self.overridden_get_grid_sizes_for_insn_ids: - return self.overridden_get_grid_sizes_for_insn_ids( - insn_ids, - ignore_auto=ignore_auto) + # {{{ collecting the callee kernels in insn_ids + + from loopy.kernel.tools import get_direct_callee_kernels + callee_kernels = get_direct_callee_kernels(self, + program_callables_info, insn_ids) + + # }}} all_inames_by_insns = set() for insn_id in insn_ids: @@ -1068,6 +1069,15 @@ class LoopKernel(ImmutableRecordWithoutPickling): global_sizes = {} local_sizes = {} + # updating the grid sizes from the callee_kernels. + for callee_kernel in callee_kernels: + gsize, lsize = callee_kernel.get_grid_sizes_for_insn_ids_as_dicts( + frozenset(insn.id for insn in callee_kernel.instructions), + program_callables_info, ignore_auto) + + global_sizes.update(gsize) + local_sizes.update(lsize) + from loopy.kernel.data import ( GroupIndexTag, LocalIndexTag, AutoLocalIndexTagBase) @@ -1108,6 +1118,31 @@ class LoopKernel(ImmutableRecordWithoutPickling): tgt_dict[tag.axis] = size + return global_sizes, local_sizes + + def get_grid_sizes_for_insn_ids(self, insn_ids, program_callables_info, + ignore_auto=False): + """Return a tuple (global_size, local_size) containing a grid that + could accommodate execution of all instructions whose IDs are given + in *insn_ids*. + + :arg insn_ids: a :class:`frozenset` of instruction IDs + + *global_size* and *local_size* are :class:`islpy.PwAff` objects. 
+ """ + + if self.overridden_get_grid_sizes_for_insn_ids: + return self.overridden_get_grid_sizes_for_insn_ids( + insn_ids, + program_callables_info=program_callables_info, + ignore_auto=ignore_auto) + + assert self.is_called_from_host, ("Callee kernels do not have sufficient " + "information to compute grid sizes.") + + global_sizes, local_sizes = self.get_grid_sizes_for_insn_ids_as_dicts( + insn_ids, program_callables_info, ignore_auto=ignore_auto) + def to_dim_tuple(size_dict, which, forced_sizes={}): forced_sizes = forced_sizes.copy() @@ -1137,7 +1172,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): return (to_dim_tuple(global_sizes, "global"), to_dim_tuple(local_sizes, "local", forced_sizes=self.local_sizes)) - def get_grid_sizes_for_insn_ids_as_exprs(self, insn_ids, ignore_auto=False): + def get_grid_sizes_for_insn_ids_as_exprs(self, insn_ids, + program_callables_info, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of all instructions whose IDs are given in *insn_ids*. @@ -1148,7 +1184,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): """ grid_size, group_size = self.get_grid_sizes_for_insn_ids( - insn_ids, ignore_auto) + insn_ids, program_callables_info, ignore_auto) def tup_to_exprs(tup): from loopy.symbolic import pw_aff_to_expr @@ -1156,7 +1192,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): return tup_to_exprs(grid_size), tup_to_exprs(group_size) - def get_grid_size_upper_bounds(self, ignore_auto=False): + def get_grid_size_upper_bounds(self, program_callables_info, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of *all* instructions in the kernel. 
@@ -1164,9 +1200,11 @@ class LoopKernel(ImmutableRecordWithoutPickling): """ return self.get_grid_sizes_for_insn_ids( frozenset(insn.id for insn in self.instructions), + program_callables_info, ignore_auto=ignore_auto) - def get_grid_size_upper_bounds_as_exprs(self, ignore_auto=False): + def get_grid_size_upper_bounds_as_exprs(self, program_callables_info, + ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of *all* instructions in the kernel. @@ -1175,6 +1213,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): return self.get_grid_sizes_for_insn_ids_as_exprs( frozenset(insn.id for insn in self.instructions), + program_callables_info, ignore_auto=ignore_auto) # }}} @@ -1365,47 +1404,13 @@ class LoopKernel(ImmutableRecordWithoutPickling): # }}} - # {{{ implementation arguments - - @property - @memoize_method - def impl_arg_to_arg(self): - from loopy.kernel.array import ArrayBase - - result = {} - - for arg in self.args: - if not isinstance(arg, ArrayBase): - result[arg.name] = arg - continue - - if arg.shape is None or arg.dim_tags is None: - result[arg.name] = arg - continue - - subscripts_and_names = arg.subscripts_and_names() - if subscripts_and_names is None: - result[arg.name] = arg - continue - - for index, sub_arg_name in subscripts_and_names: - result[sub_arg_name] = arg - - return result - - # }}} - # {{{ direct execution def __call__(self, *args, **kwargs): - key = self.target.get_kernel_executor_cache_key(*args, **kwargs) - try: - kex = self._kernel_executor_cache[key] - except KeyError: - kex = self.target.get_kernel_executor(self, *args, **kwargs) - self._kernel_executor_cache[key] = kex - - return kex(*args, **kwargs) + # FIXME: scream and then convert to a program + from loopy.program import make_program_from_kernel + program = make_program_from_kernel(self) + return program(*args, **kwargs) # }}} @@ -1489,6 +1494,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): 
"silenced_warnings", "options", "state", + "is_called_from_host", "target", ) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index c42db348..bac4afc8 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -24,16 +24,19 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ - import numpy as np from pymbolic.mapper import CSECachingMapperMixin +from pymbolic.primitives import Slice, Variable, Subscript from loopy.tools import intern_frozenset_of_ids -from loopy.symbolic import IdentityMapper, WalkMapper +from loopy.symbolic import ( + IdentityMapper, WalkMapper, SubArrayRef) from loopy.kernel.data import ( InstructionBase, MultiAssignmentBase, Assignment, - SubstitutionRule) + SubstitutionRule, AddressSpace) +from loopy.kernel.instruction import (CInstruction, _DataObliviousInstruction, + CallInstruction) from loopy.diagnostic import LoopyError, warn_with_kernel import islpy as isl from islpy import dim_type @@ -504,9 +507,11 @@ def parse_insn(groups, insn_options): assignee_names.append(inner_lhs_i.name) elif isinstance(inner_lhs_i, (Subscript, LinearSubscript)): assignee_names.append(inner_lhs_i.aggregate.name) + elif isinstance(inner_lhs_i, SubArrayRef): + assignee_names.append(inner_lhs_i.subscript.aggregate.name) else: raise LoopyError("left hand side of assignment '%s' must " - "be variable or subscript" % (lhs_i,)) + "be variable, subscript or a SubArrayRef" % (lhs_i,)) new_lhs.append(lhs_i) @@ -1139,7 +1144,7 @@ class ArgumentGuesser: def make_new_arg(self, arg_name): arg_name = arg_name.strip() - from loopy.kernel.data import ValueArg, ArrayArg, AddressSpace + from loopy.kernel.data import ValueArg, ArrayArg import loopy as lp if arg_name in self.all_params: @@ -1664,7 +1669,7 @@ def _is_wildcard(s): def _resolve_dependencies(what, knl, insn, deps): - from loopy import find_instructions + from loopy.transform.instruction import find_instructions_in_single_kernel from loopy.match 
import MatchExpressionBase new_deps = [] @@ -1673,7 +1678,7 @@ def _resolve_dependencies(what, knl, insn, deps): found_any = False if isinstance(dep, MatchExpressionBase): - for new_dep in find_instructions(knl, dep): + for new_dep in find_instructions_in_single_kernel(knl, dep): if new_dep.id != insn.id: new_deps.append(new_dep.id) found_any = True @@ -1954,6 +1959,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): target = kwargs.pop("target", None) seq_dependencies = kwargs.pop("seq_dependencies", False) fixed_parameters = kwargs.pop("fixed_parameters", {}) + make_program = kwargs.pop("make_program", True) if defines: from warnings import warn @@ -2165,15 +2171,24 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): check_for_duplicate_names(knl) check_written_variable_names(knl) + from loopy.kernel.tools import infer_arg_is_output_only + knl = infer_arg_is_output_only(knl) + from loopy.preprocess import prepare_for_caching knl = prepare_for_caching(knl) creation_plog.done() - from loopy.kernel.tools import infer_arg_is_output_only - knl = infer_arg_is_output_only(knl) + if make_program: + from loopy.program import make_program_from_kernel + return make_program_from_kernel(knl) + else: + return knl + - return knl +def make_kernel_function(*args, **kwargs): + kwargs['make_program'] = False + return make_kernel(*args, **kwargs) # }}} diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 3e776bd0..9ba28896 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -337,6 +337,7 @@ class KernelArgument(ImmutableRecord): dtype = None kwargs["dtype"] = dtype + kwargs["is_output_only"] = kwargs.pop("is_output_only", None) ImmutableRecord.__init__(self, **kwargs) @@ -362,7 +363,7 @@ class ArrayArg(ArrayBase, KernelArgument): def __init__(self, *args, **kwargs): if "address_space" not in kwargs: raise TypeError("'address_space' must be specified") - kwargs["is_output_only"] = kwargs.pop("is_output_only", False) 
+ kwargs["is_output_only"] = kwargs.pop("is_output_only", None) super(ArrayArg, self).__init__(*args, **kwargs) @@ -402,6 +403,9 @@ class ConstantArg(ArrayBase, KernelArgument): min_target_axes = 0 max_target_axes = 1 + # Constant Arg cannot be an output + is_output_only = False + def get_arg_decl(self, ast_builder, name_suffix, shape, dtype, is_written): return ast_builder.get_constant_arg_decl(self.name + name_suffix, shape, dtype, is_written) diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index e9c7bde9..0f548bba 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -487,7 +487,7 @@ class InstructionBase(ImmutableRecord): def _get_assignee_var_name(expr): from pymbolic.primitives import Variable, Subscript, Lookup - from loopy.symbolic import LinearSubscript + from loopy.symbolic import LinearSubscript, SubArrayRef if isinstance(expr, Lookup): expr = expr.aggregate @@ -506,13 +506,20 @@ def _get_assignee_var_name(expr): assert isinstance(agg, Variable) return agg.name + + elif isinstance(expr, SubArrayRef): + agg = expr.subscript.aggregate + assert isinstance(agg, Variable) + + return agg.name + else: raise RuntimeError("invalid lvalue '%s'" % expr) def _get_assignee_subscript_deps(expr): from pymbolic.primitives import Variable, Subscript, Lookup - from loopy.symbolic import LinearSubscript, get_dependencies + from loopy.symbolic import LinearSubscript, get_dependencies, SubArrayRef if isinstance(expr, Lookup): expr = expr.aggregate @@ -523,6 +530,8 @@ def _get_assignee_subscript_deps(expr): return get_dependencies(expr.index) elif isinstance(expr, LinearSubscript): return get_dependencies(expr.index) + elif isinstance(expr, SubArrayRef): + return get_dependencies(expr.get_begin_subscript().index) else: raise RuntimeError("invalid lvalue '%s'" % expr) @@ -942,12 +951,12 @@ class Assignment(MultiAssignmentBase): def assignee_subscript_deps(self): return (_get_assignee_subscript_deps(self.assignee),) - def 
with_transformed_expressions(self, f, *args): + def with_transformed_expressions(self, f, *args, **kwargs): return self.copy( - assignee=f(self.assignee, *args), - expression=f(self.expression, *args), + assignee=f(self.assignee, *args, **kwargs), + expression=f(self.expression, *args, **kwargs), predicates=frozenset( - f(pred, *args) for pred in self.predicates)) + f(pred, *args, **kwargs) for pred in self.predicates)) # }}} @@ -1052,9 +1061,10 @@ class CallInstruction(MultiAssignmentBase): forced_iname_deps=forced_iname_deps, forced_iname_deps_is_final=forced_iname_deps_is_final) - from pymbolic.primitives import Call + from pymbolic.primitives import Call, CallWithKwargs from loopy.symbolic import Reduction - if not isinstance(expression, (Call, Reduction)) and expression is not None: + if not isinstance(expression, (Call, CallWithKwargs, Reduction)) and ( + expression is not None): raise LoopyError("'expression' argument to CallInstruction " "must be a function call") @@ -1094,12 +1104,12 @@ class CallInstruction(MultiAssignmentBase): _get_assignee_subscript_deps(a) for a in self.assignees) - def with_transformed_expressions(self, f, *args): + def with_transformed_expressions(self, f, *args, **kwargs): return self.copy( - assignees=f(self.assignees, *args), - expression=f(self.expression, *args), + assignees=f(self.assignees, *args, **kwargs), + expression=f(self.expression, *args, **kwargs), predicates=frozenset( - f(pred, *args) for pred in self.predicates)) + f(pred, *args, **kwargs) for pred in self.predicates)) # }}} diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 95c3c336..3c0c2443 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -36,6 +36,7 @@ from islpy import dim_type from loopy.diagnostic import LoopyError, warn_with_kernel from pytools import memoize_on_first_arg from loopy.tools import natsorted +from loopy.program import Program import logging logger = logging.getLogger(__name__) @@ -43,19 +44,25 @@ logger = 
logging.getLogger(__name__) # {{{ add and infer argument dtypes -def add_dtypes(knl, dtype_dict): +def add_dtypes(program, dtype_dict): """Specify remaining unspecified argument/temporary variable types. :arg dtype_dict: a mapping from variable names to :class:`numpy.dtype` instances """ - dtype_dict_remainder, new_args, new_temp_vars = _add_dtypes(knl, dtype_dict) + root_kernel = program.root_kernel + dtype_dict_remainder, new_args, new_temp_vars = _add_dtypes( + root_kernel, dtype_dict) if dtype_dict_remainder: raise RuntimeError("unused argument dtypes: %s" % ", ".join(dtype_dict_remainder)) + root_kernel - return knl.copy(args=new_args, temporary_variables=new_temp_vars) + root_kernel_with_added_dtypes = ( + root_kernel.copy(args=new_args, temporary_variables=new_temp_vars)) + + return program.with_root_kernel(root_kernel_with_added_dtypes) def _add_dtypes_overdetermined(knl, dtype_dict): @@ -107,7 +114,8 @@ def get_arguments_with_incomplete_dtype(knl): if arg.dtype is None] -def add_and_infer_dtypes(knl, dtype_dict, expect_completion=False): +def add_and_infer_dtypes(prog, dtype_dict, expect_completion=False): + assert isinstance(prog, Program) processed_dtype_dict = {} for k, v in six.iteritems(dtype_dict): @@ -116,10 +124,10 @@ def add_and_infer_dtypes(knl, dtype_dict, expect_completion=False): if subkey: processed_dtype_dict[subkey] = v - knl = add_dtypes(knl, processed_dtype_dict) + prog = add_dtypes(prog, processed_dtype_dict) from loopy.type_inference import infer_unknown_types - return infer_unknown_types(knl, expect_completion=expect_completion) + return infer_unknown_types(prog, expect_completion=expect_completion) def _add_and_infer_dtypes_overdetermined(knl, dtype_dict): @@ -747,7 +755,7 @@ def get_auto_axis_iname_ranking_by_stride(kernel, insn): # }}} -def assign_automatic_axes(kernel, axis=0, local_size=None): +def assign_automatic_axes(kernel, program_callables_info, axis=0, local_size=None): logger.debug("%s: assign automatic axes" % 
kernel.name) # TODO: do the tag removal rigorously, might be easier after switching # to set() from tuple() @@ -761,7 +769,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): if local_size is None: _, local_size = kernel.get_grid_size_upper_bounds_as_exprs( - ignore_auto=True) + program_callables_info, ignore_auto=True) # {{{ axis assignment helper function @@ -789,6 +797,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): return assign_automatic_axes( kernel.copy(iname_to_tags=new_iname_to_tags), + program_callables_info, axis=recursion_axis) if axis is None: @@ -828,7 +837,8 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): else: new_tag = LocalIndexTag(axis) if desired_length > local_size[axis]: - from loopy import split_iname, untag_inames + from loopy import untag_inames + from loopy.transform.iname import split_iname # Don't be tempted to switch the outer tag to unroll--this may # generate tons of code on some examples. @@ -839,6 +849,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): iname, inner_length=local_size[axis], outer_tag=None, inner_tag=new_tag, do_tagged_check=False), + program_callables_info=program_callables_info, axis=recursion_axis, local_size=local_size) if not kernel.iname_tags_of_type(iname, AutoLocalIndexTagBase): @@ -860,7 +871,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): del new_iname_to_tags[iname] return assign_automatic_axes(kernel.copy(iname_to_tags=new_iname_to_tags), - axis=recursion_axis, local_size=local_size) + program_callables_info, axis=recursion_axis, local_size=local_size) # }}} @@ -928,7 +939,8 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): if axis >= len(local_size): return kernel else: - return assign_automatic_axes(kernel, axis=axis+1, + return assign_automatic_axes(kernel, + program_callables_info=program_callables_info, axis=axis+1, local_size=local_size) # }}} @@ -1866,6 +1878,7 @@ def infer_arg_is_output_only(kernel): """ from 
loopy.kernel.data import ArrayArg, ValueArg, ConstantArg, ImageArg new_args = [] + for arg in kernel.args: if isinstance(arg, (ArrayArg, ImageArg, ValueArg)): if arg.is_output_only is not None: diff --git a/loopy/library/function.py b/loopy/library/function.py index 9d557ac9..8338875d 100644 --- a/loopy/library/function.py +++ b/loopy/library/function.py @@ -22,38 +22,48 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +from loopy.kernel.function_interface import ScalarCallable -def default_function_mangler(kernel, name, arg_dtypes): - from loopy.library.reduction import reduction_function_mangler - manglers = [reduction_function_mangler, tuple_function_mangler] - for mangler in manglers: - result = mangler(kernel, name, arg_dtypes) - if result is not None: - return result +class MakeTupleCallable(ScalarCallable): + def with_types(self, arg_id_to_dtype, kernel, program_callables_info): + new_arg_id_to_dtype = arg_id_to_dtype.copy() + for i in range(len(arg_id_to_dtype)): + if i in arg_id_to_dtype and arg_id_to_dtype[i] is not None: + new_arg_id_to_dtype[-i-1] = new_arg_id_to_dtype[i] - return None + return (self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target="loopy_make_tuple"), program_callables_info) + def with_descrs(self, arg_id_to_descr, program_callables_info): + from loopy.kernel.function_interface import ValueArgDescriptor + new_arg_id_to_descr = dict(((id, ValueArgDescriptor()), + (-id-1, ValueArgDescriptor())) for id in arg_id_to_descr.keys()) -def single_arg_function_mangler(kernel, name, arg_dtypes): - if len(arg_dtypes) == 1: - dtype, = arg_dtypes + return ( + self.copy(arg_id_to_descr=new_arg_id_to_descr), + program_callables_info) - from loopy.kernel.data import CallMangleInfo - return CallMangleInfo(name, (dtype,), (dtype,)) - return None +class IndexOfCallable(ScalarCallable): + def with_types(self, arg_id_to_dtype, kernel, program_callables_info): + new_arg_id_to_dtype = dict((i, dtype) for 
i, dtype in + arg_id_to_dtype.items() if dtype is not None) + new_arg_id_to_dtype[-1] = kernel.index_dtype + return (self.copy(arg_id_to_dtype=new_arg_id_to_dtype), + program_callables_info) -def tuple_function_mangler(kernel, name, arg_dtypes): - if name == "make_tuple": - from loopy.kernel.data import CallMangleInfo - return CallMangleInfo( - target_name="loopy_make_tuple", - result_dtypes=arg_dtypes, - arg_dtypes=arg_dtypes) - return None +def loopy_specific_callable_scopers(target, identifier): + if identifier == "make_tuple": + return MakeTupleCallable(name="make_tuple") + + if identifier in ["indexof", "indexof_vec"]: + return IndexOfCallable(name=identifier) + + from loopy.library.reduction import reduction_scoper + return reduction_scoper(target, identifier) # vim: foldmethod=marker diff --git a/loopy/library/random123.py b/loopy/library/random123.py index b8633114..59ca72df 100644 --- a/loopy/library/random123.py +++ b/loopy/library/random123.py @@ -27,6 +27,7 @@ THE SOFTWARE. from pytools import ImmutableRecord from mako.template import Template +from loopy.kernel.function_interface import ScalarCallable import numpy as np @@ -163,60 +164,77 @@ double${ width } ${ name }_f64( # }}} -def random123_preamble_generator(preamble_info): - for f in preamble_info.seen_functions: - try: - rng_variant = FUNC_NAMES_TO_RNG[f.name] - except KeyError: - continue +class Random123Callable(ScalarCallable): + """ + Records information about for the random123 functions. 
+ """ + + def with_types(self, arg_id_to_dtype, kernel, program_callables_info): + + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( + arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): + # the types provided aren't mature enough to specialize the + # callable + return (self.copy(), + program_callables_info) + + name = self.name + target = kernel.target + + rng_variant = FUNC_NAMES_TO_RNG[name] + + from loopy.types import NumpyType + base_dtype = {32: np.uint32, 64: np.uint64}[rng_variant.bits] + ctr_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.width) + key_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.key_width) + + fn = rng_variant.full_name + if name == fn: + new_arg_id_to_dtype = {-1: ctr_dtype, -2: ctr_dtype, 0: ctr_dtype, 1: + key_dtype} + return ( + self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target=fn+"_gen"), + program_callables_info) + + elif name == fn + "_f32": + new_arg_id_to_dtype = {-1: target.vector_dtype(NumpyType(np.float32), + rng_variant.width), + -2: ctr_dtype, 0: ctr_dtype, 1: + key_dtype} + return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target=name), program_callables_info + + elif name == fn + "_f64": + new_arg_id_to_dtype = {-1: target.vector_dtype(NumpyType(np.float64), + rng_variant.width), + -2: ctr_dtype, 0: ctr_dtype, 1: + key_dtype} + return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target=name), program_callables_info + + return (self.copy(arg_id_to_dtype=arg_id_to_dtype), + program_callables_info) + + def generate_preambles(self, target): + rng_variant = FUNC_NAMES_TO_RNG[self.name] from loopy.target.pyopencl import PyOpenCLTarget yield ("90-random123-"+rng_variant.full_name, PREAMBLE_TEMPLATE.render( is_pyopencl_target=isinstance( - preamble_info.kernel.target, + target, PyOpenCLTarget), rng_variant=rng_variant, )) + return -def random123_function_mangler(kernel, name, arg_dtypes): - try: - rng_variant = FUNC_NAMES_TO_RNG[name] - 
except KeyError: - return None - - from loopy.types import NumpyType - target = kernel.target - base_dtype = {32: np.uint32, 64: np.uint64}[rng_variant.bits] - ctr_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.width) - key_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.key_width) - - from loopy.kernel.data import CallMangleInfo - fn = rng_variant.full_name - if name == fn: - return CallMangleInfo( - target_name=fn+"_gen", - result_dtypes=(ctr_dtype, ctr_dtype), - arg_dtypes=(ctr_dtype, key_dtype)) - - elif name == fn + "_f32": - return CallMangleInfo( - target_name=name, - result_dtypes=( - target.vector_dtype(NumpyType(np.float32), rng_variant.width), - ctr_dtype), - arg_dtypes=(ctr_dtype, key_dtype)) - - elif name == fn + "_f64": - return CallMangleInfo( - target_name=name, - result_dtypes=( - target.vector_dtype(NumpyType(np.float64), rng_variant.width), - ctr_dtype), - arg_dtypes=(ctr_dtype, key_dtype)) - - else: - return None + +def random123_function_scoper(target, identifier): + if identifier in FUNC_NAMES_TO_RNG: + return Random123Callable(name=identifier) + + return None # vim: foldmethod=marker diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 8ed5cbe5..6ec8e4b2 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -1,4 +1,4 @@ -from __future__ import division +from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -24,11 +24,14 @@ THE SOFTWARE. 
from pymbolic import var +from loopy.symbolic import ResolvedFunction +from loopy.kernel.function_interface import ScalarCallable import numpy as np from loopy.symbolic import FunctionIdentifier from loopy.diagnostic import LoopyError from loopy.types import NumpyType +from loopy.kernel import LoopKernel class ReductionOperation(object): @@ -81,6 +84,9 @@ class ReductionOperation(object): raise LoopyError("unable to parse reduction type: '%s'" % op_type) + def get_scalar_callables(self): + return frozenset() + class ScalarReductionOperation(ReductionOperation): def __init__(self, forced_result_type=None): @@ -180,7 +186,10 @@ class MaxReductionOperation(ScalarReductionOperation): return get_ge_neutral(dtype) def __call__(self, dtype, operand1, operand2): - return var("max")(operand1, operand2) + return ResolvedFunction("max")(operand1, operand2) + + def get_scalar_callables(self): + return frozenset(["max"]) class MinReductionOperation(ScalarReductionOperation): @@ -188,7 +197,10 @@ class MinReductionOperation(ScalarReductionOperation): return get_le_neutral(dtype) def __call__(self, dtype, operand1, operand2): - return var("min")(operand1, operand2) + return ResolvedFunction("min")(operand1, operand2) + + def get_scalar_callables(self): + return frozenset(["min"]) # {{{ base class for symbolic reduction ops @@ -212,6 +224,11 @@ class ReductionOpFunction(FunctionIdentifier): return type(self)(reduction_op) + hash_fields = ( + "reduction_op",) + + update_persistent_hash = LoopKernel.update_persistent_hash + # }}} @@ -237,7 +254,7 @@ class _SegmentedScalarReductionOperation(ReductionOperation): def neutral_element(self, scalar_dtype, segment_flag_dtype): scalar_neutral_element = self.inner_reduction.neutral_element(scalar_dtype) - return var("make_tuple")(scalar_neutral_element, + return ResolvedFunction("make_tuple")(scalar_neutral_element, segment_flag_dtype.numpy_dtype.type(0)) def result_dtypes(self, kernel, scalar_dtype, segment_flag_dtype): @@ -254,7 +271,10 @@ 
class _SegmentedScalarReductionOperation(ReductionOperation): return type(self) == type(other) def __call__(self, dtypes, operand1, operand2): - return SegmentedOp(self)(*(operand1 + operand2)) + return ResolvedFunction(SegmentedOp(self))(*(operand1 + operand2)) + + def get_scalar_callables(self): + return frozenset(["make_tuple", SegmentedOp(self)]) class SegmentedSumReductionOperation(_SegmentedScalarReductionOperation): @@ -262,34 +282,24 @@ class SegmentedSumReductionOperation(_SegmentedScalarReductionOperation): which = "sum" op = "((%s) + (%s))" + hash_fields = ( + "which", + "op",) + + update_persistent_hash = LoopKernel.update_persistent_hash + class SegmentedProductReductionOperation(_SegmentedScalarReductionOperation): base_reduction_class = ProductReductionOperation op = "((%s) * (%s))" which = "product" + hash_fields = ( + "which", + "op", + "base_reduction_class",) -def get_segmented_function_preamble(kernel, func_id, arg_dtypes): - op = func_id.reduction_op - scalar_dtype = arg_dtypes[0] - segment_flag_dtype = arg_dtypes[1] - prefix = op.prefix(scalar_dtype, segment_flag_dtype) - - return (prefix, """ - inline %(scalar_t)s %(prefix)s_op( - %(scalar_t)s op1, %(segment_flag_t)s segment_flag1, - %(scalar_t)s op2, %(segment_flag_t)s segment_flag2, - %(segment_flag_t)s *segment_flag_out) - { - *segment_flag_out = segment_flag1 | segment_flag2; - return segment_flag2 ? 
op2 : %(combined)s; - } - """ % dict( - scalar_t=kernel.target.dtype_to_typename(scalar_dtype), - prefix=prefix, - segment_flag_t=kernel.target.dtype_to_typename(segment_flag_dtype), - combined=op.op % ("op1", "op2"), - )) + update_persistent_hash = LoopKernel.update_persistent_hash # }}} @@ -313,7 +323,7 @@ class _ArgExtremumReductionOperation(ReductionOperation): scalar_neutral_func = ( get_ge_neutral if self.neutral_sign < 0 else get_le_neutral) scalar_neutral_element = scalar_neutral_func(scalar_dtype) - return var("make_tuple")(scalar_neutral_element, + return ResolvedFunction("make_tuple")(scalar_neutral_element, index_dtype.numpy_dtype.type(-1)) def __str__(self): @@ -330,7 +340,10 @@ class _ArgExtremumReductionOperation(ReductionOperation): return 2 def __call__(self, dtypes, operand1, operand2): - return ArgExtOp(self)(*(operand1 + operand2)) + return ResolvedFunction(ArgExtOp(self))(*(operand1 + operand2)) + + def get_scalar_callables(self): + return frozenset([self.which, "make_tuple", ArgExtOp(self)]) class ArgMaxReductionOperation(_ArgExtremumReductionOperation): @@ -338,43 +351,23 @@ class ArgMaxReductionOperation(_ArgExtremumReductionOperation): update_comparison = ">=" neutral_sign = -1 + hash_fields = ("which", + "update_comparison", + "neutral_sign",) + + update_persistent_hash = LoopKernel.update_persistent_hash + class ArgMinReductionOperation(_ArgExtremumReductionOperation): which = "min" update_comparison = "<=" neutral_sign = +1 + hash_fields = ("which", + "update_comparison", + "neutral_sign",) -def get_argext_preamble(kernel, func_id, arg_dtypes): - op = func_id.reduction_op - scalar_dtype = arg_dtypes[0] - index_dtype = arg_dtypes[1] - - prefix = op.prefix(scalar_dtype, index_dtype) - - return (prefix, """ - inline %(scalar_t)s %(prefix)s_op( - %(scalar_t)s op1, %(index_t)s index1, - %(scalar_t)s op2, %(index_t)s index2, - %(index_t)s *index_out) - { - if (op2 %(comp)s op1) - { - *index_out = index2; - return op2; - } - else - { - 
*index_out = index1; - return op1; - } - } - """ % dict( - scalar_t=kernel.target.dtype_to_typename(scalar_dtype), - prefix=prefix, - index_t=kernel.target.dtype_to_typename(index_dtype), - comp=op.update_comparison, - )) + update_persistent_hash = LoopKernel.update_persistent_hash # }}} @@ -429,70 +422,93 @@ def parse_reduction_op(name): # }}} -def reduction_function_mangler(kernel, func_id, arg_dtypes): - if isinstance(func_id, ArgExtOp): - from loopy.target.opencl import CTarget - if not isinstance(kernel.target, CTarget): - raise LoopyError("%s: only C-like targets supported for now" % func_id) - - op = func_id.reduction_op - scalar_dtype = arg_dtypes[0] - index_dtype = arg_dtypes[1] - - from loopy.kernel.data import CallMangleInfo - return CallMangleInfo( - target_name="%s_op" % op.prefix( - scalar_dtype, index_dtype), - result_dtypes=op.result_dtypes( - kernel, scalar_dtype, index_dtype), - arg_dtypes=( - scalar_dtype, - index_dtype, - scalar_dtype, - index_dtype), - ) - - elif isinstance(func_id, SegmentedOp): - from loopy.target.opencl import CTarget - if not isinstance(kernel.target, CTarget): - raise LoopyError("%s: only C-like targets supported for now" % func_id) - - op = func_id.reduction_op - scalar_dtype = arg_dtypes[0] - segment_flag_dtype = arg_dtypes[1] - - from loopy.kernel.data import CallMangleInfo - return CallMangleInfo( - target_name="%s_op" % op.prefix( - scalar_dtype, segment_flag_dtype), - result_dtypes=op.result_dtypes( - kernel, scalar_dtype, segment_flag_dtype), - arg_dtypes=( - scalar_dtype, - segment_flag_dtype, - scalar_dtype, - segment_flag_dtype), - ) +# {{{ reduction specific callables + +class ReductionCallable(ScalarCallable): + def with_types(self, arg_id_to_dtype, kernel, program_callables_info): + scalar_dtype = arg_id_to_dtype[0] + index_dtype = arg_id_to_dtype[1] + result_dtypes = self.name.reduction_op.result_dtypes(kernel, scalar_dtype, + index_dtype) + new_arg_id_to_dtype = arg_id_to_dtype.copy() + 
new_arg_id_to_dtype[-1] = result_dtypes[0] + new_arg_id_to_dtype[-2] = result_dtypes[1] + name_in_target = self.name.reduction_op.prefix(scalar_dtype, + index_dtype) + "_op" + + return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target=name_in_target), program_callables_info + + def with_descr(self, arg_id_to_descr, program_callables_info): + from loopy.library.kernel.function_interface import ValueArgDescriptor + new_arg_id_to_descr = arg_id_to_descr.copy() + new_arg_id_to_descr[-1] = ValueArgDescriptor() + return ( + self.copy(arg_id_to_descr=arg_id_to_descr), + program_callables_info) + + def generate_preambles(self, target): + if isinstance(self.name, ArgExtOp): + op = self.name.reduction_op + scalar_dtype = self.arg_id_to_dtype[-1] + index_dtype = self.arg_id_to_dtype[-2] + + prefix = op.prefix(scalar_dtype, index_dtype) + + yield (prefix, """ + inline %(scalar_t)s %(prefix)s_op( + %(scalar_t)s op1, %(index_t)s index1, + %(scalar_t)s op2, %(index_t)s index2, + %(index_t)s *index_out) + { + if (op2 %(comp)s op1) + { + *index_out = index2; + return op2; + } + else + { + *index_out = index1; + return op1; + } + } + """ % dict( + scalar_t=target.dtype_to_typename(scalar_dtype), + prefix=prefix, + index_t=target.dtype_to_typename(index_dtype), + comp=op.update_comparison, + )) + elif isinstance(self.name, SegmentedOp): + op = self.name.reduction_op + scalar_dtype = self.arg_id_to_dtype[-1] + segment_flag_dtype = self.arg_id_to_dtype[-2] + prefix = op.prefix(scalar_dtype, segment_flag_dtype) + + yield (prefix, """ + inline %(scalar_t)s %(prefix)s_op( + %(scalar_t)s op1, %(segment_flag_t)s segment_flag1, + %(scalar_t)s op2, %(segment_flag_t)s segment_flag2, + %(segment_flag_t)s *segment_flag_out) + { + *segment_flag_out = segment_flag1 | segment_flag2; + return segment_flag2 ? 
op2 : %(combined)s; + } + """ % dict( + scalar_t=target.dtype_to_typename(scalar_dtype), + prefix=prefix, + segment_flag_t=target.dtype_to_typename(segment_flag_dtype), + combined=op.op % ("op1", "op2"), + )) + + return + + +def reduction_scoper(target, identifier): + if isinstance(identifier, (ArgExtOp, SegmentedOp)): + return ReductionCallable(name=identifier) return None - -def reduction_preamble_generator(preamble_info): - from loopy.target.opencl import OpenCLTarget - - for func in preamble_info.seen_functions: - if isinstance(func.name, ArgExtOp): - if not isinstance(preamble_info.kernel.target, OpenCLTarget): - raise LoopyError("only OpenCL supported for now") - - yield get_argext_preamble(preamble_info.kernel, func.name, - func.arg_dtypes) - - elif isinstance(func.name, SegmentedOp): - if not isinstance(preamble_info.kernel.target, OpenCLTarget): - raise LoopyError("only OpenCL supported for now") - - yield get_segmented_function_preamble(preamble_info.kernel, func.name, - func.arg_dtypes) +# }}} # vim: fdm=marker diff --git a/loopy/loop.py b/loopy/loop.py index 45924638..66d41398 100644 --- a/loopy/loop.py +++ b/loopy/loop.py @@ -25,6 +25,7 @@ THE SOFTWARE. 
import islpy as isl import six +from loopy.program import iterate_over_kernels_if_given_program def potential_loop_nest_map(kernel): @@ -55,6 +56,7 @@ def potential_loop_nest_map(kernel): return result +@iterate_over_kernels_if_given_program def fuse_loop_domains(kernel): from loopy.kernel.tools import is_domain_dependent_on_inames diff --git a/loopy/preprocess.py b/loopy/preprocess.py index fc950c78..3657967a 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -27,7 +27,6 @@ import six from loopy.diagnostic import ( LoopyError, WriteRaceConditionWarning, warn_with_kernel, LoopyAdvisory) - import islpy as isl from pytools.persistent_dict import WriteOncePersistentDict @@ -37,13 +36,19 @@ from loopy.version import DATA_MODEL_VERSION from loopy.kernel.data import make_assignment, filter_iname_tags_by_type # for the benefit of loopy.statistics, for now from loopy.type_inference import infer_unknown_types +from loopy.symbolic import RuleAwareIdentityMapper +from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, + CallInstruction, _DataObliviousInstruction) +from loopy.program import Program, iterate_over_kernels_if_given_program +from loopy.kernel.function_interface import CallableKernel, ScalarCallable import logging logger = logging.getLogger(__name__) # {{{ prepare for caching +@iterate_over_kernels_if_given_program def prepare_for_caching(kernel): import loopy as lp new_args = [] @@ -885,9 +890,9 @@ def _insert_subdomain_into_domain_tree(kernel, domains, subdomain): # }}} -def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, - automagic_scans_ok=False, force_scan=False, - force_outer_iname_for_scan=None): +def realize_reduction_for_single_kernel(kernel, program_callables_info, + insn_id_filter=None, unknown_types_ok=True, automagic_scans_ok=False, + force_scan=False, force_outer_iname_for_scan=None): """Rewrites reductions into their imperative form. 
With *insn_id_filter* specified, operate only on the instruction with an instruction id matching *insn_id_filter*. @@ -1007,7 +1012,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # {{{ sequential - def map_reduction_seq(expr, rec, nresults, arg_dtypes, + def map_reduction_seq(expr, rec, program_callables_info, nresults, arg_dtypes, reduction_dtypes): outer_insn_inames = temp_kernel.insn_inames(insn) @@ -1125,7 +1130,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, v[iname].lt_set(v[0] + ubound)).get_basic_sets() return bs - def map_reduction_local(expr, rec, nresults, arg_dtypes, + def map_reduction_local(expr, rec, program_callables_info, nresults, arg_dtypes, reduction_dtypes): red_iname, = expr.inames @@ -1365,7 +1370,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # {{{ sequential scan - def map_scan_seq(expr, rec, nresults, arg_dtypes, + def map_scan_seq(expr, rec, program_callables_info, nresults, arg_dtypes, reduction_dtypes, sweep_iname, scan_iname, sweep_min_value, scan_min_value, stride): outer_insn_inames = temp_kernel.insn_inames(insn) @@ -1454,17 +1459,17 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # {{{ local-parallel scan - def map_scan_local(expr, rec, nresults, arg_dtypes, - reduction_dtypes, sweep_iname, scan_iname, - sweep_min_value, scan_min_value, stride): + def map_scan_local(expr, rec, program_callables_info, nresults, arg_dtypes, + reduction_dtypes, sweep_iname, scan_iname, sweep_min_value, + scan_min_value, stride): scan_size = _get_int_iname_size(sweep_iname) assert scan_size > 0 if scan_size == 1: - return map_reduction_seq( - expr, rec, nresults, arg_dtypes, reduction_dtypes) + return map_reduction_seq(expr, rec, program_callables_info, + nresults, arg_dtypes, reduction_dtypes) outer_insn_inames = temp_kernel.insn_inames(insn) @@ -1663,15 +1668,15 @@ def realize_reduction(kernel, insn_id_filter=None, 
unknown_types_ok=True, # {{{ seq/par dispatch - def map_reduction(expr, rec, nresults=1): + def map_reduction(expr, rec, program_callables_info, nresults=1): # Only expand one level of reduction at a time, going from outermost to # innermost. Otherwise we get the (iname + insn) dependencies wrong. from loopy.type_inference import ( infer_arg_and_reduction_dtypes_for_reduction_expression) - arg_dtypes, reduction_dtypes = ( + arg_dtypes, reduction_dtypes, program_callables_info = ( infer_arg_and_reduction_dtypes_for_reduction_expression( - temp_kernel, expr, unknown_types_ok)) + temp_kernel, expr, program_callables_info, unknown_types_ok)) outer_insn_inames = temp_kernel.insn_inames(insn) bad_inames = frozenset(expr.inames) & outer_insn_inames @@ -1780,15 +1785,17 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, for tag in temp_kernel.iname_tags(sweep_iname)))) elif parallel: return map_scan_local( - expr, rec, nresults, arg_dtypes, reduction_dtypes, + expr, rec, program_callables_info, nresults, + arg_dtypes, reduction_dtypes, sweep_iname, scan_param.scan_iname, scan_param.sweep_lower_bound, scan_param.scan_lower_bound, scan_param.stride) elif sequential: return map_scan_seq( - expr, rec, nresults, arg_dtypes, reduction_dtypes, - sweep_iname, scan_param.scan_iname, + expr, rec, program_callables_info, nresults, + arg_dtypes, reduction_dtypes, sweep_iname, + scan_param.scan_iname, scan_param.sweep_lower_bound, scan_param.scan_lower_bound, scan_param.stride) @@ -1807,12 +1814,13 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, if n_sequential: assert n_local_par == 0 - return map_reduction_seq( - expr, rec, nresults, arg_dtypes, reduction_dtypes) + return map_reduction_seq(expr, rec, program_callables_info, + nresults, arg_dtypes, reduction_dtypes) else: assert n_local_par > 0 return map_reduction_local( - expr, rec, nresults, arg_dtypes, reduction_dtypes) + expr, rec, program_callables_info, nresults, 
arg_dtypes, + reduction_dtypes) # }}} @@ -1845,9 +1853,13 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # Run reduction expansion. from loopy.symbolic import Reduction if isinstance(insn.expression, Reduction) and nresults > 1: - new_expressions = cb_mapper(insn.expression, nresults=nresults) + new_expressions = cb_mapper(insn.expression, + program_callables_info=program_callables_info, + nresults=nresults) else: - new_expressions = (cb_mapper(insn.expression),) + new_expressions = ( + cb_mapper(insn.expression, + program_callables_info=program_callables_info),) if generated_insns: # An expansion happened, so insert the generated stuff plus @@ -1935,6 +1947,31 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, return kernel + +def realize_reduction(program, *args, **kwargs): + assert isinstance(program, Program) + + new_resolved_functions = {} + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = realize_reduction_for_single_kernel( + in_knl_callable.subkernel, program.program_callables_info, + *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." % ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_program_callables_info = program.program_callables_info.copy( + resolved_functions=new_resolved_functions) + return program.copy(program_callables_info=new_program_callables_info) + # }}} @@ -2108,17 +2145,159 @@ def check_atomic_loads(kernel): # }}} +# {{{ arg_descr_inference + +class ArgDescrInferenceMapper(RuleAwareIdentityMapper): + """ + Returns a set of instances of :class:`tuple` (expr, + in_kernel_callable). 
The mapped `in_kernel_callable` of the + :class:`InKernelCallable` are descriptor specialized for the given + arguments. + """ + + def __init__(self, rule_mapping_context, caller_kernel, + program_callables_info): + super(ArgDescrInferenceMapper, self).__init__( + rule_mapping_context) + self.caller_kernel = caller_kernel + self.program_callables_info = program_callables_info + + def map_call(self, expr, expn_state, **kwargs): + from pymbolic.primitives import Call, CallWithKwargs + from loopy.kernel.function_interface import ValueArgDescriptor + from loopy.symbolic import ResolvedFunction, SubArrayRef + + if not isinstance(expr.function, ResolvedFunction): + # ignore if the call is not to a ResolvedFunction + return super(ArgDescrInferenceMapper, self).map_call(expr, expn_state) + + if isinstance(expr, Call): + kw_parameters = {} + else: + assert isinstance(expr, CallWithKwargs) + kw_parameters = expr.kw_parameters + + # descriptors for the args and kwargs of the Call + arg_id_to_descr = dict((i, par.get_array_arg_descriptor(self.caller_kernel)) + if isinstance(par, SubArrayRef) else (i, ValueArgDescriptor()) + for i, par in tuple(enumerate(expr.parameters)) + + tuple(kw_parameters.items())) + + assignee_id_to_descr = {} + + if 'assignees' in kwargs: + # If supplied with assignees then this is a CallInstruction + assignees = kwargs['assignees'] + assert isinstance(assignees, tuple) + for i, par in enumerate(assignees): + if isinstance(par, SubArrayRef): + assignee_id_to_descr[-i-1] = ( + par.get_array_arg_descriptor(self.caller_kernel)) + else: + assignee_id_to_descr[-i-1] = ValueArgDescriptor() + + # gathering all the descriptors + combined_arg_id_to_descr = arg_id_to_descr.copy() + combined_arg_id_to_descr.update(assignee_id_to_descr) + + # specializing the function according to the parameter description + in_knl_callable = self.program_callables_info[expr.function.name] + new_in_knl_callable, self.program_callables_info = ( + in_knl_callable.with_descrs( + 
combined_arg_id_to_descr, self.program_callables_info)) + self.program_callables_info, new_func_id = ( + self.program_callables_info.with_callable( + expr.function.function, + new_in_knl_callable)) + + if isinstance(expr, Call): + return Call( + ResolvedFunction(new_func_id), + tuple(self.rec(child, expn_state) + for child in expr.parameters)) + else: + assert isinstance(expr, CallWithKwargs) + return CallWithKwargs( + ResolvedFunction(new_func_id), + tuple(self.rec(child, expn_state) + for child in expr.parameters), + dict( + (key, self.rec(val, expn_state)) + for key, val in six.iteritems(kw_parameters)) + ) + + map_call_with_kwargs = map_call + + def map_kernel(self, kernel): + + new_insns = [] + + for insn in kernel.instructions: + if isinstance(insn, CallInstruction): + # In call instructions the assignees play an important in + # determining the arg_id_to_descr + new_insns.append(insn.with_transformed_expressions( + self, kernel, insn, assignees=insn.assignees)) + elif isinstance(insn, MultiAssignmentBase): + new_insns.append(insn.with_transformed_expressions( + self, kernel, insn)) + elif isinstance(insn, (_DataObliviousInstruction, CInstruction)): + new_insns.append(insn) + else: + raise NotImplementedError("arg_descr_inference for %s instruction" % + type(insn)) + + return kernel.copy(instructions=new_insns) + + +def traverse_to_infer_arg_descr(kernel, program_callables_info): + """ + Returns a copy of *kernel* with the argument shapes and strides matching for + scoped functions in the *kernel*. Refer + :meth:`loopy.kernel.function_interface.InKernelCallable.with_descrs`. 
+ """ + # FIXME: update this docs, once the design is finalized + + from loopy.symbolic import SubstitutionRuleMappingContext + + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + + arg_descr_inf_mapper = ArgDescrInferenceMapper(rule_mapping_context, + kernel, program_callables_info) + + descr_inferred_kernel = rule_mapping_context.finish_kernel( + arg_descr_inf_mapper.map_kernel(kernel)) + + return descr_inferred_kernel, arg_descr_inf_mapper.program_callables_info + + +def infer_arg_descr(program): + root_kernel_callable = program.program_callables_info[program.name] + program_callables_info = ( + program.program_callables_info.with_edit_callables_mode()) + root_kernel = program.root_kernel + + new_root_kernel, program_callables_info = traverse_to_infer_arg_descr( + root_kernel, program_callables_info) + new_root_kernel_callable = root_kernel_callable.copy( + subkernel=new_root_kernel) + program_callables_info, _ = program_callables_info.with_callable(program.name, + new_root_kernel_callable) + + program_callables_info = program_callables_info.with_exit_edit_callables_mode() + + return program.copy(program_callables_info=program_callables_info) + +# }}} + + preprocess_cache = WriteOncePersistentDict( "loopy-preprocess-cache-v2-"+DATA_MODEL_VERSION, key_builder=LoopyKeyBuilder()) -def preprocess_kernel(kernel, device=None): - if device is not None: - from warnings import warn - warn("passing 'device' to preprocess_kernel() is deprecated", - DeprecationWarning, stacklevel=2) - +def preprocess_single_kernel(kernel, program_callables_info, device=None): from loopy.kernel import KernelState if kernel.state >= KernelState.PREPROCESSED: return kernel @@ -2161,8 +2340,6 @@ def preprocess_kernel(kernel, device=None): # Type inference and reduction iname uniqueness don't handle substitutions. # Get them out of the way. 
- kernel = infer_unknown_types(kernel, expect_completion=False) - check_for_writes_to_predicates(kernel) check_reduction_iname_uniqueness(kernel) @@ -2177,8 +2354,8 @@ def preprocess_kernel(kernel, device=None): # - realize_reduction must happen after default dependencies are added # because it manipulates the depends_on field, which could prevent # defaults from being applied. - - kernel = realize_reduction(kernel, unknown_types_ok=False) + kernel = realize_reduction_for_single_kernel(kernel, + program_callables_info, unknown_types_ok=False) # Ordering restriction: # add_axes_to_temporaries_for_ilp because reduction accumulators @@ -2222,4 +2399,81 @@ def preprocess_kernel(kernel, device=None): return kernel + +def preprocess_kernel(kernel, device=None): + # FIXME: error message? + return preprocess_program(kernel, device) + + +def preprocess_program(program, device=None): + + if device is not None: + from warnings import warn + warn("passing 'device' to preprocess_kernel() is deprecated", + DeprecationWarning, stacklevel=2) + + program = infer_unknown_types(program, expect_completion=False) + + # {{{ preprocess the root kernel + + # Callable editing restrictions: + # + # - cannot edit program_callables_info in :meth:`preprocess_single_kernel` + # as we are iterating over it. + # + # Refer: https://docs.python.org/3/library/stdtypes.html#dictionary-view-objects + + new_resolved_functions = {} + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = preprocess_single_kernel( + in_knl_callable.subkernel, program.program_callables_info, + device) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." 
% ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_program_callables_info = program.program_callables_info.copy( + resolved_functions=new_resolved_functions) + program = program.copy(program_callables_info=new_program_callables_info) + + # }}} + + # infer arg descrs of the callables + program = infer_arg_descr(program) + + # {{{ hw axes inference + + # FIXME: think of wrapping this in a function? + + local_size, global_size = program.get_grid_size_upper_bounds() + + resolved_function_with_hw_axes_sizes_set = {} + + for func_id, in_knl_callable in ( + program.program_callables_info.items()): + if func_id == program.name: + resolved_function_with_hw_axes_sizes_set[func_id] = ( + in_knl_callable) + else: + resolved_function_with_hw_axes_sizes_set[func_id] = ( + in_knl_callable.with_hw_axes_sizes(local_size, global_size)) + + new_program_callables_info = ( + program.program_callables_info.copy( + resolved_functions=resolved_function_with_hw_axes_sizes_set)) + + program = program.copy(program_callables_info=new_program_callables_info) + + # }}} + + return program + + # vim: foldmethod=marker diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 652f8b89..201bcc25 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -1832,7 +1832,7 @@ class MinRecursionLimitForScheduling(MinRecursionLimit): # {{{ main scheduling entrypoint -def generate_loop_schedules(kernel, debug_args={}): +def generate_loop_schedules(kernel, program_callables_info, debug_args={}): """ .. 
warning:: @@ -1845,18 +1845,19 @@ def generate_loop_schedules(kernel, debug_args={}): """ with MinRecursionLimitForScheduling(kernel): - for sched in generate_loop_schedules_inner(kernel, debug_args=debug_args): + for sched in generate_loop_schedules_inner(kernel, + program_callables_info, debug_args=debug_args): yield sched -def generate_loop_schedules_inner(kernel, debug_args={}): +def generate_loop_schedules_inner(kernel, program_callables_info, debug_args={}): from loopy.kernel import KernelState if kernel.state not in (KernelState.PREPROCESSED, KernelState.SCHEDULED): raise LoopyError("cannot schedule a kernel that has not been " "preprocessed") from loopy.check import pre_schedule_checks - pre_schedule_checks(kernel) + pre_schedule_checks(kernel, program_callables_info) schedule_count = 0 @@ -1969,7 +1970,8 @@ def generate_loop_schedules_inner(kernel, debug_args={}): gen_sched = convert_barrier_instructions_to_barriers( kernel, gen_sched) - gsize, lsize = kernel.get_grid_size_upper_bounds() + gsize, lsize = ( + kernel.get_grid_size_upper_bounds(program_callables_info)) if (gsize or lsize): if not kernel.options.disable_global_barriers: @@ -2026,7 +2028,7 @@ schedule_cache = WriteOncePersistentDict( key_builder=LoopyKeyBuilder()) -def _get_one_scheduled_kernel_inner(kernel): +def _get_one_scheduled_kernel_inner(kernel, program_callables_info): # This helper function exists to ensure that the generator chain is fully # out of scope after the function returns. This allows it to be # garbage-collected in the exit handler of the @@ -2036,10 +2038,10 @@ def _get_one_scheduled_kernel_inner(kernel): # # See https://gitlab.tiker.net/inducer/sumpy/issues/31 for context. 
- return next(iter(generate_loop_schedules(kernel))) + return next(iter(generate_loop_schedules(kernel, program_callables_info))) -def get_one_scheduled_kernel(kernel): +def get_one_scheduled_kernel(kernel, program_callables_info): from loopy import CACHING_ENABLED sched_cache_key = kernel @@ -2057,7 +2059,8 @@ def get_one_scheduled_kernel(kernel): if not from_cache: with ProcessLogger(logger, "%s: schedule" % kernel.name): with MinRecursionLimitForScheduling(kernel): - result = _get_one_scheduled_kernel_inner(kernel) + result = _get_one_scheduled_kernel_inner(kernel, + program_callables_info) if CACHING_ENABLED and not from_cache: schedule_cache.store_if_not_present(sched_cache_key, result) diff --git a/loopy/statistics.py b/loopy/statistics.py index cee28b24..08b7f89e 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -33,6 +33,7 @@ from loopy.kernel.data import ( MultiAssignmentBase, TemporaryVariable, AddressSpace) from loopy.diagnostic import warn_with_kernel, LoopyError from pytools import Record +from loopy.kernel.function_interface import ScalarCallable, CallableKernel __doc__ = """ @@ -59,6 +60,14 @@ __doc__ = """ """ +# FIXME: this is broken for the callable kernel design. +# Qns: +# - The variable name, what if multiple kernels use the same name? +# - We should also add the cumulative effect on the arguments of callee kernels +# into the caller kernel. +# FIXME: add an error that there is only one callable kernel. disable for +# multiple callable kernels. 
+ # {{{ GuardedPwQPolynomial class GuardedPwQPolynomial(object): @@ -639,10 +648,11 @@ class MemAccess(Record): # {{{ counter base class CounterBase(CombineMapper): - def __init__(self, knl): + def __init__(self, knl, program_callables_info): self.knl = knl + self.program_callables_info = program_callables_info from loopy.type_inference import TypeInferenceMapper - self.type_inf = TypeInferenceMapper(knl) + self.type_inf = TypeInferenceMapper(knl, program_callables_info) def combine(self, values): return sum(values) @@ -697,10 +707,11 @@ class CounterBase(CombineMapper): # {{{ ExpressionOpCounter class ExpressionOpCounter(CounterBase): - def __init__(self, knl): + def __init__(self, knl, program_callables_info): self.knl = knl + self.program_callables_info = program_callables_info from loopy.type_inference import TypeInferenceMapper - self.type_inf = TypeInferenceMapper(knl) + self.type_inf = TypeInferenceMapper(knl, program_callables_info) def combine(self, values): return sum(values) @@ -712,9 +723,16 @@ class ExpressionOpCounter(CounterBase): map_variable = map_constant def map_call(self, expr): + from loopy.symbolic import ResolvedFunction + if isinstance(expr.function, ResolvedFunction): + function_identifier = self.program_callables_info[ + expr.function.name].name + else: + function_identifier = expr.function.name + return ToCountMap( {Op(dtype=self.type_inf(expr), - name='func:'+str(expr.function), + name='func:'+function_identifier, count_granularity=CountGranularity.WORKITEM): 1} ) + self.rec(expr.parameters) @@ -1090,6 +1108,16 @@ def add_assumptions_guard(kernel, pwqpolynomial): def count(kernel, set, space=None): + from loopy.program import Program + if isinstance(kernel, Program): + if len([in_knl_callable for in_knl_callable in + kernel.program_callables_info.values() if isinstance(in_knl_callable, + CallableKernel)]) != 1: + raise NotImplementedError("Currently only supported for program with " + "only one CallableKernel.") + + kernel = 
kernel.root_kernel + try: if space is not None: set = set.align_params(space) @@ -1188,9 +1216,10 @@ def count(kernel, set, space=None): return add_assumptions_guard(kernel, count) -def get_unused_hw_axes_factor(knl, insn, disregard_local_axes, space=None): +def get_unused_hw_axes_factor(knl, program_callables_info, insn, + disregard_local_axes, space=None): # FIXME: Multi-kernel support - gsize, lsize = knl.get_grid_size_upper_bounds() + gsize, lsize = knl.get_grid_size_upper_bounds(program_callables_info) g_used = set() l_used = set() @@ -1228,7 +1257,8 @@ def get_unused_hw_axes_factor(knl, insn, disregard_local_axes, space=None): return add_assumptions_guard(knl, result) -def count_insn_runs(knl, insn, count_redundant_work, disregard_local_axes=False): +def count_insn_runs(knl, program_callables_info, insn, count_redundant_work, + disregard_local_axes=False): insn_inames = knl.insn_inames(insn) @@ -1248,9 +1278,8 @@ def count_insn_runs(knl, insn, count_redundant_work, disregard_local_axes=False) c = count(knl, domain, space=space) if count_redundant_work: - unused_fac = get_unused_hw_axes_factor(knl, insn, - disregard_local_axes=disregard_local_axes, - space=space) + unused_fac = get_unused_hw_axes_factor(knl, program_callables_info, + insn, disregard_local_axes=disregard_local_axes, space=space) return c * unused_fac else: return c @@ -1260,7 +1289,50 @@ def count_insn_runs(knl, insn, count_redundant_work, disregard_local_axes=False) # {{{ get_op_map -def get_op_map(knl, numpy_types=True, count_redundant_work=False, + +def get_op_map_for_single_kernel(knl, program_callables_info, + numpy_types=True, count_redundant_work=False, + subgroup_size=None): + + if not knl.options.ignore_boostable_into: + raise LoopyError("Kernel '%s': Using operation counting requires the option " + "ignore_boostable_into to be set." 
% knl.name) + + from loopy.kernel.instruction import ( + CallInstruction, CInstruction, Assignment, + NoOpInstruction, BarrierInstruction) + + op_map = ToCountMap() + op_counter = ExpressionOpCounter(knl, + program_callables_info=program_callables_info) + for insn in knl.instructions: + if isinstance(insn, (CallInstruction, CInstruction, Assignment)): + ops = op_counter(insn.assignee) + op_counter(insn.expression) + op_map = op_map + ops*count_insn_runs( + knl, program_callables_info, insn, + count_redundant_work=count_redundant_work) + elif isinstance(insn, (NoOpInstruction, BarrierInstruction)): + pass + else: + raise NotImplementedError("unexpected instruction item type: '%s'" + % type(insn).__name__) + + if numpy_types: + return ToCountMap( + init_dict=dict( + (Op( + dtype=op.dtype.numpy_dtype, + name=op.name, + count_granularity=op.count_granularity), + ct) + for op, ct in six.iteritems(op_map.count_map)), + val_type=op_map.val_type + ) + else: + return op_map + + +def get_op_map(program, numpy_types=True, count_redundant_work=False, subgroup_size=None): """Count the number of operations in a loopy kernel. @@ -1318,44 +1390,31 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False, """ - if not knl.options.ignore_boostable_into: - raise LoopyError("Kernel '%s': Using operation counting requires the option " - "ignore_boostable_into to be set." 
% knl.name) - - from loopy.preprocess import preprocess_kernel, infer_unknown_types - from loopy.kernel.instruction import ( - CallInstruction, CInstruction, Assignment, - NoOpInstruction, BarrierInstruction) - knl = infer_unknown_types(knl, expect_completion=True) - knl = preprocess_kernel(knl) + from loopy.preprocess import preprocess_program, infer_unknown_types + program = infer_unknown_types(program, expect_completion=True) + program = preprocess_program(program) op_map = ToCountMap() - op_counter = ExpressionOpCounter(knl) - for insn in knl.instructions: - if isinstance(insn, (CallInstruction, CInstruction, Assignment)): - ops = op_counter(insn.assignee) + op_counter(insn.expression) - op_map = op_map + ops*count_insn_runs( - knl, insn, - count_redundant_work=count_redundant_work) - elif isinstance(insn, (NoOpInstruction, BarrierInstruction)): + + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + num_times_called = ( + program.program_callables_info.num_times_callables_called[ + func_id]) + knl = in_knl_callable.subkernel + knl_op_map = get_op_map_for_single_kernel(knl, + program.program_callables_info, numpy_types, + count_redundant_work, subgroup_size) + + for i in range(num_times_called): + op_map += knl_op_map + elif isinstance(in_knl_callable, ScalarCallable): pass else: - raise NotImplementedError("unexpected instruction item type: '%s'" - % type(insn).__name__) + raise NotImplementedError("Unknown callabke types %s." 
% ( + type(in_knl_callable).__name__)) - if numpy_types: - return ToCountMap( - init_dict=dict( - (Op( - dtype=op.dtype.numpy_dtype, - name=op.name, - count_granularity=op.count_granularity), - ct) - for op, ct in six.iteritems(op_map.count_map)), - val_type=op_map.val_type - ) - else: - return op_map + return op_map # }}} @@ -1376,93 +1435,9 @@ def _find_subgroup_size_for_knl(knl): # {{{ get_mem_access_map -def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, - subgroup_size=None): - """Count the number of memory accesses in a loopy kernel. - - :arg knl: A :class:`loopy.LoopKernel` whose memory accesses are to be - counted. - - :arg numpy_types: A :class:`bool` specifying whether the types in the - returned mapping should be numpy types instead of - :class:`loopy.LoopyType`. - :arg count_redundant_work: Based on usage of hardware axes or other - specifics, a kernel may perform work redundantly. This :class:`bool` - flag indicates whether this work should be included in the count. - (Likely desirable for performance modeling, but undesirable for - code optimization.) - - :arg subgroup_size: An :class:`int`, :class:`str` ``'guess'``, or - *None* that specifies the sub-group size. An OpenCL sub-group is an - implementation-dependent grouping of work-items within a work-group, - analagous to an NVIDIA CUDA warp. subgroup_size is used, e.g., when - counting a :class:`MemAccess` whose count_granularity specifies that it - should only be counted once per sub-group. If set to *None* an attempt - to find the sub-group size using the device will be made, if this fails - an error will be raised. If a :class:`str` ``'guess'`` is passed as - the subgroup_size, get_mem_access_map will attempt to find the - sub-group size using the device and, if unsuccessful, will make a wild - guess. - - :return: A :class:`ToCountMap` of **{** :class:`MemAccess` **:** - :class:`islpy.PwQPolynomial` **}**. 
- - - The :class:`MemAccess` specifies the characteristics of the memory - access. - - - The :class:`islpy.PwQPolynomial` holds the number of memory accesses - with the characteristics specified in the key (in terms of the - :class:`loopy.LoopKernel` *inames*). - - Example usage:: - - # (first create loopy kernel and specify array data types) - - params = {'n': 512, 'm': 256, 'l': 128} - mem_map = get_mem_access_map(knl) - - f32_s1_g_ld_a = mem_map[MemAccess( - mtype='global', - dtype=np.float32, - lid_strides={0: 1}, - gid_strides={0: 256}, - direction='load', - variable='a', - count_granularity=CountGranularity.WORKITEM) - ].eval_with_dict(params) - f32_s1_g_st_a = mem_map[MemAccess( - mtype='global', - dtype=np.float32, - lid_strides={0: 1}, - gid_strides={0: 256}, - direction='store', - variable='a', - count_granularity=CountGranularity.WORKITEM) - ].eval_with_dict(params) - f32_s1_l_ld_x = mem_map[MemAccess( - mtype='local', - dtype=np.float32, - lid_strides={0: 1}, - gid_strides={0: 256}, - direction='load', - variable='x', - count_granularity=CountGranularity.WORKITEM) - ].eval_with_dict(params) - f32_s1_l_st_x = mem_map[MemAccess( - mtype='local', - dtype=np.float32, - lid_strides={0: 1}, - gid_strides={0: 256}, - direction='store', - variable='x', - count_granularity=CountGranularity.WORKITEM) - ].eval_with_dict(params) - - # (now use these counts to, e.g., predict performance) - - """ - from loopy.preprocess import preprocess_kernel, infer_unknown_types +def get_access_map_for_single_kernel(knl, program_callables_info, + numpy_types=True, count_redundant_work=False, subgroup_size=None): if not knl.options.ignore_boostable_into: raise LoopyError("Kernel '%s': Using operation counting requires the option " @@ -1518,11 +1493,12 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, if count_granularity == CountGranularity.WORKITEM: return count_insn_runs( - knl, insn, count_redundant_work=count_redundant_work, + knl, 
program_callables_info, insn, + count_redundant_work=count_redundant_work, disregard_local_axes=False) ct_disregard_local = count_insn_runs( - knl, insn, disregard_local_axes=True, + knl, program_callables_info, insn, disregard_local_axes=True, count_redundant_work=count_redundant_work) if count_granularity == CountGranularity.WORKGROUP: @@ -1530,7 +1506,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, elif count_granularity == CountGranularity.SUBGROUP: # get the group size from loopy.symbolic import aff_to_expr - _, local_size = knl.get_grid_size_upper_bounds() + _, local_size = knl.get_grid_size_upper_bounds(program_callables_info) workgroup_size = 1 if local_size: for size in local_size: @@ -1556,12 +1532,9 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, "not allowed. count_granularity options: %s" % (count_granularity, CountGranularity.ALL+[None])) - knl = infer_unknown_types(knl, expect_completion=True) - knl = preprocess_kernel(knl) - access_map = ToCountMap() - access_counter_g = GlobalMemAccessCounter(knl) - access_counter_l = LocalMemAccessCounter(knl) + access_counter_g = GlobalMemAccessCounter(knl, program_callables_info) + access_counter_l = LocalMemAccessCounter(knl, program_callables_info) from loopy.kernel.instruction import ( CallInstruction, CInstruction, Assignment, @@ -1617,12 +1590,129 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, else: return access_map + +def get_mem_access_map(program, numpy_types=True, count_redundant_work=False, + subgroup_size=None): + """Count the number of memory accesses in a loopy kernel. + + :arg knl: A :class:`loopy.LoopKernel` whose memory accesses are to be + counted. + + :arg numpy_types: A :class:`bool` specifying whether the types in the + returned mapping should be numpy types instead of + :class:`loopy.LoopyType`. 
+ + :arg count_redundant_work: Based on usage of hardware axes or other + specifics, a kernel may perform work redundantly. This :class:`bool` + flag indicates whether this work should be included in the count. + (Likely desirable for performance modeling, but undesirable for + code optimization.) + + :arg subgroup_size: An :class:`int`, :class:`str` ``'guess'``, or + *None* that specifies the sub-group size. An OpenCL sub-group is an + implementation-dependent grouping of work-items within a work-group, + analogous to an NVIDIA CUDA warp. subgroup_size is used, e.g., when + counting a :class:`MemAccess` whose count_granularity specifies that it + should only be counted once per sub-group. If set to *None* an attempt + to find the sub-group size using the device will be made; if this fails + an error will be raised. If a :class:`str` ``'guess'`` is passed as + the subgroup_size, get_mem_access_map will attempt to find the + sub-group size using the device and, if unsuccessful, will make a wild + guess. + + :return: A :class:`ToCountMap` of **{** :class:`MemAccess` **:** + :class:`islpy.PwQPolynomial` **}**. + + - The :class:`MemAccess` specifies the characteristics of the memory + access. + + - The :class:`islpy.PwQPolynomial` holds the number of memory accesses + with the characteristics specified in the key (in terms of the + :class:`loopy.LoopKernel` *inames*). 
+ + Example usage:: + + # (first create loopy kernel and specify array data types) + + params = {'n': 512, 'm': 256, 'l': 128} + mem_map = get_mem_access_map(knl) + + f32_s1_g_ld_a = mem_map[MemAccess( + mtype='global', + dtype=np.float32, + lid_strides={0: 1}, + gid_strides={0: 256}, + direction='load', + variable='a', + count_granularity=CountGranularity.WORKITEM) + ].eval_with_dict(params) + f32_s1_g_st_a = mem_map[MemAccess( + mtype='global', + dtype=np.float32, + lid_strides={0: 1}, + gid_strides={0: 256}, + direction='store', + variable='a', + count_granularity=CountGranularity.WORKITEM) + ].eval_with_dict(params) + f32_s1_l_ld_x = mem_map[MemAccess( + mtype='local', + dtype=np.float32, + lid_strides={0: 1}, + gid_strides={0: 256}, + direction='load', + variable='x', + count_granularity=CountGranularity.WORKITEM) + ].eval_with_dict(params) + f32_s1_l_st_x = mem_map[MemAccess( + mtype='local', + dtype=np.float32, + lid_strides={0: 1}, + gid_strides={0: 256}, + direction='store', + variable='x', + count_granularity=CountGranularity.WORKITEM) + ].eval_with_dict(params) + + # (now use these counts to, e.g., predict performance) + + """ + from loopy.preprocess import preprocess_program, infer_unknown_types + + program = infer_unknown_types(program, expect_completion=True) + program = preprocess_program(program) + + access_map = ToCountMap() + + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + num_times_called = ( + program.program_callables_info.num_times_callables_called[ + func_id]) + knl = in_knl_callable.subkernel + knl_access_map = get_access_map_for_single_kernel(knl, + program.program_callables_info, numpy_types, + count_redundant_work, subgroup_size) + + # FIXME: didn't see any easy way to multiply + for i in range(num_times_called): + access_map += knl_access_map + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown callabke types 
%s." % ( + type(in_knl_callable).__name__)) + + return access_map + + # }}} # {{{ get_synchronization_map -def get_synchronization_map(knl, subgroup_size=None): +def get_synchronization_map_for_single_kernel(knl, program_callables_info, + subgroup_size=None): """Count the number of synchronization events each work-item encounters in a loopy kernel. @@ -1664,13 +1754,10 @@ def get_synchronization_map(knl, subgroup_size=None): raise LoopyError("Kernel '%s': Using operation counting requires the option " "ignore_boostable_into to be set." % knl.name) - from loopy.preprocess import preprocess_kernel, infer_unknown_types from loopy.schedule import (EnterLoop, LeaveLoop, Barrier, CallKernel, ReturnFromKernel, RunInstruction) from operator import mul - knl = infer_unknown_types(knl, expect_completion=True) - knl = preprocess_kernel(knl) - knl = lp.get_one_scheduled_kernel(knl) + knl = lp.get_one_scheduled_kernel(knl, program_callables_info) iname_list = [] result = ToCountMap() @@ -1713,12 +1800,42 @@ def get_synchronization_map(knl, subgroup_size=None): return result + +def get_synchronization_map(program, subgroup_size=None): + + from loopy.preprocess import preprocess_program, infer_unknown_types + + program = infer_unknown_types(program, expect_completion=True) + program = preprocess_program(program) + + sync_map = ToCountMap() + + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + num_times_called = ( + program.program_callables_info.num_times_callables_called[ + func_id]) + knl = in_knl_callable.subkernel + knl_sync_map = get_synchronization_map_for_single_kernel(knl, + program.program_callables_info, subgroup_size) + + # FIXME: didn't see any easy way to multiply + for i in range(num_times_called): + sync_map += knl_sync_map + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown callabke types %s." 
% ( + type(in_knl_callable).__name__)) + + return sync_map + # }}} # {{{ gather_access_footprints -def gather_access_footprints(kernel, ignore_uncountable=False): +def gather_access_footprints_for_single_kernel(kernel, ignore_uncountable=False): """Return a dictionary mapping ``(var_name, direction)`` to :class:`islpy.Set` instances capturing which indices of each the array *var_name* are read/written (where *direction* is either ``read`` or @@ -1729,13 +1846,6 @@ def gather_access_footprints(kernel, ignore_uncountable=False): nonlinear indices) """ - from loopy.preprocess import preprocess_kernel, infer_unknown_types - kernel = infer_unknown_types(kernel, expect_completion=True) - - from loopy.kernel import KernelState - if kernel.state < KernelState.PREPROCESSED: - kernel = preprocess_kernel(kernel) - write_footprints = [] read_footprints = [] @@ -1758,6 +1868,46 @@ def gather_access_footprints(kernel, ignore_uncountable=False): write_footprints.append(afg(insn.assignees)) read_footprints.append(afg(insn.expression)) + return write_footprints, read_footprints + + +def gather_access_footprints(program, ignore_uncountable=False): + # FIMXE: works only for one callable kernel till now. 
+ if len([in_knl_callable for in_knl_callable in + program.program_callables_info.values() if isinstance(in_knl_callable, + CallableKernel)]) != 1: + raise NotImplementedError("Currently only supported for program with " + "only one CallableKernel.") + + from loopy.preprocess import preprocess_program, infer_unknown_types + + program = infer_unknown_types(program, expect_completion=True) + program = preprocess_program(program) + + write_footprints = [] + read_footprints = [] + + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + num_times_called = ( + program.program_callables_info.num_times_callables_called[ + func_id]) + knl = in_knl_callable.subkernel + knl_write_footprints, knl_read_footprints = ( + gather_access_footprints_for_single_kernel(knl, + ignore_uncountable)) + + # FIXME: didn't see any easy way to multiply + for i in range(num_times_called): + write_footprints.extend(knl_write_footprints) + read_footprints.extend(knl_read_footprints) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown callabke types %s." 
% ( + type(in_knl_callable).__name__)) + write_footprints = AccessFootprintGatherer.combine(write_footprints) read_footprints = AccessFootprintGatherer.combine(read_footprints) @@ -1772,7 +1922,7 @@ def gather_access_footprints(kernel, ignore_uncountable=False): return result -def gather_access_footprint_bytes(kernel, ignore_uncountable=False): +def gather_access_footprint_bytes(program, ignore_uncountable=False): """Return a dictionary mapping ``(var_name, direction)`` to :class:`islpy.PwQPolynomial` instances capturing the number of bytes are read/written (where *direction* is either ``read`` or ``write`` on array @@ -1783,12 +1933,12 @@ def gather_access_footprint_bytes(kernel, ignore_uncountable=False): nonlinear indices) """ - from loopy.preprocess import preprocess_kernel, infer_unknown_types - kernel = infer_unknown_types(kernel, expect_completion=True) + from loopy.preprocess import preprocess_program, infer_unknown_types + kernel = infer_unknown_types(program, expect_completion=True) from loopy.kernel import KernelState if kernel.state < KernelState.PREPROCESSED: - kernel = preprocess_kernel(kernel) + kernel = preprocess_program(program) result = {} fp = gather_access_footprints(kernel, diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 8927cd6f..7a268d06 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -56,7 +56,7 @@ from pymbolic.mapper.constant_folder import \ ConstantFoldingMapper as ConstantFoldingMapperBase from pymbolic.parser import Parser as ParserBase - +from loopy.diagnostic import LoopyError from loopy.diagnostic import ExpressionToAffineConversionError import islpy as isl @@ -69,22 +69,23 @@ import numpy as np # {{{ mappers with support for loopy-specific primitives class IdentityMapperMixin(object): - def map_literal(self, expr, *args): + def map_literal(self, expr, *args, **kwargs): return expr - def map_array_literal(self, expr, *args): - return type(expr)(tuple(self.rec(ch, *args) for ch in expr.children)) + def 
map_array_literal(self, expr, *args, **kwargs): + return type(expr)(tuple(self.rec(ch, *args, **kwargs) for ch in + expr.children)) - def map_group_hw_index(self, expr, *args): + def map_group_hw_index(self, expr, *args, **kwargs): return expr - def map_local_hw_index(self, expr, *args): + def map_local_hw_index(self, expr, *args, **kwargs): return expr - def map_loopy_function_identifier(self, expr, *args): + def map_loopy_function_identifier(self, expr, *args, **kwargs): return expr - def map_reduction(self, expr, *args): + def map_reduction(self, expr, *args, **kwargs): mapped_inames = [self.rec(p.Variable(iname), *args) for iname in expr.inames] new_inames = [] @@ -98,15 +99,18 @@ class IdentityMapperMixin(object): return Reduction( expr.operation, tuple(new_inames), - self.rec(expr.expr, *args), + self.rec(expr.expr, *args, **kwargs), allow_simultaneous=expr.allow_simultaneous) - def map_tagged_variable(self, expr, *args): + def map_tagged_variable(self, expr, *args, **kwargs): # leaf, doesn't change return expr - def map_type_annotation(self, expr, *args): - return type(expr)(expr.type, self.rec(expr.child)) + def map_type_annotation(self, expr, *args, **kwargs): + return type(expr)(expr.type, self.rec(expr.child, *args, **kwargs)) + + def map_resolved_function(self, expr, *args, **kwargs): + return ResolvedFunction(expr.function) map_type_cast = map_type_annotation @@ -165,9 +169,16 @@ class WalkMapper(WalkMapperBase): map_rule_argument = map_group_hw_index + def map_resolved_function(self, expr, *args): + if not self.visit(expr): + return + + self.rec(expr.function, *args) + class CallbackMapper(CallbackMapperBase, IdentityMapper): map_reduction = CallbackMapperBase.map_constant + map_resolved_function = CallbackMapperBase.map_constant class CombineMapper(CombineMapperBase): @@ -232,13 +243,16 @@ class StringifyMapper(StringifyMapperBase): from pymbolic.mapper.stringifier import PREC_NONE return "cast(%s, %s)" % (repr(expr.type), self.rec(expr.child, 
PREC_NONE)) + def map_resolved_function(self, expr, prec): + return "ResolvedFunction('%s')" % expr.name + class UnidirectionalUnifier(UnidirectionalUnifierBase): def map_reduction(self, expr, other, unis): if not isinstance(other, type(expr)): return self.treat_mismatch(expr, other, unis) if (expr.inames != other.inames - or type(expr.operation) != type(other.operation) # noqa + or type(expr.function) != type(other.function) # noqa ): return [] @@ -289,6 +303,9 @@ class DependencyMapper(DependencyMapperBase): def map_type_cast(self, expr): return self.rec(expr.child) + def map_resolved_function(self, expr): + return self.rec(expr.function) + class SubstitutionRuleExpander(IdentityMapper): def __init__(self, rules): @@ -638,6 +655,51 @@ class RuleArgument(p.Expression): mapper_method = intern("map_rule_argument") + +class ResolvedFunction(p.Expression): + """ + A function invocation whose definition is known in a :mod:`loopy` kernel. + Each instance of :class:`loopy.symbolic.ResolvedFunction` in an expression + points to an instance of + :class:`loopy.kernel.function_interface.InKernelCallable` through the + mapping :attr:`loopy.kernel.LoopKernel.scoped_functions`. Refer + :ref:`ref_scoped_function` for a slightly detailed explanation on scoped + functions. + + .. attribute:: function + + An instance of :class:`pymbolic.primitives.Variable`, + :class:`loopy.library.reduction.ArgExtOp` or + :class:`loopy.library.reduction.SegmentedOp`. 
+ """ + init_arg_names = ("function", ) + + def __init__(self, function): + if isinstance(function, str): + function = p.Variable(function) + from loopy.library.reduction import ArgExtOp, SegmentedOp + assert isinstance(function, (p.Variable, ArgExtOp, SegmentedOp)) + self.function = function + + @property + def name(self): + from loopy.library.reduction import ArgExtOp, SegmentedOp + if isinstance(self.function, p.Variable): + return self.function.name + elif isinstance(self.function, (ArgExtOp, SegmentedOp)): + return self.function + else: + raise LoopyError("Unexpected function type %s in ResolvedFunction." % + type(self.function)) + + def __getinitargs__(self): + return (self.function, ) + + def stringifier(self): + return StringifyMapper + + mapper_method = intern("map_resolved_function") + # }}} @@ -650,9 +712,12 @@ def get_dependencies(expr): # {{{ rule-aware mappers def parse_tagged_name(expr): + from loopy.library.reduction import ArgExtOp, SegmentedOp if isinstance(expr, TaggedVariable): return expr.name, expr.tag - elif isinstance(expr, p.Variable): + elif isinstance(expr, ResolvedFunction): + return parse_tagged_name(expr.function) + elif isinstance(expr, (p.Variable, ArgExtOp, SegmentedOp)): return expr.name, None else: raise RuntimeError("subst rule name not understood: %s" % expr) @@ -850,12 +915,14 @@ class RuleAwareIdentityMapper(IdentityMapper): def __init__(self, rule_mapping_context): self.rule_mapping_context = rule_mapping_context - def map_variable(self, expr, expn_state): + def map_variable(self, expr, expn_state, *args, **kwargs): name, tag = parse_tagged_name(expr) if name not in self.rule_mapping_context.old_subst_rules: - return IdentityMapper.map_variable(self, expr, expn_state) + return IdentityMapper.map_variable(self, expr, expn_state, *args, + **kwargs) else: - return self.map_substitution(name, tag, (), expn_state) + return self.map_substitution(name, tag, (), expn_state, *args, + **kwargs) def map_call(self, expr, expn_state): if 
not isinstance(expr.function, p.Variable): @@ -910,7 +977,7 @@ class RuleAwareIdentityMapper(IdentityMapper): else: return sym - def __call__(self, expr, kernel, insn): + def __call__(self, expr, kernel, insn, *args, **kwargs): from loopy.kernel.data import InstructionBase assert insn is None or isinstance(insn, InstructionBase) @@ -919,7 +986,7 @@ class RuleAwareIdentityMapper(IdentityMapper): kernel=kernel, instruction=insn, stack=(), - arg_context={})) + arg_context={}), *args, **kwargs) def map_instruction(self, kernel, insn): return insn diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index a81354e2..e3b4853c 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -80,7 +80,7 @@ class TargetBase(object): def preprocess(self, kernel): return kernel - def pre_codegen_check(self, kernel): + def pre_codegen_check(self, kernel, program_callables_info): pass # }}} @@ -150,7 +150,12 @@ class ASTBuilderBase(object): # {{{ library - def function_manglers(self): + def function_scopers(self): + """ + Returns an instance of list of the functions of signature + ``(target, identifiers)`` returning either an instance of + :class:`InKernelCallable` if a match is found or *None*. + """ return [] def symbol_manglers(self): diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 83efecf0..1579bb31 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -27,7 +27,6 @@ THE SOFTWARE. 
import six import numpy as np # noqa -from loopy.kernel.data import CallMangleInfo from loopy.target import TargetBase, ASTBuilderBase, DummyHostASTBuilder from loopy.diagnostic import LoopyError, LoopyTypeError from cgen import Pointer, NestedDeclarator, Block @@ -35,6 +34,7 @@ from cgen.mapper import IdentityMapper as CASTIdentityMapperBase from pymbolic.mapper.stringifier import PREC_NONE from loopy.symbolic import IdentityMapper from loopy.types import NumpyType +from loopy.kernel.function_interface import ScalarCallable import pymbolic.primitives as p from pytools import memoize_method @@ -354,71 +354,116 @@ def c_symbol_mangler(kernel, name): # }}} -# {{{ function mangler +# {{{ function scoping -def c_math_mangler(target, name, arg_dtypes, modify_name=True): - # Function mangler for math functions defined in C standard - # Convert abs, min, max to fabs, fmin, fmax. - # If modify_name is set to True, function names are modified according to - # floating point types of the arguments (e.g. cos(double), cosf(float)) - # This should be set to True for C and Cuda, False for OpenCL - if not isinstance(name, str): - return None +class CMathCallable(ScalarCallable): + """ + An umbrella callable for all the math functions which can be seen in a + C-Target. 
+ """ - if name in ["abs", "min", "max"]: - name = "f" + name + def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info): + name = self.name - # unitary functions - if (name in ["fabs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", - "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor"] - and len(arg_dtypes) == 1 - and arg_dtypes[0].numpy_dtype.kind == "f"): + if name in ["abs", "min", "max"]: + name = "f" + name - dtype = arg_dtypes[0].numpy_dtype + # unary functions + if name in ["fabs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", + "tan", "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor"]: - if modify_name: - if dtype == np.float64: - pass # fabs - elif dtype == np.float32: - name = name + "f" # fabsf - elif dtype == np.float128: - name = name + "l" # fabsl - else: - raise LoopyTypeError("%s does not support type %s" % (name, dtype)) + for id in arg_id_to_dtype: + if not -1 <= id <= 0: + raise LoopyError("%s can take only one argument." % name) - return CallMangleInfo( - target_name=name, - result_dtypes=arg_dtypes, - arg_dtypes=arg_dtypes) + if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + program_callables_info) - # binary functions - if (name in ["fmax", "fmin"] - and len(arg_dtypes) == 2): + dtype = arg_id_to_dtype[0] + dtype = dtype.numpy_dtype - dtype = np.find_common_type( - [], [dtype.numpy_dtype for dtype in arg_dtypes]) - - if dtype.kind == "c": - raise LoopyTypeError("%s does not support complex numbers") + if dtype.kind in ('u', 'i'): + # ints and unsigned casted to float32 + dtype = np.float32 + elif dtype.kind == 'c': + raise LoopyTypeError("%s does not support type %s" % (name, dtype)) - elif dtype.kind == "f": - if modify_name: + from loopy.target.opencl import OpenCLTarget + if not isinstance(caller_kernel.target, OpenCLTarget): + # for CUDA, C Targets the 
name must be modified if dtype == np.float64: - pass # fmin + pass # fabs elif dtype == np.float32: - name = name + "f" # fminf + name = name + "f" # fabsf elif dtype == np.float128: - name = name + "l" # fminl + name = name + "l" # fabsl else: - raise LoopyTypeError("%s does not support type %s" - % (name, dtype)) + raise LoopyTypeError("%s does not support type %s" % (name, + dtype)) + + return ( + self.copy(name_in_target=name, + arg_id_to_dtype={0: NumpyType(dtype), -1: + NumpyType(dtype)}), + program_callables_info) + + # binary functions + if name in ["fmax", "fmin"]: + + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + raise LoopyError("%s can take only two arguments." % name) + + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( + arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + program_callables_info) + + dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items() + if id >= 0]) + + if dtype.kind == "c": + raise LoopyTypeError("%s does not support complex numbers") + + elif dtype.kind == "f": + from loopy.target.opencl import OpenCLTarget + if not isinstance(caller_kernel.target, OpenCLTarget): + if dtype == np.float64: + pass # fmin + elif dtype == np.float32: + name = name + "f" # fminf + elif dtype == np.float128: + name = name + "l" # fminl + else: + raise LoopyTypeError("%s does not support type %s" + % (name, dtype)) + dtype = NumpyType(dtype) + return ( + self.copy(name_in_target=name, + arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}), + program_callables_info) + + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + program_callables_info) - result_dtype = NumpyType(dtype) - return CallMangleInfo( - target_name=name, - result_dtypes=(result_dtype,), - arg_dtypes=2*(result_dtype,)) +def scope_c_math_functions(target, identifier): + """ + Returns an 
instance of :class:`InKernelCallable` if the function + represented by :arg:`identifier` is known in C, otherwise returns *None*. + """ + if identifier in ["abs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", + "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor", "max", "min"]: + return CMathCallable(name=identifier) return None # }}} @@ -427,12 +472,6 @@ def c_math_mangler(target, name, arg_dtypes, modify_name=True): class CASTBuilder(ASTBuilderBase): # {{{ library - def function_manglers(self): - return ( - super(CASTBuilder, self).function_manglers() + [ - c_math_mangler - ]) - def symbol_manglers(self): return ( super(CASTBuilder, self).symbol_manglers() + [ @@ -445,6 +484,11 @@ class CASTBuilder(ASTBuilderBase): _preamble_generator, ]) + def function_scopers(self): + return ( + super(CASTBuilder, self).function_scopers() + [ + scope_c_math_functions]) + # }}} # {{{ code generation @@ -846,82 +890,31 @@ class CASTBuilder(ASTBuilderBase): return block_if_necessary(assignments) def emit_multiple_assignment(self, codegen_state, insn): - ecm = codegen_state.expression_to_code_mapper - from pymbolic.primitives import Variable - from pymbolic.mapper.stringifier import PREC_NONE - - func_id = insn.expression.function - parameters = insn.expression.parameters - - if isinstance(func_id, Variable): - func_id = func_id.name - - assignee_var_descriptors = [ - codegen_state.kernel.get_var_descriptor(a) - for a in insn.assignee_var_names()] - - par_dtypes = tuple(ecm.infer_type(par) for par in parameters) - - mangle_result = codegen_state.kernel.mangle_function(func_id, par_dtypes) - if mangle_result is None: - raise RuntimeError("function '%s' unknown--" - "maybe you need to register a function mangler?" 
- % func_id) - - assert mangle_result.arg_dtypes is not None + ecm = codegen_state.expression_to_code_mapper + func_id = insn.expression.function.name + in_knl_callable = codegen_state.program_callables_info[func_id] - if mangle_result.target_name == "loopy_make_tuple": - # This shorcut avoids actually having to emit a 'make_tuple' function. + if isinstance(in_knl_callable, ScalarCallable) and ( + in_knl_callable.name_in_target == 'loopy_make_tuple'): return self.emit_tuple_assignment(codegen_state, insn) - from loopy.expression import dtype_to_type_context - c_parameters = [ - ecm(par, PREC_NONE, - dtype_to_type_context(self.target, tgt_dtype), - tgt_dtype).expr - for par, par_dtype, tgt_dtype in zip( - parameters, par_dtypes, mangle_result.arg_dtypes)] - - from loopy.codegen import SeenFunction - codegen_state.seen_functions.add( - SeenFunction(func_id, - mangle_result.target_name, - mangle_result.arg_dtypes)) - - from pymbolic import var - for i, (a, tgt_dtype) in enumerate( - zip(insn.assignees[1:], mangle_result.result_dtypes[1:])): - if tgt_dtype != ecm.infer_type(a): - raise LoopyError("type mismatch in %d'th (1-based) left-hand " - "side of instruction '%s'" % (i+1, insn.id)) - c_parameters.append( - # TODO Yuck: The "where-at function": &(...) 
- var("&")( - ecm(a, PREC_NONE, - dtype_to_type_context(self.target, tgt_dtype), - tgt_dtype).expr)) + in_knl_callable_as_call, is_returned = in_knl_callable.emit_call_insn( + insn=insn, + target=self.target, + expression_to_code_mapper=ecm) - from pymbolic import var - result = var(mangle_result.target_name)(*c_parameters) - - # In case of no assignees, we are done - if len(mangle_result.result_dtypes) == 0: + if is_returned: + from cgen import Assign + lhs_code = ecm(insn.assignees[0], prec=PREC_NONE, type_context=None) + return Assign(lhs_code, + CExpression(self.get_c_expression_to_code_mapper(), + in_knl_callable_as_call)) + else: from cgen import ExpressionStatement return ExpressionStatement( - CExpression(self.get_c_expression_to_code_mapper(), result)) - - result = ecm.wrap_in_typecast( - mangle_result.result_dtypes[0], - assignee_var_descriptors[0].dtype, - result) - - lhs_code = ecm(insn.assignees[0], prec=PREC_NONE, type_context=None) - - from cgen import Assign - return Assign( - lhs_code, - CExpression(self.get_c_expression_to_code_mapper(), result)) + CExpression(self.get_c_expression_to_code_mapper(), + in_knl_callable_as_call)) def emit_sequential_loop(self, codegen_state, iname, iname_dtype, lbound, ubound, inner): diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index 6b80bae2..b3c304d5 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -157,7 +157,7 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): # {{{ def generate_output_handler( - self, gen, options, kernel, implemented_data_info): + self, gen, options, program, implemented_data_info): from loopy.kernel.data import KernelArgument @@ -166,12 +166,13 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): % ", ".join("\"%s\": %s" % (arg.name, arg.name) for arg in implemented_data_info if issubclass(arg.arg_class, KernelArgument) - if arg.base_name in kernel.get_written_variables())) + if arg.base_name in + 
program.root_kernel.get_written_variables())) else: out_args = [arg for arg in implemented_data_info if issubclass(arg.arg_class, KernelArgument) - if arg.base_name in kernel.get_written_variables()] + if arg.base_name in program.root_kernel.get_written_variables()] if out_args: gen("return None, (%s,)" % ", ".join(arg.name for arg in out_args)) @@ -373,7 +374,7 @@ class CKernelExecutor(KernelExecutorBase): .. automethod:: __call__ """ - def __init__(self, kernel, compiler=None): + def __init__(self, program, compiler=None): """ :arg kernel: may be a loopy.LoopKernel, a generator returning kernels (a warning will be issued if more than one is returned). If the @@ -382,35 +383,35 @@ class CKernelExecutor(KernelExecutorBase): """ self.compiler = compiler if compiler else CCompiler() - super(CKernelExecutor, self).__init__(kernel) + super(CKernelExecutor, self).__init__(program) def get_invoker_uncached(self, kernel, codegen_result): generator = CExecutionWrapperGenerator() return generator(kernel, codegen_result) @memoize_method - def kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): - kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype_set) + def program_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): + program = self.get_typed_and_scheduled_program(arg_to_dtype_set) from loopy.codegen import generate_code_v2 - codegen_result = generate_code_v2(kernel) + codegen_result = generate_code_v2(program) dev_code = codegen_result.device_code() host_code = codegen_result.host_code() all_code = '\n'.join([dev_code, '', host_code]) - if self.kernel.options.write_cl: + if self.program.root_kernel.options.write_cl: output = all_code - if self.kernel.options.highlight_cl: + if self.program.root_kernel.options.highlight_cl: output = get_highlighted_code(code=output) - if self.kernel.options.write_cl is True: + if self.program.root_kernel.options.write_cl is True: print(output) else: - with open(self.kernel.options.write_cl, "w") as outf: + with 
open(self.program.root_kernel.options.write_cl, "w") as outf: outf.write(output) - if self.kernel.options.edit_cl: + if self.program.root_kernel.options.edit_cl: from pytools import invoke_editor dev_code = invoke_editor(dev_code, "code.c") # update code from editor @@ -419,14 +420,14 @@ class CKernelExecutor(KernelExecutorBase): c_kernels = [] for dp in codegen_result.device_programs: c_kernels.append(CompiledCKernel(dp, - codegen_result.implemented_data_info, all_code, self.kernel.target, + codegen_result.implemented_data_info, all_code, self.program.target, self.compiler)) return _KernelInfo( - kernel=kernel, + program=program, c_kernels=c_kernels, implemented_data_info=codegen_result.implemented_data_info, - invoker=self.get_invoker(kernel, codegen_result)) + invoker=self.get_invoker(program, codegen_result)) # }}} @@ -443,7 +444,7 @@ class CKernelExecutor(KernelExecutorBase): kwargs = self.packing_controller.unpack(kwargs) - kernel_info = self.kernel_info(self.arg_to_dtype_set(kwargs)) + program_info = self.program_info(self.arg_to_dtype_set(kwargs)) - return kernel_info.invoker( - kernel_info.c_kernels, *args, **kwargs) + return program_info.invoker( + program_info.c_kernels, *args, **kwargs) diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index dd2104d0..65a8c202 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -41,7 +41,7 @@ from pymbolic import var from loopy.expression import dtype_to_type_context from loopy.type_inference import TypeInferenceMapper -from loopy.diagnostic import LoopyError, LoopyWarning +from loopy.diagnostic import LoopyError from loopy.tools import is_integer from loopy.types import LoopyType @@ -54,7 +54,8 @@ class ExpressionToCExpressionMapper(IdentityMapper): self.codegen_state = codegen_state if type_inf_mapper is None: - type_inf_mapper = TypeInferenceMapper(self.kernel) + type_inf_mapper = TypeInferenceMapper(self.kernel, + 
self.codegen_state.program_callables_info) self.type_inf_mapper = type_inf_mapper self.allow_complex = codegen_state.allow_complex @@ -383,19 +384,19 @@ class ExpressionToCExpressionMapper(IdentityMapper): "for constant '%s'" % expr) def map_call(self, expr, type_context): - from pymbolic.primitives import Variable, Subscript - - identifier = expr.function + from pymbolic.primitives import Subscript # {{{ implement indexof, indexof_vec - if identifier.name in ["indexof", "indexof_vec"]: + identifier_name = ( + self.codegen_state.program_callables_info[expr.function.name].name) + if identifier_name in ["indexof", "indexof_vec"]: if len(expr.parameters) != 1: - raise LoopyError("%s takes exactly one argument" % identifier.name) + raise LoopyError("%s takes exactly one argument" % identifier_name) arg, = expr.parameters if not isinstance(arg, Subscript): raise LoopyError( - "argument to %s must be a subscript" % identifier.name) + "argument to %s must be a subscript" % identifier_name) ary = self.find_array(arg) @@ -407,11 +408,11 @@ class ExpressionToCExpressionMapper(IdentityMapper): from loopy.kernel.data import ImageArg if isinstance(ary, ImageArg): - raise LoopyError("%s does not support images" % identifier.name) + raise LoopyError("%s does not support images" % identifier_name) - if identifier.name == "indexof": + if identifier_name == "indexof": return access_info.subscripts[0] - elif identifier.name == "indexof_vec": + elif identifier_name == "indexof_vec": from loopy.kernel.array import VectorArrayDimTag ivec = None for iaxis, dim_tag in enumerate(ary.dim_tags): @@ -430,56 +431,25 @@ class ExpressionToCExpressionMapper(IdentityMapper): # }}} - if isinstance(identifier, Variable): - identifier = identifier.name - - par_dtypes = tuple(self.infer_type(par) for par in expr.parameters) - - processed_parameters = None - - mangle_result = self.kernel.mangle_function( - identifier, par_dtypes, - ast_builder=self.codegen_state.ast_builder) - - if mangle_result is 
None: - raise RuntimeError("function '%s' unknown--" - "maybe you need to register a function mangler?" - % identifier) - - if len(mangle_result.result_dtypes) != 1: - raise LoopyError("functions with more or fewer than one return value " - "may not be used in an expression") - - if mangle_result.arg_dtypes is not None: - processed_parameters = tuple( - self.rec(par, - dtype_to_type_context(self.kernel.target, tgt_dtype), - tgt_dtype) - for par, par_dtype, tgt_dtype in zip( - expr.parameters, par_dtypes, mangle_result.arg_dtypes)) - - else: - # /!\ FIXME For some functions (e.g. 'sin'), it makes sense to - # propagate the type context here. But for many others, it does - # not. Using the inferred type as a stopgap for now. - processed_parameters = tuple( - self.rec(par, - type_context=dtype_to_type_context( - self.kernel.target, par_dtype)) - for par, par_dtype in zip(expr.parameters, par_dtypes)) - - from warnings import warn - warn("Calling function '%s' with unknown C signature--" - "return CallMangleInfo.arg_dtypes" - % identifier, LoopyWarning) - - from loopy.codegen import SeenFunction - self.codegen_state.seen_functions.add( - SeenFunction(identifier, - mangle_result.target_name, - mangle_result.arg_dtypes or par_dtypes)) - - return var(mangle_result.target_name)(*processed_parameters) + from loopy.kernel.function_interface import ManglerCallable + if isinstance(self.codegen_state.program_callables_info[expr.function.name], + ManglerCallable): + from loopy.codegen import SeenFunction + in_knl_callable = ( + self.codegen_state.program_callables_info[ + expr.function.name]) + mangle_result = in_knl_callable.mangle_result(self.kernel) + self.codegen_state.seen_functions.add( + SeenFunction(identifier_name, + mangle_result.target_name, + mangle_result.arg_dtypes)) + + return ( + self.codegen_state.program_callables_info[ + expr.function.name].emit_call( + expression_to_code_mapper=self, + expression=expr, + target=self.kernel.target)) # {{{ deal with 
complex-valued variables diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 673d3b28..89cbfd03 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -34,6 +34,7 @@ from loopy.diagnostic import LoopyError from loopy.types import NumpyType from loopy.kernel.data import AddressSpace from pymbolic import var +from loopy.kernel.function_interface import ScalarCallable # {{{ vector types @@ -111,29 +112,82 @@ def _register_vector_types(dtype_registry): # }}} -# {{{ function mangler +# {{{ function scoper -def cuda_function_mangler(kernel, name, arg_dtypes): - if not isinstance(name, str): - return None +_CUDA_SPECIFIC_FUNCTIONS = { + "rsqrt": 1, + "atan2": 2, + } - if name in ["max", "min"] and len(arg_dtypes) == 2: - dtype = np.find_common_type([], arg_dtypes) - if dtype.kind == "c": - raise RuntimeError("min/max do not support complex numbers") +class CudaCallable(ScalarCallable): - if dtype.kind == "f": - name = "f" + name + def cuda_with_types(self, arg_id_to_dtype, caller_kernel, + program_callables_info): - return dtype, name + name = self.name - if name in "atan2" and len(arg_dtypes) == 2: - return arg_dtypes[0], name + if name == "dot": + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + raise LoopyError("%s can take only 2 arguments." 
% name) - if name == "dot": - scalar_dtype, offset, field_name = arg_dtypes[0].fields["x"] - return scalar_dtype, name + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( + arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + program_callables_info) + + dtype = arg_id_to_dtype[0] + scalar_dtype, offset, field_name = dtype.numpy_dtype.fields["x"] + return ( + self.copy(name_in_target=name, arg_id_to_dtype={-1: + NumpyType(scalar_dtype), + 0: dtype, 1: dtype}), + program_callables_info) + + if name in _CUDA_SPECIFIC_FUNCTIONS: + num_args = _CUDA_SPECIFIC_FUNCTIONS[name] + for id in arg_id_to_dtype: + if not -1 <= id < num_args: + raise LoopyError("%s can take only %d arguments." % (name, + num_args)) + + for i in range(num_args): + if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + program_callables_info) + + dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in + arg_id_to_dtype.items() if id >= 0]) + + if dtype.kind == "c": + raise LoopyError("%s does not support complex numbers" + % name) + + updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in range(-1, + num_args)) + + return ( + self.copy(name_in_target=name, + arg_id_to_dtype=updated_arg_id_to_dtype), + program_callables_info) + + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + program_callables_info) + + +def scope_cuda_functions(target, identifier): + if identifier in set(["dot"]) | set( + _CUDA_SPECIFIC_FUNCTIONS): + return CudaCallable(name=identifier) return None @@ -217,13 +271,12 @@ class CudaTarget(CTarget): # {{{ ast builder class CUDACASTBuilder(CASTBuilder): + # {{{ library - def function_manglers(self): - return ( - super(CUDACASTBuilder, self).function_manglers() + [ - 
cuda_function_mangler - ]) + def function_scopers(self): + return [scope_cuda_functions] + ( + super(CUDACASTBuilder, self).function_scopers()) # }}} @@ -249,7 +302,8 @@ class CUDACASTBuilder(CASTBuilder): _, local_grid_size = \ codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs( get_insn_ids_for_block_at( - codegen_state.kernel.schedule, schedule_index)) + codegen_state.kernel.schedule, schedule_index), + codegen_state.program_callables_info) from loopy.symbolic import get_dependencies if not get_dependencies(local_grid_size): diff --git a/loopy/target/execution.py b/loopy/target/execution.py index 3cdf2057..43963ddb 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -61,12 +61,12 @@ class SeparateArrayPackingController(object): It also repacks outgoing arrays of this type back into an object array. """ - def __init__(self, kernel): + def __init__(self, program): # map from arg name self.packing_info = {} from loopy.kernel.array import ArrayBase - for arg in kernel.args: + for arg in program.args: if not isinstance(arg, ArrayBase): continue @@ -82,7 +82,8 @@ class SeparateArrayPackingController(object): name=arg.name, sep_shape=arg.sep_shape(), subscripts_and_names=subscripts_and_names, - is_written=arg.name in kernel.get_written_variables()) + is_written=arg.name in + program.root_kernel.get_written_variables()) def unpack(self, kernel_kwargs): if not self.packing_info: @@ -143,7 +144,7 @@ class ExecutionWrapperGeneratorBase(object): # {{{ integer arg finding from shapes def generate_integer_arg_finding_from_shapes( - self, gen, kernel, implemented_data_info): + self, gen, program, implemented_data_info): # a mapping from integer argument names to a list of tuples # (arg_name, expression), where expression is a # unary function of kernel.arg_dict[arg_name] @@ -168,7 +169,8 @@ class ExecutionWrapperGeneratorBase(object): if len(deps) == 1: integer_arg_var, = deps - if kernel.arg_dict[integer_arg_var.name].dtype.is_integral(): + if 
program.arg_dict[ + integer_arg_var.name].dtype.is_integral(): from pymbolic.algorithm import solve_affine_equations_for try: # friggin' overkill :) @@ -214,9 +216,9 @@ class ExecutionWrapperGeneratorBase(object): # {{{ integer arg finding from offsets - def generate_integer_arg_finding_from_offsets(self, gen, kernel, + def generate_integer_arg_finding_from_offsets(self, gen, program, implemented_data_info): - options = kernel.options + options = program.root_kernel.options gen("# {{{ find integer arguments from offsets") gen("") @@ -239,7 +241,7 @@ class ExecutionWrapperGeneratorBase(object): else: gen("_lpy_offset = %s.offset" % impl_array_name) - base_arg = kernel.impl_arg_to_arg[impl_array_name] + base_arg = program.impl_arg_to_arg[impl_array_name] if not options.skip_arg_checks: gen("%s, _lpy_remdr = divmod(_lpy_offset, %d)" @@ -264,8 +266,8 @@ class ExecutionWrapperGeneratorBase(object): # {{{ integer arg finding from strides def generate_integer_arg_finding_from_strides( - self, gen, kernel, implemented_data_info): - options = kernel.options + self, gen, program, implemented_data_info): + options = program.root_kernel.options gen("# {{{ find integer arguments from strides") gen("") @@ -284,7 +286,7 @@ class ExecutionWrapperGeneratorBase(object): "passed array\")" % (arg.name, impl_array_name)) - base_arg = kernel.impl_arg_to_arg[impl_array_name] + base_arg = program.impl_arg_to_arg[impl_array_name] if not options.skip_arg_checks: gen("%s, _lpy_remdr = divmod(%s.strides[%d], %d)" @@ -307,8 +309,8 @@ class ExecutionWrapperGeneratorBase(object): # {{{ check that value args are present def generate_value_arg_check( - self, gen, kernel, implemented_data_info): - if kernel.options.skip_arg_checks: + self, gen, program, implemented_data_info): + if program.root_kernel.options.skip_arg_checks: return from loopy.kernel.data import ValueArg @@ -361,7 +363,7 @@ class ExecutionWrapperGeneratorBase(object): # {{{ arg setup def generate_arg_setup( - self, gen, kernel, 
implemented_data_info, options): + self, gen, program, implemented_data_info, options): import loopy as lp from loopy.kernel.data import KernelArgument @@ -384,8 +386,8 @@ class ExecutionWrapperGeneratorBase(object): expect_no_more_arguments = False for arg_idx, arg in enumerate(implemented_data_info): - is_written = arg.base_name in kernel.get_written_variables() - kernel_arg = kernel.impl_arg_to_arg.get(arg.name) + is_written = arg.base_name in program.root_kernel.get_written_variables() + program_arg = program.impl_arg_to_arg.get(arg.name) if not issubclass(arg.arg_class, KernelArgument): expect_no_more_arguments = True @@ -447,7 +449,7 @@ class ExecutionWrapperGeneratorBase(object): gen("if %s is None:" % arg.name) with Indentation(gen): self.handle_alloc( - gen, arg, kernel_arg, strify, options.skip_arg_checks) + gen, arg, program_arg, strify, options.skip_arg_checks) gen("_lpy_made_by_loopy = True") gen("") @@ -465,7 +467,7 @@ class ExecutionWrapperGeneratorBase(object): with Indentation(gen): gen("if %s.dtype != %s:" % (arg.name, self.python_dtype_str( - kernel_arg.dtype.numpy_dtype))) + program_arg.dtype.numpy_dtype))) with Indentation(gen): gen("raise TypeError(\"dtype mismatch on argument '%s' " "(got: %%s, expected: %s)\" %% %s.dtype)" @@ -493,10 +495,10 @@ class ExecutionWrapperGeneratorBase(object): "%% (%s.shape, %s))" % (arg.name, arg.name, strify_tuple(arg.unvec_shape))) - if kernel_arg.shape is None: + if program_arg.shape is None: pass - elif any(shape_axis is None for shape_axis in kernel_arg.shape): + elif any(shape_axis is None for shape_axis in program_arg.shape): gen("if len(%s.shape) != %s:" % (arg.name, len(arg.unvec_shape))) with Indentation(gen): @@ -519,8 +521,8 @@ class ExecutionWrapperGeneratorBase(object): # }}} - if arg.unvec_strides and kernel_arg.dim_tags: - itemsize = kernel_arg.dtype.numpy_dtype.itemsize + if arg.unvec_strides and program_arg.dim_tags: + itemsize = program_arg.dtype.numpy_dtype.itemsize sym_strides = tuple( 
itemsize*s_i for s_i in arg.unvec_strides) @@ -558,7 +560,7 @@ class ExecutionWrapperGeneratorBase(object): with Indentation(gen): gen("raise ValueError(\"Argument '%s' does not " "allow arrays with offsets. Try passing " - "default_offset=loopy.auto to make_kernel()." + "default_offset=loopy.auto to make_program()." "\")" % arg.name) gen("") @@ -617,7 +619,7 @@ class ExecutionWrapperGeneratorBase(object): def generate_host_code(self, gen, codegen_result): raise NotImplementedError - def __call__(self, kernel, codegen_result): + def __call__(self, program, codegen_result): """ Generates the wrapping python invoker for this execution target @@ -629,12 +631,12 @@ class ExecutionWrapperGeneratorBase(object): kernel """ - options = kernel.options + options = program.root_kernel.options implemented_data_info = codegen_result.implemented_data_info from loopy.kernel.data import KernelArgument gen = PythonFunctionGenerator( - "invoke_%s_loopy_kernel" % kernel.name, + "invoke_%s_loopy_kernel" % program.name, self.system_args + [ "%s=None" % idi.name for idi in implemented_data_info @@ -651,21 +653,21 @@ class ExecutionWrapperGeneratorBase(object): self.initialize_system_args(gen) self.generate_integer_arg_finding_from_shapes( - gen, kernel, implemented_data_info) + gen, program, implemented_data_info) self.generate_integer_arg_finding_from_offsets( - gen, kernel, implemented_data_info) + gen, program, implemented_data_info) self.generate_integer_arg_finding_from_strides( - gen, kernel, implemented_data_info) + gen, program, implemented_data_info) self.generate_value_arg_check( - gen, kernel, implemented_data_info) + gen, program, implemented_data_info) args = self.generate_arg_setup( - gen, kernel, implemented_data_info, options) + gen, program, implemented_data_info, options) self.generate_invocation(gen, codegen_result.host_program.name, args, - kernel, implemented_data_info) + program, implemented_data_info) - self.generate_output_handler(gen, options, kernel, 
implemented_data_info) + self.generate_output_handler(gen, options, program, implemented_data_info) if options.write_wrapper: output = gen.get() @@ -713,32 +715,32 @@ class KernelExecutorBase(object): .. automethod:: __call__ """ - def __init__(self, kernel): + def __init__(self, program): """ :arg kernel: a loopy.LoopKernel """ - self.kernel = kernel + self.program = program - self.packing_controller = SeparateArrayPackingController(kernel) + self.packing_controller = SeparateArrayPackingController(program) - self.output_names = tuple(arg.name for arg in self.kernel.args - if arg.name in self.kernel.get_written_variables()) + self.output_names = tuple(arg.name for arg in self.program.args + if arg.is_output_only) self.has_runtime_typed_args = any( arg.dtype is None - for arg in kernel.args) + for arg in program.args) - def get_typed_and_scheduled_kernel_uncached(self, arg_to_dtype_set): + def get_typed_and_scheduled_program_uncached(self, arg_to_dtype_set): from loopy.kernel.tools import add_dtypes - kernel = self.kernel + program = self.program if arg_to_dtype_set: var_to_dtype = {} for var, dtype in arg_to_dtype_set: try: - dest_name = kernel.impl_arg_to_arg[var].name + dest_name = program.impl_arg_to_arg[var].name except KeyError: dest_name = var @@ -749,28 +751,30 @@ class KernelExecutorBase(object): "no known variable/argument with that name" % var) - kernel = add_dtypes(kernel, var_to_dtype) + program = add_dtypes(program, var_to_dtype) - from loopy.type_inference import infer_unknown_types - kernel = infer_unknown_types(kernel, expect_completion=True) + from loopy.type_inference import infer_unknown_types + program = infer_unknown_types(program, expect_completion=True) - if kernel.schedule is None: - from loopy.preprocess import preprocess_kernel - kernel = preprocess_kernel(kernel) + if program.root_kernel.schedule is None: + from loopy.preprocess import preprocess_program + program = preprocess_program(program) from loopy.schedule import 
get_one_scheduled_kernel - kernel = get_one_scheduled_kernel(kernel) + program = program.with_root_kernel( + get_one_scheduled_kernel(program.root_kernel, + program.program_callables_info)) - return kernel + return program - def get_typed_and_scheduled_kernel(self, arg_to_dtype_set): + def get_typed_and_scheduled_program(self, arg_to_dtype_set): from loopy import CACHING_ENABLED from loopy.preprocess import prepare_for_caching # prepare_for_caching() gets run by preprocess, but the kernel at this # stage is not guaranteed to be preprocessed. - cacheable_kernel = prepare_for_caching(self.kernel) - cache_key = (type(self).__name__, cacheable_kernel, arg_to_dtype_set) + cacheable_program = prepare_for_caching(self.program) + cache_key = (type(self).__name__, cacheable_program, arg_to_dtype_set) if CACHING_ENABLED: try: @@ -778,9 +782,9 @@ class KernelExecutorBase(object): except KeyError: pass - logger.debug("%s: typed-and-scheduled cache miss" % self.kernel.name) + logger.debug("%s: typed-and-scheduled cache miss" % self.program.name) - kernel = self.get_typed_and_scheduled_kernel_uncached(arg_to_dtype_set) + kernel = self.get_typed_and_scheduled_program_uncached(arg_to_dtype_set) if CACHING_ENABLED: typed_and_scheduled_cache.store_if_not_present(cache_key, kernel) @@ -791,7 +795,7 @@ class KernelExecutorBase(object): if not self.has_runtime_typed_args: return None - impl_arg_to_arg = self.kernel.impl_arg_to_arg + impl_arg_to_arg = self.program.impl_arg_to_arg arg_to_dtype = {} for arg_name, val in six.iteritems(kwargs): arg = impl_arg_to_arg.get(arg_name, None) diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 0464270a..53963183 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -172,8 +172,9 @@ class ISPCTarget(CTarget): host_program_name_suffix = "" device_program_name_suffix = "_inner" - def pre_codegen_check(self, kernel): - gsize, lsize = kernel.get_grid_size_upper_bounds_as_exprs() + def pre_codegen_check(self, kernel, 
program_callables_info): + gsize, lsize = kernel.get_grid_size_upper_bounds_as_exprs( + program_callables_info) if len(lsize) > 1: for i, ls_i in enumerate(lsize[1:]): if ls_i != 1: diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 432c95ef..44f782a7 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -31,11 +31,11 @@ from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper from pytools import memoize_method from loopy.diagnostic import LoopyError from loopy.types import NumpyType -from loopy.target.c import DTypeRegistryWrapper, c_math_mangler -from loopy.kernel.data import AddressSpace, CallMangleInfo +from loopy.target.c import DTypeRegistryWrapper +from loopy.kernel.data import AddressSpace +from loopy.kernel.function_interface import ScalarCallable from pymbolic import var -from functools import partial # {{{ dtype registry wrappers @@ -166,59 +166,135 @@ VECTOR_LITERAL_FUNCS = dict( ) -def opencl_function_mangler(kernel, name, arg_dtypes): - if not isinstance(name, str): - return None +class OpenCLCallable(ScalarCallable): + """ + Records information about OpenCL functions which are not covered by + :class:`loopy.target.c.CMathCallable`. + """ + + def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info): + name = self.name + + if name in ["max", "min"]: + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + raise LoopyError("%s can take only 2 arguments." 
% name) + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype: + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + program_callables_info) + + dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items() + if (id >= 0 and dtype is not None)]) + + if dtype.kind in ['u', 'i', 'f']: + if dtype.kind == 'f': + name = 'f'+name + dtype = NumpyType(dtype) + return ( + self.copy(name_in_target=name, + arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}), + program_callables_info) + else: + # Unsupported type. + raise LoopyError("%s function not supported for the types %s" % + (name, dtype)) + + if name == "dot": + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + raise LoopyError("%s can take only 2 arguments." % name) + + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( + arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + program_callables_info) + + dtype = arg_id_to_dtype[0] + scalar_dtype, offset, field_name = dtype.numpy_dtype.fields["s0"] + return ( + self.copy(name_in_target=name, arg_id_to_dtype={-1: + NumpyType(scalar_dtype), 0: dtype, 1: dtype}), + program_callables_info) + + if name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS: + num_args = _CL_SIMPLE_MULTI_ARG_FUNCTIONS[name] + for id in arg_id_to_dtype: + if not -1 <= id < num_args: + raise LoopyError("%s can take only %d arguments." 
% (name, + num_args)) + + for i in range(num_args): + if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + program_callables_info) + + dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in + arg_id_to_dtype.items() if id >= 0]) + + if dtype.kind == "c": + raise LoopyError("%s does not support complex numbers" + % name) + + updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in range(-1, + num_args)) + + return ( + self.copy(name_in_target=name, + arg_id_to_dtype=updated_arg_id_to_dtype), + program_callables_info) + + if name in VECTOR_LITERAL_FUNCS: + base_tp_name, dtype, count = VECTOR_LITERAL_FUNCS[name] + + for id in arg_id_to_dtype: + if not -1 <= id < count: + raise LoopyError("%s can take only %d arguments." % (name, + num_args)) + + for i in range(count): + if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + program_callables_info) + + updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in + range(count)) + updated_arg_id_to_dtype[-1] = OpenCLTarget().vector_dtype( + NumpyType(dtype), count) + + return ( + self.copy(name_in_target="(%s%d) " % (base_tp_name, count), + arg_id_to_dtype=updated_arg_id_to_dtype), + program_callables_info) + + # does not satisfy any of the conditions needed for specialization. + # hence just returning a copy of the callable. 
+ return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + program_callables_info) - # OpenCL has min(), max() for integer types - if name in ["max", "min"] and len(arg_dtypes) == 2: - dtype = np.find_common_type( - [], [dtype.numpy_dtype for dtype in arg_dtypes]) - - if dtype.kind == "i": - result_dtype = NumpyType(dtype) - return CallMangleInfo( - target_name=name, - result_dtypes=(result_dtype,), - arg_dtypes=2*(result_dtype,)) - - if name == "dot": - scalar_dtype, offset, field_name = arg_dtypes[0].numpy_dtype.fields["s0"] - return CallMangleInfo( - target_name=name, - result_dtypes=(NumpyType(scalar_dtype),), - arg_dtypes=(arg_dtypes[0],)*2) - - if name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS: - num_args = _CL_SIMPLE_MULTI_ARG_FUNCTIONS[name] - if len(arg_dtypes) != num_args: - raise LoopyError("%s takes %d arguments (%d received)" - % (name, num_args, len(arg_dtypes))) - - dtype = np.find_common_type( - [], [dtype.numpy_dtype for dtype in arg_dtypes]) - - if dtype.kind == "c": - raise LoopyError("%s does not support complex numbers" - % name) - - result_dtype = NumpyType(dtype) - return CallMangleInfo( - target_name=name, - result_dtypes=(result_dtype,), - arg_dtypes=(result_dtype,)*num_args) - - if name in VECTOR_LITERAL_FUNCS: - base_tp_name, dtype, count = VECTOR_LITERAL_FUNCS[name] - - if count != len(arg_dtypes): - return None - - return CallMangleInfo( - target_name="(%s%d) " % (base_tp_name, count), - result_dtypes=(kernel.target.vector_dtype( - NumpyType(dtype), count),), - arg_dtypes=(NumpyType(dtype),)*count) + +def scope_opencl_functions(target, identifier): + """ + Returns an instance of :class:`InKernelCallable` if the function defined by + *identifier* is known in OpenCL. 
+ """ + opencl_function_ids = set(["max", "min", "dot"]) | set( + _CL_SIMPLE_MULTI_ARG_FUNCTIONS) | set(VECTOR_LITERAL_FUNCS) + + if identifier in opencl_function_ids: + return OpenCLCallable(name=identifier) return None @@ -280,6 +356,7 @@ def opencl_preamble_generator(preamble_info): from loopy.tools import remove_common_indentation kernel = preamble_info.kernel + yield ("00_declare_gid_lid", remove_common_indentation(""" #define lid(N) ((%(idx_ctype)s) get_local_id(N)) @@ -365,13 +442,10 @@ class OpenCLTarget(CTarget): class OpenCLCASTBuilder(CASTBuilder): # {{{ library - def function_manglers(self): + def function_scopers(self): return ( - [ - opencl_function_mangler, - partial(c_math_mangler, modify_name=False) - ] + - super(OpenCLCASTBuilder, self).function_manglers()) + [scope_opencl_functions] + super( + OpenCLCASTBuilder, self).function_scopers()) def symbol_manglers(self): return ( @@ -380,13 +454,10 @@ class OpenCLCASTBuilder(CASTBuilder): ]) def preamble_generators(self): - from loopy.library.reduction import reduction_preamble_generator return ( super(OpenCLCASTBuilder, self).preamble_generators() + [ - opencl_preamble_generator, - reduction_preamble_generator, - ]) + opencl_preamble_generator]) # }}} @@ -399,6 +470,11 @@ class OpenCLCASTBuilder(CASTBuilder): from loopy.target.c import FunctionDeclarationWrapper assert isinstance(fdecl, FunctionDeclarationWrapper) + if not codegen_state.kernel.is_called_from_host: + # auxiliary kernels need not mention opencl speicific qualifiers + # for a functions signature + return fdecl + fdecl = fdecl.subdecl from cgen.opencl import CLKernel, CLRequiredWorkGroupSize @@ -407,7 +483,8 @@ class OpenCLCASTBuilder(CASTBuilder): from loopy.schedule import get_insn_ids_for_block_at _, local_sizes = codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs( get_insn_ids_for_block_at( - codegen_state.kernel.schedule, schedule_index)) + codegen_state.kernel.schedule, schedule_index), + codegen_state.program_callables_info) 
from loopy.symbolic import get_dependencies if not get_dependencies(local_sizes): diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 73e8e009..03ba2693 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -31,12 +31,12 @@ from six.moves import range import numpy as np -from loopy.kernel.data import CallMangleInfo from loopy.target.opencl import OpenCLTarget, OpenCLCASTBuilder from loopy.target.python import PythonASTBuilderBase from loopy.types import NumpyType -from loopy.diagnostic import LoopyError, warn_with_kernel +from loopy.diagnostic import LoopyError, warn_with_kernel, LoopyTypeError from warnings import warn +from loopy.kernel.function_interface import ScalarCallable import logging logger = logging.getLogger(__name__) @@ -134,7 +134,7 @@ def adjust_local_temp_var_storage(kernel, device): # {{{ check sizes against device properties -def check_sizes(kernel, device): +def check_sizes(kernel, program_callables_info, device): import loopy as lp from loopy.diagnostic import LoopyAdvisory, LoopyError @@ -151,7 +151,8 @@ def check_sizes(kernel, device): if isinstance(arg, lp.ValueArg) and arg.approximately is not None: parameters[arg.name] = arg.approximately - glens, llens = kernel.get_grid_size_upper_bounds_as_exprs() + glens, llens = ( + kernel.get_grid_size_upper_bounds_as_exprs(program_callables_info)) if (max(len(glens), len(llens)) > device.max_work_item_dimensions): @@ -199,37 +200,89 @@ def check_sizes(kernel, device): # }}} -def pyopencl_function_mangler(target, name, arg_dtypes): - if len(arg_dtypes) == 1 and isinstance(name, str): - arg_dtype, = arg_dtypes +# {{{ pyopencl function scopers - if arg_dtype.is_complex(): - if arg_dtype.numpy_dtype == np.complex64: - tpname = "cfloat" - elif arg_dtype.numpy_dtype == np.complex128: - tpname = "cdouble" +class PyOpenCLCallable(ScalarCallable): + """ + Records information about the callables which are not covered by + :class:`loopy.target.opencl.OpenCLCallable` + """ + 
def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info): + + name = self.name + + for id in arg_id_to_dtype: + # since all the below functions are single arg. + if not -1 <= id <= 0: + raise LoopyError("%s can only take one argument." % name) + + if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + program_callables_info) + + dtype = arg_id_to_dtype[0] + + if name in ["real", "imag", "abs"]: + if dtype.is_complex(): + if dtype.numpy_dtype == np.complex64: + tpname = "cfloat" + elif dtype.numpy_dtype == np.complex128: + tpname = "cdouble" + else: + raise LoopyTypeError("unexpected complex type '%s'" % dtype) + + return ( + self.copy(name_in_target="%s_%s" % (tpname, name), + arg_id_to_dtype={0: dtype, -1: NumpyType( + np.dtype(dtype.numpy_dtype.type(0).real))}), + program_callables_info) + + if name in ["sqrt", "exp", "log", + "sin", "cos", "tan", + "sinh", "cosh", "tanh", + "conj", "abs"]: + if dtype.is_complex(): + # function parameters are complex. 
+ if dtype.numpy_dtype == np.complex64: + tpname = "cfloat" + elif dtype.numpy_dtype == np.complex128: + tpname = "cdouble" + else: + raise LoopyTypeError("unexpected complex type '%s'" % dtype) + + return ( + self.copy(name_in_target="%s_%s" % (tpname, name), + arg_id_to_dtype={0: dtype, -1: dtype}), + program_callables_info) else: - raise RuntimeError("unexpected complex type '%s'" % arg_dtype) - - if name in ["sqrt", "exp", "log", - "sin", "cos", "tan", - "sinh", "cosh", "tanh", - "conj"]: - return CallMangleInfo( - target_name="%s_%s" % (tpname, name), - result_dtypes=(arg_dtype,), - arg_dtypes=(arg_dtype,)) - - if name in ["real", "imag", "abs"]: - return CallMangleInfo( - target_name="%s_%s" % (tpname, name), - result_dtypes=(NumpyType( - np.dtype(arg_dtype.numpy_dtype.type(0).real)), - ), - arg_dtypes=(arg_dtype,)) + # function calls for floating parameters. + numpy_dtype = dtype.numpy_dtype + if numpy_dtype.kind in ('u', 'i'): + dtype = dtype.copy(numpy_dtype=np.float32) + if name == 'abs': + name = 'fabs' + return ( + self.copy(name_in_target=name, + arg_id_to_dtype={0: dtype, -1: dtype}), + program_callables_info) + + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + program_callables_info) + + +def pyopencl_function_scoper(target, identifier): + if identifier in ["sqrt", "exp", "log", "sin", "cos", "tan", "sinh", "cosh", + "tanh", "conj", "real", "imag", "abs"]: + return PyOpenCLCallable(name=identifier) return None +# }}} + # {{{ preamble generator @@ -344,8 +397,8 @@ class PyOpenCLTarget(OpenCLTarget): kernel = adjust_local_temp_var_storage(kernel, self.device) return kernel - def pre_codegen_check(self, kernel): - check_sizes(kernel, self.device) + def pre_codegen_check(self, kernel, program_callables_info): + check_sizes(kernel, program_callables_info, self.device) def get_host_ast_builder(self): return PyOpenCLPythonASTBuilder(self) @@ -739,19 +792,15 @@ class PyOpenCLCASTBuilder(OpenCLCASTBuilder): # {{{ library - def 
function_manglers(self): - from loopy.library.random123 import random123_function_mangler + def function_scopers(self): + from loopy.library.random123 import random123_function_scoper return ( - super(PyOpenCLCASTBuilder, self).function_manglers() + [ - pyopencl_function_mangler, - random123_function_mangler - ]) + [pyopencl_function_scoper, random123_function_scoper] + super( + PyOpenCLCASTBuilder, self).function_scopers()) def preamble_generators(self): - from loopy.library.random123 import random123_preamble_generator return ([ pyopencl_preamble_generator, - random123_preamble_generator, ] + super(PyOpenCLCASTBuilder, self).preamble_generators()) # }}} diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index 27be6198..380ab1d9 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -151,9 +151,9 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): # {{{ generate invocation - def generate_invocation(self, gen, kernel_name, args, - kernel, implemented_data_info): - if kernel.options.cl_exec_manage_array_events: + def generate_invocation(self, gen, program_name, args, + program, implemented_data_info): + if program.root_kernel.options.cl_exec_manage_array_events: gen(""" if wait_for is None: wait_for = [] @@ -169,20 +169,21 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): gen("") - gen("_lpy_evt = {kernel_name}({args})" + gen("_lpy_evt = {program_name}({args})" .format( - kernel_name=kernel_name, + program_name=program_name, args=", ".join( ["_lpy_cl_kernels", "queue"] + args + ["wait_for=wait_for"]))) - if kernel.options.cl_exec_manage_array_events: + if program.root_kernel.options.cl_exec_manage_array_events: gen("") from loopy.kernel.data import ArrayArg for arg in implemented_data_info: if (issubclass(arg.arg_class, ArrayArg) - and arg.base_name in kernel.get_written_variables()): + and arg.base_name in ( + 
program.root_kernel.get_written_variables())): gen("{arg_name}.add_event(_lpy_evt)".format(arg_name=arg.name)) # }}} @@ -190,7 +191,7 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): # {{{ def generate_output_handler( - self, gen, options, kernel, implemented_data_info): + self, gen, options, program, implemented_data_info): from loopy.kernel.data import KernelArgument @@ -207,7 +208,8 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): if not issubclass(arg.arg_class, KernelArgument): continue - is_written = arg.base_name in kernel.get_written_variables() + is_written = arg.base_name in ( + program.root_kernel.get_written_variables()) if is_written: gen("%s = %s.get(queue=queue)" % (arg.name, arg.name)) @@ -218,12 +220,13 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): % ", ".join("\"%s\": %s" % (arg.name, arg.name) for arg in implemented_data_info if issubclass(arg.arg_class, KernelArgument) - if arg.base_name in kernel.get_written_variables())) + if arg.base_name in + program.root_kernel.get_written_variables())) else: out_args = [arg for arg in implemented_data_info if issubclass(arg.arg_class, KernelArgument) - if arg.base_name in kernel.get_written_variables()] + if arg.base_name in program.root_kernel.get_written_variables()] if out_args: gen("return _lpy_evt, (%s,)" % ", ".join(arg.name for arg in out_args)) @@ -252,7 +255,7 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): .. automethod:: __call__ """ - def __init__(self, context, kernel): + def __init__(self, context, program): """ :arg context: a :class:`pyopencl.Context` :arg kernel: may be a loopy.LoopKernel, a generator returning kernels @@ -261,40 +264,40 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): specific arguments. 
""" - super(PyOpenCLKernelExecutor, self).__init__(kernel) + super(PyOpenCLKernelExecutor, self).__init__(program) self.context = context from loopy.target.pyopencl import PyOpenCLTarget - if isinstance(kernel.target, PyOpenCLTarget): - self.kernel = kernel.copy(target=PyOpenCLTarget(context.devices[0])) + if isinstance(program.target, PyOpenCLTarget): + self.program = program.copy(target=PyOpenCLTarget(context.devices[0])) def get_invoker_uncached(self, kernel, codegen_result): generator = PyOpenCLExecutionWrapperGenerator() return generator(kernel, codegen_result) @memoize_method - def kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): - kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype_set) + def program_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): + program = self.get_typed_and_scheduled_program(arg_to_dtype_set) from loopy.codegen import generate_code_v2 from loopy.target.execution import get_highlighted_code - codegen_result = generate_code_v2(kernel) + codegen_result = generate_code_v2(program) dev_code = codegen_result.device_code() - if self.kernel.options.write_cl: + if self.program.root_kernel.options.write_cl: output = dev_code - if self.kernel.options.highlight_cl: + if self.program.root_kernel.options.highlight_cl: output = get_highlighted_code(output) - if self.kernel.options.write_cl is True: + if self.program.root_kernel.options.write_cl is True: print(output) else: - with open(self.kernel.options.write_cl, "w") as outf: + with open(self.program.root_kernel.options.write_cl, "w") as outf: outf.write(output) - if self.kernel.options.edit_cl: + if self.program.root_kernel.options.edit_cl: from pytools import invoke_editor dev_code = invoke_editor(dev_code, "code.cl") @@ -302,17 +305,17 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): cl_program = ( cl.Program(self.context, dev_code) - .build(options=kernel.options.cl_build_options)) + .build(options=program.root_kernel.options.cl_build_options)) cl_kernels = 
_Kernels() for dp in codegen_result.device_programs: setattr(cl_kernels, dp.name, getattr(cl_program, dp.name)) return _KernelInfo( - kernel=kernel, + program=program, cl_kernels=cl_kernels, implemented_data_info=codegen_result.implemented_data_info, - invoker=self.get_invoker(kernel, codegen_result)) + invoker=self.get_invoker(program, codegen_result)) def __call__(self, queue, **kwargs): """ @@ -347,10 +350,10 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): kwargs = self.packing_controller.unpack(kwargs) - kernel_info = self.kernel_info(self.arg_to_dtype_set(kwargs)) + program_info = self.program_info(self.arg_to_dtype_set(kwargs)) - return kernel_info.invoker( - kernel_info.cl_kernels, queue, allocator, wait_for, + return program_info.invoker( + program_info.cl_kernels, queue, allocator, wait_for, out_host, **kwargs) # }}} diff --git a/loopy/target/python.py b/loopy/target/python.py index ce04986d..cd6e6116 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -44,7 +44,8 @@ class ExpressionToPythonMapper(StringifyMapper): self.codegen_state = codegen_state if type_inf_mapper is None: - type_inf_mapper = TypeInferenceMapper(self.kernel) + type_inf_mapper = TypeInferenceMapper(self.kernel, + self.codegen_state.program_callables_info) self.type_inf_mapper = type_inf_mapper def handle_unsupported_expression(self, victim, enclosing_prec): @@ -82,47 +83,37 @@ class ExpressionToPythonMapper(StringifyMapper): expr, enclosing_prec) def map_call(self, expr, enclosing_prec): - from pymbolic.primitives import Variable from pymbolic.mapper.stringifier import PREC_NONE - identifier = expr.function + identifier_name = self.codegen_state.program_callables_info[ + expr.function.name].name - if identifier.name in ["indexof", "indexof_vec"]: + if identifier_name in ["indexof", "indexof_vec"]: raise LoopyError( "indexof, indexof_vec not yet supported in Python") - if isinstance(identifier, Variable): - identifier = identifier.name - - par_dtypes = 
tuple(self.type_inf_mapper(par) for par in expr.parameters) + from loopy.kernel.function_interface import ManglerCallable + in_knl_callable = self.codegen_state.program_callables_info[ + expr.function.name] + if isinstance(in_knl_callable, ManglerCallable): + from loopy.codegen import SeenFunction + mangle_result = in_knl_callable.mangle_result(self.kernel) + self.codegen_state.seen_functions.add( + SeenFunction(identifier_name, + mangle_result.target_name, + mangle_result.arg_dtypes)) str_parameters = None + number_of_assignees = len([key for key in + in_knl_callable.arg_id_to_dtype.keys() if key < 0]) - mangle_result = self.kernel.mangle_function( - identifier, par_dtypes, - ast_builder=self.codegen_state.ast_builder) - - if mangle_result is None: - raise RuntimeError("function '%s' unknown--" - "maybe you need to register a function mangler?" - % identifier) - - if len(mangle_result.result_dtypes) != 1: + if number_of_assignees != 1: raise LoopyError("functions with more or fewer than one return value " "may not be used in an expression") - str_parameters = [ - self.rec(par, PREC_NONE) - for par, par_dtype, tgt_dtype in zip( - expr.parameters, par_dtypes, mangle_result.arg_dtypes)] + str_parameters = [self.rec(par, PREC_NONE) for par in expr.parameters] - from loopy.codegen import SeenFunction - self.codegen_state.seen_functions.add( - SeenFunction(identifier, - mangle_result.target_name, - mangle_result.arg_dtypes or par_dtypes)) - - return "%s(%s)" % (mangle_result.target_name, ", ".join(str_parameters)) + return "%s(%s)" % (in_knl_callable.name_in_target, ", ".join(str_parameters)) def map_group_hw_index(self, expr, enclosing_prec): raise LoopyError("plain Python does not have group hw axes") @@ -189,11 +180,11 @@ class PythonASTBuilderBase(ASTBuilderBase): # {{{ code generation guts - def function_manglers(self): + def function_scopers(self): + from loopy.target.c import scope_c_math_functions return ( - super(PythonASTBuilderBase, self).function_manglers() 
+ [ - _numpy_single_arg_function_mangler, - ]) + super(PythonASTBuilderBase, self).function_scopers() + + [scope_c_math_functions]) def preamble_generators(self): return ( diff --git a/loopy/tools.py b/loopy/tools.py index 8c5d3639..b243a794 100644 --- a/loopy/tools.py +++ b/loopy/tools.py @@ -73,7 +73,8 @@ class LoopyKeyBuilder(KeyBuilderBase): def update_for_dict(self, key_hash, key): # Order matters for the hash--insert in sorted order. - for dict_key in sorted(six.iterkeys(key)): + for dict_key in sorted(six.iterkeys(key), key=lambda obj: + type(obj).__name__ + str(obj)): self.rec(key_hash, (dict_key, key[dict_key])) update_for_defaultdict = update_for_dict diff --git a/loopy/transform/add_barrier.py b/loopy/transform/add_barrier.py index cfbbd56e..38bb2185 100644 --- a/loopy/transform/add_barrier.py +++ b/loopy/transform/add_barrier.py @@ -26,6 +26,8 @@ THE SOFTWARE. from loopy.kernel.instruction import BarrierInstruction from loopy.match import parse_match from loopy.transform.instruction import add_dependency +from loopy.program import iterate_over_kernels_if_given_program +from loopy.kernel import LoopKernel __doc__ = """ .. currentmodule:: loopy @@ -36,8 +38,10 @@ __doc__ = """ # {{{ add_barrier -def add_barrier(knl, insn_before="", insn_after="", id_based_on=None, - tags=None, synchronization_kind="global", mem_kind=None): +@iterate_over_kernels_if_given_program +def add_barrier(knl, insn_before="", insn_after="", + id_based_on=None, tags=None, synchronization_kind="global", + mem_kind=None): """Takes in a kernel that needs to be added a barrier and returns a kernel which has a barrier inserted into it. It takes input of 2 instructions and then adds a barrier in between those 2 instructions. The expressions can @@ -55,6 +59,8 @@ def add_barrier(knl, insn_before="", insn_after="", id_based_on=None, for "global" bariers. 
If not supplied, defaults to :arg:`synchronization_kind` """ + assert isinstance(knl, LoopKernel) + if mem_kind is None: mem_kind = synchronization_kind @@ -76,7 +82,7 @@ def add_barrier(knl, insn_before="", insn_after="", id_based_on=None, mem_kind=mem_kind) new_knl = knl.copy(instructions=knl.instructions + [barrier_to_add]) - new_knl = add_dependency(kernel=new_knl, + new_knl = add_dependency(new_knl, insn_match=insn_after, depends_on="id:"+id) diff --git a/loopy/transform/arithmetic.py b/loopy/transform/arithmetic.py index b7f47c38..3df86e7a 100644 --- a/loopy/transform/arithmetic.py +++ b/loopy/transform/arithmetic.py @@ -27,9 +27,13 @@ import six from loopy.diagnostic import LoopyError +from loopy.program import iterate_over_kernels_if_given_program +from loopy.kernel import LoopKernel + # {{{ fold constants +@iterate_over_kernels_if_given_program def fold_constants(kernel): from loopy.symbolic import ConstantFoldingMapper cfm = ConstantFoldingMapper() @@ -53,7 +57,9 @@ def fold_constants(kernel): # {{{ collect_common_factors_on_increment # thus far undocumented +@iterate_over_kernels_if_given_program def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()): + assert isinstance(kernel, LoopKernel) # FIXME: Does not understand subst rules for now if kernel.substitutions: from loopy.transform.subst import expand_subst diff --git a/loopy/transform/batch.py b/loopy/transform/batch.py index f0b9814c..97054700 100644 --- a/loopy/transform/batch.py +++ b/loopy/transform/batch.py @@ -29,6 +29,9 @@ from loopy.symbolic import (RuleAwareIdentityMapper, SubstitutionRuleMappingCont from loopy.kernel.data import ValueArg, ArrayArg import islpy as isl +from loopy.program import iterate_over_kernels_if_given_program + + __doc__ = """ .. 
currentmodule:: loopy @@ -102,8 +105,9 @@ def _add_unique_dim_name(name, dim_names): return (ng(name),) + tuple(dim_names) -def to_batched(knl, nbatches, batch_varying_args, batch_iname_prefix="ibatch", - sequential=False): +@iterate_over_kernels_if_given_program +def to_batched(knl, nbatches, batch_varying_args, + batch_iname_prefix="ibatch", sequential=False): """Takes in a kernel that carries out an operation and returns a kernel that carries out a batch of these operations. .. note:: diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py index 801da4c1..57c4397f 100644 --- a/loopy/transform/buffer.py +++ b/loopy/transform/buffer.py @@ -33,6 +33,9 @@ from pytools.persistent_dict import WriteOncePersistentDict from loopy.tools import LoopyKeyBuilder, PymbolicExpressionHashWrapper from loopy.version import DATA_MODEL_VERSION from loopy.diagnostic import LoopyError +from loopy.program import Program +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import ScalarCallable, CallableKernel from pymbolic import var @@ -130,10 +133,10 @@ buffer_array_cache = WriteOncePersistentDict( # Adding an argument? also add something to the cache_key below. -def buffer_array(kernel, var_name, buffer_inames, init_expression=None, - store_expression=None, within=None, default_tag="l.auto", - temporary_scope=None, temporary_is_local=None, - fetch_bounding_box=False): +def buffer_array_for_single_kernel(kernel, program_callables_info, var_name, + buffer_inames, init_expression=None, store_expression=None, + within=None, default_tag="l.auto", temporary_scope=None, + temporary_is_local=None, fetch_bounding_box=False): """Replace accesses to *var_name* with ones to a temporary, which is created and acts as a buffer. To perform this transformation, the access footprint to *var_name* is determined and a temporary of a suitable @@ -169,6 +172,8 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, fetched. 
""" + assert isinstance(kernel, LoopKernel) + # {{{ unify temporary_scope / temporary_is_local from loopy.kernel.data import AddressSpace @@ -240,7 +245,8 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, from loopy.preprocess import prepare_for_caching key_kernel = prepare_for_caching(kernel) - cache_key = (key_kernel, var_name, tuple(buffer_inames), + cache_key = (key_kernel, var_name, + tuple(buffer_inames), PymbolicExpressionHashWrapper(init_expression), PymbolicExpressionHashWrapper(store_expression), within, default_tag, temporary_scope, fetch_bounding_box) @@ -528,7 +534,7 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, kernel = tag_inames(kernel, new_iname_to_tag) from loopy.kernel.tools import assign_automatic_axes - kernel = assign_automatic_axes(kernel) + kernel = assign_automatic_axes(kernel, program_callables_info) if CACHING_ENABLED: from loopy.preprocess import prepare_for_caching @@ -537,4 +543,29 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, return kernel + +def buffer_array(program, *args, **kwargs): + assert isinstance(program, Program) + + new_resolved_functions = {} + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = buffer_array_for_single_kernel( + in_knl_callable.subkernel, program.program_callables_info, + *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." 
% ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_program_callables_info = program.program_callables_info.copy( + resolved_functions=new_resolved_functions) + return program.copy(program_callables_info=new_program_callables_info) + # vim: foldmethod=marker diff --git a/loopy/transform/data.py b/loopy/transform/data.py index 5b1ee6cc..5f4f2f2a 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -30,6 +30,9 @@ from islpy import dim_type from loopy.kernel.data import ImageArg from pytools import MovedFunctionDeprecationWrapper +from loopy.program import Program, iterate_over_kernels_if_given_program +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import CallableKernel, ScalarCallable # {{{ convenience: add_prefetch @@ -140,7 +143,8 @@ class _not_provided: # noqa: N801 pass -def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, +def add_prefetch_for_single_kernel(kernel, program_callables_info, var_name, + sweep_inames=[], dim_arg_names=None, # "None" is a valid value here, distinct from the default. default_tag=_not_provided, @@ -239,6 +243,7 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, This function internally uses :func:`extract_subst` and :func:`precompute`. """ + assert isinstance(kernel, LoopKernel) # {{{ fish indexing out of var_name and into footprint_subscripts @@ -328,9 +333,9 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, # precompute module, but precompute acutally uses that to adjust its # warning message. 
- from loopy.transform.precompute import precompute - new_kernel = precompute(kernel, subst_use, sweep_inames, - precompute_inames=dim_arg_names, + from loopy.transform.precompute import precompute_for_single_kernel + new_kernel = precompute_for_single_kernel(kernel, program_callables_info, + subst_use, sweep_inames, precompute_inames=dim_arg_names, default_tag=default_tag, dtype=arg.dtype, fetch_bounding_box=fetch_bounding_box, temporary_name=temporary_name, @@ -363,6 +368,31 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, else: return new_kernel + +def add_prefetch(program, *args, **kwargs): + assert isinstance(program, Program) + + new_resolved_functions = {} + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = add_prefetch_for_single_kernel( + in_knl_callable.subkernel, program.program_callables_info, + *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." % ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_program_callables_info = program.program_callables_info.copy( + resolved_functions=new_resolved_functions) + return program.copy(program_callables_info=new_program_callables_info) + # }}} @@ -385,6 +415,7 @@ def change_arg_to_image(knl, name): # {{{ tag array axes +@iterate_over_kernels_if_given_program def tag_array_axes(knl, ary_names, dim_tags): """ .. versionchanged:: 2016.2 @@ -414,13 +445,15 @@ def tag_array_axes(knl, ary_names, dim_tags): return knl -tag_data_axes = MovedFunctionDeprecationWrapper(tag_array_axes) +tag_data_axes = ( + MovedFunctionDeprecationWrapper(tag_array_axes)) # }}} # {{{ set_array_axis_names +@iterate_over_kernels_if_given_program def set_array_axis_names(kernel, ary_names, dim_names): """ .. 
versionchanged:: 2016.2 @@ -445,13 +478,15 @@ def set_array_axis_names(kernel, ary_names, dim_names): return kernel -set_array_dim_names = MovedFunctionDeprecationWrapper(set_array_axis_names) +set_array_dim_names = (MovedFunctionDeprecationWrapper( + set_array_axis_names)) # }}} # {{{ remove_unused_arguments +@iterate_over_kernels_if_given_program def remove_unused_arguments(knl): new_args = [] @@ -493,6 +528,7 @@ def remove_unused_arguments(knl): # {{{ alias_temporaries +@iterate_over_kernels_if_given_program def alias_temporaries(knl, names, base_name_prefix=None, synchronize_for_exclusive_use=True): """Sets all temporaries given by *names* to be backed by a single piece of @@ -577,11 +613,14 @@ def alias_temporaries(knl, names, base_name_prefix=None, # {{{ set argument order +@iterate_over_kernels_if_given_program def set_argument_order(kernel, arg_names): """ :arg arg_names: A list (or comma-separated string) or argument names. All arguments must be in this list. """ + #FIXME: @inducer -- shoulld this only affect the root kernel, or should it + # take a within? if isinstance(arg_names, str): arg_names = arg_names.split(",") @@ -610,6 +649,7 @@ def set_argument_order(kernel, arg_names): # {{{ rename argument +@iterate_over_kernels_if_given_program def rename_argument(kernel, old_name, new_name, existing_ok=False): """ .. 
versionadded:: 2016.2 @@ -655,6 +695,7 @@ def rename_argument(kernel, old_name, new_name, existing_ok=False): # {{{ set temporary scope +@iterate_over_kernels_if_given_program def set_temporary_scope(kernel, temp_var_names, scope): """ :arg temp_var_names: a container with membership checking, @@ -696,6 +737,7 @@ def set_temporary_scope(kernel, temp_var_names, scope): # {{{ reduction_arg_to_subst_rule +@iterate_over_kernels_if_given_program def reduction_arg_to_subst_rule(knl, inames, insn_match=None, subst_rule_name=None): if isinstance(inames, str): inames = [s.strip() for s in inames.split(",")] diff --git a/loopy/transform/diff.py b/loopy/transform/diff.py index d4dcb370..54d06605 100644 --- a/loopy/transform/diff.py +++ b/loopy/transform/diff.py @@ -33,6 +33,7 @@ import loopy as lp from loopy.symbolic import RuleAwareIdentityMapper, SubstitutionRuleMappingContext from loopy.isl_helpers import make_slab from loopy.diagnostic import LoopyError +from loopy.kernel import LoopKernel # {{{ diff mapper @@ -370,6 +371,8 @@ def diff_kernel(knl, diff_outputs, by, diff_iname_prefix="diff_i", *diff_context.by_name*, or *None* if no dependency exists. 
""" + assert isinstance(knl, LoopKernel) + from loopy.kernel.creation import apply_single_writer_depencency_heuristic knl = apply_single_writer_depencency_heuristic(knl, warn_if_used=True) diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py index 49e30a75..d43ce025 100644 --- a/loopy/transform/fusion.py +++ b/loopy/transform/fusion.py @@ -31,6 +31,10 @@ from islpy import dim_type from loopy.diagnostic import LoopyError from pymbolic import var +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import CallableKernel +from loopy.program import rename_resolved_functions_in_a_single_kernel + def _apply_renames_in_exprs(kernel, var_renames): from loopy.symbolic import ( @@ -287,7 +291,7 @@ def _fuse_two_kernels(knla, knlb): # }}} -def fuse_kernels(kernels, suffixes=None, data_flow=None): +def fuse_loop_kernels(kernels, suffixes=None, data_flow=None): """Return a kernel that performs all the operations in all entries of *kernels*. @@ -331,6 +335,8 @@ def fuse_kernels(kernels, suffixes=None, data_flow=None): *data_flow* was added in version 2016.2 """ + + assert all(isinstance(knl, LoopKernel) for knl in kernels) kernels = list(kernels) if data_flow is None: @@ -411,4 +417,52 @@ def fuse_kernels(kernels, suffixes=None, data_flow=None): return result + +def fuse_kernels(programs, suffixes=None, data_flow=None): + main_prog_callables_info = ( + programs[0].program_callables_info.with_edit_callables_mode()) + old_root_kernel_callable = ( + programs[0].program_callables_info[programs[0].name]) + kernels = [programs[0].root_kernel] + + # removing the callable collisions that maybe present + for prog in programs[1:]: + root_kernel = prog.root_kernel + renames_needed = {} + for old_func_id, in_knl_callable in prog.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + if in_knl_callable.name != prog.name: + raise LoopyError("fuse_kernels cannot fuse programs with " + "multiple callable kernels.") + continue + 
num_times_called = ( + prog.program_callables_info.num_times_callables_called[ + old_func_id]) + for i in range(num_times_called): + main_prog_callables_info, new_func_id = ( + main_prog_callables_info.with_callables(var(old_func_id), + in_knl_callable, True)) + + if old_func_id != new_func_id: + renames_needed[old_func_id] = new_func_id + + if renames_needed: + root_kernel = rename_resolved_functions_in_a_single_kernel( + root_kernel, renames_needed) + + kernels.append(root_kernel) + + new_root_kernel = fuse_loop_kernels(kernels, suffixes, data_flow) + new_root_kernel_callable = old_root_kernel_callable.copy( + subkernel=new_root_kernel.copy(name=programs[0].name)) + + main_prog_callables_info, _ = main_prog_callables_info.with_callable( + var(programs[0].name), new_root_kernel_callable) + + main_prog_callables_info = ( + main_prog_callables_info.with_exit_edit_callables_mode()) + + return programs[0].copy( + program_callables_info=main_prog_callables_info) + # vim: foldmethod=marker diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 2b618a46..93f6c53e 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -34,6 +34,10 @@ from loopy.symbolic import ( SubstitutionRuleMappingContext) from loopy.diagnostic import LoopyError +from loopy.program import iterate_over_kernels_if_given_program +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import CallableKernel, ScalarCallable + __doc__ = """ .. currentmodule:: loopy @@ -93,6 +97,7 @@ def set_loop_priority(kernel, loop_priority): return kernel.copy(loop_priority=frozenset([loop_priority])) +@iterate_over_kernels_if_given_program def prioritize_loops(kernel, loop_priority): """Indicates the textual order in which loops should be entered in the kernel code. Note that this priority has an advisory role only. 
If the @@ -107,6 +112,8 @@ def prioritize_loops(kernel, loop_priority): :arg: an iterable of inames, or, for brevity, a comma-separated string of inames """ + + assert isinstance(kernel, LoopKernel) if isinstance(loop_priority, str): loop_priority = tuple(s.strip() for s in loop_priority.split(",") if s.strip()) @@ -299,13 +306,15 @@ def _split_iname_backend(kernel, split_iname, kernel = tag_inames(kernel, {outer_iname: existing_tag, inner_iname: existing_tag}) - return tag_inames(kernel, {outer_iname: outer_tag, inner_iname: inner_tag}) + return tag_inames(kernel, {outer_iname: outer_tag, + inner_iname: inner_tag}) # }}} # {{{ split iname +@iterate_over_kernels_if_given_program def split_iname(kernel, split_iname, inner_length, outer_iname=None, inner_iname=None, outer_tag=None, inner_tag=None, @@ -331,6 +340,8 @@ def split_iname(kernel, split_iname, inner_length, :arg within: a stack match as understood by :func:`loopy.match.parse_stack_match`. """ + assert isinstance(kernel, LoopKernel) + def make_new_loop_index(inner, outer): return inner + outer*inner_length @@ -347,6 +358,7 @@ def split_iname(kernel, split_iname, inner_length, # {{{ chunk iname +@iterate_over_kernels_if_given_program def chunk_iname(kernel, split_iname, num_chunks, outer_iname=None, inner_iname=None, outer_tag=None, inner_tag=None, @@ -481,6 +493,7 @@ class _InameJoiner(RuleAwareSubstitutionMapper): return super(_InameJoiner, self).map_reduction(expr, expn_state) +@iterate_over_kernels_if_given_program def join_inames(kernel, inames, new_iname=None, tag=None, within=None): """ :arg inames: fastest varying last @@ -625,7 +638,9 @@ def untag_inames(kernel, iname_to_untag, tag_type): # {{{ tag inames -def tag_inames(kernel, iname_to_tag, force=False, ignore_nonexistent=False): +@iterate_over_kernels_if_given_program +def tag_inames(kernel, iname_to_tag, force=False, + ignore_nonexistent=False): """Tag an iname :arg iname_to_tag: a list of tuples ``(iname, new_tag)``. 
*new_tag* is given @@ -804,7 +819,9 @@ class _InameDuplicator(RuleAwareIdentityMapper): return insn.copy(within_inames=new_fid) -def duplicate_inames(knl, inames, within, new_inames=None, suffix=None, +@iterate_over_kernels_if_given_program +def duplicate_inames(knl, inames, within, new_inames=None, + suffix=None, tags={}): """ :arg within: a stack match as understood by @@ -966,7 +983,7 @@ def _get_iname_duplication_options(insn_iname_sets, old_common_inames=frozenset( # If partitioning was empty, we have recursed successfully and yield nothing -def get_iname_duplication_options(knl, use_boostable_into=False): +def get_iname_duplication_options_for_single_kernel(knl, use_boostable_into=False): """List options for duplication of inames, if necessary for schedulability :returns: a generator listing all options to duplicate inames, if duplication @@ -1032,7 +1049,7 @@ def get_iname_duplication_options(knl, use_boostable_into=False): # If we find a duplication option and to not use boostable_into # information, we restart this generator with use_boostable_into=True if not use_boostable_into and not knl.options.ignore_boostable_into: - for option in get_iname_duplication_options(knl, True): + for option in get_iname_duplication_options_for_single_kernel(knl, True): yield option # Emit a warning that we needed boostable_into @@ -1060,18 +1077,42 @@ def get_iname_duplication_options(knl, use_boostable_into=False): yield iname, within -def has_schedulable_iname_nesting(knl): +def get_iname_duplication_options(program, use_boostable_into=False): + for in_knl_callable in program.program_callables_info.values(): + if isinstance(in_knl_callable, CallableKernel): + for option in get_iname_duplication_options_for_single_kernel( + in_knl_callable.subkernel, use_boostable_into): + yield option + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of in kernel callable %s." 
+ % (type(in_knl_callable))) + + return + + +def has_schedulable_iname_nesting_for_single_kernel(knl): """ :returns: a :class:`bool` indicating whether this kernel needs an iname duplication in order to be schedulable. """ - return not bool(next(get_iname_duplication_options(knl), False)) + return not bool(next(get_iname_duplication_options_for_single_kernel(knl), + False)) + + +def has_schedulable_iname_nesting(program): + return all(has_schedulable_iname_nesting_for_single_kernel( + in_knl_callable.subkernel) for in_knl_callable in + program.program_callables_info.values() if isinstance(in_knl_callable, + CallableKernel)) # }}} # {{{ rename_inames +@iterate_over_kernels_if_given_program def rename_iname(knl, old_iname, new_iname, existing_ok=False, within=None): """ :arg within: a stack match as understood by @@ -1278,6 +1319,7 @@ def _split_reduction(kernel, inames, direction, within=None): rsplit.map_kernel(kernel)) +@iterate_over_kernels_if_given_program def split_reduction_inward(kernel, inames, within=None): """Takes a reduction of the form:: @@ -1297,6 +1339,7 @@ def split_reduction_inward(kernel, inames, within=None): return _split_reduction(kernel, inames, "in", within) +@iterate_over_kernels_if_given_program def split_reduction_outward(kernel, inames, within=None): """Takes a reduction of the form:: @@ -1320,6 +1363,7 @@ def split_reduction_outward(kernel, inames, within=None): # {{{ affine map inames +@iterate_over_kernels_if_given_program def affine_map_inames(kernel, old_inames, new_inames, equations): """Return a new *kernel* where the affine transform specified by *equations* has been applied to the inames. 
@@ -1651,6 +1695,7 @@ class _ReductionInameUniquifier(RuleAwareIdentityMapper): expr, expn_state) +@iterate_over_kernels_if_given_program def make_reduction_inames_unique(kernel, inames=None, within=None): """ :arg inames: if not *None*, only apply to these inames @@ -1697,6 +1742,7 @@ def make_reduction_inames_unique(kernel, inames=None, within=None): # {{{ add_inames_to_insn +@iterate_over_kernels_if_given_program def add_inames_to_insn(knl, inames, insn_match): """ :arg inames: a frozenset of inames that will be added to the diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index e6ecb409..93cf932b 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -25,15 +25,35 @@ THE SOFTWARE. import six # noqa from loopy.diagnostic import LoopyError +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import (ScalarCallable, CallableKernel) +from loopy.program import Program, iterate_over_kernels_if_given_program # {{{ find_instructions -def find_instructions(kernel, insn_match): +def find_instructions_in_single_kernel(kernel, insn_match): + assert isinstance(kernel, LoopKernel) from loopy.match import parse_match match = parse_match(insn_match) return [insn for insn in kernel.instructions if match(kernel, insn)] + +def find_instructions(program, insn_match): + assert isinstance(program, Program) + insns = [] + for in_knl_callable in program.program_callables_info.values(): + if isinstance(in_knl_callable, CallableKernel): + insns += (find_instructions_in_single_kernel( + in_knl_callable.subkernel, insn_match)) + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown callable type %s." 
% ( + type(in_knl_callable))) + + return insns + # }}} @@ -58,6 +78,7 @@ def map_instructions(kernel, insn_match, f): # {{{ set_instruction_priority +@iterate_over_kernels_if_given_program def set_instruction_priority(kernel, insn_match, priority): """Set the priority of instructions matching *insn_match* to *priority*. @@ -75,6 +96,7 @@ def set_instruction_priority(kernel, insn_match, priority): # {{{ add_dependency +@iterate_over_kernels_if_given_program def add_dependency(kernel, insn_match, depends_on): """Add the instruction dependency *dependency* to the instructions matched by *insn_match*. @@ -92,7 +114,8 @@ def add_dependency(kernel, insn_match, depends_on): added_deps = frozenset([depends_on]) else: added_deps = frozenset( - dep.id for dep in find_instructions(kernel, depends_on)) + dep.id for dep in find_instructions_in_single_kernel(kernel, + depends_on)) if not added_deps: raise LoopyError("no instructions found matching '%s' " @@ -209,6 +232,7 @@ def replace_instruction_ids(kernel, replacements): # {{{ tag_instructions +@iterate_over_kernels_if_given_program def tag_instructions(kernel, new_tag, within=None): from loopy.match import parse_match within = parse_match(within) @@ -228,6 +252,7 @@ def tag_instructions(kernel, new_tag, within=None): # {{{ add nosync +@iterate_over_kernels_if_given_program def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False, empty_ok=False): """Add a *no_sync_with* directive between *source* and *sink*. @@ -260,18 +285,21 @@ def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False, This used to silently pass. This behavior can be restored using *empty_ok*. 
""" + assert isinstance(kernel, LoopKernel) if isinstance(source, str) and source in kernel.id_to_insn: sources = frozenset([source]) else: sources = frozenset( - source.id for source in find_instructions(kernel, source)) + source.id for source in find_instructions_in_single_kernel( + kernel, source)) if isinstance(sink, str) and sink in kernel.id_to_insn: sinks = frozenset([sink]) else: sinks = frozenset( - sink.id for sink in find_instructions(kernel, sink)) + sink.id for sink in find_instructions_in_single_kernel( + kernel, sink)) if not sources and not empty_ok: raise LoopyError("No match found for source specification '%s'." % source) @@ -324,6 +352,7 @@ def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False, # {{{ uniquify_instruction_ids +@iterate_over_kernels_if_given_program def uniquify_instruction_ids(kernel): """Converts any ids that are :class:`loopy.UniqueName` or *None* into unique strings. diff --git a/loopy/transform/padding.py b/loopy/transform/padding.py index d695e359..3e5e4a43 100644 --- a/loopy/transform/padding.py +++ b/loopy/transform/padding.py @@ -28,6 +28,9 @@ THE SOFTWARE. 
from pytools import MovedFunctionDeprecationWrapper from loopy.symbolic import RuleAwareIdentityMapper, SubstitutionRuleMappingContext +from loopy.program import iterate_over_kernels_if_given_program +from loopy.kernel import LoopKernel + class ArrayAxisSplitHelper(RuleAwareIdentityMapper): def __init__(self, rule_mapping_context, arg_names, handler): @@ -44,7 +47,9 @@ class ArrayAxisSplitHelper(RuleAwareIdentityMapper): # {{{ split_array_dim (deprecated since June 2016) -def split_array_dim(kernel, arrays_and_axes, count, auto_split_inames=True, +@iterate_over_kernels_if_given_program +def split_array_dim(kernel, arrays_and_axes, count, + auto_split_inames=True, split_kwargs=None): """ :arg arrays_and_axes: a list of tuples *(array, axis_nr)* indicating @@ -237,7 +242,7 @@ def split_array_dim(kernel, arrays_and_axes, count, auto_split_inames=True, kernel = rule_mapping_context.finish_kernel(aash.map_kernel(kernel)) if auto_split_inames: - from loopy import split_iname + from loopy.transform.iname import split_iname for iname, (outer_iname, inner_iname) in six.iteritems(split_vars): kernel = split_iname(kernel, iname, count, outer_iname=outer_iname, inner_iname=inner_iname, @@ -370,7 +375,9 @@ def _split_array_axis_inner(kernel, array_name, axis_nr, count, order="C"): return kernel -def split_array_axis(kernel, array_names, axis_nr, count, order="C"): +@iterate_over_kernels_if_given_program +def split_array_axis(kernel, array_names, axis_nr, count, + order="C"): """ :arg array: a list of names of temporary variables or arguments. May also be a comma-separated string of these. @@ -387,6 +394,7 @@ def split_array_axis(kernel, array_names, axis_nr, count, order="C"): There was a more complicated, dumber function called :func:`split_array_dim` that had the role of this function in versions prior to 2016.2. 
""" + assert isinstance(kernel, LoopKernel) if isinstance(array_names, str): array_names = [i.strip() for i in array_names.split(",") if i.strip()] @@ -439,6 +447,7 @@ def find_padding_multiple(kernel, variable, axis, align_bytes, allowed_waste=0.1 # {{{ add_padding +@iterate_over_kernels_if_given_program def add_padding(kernel, variable, axis, align_bytes): arg_to_idx = dict((arg.name, i) for i, arg in enumerate(kernel.args)) arg_idx = arg_to_idx[variable] diff --git a/loopy/transform/parameter.py b/loopy/transform/parameter.py index fc5dad91..b7d017ec 100644 --- a/loopy/transform/parameter.py +++ b/loopy/transform/parameter.py @@ -28,6 +28,9 @@ from loopy.symbolic import (RuleAwareSubstitutionMapper, SubstitutionRuleMappingContext) import islpy as isl +from loopy.program import iterate_over_kernels_if_given_program +from loopy.kernel import LoopKernel + __doc__ = """ .. currentmodule:: loopy @@ -40,6 +43,7 @@ __doc__ = """ # {{{ assume +@iterate_over_kernels_if_given_program def assume(kernel, assumptions): """Include an assumption about :ref:`domain-parameters` in the kernel, e.g. `n mod 4 = 0`. @@ -134,6 +138,7 @@ def _fix_parameter(kernel, name, value): )) +@iterate_over_kernels_if_given_program def fix_parameters(kernel, **value_dict): """Fix the values of the arguments to specific constants. @@ -141,6 +146,7 @@ def fix_parameters(kernel, **value_dict): to be *value*. *name* may refer to :ref:`domain-parameters` or :ref:`arguments`. 
""" + assert isinstance(kernel, LoopKernel) for name, value in six.iteritems(value_dict): kernel = _fix_parameter(kernel, name, value) diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index 52d56897..66c7114a 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -38,6 +38,9 @@ from pymbolic import var from loopy.transform.array_buffer_map import (ArrayToBufferMap, NoOpArrayToBufferMap, AccessDescriptor) +from loopy.program import Program +from loopy.kernel.function_interface import CallableKernel, ScalarCallable + class RuleAccessDescriptor(AccessDescriptor): __slots__ = ["args", "expansion_stack"] @@ -258,9 +261,9 @@ class _not_provided(object): # noqa: N801 pass -def precompute(kernel, subst_use, sweep_inames=[], within=None, - storage_axes=None, temporary_name=None, precompute_inames=None, - precompute_outer_inames=None, +def precompute_for_single_kernel(kernel, program_callables_info, subst_use, + sweep_inames=[], within=None, storage_axes=None, temporary_name=None, + precompute_inames=None, precompute_outer_inames=None, storage_axis_to_tag={}, # "None" is a valid value here, distinct from the default. 
@@ -1037,15 +1040,40 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, # }}} - from loopy import tag_inames + from loopy.transform.iname import tag_inames kernel = tag_inames(kernel, new_iname_to_tag) from loopy.kernel.data import AutoFitLocalIndexTag, filter_iname_tags_by_type if filter_iname_tags_by_type(new_iname_to_tag.values(), AutoFitLocalIndexTag): from loopy.kernel.tools import assign_automatic_axes - kernel = assign_automatic_axes(kernel) + kernel = assign_automatic_axes(kernel, program_callables_info) return kernel + +def precompute(program, *args, **kwargs): + assert isinstance(program, Program) + + new_resolved_functions = {} + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = precompute_for_single_kernel( + in_knl_callable.subkernel, program.program_callables_info, + *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." 
% ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_program_callables_info = program.program_callables_info.copy( + resolved_functions=new_resolved_functions) + return program.copy(program_callables_info=new_program_callables_info) + # vim: foldmethod=marker diff --git a/loopy/transform/save.py b/loopy/transform/save.py index cca62bc5..4b957b03 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -64,7 +64,7 @@ class LivenessAnalysis(object): def __init__(self, kernel): self.kernel = kernel - self.schedule = self.kernel.schedule + self.schedule = kernel.schedule @memoize_method def get_successor_relation(self): @@ -235,8 +235,9 @@ class TemporarySaver(object): def new_shape(self): return self.hw_dims + self.non_hw_dims - def __init__(self, kernel): + def __init__(self, kernel, program_callables_info): self.kernel = kernel + self.program_callables_info = program_callables_info self.var_name_gen = kernel.get_var_name_generator() self.insn_name_gen = kernel.get_instruction_id_generator() @@ -439,7 +440,8 @@ class TemporarySaver(object): return (), () group_sizes, local_sizes = ( - self.kernel.get_grid_sizes_for_insn_ids_as_exprs(accessor_insn_ids)) + self.kernel.get_grid_sizes_for_insn_ids_as_exprs(accessor_insn_ids, + self.program_callables_info)) if temporary.address_space == lp.AddressSpace.LOCAL: # Elide local axes in the save slot for local temporaries. 
@@ -628,7 +630,7 @@ class TemporarySaver(object): kernel = lp.add_nosync(kernel, "global", source, sink) from loopy.kernel.tools import assign_automatic_axes - return assign_automatic_axes(kernel) + return assign_automatic_axes(kernel, self.program_callables_info) def save(self, temporary, subkernel): self.save_or_reload_impl(temporary, subkernel, "save") @@ -722,7 +724,7 @@ class TemporarySaver(object): # {{{ auto save and reload across kernel calls -def save_and_reload_temporaries(knl): +def save_and_reload_temporaries(program): """ Add instructions to save and reload temporary variables that are live across kernel calls. @@ -745,8 +747,19 @@ def save_and_reload_temporaries(knl): :returns: The resulting kernel """ + + knl = program.root_kernel + + if not knl.schedule: + program = lp.preprocess_program(program) + from loopy.schedule import get_one_scheduled_kernel + knl = get_one_scheduled_kernel(program.root_kernel, + program.program_callables_info) + + assert knl.schedule is not None + liveness = LivenessAnalysis(knl) - saver = TemporarySaver(knl) + saver = TemporarySaver(knl, program.program_callables_info) from loopy.schedule.tools import ( temporaries_read_in_subkernel, temporaries_written_in_subkernel) @@ -784,7 +797,7 @@ def save_and_reload_temporaries(knl): .format(temporary, sched_item.kernel_name)) saver.save(temporary, sched_item.kernel_name) - return saver.finish() + return program.with_root_kernel(saver.finish()) # }}} diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py index a681afe0..afe3fec5 100644 --- a/loopy/transform/subst.py +++ b/loopy/transform/subst.py @@ -33,6 +33,9 @@ from pymbolic.mapper.substitutor import make_subst_func from pytools import ImmutableRecord from pymbolic import var +from loopy.program import iterate_over_kernels_if_given_program +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import CallableKernel, ScalarCallable import logging logger = logging.getLogger(__name__) @@ -44,6 +47,7 @@ 
class ExprDescriptor(ImmutableRecord): # {{{ extract_subst +@iterate_over_kernels_if_given_program def extract_subst(kernel, subst_name, template, parameters=()): """ :arg subst_name: The name of the substitution rule to be created. @@ -285,6 +289,7 @@ class AssignmentToSubstChanger(RuleAwareIdentityMapper): return var(subst_name)(*index) +@iterate_over_kernels_if_given_program def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None, force_retain_argument=False): """Extract an assignment (to a temporary variable or an argument) @@ -468,7 +473,9 @@ def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None, # {{{ expand_subst +@iterate_over_kernels_if_given_program def expand_subst(kernel, within=None): + assert isinstance(kernel, LoopKernel) if not kernel.substitutions: return kernel @@ -501,8 +508,17 @@ def find_rules_matching(knl, pattern): return [r for r in knl.substitutions if pattern.match(r)] -def find_one_rule_matching(knl, pattern): - rules = find_rules_matching(knl, pattern) +def find_one_rule_matching(program, pattern): + rules = [] + for in_knl_callable in program.program_callables_info.values(): + if isinstance(in_knl_callable, CallableKernel): + knl = in_knl_callable.subkernel + rules.extend(find_rules_matching(knl, pattern)) + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown callable types %s." 
% ( + type(in_knl_callable).__name__)) if len(rules) > 1: raise ValueError("more than one substitution rule matched '%s'" diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 010a0658..0e8fa305 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -33,6 +33,11 @@ from loopy.types import NumpyType from loopy.diagnostic import ( LoopyError, TypeInferenceFailure, DependencyTypeInferenceFailure) +from loopy.kernel.instruction import _DataObliviousInstruction + +from loopy.program import ProgramCallablesInfo +from loopy.symbolic import SubArrayRef, LinearSubscript +from pymbolic.primitives import Variable, Subscript, Lookup import logging logger = logging.getLogger(__name__) @@ -44,10 +49,23 @@ def _debug(kernel, s, *args): logger.debug("%s: %s" % (kernel.name, logstr)) +def get_return_types_as_tuple(arg_id_to_dtype): + """Returns the types of arguments in a tuple format. + + :param arg_id_to_dtype: An instance of :class:`dict` which denotes a + mapping from the arguments to their inferred types. 
+ """ + return_arg_id_to_dtype = dict((id, dtype) for id, dtype in + arg_id_to_dtype.items() if (isinstance(id, int) and id < 0)) + return_arg_pos = sorted(return_arg_id_to_dtype.keys(), reverse=True) + + return tuple(return_arg_id_to_dtype[id] for id in return_arg_pos) + + # {{{ type inference mapper class TypeInferenceMapper(CombineMapper): - def __init__(self, kernel, new_assignments=None): + def __init__(self, kernel, program_callables_info, new_assignments=None): """ :arg new_assignments: mapping from names to either :class:`loopy.kernel.data.TemporaryVariable` @@ -56,10 +74,13 @@ class TypeInferenceMapper(CombineMapper): instances """ self.kernel = kernel + assert isinstance(program_callables_info, ProgramCallablesInfo) if new_assignments is None: new_assignments = {} self.new_assignments = new_assignments self.symbols_with_unknown_types = set() + self.program_callables_info = program_callables_info + self.old_calls_to_new_calls = {} def __call__(self, expr, return_tuple=False, return_dtype_set=False): kwargs = {} @@ -92,13 +113,16 @@ class TypeInferenceMapper(CombineMapper): # /!\ Introduce caches with care--numpy.float32(x) and numpy.float64(x) # are Python-equal (for many common constants such as integers). 
- def copy(self): - return type(self)(self.kernel, self.new_assignments) + def copy(self, program_callables_info=None): + if program_callables_info is None: + program_callables_info = self.program_callables_info + return type(self)(self.kernel, program_callables_info, + self.new_assignments) def with_assignments(self, names_to_vars): new_ass = self.new_assignments.copy() new_ass.update(names_to_vars) - return type(self)(self.kernel, new_ass) + return type(self)(self.kernel, self.program_callables_info, new_ass) @staticmethod def combine(dtype_sets): @@ -250,15 +274,20 @@ class TypeInferenceMapper(CombineMapper): return self.rec(expr.aggregate) def map_call(self, expr, return_tuple=False): - from pymbolic.primitives import Variable + + from pymbolic.primitives import Variable, CallWithKwargs, Call + from loopy.symbolic import ResolvedFunction + + if isinstance(expr, CallWithKwargs): + kw_parameters = expr.kw_parameters + else: + assert isinstance(expr, Call) + kw_parameters = {} identifier = expr.function - if isinstance(identifier, Variable): + if isinstance(identifier, (Variable, ResolvedFunction)): identifier = identifier.name - if identifier in ["indexof", "indexof_vec"]: - return [self.kernel.index_dtype] - def none_if_empty(d): if d: d, = d @@ -266,25 +295,145 @@ class TypeInferenceMapper(CombineMapper): else: return None - arg_dtypes = tuple(none_if_empty(self.rec(par)) for par in expr.parameters) - if None in arg_dtypes: - return [] + arg_id_to_dtype = dict((i, none_if_empty(self.rec(par))) for (i, par) in + tuple(enumerate(expr.parameters)) + tuple(kw_parameters.items())) + + # specializing the known function wrt type + if isinstance(expr.function, ResolvedFunction): + in_knl_callable = self.program_callables_info[expr.function.name] + + # {{{ checking that there is no overwriting of types of in_knl_callable + + if in_knl_callable.arg_id_to_dtype is not None: + + # specializing an already specialized function. 
+ for id, dtype in arg_id_to_dtype.items(): + if id in in_knl_callable.arg_id_to_dtype and ( + in_knl_callable.arg_id_to_dtype[id] != + arg_id_to_dtype[id]): + + # {{{ ignoring the the cases when there is a discrepancy + # between np.uint and np.int + + import numpy as np + if in_knl_callable.arg_id_to_dtype[id].dtype.type == ( + np.uint32) and ( + arg_id_to_dtype[id].dtype.type == np.int32): + continue + if in_knl_callable.arg_id_to_dtype[id].dtype.type == ( + np.uint64) and ( + arg_id_to_dtype[id].dtype.type == + np.int64): + continue + + # }}} + + raise LoopyError("Overwriting a specialized function " + "is illegal--maybe start with new instance of " + "InKernelCallable?") + + # }}} + + in_knl_callable, self.program_callables_info = ( + in_knl_callable.with_types( + arg_id_to_dtype, self.kernel, + self.program_callables_info)) + + in_knl_callable = in_knl_callable.with_target(self.kernel.target) + + # storing the type specialized function so that it can be used for + # later use + self.program_callables_info, new_function_id = ( + self.program_callables_info.with_callable( + expr.function.function, + in_knl_callable)) + + if isinstance(expr, Call): + self.old_calls_to_new_calls[expr] = new_function_id + else: + assert isinstance(expr, CallWithKwargs) + self.old_calls_to_new_calls[expr] = new_function_id + + new_arg_id_to_dtype = in_knl_callable.arg_id_to_dtype + + if new_arg_id_to_dtype is None: + return [] + + # collecting result dtypes in order of the assignees + if -1 in new_arg_id_to_dtype and new_arg_id_to_dtype[-1] is not None: + if return_tuple: + return [get_return_types_as_tuple(new_arg_id_to_dtype)] + else: + return [new_arg_id_to_dtype[-1]] + + elif isinstance(expr.function, Variable): + # Since, the function is not "scoped", attempt to infer using + # kernel.function_manglers + + # {{{ trying to infer using function manglers + + arg_dtypes = tuple(none_if_empty(self.rec(par)) for par in + expr.parameters) + + # finding the function_mangler which 
would be associated with the + # realized function. + + mangle_result = None + for function_mangler in self.kernel.function_manglers: + mangle_result = function_mangler(self.kernel, identifier, + arg_dtypes) + if mangle_result: + # found a match. + break - mangle_result = self.kernel.mangle_function(identifier, arg_dtypes) - if return_tuple: - if mangle_result is not None: - return [mangle_result.result_dtypes] - else: if mangle_result is not None: - if len(mangle_result.result_dtypes) != 1 and not return_tuple: - raise LoopyError("functions with more or fewer than one " - "return value may only be used in direct assignments") + from loopy.kernel.function_interface import (ManglerCallable, + ValueArgDescriptor) + + # creating arg_id_to_dtype, arg_id_to_descr from arg_dtypes + arg_id_to_dtype = dict((i, dt.with_target(self.kernel.target)) + for i, dt in enumerate(mangle_result.arg_dtypes)) + arg_id_to_dtype.update(dict((-i-1, + dtype.with_target(self.kernel.target)) for i, dtype in enumerate( + mangle_result.result_dtypes))) + arg_descrs = tuple((i, ValueArgDescriptor()) for i, _ in + enumerate(mangle_result.arg_dtypes)) + res_descrs = tuple((-i-1, ValueArgDescriptor()) for i, _ in + enumerate(mangle_result.result_dtypes)) + arg_id_to_descr = dict(arg_descrs+res_descrs) + + # creating the ManglerCallable object corresponding to the + # function. + in_knl_callable = ManglerCallable( + identifier, function_mangler, arg_id_to_dtype, + arg_id_to_descr, mangle_result.target_name) + self.program_callables_info, new_function_id = ( + self.program_callables_info.with_callable( + expr.function, in_knl_callable, True)) + + if isinstance(expr, Call): + self.old_calls_to_new_calls[expr] = new_function_id + else: + assert isinstance(expr, CallWithKwargs) + self.old_calls_to_new_calls = new_function_id + + # Returning the type. 
+ if return_tuple: + if mangle_result is not None: + return [mangle_result.result_dtypes] + else: + if mangle_result is not None: + if len(mangle_result.result_dtypes) != 1 and not return_tuple: + raise LoopyError("functions with more or fewer than one " + "return value may only be used in direct " + "assignments") - return [mangle_result.result_dtypes[0]] + return [mangle_result.result_dtypes[0]] + # }}} - raise RuntimeError("unable to resolve " - "function '%s' with %d given arguments" - % (identifier, len(arg_dtypes))) + return [] + + map_call_with_kwargs = map_call def map_variable(self, expr): if expr.name in self.kernel.all_inames(): @@ -399,14 +548,20 @@ class TypeInferenceMapper(CombineMapper): return [expr.operation.result_dtypes(self.kernel, rec_result)[0] for rec_result in rec_results] + def map_sub_array_ref(self, expr): + return self.rec(expr.get_begin_subscript()) + + # }}} # {{{ infer single variable def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): + if var_name in kernel.all_params(): - return [kernel.index_dtype], [] + return [kernel.index_dtype], [], {}, ( + type_inf_mapper.program_callables_info) from functools import partial debug = partial(_debug, kernel) @@ -451,11 +606,15 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): dtype_sets.append(result) if not dtype_sets: - return None, type_inf_mapper.symbols_with_unknown_types + return ( + None, type_inf_mapper.symbols_with_unknown_types, None, + type_inf_mapper.program_callables_info) result = type_inf_mapper.combine(dtype_sets) - return result, type_inf_mapper.symbols_with_unknown_types + return (result, type_inf_mapper.symbols_with_unknown_types, + type_inf_mapper.old_calls_to_new_calls, + type_inf_mapper.program_callables_info) # }}} @@ -482,7 +641,8 @@ class _DictUnionView: # {{{ infer_unknown_types -def infer_unknown_types(kernel, expect_completion=False): +def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, + 
expect_completion=False): """Infer types on temporaries and arguments.""" logger.debug("%s: infer types" % kernel.name) @@ -544,7 +704,8 @@ def infer_unknown_types(kernel, expect_completion=False): new_temp_vars, new_arg_dict ]) - type_inf_mapper = TypeInferenceMapper(kernel, item_lookup) + type_inf_mapper = TypeInferenceMapper(kernel, program_callables_info, + item_lookup) from loopy.symbolic import SubstitutionRuleExpander subst_expander = SubstitutionRuleExpander(kernel.substitutions) @@ -553,6 +714,8 @@ def infer_unknown_types(kernel, expect_completion=False): from loopy.kernel.data import TemporaryVariable, KernelArgument + old_calls_to_new_calls = {} + for var_chain in sccs: changed_during_last_queue_run = False queue = var_chain[:] @@ -576,9 +739,12 @@ def infer_unknown_types(kernel, expect_completion=False): debug("inferring type for %s %s", type(item).__name__, item.name) - result, symbols_with_unavailable_types = ( + (result, symbols_with_unavailable_types, + new_old_calls_to_new_calls, program_callables_info) = ( _infer_var_type( kernel, item.name, type_inf_mapper, subst_expander)) + type_inf_mapper = type_inf_mapper.copy( + program_callables_info=program_callables_info) failed = not result if not failed: @@ -597,6 +763,7 @@ def infer_unknown_types(kernel, expect_completion=False): new_arg_dict[name] = item.copy(dtype=new_dtype) else: raise LoopyError("unexpected item type in type inference") + old_calls_to_new_calls.update(new_old_calls_to_new_calls) else: debug(" failure") @@ -635,23 +802,141 @@ def infer_unknown_types(kernel, expect_completion=False): # }}} + # FIXME: copy the explanation from make_function_ready_for_codegen + # here. 
+ + # {{{ check if insn missed during type inference + + def _instruction_missed_during_inference(insn): + for assignee in insn.assignees: + if isinstance(assignee, Lookup): + assignee = assignee.aggregate + + if isinstance(assignee, Variable): + if assignee.name in kernel.arg_dict: + if kernel.arg_dict[assignee.name].dtype is None: + return False + else: + assert assignee.name in kernel.temporary_variables + if kernel.temporary_variables[assignee.name].dtype is None: + return False + + elif isinstance(assignee, (Subscript, LinearSubscript)): + if assignee.aggregate.name in kernel.arg_dict: + if kernel.arg_dict[assignee.aggregate.name].dtype is None: + return False + else: + assert assignee.aggregate.name in kernel.temporary_variables + if kernel.temporary_variables[ + assignee.aggregate.name].dtype is None: + return False + else: + assert isinstance(assignee, SubArrayRef) + if assignee.subscript.aggregate.name in kernel.arg_dict: + if kernel.arg_dict[ + assignee.subscript.aggregate.name].dtype is None: + return False + else: + assert assignee.subscript.aggregate.name in ( + kernel.temporary_variables) + if kernel.temporary_variables[ + assignee.subscript.aggregate.name] is None: + return False + + return True + + # }}} + + for insn in kernel.instructions: + if isinstance(insn, lp.MultiAssignmentBase): + # just a dummy run over the expression, to pass over all the + # functions + # FIXME: need a check over here which checks the instruction for + # unseen cases + if _instruction_missed_during_inference(insn): + type_inf_mapper(insn.expression, return_tuple=isinstance(insn, + lp.CallInstruction), return_dtype_set=True) + elif isinstance(insn, (_DataObliviousInstruction, + lp.CInstruction)): + pass + else: + raise NotImplementedError("Unknown instructions type %s." 
% ( + type(insn).__name__)) + + program_callables_info = type_inf_mapper.program_callables_info + old_calls_to_new_calls.update(type_inf_mapper.old_calls_to_new_calls) + end_time = time.time() logger.debug("type inference took {dur:.2f} seconds".format( dur=end_time - start_time)) - return unexpanded_kernel.copy( + pre_type_specialized_knl = unexpanded_kernel.copy( temporary_variables=new_temp_vars, args=[new_arg_dict[arg.name] for arg in kernel.args], ) + # this has to be subsitutition + from loopy.kernel.function_interface import ( + change_names_of_pymbolic_calls) + type_specialized_kernel = change_names_of_pymbolic_calls( + pre_type_specialized_knl, old_calls_to_new_calls) + + # the check is unnecessary as we would first get TypeInfereceFailure before + # encountering this. Move this at the start once ManglerCallable is + # deprecated. + if expect_completion: + # if completion is expected, then it is important that all the + # callables are scoped. + from loopy.check import check_functions_are_scoped + check_functions_are_scoped(type_specialized_kernel) + + return type_specialized_kernel, program_callables_info + + +def infer_unknown_types(program, expect_completion=False): + """Infer types on temporaries and arguments.""" + from loopy.kernel import LoopKernel + if isinstance(program, LoopKernel): + # FIXME: deprecate warning needed here + from loopy.program import make_program_from_kernel + program = make_program_from_kernel(program) + + program_callables_info = program.program_callables_info + + type_uninferred_knl_callable = ( + program_callables_info[program.name]) + type_uninferred_root_kernel = type_uninferred_knl_callable.subkernel + + program_callables_info = ( + program.program_callables_info.with_edit_callables_mode()) + root_kernel, program_callables_info = ( + infer_unknown_types_for_a_single_kernel( + type_uninferred_root_kernel, + program_callables_info, expect_completion)) + + type_inferred_knl_callable = type_uninferred_knl_callable.copy( + 
subkernel=root_kernel) + + program_callables_info, _ = ( + program_callables_info.with_callable( + program.name, + type_inferred_knl_callable)) + + program_callables_info = ( + program_callables_info.with_exit_edit_callables_mode()) + + # FIXME: maybe put all of this in a function? + # need to infer functions that were left out during inference + return program.copy(program_callables_info=program_callables_info) + # }}} # {{{ reduction expression helper def infer_arg_and_reduction_dtypes_for_reduction_expression( - kernel, expr, unknown_types_ok): - type_inf_mapper = TypeInferenceMapper(kernel) + kernel, expr, program_callables_info, unknown_types_ok): + type_inf_mapper = TypeInferenceMapper(kernel, program_callables_info) import loopy as lp if expr.is_tuple_typed: @@ -682,7 +967,8 @@ def infer_arg_and_reduction_dtypes_for_reduction_expression( if dt is not lp.auto else dt for dt in reduction_dtypes) - return tuple(arg_dtypes), reduction_dtypes + return tuple(arg_dtypes), reduction_dtypes, ( + type_inf_mapper.program_callables_info) # }}} diff --git a/test/test_apps.py b/test/test_apps.py index e7f4004f..a9c3bf2a 100644 --- a/test/test_apps.py +++ b/test/test_apps.py @@ -216,7 +216,8 @@ def test_rob_stroud_bernstein(ctx_factory): lp.GlobalArg("coeffs", None, shape=None), "..." 
], - assumptions="deg>=0 and nels>=1" + assumptions="deg>=0 and nels>=1", + target=lp.PyOpenCLTarget(ctx.devices[0]) ) knl = lp.fix_parameters(knl, nqp1d=7, deg=4) @@ -224,13 +225,12 @@ def test_rob_stroud_bernstein(ctx_factory): knl = lp.split_iname(knl, "el_outer", 2, outer_tag="g.0", inner_tag="ilp", slabs=(0, 1)) knl = lp.tag_inames(knl, dict(i2="l.1", alpha1="unr", alpha2="unr")) - - print(lp.CompiledKernel(ctx, knl).get_highlighted_code( - dict( + knl = lp.add_dtypes(knl, dict( qpts=np.float32, coeffs=np.float32, tmp=np.float32, - ))) + )) + print(lp.generate_code_v2(knl)) def test_rob_stroud_bernstein_full(ctx_factory): @@ -296,7 +296,8 @@ def test_rob_stroud_bernstein_full(ctx_factory): lp.GlobalArg("coeffs", None, shape=None), "..." ], - assumptions="deg>=0 and nels>=1" + assumptions="deg>=0 and nels>=1", + target=lp.PyOpenCLTarget(ctx.devices[0]) ) knl = lp.fix_parameters(knl, nqp1d=7, deg=4) @@ -310,14 +311,14 @@ def test_rob_stroud_bernstein_full(ctx_factory): from pickle import dumps, loads knl = loads(dumps(knl)) - knl = lp.CompiledKernel(ctx, knl).get_highlighted_code( + knl = lp.add_dtypes(knl, dict( qpts=np.float32, tmp=np.float32, coeffs=np.float32, result=np.float32, )) - print(knl) + print(lp.generate_code_v2(knl)) def test_stencil(ctx_factory): @@ -660,7 +661,7 @@ def test_domain_tree_nesting(): lp.GlobalArg('B', shape=(100, 31), dtype=np.float64), lp.GlobalArg('out', shape=(100, 12), dtype=np.float64)]) - parents_per_domain = knl.parents_per_domain() + parents_per_domain = knl.root_kernel.parents_per_domain() def depth(i): if parents_per_domain[i] is None: diff --git a/test/test_c_execution.py b/test/test_c_execution.py index c355893e..7c7df255 100644 --- a/test/test_c_execution.py +++ b/test/test_c_execution.py @@ -76,6 +76,7 @@ def test_c_target_strides(): # test with C-order knl = __get_kernel('C') + lp.generate_code_v2(knl) a_np = np.reshape(np.arange(16 * 16, dtype=np.float32), (16, -1), order='C') diff --git a/test/test_diff.py 
b/test/test_diff.py index b735ab17..a7fd9298 100644 --- a/test/test_diff.py +++ b/test/test_diff.py @@ -55,7 +55,7 @@ def test_diff(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - knl = lp.make_kernel( + knl = lp.make_kernel_function( """{ [i,j]: 0<=i,j a = 1/(1+sinh(x[i] + y[j])**2) @@ -66,6 +66,7 @@ def test_diff(ctx_factory): from loopy.transform.diff import diff_kernel dknl, diff_map = diff_kernel(knl, "z", "x") + dknl = lp.make_program_from_kernel(dknl) dknl = lp.remove_unused_arguments(dknl) dknl = lp.add_inames_to_insn(dknl, "diff_i0", "writes:a_dx or writes:a") diff --git a/test/test_domain.py b/test/test_domain.py index ebfde850..dd789d2c 100644 --- a/test/test_domain.py +++ b/test/test_domain.py @@ -61,20 +61,15 @@ def test_assume(ctx_factory): knl = lp.make_kernel( "{[i]: 0<=i 10") - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - kernel_gen = lp.generate_loop_schedules(knl) - - for gen_knl in kernel_gen: - print(gen_knl) - compiled = lp.CompiledKernel(ctx, gen_knl) - print(compiled.get_code()) - assert "if" not in compiled.get_code() + code = lp.generate_code_v2(knl).device_code() + assert "if" not in code def test_divisibility_assumption(ctx_factory): @@ -90,16 +85,14 @@ def test_divisibility_assumption(ctx_factory): lp.GlobalArg("b", np.float32, shape=("n",)), lp.ValueArg("n", np.int32), ], - assumptions="n>=1 and (exists zz: n = 16*zz)") + assumptions="n>=1 and (exists zz: n = 16*zz)", + target=lp.PyOpenCLTarget(ctx.devices[0])) ref_knl = knl knl = lp.split_iname(knl, "i", 16) - - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - for k in lp.generate_loop_schedules(knl): - code = lp.generate_code(k) - assert "if" not in code + code = lp.generate_code_v2(knl).device_code() + assert "if" not in code lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"n": 16**3}) @@ -118,16 +111,12 @@ def test_eq_constraint(ctx_factory): [ lp.GlobalArg("a", np.float32, shape=(1000,)), lp.GlobalArg("b", np.float32, shape=(1000,)) - ]) + ], + 
target=lp.PyOpenCLTarget(ctx.devices[0])) knl = lp.split_iname(knl, "i", 16, outer_tag="g.0") knl = lp.split_iname(knl, "i_inner", 16, outer_tag=None, inner_tag="l.0") - - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - kernel_gen = lp.generate_loop_schedules(knl) - - for knl in kernel_gen: - print(lp.generate_code(knl)) + print(lp.generate_code_v2(knl).device_code()) def test_dependent_loop_bounds(ctx_factory): @@ -150,12 +139,10 @@ def test_dependent_loop_bounds(ctx_factory): lp.GlobalArg("a_sum", dtype, shape=lp.auto), lp.ValueArg("n", np.int32), ], - assumptions="n>=1 and row_len>=1") + assumptions="n>=1 and row_len>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) - cknl = lp.CompiledKernel(ctx, knl) - print("---------------------------------------------------") - print(cknl.get_highlighted_code()) - print("---------------------------------------------------") + print(lp.generate_code_v2(knl).device_code()) def test_dependent_loop_bounds_2(ctx_factory): @@ -179,14 +166,13 @@ def test_dependent_loop_bounds_2(ctx_factory): lp.GlobalArg("ax", dtype, shape=lp.auto), lp.ValueArg("n", np.int32), ], - assumptions="n>=1 and row_len>=1") + assumptions="n>=1 and row_len>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") - cknl = lp.CompiledKernel(ctx, knl) - print("---------------------------------------------------") - print(cknl.get_highlighted_code()) - print("---------------------------------------------------") + + print(lp.generate_code_v2(knl).device_code()) def test_dependent_loop_bounds_3(ctx_factory): @@ -211,25 +197,21 @@ def test_dependent_loop_bounds_3(ctx_factory): lp.GlobalArg("a_row_lengths", np.int32, shape=lp.auto), lp.GlobalArg("a", dtype, shape=("n,n"), order="C"), lp.ValueArg("n", np.int32), - ]) + ], + target=lp.PyOpenCLTarget(ctx.devices[0])) - assert knl.parents_per_domain()[1] == 0 + assert knl.root_kernel.parents_per_domain()[1] == 0 knl = lp.split_iname(knl, "i", 128, 
outer_tag="g.0", inner_tag="l.0") - cknl = lp.CompiledKernel(ctx, knl) - print("---------------------------------------------------") - print(cknl.get_highlighted_code()) - print("---------------------------------------------------") + print(lp.generate_code_v2(knl).device_code()) knl_bad = lp.split_iname(knl, "jj", 128, outer_tag="g.1", inner_tag="l.1") - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - with pytest.raises(RuntimeError): - list(lp.generate_loop_schedules(knl_bad)) + list(lp.generate_code_v2(knl_bad)) def test_dependent_loop_bounds_4(): @@ -291,11 +273,10 @@ def test_independent_multi_domain(ctx_factory): inner_tag="l.0") knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0") - assert knl.parents_per_domain() == 2*[None] + assert knl.root_kernel.parents_per_domain() == 2*[None] n = 50 - cknl = lp.CompiledKernel(ctx, knl) - evt, (a, b) = cknl(queue, n=n, out_host=True) + evt, (a, b) = knl(queue, n=n, out_host=True) assert a.shape == (50,) assert b.shape == (50,) @@ -396,10 +377,11 @@ def test_triangle_domain(ctx_factory): knl = lp.make_kernel( "{[i,j]: 0<=i,j bb = a[i] - b[i] @@ -122,16 +122,15 @@ def test_type_inference_no_artificial_doubles(ctx_factory): lp.GlobalArg("c", np.float32, shape=("n",)), lp.ValueArg("n", np.int32), ], - assumptions="n>=1") + assumptions="n>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - for k in lp.generate_loop_schedules(knl): - code = lp.generate_code(k) - assert "double" not in code + code = lp.generate_code_v2(prog).device_code() + assert "double" not in code def test_type_inference_with_type_dependencies(): - knl = lp.make_kernel( + prog = lp.make_kernel( "{[i]: i=0}", """ <>a = 99 @@ -143,13 +142,17 @@ def test_type_inference_with_type_dependencies(): <>d = b + 2 + 1j """, "...") - knl = lp.infer_unknown_types(knl) + prog = lp.infer_unknown_types(prog) from loopy.types import to_loopy_type - assert knl.temporary_variables["a"].dtype == 
to_loopy_type(np.int32) - assert knl.temporary_variables["b"].dtype == to_loopy_type(np.float32) - assert knl.temporary_variables["c"].dtype == to_loopy_type(np.float32) - assert knl.temporary_variables["d"].dtype == to_loopy_type(np.complex128) + assert prog.root_kernel.temporary_variables["a"].dtype == to_loopy_type( + np.int32) + assert prog.root_kernel.temporary_variables["b"].dtype == to_loopy_type( + np.float32) + assert prog.root_kernel.temporary_variables["c"].dtype == to_loopy_type( + np.float32) + assert prog.root_kernel.temporary_variables["d"].dtype == to_loopy_type( + np.complex128) def test_sized_and_complex_literals(ctx_factory): @@ -183,16 +186,12 @@ def test_simple_side_effect(ctx_factory): """ a[i] = a[i] + 1 """, - [lp.GlobalArg("a", np.float32, shape=(100,))] + [lp.GlobalArg("a", np.float32, shape=(100,))], + target=lp.PyOpenCLTarget(ctx.devices[0]) ) - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - kernel_gen = lp.generate_loop_schedules(knl) - - for gen_knl in kernel_gen: - print(gen_knl) - compiled = lp.CompiledKernel(ctx, gen_knl) - print(compiled.get_code()) + print(knl) + print(lp.generate_code_v2(knl)) def test_owed_barriers(ctx_factory): @@ -203,17 +202,14 @@ def test_owed_barriers(ctx_factory): [ " z[i] = a[i]" ], - [lp.GlobalArg("a", np.float32, shape=(100,))] + [lp.GlobalArg("a", np.float32, shape=(100,))], + target=lp.PyOpenCLTarget(ctx.devices[0]) ) knl = lp.tag_inames(knl, dict(i="l.0")) - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - kernel_gen = lp.generate_loop_schedules(knl) - - for gen_knl in kernel_gen: - compiled = lp.CompiledKernel(ctx, gen_knl) - print(compiled.get_code()) + print(knl) + print(lp.generate_code_v2(knl)) def test_wg_too_small(ctx_factory): @@ -225,17 +221,14 @@ def test_wg_too_small(ctx_factory): " z[i] = a[i] {id=copy}" ], [lp.GlobalArg("a", np.float32, shape=(100,))], + target=lp.PyOpenCLTarget(ctx.devices[0]), local_sizes={0: 16}) knl = lp.tag_inames(knl, dict(i="l.0")) - knl = 
lp.preprocess_kernel(knl, ctx.devices[0]) - kernel_gen = lp.generate_loop_schedules(knl) - - import pytest - for gen_knl in kernel_gen: - with pytest.raises(RuntimeError): - lp.CompiledKernel(ctx, gen_knl).get_code() + print(knl) + with pytest.raises(RuntimeError): + print(lp.generate_code_v2(knl)) def test_multi_cse(ctx_factory): @@ -247,17 +240,14 @@ def test_multi_cse(ctx_factory): " z[i] = a[i] + a[i]**2" ], [lp.GlobalArg("a", np.float32, shape=(100,))], + target=lp.PyOpenCLTarget(ctx.devices[0]), local_sizes={0: 16}) knl = lp.split_iname(knl, "i", 16, inner_tag="l.0") knl = lp.add_prefetch(knl, "a", []) - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - kernel_gen = lp.generate_loop_schedules(knl) - - for gen_knl in kernel_gen: - compiled = lp.CompiledKernel(ctx, gen_knl) - print(compiled.get_code()) + print(knl) + print(lp.generate_code_v2(knl)) # {{{ code generator fuzzing @@ -414,17 +404,16 @@ def test_ilp_write_race_detection_global(ctx_factory): lp.GlobalArg("a", np.float32), lp.ValueArg("n", np.int32, approximately=1000), ], - assumptions="n>=1") + assumptions="n>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) knl = lp.tag_inames(knl, dict(j="ilp")) - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - with lp.CacheMode(False): from loopy.diagnostic import WriteRaceConditionWarning from warnings import catch_warnings with catch_warnings(record=True) as warn_list: - list(lp.generate_loop_schedules(knl)) + lp.generate_code_v2(knl) assert any(isinstance(w.message, WriteRaceConditionWarning) for w in warn_list) @@ -438,13 +427,13 @@ def test_ilp_write_race_avoidance_local(ctx_factory): [ "<> a[i] = 5+i+j", ], - []) + [], + target=lp.PyOpenCLTarget(ctx.devices[0])) knl = lp.tag_inames(knl, dict(i="l.0", j="ilp")) knl = lp.preprocess_kernel(knl, ctx.devices[0]) - for k in lp.generate_loop_schedules(knl): - assert k.temporary_variables["a"].shape == (16, 17) + assert knl.root_kernel.temporary_variables["a"].shape == (16, 17) def 
test_ilp_write_race_avoidance_private(ctx_factory): @@ -455,13 +444,13 @@ def test_ilp_write_race_avoidance_private(ctx_factory): [ "<> a = 5+j", ], - []) + [], + target=lp.PyOpenCLTarget(ctx.devices[0])) knl = lp.tag_inames(knl, dict(j="ilp")) - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - for k in lp.generate_loop_schedules(knl): - assert k.temporary_variables["a"].shape == (16,) + knl = lp.preprocess_kernel(knl) + assert knl.root_kernel.temporary_variables["a"].shape == (16,) # }}} @@ -482,11 +471,12 @@ def test_write_parameter(ctx_factory): lp.GlobalArg("b", dtype, shape=()), lp.ValueArg("n", np.int32, approximately=1000), ], - assumptions="n>=1") + assumptions="n>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) import pytest with pytest.raises(RuntimeError): - lp.CompiledKernel(ctx, knl).get_code() + lp.generate_code_v2(knl).device_code() # {{{ arg guessing @@ -507,10 +497,11 @@ def test_arg_shape_guessing(ctx_factory): lp.GlobalArg("c", shape=lp.auto), lp.ValueArg("n"), ], - assumptions="n>=1") + assumptions="n>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) print(knl) - print(lp.CompiledKernel(ctx, knl).get_highlighted_code()) + print(lp.generate_code_v2(knl).device_code()) def test_arg_guessing(ctx_factory): @@ -523,10 +514,11 @@ def test_arg_guessing(ctx_factory): b[i, j] = i*j c[i+j, j] = b[j,i] """, - assumptions="n>=1") + assumptions="n>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) print(knl) - print(lp.CompiledKernel(ctx, knl).get_highlighted_code()) + print(lp.generate_code_v2(knl).device_code()) def test_arg_guessing_with_reduction(ctx_factory): @@ -541,16 +533,16 @@ def test_arg_guessing_with_reduction(ctx_factory): b[i, j] = i*j c[i+j, j] = b[j,i] """, - assumptions="n>=1") + assumptions="n>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) print(knl) - print(lp.CompiledKernel(ctx, knl).get_highlighted_code()) + print(lp.generate_code_v2(knl).device_code()) def test_unknown_arg_shape(ctx_factory): ctx = ctx_factory() from 
loopy.target.pyopencl import PyOpenCLTarget - from loopy.compiled import CompiledKernel bsize = [256, 0] knl = lp.make_kernel( @@ -566,11 +558,11 @@ def test_unknown_arg_shape(ctx_factory): """, seq_dependencies=True, name="uniform_l", - target=PyOpenCLTarget(), + target=PyOpenCLTarget(ctx.devices[0]), assumptions="m<=%d and m>=1 and n mod %d = 0" % (bsize[0], bsize[0])) knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32)) - kernel_info = CompiledKernel(ctx, knl).kernel_info(frozenset()) # noqa + print(lp.generate_code_v2(knl).device_code()) # }}} @@ -587,10 +579,11 @@ def test_nonlinear_index(ctx_factory): lp.GlobalArg("a", shape="n"), lp.ValueArg("n"), ], - assumptions="n>=1") + assumptions="n>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) print(knl) - print(lp.CompiledKernel(ctx, knl).get_highlighted_code()) + print(lp.generate_code_v2(knl).device_code()) def test_offsets_and_slicing(ctx_factory): @@ -607,9 +600,7 @@ def test_offsets_and_slicing(ctx_factory): assumptions="n>=1 and m>=1", default_offset=lp.auto) - knl = lp.tag_data_axes(knl, "a,b", "stride:auto,stride:1") - - cknl = lp.CompiledKernel(ctx, knl) + knl = lp.tag_array_axes(knl, "a,b", "stride:auto,stride:1") a_full = cl.clrandom.rand(queue, (n, n), np.float64) a_full_h = a_full.get() @@ -624,8 +615,10 @@ def test_offsets_and_slicing(ctx_factory): b_full_h[b_sub] = 2*a_full_h[a_sub] - print(cknl.get_highlighted_code({"a": a.dtype})) - cknl(queue, a=a, b=b) + knl = lp.add_dtypes(knl, {"a": a.dtype}) + + print(lp.generate_code_v2(knl)) + knl(queue, a=a, b=b) import numpy.linalg as la assert la.norm(b_full.get() - b_full_h) < 1e-13 @@ -642,18 +635,16 @@ def test_vector_ilp_with_prefetch(ctx_factory): # argument guessing. lp.GlobalArg("out,a", np.float32, shape=lp.auto), "..." 
- ]) + ], + target=lp.PyOpenCLTarget(ctx.devices[0])) knl = lp.split_iname(knl, "i", 128, inner_tag="l.0") knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp") knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"], default_tag="l.auto") - cknl = lp.CompiledKernel(ctx, knl) - cknl.kernel_info() - import re - code = cknl.get_code() + code = lp.generate_code_v2(knl).device_code() assert len(list(re.finditer("barrier", code))) == 1 @@ -674,18 +665,18 @@ def test_c_instruction(ctx_factory): lp.TemporaryVariable("x", np.float32), "...", ], - assumptions="n>=1") + assumptions="n>=1", target=lp.PyOpenCLTarget(ctx.devices[0])) knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") print(knl) - print(lp.CompiledKernel(ctx, knl).get_highlighted_code()) + print(lp.generate_code_v2(knl).device_code()) def test_dependent_domain_insn_iname_finding(ctx_factory): ctx = ctx_factory() - knl = lp.make_kernel([ + prog = lp.make_kernel([ "{[isrc_box]: 0<=isrc_box src_ibox = source_boxes[i] @@ -728,8 +720,8 @@ def test_inames_deps_from_write_subscript(ctx_factory): None, shape=None), "..."]) - print(knl) - assert "i" in knl.insn_inames("myred") + print(prog) + assert "i" in prog.root_kernel.insn_inames("myred") def test_modulo_indexing(ctx_factory): @@ -743,14 +735,12 @@ def test_modulo_indexing(ctx_factory): [ lp.GlobalArg("a", None, shape="n"), "..." 
- ] + ], target=lp.PyOpenCLTarget(ctx.devices[0]) ) print(knl) - print(lp.CompiledKernel(ctx, knl).get_highlighted_code( - dict( - a=np.float32, - ))) + knl = lp.add_dtypes(knl, {"a": np.float32}) + print(lp.generate_code_v2(knl).device_code()) @pytest.mark.parametrize("vec_len", [2, 3, 4, 8, 16]) @@ -770,7 +760,7 @@ def test_vector_types(ctx_factory, vec_len): ref_knl = knl - knl = lp.tag_data_axes(knl, "out", "c,vec") + knl = lp.tag_array_axes(knl, "out", "c,vec") knl = lp.tag_inames(knl, dict(j="unr")) knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") @@ -898,11 +888,7 @@ def test_multiple_writes_to_local_temporary(): temp[i, 1] = 15 """) knl = lp.tag_inames(knl, dict(i="l.0")) - - knl = lp.preprocess_kernel(knl) - for k in lp.generate_loop_schedules(knl): - code, _ = lp.generate_code(k) - print(code) + print(lp.generate_code_v2(knl).device_code()) def test_make_copy_kernel(ctx_factory): @@ -980,9 +966,7 @@ def test_variable_size_temporary(): # Make sure that code generation succeeds even if # there are variable-length arrays. 
- knl = lp.preprocess_kernel(knl) - for k in lp.generate_loop_schedules(knl): - lp.generate_code(k) + lp.generate_code_v2(knl).device_code() def test_indexof(ctx_factory): @@ -1014,7 +998,7 @@ def test_indexof_vec(ctx_factory): ''' out[i,j,k] = indexof_vec(out[i,j,k])''') knl = lp.tag_inames(knl, {"i": "vec"}) - knl = lp.tag_data_axes(knl, "out", "vec,c,c") + knl = lp.tag_array_axes(knl, "out", "vec,c,c") knl = lp.set_options(knl, write_cl=True) (evt, (out,)) = knl(queue) @@ -1156,7 +1140,7 @@ def test_within_inames_and_reduction(): within_inames=frozenset(), within_inames_is_final=True) - k = lp.make_kernel("{[i,j] : 0<=i,j {[j]: 0 <= j < jmax}"], """ @@ -2440,10 +2413,11 @@ def test_barrier_insertion_near_bottom_of_loop(): end """, seq_dependencies=True) - knl = lp.tag_inames(knl, dict(i="l.0")) - knl = lp.set_temporary_scope(knl, "a", "local") - knl = lp.set_temporary_scope(knl, "b", "local") - knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl)) + prog = lp.tag_inames(prog, dict(i="l.0")) + prog = lp.set_temporary_scope(prog, "a", "local") + prog = lp.set_temporary_scope(prog, "b", "local") + prog = lp.preprocess_kernel(prog) + knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.program_callables_info) print(knl) @@ -2453,7 +2427,7 @@ def test_barrier_insertion_near_bottom_of_loop(): def test_barrier_in_overridden_get_grid_size_expanded_kernel(): # make simple barrier'd kernel - knl = lp.make_kernel('{[i]: 0 <= i < 10}', + prog = lp.make_kernel('{[i]: 0 <= i < 10}', """ for i a[i] = i {id=a} @@ -2468,15 +2442,17 @@ def test_barrier_in_overridden_get_grid_size_expanded_kernel(): # split into kernel w/ vesize larger than iname domain vecsize = 16 - knl = lp.split_iname(knl, 'i', vecsize, inner_tag='l.0') + prog = lp.split_iname(prog, 'i', vecsize, inner_tag='l.0') from testlib import GridOverride # artifically expand via overridden_get_grid_sizes_for_insn_ids + knl = prog.root_kernel knl = knl.copy(overridden_get_grid_sizes_for_insn_ids=GridOverride( 
knl.copy(), vecsize)) + prog = prog.with_root_kernel(knl) # make sure we can generate the code - lp.generate_code_v2(knl) + lp.generate_code_v2(prog) def test_multi_argument_reduction_type_inference(): @@ -2485,7 +2461,7 @@ def test_multi_argument_reduction_type_inference(): from loopy.types import to_loopy_type op = SegmentedSumReductionOperation() - knl = lp.make_kernel("{[i,j]: 0<=i<10 and 0<=ja = 0 <>b_s0 = 0 """) - vng = knl.get_var_name_generator() + vng = prog.root_kernel.get_var_name_generator() assert vng("a_s0") != "a_s0" assert vng("b") != "b" @@ -2647,7 +2625,7 @@ def test_fixed_parameters(ctx_factory): def test_parameter_inference(): knl = lp.make_kernel("{[i]: 0 <= i < n and i mod 2 = 0}", "") - assert knl.all_params() == set(["n"]) + assert knl.root_kernel.all_params() == set(["n"]) def test_execution_backend_can_cache_dtypes(ctx_factory): @@ -2666,7 +2644,7 @@ def test_execution_backend_can_cache_dtypes(ctx_factory): def test_wildcard_dep_matching(): - knl = lp.make_kernel( + prog = lp.make_kernel( "{[i]: 0 <= i < 10}", """ <>a = 0 {id=insn1} @@ -2679,11 +2657,15 @@ def test_wildcard_dep_matching(): all_insns = set("insn%d" % i for i in range(1, 6)) - assert knl.id_to_insn["insn1"].depends_on == set() - assert knl.id_to_insn["insn2"].depends_on == all_insns - set(["insn2"]) - assert knl.id_to_insn["insn3"].depends_on == all_insns - set(["insn3"]) - assert knl.id_to_insn["insn4"].depends_on == set(["insn1", "insn2"]) - assert knl.id_to_insn["insn5"].depends_on == all_insns - set(["insn1", "insn5"]) + assert prog.root_kernel.id_to_insn["insn1"].depends_on == set() + assert (prog.root_kernel.id_to_insn["insn2"].depends_on == all_insns - + set(["insn2"])) + assert (prog.root_kernel.id_to_insn["insn3"].depends_on == all_insns - + set(["insn3"])) + assert (prog.root_kernel.id_to_insn["insn4"].depends_on == set(["insn1", + "insn2"])) + assert (prog.root_kernel.id_to_insn["insn5"].depends_on == all_insns - + set(["insn1", "insn5"])) def 
test_preamble_with_separate_temporaries(ctx_factory): @@ -2777,7 +2759,7 @@ def test_relaxed_stride_checks(ctx_factory): def test_add_prefetch_works_in_lhs_index(): - knl = lp.make_kernel( + prog = lp.make_kernel( "{ [n,k,l,k1,l1,k2,l2]: " "start<=ntmp[i] = 10 {id=insn1} <>tmp2[i] = 10 {id=insn2} @@ -492,28 +494,34 @@ def test_add_nosync(): tmp5[i] = 1 {id=insn6,conflicts=g1} """) - orig_knl = lp.set_temporary_scope(orig_knl, "tmp3", "local") - orig_knl = lp.set_temporary_scope(orig_knl, "tmp5", "local") + orig_prog = lp.set_temporary_scope(orig_prog, "tmp3", "local") + orig_prog = lp.set_temporary_scope(orig_prog, "tmp5", "local") # No dependency present - don't add nosync - knl = lp.add_nosync(orig_knl, "any", "writes:tmp", "writes:tmp2", + prog = lp.add_nosync(orig_prog, "any", "writes:tmp", "writes:tmp2", empty_ok=True) - assert frozenset() == knl.id_to_insn["insn2"].no_sync_with + assert frozenset() == ( + prog.root_kernel.id_to_insn["insn2"].no_sync_with) # Dependency present - knl = lp.add_nosync(orig_knl, "local", "writes:tmp3", "reads:tmp3") - assert frozenset() == knl.id_to_insn["insn3"].no_sync_with - assert frozenset([("insn3", "local")]) == knl.id_to_insn["insn4"].no_sync_with + prog = lp.add_nosync(orig_prog, "local", "writes:tmp3", "reads:tmp3") + assert frozenset() == ( + prog.root_kernel.id_to_insn["insn3"].no_sync_with) + assert frozenset([("insn3", "local")]) == ( + prog.root_kernel.id_to_insn["insn4"].no_sync_with) # Bidirectional - knl = lp.add_nosync( - orig_knl, "local", "writes:tmp3", "reads:tmp3", bidirectional=True) - assert frozenset([("insn4", "local")]) == knl.id_to_insn["insn3"].no_sync_with - assert frozenset([("insn3", "local")]) == knl.id_to_insn["insn4"].no_sync_with + prog = lp.add_nosync( + orig_prog, "local", "writes:tmp3", "reads:tmp3", bidirectional=True) + assert frozenset([("insn4", "local")]) == ( + prog.root_kernel.id_to_insn["insn3"].no_sync_with) + assert frozenset([("insn3", "local")]) == ( + 
prog.root_kernel.id_to_insn["insn4"].no_sync_with) # Groups - knl = lp.add_nosync(orig_knl, "local", "insn5", "insn6") - assert frozenset([("insn5", "local")]) == knl.id_to_insn["insn6"].no_sync_with + prog = lp.add_nosync(orig_prog, "local", "insn5", "insn6") + assert frozenset([("insn5", "local")]) == ( + prog.root_kernel.id_to_insn["insn6"].no_sync_with) def test_uniquify_instruction_ids(): @@ -522,12 +530,14 @@ def test_uniquify_instruction_ids(): i3 = lp.Assignment("b", 1, id=lp.UniqueName("b")) i4 = lp.Assignment("b", 1, id=lp.UniqueName("b")) - knl = lp.make_kernel("{[i]: i = 1}", []).copy(instructions=[i1, i2, i3, i4]) + prog = lp.make_kernel("{[i]: i = 1}", []) + new_root_kernel = prog.root_kernel.copy(instructions=[i1, i2, i3, i4]) + prog = prog.with_root_kernel(new_root_kernel) from loopy.transform.instruction import uniquify_instruction_ids - knl = uniquify_instruction_ids(knl) + prog = uniquify_instruction_ids(prog) - insn_ids = set(insn.id for insn in knl.instructions) + insn_ids = set(insn.id for insn in prog.root_kernel.instructions) assert len(insn_ids) == 4 assert all(isinstance(id, str) for id in insn_ids) diff --git a/test/testlib.py b/test/testlib.py index ad290ee7..eebc792d 100644 --- a/test/testlib.py +++ b/test/testlib.py @@ -1,4 +1,5 @@ import loopy as lp +import numpy as np # {{{ test_barrier_in_overridden_get_grid_size_expanded_kernel @@ -8,8 +9,9 @@ class GridOverride(object): self.clean = clean self.vecsize = vecsize - def __call__(self, insn_ids, ignore_auto=True): - gsize, _ = self.clean.get_grid_sizes_for_insn_ids(insn_ids, ignore_auto) + def __call__(self, insn_ids, program_callables_info, ignore_auto=True): + gsize, _ = self.clean.get_grid_sizes_for_insn_ids(insn_ids, + program_callables_info, ignore_auto) return gsize, (self.vecsize,) # }}} @@ -132,4 +134,48 @@ class SeparateTemporariesPreambleTestPreambleGenerator( # }}} + +# {{{ test_register_function_lookup + +class Log2Callable(lp.ScalarCallable): + + def with_types(self, 
arg_id_to_dtype, kernel, program_callables_info): + + if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + program_callables_info) + + dtype = arg_id_to_dtype[0].numpy_dtype + + if dtype.kind in ('u', 'i'): + # ints and unsigned casted to float32 + dtype = np.float32 + + from loopy.target.opencl import OpenCLTarget + name_in_target = "log2" + if not isinstance(kernel.target, OpenCLTarget): + # for CUDA, C Targets the name must be modified + if dtype == np.float32: + name_in_target = "log2f" + elif dtype == np.float128: + name_in_target = "log2l" + + from loopy.types import NumpyType + return ( + self.copy(name_in_target=name_in_target, + arg_id_to_dtype={0: NumpyType(dtype), -1: + NumpyType(dtype)}), + program_callables_info) + + +def register_log2_lookup(target, identifier): + if identifier == 'log2': + return Log2Callable(name='log2') + return None + +# }}} + # vim: foldmethod=marker -- GitLab From ee6214767d96b9b4a7d240c5ed8affed2137ec6e Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 12 Aug 2018 16:38:50 +0530 Subject: [PATCH 04/80] adding untracked files. 
--- doc/ref_call.rst | 191 +++++++ loopy/kernel/function_interface.py | 867 +++++++++++++++++++++++++++++ loopy/program.py | 684 +++++++++++++++++++++++ loopy/transform/callable.py | 707 +++++++++++++++++++++++ test/test_callables.py | 414 ++++++++++++++ 5 files changed, 2863 insertions(+) create mode 100644 doc/ref_call.rst create mode 100644 loopy/kernel/function_interface.py create mode 100644 loopy/program.py create mode 100644 loopy/transform/callable.py create mode 100644 test/test_callables.py diff --git a/doc/ref_call.rst b/doc/ref_call.rst new file mode 100644 index 00000000..4ff1ef2f --- /dev/null +++ b/doc/ref_call.rst @@ -0,0 +1,191 @@ +Calling Loopy Kernels and External Functions +============================================ + +Goals of a function interface +----------------------------- + +- Must be able to have complete information of the function just through the + expression node. +- Must adhere to :mod:`loopy` semantics of immutability. +- Must have a class instance linked with the expression node which would record + the properties of the function. +- Must indicate in the expression if the function is known to the kernel. (This + is intended to be done by making the function expression node an instance of + ``ResolvedFunction`` as soon as the function definition is resolved by the + kernel) +- Function overloading is not encouraged in :mod:`loopy` as it gives rise to + contention while debugging with the help of the kernel intermediate + representation and hence if the expression nodes point to different function + instances they must differ in their representation. For example: ``float + sin(float )`` and ``double sin(double )`` should diverge by having different + identifiers as soon as data type of the argument is inferred. +- Must have an interface to register external functions.
+ + +Scoped Function and resolving +----------------------------- + +``ResolvedFunctions`` are pymbolic nodes within expressions in a ``Loo.py`` +kernel, whose name has been resolved by the kernel. The process of matching a +function identifier with the function definition is called "resolving". + +A pymbolic ``Call`` node can be converted to a ``ResolvedFunction`` if it +is "resolved" by one of the ``function_scoper`` in a +:attr:`LoopKernel.scoped_functions` + +- Functions already registered by the target. Some examples include -- + ``sin()``, ``cos()``, ``exp()``, ``max()`` (for C-Targets.) +- Functions that are defined in ``Loo.py`` and are realized into + different set of instructions during code generation. Some examples + include ``make_tuple``, ``ArgExtOp``, ``index_of``, ... +- Functions registered as ``CallableKernels`` using + ``lp.register_callable_kernel(...)``. +- Functions that have been provided through + ``lp.register_function_scoper(...)`` +- Functions that can be made known from the user through + ``lp.register_function_mangler``. This is planned to be deprecated, + as its functionality is superseded by + ``lp.register_function_scoper(...)``. + +Expressions after a function is scoped +-------------------------------------- + +Consider the following expression. + +:: + + sin(a[i]) + unknown_func(b[i]) + callable_knl_func(c[i])*mangler_call(d[i]) + +During the kernel creation phase, the kernel would know that ``sin`` is +a function known to the target and hence it should be scoped.
And as +expected, after ``make_kernel`` has been called the above expression +would get converted to: + +:: + + ResolvedFunction(Variable('sin'))(a[i]) + unknown_func(b[i]) + + callable_knl_func(c[i])*mangler_call(d[i]) + +This would also make an entry in the kernel's ``scoped_functions`` +dictionary as: + +:: + + {Variable('sin'): ScalarCallable(name='sin', arg_id_to_dtype=None, + arg_id_to_descr=None, name_in_target=None)} + +It might be noteworthy that at this step, it only scopes functions +through their names without any information about the types of the +function. + +Once the user calls the transformation: +``lp.register_callable_kernel(knl, 'callable_knl_func', callee_knl)``, +the expression gets converted to: + +:: + + ResolvedFunction(Variable('sin'))(a[i]) + unknown_func(b[i]) + + ResolvedFunction('callable_knl_func')(c[i])*mangler_call(d[i]) + +This also makes an entry in the ``scoped_functions`` dictionary as -- + +:: + + {Variable('sin'): ScalarCallable(name='sin', arg_id_to_dtype=None, + arg_id_to_descr=None, name_in_target=None), + Variable('callable_knl_func'): CallableKernel(subkernel=LoopKernel(...), + arg_id_to_dtype=None, arg_id_to_descr=None, name_in_target=None)} + +Now, if the user calls +``register_function_mangler(knl, 'mangler_call')``, one might expect +that the mangler call function should get scoped, but that does **not** +happen, because the "old" ``function_manglers``, would return a match +only if all the parameters of the function match viz. name, argument +arity and argument types. Hence, the ``scoped_functions`` dictionary +would remain unchanged. + +``ResolvedFunctions`` and specializations +----------------------------------------- + +Consider the same ``ResolvedFunction('sin')`` as above. This function, +although scoped, does not know the types, i.e. it does not yet know, +for a ``C-Target``, whether it should emit ``sin`` or ``sinf`` or +``sinl``.
Hence, right now the function can be called as a +"type-generic" function as further in the pipeline it can take any one +of the above definitions. The functions go through "specialization" +processes at various points in the pipeline, where the attributes of the +callables are resolved. + +- During type inference, the functions go through type specialization + wherein the ``arg_id_to_dtype`` of the functions is realized. +- During descriptor inference, the functions go through a description + specialization where the ``arg_id_to_descr`` is populated. The + ``arg_id_to_descr`` contains important information regarding shape, + strides and scope of the arguments which form an important part of + ``CallableKernel`` as this information would be helpful to + generate the function signature and make changes to the data access + pattern of the variables in the callee kernel. +- Whenever a ``ResolvedFunction`` goes through a specialization, this is + indicated by changing the name in the ``pymbolic`` node. + +If, during type inference, it is inferred that the type of ``a[i]`` is +``np.float32``, the new ``pymbolic`` node would be: + +:: + + ResolvedFunction('sin_0')(a[i]) + ... + +This name change is done so that it indicates that the node points to a +different ``ScalarCallable`` in the dictionary. And hence a new entry is +added to the ``scoped_functions`` dictionary as: + +:: + + {'sin': ScalarCallable(name='sin', arg_id_to_dtype=None, + arg_id_to_descr=None, name_in_target=None), + Variable('callable_knl_func'): CallableKernel(subkernel=LoopKernel(...), + arg_id_to_dtype=None, arg_id_to_descr=None, name_in_target=None), + 'sin_0': ScalarCallable(name='sin', arg_id_to_dtype={0:np.float32, + -1: np.float32}, arg_id_to_descr=None, name_in_target='sinf')} + +Description Inference +--------------------- + +Although this step has no significance for a ``ScalarCallable``, it +forms a very important part of ``CallableKernel``.
In this step, the +``dim_tags``, ``shape`` and ``address_space`` of the arguments of the +callable kernel are altered. + +- The ``dim_tags`` attribute helps to ensure that the memory layout + between the caller and the callee kernel is coherent. +- The ``address_space`` attribute ensures that, while writing the device + code, we emit the appropriate scope qualifiers for the function + declaration arguments. +- The ``shape`` attribute helps in: + + - Storage allocation. + - Memory layout. + - Catching out-of-bounds accesses in ``Loo.py``. + +Hence, in the ``Loo.py`` pipeline, one might expect the following +developments of the ``sin`` pymbolic call expression node. + +:: + + sin -> (Kernel creation) -> ResolvedFunction(Variable('sin')) -> + (Type Inference) -> ResolvedFunction(Variable('sin_0')) -> + (Descriptor Inference) -> ResolvedFunction(Variable('sin_1')) + +Changes on the target side to accommodate the new function interface +-------------------------------------------------------------------- + +The earlier "function\_mangler" as a member method of the class +``lp.ASTBuilderBase`` will be replaced by ``function_scopers``. The +function scopers would return a list of functions with the signature +``(target, identifier)->lp.InKernelCallable``. + +An example: Calling BLAS +------------------------ + +.. 
literalinclude:: ../examples/python/external-call.py + diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py new file mode 100644 index 00000000..2ea26065 --- /dev/null +++ b/loopy/kernel/function_interface.py @@ -0,0 +1,867 @@ +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2018 Andreas Klöckner, Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+""" + + +import re +import six + +from six.moves import zip + +from pytools import ImmutableRecord +from loopy.diagnostic import LoopyError + +from loopy.symbolic import parse_tagged_name + +from loopy.symbolic import (ResolvedFunction, SubstitutionRuleMappingContext, + RuleAwareIdentityMapper, SubstitutionRuleExpander) + +from loopy.kernel import LoopKernel + + +# {{{ argument descriptors + +class ValueArgDescriptor(ImmutableRecord): + hash_fields = () + + update_persistent_hash = LoopKernel.update_persistent_hash + + +class ArrayArgDescriptor(ImmutableRecord): + """ + Records information about an array argument to an in-kernel callable, to be + passed to and returned from + :meth:`loopy.kernel.function_interface.InKernelCallable.with_descrs`, used + for matching shape and scope of caller and callee kernels. + + ..attribute:: shape + + Shape of the array. + + .. attribute:: address_space + + An attribute of :class:`loopy.kernel.data.AddressSpace`. + + .. attribute:: dim_tags + + A tuple of instances of :class:`loopy.kernel.array._StrideArrayDimTagBase` + """ + + fields = set(['shape', 'address_space', 'dim_tags']) + + def __init__(self, shape, address_space, dim_tags): + + # {{{ sanity checks + + from loopy.kernel.array import FixedStrideArrayDimTag + + assert isinstance(shape, tuple) + assert isinstance(dim_tags, tuple) + + # FIXME at least vector dim tags should be supported + assert all(isinstance(dim_tag, FixedStrideArrayDimTag) for dim_tag in + dim_tags) + + # }}} + + super(ArrayArgDescriptor, self).__init__( + shape=shape, + address_space=address_space, + dim_tags=dim_tags) + + hash_fields = ( + "shape", + "address_space", + "dim_tags") + + update_persistent_hash = LoopKernel.update_persistent_hash + +# }}} + + +# {{{ helper function for in-kernel callables + +def get_kw_pos_association(kernel): + """ + Returns a tuple of ``(kw_to_pos, pos_to_kw)`` for the arguments in + *kernel*. 
+ """ + from loopy.kernel.tools import infer_arg_is_output_only + kernel = infer_arg_is_output_only(kernel) + kw_to_pos = {} + pos_to_kw = {} + + read_count = 0 + write_count = -1 + + for arg in kernel.args: + if not arg.is_output_only: + kw_to_pos[arg.name] = read_count + pos_to_kw[read_count] = arg.name + read_count += 1 + else: + kw_to_pos[arg.name] = write_count + pos_to_kw[write_count] = arg.name + write_count -= 1 + + return kw_to_pos, pos_to_kw + + +class GridOverrideForCalleeKernel(ImmutableRecord): + """ + Helper class to set the + :attr:`loopy.kernel.LoopKernel.override_get_grid_size_for_insn_ids` of the + callee kernels. Refer + :func:`loopy.kernel.function_interface.GridOverrideForCalleeKernel.__call__`, + :func:`loopy.kernel.function_interface.CallbleKernel.with_hw_axes_sizes`. + + .. attribute:: local_size + + The local work group size that has to be set in the callee kernel. + + .. attribute:: global_size + + The global work group size that to be set in the callee kernel. + + .. note:: + + This class acts as a pseduo-callable and its significance lies in + solving picklability issues. + """ + fields = set(["local_size", "global_size"]) + + def __init__(self, local_size, global_size): + self.local_size = local_size + self.global_size = global_size + + def __call__(self, insn_ids, program_callables_info, ignore_auto=True): + return self.local_size, self.global_size + +# }}} + + +# {{{ template class + +class InKernelCallable(ImmutableRecord): + """ + An abstract interface to define a callable encountered in a kernel. + + .. attribute:: name + + The name of the callable which can be encountered within a kernel. + + .. attribute:: arg_id_to_dtype + + A mapping which indicates the arguments types and result types it would + be handling. This would be set once the callable is type specialized. + + .. attribute:: arg_id_to_descr + + A mapping which gives indicates the argument shape and ``dim_tags`` it + would be responsible for generating code. 
These parameters would be set, + once it is shape and stride(``dim_tags``) specialized. + + .. note:: + + Negative "id" values ``-i`` in the mapping attributes indicate + return value with (0-based) index *i*. + + .. automethod:: __init__ + .. automethod:: with_types + .. automethod:: with_descrs + .. automethod:: with_target + .. automethod:: with_hw_axes_sizes + .. automethod:: generate_preambles + .. automethod:: emit_call + .. automethod:: emit_call_insn + .. automethod:: is_ready_for_codegen + """ + + fields = set(["arg_id_to_dtype", "arg_id_to_descr"]) + init_arg_names = ("arg_id_to_dtype", "arg_id_to_descr") + + def __init__(self, arg_id_to_dtype=None, arg_id_to_descr=None): + + super(InKernelCallable, self).__init__( + arg_id_to_dtype=arg_id_to_dtype, + arg_id_to_descr=arg_id_to_descr) + + def __getinitargs__(self): + return (self.arg_id_to_dtype, self.arg_id_to_descr) + + update_persistent_hash = LoopKernel.update_persistent_hash + + def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info): + """ + :arg arg_id_to_type: a mapping from argument identifiers + (integers for positional arguments, names for keyword + arguments) to :class:`loopy.types.LoopyType` instances. + Unspecified/unknown types are not represented in *arg_id_to_type*. + + Return values are denoted by negative integers, with the + first returned value identified as *-1*. + + :returns: a tuple ``(new_self, arg_id_to_type)``, where *new_self* is a + new :class:`InKernelCallable` specialized for the given types, + and *arg_id_to_type* is a mapping of the same form as the + argument above, however it may have more information present. + Any argument information exists both by its positional and + its keyword identifier. 
+ """ + # FIXME: In all these with_** functions add that also passes a + # program_callables_info + + raise NotImplementedError() + + def with_descrs(self, arg_id_to_descr, program_callables_info): + """ + :arg arg_id_to_descr: a mapping from argument identifiers + (integers for positional arguments, names for keyword + arguments) to :class:`loopy.ArrayArgDescriptor` instances. + Unspecified/unknown types are not represented in *arg_id_to_descr*. + + Return values are denoted by negative integers, with the + first returned value identified as *-1*. + + :returns: a copy of *self* which is a new instance of + :class:`InKernelCallable` specialized for the given types, and + *arg_id_to_descr* is a mapping of the same form as the argument above, + however it may have more information present. Any argument information + exists both by its positional and its keyword identifier. + """ + + raise NotImplementedError() + + def with_target(self, target): + """ + Returns a copy of *self* with all the ``dtypes`` in + ``in_knl_callable.arg_id_to_dtype`` associated with the *target*. Refer + :meth:`loopy.types.LoopyType.with_target`. + + :arg target: An instance of :class:`loopy.target.TargetBase`. + """ + + if target is None: + raise LoopyError("target cannot be None for with_target") + + def with_target_if_not_None(dtype): + """ + Returns a copy of :arg:`dtype` associated with the target. If + ``dtype`` is *None* returns *None*. + """ + if dtype: + return dtype.with_target(target) + else: + return None + + new_arg_id_to_dtype = None + if self.arg_id_to_dtype is not None: + new_arg_id_to_dtype = dict((id, with_target_if_not_None(dtype)) for id, + dtype in self.arg_id_to_dtype.items()) + + return self.copy(arg_id_to_dtype=new_arg_id_to_dtype) + + def with_hw_axes_sizes(self, local_size, global_size): + """ + Returns a copy of *self* with modifications to comply with the grid + sizes ``(local_size, global_size)`` of the kernel in which it is + supposed to be called. 
+ + :arg local_size: An instance of :class:`islpy.PwAff`. + :arg global_size: An instance of :class:`islpy.PwAff`. + """ + raise NotImplementedError() + + def is_ready_for_codegen(self): + + return (self.arg_id_to_dtype is not None and + self.arg_id_to_descr is not None) + + def generate_preambles(self, target): + """ Yields the target specific preamble. + """ + raise NotImplementedError() + + def emit_call(self, expression_to_code_mapper, expression, target): + + raise NotImplementedError() + + def emit_call_insn(self, insn, target, expression_to_code_mapper): + """ + Returns a tuple of ``(call, assignee_is_returned)`` which is the target + facing function call that would be seen in the generated code. ``call`` + is an instance of ``pymbolic.primitives.Call`` ``assignee_is_returned`` + is an instance of :class:`bool` to indicate if the assignee is returned + by value of C-type targets. + + *Example:* If ``assignee_is_returned=True``, then ``a, b = f(c, d)`` is + interpreted in the target as ``a = f(c, d, &b)``. If + ``assignee_is_returned=False``, then ``a, b = f(c, d)`` is interpreted + in the target as the statement ``f(c, d, &a, &b)``. + """ + + raise NotImplementedError() + + def __hash__(self): + + return hash(tuple(self.fields)) + +# }}} + + +# {{{ scalar callable + +class ScalarCallable(InKernelCallable): + """ + An abstranct interface the to a scalar callable encountered in a kernel. + + .. note:: + + The :meth:`ScalarCallable.with_types` is intended to assist with type + specialization of the funciton and is expected to be supplemented in the + derived subclasses. 
+ """ + + fields = set(["name", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target"]) + init_arg_names = ("name", "arg_id_to_dtype", "arg_id_to_descr", + "name_in_target") + hash_fields = fields + + def __init__(self, name, arg_id_to_dtype=None, + arg_id_to_descr=None, name_in_target=None): + + super(ScalarCallable, self).__init__( + arg_id_to_dtype=arg_id_to_dtype, + arg_id_to_descr=arg_id_to_descr) + + self.name = name + self.name_in_target = name_in_target + + def __getinitargs__(self): + return (self.arg_id_to_dtype, self.arg_id_to_descr, + self.name_in_target) + + def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info): + raise LoopyError("No type inference information present for " + "the function %s." % (self.name)) + + def with_descrs(self, arg_id_to_descr, program_callables_info): + + arg_id_to_descr[-1] = ValueArgDescriptor() + return ( + self.copy(arg_id_to_descr=arg_id_to_descr), + program_callables_info) + + def with_hw_axes_sizes(self, global_size, local_size): + return self.copy() + + def is_ready_for_codegen(self): + + return (self.arg_id_to_dtype is not None and + self.arg_id_to_descr is not None) + + # {{{ code generation + + def emit_call(self, expression_to_code_mapper, expression, target): + + assert self.is_ready_for_codegen() + + # must have single assignee + assert len(expression.parameters) == len(self.arg_id_to_dtype) - 1 + arg_dtypes = tuple(self.arg_id_to_dtype[id] for id in + range(len(self.arg_id_to_dtype)-1)) + + par_dtypes = tuple(expression_to_code_mapper.infer_type(par) for par in + expression.parameters) + + from loopy.expression import dtype_to_type_context + # processing the parameters with the required dtypes + processed_parameters = tuple( + expression_to_code_mapper.rec(par, + dtype_to_type_context(target, tgt_dtype), + tgt_dtype) + for par, par_dtype, tgt_dtype in zip( + expression.parameters, par_dtypes, arg_dtypes)) + + from pymbolic import var + return 
var(self.name_in_target)(*processed_parameters) + + def emit_call_insn(self, insn, target, expression_to_code_mapper): + """ + Returns a pymbolic call for C-based targets, when the instructions + involve multiple return values along with the required type casting. + The first assignee is returned, but the rest of them are appended to + the parameters and passed by reference. + + *Example:* ``c, d = f(a, b)`` is returned as ``c = f(a, b, &d)`` + + :arg insn: An instance of :class:`loopy.kernel.instructions.CallInstruction`. + :arg target: An instance of :class:`loopy.target.TargetBase`. + :arg expression_to_code_mapper: An instance of :class:`IdentityMapper` + responsible for code mapping from :mod:`loopy` syntax to the + **target syntax**. + """ + + # Currently this is formulated such that the first argument is returned + # and rest all are passed by reference as arguments to the function. + assert self.is_ready_for_codegen() + + from loopy.kernel.instruction import CallInstruction + + assert isinstance(insn, CallInstruction) + + parameters = insn.expression.parameters + assignees = insn.assignees[1:] + + par_dtypes = tuple(expression_to_code_mapper.infer_type(par) for par in + parameters) + arg_dtypes = tuple(self.arg_id_to_dtype[i] for i, _ in + enumerate(parameters)) + + assignee_dtypes = tuple(self.arg_id_to_dtype[-i-2] for i, _ in + enumerate(assignees)) + + from loopy.expression import dtype_to_type_context + from pymbolic.mapper.stringifier import PREC_NONE + from pymbolic import var + + c_parameters = [ + expression_to_code_mapper(par, PREC_NONE, + dtype_to_type_context(target, tgt_dtype), + tgt_dtype).expr + for par, par_dtype, tgt_dtype in zip( + parameters, par_dtypes, arg_dtypes)] + + for i, (a, tgt_dtype) in enumerate(zip(assignees, assignee_dtypes)): + if tgt_dtype != expression_to_code_mapper.infer_type(a): + raise LoopyError("Type Mismatch in function %s. 
Expected: %s" + "Got: %s" % (self.name, tgt_dtype, + expression_to_code_mapper.infer_type(a))) + c_parameters.append( + var("&")( + expression_to_code_mapper(a, PREC_NONE, + dtype_to_type_context(target, tgt_dtype), + tgt_dtype).expr)) + + # assignee is returned whenever the size of assignees is non zero. + assignee_is_returned = len(assignees) > 0 + + return var(self.name_in_target)(*c_parameters), assignee_is_returned + + def generate_preambles(self, target): + return + yield + + # }}} + +# }}} + + +# {{{ callable kernel + +class CallableKernel(InKernelCallable): + """ + Records informations about a callee kernel. Also provides interface through + member methods to make the callee kernel compatible to be called from a + caller kernel. The :meth:`loopy.register_callable_kernel` should be called + in order to initiate association between a function in caller kernel and + the callee kernel. + + :meth:`CallableKernel.with_types` should be called in order to match + the ``dtypes`` of the arguments that are shared between the caller and the + callee kernel. + + :meth:`CallableKernel.with_descrs` should be called in order to match + :attr:`ArrayArgDescriptor.dim_tags`, :attr:`ArrayArgDescriptor.shape`, + :attr:`ArrayArgDescriptor.address_space`` of the arguments shared between the + caller and the callee kernel. + + :meth:`CallableKernel.with_hw_axes` should be called to set the grid + sizes for the :attr:`subkernel` of the callable. 
+ """ + + fields = set(["subkernel", "arg_id_to_dtype", "arg_id_to_descr"]) + init_arg_names = ("subkernel", "arg_id_to_dtype", "arg_id_to_descr") + hash_fields = fields + + def __init__(self, subkernel, arg_id_to_dtype=None, + arg_id_to_descr=None): + assert isinstance(subkernel, LoopKernel) + + super(CallableKernel, self).__init__( + arg_id_to_dtype=arg_id_to_dtype, + arg_id_to_descr=arg_id_to_descr) + + self.subkernel = subkernel.copy( + args=[arg.copy(dtype=arg.dtype.with_target(subkernel.target)) + if arg.dtype is not None else arg for arg in subkernel.args]) + + def __getinitargs__(self): + return (self.subkernel, self.arg_id_to_dtype, + self.arg_id_to_descr) + + @property + def name(self): + return self.subkernel.name + + def with_types(self, arg_id_to_dtype, caller_kernel, + program_callables_info): + kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) + + new_args = [] + for arg in self.subkernel.args: + kw = arg.name + if kw in arg_id_to_dtype: + # id exists as kw + new_args.append(arg.copy(dtype=arg_id_to_dtype[kw])) + elif kw_to_pos[kw] in arg_id_to_dtype: + # id exists as positional argument + new_args.append(arg.copy( + dtype=arg_id_to_dtype[kw_to_pos[kw]])) + else: + new_args.append(arg) + + from loopy.type_inference import ( + infer_unknown_types_for_a_single_kernel) + pre_specialized_subkernel = self.subkernel.copy( + args=new_args) + + # infer the types of the written variables based on the knowledge + # of the types of the arguments supplied + specialized_kernel, program_callables_info = ( + infer_unknown_types_for_a_single_kernel( + pre_specialized_subkernel, + program_callables_info, + expect_completion=True)) + + new_arg_id_to_dtype = {} + for arg in specialized_kernel.args: + # associate the updated_arg_id_to_dtype with keyword as well as + # positional id. 
+ new_arg_id_to_dtype[arg.name] = arg.dtype + new_arg_id_to_dtype[kw_to_pos[arg.name]] = arg.dtype + + # Return the kernel call with specialized subkernel and the corresponding + # new arg_id_to_dtype + return self.copy(subkernel=specialized_kernel, + arg_id_to_dtype=new_arg_id_to_dtype), program_callables_info + + def with_descrs(self, arg_id_to_descr, program_callables_info): + + # tune the subkernel so that we have the matching shapes and + # dim_tags + + new_args = self.subkernel.args[:] + kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) + + for arg_id, descr in arg_id_to_descr.items(): + if isinstance(arg_id, int): + arg_id = pos_to_kw[arg_id] + assert isinstance(arg_id, str) + + if isinstance(descr, ArrayArgDescriptor): + new_arg = self.subkernel.arg_dict[arg_id].copy( + shape=descr.shape, + dim_tags=descr.dim_tags, + address_space=descr.address_space) + # replacing the new arg with the arg of the same name + new_args = [new_arg if arg.name == arg_id else arg for arg in + new_args] + elif isinstance(descr, ValueArgDescriptor): + pass + else: + raise LoopyError("Descriptor must be either an instance of " + "ArrayArgDescriptor or ValueArgDescriptor -- got %s." 
% + type(descr)) + descriptor_specialized_knl = self.subkernel.copy(args=new_args) + from loopy.preprocess import traverse_to_infer_arg_descr + descriptor_specialized_knl, program_callables_info = ( + traverse_to_infer_arg_descr(descriptor_specialized_knl, + program_callables_info)) + + return ( + self.copy( + subkernel=descriptor_specialized_knl, + arg_id_to_descr=arg_id_to_descr), + program_callables_info) + + def with_packing_for_args(self): + from loopy.kernel.data import AddressSpace + kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) + + arg_id_to_descr = {} + + for pos, kw in pos_to_kw.items(): + arg = self.subkernel.arg_dict[kw] + arg_id_to_descr[pos] = ArrayArgDescriptor( + shape=arg.shape, + dim_tags=arg.dim_tags, + address_space=AddressSpace.GLOBAL) + + return self.copy(subkernel=self.subkernel, + arg_id_to_descr=arg_id_to_descr) + + def with_hw_axes_sizes(self, gsize, lsize): + return self.copy( + subkernel=self.subkernel.copy( + overridden_get_grid_sizes_for_insn_ids=( + GridOverrideForCalleeKernel(lsize, gsize)))) + + def is_ready_for_codegen(self): + return (self.arg_id_to_dtype is not None and + self.arg_id_to_descr is not None) + + def generate_preambles(self, target): + """ Yields the *target* specific preambles. + """ + # FIXME Check that this is correct. 
+ + return + yield + + def emit_call_insn(self, insn, target, expression_to_code_mapper): + + assert self.is_ready_for_codegen() + + from loopy.kernel.instruction import CallInstruction + from pymbolic.primitives import CallWithKwargs + + assert isinstance(insn, CallInstruction) + + parameters = insn.expression.parameters + kw_parameters = {} + if isinstance(insn.expression, CallWithKwargs): + kw_parameters = insn.expression.kw_parameters + + assignees = insn.assignees + + parameters = list(parameters) + par_dtypes = [self.arg_id_to_dtype[i] for i, _ in enumerate(parameters)] + kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) + for i in range(len(parameters), len(parameters)+len(kw_parameters)): + parameters.append(kw_parameters[pos_to_kw[i]]) + par_dtypes.append(self.arg_id_to_dtype[pos_to_kw[i]]) + + # insert the assigness at the required positions + assignee_write_count = -1 + for i, arg in enumerate(self.subkernel.args): + if arg.is_output_only: + assignee = assignees[-assignee_write_count-1] + parameters.insert(i, assignee) + par_dtypes.insert(i, self.arg_id_to_dtype[assignee_write_count]) + assignee_write_count -= 1 + + # no type casting in array calls + from loopy.expression import dtype_to_type_context + from pymbolic.mapper.stringifier import PREC_NONE + from loopy.symbolic import SubArrayRef + from pymbolic import var + + c_parameters = [ + expression_to_code_mapper(par, PREC_NONE, + dtype_to_type_context(target, par_dtype), + par_dtype).expr if isinstance(par, SubArrayRef) else + expression_to_code_mapper(par, PREC_NONE, + dtype_to_type_context(target, par_dtype), + par_dtype).expr + for par, par_dtype in zip( + parameters, par_dtypes)] + + return var(self.subkernel.name)(*c_parameters), False + +# }}} + + +# {{{ mangler callable + +class ManglerCallable(ScalarCallable): + """ + A callable whose characateristic is defined by a function mangler. + + .. 
attribute:: function_mangler + + A function of signature ``(kernel, name , arg_dtypes)`` and returns an + instance of ``loopy.CallMangleInfo``. + """ + fields = set(["name", "function_mangler", "arg_id_to_dtype", "arg_id_to_descr", + "name_in_target"]) + init_arg_names = ("name", "function_mangler", "arg_id_to_dtype", + "arg_id_to_descr", "name_in_target") + hash_fields = set(["name", "arg_id_to_dtype", "arg_id_to_descr", + "name_in_target"]) + + def __init__(self, name, function_mangler, arg_id_to_dtype=None, + arg_id_to_descr=None, name_in_target=None): + + self.function_mangler = function_mangler + + super(ManglerCallable, self).__init__( + name=name, + arg_id_to_dtype=arg_id_to_dtype, + arg_id_to_descr=arg_id_to_descr, + name_in_target=name_in_target) + + def __getinitargs__(self): + return (self.name, self.function_mangler, self.arg_id_to_dtype, + self.arg_id_to_descr, self.name_in_target) + + def with_types(self, arg_id_to_dtype, kernel, program_callables_info): + if self.arg_id_to_dtype is not None: + # specializing an already specialized function. + for arg_id, dtype in arg_id_to_dtype.items(): + # only checking for the ones which have been provided + # if does not match, returns an error. 
+ if self.arg_id_to_dtype[arg_id] != arg_id_to_dtype[arg_id]: + raise LoopyError("Overwriting a specialized" + " function is illegal--maybe start with new instance of" + " ManglerCallable?") + + sorted_keys = sorted(arg_id_to_dtype.keys()) + arg_dtypes = tuple(arg_id_to_dtype[key] for key in sorted_keys if + key >= 0) + + mangle_result = self.function_mangler(kernel, self.name, + arg_dtypes) + if mangle_result: + new_arg_id_to_dtype = dict(enumerate(mangle_result.arg_dtypes)) + new_arg_id_to_dtype.update(dict((-i-1, dtype) for i, dtype in + enumerate(mangle_result.result_dtypes))) + return ( + self.copy(name_in_target=mangle_result.target_name, + arg_id_to_dtype=new_arg_id_to_dtype), + program_callables_info) + else: + # The function mangler does not agree with the arg id to dtypes + # provided. Indicating that is illegal. + raise LoopyError("Function %s not coherent with the provided types." % ( + self.name, kernel.target)) + + def mangle_result(self, kernel): + """ + Returns an instance of :class:`loopy.kernel.data.CallMangleInfo` for + the given pair :attr:`function_mangler` and :attr:`arg_id_to_dtype`. + """ + sorted_keys = sorted(self.arg_id_to_dtype.keys()) + arg_dtypes = tuple(self.arg_id_to_dtype[key] for key in sorted_keys if + key >= 0) + + return self.function_mangler(kernel, self.name, arg_dtypes) + +# }}} + + +# {{{ new pymbolic calls to scoped functions + +def next_indexed_variable(function): + """ + Returns an instance of :class:`str` with the next indexed-name in the + sequence for the name of *function*. + + *Example:* ``Variable('sin_0')`` will return ``'sin_1'``. + + :arg function: Either an instance of :class:`pymbolic.primitives.Variable` + or :class:`loopy.reduction.ArgExtOp` or + :class:`loopy.reduction.SegmentedOp`. 
+ """ + from loopy.library.reduction import ArgExtOp, SegmentedOp + if isinstance(function, (ArgExtOp, SegmentedOp)): + return function.copy() + func_name = re.compile(r"^(?P<alpha>\S+?)_(?P<num>\d+?)$") + + match = func_name.match(function.name) + + if match is None: + if function.name[-1] == '_': + return "{old_name}0".format(old_name=function.name) + else: + return "{old_name}_0".format(old_name=function.name) + + return "{alpha}_{num}".format(alpha=match.group('alpha'), + num=int(match.group('num'))+1) + + +class FunctionNameChanger(RuleAwareIdentityMapper): + """ + Changes the names of scoped functions in calls of expressions according to + the mapping ``calls_to_new_names`` + """ + + def __init__(self, rule_mapping_context, calls_to_new_names, + subst_expander): + super(FunctionNameChanger, self).__init__(rule_mapping_context) + self.calls_to_new_names = calls_to_new_names + self.subst_expander = subst_expander + + def map_call(self, expr, expn_state): + name, tag = parse_tagged_name(expr.function) + + if name not in self.rule_mapping_context.old_subst_rules: + expanded_expr = self.subst_expander(expr) + if expr in self.calls_to_new_names: + return type(expr)( + ResolvedFunction(self.calls_to_new_names[expr]), + tuple(self.rec(child, expn_state) + for child in expr.parameters)) + elif expanded_expr in self.calls_to_new_names: + # FIXME: this is horribly wrong logic. 
+ # investigate how to make edits to a substitution rule + return type(expr)( + ResolvedFunction(self.calls_to_new_names[expanded_expr]), + tuple(self.rec(child, expn_state) + for child in expanded_expr.parameters)) + else: + return super(FunctionNameChanger, self).map_call( + expr, expn_state) + else: + return self.map_substitution(name, tag, expr.parameters, expn_state) + + def map_call_with_kwargs(self, expr, expn_state): + + if expr in self.calls_to_new_names: + return type(expr)( + ResolvedFunction(self.calls_to_new_names[expr]), + tuple(self.rec(child, expn_state) + for child in expr.parameters), + dict( + (key, self.rec(val, expn_state)) + for key, val in six.iteritems(expr.kw_parameters)) + ) + else: + return super(FunctionNameChanger, self).map_call_with_kwargs( + expr, expn_state) + + +def change_names_of_pymbolic_calls(kernel, pymbolic_calls_to_new_names): + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + subst_expander = SubstitutionRuleExpander(kernel.substitutions) + name_changer = FunctionNameChanger(rule_mapping_context, + pymbolic_calls_to_new_names, subst_expander) + + return rule_mapping_context.finish_kernel( + name_changer.map_kernel(kernel)) + +# }}} + + +# vim: foldmethod=marker diff --git a/loopy/program.py b/loopy/program.py new file mode 100644 index 00000000..096bd1ec --- /dev/null +++ b/loopy/program.py @@ -0,0 +1,684 @@ +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2012 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following 
class ResolvedFunctionMarker(RuleAwareIdentityMapper):
    """
    Mapper that replaces the ``function`` attribute of call expressions with
    a :class:`loopy.symbolic.ResolvedFunction` whenever the called identifier
    is known. An identifier counts as known if any of the callables in
    *function_id_to_in_knl_callable_mappers* returns something other than
    *None* for it.

    Every callable that gets resolved is also recorded in
    :attr:`program_callables_info`; read that attribute back after mapping to
    obtain the updated table.

    **Example:** If given an expression of the form ``sin(x) + unknown_function(y) +
    log(z)``, then the mapper would return ``ResolvedFunction('sin')(x) +
    unknown_function(y) + ResolvedFunction('log')(z)``.

    :arg rule_mapping_context: An instance of
        :class:`loopy.symbolic.RuleMappingContext`.
    :arg kernel: The kernel being mapped; its ``target`` is handed to the
        identifier-to-callable mappers.
    :arg program_callables_info: The callables table that newly resolved
        functions are registered with (via its ``with_callable`` method).
    :arg function_id_to_in_knl_callable_mappers: An iterable of callables of
        signature ``(target, identifier)`` returning an in-kernel callable or
        *None*.
    """
    def __init__(self, rule_mapping_context, kernel, program_callables_info,
            function_id_to_in_knl_callable_mappers):
        super(ResolvedFunctionMarker, self).__init__(rule_mapping_context)
        self.kernel = kernel
        self.program_callables_info = program_callables_info
        # FIXME: "function resolvers" is a poor name for these; rename it.
        self.function_id_to_in_knl_callable_mappers = (
                function_id_to_in_knl_callable_mappers)

    def find_in_knl_callable_from_identifier(self, identifier):
        """
        Returns the first non-*None* result obtained by calling each of the
        registered identifier-to-callable mappers with
        ``(self.kernel.target, identifier)``, or *None* if no mapper knows
        *identifier*.
        """
        # FIXME change docs
        for func_id_to_in_knl_callable_mapper in (
                self.function_id_to_in_knl_callable_mappers):
            # FIXME: do we really need to pass the target to the mapper?
            in_knl_callable = func_id_to_in_knl_callable_mapper(
                    self.kernel.target, identifier)
            if in_knl_callable is not None:
                return in_knl_callable

        return None

    def map_call(self, expr, expn_state):
        """
        Handles a positional-only call. Calls to substitution rules are
        expanded through ``map_substitution``; every other call is routed
        through :meth:`map_call_with_kwargs` (with empty keyword parameters)
        and converted back into a :class:`pymbolic.primitives.Call`.
        """
        from pymbolic.primitives import Call, CallWithKwargs
        from loopy.symbolic import parse_tagged_name

        name, tag = parse_tagged_name(expr.function)
        if name not in self.rule_mapping_context.old_subst_rules:
            # Reuse the keyword-aware code path so that the resolution logic
            # lives in exactly one place.
            new_call_with_kwargs = self.rec(CallWithKwargs(
                function=expr.function, parameters=expr.parameters,
                kw_parameters={}), expn_state)
            return Call(new_call_with_kwargs.function,
                    new_call_with_kwargs.parameters)
        else:
            return self.map_substitution(name, tag, expr.parameters, expn_state)

    def map_call_with_kwargs(self, expr, expn_state):
        """
        Resolves the called function if it is not yet a
        :class:`loopy.symbolic.ResolvedFunction` and a mapper knows its name.
        On success the callable is registered in
        :attr:`program_callables_info` and the call is rebuilt around the
        identifier returned by that registration. Otherwise the call is
        passed to the base class unchanged.
        """

        if not isinstance(expr.function, ResolvedFunction):

            # search the kernel for the function.
            in_knl_callable = self.find_in_knl_callable_from_identifier(
                    expr.function.name)

            if in_knl_callable:
                # associate the newly created ResolvedFunction with the
                # resolved in-kernel callable

                self.program_callables_info, new_func_id = (
                        self.program_callables_info.with_callable(expr.function,
                            in_knl_callable, True))
                return type(expr)(
                        ResolvedFunction(new_func_id),
                        tuple(self.rec(child, expn_state)
                            for child in expr.parameters),
                        dict(
                            (key, self.rec(val, expn_state))
                            for key, val in six.iteritems(expr.kw_parameters))
                            )

        # this is an unknown function as of yet, do not modify it
        return super(ResolvedFunctionMarker, self).map_call_with_kwargs(expr,
                expn_state)

    def map_reduction(self, expr, expn_state):
        """
        Registers every scalar callable reported by the reduction operation
        (``expr.operation.get_scalar_callables()``) in
        :attr:`program_callables_info`, then defers to the base class for the
        actual mapping. Each such identifier is asserted to be resolvable.
        """
        for func_id in (
                expr.operation.get_scalar_callables()):
            in_knl_callable = self.find_in_knl_callable_from_identifier(func_id)
            assert in_knl_callable is not None
            self.program_callables_info, _ = (
                    self.program_callables_info.with_callable(func_id,
                        in_knl_callable, True))
        return super(ResolvedFunctionMarker, self).map_reduction(expr, expn_state)
program_callables_info, _ = program_callables_info.with_callable( + Variable(kernel.name), callable_kernel, True) + program_callables_info = ( + program_callables_info.with_exit_edit_callables_mode()) + + return program_callables_info + + +# {{{ program definition + +class Program(ImmutableRecord): + def __init__(self, + name, + program_callables_info, + target, + func_id_to_in_knl_callable_mappers): + assert isinstance(program_callables_info, ProgramCallablesInfo) + + # FIXME: check if all sanity checks have been covered? + # FIXME: The comments over here may need some attention. + assert name in program_callables_info + + super(Program, self).__init__( + name=name, + program_callables_info=program_callables_info, + target=target, + func_id_to_in_knl_callable_mappers=( + func_id_to_in_knl_callable_mappers)) + + self._program_executor_cache = {} + + hash_fields = ( + "name", + "program_callables_info", + "target",) + + update_persistent_hash = LoopKernel.update_persistent_hash + + def copy(self, **kwargs): + if 'target' in kwargs: + target = kwargs['target'] + new_self = super(Program, self).copy(**kwargs) + new_resolved_functions = {} + for func_id, in_knl_callable in ( + new_self.program_callables_info.items()): + if isinstance(in_knl_callable, CallableKernel): + subkernel = in_knl_callable.subkernel + new_resolved_functions[func_id] = in_knl_callable.copy( + subkernel=subkernel.copy(target=target)) + else: + new_resolved_functions[func_id] = in_knl_callable + + program_callables_info = new_self.program_callables_info.copy( + resolved_functions=new_resolved_functions) + + return super(Program, new_self).copy( + program_callables_info=program_callables_info) + else: + return super(Program, self).copy(**kwargs) + + def get_grid_size_upper_bounds(self, ignore_auto=False): + """Return a tuple (global_size, local_size) containing a grid that + could accommodate execution of *all* instructions in the kernel. 
+ + *global_size* and *local_size* are :class:`islpy.PwAff` objects. + """ + return self.root_kernel.get_grid_size_upper_bounds( + self.program_callables_info, + ignore_auto=ignore_auto) + + def get_grid_size_upper_bounds_as_exprs(self, ignore_auto=False): + """Return a tuple (global_size, local_size) containing a grid that + could accommodate execution of *all* instructions in the kernel. + + *global_size* and *local_size* are :mod:`pymbolic` expressions + """ + return self.root_kernel.get_grid_size_upper_bounds_as_exprs( + self.program_callables_info, + ignore_auto=ignore_auto) + + # {{{ implementation arguments + + @property + @memoize_method + def impl_arg_to_arg(self): + from loopy.kernel.array import ArrayBase + + result = {} + + for arg in self.args: + if not isinstance(arg, ArrayBase): + result[arg.name] = arg + continue + + if arg.shape is None or arg.dim_tags is None: + result[arg.name] = arg + continue + + subscripts_and_names = arg.subscripts_and_names() + if subscripts_and_names is None: + result[arg.name] = arg + continue + + for index, sub_arg_name in subscripts_and_names: + result[sub_arg_name] = arg + + return result + + # }}} + + @property + def root_kernel(self): + return self.program_callables_info[self.name].subkernel + + @property + def arg_dict(self): + return self.root_kernel.arg_dict + + def with_root_kernel(self, root_kernel): + new_in_knl_callable = self.program_callables_info[ + self.name].copy(subkernel=root_kernel) + new_resolved_functions = ( + self.program_callables_info.resolved_functions.copy()) + new_resolved_functions[self.name] = new_in_knl_callable + + return self.copy( + program_callables_info=self.program_callables_info.copy( + resolved_functions=new_resolved_functions)) + + @property + def args(self): + return self.root_kernel.args[:] + + def __call__(self, *args, **kwargs): + key = self.target.get_kernel_executor_cache_key(*args, **kwargs) + try: + pex = self._program_executor_cache[key] + except KeyError: + pex = 
def next_indexed_function_identifier(function):
    """
    Returns the next indexed name in the sequence for the name of *function*.

    *Example:* ``Variable('sin_0')`` will return ``'sin_1'``,
    ``Variable('sin')`` will return ``'sin_0'`` and ``Variable('sin_')`` will
    return ``'sin_0'``.

    :arg function: Either an instance of :class:`str`, an instance of
        :class:`pymbolic.primitives.Variable`, or an instance of
        :class:`loopy.library.reduction.ArgExtOp` or
        :class:`loopy.library.reduction.SegmentedOp`.

    :returns: An instance of :class:`str` for names and variables. Reduction
        operations are not renamed; a copy of *function* is returned for
        them instead.
    """
    from loopy.library.reduction import ArgExtOp, SegmentedOp
    if isinstance(function, (ArgExtOp, SegmentedOp)):
        return function.copy()
    elif isinstance(function, str):
        function = Variable(function)

    assert isinstance(function, Variable)

    # The named groups are required: the result is assembled from
    # match.group('alpha') and match.group('num') below.
    func_name = re.compile(r"^(?P<alpha>\S+?)_(?P<num>\d+?)$")

    match = func_name.match(function.name)

    if match is None:
        # No numeric suffix yet: start the sequence at 0, reusing a trailing
        # underscore if there is one. (endswith also copes with an empty
        # name, for which indexing with [-1] would raise.)
        if function.name.endswith('_'):
            return "{old_name}0".format(old_name=function.name)
        else:
            return "{old_name}_0".format(old_name=function.name)

    return "{alpha}_{num}".format(alpha=match.group('alpha'),
            num=int(match.group('num'))+1)
class ProgramCallablesInfo(ImmutableRecord):
    """
    Immutable table of the callables known to a program.

    .. attribute:: resolved_functions

        A mapping from function identifiers to the in-kernel callable they
        resolve to.

    .. attribute:: num_times_callables_called

        A mapping from function identifiers to a usage count. Defaults to a
        count of 1 for every entry of :attr:`resolved_functions`.

    .. attribute:: history

        A mapping from function identifiers to a :class:`set` of identifiers.
        Defaults to ``{func_id}`` for every entry of
        :attr:`resolved_functions`.

    .. attribute:: is_being_edited

        Whether the table is currently in "edit callables" mode, see
        :meth:`with_edit_callables_mode`.

    .. attribute:: num_times_hit_during_editing

        A mapping from function identifiers to the number of times
        :meth:`with_callable` was invoked for them (with
        ``resolved_for_the_first_time=False``) in the current edit session.

    .. attribute:: renames_needed_after_editing

        A mapping from function identifiers to the identifier they are
        renamed to by :meth:`with_exit_edit_callables_mode`.
    """
    # FIXME: do not store num_times_callables_called, rather compute it from
    # the resolved_functions
    # FIXME: make the edit callables thing a ContextManager.
    def __init__(self, resolved_functions, num_times_callables_called=None,
            history=None, is_being_edited=False,
            num_times_hit_during_editing=None,
            renames_needed_after_editing=None):

        if num_times_callables_called is None:
            num_times_callables_called = dict((func_id, 1) for func_id in
                    resolved_functions)
        if history is None:
            history = dict((func_id, set([func_id])) for func_id in
                    resolved_functions)

        # Fresh dicts per instance: a literal ``{}`` default would be one
        # object shared by every instance created without these arguments.
        if num_times_hit_during_editing is None:
            num_times_hit_during_editing = {}
        if renames_needed_after_editing is None:
            renames_needed_after_editing = {}

        super(ProgramCallablesInfo, self).__init__(
                resolved_functions=resolved_functions,
                num_times_callables_called=num_times_callables_called,
                history=history,
                is_being_edited=is_being_edited,
                num_times_hit_during_editing=num_times_hit_during_editing,
                renames_needed_after_editing=renames_needed_after_editing)

    hash_fields = (
            "resolved_functions",
            "num_times_callables_called",
            "is_being_edited",
            "num_times_hit_during_editing",
            "renames_needed_after_editing",
            "history")

    update_persistent_hash = LoopKernel.update_persistent_hash

    def with_edit_callables_mode(self):
        """
        Returns a copy of *self* that is in edit mode, with the per-session
        hit counter reset to zero for every known function identifier.
        """
        return self.copy(is_being_edited=True,
                num_times_hit_during_editing=dict((func_id, 0) for func_id in
                    self.resolved_functions))

    def with_callable(self, function, in_kernel_callable,
            resolved_for_the_first_time=False):
        """
        Registers *in_kernel_callable* for *function*.

        :arg function: An instance of :class:`str`,
            :class:`pymbolic.primitives.Variable`,
            :class:`loopy.library.reduction.ArgExtOp` or
            :class:`loopy.library.reduction.SegmentedOp`.

        :arg in_kernel_callable: An instance of
            :class:`loopy.InKernelCallable`.

        :arg resolved_for_the_first_time: If *False*, *function* is taken to
            be registered already and its bookkeeping counters are updated
            accordingly.

        :returns: A tuple ``(new_self, identifier)``. *identifier* is the
            key under which an equal callable was already registered, a
            :class:`pymbolic.primitives.Variable` naming the new entry, or a
            copy of *function* if it is a reduction operation.

        :raises LoopyError: if *self* is not in edit mode and the call would
            change the table.

        .. note::

            Assumes that each callable is touched at most once, the internal
            working of this function fails if that is violated.
        """
        # FIXME: add a note about using enter and exit. ~KK
        # FIXME: think about a better idea of "with_added_callable" this would
        # be more convenient for developer-faced usage. ~KK

        # Normalize names up front so that ``function.name`` is valid in
        # every branch below, including the not-in-edit-mode one.
        if isinstance(function, str):
            function = Variable(function)

        if not self.is_being_edited:
            if function.name in self.resolved_functions and (
                    self.resolved_functions[function.name] == in_kernel_callable):
                # Nothing would change: permitted outside of edit mode.
                return self, function
            else:
                raise LoopyError(
                        "Cannot register a callable for '{0}' outside of "
                        "edit mode. Use 'with_edit_callables_mode' "
                        "first.".format(function))

        from loopy.library.reduction import ArgExtOp, SegmentedOp

        # {{{ sanity checks

        assert isinstance(function, (Variable, ArgExtOp, SegmentedOp))

        # }}}

        # Work on copies so that *self* stays untouched.
        renames_needed_after_editing = self.renames_needed_after_editing.copy()
        num_times_hit_during_editing = self.num_times_hit_during_editing.copy()
        num_times_callables_called = self.num_times_callables_called.copy()
        history = self.history.copy()

        if not resolved_for_the_first_time:
            if isinstance(function, (ArgExtOp, SegmentedOp)):
                num_times_hit_during_editing[function] += 1
            else:
                num_times_hit_during_editing[function.name] += 1

        if isinstance(function, (ArgExtOp, SegmentedOp)):
            # Reduction operations serve as their own identifier.
            unique_function_identifier = function.copy()
            if not resolved_for_the_first_time:
                num_times_callables_called[function] -= 1

            num_times_callables_called[unique_function_identifier] = 1

            updated_resolved_functions = self.resolved_functions.copy()
            updated_resolved_functions[unique_function_identifier] = (
                    in_kernel_callable)

            return (
                    self.copy(
                        resolved_functions=updated_resolved_functions,
                        num_times_callables_called=num_times_callables_called,
                        num_times_hit_during_editing=(
                            num_times_hit_during_editing),
                        renames_needed_after_editing=(
                            renames_needed_after_editing)),
                    unique_function_identifier)

        if in_kernel_callable in self.resolved_functions.values():
            # the callable already exists, implies return the function
            # identifier corresponding to that callable.
            for func_id, in_knl_callable in self.resolved_functions.items():
                if in_knl_callable == in_kernel_callable:
                    num_times_callables_called[func_id] += 1
                    if not resolved_for_the_first_time:
                        num_times_callables_called[function.name] -= 1
                        if num_times_callables_called[function.name] == 0:
                            # The old name is no longer in use, so the
                            # surviving entry takes it over on exit.
                            renames_needed_after_editing[func_id] = function.name

                    history[func_id] = history[func_id] | set([function.name])
                    return (
                            self.copy(
                                history=history,
                                num_times_hit_during_editing=(
                                    num_times_hit_during_editing),
                                num_times_callables_called=(
                                    num_times_callables_called),
                                renames_needed_after_editing=(
                                    renames_needed_after_editing)),
                            func_id)
        else:
            # FIXME: maybe deal with the history over here?
            # FIXME: once the code logic is running beautify this part.
            # many "ifs" can be avoided
            unique_function_identifier = function.name
            if (resolved_for_the_first_time or
                    self.num_times_callables_called[function.name] > 1):
                # The plain name cannot be reused: pick the next free
                # indexed name.
                while unique_function_identifier in self.resolved_functions:
                    unique_function_identifier = (
                            next_indexed_function_identifier(
                                unique_function_identifier))

        if not resolved_for_the_first_time:
            num_times_callables_called[function.name] -= 1

        num_times_callables_called[unique_function_identifier] = 1

        updated_resolved_functions = self.resolved_functions.copy()
        updated_resolved_functions[unique_function_identifier] = (
                in_kernel_callable)

        if not resolved_for_the_first_time:
            history[unique_function_identifier] = (
                    history[function.name] | set([unique_function_identifier]))
        else:
            history[unique_function_identifier] = set(
                    [unique_function_identifier])

        return (
                self.copy(
                    history=history,
                    resolved_functions=updated_resolved_functions,
                    num_times_callables_called=num_times_callables_called,
                    num_times_hit_during_editing=num_times_hit_during_editing,
                    renames_needed_after_editing=renames_needed_after_editing),
                Variable(unique_function_identifier))

    def with_exit_edit_callables_mode(self):
        """
        Returns a copy of *self* that has left edit mode. The renames
        collected in :attr:`renames_needed_after_editing` are applied to the
        keys of the table and to the resolved functions inside every
        callable kernel, and the per-session bookkeeping is reset.
        """
        assert self.is_being_edited

        num_times_callables_called = {}
        resolved_functions = {}
        history = self.history.copy()

        for func_id, in_knl_callable in self.resolved_functions.items():
            if isinstance(in_knl_callable, CallableKernel):
                old_subkernel = in_knl_callable.subkernel
                new_subkernel = rename_resolved_functions_in_a_single_kernel(
                        old_subkernel, self.renames_needed_after_editing)
                in_knl_callable = (
                        in_knl_callable.copy(subkernel=new_subkernel))
            elif isinstance(in_knl_callable, ScalarCallable):
                pass
            else:
                raise NotImplementedError("Unknown callable type %s." %
                        type(in_knl_callable).__name__)

            if func_id in self.renames_needed_after_editing:
                history.pop(func_id)

                new_func_id = self.renames_needed_after_editing[func_id]
                resolved_functions[new_func_id] = (
                        in_knl_callable)
                num_times_callables_called[new_func_id] = (
                        self.num_times_callables_called[func_id])

            else:
                resolved_functions[func_id] = in_knl_callable
                num_times_callables_called[func_id] = (
                        self.num_times_callables_called[func_id])

        # NOTE(review): the pruned *history* is not handed to copy(), so the
        # returned table keeps the unpruned history. Left as is, since it is
        # unclear whether entries of renamed identifiers are still relied
        # upon -- TODO confirm.
        return self.copy(
                is_being_edited=False,
                resolved_functions=resolved_functions,
                num_times_callables_called=num_times_callables_called,
                num_times_hit_during_editing={},
                renames_needed_after_editing={})

    def with_deleted_callable(self, func_id, instances=1):
        """
        Returns a copy of *self* in which the usage count of *func_id* is
        reduced by *instances*. Once the count reaches zero, *func_id* is
        removed from the table altogether.
        """
        num_times_callables_called = self.num_times_callables_called.copy()
        history = self.history.copy()
        resolved_functions = self.resolved_functions.copy()

        assert instances <= num_times_callables_called[func_id]

        num_times_callables_called[func_id] -= instances

        if num_times_callables_called[func_id] == 0:
            num_times_callables_called.pop(func_id)
            history.pop(func_id)
            resolved_functions.pop(func_id)

        return self.copy(
                resolved_functions=resolved_functions,
                num_times_callables_called=num_times_callables_called,
                history=history)

    def __getitem__(self, item):
        return self.resolved_functions[item]

    def __contains__(self, item):
        return item in self.resolved_functions

    def items(self):
        return self.resolved_functions.items()

    def values(self):
        return self.resolved_functions.values()
def iterate_over_kernels_if_given_program(transform_for_single_kernel):
    """
    Decorator that lets a kernel-to-kernel transformation also accept a
    :class:`Program`.

    Given a :class:`Program`, the wrapped transformation is applied to the
    sub-kernel of every :class:`CallableKernel` in the program's callables
    table (scalar callables pass through untouched) and a new program is
    returned. Given a :class:`LoopKernel`, the transformation is applied
    directly.
    """
    @wraps(transform_for_single_kernel)
    def _collective_transform(program_or_kernel, *args, **kwargs):
        # Plain kernels are transformed directly.
        if not isinstance(program_or_kernel, Program):
            assert isinstance(program_or_kernel, LoopKernel)
            return transform_for_single_kernel(
                    program_or_kernel, *args, **kwargs)

        callables_info = program_or_kernel.program_callables_info
        transformed_callables = {}

        for identifier, clbl in callables_info.items():
            if isinstance(clbl, CallableKernel):
                clbl = clbl.copy(
                        subkernel=transform_for_single_kernel(
                            clbl.subkernel, *args, **kwargs))
            elif not isinstance(clbl, ScalarCallable):
                raise NotImplementedError("Unknown type of callable %s." % (
                    type(clbl).__name__))

            transformed_callables[identifier] = clbl

        return program_or_kernel.copy(
                program_callables_info=callables_info.copy(
                    resolved_functions=transformed_callables))

    return _collective_transform
+# updated_resolved_functions = self.scoped_functions.copy() +# updated_resolved_functions[function] = in_kernel_callable +# return self.copy(updated_resolved_functions), function.copy() + +# }}} + + +# vim: foldmethod=marker diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py new file mode 100644 index 00000000..b5b80ad8 --- /dev/null +++ b/loopy/transform/callable.py @@ -0,0 +1,707 @@ +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2018 Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+""" + +import six + +import islpy as isl +from pymbolic.primitives import CallWithKwargs + +from loopy.kernel import LoopKernel +from pytools import ImmutableRecord +from loopy.diagnostic import LoopyError +from loopy.kernel.instruction import (CallInstruction, MultiAssignmentBase, + CInstruction, _DataObliviousInstruction) +from loopy.symbolic import IdentityMapper, SubstitutionMapper +from loopy.isl_helpers import simplify_via_aff +from loopy.kernel.function_interface import (get_kw_pos_association, + change_names_of_pymbolic_calls, CallableKernel, ScalarCallable) +from loopy.program import Program, ResolvedFunctionMarker + +__doc__ = """ +.. currentmodule:: loopy + +.. autofunction:: register_function_id_to_in_knl_callable_mapper + +.. autofunction:: register_callable_kernel +""" + + +# {{{ register function lookup + +def resolved_callables_from_function_lookup(program, + func_id_to_kernel_callable_mapper): + program_callables_info = program.program_callables_info + program_callables_info = program_callables_info.with_edit_callables_mode() + + callable_knls = dict( + (func_id, in_knl_callable) for func_id, in_knl_callable in + program_callables_info.items() if isinstance(in_knl_callable, + CallableKernel)) + edited_callable_knls = {} + + for func_id, in_knl_callable in callable_knls.items(): + kernel = in_knl_callable.subkernel + + from loopy.symbolic import SubstitutionRuleMappingContext + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + + resolved_function_marker = ResolvedFunctionMarker( + rule_mapping_context, kernel, program_callables_info, + [func_id_to_kernel_callable_mapper]) + + # scoping fucntions and collecting the scoped functions + new_subkernel = rule_mapping_context.finish_kernel( + resolved_function_marker.map_kernel(kernel)) + program_callables_info = resolved_function_marker.program_callables_info + + edited_callable_knls[func_id] = in_knl_callable.copy( + 
def register_function_id_to_in_knl_callable_mapper(program,
        func_id_to_in_knl_callable_mapper):
    """
    Returns a copy of *program* with *func_id_to_in_knl_callable_mapper*
    registered, and with the functions known to that mapper resolved in the
    program's kernels.

    :arg func_id_to_in_knl_callable_mapper: A function of signature ``(target,
        identifier)`` returning a
        :class:`loopy.kernel.function_interface.InKernelCallable` or *None* if
        the *function_identifier* is not known.

    :raises LoopyError: if a not yet registered mapper does not compare
        equal to itself after a pickle round trip.
    """

    # Start from the current list so that the name is bound even when the
    # mapper has been registered before.
    new_func_id_mappers = program.func_id_to_in_knl_callable_mappers

    # adding the function lookup to the set of function lookers in the kernel.
    if func_id_to_in_knl_callable_mapper not in (
            program.func_id_to_in_knl_callable_mappers):
        from loopy.tools import unpickles_equally
        if not unpickles_equally(func_id_to_in_knl_callable_mapper):
            raise LoopyError("function '%s' does not "
                    "compare equally after being upickled "
                    "and would disrupt loopy's caches"
                    % func_id_to_in_knl_callable_mapper)
        new_func_id_mappers = program.func_id_to_in_knl_callable_mappers + (
                [func_id_to_in_knl_callable_mapper])

    program = resolved_callables_from_function_lookup(program,
            func_id_to_in_knl_callable_mapper)

    new_program = program.copy(
            func_id_to_in_knl_callable_mappers=new_func_id_mappers)

    return new_program
+ """ + fields = set(['callable_kernel']) + + def __init__(self, callable_kernel): + self.callable_kernel = callable_kernel + + def __call__(self, target, identifier): + if identifier == self.callable_kernel.subkernel.name: + return self.callable_kernel + return None + + +def register_callable_kernel(program, callee_kernel): + """Returns a copy of *caller_kernel*, which would resolve *function_name* in an + expression as a call to *callee_kernel*. + + :arg caller_kernel: An instance of :class:`loopy.kernel.LoopKernel`. + :arg function_name: An instance of :class:`str`. + :arg callee_kernel: An instance of :class:`loopy.kernel.LoopKernel`. + """ + + # {{{ sanity checks + + assert isinstance(program, Program) + assert isinstance(callee_kernel, LoopKernel) + + # check to make sure that the variables with 'out' direction is equal to + # the number of assigness in the callee kernel intructions. + expected_num_assignees = len([arg for arg in callee_kernel.args if + arg.is_output_only]) + expected_num_parameters = len(callee_kernel.args) - expected_num_assignees + for in_knl_callable in program.program_callables_info.values(): + if isinstance(in_knl_callable, CallableKernel): + caller_kernel = in_knl_callable.subkernel + for insn in caller_kernel.instructions: + if isinstance(insn, CallInstruction) and ( + insn.expression.function.name == callee_kernel.name): + if isinstance(insn.expression, CallWithKwargs): + kw_parameters = insn.expression.kw_parameters + else: + kw_parameters = {} + if len(insn.assignees) != expected_num_assignees: + raise LoopyError("The number of arguments with 'out' " + "direction " "in callee kernel %s and the number " + "of assignees in " "instruction %s do not " + "match." 
% ( + callee_kernel.name, insn.id)) + if len(insn.expression.parameters+tuple( + kw_parameters.values())) != expected_num_parameters: + raise LoopyError("The number of expected arguments " + "for the callee kernel %s and the number of " + "parameters in instruction %s do not match." + % (callee_kernel.name, insn.id)) + + elif isinstance(insn, (MultiAssignmentBase, CInstruction, + _DataObliviousInstruction)): + pass + else: + raise NotImplementedError("unknown instruction %s" % type(insn)) + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown callable type %s." % + type(in_knl_callable).__name__) + + # }}} + + # take the function resolvers from the Program and resolve the functions in + # the callee kernel + program_callables_info = ( + program.program_callables_info.with_edit_callables_mode()) + + from loopy.symbolic import SubstitutionRuleMappingContext + rule_mapping_context = SubstitutionRuleMappingContext( + callee_kernel.substitutions, + callee_kernel.get_var_name_generator()) + + resolved_function_marker = ResolvedFunctionMarker( + rule_mapping_context, callee_kernel, program_callables_info, + program.func_id_to_in_knl_callable_mappers) + + callee_kernel = rule_mapping_context.finish_kernel( + resolved_function_marker.map_kernel(callee_kernel)) + program_callables_info = resolved_function_marker.program_callables_info + + program_callables_info = ( + program_callables_info.with_exit_edit_callables_mode()) + program = program.copy(program_callables_info=program_callables_info) + + # making the target of the child kernel to be same as the target of parent + # kernel. + callable_kernel = CallableKernel(subkernel=callee_kernel.copy( + target=program.target, + is_called_from_host=False)) + + # FIXME disabling global barriers for callee kernel (for now) + from loopy import set_options + callee_kernel = set_options(callee_kernel, "disable_global_barriers") + + # FIXME: the number of callables is wrong. 
class KernelInliner(SubstitutionMapper):
    """Mapper to replace variables (indices, temporaries, arguments) in the
    callee kernel with variables in the caller kernel.

    :arg subst_func: the substitution function handed to
        :class:`SubstitutionMapper`
    :arg caller: the caller kernel
    :arg arg_map: dict of argument name to variables in caller
    :arg arg_dict: dict of argument name to arguments in callee
    """

    def __init__(self, subst_func, caller, arg_map, arg_dict):
        super(KernelInliner, self).__init__(subst_func)
        self.caller = caller
        self.arg_map = arg_map
        self.arg_dict = arg_dict

    def map_subscript(self, expr):
        """
        Rewrites a subscript into a callee argument as a subscript into the
        corresponding caller array: the access is flattened using the
        strides of both arrays and then split up again along the caller
        array's dimensions. Other subscripts are left to the base class.

        :raises LoopyError: if the callee argument has a shape entry that
            is not an integer.
        """
        if expr.aggregate.name in self.arg_map:

            aggregate = self.subst_func(expr.aggregate)
            sar = self.arg_map[expr.aggregate.name]  # SubArrayRef in caller
            callee_arg = self.arg_dict[expr.aggregate.name]  # Arg in callee
            if aggregate.name in self.caller.arg_dict:
                caller_arg = self.caller.arg_dict[aggregate.name]  # Arg in caller
            else:
                caller_arg = self.caller.temporary_variables[aggregate.name]

            # Firstly, map inner inames to outer inames.
            outer_indices = self.map_tuple(expr.index_tuple)

            # Next, reshape to match dimension of outer arrays.
            # We can have e.g. A[3, 2] from outside and B[6] from inside
            from numbers import Integral
            if not all(isinstance(d, Integral) for d in callee_arg.shape):
                # Only the argument is known here (the callee kernel itself
                # is not), so the message has exactly one placeholder.
                raise LoopyError(
                    "Argument: {0} in callee kernel does not have "
                    "constant shape.".format(callee_arg))

            # Offset of the sub-array's first element in the caller array ...
            flatten_index = 0
            for i, idx in enumerate(sar.get_begin_subscript().index_tuple):
                flatten_index += idx*caller_arg.dim_tags[i].stride

            # ... plus the offset of the access within the callee array.
            flatten_index += sum(
                idx * tag.stride
                for idx, tag in zip(outer_indices, callee_arg.dim_tags))

            from loopy.isl_helpers import simplify_via_aff
            flatten_index = simplify_via_aff(flatten_index)

            # Split the flat offset up along the caller's dimensions.
            new_indices = []
            for dim_tag in caller_arg.dim_tags:
                ind = flatten_index // dim_tag.stride
                flatten_index -= (dim_tag.stride * ind)
                new_indices.append(ind)

            new_indices = tuple(simplify_via_aff(i) for i in new_indices)

            return aggregate.index(tuple(new_indices))
        else:
            return super(KernelInliner, self).map_subscript(expr)
+ """ + callee_label = callee_knl.name[:4] + "_" + + # {{{ duplicate and rename inames + + vng = caller_kernel.get_var_name_generator() + ing = caller_kernel.get_instruction_id_generator() + dim_type = isl.dim_type.set + + iname_map = {} + for iname in callee_knl.all_inames(): + iname_map[iname] = vng(callee_label+iname) + + new_domains = [] + new_iname_to_tags = caller_kernel.iname_to_tags.copy() + + # transferring iname tags info from the callee to the caller kernel + for domain in callee_knl.domains: + new_domain = domain.copy() + for i in range(new_domain.n_dim()): + iname = new_domain.get_dim_name(dim_type, i) + + if iname in callee_knl.iname_to_tags: + new_iname_to_tags[iname_map[iname]] = ( + callee_knl.iname_to_tags[iname]) + new_domain = new_domain.set_dim_name( + dim_type, i, iname_map[iname]) + new_domains.append(new_domain) + + kernel = caller_kernel.copy(domains=caller_kernel.domains + new_domains, + iname_to_tags=new_iname_to_tags) + + # }}} + + # {{{ rename temporaries + + temp_map = {} + new_temps = kernel.temporary_variables.copy() + for name, temp in six.iteritems(callee_knl.temporary_variables): + new_name = vng(callee_label+name) + temp_map[name] = new_name + new_temps[new_name] = temp.copy(name=new_name) + + kernel = kernel.copy(temporary_variables=new_temps) + + # }}} + + # {{{ match kernel arguments + + arg_map = {} # callee arg name -> caller symbols (e.g. 
SubArrayRef) + + assignees = instruction.assignees # writes + parameters = instruction.expression.parameters # reads + + # add keyword parameters + from pymbolic.primitives import CallWithKwargs + + if isinstance(instruction.expression, CallWithKwargs): + from loopy.kernel.function_interface import get_kw_pos_association + + _, pos_to_kw = get_kw_pos_association(callee_knl) + kw_parameters = instruction.expression.kw_parameters + for i in range(len(parameters), len(parameters) + len(kw_parameters)): + parameters = parameters + (kw_parameters[pos_to_kw[i]],) + + assignee_pos = 0 + parameter_pos = 0 + for i, arg in enumerate(callee_knl.args): + if arg.is_output_only: + arg_map[arg.name] = assignees[assignee_pos] + assignee_pos += 1 + else: + arg_map[arg.name] = parameters[parameter_pos] + parameter_pos += 1 + + # }}} + + # {{{ rewrite instructions + + import pymbolic.primitives as p + from pymbolic.mapper.substitutor import make_subst_func + + var_map = dict((p.Variable(k), p.Variable(v)) + for k, v in six.iteritems(iname_map)) + var_map.update(dict((p.Variable(k), p.Variable(v)) + for k, v in six.iteritems(temp_map))) + var_map.update(dict((p.Variable(k), p.Variable(v.subscript.aggregate.name)) + for k, v in six.iteritems(arg_map))) + subst_mapper = KernelInliner( + make_subst_func(var_map), kernel, arg_map, callee_knl.arg_dict) + + insn_id = {} + for insn in callee_knl.instructions: + insn_id[insn.id] = ing(callee_label+insn.id) + + # {{{ root and leave instructions in callee kernel + + dep_map = callee_knl.recursive_insn_dep_map() + # roots depend on nothing + heads = set(insn for insn, deps in six.iteritems(dep_map) if not deps) + # leaves have nothing that depends on them + tails = set(dep_map.keys()) + for insn, deps in six.iteritems(dep_map): + tails = tails - deps + + # }}} + + # {{{ use NoOp to mark the start and end of callee kernel + + from loopy.kernel.instruction import NoOpInstruction + + noop_start = NoOpInstruction( + id=ing(callee_label+"_start"), + 
within_inames=instruction.within_inames, + depends_on=instruction.depends_on + ) + noop_end = NoOpInstruction( + id=instruction.id, + within_inames=instruction.within_inames, + depends_on=frozenset(insn_id[insn] for insn in tails) + ) + # }}} + + inner_insns = [noop_start] + + for insn in callee_knl.instructions: + insn = insn.with_transformed_expressions(subst_mapper) + within_inames = frozenset(map(iname_map.get, insn.within_inames)) + within_inames = within_inames | instruction.within_inames + depends_on = frozenset(map(insn_id.get, insn.depends_on)) | ( + instruction.depends_on) + if insn.id in heads: + depends_on = depends_on | set([noop_start.id]) + insn = insn.copy( + id=insn_id[insn.id], + within_inames=within_inames, + # TODO: probaby need to keep priority in callee kernel + priority=instruction.priority, + depends_on=depends_on + ) + inner_insns.append(insn) + + inner_insns.append(noop_end) + + new_insns = [] + for insn in kernel.instructions: + if insn == instruction: + new_insns.extend(inner_insns) + else: + new_insns.append(insn) + + kernel = kernel.copy(instructions=new_insns) + + # }}} + + return kernel + +# }}} + + +# {{{ inline callable kernel + +def _inline_single_callable_kernel(caller_kernel, function_name, + program_callables_info): + old_insns = caller_kernel.instructions + for insn in old_insns: + if isinstance(insn, CallInstruction): + # FIXME This seems to use identifiers across namespaces. Why not + # check whether the function is a scoped function first? 
~AK + if insn.expression.function.name in program_callables_info: + history_of_identifier = program_callables_info.history[ + insn.expression.function.name] + + if function_name in history_of_identifier: + in_knl_callable = program_callables_info[ + insn.expression.function.name] + assert isinstance(in_knl_callable, CallableKernel) + caller_kernel = _inline_call_instruction( + caller_kernel, in_knl_callable.subkernel, insn) + program_callables_info = ( + program_callables_info.with_deleted_callable( + insn.expression.function.name, + program_callables_info.num_times_callables_called[ + caller_kernel.name])) + elif isinstance(insn, (MultiAssignmentBase, CInstruction, + _DataObliviousInstruction)): + pass + else: + raise NotImplementedError( + "Unknown instruction type %s" + % type(insn).__name__) + + return caller_kernel, program_callables_info + + +# FIXME This should take a 'within' parameter to be able to only inline +# *some* calls to a kernel, but not others. +def inline_callable_kernel(program, function_name): + """ + Returns a copy of *kernel* with the callable kernel addressed by + (scoped) name *function_name* inlined. 
+ """ + from loopy.preprocess import infer_arg_descr + program = infer_arg_descr(program) + program_callables_info = program.program_callables_info + old_program_callables_info = program_callables_info.copy() + + edited_callable_kernels = {} + + for func_id, in_knl_callable in old_program_callables_info.items(): + if function_name not in old_program_callables_info.history[func_id] and ( + isinstance(in_knl_callable, CallableKernel)): + caller_kernel = in_knl_callable.subkernel + caller_kernel, program_callables_info = ( + _inline_single_callable_kernel(caller_kernel, + function_name, + program_callables_info)) + edited_callable_kernels[func_id] = in_knl_callable.copy( + subkernel=caller_kernel) + + new_resolved_functions = {} + for func_id, in_knl_callable in program_callables_info.items(): + if func_id in edited_callable_kernels: + new_resolved_functions[func_id] = edited_callable_kernels[func_id] + else: + new_resolved_functions[func_id] = in_knl_callable + + program_callables_info = program_callables_info.copy( + resolved_functions=new_resolved_functions) + + return program.copy(program_callables_info=program_callables_info) + +# }}} + + +# {{{ tools to match caller to callee args by (guessed) automatic reshaping + +# (This is undocumented and not recommended, but it is currently needed +# to support Firedrake.) + +class DimChanger(IdentityMapper): + """ + Mapper to change the dimensions of an argument. + + .. attribute:: callee_arg_dict + + A mapping from the argument name (:class:`str`) to instances of + :class:`loopy.kernel.array.ArrayBase`. + + .. attribute:: desried_shape + + A mapping from argument name (:class:`str`) to an instance of + :class:`tuple`. 
+ """ + def __init__(self, callee_arg_dict, desired_shape): + self.callee_arg_dict = callee_arg_dict + self.desired_shape = desired_shape + + def map_subscript(self, expr): + callee_arg_dim_tags = self.callee_arg_dict[expr.aggregate.name].dim_tags + flattened_index = sum(dim_tag.stride*idx for dim_tag, idx in + zip(callee_arg_dim_tags, expr.index_tuple)) + new_indices = [] + + from operator import mul + from functools import reduce + stride = reduce(mul, self.desired_shape[expr.aggregate.name], 1) + + for length in self.desired_shape[expr.aggregate.name]: + stride /= length + ind = flattened_index // int(stride) + flattened_index -= (int(stride) * ind) + new_indices.append(simplify_via_aff(ind)) + + return expr.aggregate.index(tuple(new_indices)) + + +def _match_caller_callee_argument_dimension_for_single_kernel( + caller_knl, program_callables_info, callee_function_name): + """ + Returns a copy of *caller_knl* with the instance of + :class:`loopy.kernel.function_interface.CallableKernel` addressed by + *callee_function_name* in the *caller_knl* aligned with the argument + dimesnsions required by *caller_knl*. + """ + pymbolic_calls_to_new_callables = {} + for insn in caller_knl.instructions: + if not isinstance(insn, CallInstruction) or ( + insn.expression.function.name not in + program_callables_info): + # Call to a callable kernel can only occur through a + # CallInstruction. + continue + + in_knl_callable = program_callables_info[ + insn.expression.function.name] + + if in_knl_callable.subkernel.name != callee_function_name: + # Not the callable we're looking for. 
+ continue + + # getting the caller->callee arg association + + parameters = insn.expression.parameters[:] + kw_parameters = {} + if isinstance(insn.expression, CallWithKwargs): + kw_parameters = insn.expression.kw_parameters + + assignees = insn.assignees + + parameter_shapes = [par.get_array_arg_descriptor(caller_knl).shape + for par in parameters] + kw_to_pos, pos_to_kw = get_kw_pos_association(in_knl_callable.subkernel) + for i in range(len(parameters), len(parameters)+len(kw_parameters)): + parameter_shapes.append(kw_parameters[pos_to_kw[i]] + .get_array_arg_descriptor(caller_knl).shape) + + # inserting the assigness at the required positions. + assignee_write_count = -1 + for i, arg in enumerate(in_knl_callable.subkernel.args): + if arg.is_output_only: + assignee = assignees[-assignee_write_count-1] + parameter_shapes.insert(i, assignee + .get_array_arg_descriptor(caller_knl).shape) + assignee_write_count -= 1 + + callee_arg_to_desired_dim_tag = dict(zip([arg.name for arg in + in_knl_callable.subkernel.args], parameter_shapes)) + dim_changer = DimChanger(in_knl_callable.subkernel.arg_dict, + callee_arg_to_desired_dim_tag) + new_callee_insns = [] + for callee_insn in in_knl_callable.subkernel.instructions: + if isinstance(callee_insn, MultiAssignmentBase): + new_callee_insns.append(callee_insn.copy(expression=dim_changer( + callee_insn.expression), + assignee=dim_changer(callee_insn.assignee))) + elif isinstance(callee_insn, (CInstruction, + _DataObliviousInstruction)): + pass + else: + raise NotImplementedError("Unknwon instruction %s." % + type(insn)) + + # subkernel with instructions adjusted according to the new dimensions. + new_subkernel = in_knl_callable.subkernel.copy(instructions=new_callee_insns) + + new_in_knl_callable = in_knl_callable.copy(subkernel=new_subkernel) + + pymbolic_calls_to_new_callables[insn.expression] = new_in_knl_callable + + if not pymbolic_calls_to_new_callables: + # complain if no matching function found. 
+ raise LoopyError("No CallableKernel with the name %s found in %s." % ( + callee_function_name, caller_knl.name)) + + return change_names_of_pymbolic_calls(caller_knl, + pymbolic_calls_to_new_callables) + + +def _match_caller_callee_argument_dimension_(program, *args, **kwargs): + assert isinstance(program, Program) + + new_resolved_functions = {} + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = ( + _match_caller_callee_argument_dimension_for_single_kernel( + in_knl_callable.subkernel, program.program_callables_info, + *args, **kwargs)) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." % ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_program_callables_info = program.program_callables_info.copy( + resolved_functions=new_resolved_functions) + return program.copy(program_callables_info=new_program_callables_info) + +# }}} + + +# vim: foldmethod=marker diff --git a/test/test_callables.py b/test/test_callables.py new file mode 100644 index 00000000..f25bbbe6 --- /dev/null +++ b/test/test_callables.py @@ -0,0 +1,414 @@ +from __future__ import division, absolute_import, print_function + +__copyright__ = "Copyright (C) 2018 Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or 
substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import numpy as np +import pyopencl as cl +import pyopencl.clrandom # noqa: F401 +import loopy as lp +import pytest +import sys + + +from pyopencl.tools import ( # noqa: F401 + pytest_generate_tests_for_pyopencl + as pytest_generate_tests) + +from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa: F401 + + +def test_register_function_lookup(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + from testlib import register_log2_lookup + + x = np.random.rand(10) + ctx = cl.create_some_context() + queue = cl.CommandQueue(ctx) + + prog = lp.make_kernel( + "{[i]: 0<=i<10}", + """ + y[i] = log2(x[i]) + """) + prog = lp.register_function_id_to_in_knl_callable_mapper(prog, + register_log2_lookup) + + evt, (out, ) = prog(queue, x=x) + + assert np.linalg.norm(np.log2(x)-out)/np.linalg.norm(np.log2(x)) < 1e-15 + + +@pytest.mark.parametrize("inline", [False, True]) +def test_register_knl(ctx_factory, inline): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + n = 2 ** 4 + + x = np.random.rand(n, n, n, n, n) + y = np.random.rand(n, n, n, n, n) + + grandchild_knl = lp.make_kernel_function( + "{[i, j]:0<= i, j< 16}", + """ + c[i, j] = 2*a[i, j] + 3*b[i, j] + """, name='linear_combo1') + + child_knl = lp.make_kernel_function( + "{[i, j]:0<=i, j < 16}", + """ + [i, j]: g[i, j] = linear_combo1([i, j]: e[i, j], [i, j]: f[i, j]) + """, name='linear_combo2') + + parent_knl = lp.make_kernel( + "{[i, j, k, l, m]: 0<=i, j, k, l, m<16}", + """ + [j, 
l]: z[i, j, k, l, m] = linear_combo2([j, l]: x[i, j, k, l, m], + [j, l]: y[i, j, k, l, m]) + """, + kernel_data=[ + lp.GlobalArg( + name='x', + dtype=np.float64, + shape=(16, 16, 16, 16, 16)), + lp.GlobalArg( + name='y', + dtype=np.float64, + shape=(16, 16, 16, 16, 16)), '...'], + ) + + knl = lp.register_callable_kernel( + parent_knl, child_knl) + knl = lp.register_callable_kernel( + knl, grandchild_knl) + if inline: + knl = lp.inline_callable_kernel(knl, 'linear_combo2') + knl = lp.inline_callable_kernel(knl, 'linear_combo1') + + evt, (out, ) = knl(queue, x=x, y=y) + + assert (np.linalg.norm(2*x+3*y-out)/( + np.linalg.norm(2*x+3*y))) < 1e-15 + + +@pytest.mark.parametrize("inline", [False, True]) +def test_slices_with_negative_step(ctx_factory, inline): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + n = 2 ** 4 + + x = np.random.rand(n, n, n, n, n) + y = np.random.rand(n, n, n, n, n) + + child_knl = lp.make_kernel_function( + "{[i, j]:0<=i, j < 16}", + """ + g[i, j] = 2*e[i, j] + 3*f[i, j] + """, name="linear_combo") + + parent_knl = lp.make_kernel( + "{[i, k, m]: 0<=i, k, m<16}", + """ + z[i, 15:-1:-1, k, :, m] = linear_combo(x[i, :, k, :, m], + y[i, :, k, :, m]) + """, + kernel_data=[ + lp.GlobalArg( + name='x', + dtype=np.float64, + shape=(16, 16, 16, 16, 16)), + lp.GlobalArg( + name='y', + dtype=np.float64, + shape=(16, 16, 16, 16, 16)), + lp.GlobalArg( + name='z', + dtype=np.float64, + shape=(16, 16, 16, 16, 16)), '...'], + ) + + knl = lp.register_callable_kernel( + parent_knl, child_knl) + if inline: + knl = lp.inline_callable_kernel(knl, 'linear_combo') + + evt, (out, ) = knl(queue, x=x, y=y) + + assert (np.linalg.norm(2*x+3*y-out[:, ::-1, :, :, :])/( + np.linalg.norm(2*x+3*y))) < 1e-15 + + +@pytest.mark.parametrize("inline", [False, True]) +def test_register_knl_with_call_with_kwargs(ctx_factory, inline): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + n = 2 ** 2 + + a_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float32) + b_dev = 
cl.clrandom.rand(queue, (n, n, n, n, n), np.float32) + c_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) + + callee_knl = lp.make_kernel_function( + "{[i, j]:0<=i, j < %d}" % n, + """ + h[i, j] = 2 * e[i, j] + 3*f[i, j] + 4*g[i, j] + <>f1[i, j] = 2*f[i, j] + p[i, j] = 7 * e[i, j] + 4*f1[i, j] + 2*g[i, j] + """, + [ + lp.GlobalArg('f, e, h, g'), '...'], + name='linear_combo') + + caller_knl = lp.make_kernel( + "{[i, j, k, l, m]: 0<=i, j, k, l, m<%d}" % n, + """ + <> d[i, j, k, l, m] = 2*b[i, j, k, l, m] + [j, l]: x[i, j, k, l, m], [j, l]: y[i, j, k, l, m] = linear_combo( + f=[j, l]: a[i, j, k, l, m], + g=[j, l]: d[i, j, k, l, m], + e=[j, l]: c[i, j, k, l, m]) + """) + + knl = lp.register_callable_kernel( + caller_knl, callee_knl) + if inline: + knl = lp.inline_callable_kernel(knl, 'linear_combo') + + evt, (out1, out2, ) = knl(queue, a=a_dev, b=b_dev, c=c_dev) + + a = a_dev.get() + b = b_dev.get() + c = c_dev.get() + + h = out1.get() # h = 2c + 3a + 8b + p = out2.get() # p = 7c + 8a + 4b + h_exact = 3*a + 8*b + 2*c + p_exact = 8*a + 4*b + 7*c + + assert np.linalg.norm(h-h_exact)/np.linalg.norm(h_exact) < 1e-7 + assert np.linalg.norm(p-p_exact)/np.linalg.norm(p_exact) < 1e-7 + + +@pytest.mark.parametrize("inline", [False, True]) +def test_register_knl_with_hw_axes(ctx_factory, inline): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + n = 2 ** 4 + + x_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) + y_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) + + callee_knl = lp.make_kernel_function( + "{[i, j]:0<=i, j < 16}", + """ + g[i, j] = 2*e[i, j] + 3*f[i, j] + """, name='linear_combo') + + callee_knl = lp.split_iname(callee_knl, "i", 4, inner_tag="l.0", outer_tag="g.0") + + caller_knl = lp.make_kernel( + "{[i, j, k, l, m]: 0<=i, j, k, l, m<16}", + """ + [j, l]: z[i, j, k, l, m] = linear_combo([j, l]: x[i, j, k, l, m], + [j, l]: y[i, j, k, l, m]) + """ + ) + caller_knl = lp.split_iname(caller_knl, "i", 4, inner_tag="l.1", 
outer_tag="g.1") + + knl = lp.register_callable_kernel( + caller_knl, callee_knl) + + if inline: + knl = lp.inline_callable_kernel(knl, 'linear_combo') + + evt, (out, ) = knl(queue, x=x_dev, y=y_dev) + + x_host = x_dev.get() + y_host = y_dev.get() + + assert np.linalg.norm(2*x_host+3*y_host-out.get())/np.linalg.norm( + 2*x_host+3*y_host) < 1e-15 + + +@pytest.mark.parametrize("inline", [False, True]) +def test_shape_translation_through_sub_array_ref(ctx_factory, inline): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + x1 = cl.clrandom.rand(queue, (3, 2), dtype=np.float64) + x2 = cl.clrandom.rand(queue, (6, ), dtype=np.float64) + x3 = cl.clrandom.rand(queue, (6, 6), dtype=np.float64) + + callee1 = lp.make_kernel_function( + "{[i]: 0<=i<6}", + """ + a[i] = 2*abs(b[i]) + """, name="callee_fn1") + + callee2 = lp.make_kernel_function( + "{[i, j]: 0<=i<3 and 0 <= j < 2}", + """ + a[i, j] = 3*b[i, j] + """, name="callee_fn2") + + callee3 = lp.make_kernel_function( + "{[i]: 0<=i<6}", + """ + a[i] = 5*b[i] + """, name="callee_fn3") + + knl = lp.make_kernel( + "{[i, j, k, l]: 0<= i < 6 and 0 <= j < 3 and 0 <= k < 2 and 0<=l<6}", + """ + [i]: y1[i//2, i%2] = callee_fn1([i]: x1[i//2, i%2]) + [j, k]: y2[2*j+k] = callee_fn2([j, k]: x2[2*j+k]) + [l]: y3[l, l] = callee_fn3([l]: x3[l, l]) + """) + + knl = lp.register_callable_kernel(knl, callee1) + knl = lp.register_callable_kernel(knl, callee2) + knl = lp.register_callable_kernel(knl, callee3) + + if inline: + knl = lp.inline_callable_kernel(knl, 'callee_fn1') + knl = lp.inline_callable_kernel(knl, 'callee_fn2') + knl = lp.inline_callable_kernel(knl, 'callee_fn3') + + knl = lp.set_options(knl, "write_cl") + knl = lp.set_options(knl, "return_dict") + evt, out_dict = knl(queue, x1=x1, x2=x2, x3=x3) + + y1 = out_dict['y1'].get() + y2 = out_dict['y2'].get() + y3 = out_dict['y3'].get() + + assert (np.linalg.norm(y1-2*x1.get())) < 1e-15 + assert (np.linalg.norm(y2-3*x2.get())) < 1e-15 + assert 
(np.linalg.norm(np.diag(y3-5*x3.get()))) < 1e-15 + + +def test_multi_arg_array_call(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + import pymbolic.primitives as p + n = 10 + acc_i = p.Variable("acc_i") + i = p.Variable("i") + index = p.Variable("index") + a_i = p.Subscript(p.Variable("a"), p.Variable("i")) + argmin_kernel = lp.make_kernel_function( + "{[i]: 0 <= i < n}", + [ + lp.Assignment(id="init2", assignee=index, + expression=0), + lp.Assignment(id="init1", assignee=acc_i, + expression="214748367"), + lp.Assignment(id="insn", assignee=index, + expression=p.If(p.Expression.eq(acc_i, a_i), i, index), + depends_on="update"), + lp.Assignment(id="update", assignee=acc_i, + expression=p.Variable("min")(acc_i, a_i), + depends_on="init1,init2")], + name="custom_argmin") + + argmin_kernel = lp.fix_parameters(argmin_kernel, n=n) + + knl = lp.make_kernel( + "{[i]:0<=i 1: + exec(sys.argv[1]) + else: + from pytest import main + main([__file__]) + +# vim: foldmethod=marker -- GitLab From 28bb8efd90784545444c705c7820d26e4ef2a555 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 12 Aug 2018 16:45:18 +0530 Subject: [PATCH 05/80] removing unused part of code. 
--- loopy/kernel/function_interface.py | 103 ----- loopy/transform/callable.py | 592 +---------------------------- test/test_callables.py | 345 ----------------- 3 files changed, 2 insertions(+), 1038 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 2ea26065..8b24da21 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -524,109 +524,6 @@ class CallableKernel(InKernelCallable): def name(self): return self.subkernel.name - def with_types(self, arg_id_to_dtype, caller_kernel, - program_callables_info): - kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) - - new_args = [] - for arg in self.subkernel.args: - kw = arg.name - if kw in arg_id_to_dtype: - # id exists as kw - new_args.append(arg.copy(dtype=arg_id_to_dtype[kw])) - elif kw_to_pos[kw] in arg_id_to_dtype: - # id exists as positional argument - new_args.append(arg.copy( - dtype=arg_id_to_dtype[kw_to_pos[kw]])) - else: - new_args.append(arg) - - from loopy.type_inference import ( - infer_unknown_types_for_a_single_kernel) - pre_specialized_subkernel = self.subkernel.copy( - args=new_args) - - # infer the types of the written variables based on the knowledge - # of the types of the arguments supplied - specialized_kernel, program_callables_info = ( - infer_unknown_types_for_a_single_kernel( - pre_specialized_subkernel, - program_callables_info, - expect_completion=True)) - - new_arg_id_to_dtype = {} - for arg in specialized_kernel.args: - # associate the updated_arg_id_to_dtype with keyword as well as - # positional id. 
- new_arg_id_to_dtype[arg.name] = arg.dtype - new_arg_id_to_dtype[kw_to_pos[arg.name]] = arg.dtype - - # Return the kernel call with specialized subkernel and the corresponding - # new arg_id_to_dtype - return self.copy(subkernel=specialized_kernel, - arg_id_to_dtype=new_arg_id_to_dtype), program_callables_info - - def with_descrs(self, arg_id_to_descr, program_callables_info): - - # tune the subkernel so that we have the matching shapes and - # dim_tags - - new_args = self.subkernel.args[:] - kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) - - for arg_id, descr in arg_id_to_descr.items(): - if isinstance(arg_id, int): - arg_id = pos_to_kw[arg_id] - assert isinstance(arg_id, str) - - if isinstance(descr, ArrayArgDescriptor): - new_arg = self.subkernel.arg_dict[arg_id].copy( - shape=descr.shape, - dim_tags=descr.dim_tags, - address_space=descr.address_space) - # replacing the new arg with the arg of the same name - new_args = [new_arg if arg.name == arg_id else arg for arg in - new_args] - elif isinstance(descr, ValueArgDescriptor): - pass - else: - raise LoopyError("Descriptor must be either an instance of " - "ArrayArgDescriptor or ValueArgDescriptor -- got %s." 
% - type(descr)) - descriptor_specialized_knl = self.subkernel.copy(args=new_args) - from loopy.preprocess import traverse_to_infer_arg_descr - descriptor_specialized_knl, program_callables_info = ( - traverse_to_infer_arg_descr(descriptor_specialized_knl, - program_callables_info)) - - return ( - self.copy( - subkernel=descriptor_specialized_knl, - arg_id_to_descr=arg_id_to_descr), - program_callables_info) - - def with_packing_for_args(self): - from loopy.kernel.data import AddressSpace - kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) - - arg_id_to_descr = {} - - for pos, kw in pos_to_kw.items(): - arg = self.subkernel.arg_dict[kw] - arg_id_to_descr[pos] = ArrayArgDescriptor( - shape=arg.shape, - dim_tags=arg.dim_tags, - address_space=AddressSpace.GLOBAL) - - return self.copy(subkernel=self.subkernel, - arg_id_to_descr=arg_id_to_descr) - - def with_hw_axes_sizes(self, gsize, lsize): - return self.copy( - subkernel=self.subkernel.copy( - overridden_get_grid_sizes_for_insn_ids=( - GridOverrideForCalleeKernel(lsize, gsize)))) - def is_ready_for_codegen(self): return (self.arg_id_to_dtype is not None and self.arg_id_to_descr is not None) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index b5b80ad8..9d9935ab 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -21,29 +21,15 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" - -import six - -import islpy as isl -from pymbolic.primitives import CallWithKwargs - -from loopy.kernel import LoopKernel -from pytools import ImmutableRecord from loopy.diagnostic import LoopyError -from loopy.kernel.instruction import (CallInstruction, MultiAssignmentBase, - CInstruction, _DataObliviousInstruction) -from loopy.symbolic import IdentityMapper, SubstitutionMapper -from loopy.isl_helpers import simplify_via_aff -from loopy.kernel.function_interface import (get_kw_pos_association, - change_names_of_pymbolic_calls, CallableKernel, ScalarCallable) -from loopy.program import Program, ResolvedFunctionMarker +from loopy.kernel.function_interface import CallableKernel +from loopy.program import ResolvedFunctionMarker __doc__ = """ .. currentmodule:: loopy .. autofunction:: register_function_id_to_in_knl_callable_mapper -.. autofunction:: register_callable_kernel """ @@ -130,578 +116,4 @@ def register_function_id_to_in_knl_callable_mapper(program, # }}} -# {{{ register_callable_kernel - -class _RegisterCalleeKernel(ImmutableRecord): - """ - Helper class to make the function scoper from - :func:`loopy.transform.register_callable_kernel` picklable. As python - cannot pickle lexical closures. - """ - fields = set(['callable_kernel']) - - def __init__(self, callable_kernel): - self.callable_kernel = callable_kernel - - def __call__(self, target, identifier): - if identifier == self.callable_kernel.subkernel.name: - return self.callable_kernel - return None - - -def register_callable_kernel(program, callee_kernel): - """Returns a copy of *caller_kernel*, which would resolve *function_name* in an - expression as a call to *callee_kernel*. - - :arg caller_kernel: An instance of :class:`loopy.kernel.LoopKernel`. - :arg function_name: An instance of :class:`str`. - :arg callee_kernel: An instance of :class:`loopy.kernel.LoopKernel`. 
- """ - - # {{{ sanity checks - - assert isinstance(program, Program) - assert isinstance(callee_kernel, LoopKernel) - - # check to make sure that the variables with 'out' direction is equal to - # the number of assigness in the callee kernel intructions. - expected_num_assignees = len([arg for arg in callee_kernel.args if - arg.is_output_only]) - expected_num_parameters = len(callee_kernel.args) - expected_num_assignees - for in_knl_callable in program.program_callables_info.values(): - if isinstance(in_knl_callable, CallableKernel): - caller_kernel = in_knl_callable.subkernel - for insn in caller_kernel.instructions: - if isinstance(insn, CallInstruction) and ( - insn.expression.function.name == callee_kernel.name): - if isinstance(insn.expression, CallWithKwargs): - kw_parameters = insn.expression.kw_parameters - else: - kw_parameters = {} - if len(insn.assignees) != expected_num_assignees: - raise LoopyError("The number of arguments with 'out' " - "direction " "in callee kernel %s and the number " - "of assignees in " "instruction %s do not " - "match." % ( - callee_kernel.name, insn.id)) - if len(insn.expression.parameters+tuple( - kw_parameters.values())) != expected_num_parameters: - raise LoopyError("The number of expected arguments " - "for the callee kernel %s and the number of " - "parameters in instruction %s do not match." - % (callee_kernel.name, insn.id)) - - elif isinstance(insn, (MultiAssignmentBase, CInstruction, - _DataObliviousInstruction)): - pass - else: - raise NotImplementedError("unknown instruction %s" % type(insn)) - elif isinstance(in_knl_callable, ScalarCallable): - pass - else: - raise NotImplementedError("Unknown callable type %s." 
% - type(in_knl_callable).__name__) - - # }}} - - # take the function resolvers from the Program and resolve the functions in - # the callee kernel - program_callables_info = ( - program.program_callables_info.with_edit_callables_mode()) - - from loopy.symbolic import SubstitutionRuleMappingContext - rule_mapping_context = SubstitutionRuleMappingContext( - callee_kernel.substitutions, - callee_kernel.get_var_name_generator()) - - resolved_function_marker = ResolvedFunctionMarker( - rule_mapping_context, callee_kernel, program_callables_info, - program.func_id_to_in_knl_callable_mappers) - - callee_kernel = rule_mapping_context.finish_kernel( - resolved_function_marker.map_kernel(callee_kernel)) - program_callables_info = resolved_function_marker.program_callables_info - - program_callables_info = ( - program_callables_info.with_exit_edit_callables_mode()) - program = program.copy(program_callables_info=program_callables_info) - - # making the target of the child kernel to be same as the target of parent - # kernel. - callable_kernel = CallableKernel(subkernel=callee_kernel.copy( - target=program.target, - is_called_from_host=False)) - - # FIXME disabling global barriers for callee kernel (for now) - from loopy import set_options - callee_kernel = set_options(callee_kernel, "disable_global_barriers") - - # FIXME: the number of callables is wrong. This is horrible please - # compensate. - - return register_function_id_to_in_knl_callable_mapper( - program, - _RegisterCalleeKernel(callable_kernel)) - -# }}} - - -# {{{ kernel inliner mapper - -class KernelInliner(SubstitutionMapper): - """Mapper to replace variables (indices, temporaries, arguments) in the - callee kernel with variables in the caller kernel. 
- - :arg caller: the caller kernel - :arg arg_map: dict of argument name to variables in caller - :arg arg_dict: dict of argument name to arguments in callee - """ - - def __init__(self, subst_func, caller, arg_map, arg_dict): - super(KernelInliner, self).__init__(subst_func) - self.caller = caller - self.arg_map = arg_map - self.arg_dict = arg_dict - - def map_subscript(self, expr): - if expr.aggregate.name in self.arg_map: - - aggregate = self.subst_func(expr.aggregate) - sar = self.arg_map[expr.aggregate.name] # SubArrayRef in caller - callee_arg = self.arg_dict[expr.aggregate.name] # Arg in callee - if aggregate.name in self.caller.arg_dict: - caller_arg = self.caller.arg_dict[aggregate.name] # Arg in caller - else: - caller_arg = self.caller.temporary_variables[aggregate.name] - - # Firstly, map inner inames to outer inames. - outer_indices = self.map_tuple(expr.index_tuple) - - # Next, reshape to match dimension of outer arrays. - # We can have e.g. A[3, 2] from outside and B[6] from inside - from numbers import Integral - if not all(isinstance(d, Integral) for d in callee_arg.shape): - raise LoopyError( - "Argument: {0} in callee kernel: {1} does not have " - "constant shape.".format(callee_arg)) - - flatten_index = 0 - for i, idx in enumerate(sar.get_begin_subscript().index_tuple): - flatten_index += idx*caller_arg.dim_tags[i].stride - - flatten_index += sum( - idx * tag.stride - for idx, tag in zip(outer_indices, callee_arg.dim_tags)) - - from loopy.isl_helpers import simplify_via_aff - flatten_index = simplify_via_aff(flatten_index) - - new_indices = [] - for dim_tag in caller_arg.dim_tags: - ind = flatten_index // dim_tag.stride - flatten_index -= (dim_tag.stride * ind) - new_indices.append(ind) - - new_indices = tuple(simplify_via_aff(i) for i in new_indices) - - return aggregate.index(tuple(new_indices)) - else: - return super(KernelInliner, self).map_subscript(expr) - -# }}} - - -# {{{ inlining of a single call instruction - -def 
_inline_call_instruction(caller_kernel, callee_knl, instruction): - """ - Returns a copy of *kernel* with the *instruction* in the *kernel* - replaced by inlining :attr:`subkernel` within it. - """ - callee_label = callee_knl.name[:4] + "_" - - # {{{ duplicate and rename inames - - vng = caller_kernel.get_var_name_generator() - ing = caller_kernel.get_instruction_id_generator() - dim_type = isl.dim_type.set - - iname_map = {} - for iname in callee_knl.all_inames(): - iname_map[iname] = vng(callee_label+iname) - - new_domains = [] - new_iname_to_tags = caller_kernel.iname_to_tags.copy() - - # transferring iname tags info from the callee to the caller kernel - for domain in callee_knl.domains: - new_domain = domain.copy() - for i in range(new_domain.n_dim()): - iname = new_domain.get_dim_name(dim_type, i) - - if iname in callee_knl.iname_to_tags: - new_iname_to_tags[iname_map[iname]] = ( - callee_knl.iname_to_tags[iname]) - new_domain = new_domain.set_dim_name( - dim_type, i, iname_map[iname]) - new_domains.append(new_domain) - - kernel = caller_kernel.copy(domains=caller_kernel.domains + new_domains, - iname_to_tags=new_iname_to_tags) - - # }}} - - # {{{ rename temporaries - - temp_map = {} - new_temps = kernel.temporary_variables.copy() - for name, temp in six.iteritems(callee_knl.temporary_variables): - new_name = vng(callee_label+name) - temp_map[name] = new_name - new_temps[new_name] = temp.copy(name=new_name) - - kernel = kernel.copy(temporary_variables=new_temps) - - # }}} - - # {{{ match kernel arguments - - arg_map = {} # callee arg name -> caller symbols (e.g. 
SubArrayRef) - - assignees = instruction.assignees # writes - parameters = instruction.expression.parameters # reads - - # add keyword parameters - from pymbolic.primitives import CallWithKwargs - - if isinstance(instruction.expression, CallWithKwargs): - from loopy.kernel.function_interface import get_kw_pos_association - - _, pos_to_kw = get_kw_pos_association(callee_knl) - kw_parameters = instruction.expression.kw_parameters - for i in range(len(parameters), len(parameters) + len(kw_parameters)): - parameters = parameters + (kw_parameters[pos_to_kw[i]],) - - assignee_pos = 0 - parameter_pos = 0 - for i, arg in enumerate(callee_knl.args): - if arg.is_output_only: - arg_map[arg.name] = assignees[assignee_pos] - assignee_pos += 1 - else: - arg_map[arg.name] = parameters[parameter_pos] - parameter_pos += 1 - - # }}} - - # {{{ rewrite instructions - - import pymbolic.primitives as p - from pymbolic.mapper.substitutor import make_subst_func - - var_map = dict((p.Variable(k), p.Variable(v)) - for k, v in six.iteritems(iname_map)) - var_map.update(dict((p.Variable(k), p.Variable(v)) - for k, v in six.iteritems(temp_map))) - var_map.update(dict((p.Variable(k), p.Variable(v.subscript.aggregate.name)) - for k, v in six.iteritems(arg_map))) - subst_mapper = KernelInliner( - make_subst_func(var_map), kernel, arg_map, callee_knl.arg_dict) - - insn_id = {} - for insn in callee_knl.instructions: - insn_id[insn.id] = ing(callee_label+insn.id) - - # {{{ root and leave instructions in callee kernel - - dep_map = callee_knl.recursive_insn_dep_map() - # roots depend on nothing - heads = set(insn for insn, deps in six.iteritems(dep_map) if not deps) - # leaves have nothing that depends on them - tails = set(dep_map.keys()) - for insn, deps in six.iteritems(dep_map): - tails = tails - deps - - # }}} - - # {{{ use NoOp to mark the start and end of callee kernel - - from loopy.kernel.instruction import NoOpInstruction - - noop_start = NoOpInstruction( - id=ing(callee_label+"_start"), - 
within_inames=instruction.within_inames, - depends_on=instruction.depends_on - ) - noop_end = NoOpInstruction( - id=instruction.id, - within_inames=instruction.within_inames, - depends_on=frozenset(insn_id[insn] for insn in tails) - ) - # }}} - - inner_insns = [noop_start] - - for insn in callee_knl.instructions: - insn = insn.with_transformed_expressions(subst_mapper) - within_inames = frozenset(map(iname_map.get, insn.within_inames)) - within_inames = within_inames | instruction.within_inames - depends_on = frozenset(map(insn_id.get, insn.depends_on)) | ( - instruction.depends_on) - if insn.id in heads: - depends_on = depends_on | set([noop_start.id]) - insn = insn.copy( - id=insn_id[insn.id], - within_inames=within_inames, - # TODO: probaby need to keep priority in callee kernel - priority=instruction.priority, - depends_on=depends_on - ) - inner_insns.append(insn) - - inner_insns.append(noop_end) - - new_insns = [] - for insn in kernel.instructions: - if insn == instruction: - new_insns.extend(inner_insns) - else: - new_insns.append(insn) - - kernel = kernel.copy(instructions=new_insns) - - # }}} - - return kernel - -# }}} - - -# {{{ inline callable kernel - -def _inline_single_callable_kernel(caller_kernel, function_name, - program_callables_info): - old_insns = caller_kernel.instructions - for insn in old_insns: - if isinstance(insn, CallInstruction): - # FIXME This seems to use identifiers across namespaces. Why not - # check whether the function is a scoped function first? 
~AK - if insn.expression.function.name in program_callables_info: - history_of_identifier = program_callables_info.history[ - insn.expression.function.name] - - if function_name in history_of_identifier: - in_knl_callable = program_callables_info[ - insn.expression.function.name] - assert isinstance(in_knl_callable, CallableKernel) - caller_kernel = _inline_call_instruction( - caller_kernel, in_knl_callable.subkernel, insn) - program_callables_info = ( - program_callables_info.with_deleted_callable( - insn.expression.function.name, - program_callables_info.num_times_callables_called[ - caller_kernel.name])) - elif isinstance(insn, (MultiAssignmentBase, CInstruction, - _DataObliviousInstruction)): - pass - else: - raise NotImplementedError( - "Unknown instruction type %s" - % type(insn).__name__) - - return caller_kernel, program_callables_info - - -# FIXME This should take a 'within' parameter to be able to only inline -# *some* calls to a kernel, but not others. -def inline_callable_kernel(program, function_name): - """ - Returns a copy of *kernel* with the callable kernel addressed by - (scoped) name *function_name* inlined. 
- """ - from loopy.preprocess import infer_arg_descr - program = infer_arg_descr(program) - program_callables_info = program.program_callables_info - old_program_callables_info = program_callables_info.copy() - - edited_callable_kernels = {} - - for func_id, in_knl_callable in old_program_callables_info.items(): - if function_name not in old_program_callables_info.history[func_id] and ( - isinstance(in_knl_callable, CallableKernel)): - caller_kernel = in_knl_callable.subkernel - caller_kernel, program_callables_info = ( - _inline_single_callable_kernel(caller_kernel, - function_name, - program_callables_info)) - edited_callable_kernels[func_id] = in_knl_callable.copy( - subkernel=caller_kernel) - - new_resolved_functions = {} - for func_id, in_knl_callable in program_callables_info.items(): - if func_id in edited_callable_kernels: - new_resolved_functions[func_id] = edited_callable_kernels[func_id] - else: - new_resolved_functions[func_id] = in_knl_callable - - program_callables_info = program_callables_info.copy( - resolved_functions=new_resolved_functions) - - return program.copy(program_callables_info=program_callables_info) - -# }}} - - -# {{{ tools to match caller to callee args by (guessed) automatic reshaping - -# (This is undocumented and not recommended, but it is currently needed -# to support Firedrake.) - -class DimChanger(IdentityMapper): - """ - Mapper to change the dimensions of an argument. - - .. attribute:: callee_arg_dict - - A mapping from the argument name (:class:`str`) to instances of - :class:`loopy.kernel.array.ArrayBase`. - - .. attribute:: desried_shape - - A mapping from argument name (:class:`str`) to an instance of - :class:`tuple`. 
- """ - def __init__(self, callee_arg_dict, desired_shape): - self.callee_arg_dict = callee_arg_dict - self.desired_shape = desired_shape - - def map_subscript(self, expr): - callee_arg_dim_tags = self.callee_arg_dict[expr.aggregate.name].dim_tags - flattened_index = sum(dim_tag.stride*idx for dim_tag, idx in - zip(callee_arg_dim_tags, expr.index_tuple)) - new_indices = [] - - from operator import mul - from functools import reduce - stride = reduce(mul, self.desired_shape[expr.aggregate.name], 1) - - for length in self.desired_shape[expr.aggregate.name]: - stride /= length - ind = flattened_index // int(stride) - flattened_index -= (int(stride) * ind) - new_indices.append(simplify_via_aff(ind)) - - return expr.aggregate.index(tuple(new_indices)) - - -def _match_caller_callee_argument_dimension_for_single_kernel( - caller_knl, program_callables_info, callee_function_name): - """ - Returns a copy of *caller_knl* with the instance of - :class:`loopy.kernel.function_interface.CallableKernel` addressed by - *callee_function_name* in the *caller_knl* aligned with the argument - dimesnsions required by *caller_knl*. - """ - pymbolic_calls_to_new_callables = {} - for insn in caller_knl.instructions: - if not isinstance(insn, CallInstruction) or ( - insn.expression.function.name not in - program_callables_info): - # Call to a callable kernel can only occur through a - # CallInstruction. - continue - - in_knl_callable = program_callables_info[ - insn.expression.function.name] - - if in_knl_callable.subkernel.name != callee_function_name: - # Not the callable we're looking for. 
- continue - - # getting the caller->callee arg association - - parameters = insn.expression.parameters[:] - kw_parameters = {} - if isinstance(insn.expression, CallWithKwargs): - kw_parameters = insn.expression.kw_parameters - - assignees = insn.assignees - - parameter_shapes = [par.get_array_arg_descriptor(caller_knl).shape - for par in parameters] - kw_to_pos, pos_to_kw = get_kw_pos_association(in_knl_callable.subkernel) - for i in range(len(parameters), len(parameters)+len(kw_parameters)): - parameter_shapes.append(kw_parameters[pos_to_kw[i]] - .get_array_arg_descriptor(caller_knl).shape) - - # inserting the assigness at the required positions. - assignee_write_count = -1 - for i, arg in enumerate(in_knl_callable.subkernel.args): - if arg.is_output_only: - assignee = assignees[-assignee_write_count-1] - parameter_shapes.insert(i, assignee - .get_array_arg_descriptor(caller_knl).shape) - assignee_write_count -= 1 - - callee_arg_to_desired_dim_tag = dict(zip([arg.name for arg in - in_knl_callable.subkernel.args], parameter_shapes)) - dim_changer = DimChanger(in_knl_callable.subkernel.arg_dict, - callee_arg_to_desired_dim_tag) - new_callee_insns = [] - for callee_insn in in_knl_callable.subkernel.instructions: - if isinstance(callee_insn, MultiAssignmentBase): - new_callee_insns.append(callee_insn.copy(expression=dim_changer( - callee_insn.expression), - assignee=dim_changer(callee_insn.assignee))) - elif isinstance(callee_insn, (CInstruction, - _DataObliviousInstruction)): - pass - else: - raise NotImplementedError("Unknwon instruction %s." % - type(insn)) - - # subkernel with instructions adjusted according to the new dimensions. - new_subkernel = in_knl_callable.subkernel.copy(instructions=new_callee_insns) - - new_in_knl_callable = in_knl_callable.copy(subkernel=new_subkernel) - - pymbolic_calls_to_new_callables[insn.expression] = new_in_knl_callable - - if not pymbolic_calls_to_new_callables: - # complain if no matching function found. 
- raise LoopyError("No CallableKernel with the name %s found in %s." % ( - callee_function_name, caller_knl.name)) - - return change_names_of_pymbolic_calls(caller_knl, - pymbolic_calls_to_new_callables) - - -def _match_caller_callee_argument_dimension_(program, *args, **kwargs): - assert isinstance(program, Program) - - new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): - if isinstance(in_knl_callable, CallableKernel): - new_subkernel = ( - _match_caller_callee_argument_dimension_for_single_kernel( - in_knl_callable.subkernel, program.program_callables_info, - *args, **kwargs)) - in_knl_callable = in_knl_callable.copy( - subkernel=new_subkernel) - - elif isinstance(in_knl_callable, ScalarCallable): - pass - else: - raise NotImplementedError("Unknown type of callable %s." % ( - type(in_knl_callable).__name__)) - - new_resolved_functions[func_id] = in_knl_callable - - new_program_callables_info = program.program_callables_info.copy( - resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=new_program_callables_info) - -# }}} - - # vim: foldmethod=marker diff --git a/test/test_callables.py b/test/test_callables.py index f25bbbe6..d2ca9b71 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -26,7 +26,6 @@ import numpy as np import pyopencl as cl import pyopencl.clrandom # noqa: F401 import loopy as lp -import pytest import sys @@ -60,350 +59,6 @@ def test_register_function_lookup(ctx_factory): assert np.linalg.norm(np.log2(x)-out)/np.linalg.norm(np.log2(x)) < 1e-15 -@pytest.mark.parametrize("inline", [False, True]) -def test_register_knl(ctx_factory, inline): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - n = 2 ** 4 - - x = np.random.rand(n, n, n, n, n) - y = np.random.rand(n, n, n, n, n) - - grandchild_knl = lp.make_kernel_function( - "{[i, j]:0<= i, j< 16}", - """ - c[i, j] = 2*a[i, j] + 3*b[i, j] - """, name='linear_combo1') - - child_knl = 
lp.make_kernel_function( - "{[i, j]:0<=i, j < 16}", - """ - [i, j]: g[i, j] = linear_combo1([i, j]: e[i, j], [i, j]: f[i, j]) - """, name='linear_combo2') - - parent_knl = lp.make_kernel( - "{[i, j, k, l, m]: 0<=i, j, k, l, m<16}", - """ - [j, l]: z[i, j, k, l, m] = linear_combo2([j, l]: x[i, j, k, l, m], - [j, l]: y[i, j, k, l, m]) - """, - kernel_data=[ - lp.GlobalArg( - name='x', - dtype=np.float64, - shape=(16, 16, 16, 16, 16)), - lp.GlobalArg( - name='y', - dtype=np.float64, - shape=(16, 16, 16, 16, 16)), '...'], - ) - - knl = lp.register_callable_kernel( - parent_knl, child_knl) - knl = lp.register_callable_kernel( - knl, grandchild_knl) - if inline: - knl = lp.inline_callable_kernel(knl, 'linear_combo2') - knl = lp.inline_callable_kernel(knl, 'linear_combo1') - - evt, (out, ) = knl(queue, x=x, y=y) - - assert (np.linalg.norm(2*x+3*y-out)/( - np.linalg.norm(2*x+3*y))) < 1e-15 - - -@pytest.mark.parametrize("inline", [False, True]) -def test_slices_with_negative_step(ctx_factory, inline): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - n = 2 ** 4 - - x = np.random.rand(n, n, n, n, n) - y = np.random.rand(n, n, n, n, n) - - child_knl = lp.make_kernel_function( - "{[i, j]:0<=i, j < 16}", - """ - g[i, j] = 2*e[i, j] + 3*f[i, j] - """, name="linear_combo") - - parent_knl = lp.make_kernel( - "{[i, k, m]: 0<=i, k, m<16}", - """ - z[i, 15:-1:-1, k, :, m] = linear_combo(x[i, :, k, :, m], - y[i, :, k, :, m]) - """, - kernel_data=[ - lp.GlobalArg( - name='x', - dtype=np.float64, - shape=(16, 16, 16, 16, 16)), - lp.GlobalArg( - name='y', - dtype=np.float64, - shape=(16, 16, 16, 16, 16)), - lp.GlobalArg( - name='z', - dtype=np.float64, - shape=(16, 16, 16, 16, 16)), '...'], - ) - - knl = lp.register_callable_kernel( - parent_knl, child_knl) - if inline: - knl = lp.inline_callable_kernel(knl, 'linear_combo') - - evt, (out, ) = knl(queue, x=x, y=y) - - assert (np.linalg.norm(2*x+3*y-out[:, ::-1, :, :, :])/( - np.linalg.norm(2*x+3*y))) < 1e-15 - - 
-@pytest.mark.parametrize("inline", [False, True]) -def test_register_knl_with_call_with_kwargs(ctx_factory, inline): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - - n = 2 ** 2 - - a_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float32) - b_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float32) - c_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) - - callee_knl = lp.make_kernel_function( - "{[i, j]:0<=i, j < %d}" % n, - """ - h[i, j] = 2 * e[i, j] + 3*f[i, j] + 4*g[i, j] - <>f1[i, j] = 2*f[i, j] - p[i, j] = 7 * e[i, j] + 4*f1[i, j] + 2*g[i, j] - """, - [ - lp.GlobalArg('f, e, h, g'), '...'], - name='linear_combo') - - caller_knl = lp.make_kernel( - "{[i, j, k, l, m]: 0<=i, j, k, l, m<%d}" % n, - """ - <> d[i, j, k, l, m] = 2*b[i, j, k, l, m] - [j, l]: x[i, j, k, l, m], [j, l]: y[i, j, k, l, m] = linear_combo( - f=[j, l]: a[i, j, k, l, m], - g=[j, l]: d[i, j, k, l, m], - e=[j, l]: c[i, j, k, l, m]) - """) - - knl = lp.register_callable_kernel( - caller_knl, callee_knl) - if inline: - knl = lp.inline_callable_kernel(knl, 'linear_combo') - - evt, (out1, out2, ) = knl(queue, a=a_dev, b=b_dev, c=c_dev) - - a = a_dev.get() - b = b_dev.get() - c = c_dev.get() - - h = out1.get() # h = 2c + 3a + 8b - p = out2.get() # p = 7c + 8a + 4b - h_exact = 3*a + 8*b + 2*c - p_exact = 8*a + 4*b + 7*c - - assert np.linalg.norm(h-h_exact)/np.linalg.norm(h_exact) < 1e-7 - assert np.linalg.norm(p-p_exact)/np.linalg.norm(p_exact) < 1e-7 - - -@pytest.mark.parametrize("inline", [False, True]) -def test_register_knl_with_hw_axes(ctx_factory, inline): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - - n = 2 ** 4 - - x_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) - y_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) - - callee_knl = lp.make_kernel_function( - "{[i, j]:0<=i, j < 16}", - """ - g[i, j] = 2*e[i, j] + 3*f[i, j] - """, name='linear_combo') - - callee_knl = lp.split_iname(callee_knl, "i", 4, inner_tag="l.0", outer_tag="g.0") - - 
caller_knl = lp.make_kernel( - "{[i, j, k, l, m]: 0<=i, j, k, l, m<16}", - """ - [j, l]: z[i, j, k, l, m] = linear_combo([j, l]: x[i, j, k, l, m], - [j, l]: y[i, j, k, l, m]) - """ - ) - caller_knl = lp.split_iname(caller_knl, "i", 4, inner_tag="l.1", outer_tag="g.1") - - knl = lp.register_callable_kernel( - caller_knl, callee_knl) - - if inline: - knl = lp.inline_callable_kernel(knl, 'linear_combo') - - evt, (out, ) = knl(queue, x=x_dev, y=y_dev) - - x_host = x_dev.get() - y_host = y_dev.get() - - assert np.linalg.norm(2*x_host+3*y_host-out.get())/np.linalg.norm( - 2*x_host+3*y_host) < 1e-15 - - -@pytest.mark.parametrize("inline", [False, True]) -def test_shape_translation_through_sub_array_ref(ctx_factory, inline): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - - x1 = cl.clrandom.rand(queue, (3, 2), dtype=np.float64) - x2 = cl.clrandom.rand(queue, (6, ), dtype=np.float64) - x3 = cl.clrandom.rand(queue, (6, 6), dtype=np.float64) - - callee1 = lp.make_kernel_function( - "{[i]: 0<=i<6}", - """ - a[i] = 2*abs(b[i]) - """, name="callee_fn1") - - callee2 = lp.make_kernel_function( - "{[i, j]: 0<=i<3 and 0 <= j < 2}", - """ - a[i, j] = 3*b[i, j] - """, name="callee_fn2") - - callee3 = lp.make_kernel_function( - "{[i]: 0<=i<6}", - """ - a[i] = 5*b[i] - """, name="callee_fn3") - - knl = lp.make_kernel( - "{[i, j, k, l]: 0<= i < 6 and 0 <= j < 3 and 0 <= k < 2 and 0<=l<6}", - """ - [i]: y1[i//2, i%2] = callee_fn1([i]: x1[i//2, i%2]) - [j, k]: y2[2*j+k] = callee_fn2([j, k]: x2[2*j+k]) - [l]: y3[l, l] = callee_fn3([l]: x3[l, l]) - """) - - knl = lp.register_callable_kernel(knl, callee1) - knl = lp.register_callable_kernel(knl, callee2) - knl = lp.register_callable_kernel(knl, callee3) - - if inline: - knl = lp.inline_callable_kernel(knl, 'callee_fn1') - knl = lp.inline_callable_kernel(knl, 'callee_fn2') - knl = lp.inline_callable_kernel(knl, 'callee_fn3') - - knl = lp.set_options(knl, "write_cl") - knl = lp.set_options(knl, "return_dict") - evt, out_dict = knl(queue, 
x1=x1, x2=x2, x3=x3) - - y1 = out_dict['y1'].get() - y2 = out_dict['y2'].get() - y3 = out_dict['y3'].get() - - assert (np.linalg.norm(y1-2*x1.get())) < 1e-15 - assert (np.linalg.norm(y2-3*x2.get())) < 1e-15 - assert (np.linalg.norm(np.diag(y3-5*x3.get()))) < 1e-15 - - -def test_multi_arg_array_call(ctx_factory): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - import pymbolic.primitives as p - n = 10 - acc_i = p.Variable("acc_i") - i = p.Variable("i") - index = p.Variable("index") - a_i = p.Subscript(p.Variable("a"), p.Variable("i")) - argmin_kernel = lp.make_kernel_function( - "{[i]: 0 <= i < n}", - [ - lp.Assignment(id="init2", assignee=index, - expression=0), - lp.Assignment(id="init1", assignee=acc_i, - expression="214748367"), - lp.Assignment(id="insn", assignee=index, - expression=p.If(p.Expression.eq(acc_i, a_i), i, index), - depends_on="update"), - lp.Assignment(id="update", assignee=acc_i, - expression=p.Variable("min")(acc_i, a_i), - depends_on="init1,init2")], - name="custom_argmin") - - argmin_kernel = lp.fix_parameters(argmin_kernel, n=n) - - knl = lp.make_kernel( - "{[i]:0<=i 1: exec(sys.argv[1]) -- GitLab From 5ed57fe2f50af100a75c08ff1f876c938123d666 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 12 Aug 2018 18:44:11 +0530 Subject: [PATCH 06/80] minor error handling. --- loopy/codegen/__init__.py | 18 ++++------ loopy/kernel/__init__.py | 56 +++++------------------------- loopy/kernel/creation.py | 9 ++--- loopy/kernel/function_interface.py | 4 --- loopy/kernel/instruction.py | 12 ++----- loopy/preprocess.py | 11 ++---- loopy/type_inference.py | 19 ++-------- 7 files changed, 25 insertions(+), 104 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 3e675db7..7a25b67e 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -150,7 +150,6 @@ class SeenFunction(ImmutableRecord): class CodeGenerationState(object): """ .. attribute:: kernel - .. attribute:: target .. 
attribute:: implemented_data_info a list of :class:`ImplementedDataInfo` objects. @@ -196,7 +195,7 @@ class CodeGenerationState(object): .. attribute:: program_callables_info """ - def __init__(self, kernel, target, + def __init__(self, kernel, implemented_data_info, implemented_domain, implemented_predicates, seen_dtypes, seen_functions, seen_atomic_dtypes, var_subst_map, allow_complex, @@ -206,7 +205,6 @@ class CodeGenerationState(object): gen_program_name=None, schedule_index_end=None): self.kernel = kernel - self.target = target self.implemented_data_info = implemented_data_info self.implemented_domain = implemented_domain self.implemented_predicates = implemented_predicates @@ -224,7 +222,7 @@ class CodeGenerationState(object): # {{{ copy helpers - def copy(self, kernel=None, target=None, implemented_data_info=None, + def copy(self, kernel=None, implemented_data_info=None, implemented_domain=None, implemented_predicates=frozenset(), var_subst_map=None, vectorization_info=None, is_generating_device_code=None, @@ -234,9 +232,6 @@ class CodeGenerationState(object): if kernel is None: kernel = self.kernel - if target is None: - target = self.target - if implemented_data_info is None: implemented_data_info = self.implemented_data_info @@ -257,7 +252,6 @@ class CodeGenerationState(object): return CodeGenerationState( kernel=kernel, - target=target, implemented_data_info=implemented_data_info, implemented_domain=implemented_domain or self.implemented_domain, implemented_predicates=( @@ -389,7 +383,7 @@ class PreambleInfo(ImmutableRecord): # {{{ main code generation entrypoint -def generate_code_for_a_single_kernel(kernel, program_callables_info, target): +def generate_code_for_a_single_kernel(kernel, program_callables_info): """ :returns: a :class:`CodeGenerationResult` """ @@ -477,7 +471,7 @@ def generate_code_for_a_single_kernel(kernel, program_callables_info, target): gen_program_name=( kernel.target.host_program_name_prefix + kernel.name - + 
target.host_program_name_suffix), + + kernel.target.host_program_name_suffix), schedule_index_end=len(kernel.schedule), program_callables_info=program_callables_info) @@ -512,7 +506,7 @@ def generate_code_for_a_single_kernel(kernel, program_callables_info, target): ) preamble_generators = (kernel.preamble_generators - + target.get_device_ast_builder().preamble_generators()) + + kernel.target.get_device_ast_builder().preamble_generators()) for prea_gen in preamble_generators: preambles.extend(prea_gen(preamble_info)) @@ -555,7 +549,7 @@ def generate_code_v2(program): if isinstance(in_knl_callable, CallableKernel): codegen_results[func_id] = ( generate_code_for_a_single_kernel(in_knl_callable.subkernel, - program.program_callables_info, program.target)) + program.program_callables_info)) device_preambles = set() for cgr in codegen_results.values(): diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index d2723c57..f686e58f 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1036,25 +1036,19 @@ class LoopKernel(ImmutableRecordWithoutPickling): self.get_iname_bounds(iname, constants_only=True).size, constants_only=True))) - def get_grid_sizes_for_insn_ids_as_dicts(self, insn_ids, - program_callables_info, ignore_auto=False): + def get_grid_sizes_for_insn_ids(self, insn_ids, program_callables_info, + ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of all instructions whose IDs are given in *insn_ids*. - :arg insn_ids: a :class:`frozenset` of instruction IDs - - *global_size* and *local_size* are instances of :class:`dict` with - mapping of the form from ``axis`` to :class:`islpy.PwAff` objects. + *global_size* and *local_size* are :class:`islpy.PwAff` objects. 
""" - # {{{ collecting the callee kernels in insn_ids - - from loopy.kernel.tools import get_direct_callee_kernels - callee_kernels = get_direct_callee_kernels(self, - program_callables_info, insn_ids) - - # }}} + if self.overridden_get_grid_sizes_for_insn_ids: + return self.overridden_get_grid_sizes_for_insn_ids( + insn_ids, + ignore_auto=ignore_auto) all_inames_by_insns = set() for insn_id in insn_ids: @@ -1069,15 +1063,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): global_sizes = {} local_sizes = {} - # updating the grid sizes from the callee_kernels. - for callee_kernel in callee_kernels: - gsize, lsize = callee_kernel.get_grid_sizes_for_insn_ids_as_dicts( - frozenset(insn.id for insn in callee_kernel.instructions), - program_callables_info, ignore_auto) - - global_sizes.update(gsize) - local_sizes.update(lsize) - from loopy.kernel.data import ( GroupIndexTag, LocalIndexTag, AutoLocalIndexTagBase) @@ -1118,31 +1103,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): tgt_dict[tag.axis] = size - return global_sizes, local_sizes - - def get_grid_sizes_for_insn_ids(self, insn_ids, program_callables_info, - ignore_auto=False): - """Return a tuple (global_size, local_size) containing a grid that - could accommodate execution of all instructions whose IDs are given - in *insn_ids*. - - :arg insn_ids: a :class:`frozenset` of instruction IDs - - *global_size* and *local_size* are :class:`islpy.PwAff` objects. 
- """ - - if self.overridden_get_grid_sizes_for_insn_ids: - return self.overridden_get_grid_sizes_for_insn_ids( - insn_ids, - program_callables_info=program_callables_info, - ignore_auto=ignore_auto) - - assert self.is_called_from_host, ("Callee kernels do not have sufficient " - "information to compute grid sizes.") - - global_sizes, local_sizes = self.get_grid_sizes_for_insn_ids_as_dicts( - insn_ids, program_callables_info, ignore_auto=ignore_auto) - def to_dim_tuple(size_dict, which, forced_sizes={}): forced_sizes = forced_sizes.copy() @@ -1172,6 +1132,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): return (to_dim_tuple(global_sizes, "global"), to_dim_tuple(local_sizes, "local", forced_sizes=self.local_sizes)) + + def get_grid_sizes_for_insn_ids_as_exprs(self, insn_ids, program_callables_info, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index bac4afc8..bc996d9c 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -27,16 +27,13 @@ THE SOFTWARE. 
import numpy as np from pymbolic.mapper import CSECachingMapperMixin -from pymbolic.primitives import Slice, Variable, Subscript from loopy.tools import intern_frozenset_of_ids from loopy.symbolic import ( - IdentityMapper, WalkMapper, SubArrayRef) + IdentityMapper, WalkMapper) from loopy.kernel.data import ( InstructionBase, MultiAssignmentBase, Assignment, SubstitutionRule, AddressSpace) -from loopy.kernel.instruction import (CInstruction, _DataObliviousInstruction, - CallInstruction) from loopy.diagnostic import LoopyError, warn_with_kernel import islpy as isl from islpy import dim_type @@ -507,11 +504,9 @@ def parse_insn(groups, insn_options): assignee_names.append(inner_lhs_i.name) elif isinstance(inner_lhs_i, (Subscript, LinearSubscript)): assignee_names.append(inner_lhs_i.aggregate.name) - elif isinstance(inner_lhs_i, SubArrayRef): - assignee_names.append(inner_lhs_i.subscript.aggregate.name) else: raise LoopyError("left hand side of assignment '%s' must " - "be variable, subscript or a SubArrayRef" % (lhs_i,)) + "be variable or subscript" % (lhs_i,)) new_lhs.append(lhs_i) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 8b24da21..e0954fb7 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -571,13 +571,9 @@ class CallableKernel(InKernelCallable): # no type casting in array calls from loopy.expression import dtype_to_type_context from pymbolic.mapper.stringifier import PREC_NONE - from loopy.symbolic import SubArrayRef from pymbolic import var c_parameters = [ - expression_to_code_mapper(par, PREC_NONE, - dtype_to_type_context(target, par_dtype), - par_dtype).expr if isinstance(par, SubArrayRef) else expression_to_code_mapper(par, PREC_NONE, dtype_to_type_context(target, par_dtype), par_dtype).expr diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index 0f548bba..2a03ad63 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -487,7 
+487,7 @@ class InstructionBase(ImmutableRecord): def _get_assignee_var_name(expr): from pymbolic.primitives import Variable, Subscript, Lookup - from loopy.symbolic import LinearSubscript, SubArrayRef + from loopy.symbolic import LinearSubscript if isinstance(expr, Lookup): expr = expr.aggregate @@ -507,19 +507,13 @@ def _get_assignee_var_name(expr): return agg.name - elif isinstance(expr, SubArrayRef): - agg = expr.subscript.aggregate - assert isinstance(agg, Variable) - - return agg.name - else: raise RuntimeError("invalid lvalue '%s'" % expr) def _get_assignee_subscript_deps(expr): from pymbolic.primitives import Variable, Subscript, Lookup - from loopy.symbolic import LinearSubscript, get_dependencies, SubArrayRef + from loopy.symbolic import LinearSubscript, get_dependencies if isinstance(expr, Lookup): expr = expr.aggregate @@ -530,8 +524,6 @@ def _get_assignee_subscript_deps(expr): return get_dependencies(expr.index) elif isinstance(expr, LinearSubscript): return get_dependencies(expr.index) - elif isinstance(expr, SubArrayRef): - return get_dependencies(expr.get_begin_subscript().index) else: raise RuntimeError("invalid lvalue '%s'" % expr) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 3657967a..bf23c4a4 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2165,7 +2165,7 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): def map_call(self, expr, expn_state, **kwargs): from pymbolic.primitives import Call, CallWithKwargs from loopy.kernel.function_interface import ValueArgDescriptor - from loopy.symbolic import ResolvedFunction, SubArrayRef + from loopy.symbolic import ResolvedFunction if not isinstance(expr.function, ResolvedFunction): # ignore if the call is not to a ResolvedFunction @@ -2178,8 +2178,7 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): kw_parameters = expr.kw_parameters # descriptors for the args and kwargs of the Call - arg_id_to_descr = dict((i, par.get_array_arg_descriptor(self.caller_kernel)) - 
if isinstance(par, SubArrayRef) else (i, ValueArgDescriptor()) + arg_id_to_descr = dict((i, ValueArgDescriptor()) for i, par in tuple(enumerate(expr.parameters)) + tuple(kw_parameters.items())) @@ -2190,11 +2189,7 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): assignees = kwargs['assignees'] assert isinstance(assignees, tuple) for i, par in enumerate(assignees): - if isinstance(par, SubArrayRef): - assignee_id_to_descr[-i-1] = ( - par.get_array_arg_descriptor(self.caller_kernel)) - else: - assignee_id_to_descr[-i-1] = ValueArgDescriptor() + assignee_id_to_descr[-i-1] = ValueArgDescriptor() # gathering all the descriptors combined_arg_id_to_descr = arg_id_to_descr.copy() diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 0e8fa305..3ae9a142 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -36,7 +36,7 @@ from loopy.diagnostic import ( from loopy.kernel.instruction import _DataObliviousInstruction from loopy.program import ProgramCallablesInfo -from loopy.symbolic import SubArrayRef, LinearSubscript +from loopy.symbolic import LinearSubscript from pymbolic.primitives import Variable, Subscript, Lookup import logging @@ -548,10 +548,6 @@ class TypeInferenceMapper(CombineMapper): return [expr.operation.result_dtypes(self.kernel, rec_result)[0] for rec_result in rec_results] - def map_sub_array_ref(self, expr): - return self.rec(expr.get_begin_subscript()) - - # }}} @@ -831,17 +827,8 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, assignee.aggregate.name].dtype is None: return False else: - assert isinstance(assignee, SubArrayRef) - if assignee.subscript.aggregate.name in kernel.arg_dict: - if kernel.arg_dict[ - assignee.subscript.aggregate.name].dtype is None: - return False - else: - assert assignee.subscript.aggregate.name in ( - kernel.temporary_variables) - if kernel.temporary_variables[ - assignee.subscript.aggregate.name] is None: - return False + raise NotImplementedError("Unknown 
assignee type %s" % + type(assignee)) return True -- GitLab From 79fed9786ce5ae90c367ac6cbff1192678aa1014 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 12 Aug 2018 18:55:30 +0530 Subject: [PATCH 07/80] Flake8 --- loopy/isl_helpers.py | 2 +- loopy/kernel/__init__.py | 11 ----------- loopy/target/opencl.py | 5 ----- 3 files changed, 1 insertion(+), 17 deletions(-) diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index ef07b7e2..5a747d07 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -27,7 +27,7 @@ THE SOFTWARE. from six.moves import range, zip -from loopy.diagnostic import StaticValueFindingError, LoopyError +from loopy.diagnostic import StaticValueFindingError import islpy as isl from islpy import dim_type diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index f686e58f..f5e105c7 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -221,11 +221,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): A subclass of :class:`loopy.TargetBase`. - .. attribute:: is_called_from_host - - An instance of :class:`bool`. Will be set *False* for the kernel which - would be called from another top level kernels. Default value is - *True*. 
""" # {{{ constructor @@ -254,8 +249,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): state=KernelState.INITIAL, target=None, - is_called_from_host=True, - overridden_get_grid_sizes_for_insn_ids=None, _cached_written_variables=None): """ @@ -368,7 +361,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): options=options, state=state, target=target, - is_called_from_host=is_called_from_host, overridden_get_grid_sizes_for_insn_ids=( overridden_get_grid_sizes_for_insn_ids), _cached_written_variables=_cached_written_variables) @@ -1132,8 +1124,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): return (to_dim_tuple(global_sizes, "global"), to_dim_tuple(local_sizes, "local", forced_sizes=self.local_sizes)) - - def get_grid_sizes_for_insn_ids_as_exprs(self, insn_ids, program_callables_info, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that @@ -1456,7 +1446,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): "silenced_warnings", "options", "state", - "is_called_from_host", "target", ) diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 44f782a7..44bf9c4c 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -470,11 +470,6 @@ class OpenCLCASTBuilder(CASTBuilder): from loopy.target.c import FunctionDeclarationWrapper assert isinstance(fdecl, FunctionDeclarationWrapper) - if not codegen_state.kernel.is_called_from_host: - # auxiliary kernels need not mention opencl speicific qualifiers - # for a functions signature - return fdecl - fdecl = fdecl.subdecl from cgen.opencl import CLKernel, CLRequiredWorkGroupSize -- GitLab From ec84ad60427fa2ebf2accf03e4b9432bece54be6 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 12 Aug 2018 19:21:46 +0530 Subject: [PATCH 08/80] adds program_callables_info to grid_override... 
--- loopy/kernel/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index f5e105c7..be66cf85 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1040,6 +1040,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): if self.overridden_get_grid_sizes_for_insn_ids: return self.overridden_get_grid_sizes_for_insn_ids( insn_ids, + program_callables_info, ignore_auto=ignore_auto) all_inames_by_insns = set() -- GitLab From dd995d883c7ea00950f7121533c86a0638cd2b10 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 12 Aug 2018 19:47:04 +0530 Subject: [PATCH 09/80] took the test to the earlier state. --- test/test_loopy.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test/test_loopy.py b/test/test_loopy.py index 02eeda13..43371c8a 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -409,11 +409,14 @@ def test_ilp_write_race_detection_global(ctx_factory): knl = lp.tag_inames(knl, dict(j="ilp")) + knl = lp.preprocess_kernel(knl) + with lp.CacheMode(False): from loopy.diagnostic import WriteRaceConditionWarning from warnings import catch_warnings with catch_warnings(record=True) as warn_list: - lp.generate_code_v2(knl) + list(lp.generate_loop_schedules(knl.root_kernel, + knl.program_callables_info)) assert any(isinstance(w.message, WriteRaceConditionWarning) for w in warn_list) -- GitLab From 82a16b6cc6709b5a9f516ef5b1da376b92782b8d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 13 Aug 2018 11:27:00 +0530 Subject: [PATCH 10/80] fix the style of code to get started with changing ProgramCallablesInfo --- loopy/kernel/__init__.py | 3 +- loopy/kernel/function_interface.py | 4 +- loopy/library/reduction.py | 2 +- loopy/program.py | 70 +++++++----------------------- loopy/statistics.py | 6 +-- loopy/symbolic.py | 8 ++-- 6 files changed, 27 insertions(+), 66 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 
be66cf85..3f637e53 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1360,7 +1360,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): # {{{ direct execution def __call__(self, *args, **kwargs): - # FIXME: scream and then convert to a program + raise LoopyError("Calling a LoopKernel is deprecated, call a Program " + "instead.") from loopy.program import make_program_from_kernel program = make_program_from_kernel(self) return program(*args, **kwargs) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index e0954fb7..8c3a6911 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -676,8 +676,8 @@ def next_indexed_variable(function): or :class:`loopy.reduction.ArgExtOp` or :class:`loopy.reduction.SegmentedOp`. """ - from loopy.library.reduction import ArgExtOp, SegmentedOp - if isinstance(function, (ArgExtOp, SegmentedOp)): + from loopy.library.reduction import ReductionOpFunction + if isinstance(function, ReductionOpFunction): return function.copy() func_name = re.compile(r"^(?P\S+?)_(?P\d+?)$") diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 6ec8e4b2..b968192e 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -504,7 +504,7 @@ class ReductionCallable(ScalarCallable): def reduction_scoper(target, identifier): - if isinstance(identifier, (ArgExtOp, SegmentedOp)): + if isinstance(identifier, ReductionOpFunction): return ReductionCallable(name=identifier) return None diff --git a/loopy/program.py b/loopy/program.py index 096bd1ec..279228af 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -298,14 +298,7 @@ class Program(ImmutableRecord): return pex(*args, **kwargs) def __str__(self): - # FIXME: make this better - print(self.program_callables_info.num_times_callables_called) - return ( - (self.program_callables_info[ - self.name].subkernel).__str__() + - '\nResolved Functions: ' + - 
(self.program_callables_info.resolved_functions.keys()).__str__() + - '\n' + 75*'-' + '\n') + return self.root_kernel.__str__() # }}} @@ -315,14 +308,14 @@ def next_indexed_function_identifier(function): Returns an instance of :class:`str` with the next indexed-name in the sequence for the name of *function*. - *Example:* ``Variable('sin_0')`` will return ``'sin_1'``. + *Example:* ``'sin_0'`` will return ``'sin_1'``. - :arg function: Either an instance of :class:`pymbolic.primitives.Variable` - or :class:`loopy.reduction.ArgExtOp` or - :class:`loopy.reduction.SegmentedOp`. + :arg function: Either an instance of :class:`str`, + :class:`pymbolic.primitives.Variable` , + :class:`loopy.reduction.ReductionOpFunction`. """ - from loopy.library.reduction import ArgExtOp, SegmentedOp - if isinstance(function, (ArgExtOp, SegmentedOp)): + from loopy.library.reduction import ReductionOpFunction + if isinstance(function, ReductionOpFunction): return function.copy() elif isinstance(function, str): function = Variable(function) @@ -371,12 +364,8 @@ def rename_resolved_functions_in_a_single_kernel(kernel, # {{{ program callables info class ProgramCallablesInfo(ImmutableRecord): - # FIXME: dont evalutate num_times_called, rahter compute it from the - # resolved_functions - # FIXME: make the edit callables thing a ContextManager. 
def __init__(self, resolved_functions, num_times_callables_called=None, history=None, is_being_edited=False, - num_times_hit_during_editing={}, renames_needed_after_editing={}): if num_times_callables_called is None: @@ -391,23 +380,19 @@ class ProgramCallablesInfo(ImmutableRecord): num_times_callables_called=num_times_callables_called, history=history, is_being_edited=is_being_edited, - num_times_hit_during_editing=num_times_hit_during_editing, renames_needed_after_editing=renames_needed_after_editing) hash_fields = ( "resolved_functions", "num_times_callables_called", "is_being_edited", - "num_times_hit_during_editing", "renames_needed_after_editing", "history") update_persistent_hash = LoopKernel.update_persistent_hash def with_edit_callables_mode(self): - return self.copy(is_being_edited=True, - num_times_hit_during_editing=dict((func_id, 0) for func_id in - self.resolved_functions)) + return self.copy(is_being_edited=True) def with_callable(self, function, in_kernel_callable, resolved_for_the_first_time=False): @@ -426,6 +411,10 @@ class ProgramCallablesInfo(ImmutableRecord): # FIXME: add a note about using enter and exit. ~KK # FIXME: think about a better idea of "with_added_callable" this would # be more convenient for developer-faced usage. ~KK + # FIXME: Is this is a bad code? Yes. + # Is there a better alternative to it. Definitely maybe. + # But I don't want to spend the next 182 years of my life optimizing + # some scheme, without even implmenting it to some problem! 
if not self.is_being_edited: if function.name in self.resolved_functions and ( @@ -436,29 +425,22 @@ class ProgramCallablesInfo(ImmutableRecord): print('New: ', in_kernel_callable) raise LoopyError("Use 'enter_edit_callables_mode' first.") - from loopy.library.reduction import ArgExtOp, SegmentedOp + from loopy.library.reduction import ReductionOpFunction # {{{ sanity checks if isinstance(function, str): function = Variable(function) - assert isinstance(function, (Variable, ArgExtOp, SegmentedOp)) + assert isinstance(function, (Variable, ReductionOpFunction)) # }}} renames_needed_after_editing = self.renames_needed_after_editing.copy() - num_times_hit_during_editing = self.num_times_hit_during_editing.copy() num_times_callables_called = self.num_times_callables_called.copy() history = self.history.copy() - if not resolved_for_the_first_time: - if isinstance(function, (ArgExtOp, SegmentedOp)): - num_times_hit_during_editing[function] += 1 - else: - num_times_hit_during_editing[function.name] += 1 - - if isinstance(function, (ArgExtOp, SegmentedOp)): + if isinstance(function, ReductionOpFunction): unique_function_identifier = function.copy() if not resolved_for_the_first_time: num_times_callables_called[function] -= 1 @@ -473,8 +455,6 @@ class ProgramCallablesInfo(ImmutableRecord): self.copy( resolved_functions=updated_resolved_functions, num_times_callables_called=num_times_callables_called, - num_times_hit_during_editing=( - num_times_hit_during_editing), renames_needed_after_editing=( renames_needed_after_editing)), unique_function_identifier) @@ -494,17 +474,12 @@ class ProgramCallablesInfo(ImmutableRecord): return ( self.copy( history=history, - num_times_hit_during_editing=( - num_times_hit_during_editing), num_times_callables_called=( num_times_callables_called), renames_needed_after_editing=( renames_needed_after_editing)), func_id) else: - # FIXME: maybe deal with the history over here? - # FIXME: once the code logic is running beautify this part. 
- # many "ifs" can be avoided unique_function_identifier = function.name if (resolved_for_the_first_time or self.num_times_callables_called[function.name] > 1): @@ -534,7 +509,6 @@ class ProgramCallablesInfo(ImmutableRecord): history=history, resolved_functions=updated_resolved_functions, num_times_callables_called=num_times_callables_called, - num_times_hit_during_editing=num_times_hit_during_editing, renames_needed_after_editing=renames_needed_after_editing), Variable(unique_function_identifier)) @@ -576,7 +550,6 @@ class ProgramCallablesInfo(ImmutableRecord): is_being_edited=False, resolved_functions=resolved_functions, num_times_callables_called=num_times_callables_called, - num_times_hit_during_editing={}, renames_needed_after_editing={}) def with_deleted_callable(self, func_id, instances=1): @@ -668,17 +641,4 @@ def iterate_over_kernels_if_given_program(transform_for_single_kernel): return wraps(transform_for_single_kernel)(_collective_transform) -# {{{ ingoring this for now - -# if False and isinstance(function, (ArgExtOp, SegmentedOp)): -# FIXME: ignoring this casse for now -# FIXME: If a kernel has two flavors of ArgExtOp then they are -# overwritten and hence not supported.(for now). -# updated_resolved_functions = self.scoped_functions.copy() -# updated_resolved_functions[function] = in_kernel_callable -# return self.copy(updated_resolved_functions), function.copy() - -# }}} - - # vim: foldmethod=marker diff --git a/loopy/statistics.py b/loopy/statistics.py index 08b7f89e..95e9f62a 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -64,9 +64,9 @@ __doc__ = """ # Qns: # - The variable name, what if multiple kernels use the same name? # - We should also add the cumulative effect on the arguments of callee kernels -# into the caller kernel. -# FIXME: add an error that there is only one callable kernel. disable for -# multiple callable kernels. +# into the caller kernel +# - Make changes to MemAccessInfo to include the effect of several kernels. 
+# - Renovate `count`. # {{{ GuardedPwQPolynomial diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 7a268d06..92b209ac 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -677,16 +677,16 @@ class ResolvedFunction(p.Expression): def __init__(self, function): if isinstance(function, str): function = p.Variable(function) - from loopy.library.reduction import ArgExtOp, SegmentedOp - assert isinstance(function, (p.Variable, ArgExtOp, SegmentedOp)) + from loopy.library.reduction import ReductionOpFunction + assert isinstance(function, (p.Variable, ReductionOpFunction)) self.function = function @property def name(self): - from loopy.library.reduction import ArgExtOp, SegmentedOp + from loopy.library.reduction import ReductionOpFunction if isinstance(self.function, p.Variable): return self.function.name - elif isinstance(self.function, (ArgExtOp, SegmentedOp)): + elif isinstance(self.function, ReductionOpFunction): return self.function else: raise LoopyError("Unexpected function type %s in ResolvedFunction." % -- GitLab From 88d746d0d041435d33aebd2a301855647c054ebe Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 13 Aug 2018 20:38:16 +0530 Subject: [PATCH 11/80] started with beautifying code. --- loopy/program.py | 108 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 102 insertions(+), 6 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index 279228af..1b9d03d4 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -165,6 +165,35 @@ def initialize_program_callables_info_from_kernel( # {{{ program definition class Program(ImmutableRecord): + """ + Records the information about all the callables in a :mod:`loopy` program. + + .. attribute:: name + + An instance of :class:`str`, also the name of the top-most level + :class:`loopy.LoopKernel`. + + .. attribute:: program_callables_info + + An instance of :class:`loopy.program.ProgramCallablesInfo`. + + .. attribute:: target + + An instance of :class:`loopy.target.TargetBase`. 
+ + .. attribute:: func_id_to_in_knl_callables_mappers + + A list of functions of the signature ``(target: TargetBase, + function_indentifier: str)`` that would return an instance of + :class:`loopy.kernel.function_interface.InKernelCallable` or *None*. + + .. note:: + + - To create an instance of :class:`loopy.Program`, it is recommeneded to + go through :method:`loopy.make_kernel`. + - This data structure and its attributes should be considered + immutable, any modifications should be done through :method:`copy`. + """ def __init__(self, name, program_callables_info, @@ -172,8 +201,6 @@ class Program(ImmutableRecord): func_id_to_in_knl_callable_mappers): assert isinstance(program_callables_info, ProgramCallablesInfo) - # FIXME: check if all sanity checks have been covered? - # FIXME: The comments over here may need some attention. assert name in program_callables_info super(Program, self).__init__( @@ -194,6 +221,7 @@ class Program(ImmutableRecord): def copy(self, **kwargs): if 'target' in kwargs: + # target attribute of all the callable kernels should be updated. target = kwargs['target'] new_self = super(Program, self).copy(**kwargs) new_resolved_functions = {} @@ -266,13 +294,43 @@ class Program(ImmutableRecord): @property def root_kernel(self): + """ + Returns an instance of :class:`loopy.LoopKernel` denoting the topmost + level kernel in codegeneration. + + .. note:: + + Syntactic sugar. + """ return self.program_callables_info[self.name].subkernel @property def arg_dict(self): + """ + Returns ``arg_dict`` of the ``root_kernel``. + + .. note:: + + Syntactic sugar. + """ return self.root_kernel.arg_dict + @property + def args(self): + """ + Returns ``args`` of the ``root_kernel``. + + .. note:: + + Syntactic sugar. + """ + return self.root_kernel.args[:] + def with_root_kernel(self, root_kernel): + """ + Returns a copy of *self* with the topmost level kernel as + *root_kernel*. 
+ """ new_in_knl_callable = self.program_callables_info[ self.name].copy(subkernel=root_kernel) new_resolved_functions = ( @@ -283,10 +341,6 @@ class Program(ImmutableRecord): program_callables_info=self.program_callables_info.copy( resolved_functions=new_resolved_functions)) - @property - def args(self): - return self.root_kernel.args[:] - def __call__(self, *args, **kwargs): key = self.target.get_kernel_executor_cache_key(*args, **kwargs) try: @@ -336,6 +390,10 @@ def next_indexed_function_identifier(function): class ResolvedFunctionRenamer(RuleAwareIdentityMapper): + """ + Mapper to rename the resolved functions in an expression according to + *renaming_dict*. + """ def __init__(self, rule_mapping_context, renaming_dict): super(ResolvedFunctionRenamer, self).__init__( rule_mapping_context) @@ -351,6 +409,10 @@ class ResolvedFunctionRenamer(RuleAwareIdentityMapper): def rename_resolved_functions_in_a_single_kernel(kernel, renaming_dict): + """ + Returns a copy of *kernel* with the instances of :class:`ResolvedFunction` + renames according to *renaming_dict*. + """ from loopy.symbolic import SubstitutionRuleMappingContext rule_mapping_context = SubstitutionRuleMappingContext( kernel.substitutions, kernel.get_var_name_generator()) @@ -364,6 +426,40 @@ def rename_resolved_functions_in_a_single_kernel(kernel, # {{{ program callables info class ProgramCallablesInfo(ImmutableRecord): + """ + Records the information of all the callables called in a :class:`loopy.Program`. + + .. attribute:: resolved_functions + + An instance of :class:`dict` that contains a mapping from function + identifier to instances of + :class:`loopy.kernel.function_interface.InKernelCallable` + + .. attribute:: num_times_callables_called + + An instace of :class:`dict` that contains a mapping from function + identifier to :class:`int`, that denotes the number of times the + callable is being called in the entire :class:`loopy.Program`. + + .. 
attribute:: history + + An instance of :class:`dict` that contains a mapping from function + identifier to and instance of :class:`list`that would contain all the + names taken by a function before the current name.(For example: one + possibility could be ``{'sin_1': ['sin', 'sin_0', 'sin_1']}``) + + .. attribute:: is_being_edited + + An instance of :class:`bool` which is intended to aid the working of + :meth:`with_enter_edit_callables_mode`, :meth:`with_callable` and + :meth:`with_exit_edit_callables_mode`. + + .. attribute:: renames_needed_after_editing + + An instance of :class:`dict` which is intended to aid the working of + :meth:`with_enter_edit_callables_mode`, :meth:`with_callable` and + :meth:`with_exit_edit_callables_mode`. + """ def __init__(self, resolved_functions, num_times_callables_called=None, history=None, is_being_edited=False, renames_needed_after_editing={}): -- GitLab From e3277fa2d162f773072109a951f05e24816a88e0 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 13 Aug 2018 21:00:10 +0530 Subject: [PATCH 12/80] changes in program_callables_info design. --- loopy/kernel/__init__.py | 7 +++++++ loopy/program.py | 42 ++++++++++++++++++++++------------------ 2 files changed, 30 insertions(+), 19 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 3f637e53..3b189da5 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -221,6 +221,11 @@ class LoopKernel(ImmutableRecordWithoutPickling): A subclass of :class:`loopy.TargetBase`. + .. attribute:: is_called_from_host + An instance of :class:`bool`. Will be set *False* for the kernel which + would be called from another top level kernels. Default value is + *True*. 
+ """ # {{{ constructor @@ -248,6 +253,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): state=KernelState.INITIAL, target=None, + is_called_from_host=True, overridden_get_grid_sizes_for_insn_ids=None, _cached_written_variables=None): @@ -361,6 +367,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): options=options, state=state, target=target, + is_called_from_host=is_called_from_host, overridden_get_grid_sizes_for_insn_ids=( overridden_get_grid_sizes_for_insn_ids), _cached_written_variables=_cached_written_variables) diff --git a/loopy/program.py b/loopy/program.py index 1b9d03d4..0dc327aa 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -460,9 +460,9 @@ class ProgramCallablesInfo(ImmutableRecord): :meth:`with_enter_edit_callables_mode`, :meth:`with_callable` and :meth:`with_exit_edit_callables_mode`. """ - def __init__(self, resolved_functions, num_times_callables_called=None, - history=None, is_being_edited=False, - renames_needed_after_editing={}): + def __init__(self, resolved_functions, + num_times_callables_called=None, history=None, + is_being_edited=False, renames_needed_after_editing={}): if num_times_callables_called is None: num_times_callables_called = dict((func_id, 1) for func_id in @@ -487,11 +487,22 @@ class ProgramCallablesInfo(ImmutableRecord): update_persistent_hash = LoopKernel.update_persistent_hash + def add_callable(self, function, in_kernel_callable): + + history[unique_function_identifier] = set( + [unique_function_identifier]) + pass + + def with_updated_num_times_being_called(self): + root_kernel_name, = [in_knl_callable.subkernel.name for in_knl_callable + in self.resolved_functions.values() if + isinstance(in_knl_callable, CallableKernel) and + in_knl_callable.is_called_from_host] + def with_edit_callables_mode(self): return self.copy(is_being_edited=True) - def with_callable(self, function, in_kernel_callable, - resolved_for_the_first_time=False): + def with_callable(self, function, in_kernel_callable): """ :arg 
function: An instance of :class:`pymbolic.primitives.Variable` or :class:`loopy.library.reduction.ReductionOpFunction`. @@ -538,8 +549,7 @@ class ProgramCallablesInfo(ImmutableRecord): if isinstance(function, ReductionOpFunction): unique_function_identifier = function.copy() - if not resolved_for_the_first_time: - num_times_callables_called[function] -= 1 + num_times_callables_called[function] -= 1 num_times_callables_called[unique_function_identifier] = 1 @@ -561,12 +571,11 @@ class ProgramCallablesInfo(ImmutableRecord): for func_id, in_knl_callable in self.resolved_functions.items(): if in_knl_callable == in_kernel_callable: num_times_callables_called[func_id] += 1 - if not resolved_for_the_first_time: - num_times_callables_called[function.name] -= 1 - if num_times_callables_called[function.name] == 0: - renames_needed_after_editing[func_id] = function.name + num_times_callables_called[function.name] -= 1 + if num_times_callables_called[function.name] == 0: + renames_needed_after_editing[func_id] = function.name - history[func_id] = history[func_id] | set([function.name]) + history[func_id] = history[func_id] | set([function.name]) return ( self.copy( history=history, @@ -577,16 +586,13 @@ class ProgramCallablesInfo(ImmutableRecord): func_id) else: unique_function_identifier = function.name - if (resolved_for_the_first_time or - self.num_times_callables_called[function.name] > 1): + if self.num_times_callables_called[function.name] > 1: while unique_function_identifier in self.resolved_functions: unique_function_identifier = ( next_indexed_function_identifier( unique_function_identifier)) - if not resolved_for_the_first_time: - num_times_callables_called[function.name] -= 1 - + num_times_callables_called[function.name] -= 1 num_times_callables_called[unique_function_identifier] = 1 updated_resolved_functions = self.resolved_functions.copy() @@ -597,8 +603,6 @@ class ProgramCallablesInfo(ImmutableRecord): history[unique_function_identifier] = ( 
history[function.name] | set([unique_function_identifier])) else: - history[unique_function_identifier] = set( - [unique_function_identifier]) return ( self.copy( -- GitLab From a4ebe862bb8e434fc67d85c4b9201bad12577975 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 14 Aug 2018 09:17:03 +0530 Subject: [PATCH 13/80] new design to interface with program callables info. --- loopy/preprocess.py | 6 +- loopy/program.py | 448 ++++++++++++++++++++++++------------ loopy/transform/callable.py | 24 +- loopy/transform/fusion.py | 117 +++++----- loopy/type_inference.py | 10 +- 5 files changed, 384 insertions(+), 221 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index bf23c4a4..56db777b 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2269,6 +2269,9 @@ def traverse_to_infer_arg_descr(kernel, program_callables_info): def infer_arg_descr(program): root_kernel_callable = program.program_callables_info[program.name] + from loopy.program import count_callables_in_program_callables_info + old_callables_count = count_callables_in_program_callables_info( + program.program_callables_info) program_callables_info = ( program.program_callables_info.with_edit_callables_mode()) root_kernel = program.root_kernel @@ -2280,7 +2283,8 @@ def infer_arg_descr(program): program_callables_info, _ = program_callables_info.with_callable(program.name, new_root_kernel_callable) - program_callables_info = program_callables_info.with_exit_edit_callables_mode() + program_callables_info = program_callables_info.with_exit_edit_callables_mode( + old_callables_count) return program.copy(program_callables_info=program_callables_info) diff --git a/loopy/program.py b/loopy/program.py index 0dc327aa..32869d26 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -29,12 +29,20 @@ from pytools import ImmutableRecord, memoize_method from pymbolic.primitives import Variable from functools import wraps -from loopy.symbolic import RuleAwareIdentityMapper, ResolvedFunction 
+from loopy.symbolic import ( + RuleAwareIdentityMapper, ResolvedFunction, CombineMapper) from loopy.kernel.function_interface import ( CallableKernel, ScalarCallable) +from loopy.kernel.instruction import ( + MultiAssignmentBase, CInstruction, _DataObliviousInstruction) from loopy.diagnostic import LoopyError +from loopy.library.reduction import ReductionOpFunction from loopy.kernel import LoopKernel +from collections import Counter +from pymbolic.primitives import Call, CallWithKwargs + +# FIXME: autofunction/autoclass?? ~KK class ResolvedFunctionMarker(RuleAwareIdentityMapper): @@ -60,7 +68,6 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): super(ResolvedFunctionMarker, self).__init__(rule_mapping_context) self.kernel = kernel self.program_callables_info = program_callables_info - # FIXME: function_resolvesrs looks like a very bad name change it self.function_id_to_in_knl_callable_mappers = ( function_id_to_in_knl_callable_mappers) @@ -71,7 +78,6 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): :arg:`identifier` is known to any kernel function scoper, otherwise returns *None*. 
""" - # FIXME change docs for func_id_to_in_knl_callable_mapper in ( self.function_id_to_in_knl_callable_mappers): # fixme: do we really need to given target for the function @@ -83,7 +89,6 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): return None def map_call(self, expr, expn_state): - from pymbolic.primitives import Call, CallWithKwargs from loopy.symbolic import parse_tagged_name name, tag = parse_tagged_name(expr.function) @@ -109,8 +114,8 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): # resolved in-kernel callable self.program_callables_info, new_func_id = ( - self.program_callables_info.with_callable(expr.function, - in_knl_callable, True)) + self.program_callables_info.with_add_callable(expr.function, + in_knl_callable)) return type(expr)( ResolvedFunction(new_func_id), tuple(self.rec(child, expn_state) @@ -135,10 +140,15 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): return super(ResolvedFunctionMarker, self).map_reduction(expr, expn_state) -def initialize_program_callables_info_from_kernel( - kernel, func_id_to_kernel_callable_mappers): +def initialize_program_callables_info_from_kernel(kernel): + """ + Returns an instance of :class:`loopy.ProgramCallablesInfo`, by resolving + the functions based on :mod:`loopy`'s default function resolvers. + """ + # collect the default function resolvers + func_id_to_kernel_callable_mappers = ( + default_func_id_to_kernel_callable_mappers(kernel.target)) program_callables_info = ProgramCallablesInfo({}) - program_callables_info = program_callables_info.with_edit_callables_mode() from loopy.symbolic import SubstitutionRuleMappingContext rule_mapping_context = SubstitutionRuleMappingContext( @@ -148,16 +158,17 @@ def initialize_program_callables_info_from_kernel( rule_mapping_context, kernel, program_callables_info, func_id_to_kernel_callable_mappers) - # scoping fucntions and collecting the scoped functions + # mark the functions as "Resolved" in the expression nodes. 
kernel_with_functions_resolved = rule_mapping_context.finish_kernel( resolved_function_marker.map_kernel(kernel)) + # collect the update program_callables_info program_callables_info = resolved_function_marker.program_callables_info callable_kernel = CallableKernel(kernel_with_functions_resolved) - program_callables_info, _ = program_callables_info.with_callable( - Variable(kernel.name), callable_kernel, True) - program_callables_info = ( - program_callables_info.with_exit_edit_callables_mode()) + + # add the callable kernel to the program_callables_info + program_callables_info, _ = program_callables_info.with_add_callable( + Variable(kernel.name), callable_kernel) return program_callables_info @@ -357,33 +368,31 @@ class Program(ImmutableRecord): # }}} -def next_indexed_function_identifier(function): +def next_indexed_function_identifier(function_id): """ Returns an instance of :class:`str` with the next indexed-name in the sequence for the name of *function*. *Example:* ``'sin_0'`` will return ``'sin_1'``. - :arg function: Either an instance of :class:`str`, - :class:`pymbolic.primitives.Variable` , - :class:`loopy.reduction.ReductionOpFunction`. + :arg function_id: Either an instance of :class:`str`. 
""" - from loopy.library.reduction import ReductionOpFunction - if isinstance(function, ReductionOpFunction): - return function.copy() - elif isinstance(function, str): - function = Variable(function) - assert isinstance(function, Variable) + # {{{ sanity checks + + assert isinstance(function_id, str) + + # }}} + func_name = re.compile(r"^(?P\S+?)_(?P\d+?)$") - match = func_name.match(function.name) + match = func_name.match(function_id) if match is None: - if function.name[-1] == '_': - return "{old_name}0".format(old_name=function.name) + if function_id[-1] == '_': + return "{old_name}0".format(old_name=function_id) else: - return "{old_name}_0".format(old_name=function.name) + return "{old_name}_0".format(old_name=function_id) return "{alpha}_{num}".format(alpha=match.group('alpha'), num=int(match.group('num'))+1) @@ -423,6 +432,115 @@ def rename_resolved_functions_in_a_single_kernel(kernel, resolved_function_renamer.map_kernel(kernel))) +# {{{ counting helpers + +class CallablesCountingMapper(CombineMapper): + """ + Returns an instance of :class:`collections.Counter` with the count of + callables registered in *program_callables_info*. + + .. attribute:: program_callables_info + + An instance of :class:`loopy.program.ProgramCallablesInfo`. 
+ """ + def __init__(self, program_callables_info): + self.program_callables_info = program_callables_info + + def combine(self, values): + return sum(values, Counter()) + + def map_call(self, expr): + + if isinstance(expr, CallWithKwargs): + kw_parameters = expr.kw_parameters + else: + assert isinstance(expr, Call) + kw_parameters = {} + + if isinstance(expr.function, (ResolvedFunction)): + in_knl_callable = self.program_callables_info[expr.function.name] + if isinstance(in_knl_callable, ScalarCallable): + return (Counter([expr.function.name]) + + self.combine((self.rec(child) for child in expr.parameters + + tuple(kw_parameters.values())))) + + elif isinstance(in_knl_callable, CallableKernel): + + # callable kernels have more callables in them. + callables_count_in_subkernel = ( + count_callables_in_kernel( + in_knl_callable.subkernel, + self.program_callables_info)) + + return (Counter([expr.function.name]) + + self.combine((self.rec(child) for child in expr.parameters + + tuple(kw_parameters.values())))) + ( + callables_count_in_subkernel) + else: + raise NotImplementedError("Unknown callable type %s." % ( + type)) + else: + return ( + self.combine((self.rec(child) for child in expr.parameters + + tuple(kw_parameters.values())))) + + map_call_with_kwargs = map_call + + def map_constant(self, expr): + return Counter() + + map_variable = map_constant + map_function_symbol = map_constant + map_tagged_variable = map_constant + map_type_cast = map_constant + + +# FIXME: @memoize_method +def count_callables_in_kernel(kernel, program_callables_info): + """ + Returns an instance of :class:`collections.Counter` representing the number + of callables in the *kernel* that are registered in + *program_callables_info*. 
+ """ + assert isinstance(kernel, LoopKernel) + callables_count = Counter() + callables_counting_mapper = CallablesCountingMapper( + program_callables_info) + + for insn in kernel.instructions: + if isinstance(insn, MultiAssignmentBase): + callables_count += ( + callables_counting_mapper(insn.expression)) + elif isinstance(insn, (_DataObliviousInstruction, CInstruction)): + pass + else: + raise NotImplementedError("Unknown instruction type %s." % ( + type(insn))) + + return callables_count + + +# FIXME: @memoize_method +def count_callables_in_program_callables_info(program_callables_info): + """ + Returns an instance of :class:`collection.Counter` representing the number + of times the callables is called in program_callables_info. + """ + root_kernel_name, = [in_knl_callable.subkernel.name for in_knl_callable + in program_callables_info.values() if + isinstance(in_knl_callable, CallableKernel) and + in_knl_callable.subkernel.is_called_from_host] + + from collections import Counter + callables_count = Counter([root_kernel_name]) + callables_count += ( + count_callables_in_kernel(program_callables_info[ + root_kernel_name].subkernel, program_callables_info)) + return callables_count + +# }}} + + # {{{ program callables info class ProgramCallablesInfo(ImmutableRecord): @@ -435,12 +553,6 @@ class ProgramCallablesInfo(ImmutableRecord): identifier to instances of :class:`loopy.kernel.function_interface.InKernelCallable` - .. attribute:: num_times_callables_called - - An instace of :class:`dict` that contains a mapping from function - identifier to :class:`int`, that denotes the number of times the - callable is being called in the entire :class:`loopy.Program`. - .. 
attribute:: history An instance of :class:`dict` that contains a mapping from function @@ -453,54 +565,92 @@ class ProgramCallablesInfo(ImmutableRecord): An instance of :class:`bool` which is intended to aid the working of :meth:`with_enter_edit_callables_mode`, :meth:`with_callable` and :meth:`with_exit_edit_callables_mode`. - - .. attribute:: renames_needed_after_editing - - An instance of :class:`dict` which is intended to aid the working of - :meth:`with_enter_edit_callables_mode`, :meth:`with_callable` and - :meth:`with_exit_edit_callables_mode`. """ def __init__(self, resolved_functions, - num_times_callables_called=None, history=None, - is_being_edited=False, renames_needed_after_editing={}): + history=None, is_being_edited=False): - if num_times_callables_called is None: - num_times_callables_called = dict((func_id, 1) for func_id in - resolved_functions) if history is None: history = dict((func_id, set([func_id])) for func_id in resolved_functions) super(ProgramCallablesInfo, self).__init__( resolved_functions=resolved_functions, - num_times_callables_called=num_times_callables_called, history=history, - is_being_edited=is_being_edited, - renames_needed_after_editing=renames_needed_after_editing) + is_being_edited=is_being_edited) hash_fields = ( "resolved_functions", - "num_times_callables_called", "is_being_edited", - "renames_needed_after_editing", "history") update_persistent_hash = LoopKernel.update_persistent_hash - def add_callable(self, function, in_kernel_callable): + def with_add_callable(self, function, in_kernel_callable): + """ + Returns a copy of *self* with the *function* associated with the + *in_kernel_callable*. + """ + # note: this does not require the edit mode to be true. + # the reason for the edit mode is that we need to take care of the + # renaming that might be needed to be done + # PS: delete this note? 
+ history = self.history.copy() + + if in_kernel_callable in self.resolved_functions.values(): + # the callable already exists, implies return the function + # identifier corresposing to that callable. + for func_id, in_knl_callable in self.resolved_functions.items(): + if in_knl_callable == in_kernel_callable: + history[func_id] = history[func_id] | set([function.name]) + return ( + self.copy( + history=history), + func_id) + else: + + # {{{ handle ReductionOpFunction + + if isinstance(function, ReductionOpFunction): + unique_function_identifier = function.copy() + updated_resolved_functions = self.resolved_functions.copy() + updated_resolved_functions[unique_function_identifier] = ( + in_kernel_callable) + + return ( + self.copy( + resolved_functions=updated_resolved_functions), + unique_function_identifier) + + # }}} + + unique_function_identifier = function.name + while unique_function_identifier in self.resolved_functions: + unique_function_identifier = ( + next_indexed_function_identifier( + unique_function_identifier)) + + updated_resolved_functions = self.resolved_functions.copy() + updated_resolved_functions[unique_function_identifier] = ( + in_kernel_callable) history[unique_function_identifier] = set( [unique_function_identifier]) - pass - def with_updated_num_times_being_called(self): - root_kernel_name, = [in_knl_callable.subkernel.name for in_knl_callable - in self.resolved_functions.values() if - isinstance(in_knl_callable, CallableKernel) and - in_knl_callable.is_called_from_host] + return ( + self.copy( + history=history, + resolved_functions=updated_resolved_functions), + Variable(unique_function_identifier)) def with_edit_callables_mode(self): - return self.copy(is_being_edited=True) + """ + Initiates *self* for a walk traversal through all the callables. + """ + # PS: I don't see a need for this method right now. + # This is just for validation purposes, maybe needs to disapper if you + # find a better solution? 
+ return self.copy( + is_being_edited=True) def with_callable(self, function, in_kernel_callable): """ @@ -512,27 +662,24 @@ class ProgramCallablesInfo(ImmutableRecord): .. note:: - Assumes that each callable is touched atmost once, the internal - working of this function fails if that is violated. + - Use :meth:`with_add_callable` if a callable is being resolved for the + first time. """ - # FIXME: add a note about using enter and exit. ~KK - # FIXME: think about a better idea of "with_added_callable" this would - # be more convenient for developer-faced usage. ~KK - # FIXME: Is this is a bad code? Yes. - # Is there a better alternative to it. Definitely maybe. - # But I don't want to spend the next 182 years of my life optimizing - # some scheme, without even implmenting it to some problem! + + # {{{ non-edit mode if not self.is_being_edited: if function.name in self.resolved_functions and ( self.resolved_functions[function.name] == in_kernel_callable): + # if not being edited, check that the given function is + # equal to the the old version of the callable. 
return self, function else: print('Old: ', self.resolved_functions[function.name]) print('New: ', in_kernel_callable) - raise LoopyError("Use 'enter_edit_callables_mode' first.") + raise LoopyError("Use 'with_enter_edit_callables_mode' first.") - from loopy.library.reduction import ReductionOpFunction + # }}} # {{{ sanity checks @@ -543,87 +690,90 @@ class ProgramCallablesInfo(ImmutableRecord): # }}} - renames_needed_after_editing = self.renames_needed_after_editing.copy() - num_times_callables_called = self.num_times_callables_called.copy() history = self.history.copy() - if isinstance(function, ReductionOpFunction): - unique_function_identifier = function.copy() - num_times_callables_called[function] -= 1 - - num_times_callables_called[unique_function_identifier] = 1 - - updated_resolved_functions = self.resolved_functions.copy() - updated_resolved_functions[unique_function_identifier] = ( - in_kernel_callable) - - return ( - self.copy( - resolved_functions=updated_resolved_functions, - num_times_callables_called=num_times_callables_called, - renames_needed_after_editing=( - renames_needed_after_editing)), - unique_function_identifier) - if in_kernel_callable in self.resolved_functions.values(): - # the callable already exists, implies return the function - # identifier corresposing to that callable. + + # the callable already exists, hence return the function + # identifier corresponding to that callable. 
for func_id, in_knl_callable in self.resolved_functions.items(): if in_knl_callable == in_kernel_callable: - num_times_callables_called[func_id] += 1 - num_times_callables_called[function.name] -= 1 - if num_times_callables_called[function.name] == 0: - renames_needed_after_editing[func_id] = function.name - history[func_id] = history[func_id] | set([function.name]) return ( self.copy( - history=history, - num_times_callables_called=( - num_times_callables_called), - renames_needed_after_editing=( - renames_needed_after_editing)), + history=history), func_id) else: - unique_function_identifier = function.name - if self.num_times_callables_called[function.name] > 1: - while unique_function_identifier in self.resolved_functions: - unique_function_identifier = ( - next_indexed_function_identifier( - unique_function_identifier)) + # {{{ handle ReductionOpFunction + + if isinstance(function, ReductionOpFunction): + unique_function_identifier = function.copy() + updated_resolved_functions = self.resolved_functions.copy() + updated_resolved_functions[unique_function_identifier] = ( + in_kernel_callable) - num_times_callables_called[function.name] -= 1 - num_times_callables_called[unique_function_identifier] = 1 + return ( + self.copy( + resolved_functions=updated_resolved_functions), + unique_function_identifier) + + # }}} + unique_function_identifier = function.name + while unique_function_identifier in self.resolved_functions: + unique_function_identifier = ( + next_indexed_function_identifier( + unique_function_identifier)) updated_resolved_functions = self.resolved_functions.copy() updated_resolved_functions[unique_function_identifier] = ( in_kernel_callable) - if not resolved_for_the_first_time: - history[unique_function_identifier] = ( - history[function.name] | set([unique_function_identifier])) - else: + history[unique_function_identifier] = ( + history[function.name] | set([unique_function_identifier])) return ( self.copy( history=history, - 
resolved_functions=updated_resolved_functions, - num_times_callables_called=num_times_callables_called, - renames_needed_after_editing=renames_needed_after_editing), + resolved_functions=updated_resolved_functions), Variable(unique_function_identifier)) - def with_exit_edit_callables_mode(self): + def with_exit_edit_callables_mode(self, old_callables_count): + """ + Returns a copy of *self* with renaming of the callables done whenver + possible. + + *For example: * If all the ``sin`` got diverged as ``sin_0, sin_1``, + then all the renaming is done such that one of flavors of the function + is renamed back to ``sin``. + """ + + new_callables_count = count_callables_in_program_callables_info( + self) + history = self.history.copy() + renames_needed = {} + assert self.is_being_edited - num_times_callables_called = {} + # NOTE:(to self by KK) + # all we need to do is change the name of the variables that were seen + # in old_callables_count but are no longer available. + # Using these 2 figure out the renames needed. + for old_func_id in old_callables_count-new_callables_count: + # this implies that all the function instances having the name + # "func_id" have been renamed to something else. + for new_func_id in ( + new_callables_count.keys()-renames_needed.keys()): + if old_func_id in history[new_func_id]: + renames_needed[new_func_id] = old_func_id + resolved_functions = {} - history = self.history.copy() for func_id, in_knl_callable in self.resolved_functions.items(): if isinstance(in_knl_callable, CallableKernel): + # If callable kernel, perform renames. 
old_subkernel = in_knl_callable.subkernel new_subkernel = rename_resolved_functions_in_a_single_kernel( - old_subkernel, self.renames_needed_after_editing) + old_subkernel, renames_needed) in_knl_callable = ( in_knl_callable.copy(subkernel=new_subkernel)) elif isinstance(in_knl_callable, ScalarCallable): @@ -632,44 +782,22 @@ class ProgramCallablesInfo(ImmutableRecord): raise NotImplementedError("Unknown callable type %s." % type(in_knl_callable).__name__) - if func_id in self.renames_needed_after_editing: + if func_id in renames_needed: + # If function name itself in renames change the key of the + # dict. history.pop(func_id) - new_func_id = self.renames_needed_after_editing[func_id] + new_func_id = renames_needed[func_id] resolved_functions[new_func_id] = ( in_knl_callable) - num_times_callables_called[new_func_id] = ( - self.num_times_callables_called[func_id]) - else: resolved_functions[func_id] = in_knl_callable - num_times_callables_called[func_id] = ( - self.num_times_callables_called[func_id]) return self.copy( is_being_edited=False, - resolved_functions=resolved_functions, - num_times_callables_called=num_times_callables_called, - renames_needed_after_editing={}) - - def with_deleted_callable(self, func_id, instances=1): - num_times_callables_called = self.num_times_callables_called.copy() - history = self.history.copy() - resolved_functions = self.resolved_functions.copy() - - assert instances <= num_times_callables_called[func_id] + resolved_functions=resolved_functions) - num_times_callables_called[func_id] -= instances - - if num_times_callables_called[func_id] == 0: - num_times_callables_called.pop(func_id) - history.pop(func_id) - resolved_functions.pop(func_id) - - return self.copy( - resolved_functions=resolved_functions, - num_times_callables_called=num_times_callables_called, - history=history) + # {{{ behave like a dict(syntactic sugar) def __getitem__(self, item): return self.resolved_functions[item] @@ -683,11 +811,16 @@ class 
ProgramCallablesInfo(ImmutableRecord): def values(self): return self.resolved_functions.values() + # }}} # }}} def default_func_id_to_kernel_callable_mappers(target): + """ + Returns a list of functions that are provided through *target* by deafault. + """ + # FIXME: name scopers is confusing!(change it to something else.) from loopy.library.function import loopy_specific_callable_scopers return ( @@ -695,11 +828,18 @@ def default_func_id_to_kernel_callable_mappers(target): target.get_device_ast_builder().function_scopers())) +# {{{ helper functions + def make_program_from_kernel(kernel): + """ + Returns an instance of :class:`loopy.Program` with the *kernel* as the root + kernel. + """ - program_callables_info = initialize_program_callables_info_from_kernel(kernel, - default_func_id_to_kernel_callable_mappers(kernel.target)) + # get the program callables info + program_callables_info = initialize_program_callables_info_from_kernel(kernel) + # get the program from program callables info program = Program( name=kernel.name, program_callables_info=program_callables_info, @@ -711,6 +851,12 @@ def make_program_from_kernel(kernel): def iterate_over_kernels_if_given_program(transform_for_single_kernel): + """ + Function wrapper for transformations of the type ``transform(kernel: + LoopKernel, *args, **kwargs): LoopKernel``. Returns a function with the + ``transform`` being implemented on all of the callable kernels in a + :class:`loopy.Program`. 
+ """ def _collective_transform(program_or_kernel, *args, **kwargs): if isinstance(program_or_kernel, Program): program = program_or_kernel @@ -740,5 +886,7 @@ def iterate_over_kernels_if_given_program(transform_for_single_kernel): return wraps(transform_for_single_kernel)(_collective_transform) +# }}} + # vim: foldmethod=marker diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 9d9935ab..90f53095 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -35,10 +35,18 @@ __doc__ = """ # {{{ register function lookup -def resolved_callables_from_function_lookup(program, - func_id_to_kernel_callable_mapper): +def _resolved_callables_from_function_lookup(program, + func_id_to_in_kernel_callable_mapper): + """ + Returns a copy of *program* with the expression nodes marked "Resolved" + if any match is found through the given + *func_id_to_in_kernel_callable_mapper*. + + :arg func_id_to_in_kernel_callable_mapper: A function with signature + ``(target, identifier)`` that returns either an instance of + :class:`loopy.InKernelCallable` or *None*. 
+ """ program_callables_info = program.program_callables_info - program_callables_info = program_callables_info.with_edit_callables_mode() callable_knls = dict( (func_id, in_knl_callable) for func_id, in_knl_callable in @@ -55,9 +63,8 @@ def resolved_callables_from_function_lookup(program, resolved_function_marker = ResolvedFunctionMarker( rule_mapping_context, kernel, program_callables_info, - [func_id_to_kernel_callable_mapper]) + [func_id_to_in_kernel_callable_mapper]) - # scoping fucntions and collecting the scoped functions new_subkernel = rule_mapping_context.finish_kernel( resolved_function_marker.map_kernel(kernel)) program_callables_info = resolved_function_marker.program_callables_info @@ -65,9 +72,6 @@ def resolved_callables_from_function_lookup(program, edited_callable_knls[func_id] = in_knl_callable.copy( subkernel=new_subkernel) - program_callables_info = ( - program_callables_info.with_exit_edit_callables_mode()) - new_resolved_functions = {} for func_id, in_knl_callable in program_callables_info.items(): @@ -85,7 +89,7 @@ def resolved_callables_from_function_lookup(program, def register_function_id_to_in_knl_callable_mapper(program, func_id_to_in_knl_callable_mapper): """ - Returns a copy of *kernel* with the *function_lookup* registered. + Returns a copy of *program* with the *function_lookup* registered. 
:arg func_id_to_in_knl_callable_mapper: A function of signature ``(target, identifier)`` returning a @@ -105,7 +109,7 @@ def register_function_id_to_in_knl_callable_mapper(program, new_func_id_mappers = program.func_id_to_in_knl_callable_mappers + ( [func_id_to_in_knl_callable_mapper]) - program = resolved_callables_from_function_lookup(program, + program = _resolved_callables_from_function_lookup(program, func_id_to_in_knl_callable_mapper) new_program = program.copy( diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py index d43ce025..f2e62368 100644 --- a/loopy/transform/fusion.py +++ b/loopy/transform/fusion.py @@ -292,50 +292,6 @@ def _fuse_two_kernels(knla, knlb): def fuse_loop_kernels(kernels, suffixes=None, data_flow=None): - """Return a kernel that performs all the operations in all entries - of *kernels*. - - :arg kernels: A list of :class:`loopy.LoopKernel` instances to be fused. - :arg suffixes: If given, must be a list of strings of a length matching - that of *kernels*. This will be used to disambiguate the names - of temporaries, as described below. - :arg data_flow: A list of data dependencies - ``[(var_name, from_kernel, to_kernel), ...]``. - Based on this, the fuser will create dependencies between all - writers of *var_name* in ``kernels[from_kernel]`` to - readers of *var_name* in ``kernels[to_kernel]``. - *from_kernel* and *to_kernel* are indices into *kernels*. - - The components of the kernels are fused as follows: - - * The resulting kernel will have a domain involving all the inames - and parameters occurring across *kernels*. - Inames with matching names across *kernels* are fused in such a way - that they remain a single iname in the fused kernel. - Use :func:`loopy.rename_iname` if this is not desired. - - * The projection of the domains of each pair of kernels onto their - common subset of inames must match in order for fusion to - succeed. - - * Assumptions are fused by taking their conjunction. 
- - * If kernel arguments with matching names are encountered across - *kernels*, their declarations must match in order for fusion to - succeed. - - * Temporaries are automatically renamed to remain uniquely associated - with each instruction stream. - - * The resulting kernel will contain all instructions from each entry - of *kernels*. Clashing instruction IDs will be renamed to ensure - uniqueness. - - .. versionchanged:: 2016.2 - - *data_flow* was added in version 2016.2 - """ - assert all(isinstance(knl, LoopKernel) for knl in kernels) kernels = list(kernels) @@ -419,8 +375,54 @@ def fuse_loop_kernels(kernels, suffixes=None, data_flow=None): def fuse_kernels(programs, suffixes=None, data_flow=None): + """Return a kernel that performs all the operations in all entries + of *kernels*. + + :arg kernels: A list of :class:`loopy.LoopKernel` instances to be fused. + :arg suffixes: If given, must be a list of strings of a length matching + that of *kernels*. This will be used to disambiguate the names + of temporaries, as described below. + :arg data_flow: A list of data dependencies + ``[(var_name, from_kernel, to_kernel), ...]``. + Based on this, the fuser will create dependencies between all + writers of *var_name* in ``kernels[from_kernel]`` to + readers of *var_name* in ``kernels[to_kernel]``. + *from_kernel* and *to_kernel* are indices into *kernels*. + + The components of the kernels are fused as follows: + + * The resulting kernel will have a domain involving all the inames + and parameters occurring across *kernels*. + Inames with matching names across *kernels* are fused in such a way + that they remain a single iname in the fused kernel. + Use :func:`loopy.rename_iname` if this is not desired. + + * The projection of the domains of each pair of kernels onto their + common subset of inames must match in order for fusion to + succeed. + + * Assumptions are fused by taking their conjunction. 
+ + * If kernel arguments with matching names are encountered across + *kernels*, their declarations must match in order for fusion to + succeed. + + * Temporaries are automatically renamed to remain uniquely associated + with each instruction stream. + + * The resulting kernel will contain all instructions from each entry + of *kernels*. Clashing instruction IDs will be renamed to ensure + uniqueness. + + .. versionchanged:: 2016.2 + + *data_flow* was added in version 2016.2 + """ + + # all the resolved functions in programs must be registered in + # main_program_callables_info main_prog_callables_info = ( - programs[0].program_callables_info.with_edit_callables_mode()) + programs[0].program_callables_info) old_root_kernel_callable = ( programs[0].program_callables_info[programs[0].name]) kernels = [programs[0].root_kernel] @@ -431,17 +433,22 @@ def fuse_kernels(programs, suffixes=None, data_flow=None): renames_needed = {} for old_func_id, in_knl_callable in prog.program_callables_info.items(): if isinstance(in_knl_callable, CallableKernel): + # Fusing programs with multiple callable kernels is tough. + # Reason: Need to first figure out the order in which the + # callable kernels must be resolved into + # main_program_callables_info, because of renaming is + # needed to be done in the callable kernels before registering. + # Hence disabling it until required. if in_knl_callable.name != prog.name: raise LoopyError("fuse_kernels cannot fuse programs with " "multiple callable kernels.") + + # root kernel are dealt at the end after performing all the + # renaming. 
continue - num_times_called = ( - prog.program_callables_info.num_times_callables_called[ - old_func_id]) - for i in range(num_times_called): - main_prog_callables_info, new_func_id = ( - main_prog_callables_info.with_callables(var(old_func_id), - in_knl_callable, True)) + main_prog_callables_info, new_func_id = ( + main_prog_callables_info.with_add_callable(var(old_func_id), + in_knl_callable)) if old_func_id != new_func_id: renames_needed[old_func_id] = new_func_id @@ -456,12 +463,10 @@ def fuse_kernels(programs, suffixes=None, data_flow=None): new_root_kernel_callable = old_root_kernel_callable.copy( subkernel=new_root_kernel.copy(name=programs[0].name)) - main_prog_callables_info, _ = main_prog_callables_info.with_callable( + # TODO: change the name of the final root kernel. + main_prog_callables_info, _ = main_prog_callables_info.with_add_callable( var(programs[0].name), new_root_kernel_callable) - main_prog_callables_info = ( - main_prog_callables_info.with_exit_edit_callables_mode()) - return programs[0].copy( program_callables_info=main_prog_callables_info) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 3ae9a142..ab37519e 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -52,7 +52,7 @@ def _debug(kernel, s, *args): def get_return_types_as_tuple(arg_id_to_dtype): """Returns the types of arguments in a tuple format. - :param arg_id_to_dtype: An instance of :class:`dict` which denotes a + :arg arg_id_to_dtype: An instance of :class:`dict` which denotes a mapping from the arguments to their inferred types. 
""" return_arg_id_to_dtype = dict((id, dtype) for id, dtype in @@ -894,6 +894,9 @@ def infer_unknown_types(program, expect_completion=False): program_callables_info[program.name]) type_uninferred_root_kernel = type_uninferred_knl_callable.subkernel + from loopy.program import count_callables_in_program_callables_info + old_callables_count = count_callables_in_program_callables_info( + program_callables_info) program_callables_info = ( program.program_callables_info.with_edit_callables_mode()) root_kernel, program_callables_info = ( @@ -910,10 +913,9 @@ def infer_unknown_types(program, expect_completion=False): type_inferred_knl_callable)) program_callables_info = ( - program_callables_info.with_exit_edit_callables_mode()) + program_callables_info.with_exit_edit_callables_mode( + old_callables_count)) - # FIXME: maybe put all of this in a function? - # need to infer functions that were left out during inference return program.copy(program_callables_info=program_callables_info) # }}} -- GitLab From 42229e028ba32c132fde98deee8edec002354131 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 14 Aug 2018 11:23:35 +0530 Subject: [PATCH 14/80] much better design for program callables info. --- loopy/program.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index 32869d26..e3a527ee 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -526,6 +526,8 @@ def count_callables_in_program_callables_info(program_callables_info): Returns an instance of :class:`collection.Counter` representing the number of times the callables is called in program_callables_info. 
""" + # should raise an error if there are more than one root kernels(which is + # illegal) root_kernel_name, = [in_knl_callable.subkernel.name for in_knl_callable in program_callables_info.values() if isinstance(in_knl_callable, CallableKernel) and @@ -636,6 +638,9 @@ class ProgramCallablesInfo(ImmutableRecord): history[unique_function_identifier] = set( [unique_function_identifier]) + if unique_function_identifier == 'loopy_kernel_0': + 1/0 + return ( self.copy( history=history, @@ -719,10 +724,16 @@ class ProgramCallablesInfo(ImmutableRecord): # }}} unique_function_identifier = function.name - while unique_function_identifier in self.resolved_functions: - unique_function_identifier = ( - next_indexed_function_identifier( - unique_function_identifier)) + + if isinstance(in_kernel_callable, CallableKernel) and ( + in_kernel_callable.subkernel.is_called_from_host): + # special treatment if the callable is the root kernel + pass + else: + while unique_function_identifier in self.resolved_functions: + unique_function_identifier = ( + next_indexed_function_identifier( + unique_function_identifier)) updated_resolved_functions = self.resolved_functions.copy() updated_resolved_functions[unique_function_identifier] = ( -- GitLab From fa0fb70b114f3727a3683488e2cc55c900081873 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 14 Aug 2018 12:22:50 +0530 Subject: [PATCH 15/80] deal with reduction callables. 
--- loopy/program.py | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index e3a527ee..7010e110 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -135,8 +135,9 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): in_knl_callable = self.find_in_knl_callable_from_identifier(func_id) assert in_knl_callable is not None self.program_callables_info, _ = ( - self.program_callables_info.with_callable(func_id, - in_knl_callable, True)) + self.program_callables_info.with_add_callable(func_id, + in_knl_callable)) + # FIXME: where do you deal with the parameters? ~KK return super(ResolvedFunctionMarker, self).map_reduction(expr, expn_state) @@ -486,6 +487,10 @@ class CallablesCountingMapper(CombineMapper): map_call_with_kwargs = map_call + def map_reduction(self, expr): + return Counter(expr.operation.get_scalar_callables()) + ( + super(CallablesCountingMapper, self).map_reduction(expr)) + def map_constant(self, expr): return Counter() @@ -592,10 +597,21 @@ class ProgramCallablesInfo(ImmutableRecord): Returns a copy of *self* with the *function* associated with the *in_kernel_callable*. """ + # FIXME: pleasse better docs.. ~KK # note: this does not require the edit mode to be true. # the reason for the edit mode is that we need to take care of the # renaming that might be needed to be done # PS: delete this note? 
+ + # {{{ sanity checks + + if isinstance(function, str): + function = Variable(function) + + assert isinstance(function, (Variable, ReductionOpFunction)) + + # }}} + history = self.history.copy() if in_kernel_callable in self.resolved_functions.values(): @@ -617,9 +633,12 @@ class ProgramCallablesInfo(ImmutableRecord): updated_resolved_functions = self.resolved_functions.copy() updated_resolved_functions[unique_function_identifier] = ( in_kernel_callable) + history[unique_function_identifier] = set( + [unique_function_identifier]) return ( self.copy( + history=history, resolved_functions=updated_resolved_functions), unique_function_identifier) @@ -638,9 +657,6 @@ class ProgramCallablesInfo(ImmutableRecord): history[unique_function_identifier] = set( [unique_function_identifier]) - if unique_function_identifier == 'loopy_kernel_0': - 1/0 - return ( self.copy( history=history, @@ -779,7 +795,8 @@ class ProgramCallablesInfo(ImmutableRecord): resolved_functions = {} - for func_id, in_knl_callable in self.resolved_functions.items(): + for func_id in new_callables_count: + in_knl_callable = self.resolved_functions[func_id] if isinstance(in_knl_callable, CallableKernel): # If callable kernel, perform renames. old_subkernel = in_knl_callable.subkernel -- GitLab From a161a4854c2b800884fc12269062f60cafe8b95e Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 14 Aug 2018 12:26:34 +0530 Subject: [PATCH 16/80] removes wrong invocation of with_callable for ManglerCallable. 
--- loopy/type_inference.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index ab37519e..8b5a656c 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -408,8 +408,8 @@ class TypeInferenceMapper(CombineMapper): identifier, function_mangler, arg_id_to_dtype, arg_id_to_descr, mangle_result.target_name) self.program_callables_info, new_function_id = ( - self.program_callables_info.with_callable( - expr.function, in_knl_callable, True)) + self.program_callables_info.with_add_callable( + expr.function, in_knl_callable)) if isinstance(expr, Call): self.old_calls_to_new_calls[expr] = new_function_id -- GitLab From 76336791d7b6cb6919ec97b02a32f4e74740c7db Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 14 Aug 2018 12:50:27 +0530 Subject: [PATCH 17/80] count callables in expression after expanding for substitutitons. --- loopy/kernel/__init__.py | 4 ++-- loopy/program.py | 8 +++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 3b189da5..89aef660 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1367,8 +1367,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): # {{{ direct execution def __call__(self, *args, **kwargs): - raise LoopyError("Calling a LoopKernel is deprecated, call a Program " - "instead.") + warn("Calling a LoopKernel is deprecated, call a Program " + "instead.", DeprecationWarning, stacklevel=2) from loopy.program import make_program_from_kernel program = make_program_from_kernel(self) return program(*args, **kwargs) diff --git a/loopy/program.py b/loopy/program.py index 7010e110..12fe756d 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -29,8 +29,8 @@ from pytools import ImmutableRecord, memoize_method from pymbolic.primitives import Variable from functools import wraps -from loopy.symbolic import ( - RuleAwareIdentityMapper, ResolvedFunction, 
CombineMapper) +from loopy.symbolic import (RuleAwareIdentityMapper, ResolvedFunction, + CombineMapper, SubstitutionRuleExpander) from loopy.kernel.function_interface import ( CallableKernel, ScalarCallable) from loopy.kernel.instruction import ( @@ -511,11 +511,13 @@ def count_callables_in_kernel(kernel, program_callables_info): callables_count = Counter() callables_counting_mapper = CallablesCountingMapper( program_callables_info) + subst_expander = SubstitutionRuleExpander(kernel.substitutions) for insn in kernel.instructions: if isinstance(insn, MultiAssignmentBase): callables_count += ( - callables_counting_mapper(insn.expression)) + callables_counting_mapper(subst_expander( + insn.expression))) elif isinstance(insn, (_DataObliviousInstruction, CInstruction)): pass else: -- GitLab From ab8bebf0a06bc3661396d0b49176ae47c7ee40f1 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 14 Aug 2018 13:16:30 +0530 Subject: [PATCH 18/80] pass statistics --- loopy/preprocess.py | 4 +--- loopy/program.py | 49 ++++++++++++++++++++++------------------- loopy/statistics.py | 28 ++++++++++------------- loopy/type_inference.py | 4 +--- 4 files changed, 40 insertions(+), 45 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 56db777b..472c74db 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2269,9 +2269,7 @@ def traverse_to_infer_arg_descr(kernel, program_callables_info): def infer_arg_descr(program): root_kernel_callable = program.program_callables_info[program.name] - from loopy.program import count_callables_in_program_callables_info - old_callables_count = count_callables_in_program_callables_info( - program.program_callables_info) + old_callables_count = program.program_callables_info.callables_count() program_callables_info = ( program.program_callables_info.with_edit_callables_mode()) root_kernel = program.root_kernel diff --git a/loopy/program.py b/loopy/program.py index 12fe756d..a0477bdf 100644 --- a/loopy/program.py +++ 
b/loopy/program.py @@ -526,27 +526,6 @@ def count_callables_in_kernel(kernel, program_callables_info): return callables_count - -# FIXME: @memoize_method -def count_callables_in_program_callables_info(program_callables_info): - """ - Returns an instance of :class:`collection.Counter` representing the number - of times the callables is called in program_callables_info. - """ - # should raise an error if there are more than one root kernels(which is - # illegal) - root_kernel_name, = [in_knl_callable.subkernel.name for in_knl_callable - in program_callables_info.values() if - isinstance(in_knl_callable, CallableKernel) and - in_knl_callable.subkernel.is_called_from_host] - - from collections import Counter - callables_count = Counter([root_kernel_name]) - callables_count += ( - count_callables_in_kernel(program_callables_info[ - root_kernel_name].subkernel, program_callables_info)) - return callables_count - # }}} @@ -594,6 +573,29 @@ class ProgramCallablesInfo(ImmutableRecord): update_persistent_hash = LoopKernel.update_persistent_hash + # FIXME: @memoize_method + def callables_count(self): + """ + Returns an instance of :class:`collection.Counter` representing the number + of times the callables is called in program_callables_info. 
+ """ + # should raise an error if there are more than one root kernels(which is + # illegal) + root_kernel_name, = [in_knl_callable.subkernel.name for in_knl_callable + in self.values() if + isinstance(in_knl_callable, CallableKernel) and + in_knl_callable.subkernel.is_called_from_host] + + from collections import Counter + callables_count = Counter([root_kernel_name]) + callables_count += ( + count_callables_in_kernel(self[ + root_kernel_name].subkernel, self)) + + return callables_count + + # {{{ interface to perfrom edits on callables + def with_add_callable(self, function, in_kernel_callable): """ Returns a copy of *self* with the *function* associated with the @@ -776,8 +778,7 @@ class ProgramCallablesInfo(ImmutableRecord): is renamed back to ``sin``. """ - new_callables_count = count_callables_in_program_callables_info( - self) + new_callables_count = self.callables_count() history = self.history.copy() renames_needed = {} @@ -827,6 +828,8 @@ class ProgramCallablesInfo(ImmutableRecord): is_being_edited=False, resolved_functions=resolved_functions) + # }}} + # {{{ behave like a dict(syntactic sugar) def __getitem__(self, item): diff --git a/loopy/statistics.py b/loopy/statistics.py index 95e9f62a..3799967b 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1396,17 +1396,17 @@ def get_op_map(program, numpy_types=True, count_redundant_work=False, op_map = ToCountMap() + callables_count = ( + program.program_callables_info.callables_count()) + for func_id, in_knl_callable in program.program_callables_info.items(): if isinstance(in_knl_callable, CallableKernel): - num_times_called = ( - program.program_callables_info.num_times_callables_called[ - func_id]) knl = in_knl_callable.subkernel knl_op_map = get_op_map_for_single_kernel(knl, program.program_callables_info, numpy_types, count_redundant_work, subgroup_size) - for i in range(num_times_called): + for i in range(callables_count[func_id]): op_map += knl_op_map elif isinstance(in_knl_callable, 
ScalarCallable): pass @@ -1684,18 +1684,17 @@ def get_mem_access_map(program, numpy_types=True, count_redundant_work=False, access_map = ToCountMap() + callables_count = program.program_callables_info.callables_count() + for func_id, in_knl_callable in program.program_callables_info.items(): if isinstance(in_knl_callable, CallableKernel): - num_times_called = ( - program.program_callables_info.num_times_callables_called[ - func_id]) knl = in_knl_callable.subkernel knl_access_map = get_access_map_for_single_kernel(knl, program.program_callables_info, numpy_types, count_redundant_work, subgroup_size) # FIXME: didn't see any easy way to multiply - for i in range(num_times_called): + for i in range(callables_count[func_id]): access_map += knl_access_map elif isinstance(in_knl_callable, ScalarCallable): pass @@ -1809,18 +1808,16 @@ def get_synchronization_map(program, subgroup_size=None): program = preprocess_program(program) sync_map = ToCountMap() + callables_count = program.program_callables_info.callables_count() for func_id, in_knl_callable in program.program_callables_info.items(): if isinstance(in_knl_callable, CallableKernel): - num_times_called = ( - program.program_callables_info.num_times_callables_called[ - func_id]) knl = in_knl_callable.subkernel knl_sync_map = get_synchronization_map_for_single_kernel(knl, program.program_callables_info, subgroup_size) # FIXME: didn't see any easy way to multiply - for i in range(num_times_called): + for i in range(callables_count[func_id]): sync_map += knl_sync_map elif isinstance(in_knl_callable, ScalarCallable): pass @@ -1887,18 +1884,17 @@ def gather_access_footprints(program, ignore_uncountable=False): write_footprints = [] read_footprints = [] + callables_count = program.program_callables_info.callables_count() + for func_id, in_knl_callable in program.program_callables_info.items(): if isinstance(in_knl_callable, CallableKernel): - num_times_called = ( - program.program_callables_info.num_times_callables_called[ - 
func_id]) knl = in_knl_callable.subkernel knl_write_footprints, knl_read_footprints = ( gather_access_footprints_for_single_kernel(knl, ignore_uncountable)) # FIXME: didn't see any easy way to multiply - for i in range(num_times_called): + for i in range(callables_count[func_id]): write_footprints.extend(knl_write_footprints) read_footprints.extend(knl_read_footprints) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 8b5a656c..76d4a579 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -894,9 +894,7 @@ def infer_unknown_types(program, expect_completion=False): program_callables_info[program.name]) type_uninferred_root_kernel = type_uninferred_knl_callable.subkernel - from loopy.program import count_callables_in_program_callables_info - old_callables_count = count_callables_in_program_callables_info( - program_callables_info) + old_callables_count = program_callables_info.callables_count() program_callables_info = ( program.program_callables_info.with_edit_callables_mode()) root_kernel, program_callables_info = ( -- GitLab From 44b247dc760d6f2eeb9e06b0cf375ce24262b68b Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 14 Aug 2018 14:28:48 +0530 Subject: [PATCH 19/80] dont rename if given a root kernel. 
--- loopy/program.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index a0477bdf..efc66b5a 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -649,15 +649,25 @@ class ProgramCallablesInfo(ImmutableRecord): # }}} unique_function_identifier = function.name - while unique_function_identifier in self.resolved_functions: - unique_function_identifier = ( - next_indexed_function_identifier( - unique_function_identifier)) + + if isinstance(in_kernel_callable, CallableKernel) and ( + in_kernel_callable.subkernel.is_called_from_host): + # special treatment if the callable is the root kernel + pass + else: + while unique_function_identifier in self.resolved_functions: + unique_function_identifier = ( + next_indexed_function_identifier( + unique_function_identifier)) updated_resolved_functions = self.resolved_functions.copy() updated_resolved_functions[unique_function_identifier] = ( in_kernel_callable) + if 'strongVolumeKernelR_0' in updated_resolved_functions: + import pudb + pudb.set_trace() + history[unique_function_identifier] = set( [unique_function_identifier]) @@ -759,6 +769,10 @@ class ProgramCallablesInfo(ImmutableRecord): updated_resolved_functions[unique_function_identifier] = ( in_kernel_callable) + if 'strongVolumeKernelR_0' in updated_resolved_functions: + import pudb + pudb.set_trace() + history[unique_function_identifier] = ( history[function.name] | set([unique_function_identifier])) -- GitLab From 01e42c10b6e3b362d2dc325c7e1d177e0b7377a0 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 14 Aug 2018 20:31:08 +0530 Subject: [PATCH 20/80] perform only one rename! 
--- loopy/program.py | 1 + loopy/type_inference.py | 5 ----- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index efc66b5a..911667df 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -809,6 +809,7 @@ class ProgramCallablesInfo(ImmutableRecord): new_callables_count.keys()-renames_needed.keys()): if old_func_id in history[new_func_id]: renames_needed[new_func_id] = old_func_id + break resolved_functions = {} diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 76d4a579..52150dcd 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -882,11 +882,6 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, def infer_unknown_types(program, expect_completion=False): """Infer types on temporaries and arguments.""" - from loopy.kernel import LoopKernel - if isinstance(program, LoopKernel): - # FIXME: deprecate warning needed here - from loopy.program import make_program_from_kernel - program = make_program_from_kernel(program) program_callables_info = program.program_callables_info -- GitLab From 50dc2fe4b266a968360fb03749705478372342d6 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 14 Aug 2018 20:38:25 +0530 Subject: [PATCH 21/80] replace keys() by six.viewkeys() for py2.7. --- loopy/program.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/program.py b/loopy/program.py index 911667df..3872a83e 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -806,7 +806,7 @@ class ProgramCallablesInfo(ImmutableRecord): # this implies that all the function instances having the name # "func_id" have been renamed to something else. 
for new_func_id in ( - new_callables_count.keys()-renames_needed.keys()): + six.viewkeys(new_callables_count)-six.viewkeys(renames_needed)): if old_func_id in history[new_func_id]: renames_needed[new_func_id] = old_func_id break -- GitLab From 7ab71c675f472e2daa94f02a53c9fa61e8b5e2ff Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 14 Aug 2018 21:34:23 +0530 Subject: [PATCH 22/80] make ProgramCallablesInfo hashable. --- loopy/kernel/__init__.py | 2 ++ loopy/program.py | 23 +++++++++++++++-------- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 89aef660..8b2cf3dd 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1035,6 +1035,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): self.get_iname_bounds(iname, constants_only=True).size, constants_only=True))) + @memoize_method def get_grid_sizes_for_insn_ids(self, insn_ids, program_callables_info, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that @@ -1132,6 +1133,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): return (to_dim_tuple(global_sizes, "global"), to_dim_tuple(local_sizes, "local", forced_sizes=self.local_sizes)) + @memoize_method def get_grid_sizes_for_insn_ids_as_exprs(self, insn_ids, program_callables_info, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that diff --git a/loopy/program.py b/loopy/program.py index 3872a83e..d19cd4e8 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -500,7 +500,7 @@ class CallablesCountingMapper(CombineMapper): map_type_cast = map_constant -# FIXME: @memoize_method +@memoize_method def count_callables_in_kernel(kernel, program_callables_info): """ Returns an instance of :class:`collections.Counter` representing the number @@ -558,7 +558,7 @@ class ProgramCallablesInfo(ImmutableRecord): history=None, is_being_edited=False): if history is None: - history = dict((func_id, set([func_id])) for 
func_id in + history = dict((func_id, frozenset([func_id])) for func_id in resolved_functions) super(ProgramCallablesInfo, self).__init__( @@ -571,9 +571,16 @@ class ProgramCallablesInfo(ImmutableRecord): "is_being_edited", "history") + def __hash__(self): + return hash(( + frozenset(six.iteritems(self.resolved_functions)), + frozenset(six.iteritems(self.history)), + self.is_being_edited + )) + update_persistent_hash = LoopKernel.update_persistent_hash - # FIXME: @memoize_method + @memoize_method def callables_count(self): """ Returns an instance of :class:`collection.Counter` representing the number @@ -623,7 +630,7 @@ class ProgramCallablesInfo(ImmutableRecord): # identifier corresposing to that callable. for func_id, in_knl_callable in self.resolved_functions.items(): if in_knl_callable == in_kernel_callable: - history[func_id] = history[func_id] | set([function.name]) + history[func_id] = history[func_id] | frozenset([function.name]) return ( self.copy( history=history), @@ -637,7 +644,7 @@ class ProgramCallablesInfo(ImmutableRecord): updated_resolved_functions = self.resolved_functions.copy() updated_resolved_functions[unique_function_identifier] = ( in_kernel_callable) - history[unique_function_identifier] = set( + history[unique_function_identifier] = frozenset( [unique_function_identifier]) return ( @@ -668,7 +675,7 @@ class ProgramCallablesInfo(ImmutableRecord): import pudb pudb.set_trace() - history[unique_function_identifier] = set( + history[unique_function_identifier] = frozenset( [unique_function_identifier]) return ( @@ -733,7 +740,7 @@ class ProgramCallablesInfo(ImmutableRecord): # identifier corresponding to that callable. 
for func_id, in_knl_callable in self.resolved_functions.items(): if in_knl_callable == in_kernel_callable: - history[func_id] = history[func_id] | set([function.name]) + history[func_id] = history[func_id] | frozenset([function.name]) return ( self.copy( history=history), @@ -774,7 +781,7 @@ class ProgramCallablesInfo(ImmutableRecord): pudb.set_trace() history[unique_function_identifier] = ( - history[function.name] | set([unique_function_identifier])) + history[function.name] | frozenset([unique_function_identifier])) return ( self.copy( -- GitLab From 8d4af7a2a89e7cff3db9c2a351733abfeb0161ef Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 14 Aug 2018 22:24:31 +0530 Subject: [PATCH 23/80] update persistent dict changed for frozenset. --- loopy/library/reduction.py | 1 - loopy/tools.py | 5 +++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index b968192e..b3deba65 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -229,7 +229,6 @@ class ReductionOpFunction(FunctionIdentifier): update_persistent_hash = LoopKernel.update_persistent_hash - # }}} diff --git a/loopy/tools.py b/loopy/tools.py index b243a794..5eabe6c3 100644 --- a/loopy/tools.py +++ b/loopy/tools.py @@ -79,6 +79,11 @@ class LoopyKeyBuilder(KeyBuilderBase): update_for_defaultdict = update_for_dict + def update_for_frozenset(self, key_hash, key): + for set_key in sorted(key, + key=lambda obj: type(obj).__name__ + str(obj)): + self.rec(key_hash, set_key) + def update_for_BasicSet(self, key_hash, key): # noqa from islpy import Printer prn = Printer.to_str(key.get_ctx()) -- GitLab From f8307a0ed463312a6eb162f7b8ab054babad97f3 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 15 Aug 2018 08:32:16 +0530 Subject: [PATCH 24/80] minor cleanup/comments. 
--- loopy/preprocess.py | 91 +++++++++++++++++++++++++++------------------ loopy/program.py | 7 +++- 2 files changed, 59 insertions(+), 39 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 472c74db..e9e55cc4 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2149,10 +2149,7 @@ def check_atomic_loads(kernel): class ArgDescrInferenceMapper(RuleAwareIdentityMapper): """ - Returns a set of instances of :class:`tuple` (expr, - in_kernel_callable). The mapped `in_kernel_callable` of the - :class:`InKernelCallable` are descriptor specialized for the given - arguments. + Infers the :attr:`loopy` """ def __init__(self, rule_mapping_context, caller_kernel, @@ -2250,9 +2247,11 @@ def traverse_to_infer_arg_descr(kernel, program_callables_info): Returns a copy of *kernel* with the argument shapes and strides matching for scoped functions in the *kernel*. Refer :meth:`loopy.kernel.function_interface.InKernelCallable.with_descrs`. - """ - # FIXME: update this docs, once the design is finalized + .. note:: + + Initiates a walk starting from *kernel* to all its callee kernels. + """ from loopy.symbolic import SubstitutionRuleMappingContext rule_mapping_context = SubstitutionRuleMappingContext( @@ -2268,6 +2267,11 @@ def traverse_to_infer_arg_descr(kernel, program_callables_info): def infer_arg_descr(program): + """ + Returns a copy of *program* with the + :attr:`loopy.InKernelCallable.arg_id_to_descr` inferred for all the + callables. + """ root_kernel_callable = program.program_callables_info[program.name] old_callables_count = program.program_callables_info.callables_count() program_callables_info = ( @@ -2397,28 +2401,60 @@ def preprocess_single_kernel(kernel, program_callables_info, device=None): return kernel -def preprocess_kernel(kernel, device=None): - # FIXME: error message? 
- return preprocess_program(kernel, device) +# {{{ hw axes inference + +def infer_hw_axes_sizes(program): + """ + Returns copy of *program* with the hardware axes sizes inferred. + + .. note:: + + - Firstly, computes the collective hardware axes sizes from all the + callable kernels. + - Then, overrides the grid sizes of all the callable kernels to the + collective value. + """ + + local_size, global_size = program.get_grid_size_upper_bounds() + + resolved_function_with_hw_axes_sizes_inferred = {} + + for func_id, in_knl_callable in ( + program.program_callables_info.items()): + if func_id == program.name: + resolved_function_with_hw_axes_sizes_inferred[func_id] = ( + in_knl_callable) + else: + resolved_function_with_hw_axes_sizes_inferred[func_id] = ( + in_knl_callable.with_hw_axes_sizes(local_size, global_size)) + + new_program_callables_info = ( + program.program_callables_info.copy( + resolved_functions=resolved_function_with_hw_axes_sizes_inferred)) + + program = program.copy(program_callables_info=new_program_callables_info) + +# }}} def preprocess_program(program, device=None): if device is not None: + # FIXME: Time to remove this? (Git blame shows 5 years ago) from warnings import warn warn("passing 'device' to preprocess_kernel() is deprecated", DeprecationWarning, stacklevel=2) program = infer_unknown_types(program, expect_completion=False) - # {{{ preprocess the root kernel + # {{{ preprocess callable kernels # Callable editing restrictions: # - # - cannot edit program_callables_info in :meth:`preprocess_single_kernel` - # as we are iterating over it. 
+ # - should not edit program_callables_info in :meth:`preprocess_single_kernel` + # as we are iterating over it.[1] # - # Refer: https://docs.python.org/3/library/stdtypes.html#dictionary-view-objects + # [1] https://docs.python.org/3/library/stdtypes.html#dictionary-view-objects new_resolved_functions = {} for func_id, in_knl_callable in program.program_callables_info.items(): @@ -2431,7 +2467,7 @@ def preprocess_program(program, device=None): elif isinstance(in_knl_callable, ScalarCallable): pass else: - raise NotImplementedError("Unknown type of callable %s." % ( + raise NotImplementedError("Unknown callable type %s." % ( type(in_knl_callable).__name__)) new_resolved_functions[func_id] = in_knl_callable @@ -2445,32 +2481,13 @@ def preprocess_program(program, device=None): # infer arg descrs of the callables program = infer_arg_descr(program) - # {{{ hw axes inference - - # FIXME: think of wrapping this in a function? + program = infer_hw_axes_sizes(program) - local_size, global_size = program.get_grid_size_upper_bounds() - - resolved_function_with_hw_axes_sizes_set = {} - - for func_id, in_knl_callable in ( - program.program_callables_info.items()): - if func_id == program.name: - resolved_function_with_hw_axes_sizes_set[func_id] = ( - in_knl_callable) - else: - resolved_function_with_hw_axes_sizes_set[func_id] = ( - in_knl_callable.with_hw_axes_sizes(local_size, global_size)) - - new_program_callables_info = ( - program.program_callables_info.copy( - resolved_functions=resolved_function_with_hw_axes_sizes_set)) + return program - program = program.copy(program_callables_info=new_program_callables_info) - # }}} - - return program +# FIXME: Do we add a deprecation warning? 
+preprocess_kernel = preprocess_program # vim: foldmethod=marker diff --git a/loopy/program.py b/loopy/program.py index d19cd4e8..eec8157c 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -861,10 +861,13 @@ class ProgramCallablesInfo(ImmutableRecord): return item in self.resolved_functions def items(self): - return self.resolved_functions.items() + return six.iteritems(self.resolved_functions) def values(self): - return self.resolved_functions.values() + return six.itervalues(self.resolved_functions) + + def keys(self): + return six.iterkeys(self.resolved_functions) # }}} -- GitLab From caec9506a1b42bddb2ce57e009c207aaad4d7dc9 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 15 Aug 2018 08:46:50 +0530 Subject: [PATCH 25/80] with_add_callable -> with_added_callable --- loopy/program.py | 10 +++++----- loopy/transform/fusion.py | 4 ++-- loopy/type_inference.py | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index eec8157c..90eb64e9 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -114,7 +114,7 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): # resolved in-kernel callable self.program_callables_info, new_func_id = ( - self.program_callables_info.with_add_callable(expr.function, + self.program_callables_info.with_added_callable(expr.function, in_knl_callable)) return type(expr)( ResolvedFunction(new_func_id), @@ -135,7 +135,7 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): in_knl_callable = self.find_in_knl_callable_from_identifier(func_id) assert in_knl_callable is not None self.program_callables_info, _ = ( - self.program_callables_info.with_add_callable(func_id, + self.program_callables_info.with_added_callable(func_id, in_knl_callable)) # FIXME: where do you deal with the parameters? 
~KK return super(ResolvedFunctionMarker, self).map_reduction(expr, expn_state) @@ -168,7 +168,7 @@ def initialize_program_callables_info_from_kernel(kernel): callable_kernel = CallableKernel(kernel_with_functions_resolved) # add the callable kernel to the program_callables_info - program_callables_info, _ = program_callables_info.with_add_callable( + program_callables_info, _ = program_callables_info.with_added_callable( Variable(kernel.name), callable_kernel) return program_callables_info @@ -603,7 +603,7 @@ class ProgramCallablesInfo(ImmutableRecord): # {{{ interface to perfrom edits on callables - def with_add_callable(self, function, in_kernel_callable): + def with_added_callable(self, function, in_kernel_callable): """ Returns a copy of *self* with the *function* associated with the *in_kernel_callable*. @@ -704,7 +704,7 @@ class ProgramCallablesInfo(ImmutableRecord): .. note:: - - Use :meth:`with_add_callable` if a callable is being resolved for the + - Use :meth:`with_added_callable` if a callable is being resolved for the first time. """ diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py index f2e62368..b0d67764 100644 --- a/loopy/transform/fusion.py +++ b/loopy/transform/fusion.py @@ -447,7 +447,7 @@ def fuse_kernels(programs, suffixes=None, data_flow=None): # renaming. continue main_prog_callables_info, new_func_id = ( - main_prog_callables_info.with_add_callable(var(old_func_id), + main_prog_callables_info.with_added_callable(var(old_func_id), in_knl_callable)) if old_func_id != new_func_id: @@ -464,7 +464,7 @@ def fuse_kernels(programs, suffixes=None, data_flow=None): subkernel=new_root_kernel.copy(name=programs[0].name)) # TODO: change the name of the final root kernel. 
- main_prog_callables_info, _ = main_prog_callables_info.with_add_callable( + main_prog_callables_info, _ = main_prog_callables_info.with_added_callable( var(programs[0].name), new_root_kernel_callable) return programs[0].copy( diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 52150dcd..04392d8d 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -408,7 +408,7 @@ class TypeInferenceMapper(CombineMapper): identifier, function_mangler, arg_id_to_dtype, arg_id_to_descr, mangle_result.target_name) self.program_callables_info, new_function_id = ( - self.program_callables_info.with_add_callable( + self.program_callables_info.with_added_callable( expr.function, in_knl_callable)) if isinstance(expr, Call): -- GitLab From f041d166645c5d7f72413f45200b475a4b2bc150 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 15 Aug 2018 09:47:06 +0530 Subject: [PATCH 26/80] Minimalized CallableKernel for MR271 --- loopy/kernel/function_interface.py | 169 +---------------------------- loopy/preprocess.py | 2 +- loopy/type_inference.py | 138 ++++++++++++++++++++++- 3 files changed, 138 insertions(+), 171 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 8c3a6911..5efc44ad 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -23,19 +23,11 @@ THE SOFTWARE. """ -import re -import six - from six.moves import zip from pytools import ImmutableRecord from loopy.diagnostic import LoopyError -from loopy.symbolic import parse_tagged_name - -from loopy.symbolic import (ResolvedFunction, SubstitutionRuleMappingContext, - RuleAwareIdentityMapper, SubstitutionRuleExpander) - from loopy.kernel import LoopKernel @@ -145,7 +137,7 @@ class GridOverrideForCalleeKernel(ImmutableRecord): .. note:: - This class acts as a pseduo-callable and its significance lies in + This class acts as a pseudo-callable and its significance lies in solving picklability issues. 
""" fields = set(["local_size", "global_size"]) @@ -228,8 +220,6 @@ class InKernelCallable(ImmutableRecord): Any argument information exists both by its positional and its keyword identifier. """ - # FIXME: In all these with_** functions add that also passes a - # program_callables_info raise NotImplementedError() @@ -333,12 +323,12 @@ class InKernelCallable(ImmutableRecord): class ScalarCallable(InKernelCallable): """ - An abstranct interface the to a scalar callable encountered in a kernel. + An abstract interface the to a scalar callable encountered in a kernel. .. note:: The :meth:`ScalarCallable.with_types` is intended to assist with type - specialization of the funciton and is expected to be supplemented in the + specialization of the function and is expected to be supplemented in the derived subclasses. """ @@ -520,68 +510,12 @@ class CallableKernel(InKernelCallable): return (self.subkernel, self.arg_id_to_dtype, self.arg_id_to_descr) - @property - def name(self): - return self.subkernel.name - - def is_ready_for_codegen(self): - return (self.arg_id_to_dtype is not None and - self.arg_id_to_descr is not None) - def generate_preambles(self, target): """ Yields the *target* specific preambles. """ - # FIXME Check that this is correct. 
- return yield - def emit_call_insn(self, insn, target, expression_to_code_mapper): - - assert self.is_ready_for_codegen() - - from loopy.kernel.instruction import CallInstruction - from pymbolic.primitives import CallWithKwargs - - assert isinstance(insn, CallInstruction) - - parameters = insn.expression.parameters - kw_parameters = {} - if isinstance(insn.expression, CallWithKwargs): - kw_parameters = insn.expression.kw_parameters - - assignees = insn.assignees - - parameters = list(parameters) - par_dtypes = [self.arg_id_to_dtype[i] for i, _ in enumerate(parameters)] - kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) - for i in range(len(parameters), len(parameters)+len(kw_parameters)): - parameters.append(kw_parameters[pos_to_kw[i]]) - par_dtypes.append(self.arg_id_to_dtype[pos_to_kw[i]]) - - # insert the assigness at the required positions - assignee_write_count = -1 - for i, arg in enumerate(self.subkernel.args): - if arg.is_output_only: - assignee = assignees[-assignee_write_count-1] - parameters.insert(i, assignee) - par_dtypes.insert(i, self.arg_id_to_dtype[assignee_write_count]) - assignee_write_count -= 1 - - # no type casting in array calls - from loopy.expression import dtype_to_type_context - from pymbolic.mapper.stringifier import PREC_NONE - from pymbolic import var - - c_parameters = [ - expression_to_code_mapper(par, PREC_NONE, - dtype_to_type_context(target, par_dtype), - par_dtype).expr - for par, par_dtype in zip( - parameters, par_dtypes)] - - return var(self.subkernel.name)(*c_parameters), False - # }}} @@ -589,7 +523,7 @@ class CallableKernel(InKernelCallable): class ManglerCallable(ScalarCallable): """ - A callable whose characateristic is defined by a function mangler. + A callable whose characteristic is defined by a function mangler. .. 
attribute:: function_mangler @@ -662,99 +596,4 @@ class ManglerCallable(ScalarCallable): # }}} - -# {{{ new pymbolic calls to scoped functions - -def next_indexed_variable(function): - """ - Returns an instance of :class:`str` with the next indexed-name in the - sequence for the name of *function*. - - *Example:* ``Variable('sin_0')`` will return ``'sin_1'``. - - :arg function: Either an instance of :class:`pymbolic.primitives.Variable` - or :class:`loopy.reduction.ArgExtOp` or - :class:`loopy.reduction.SegmentedOp`. - """ - from loopy.library.reduction import ReductionOpFunction - if isinstance(function, ReductionOpFunction): - return function.copy() - func_name = re.compile(r"^(?P\S+?)_(?P\d+?)$") - - match = func_name.match(function.name) - - if match is None: - if function.name[-1] == '_': - return "{old_name}0".format(old_name=function.name) - else: - return "{old_name}_0".format(old_name=function.name) - - return "{alpha}_{num}".format(alpha=match.group('alpha'), - num=int(match.group('num'))+1) - - -class FunctionNameChanger(RuleAwareIdentityMapper): - """ - Changes the names of scoped functions in calls of expressions according to - the mapping ``calls_to_new_functions`` - """ - - def __init__(self, rule_mapping_context, calls_to_new_names, - subst_expander): - super(FunctionNameChanger, self).__init__(rule_mapping_context) - self.calls_to_new_names = calls_to_new_names - self.subst_expander = subst_expander - - def map_call(self, expr, expn_state): - name, tag = parse_tagged_name(expr.function) - - if name not in self.rule_mapping_context.old_subst_rules: - expanded_expr = self.subst_expander(expr) - if expr in self.calls_to_new_names: - return type(expr)( - ResolvedFunction(self.calls_to_new_names[expr]), - tuple(self.rec(child, expn_state) - for child in expr.parameters)) - elif expanded_expr in self.calls_to_new_names: - # FIXME: this is horribly wrong logic. 
- # investigate how to make edits to a substitution rule - return type(expr)( - ResolvedFunction(self.calls_to_new_names[expanded_expr]), - tuple(self.rec(child, expn_state) - for child in expanded_expr.parameters)) - else: - return super(FunctionNameChanger, self).map_call( - expr, expn_state) - else: - return self.map_substitution(name, tag, expr.parameters, expn_state) - - def map_call_with_kwargs(self, expr, expn_state): - - if expr in self.calls_to_new_names: - return type(expr)( - ResolvedFunction(self.calls_to_new_names[expr]), - tuple(self.rec(child, expn_state) - for child in expr.parameters), - dict( - (key, self.rec(val, expn_state)) - for key, val in six.iteritems(expr.kw_parameters)) - ) - else: - return super(FunctionNameChanger, self).map_call_with_kwargs( - expr, expn_state) - - -def change_names_of_pymbolic_calls(kernel, pymbolic_calls_to_new_names): - rule_mapping_context = SubstitutionRuleMappingContext( - kernel.substitutions, kernel.get_var_name_generator()) - subst_expander = SubstitutionRuleExpander(kernel.substitutions) - name_changer = FunctionNameChanger(rule_mapping_context, - pymbolic_calls_to_new_names, subst_expander) - - return rule_mapping_context.finish_kernel( - name_changer.map_kernel(kernel)) - -# }}} - - # vim: foldmethod=marker diff --git a/loopy/preprocess.py b/loopy/preprocess.py index e9e55cc4..41674ed9 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2432,7 +2432,7 @@ def infer_hw_axes_sizes(program): program.program_callables_info.copy( resolved_functions=resolved_function_with_hw_axes_sizes_inferred)) - program = program.copy(program_callables_info=new_program_callables_info) + return program.copy(program_callables_info=new_program_callables_info) # }}} diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 04392d8d..e5c17886 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -36,7 +36,10 @@ from loopy.diagnostic import ( from loopy.kernel.instruction import 
_DataObliviousInstruction from loopy.program import ProgramCallablesInfo -from loopy.symbolic import LinearSubscript +from loopy.symbolic import ( + LinearSubscript, parse_tagged_name, RuleAwareIdentityMapper, + SubstitutionRuleExpander, ResolvedFunction, + SubstitutionRuleMappingContext) from pymbolic.primitives import Variable, Subscript, Lookup import logging @@ -62,6 +65,135 @@ def get_return_types_as_tuple(arg_id_to_dtype): return tuple(return_arg_id_to_dtype[id] for id in return_arg_pos) +# {{{ renaming helpers + +class FunctionNameChanger(RuleAwareIdentityMapper): + """ + Changes the names of scoped functions in calls of expressions according to + the mapping ``calls_to_new_functions`` + """ + + def __init__(self, rule_mapping_context, calls_to_new_names, + subst_expander): + super(FunctionNameChanger, self).__init__(rule_mapping_context) + self.calls_to_new_names = calls_to_new_names + self.subst_expander = subst_expander + + def map_call(self, expr, expn_state): + name, tag = parse_tagged_name(expr.function) + + if name not in self.rule_mapping_context.old_subst_rules: + expanded_expr = self.subst_expander(expr) + if expr in self.calls_to_new_names: + return type(expr)( + ResolvedFunction(self.calls_to_new_names[expr]), + tuple(self.rec(child, expn_state) + for child in expr.parameters)) + elif expanded_expr in self.calls_to_new_names: + # FIXME: This is killing the substitution. + # Maybe using a RuleAwareIdentityMapper for TypeInferenceMapper + # would help. 
+ return type(expr)( + ResolvedFunction(self.calls_to_new_names[expanded_expr]), + tuple(self.rec(child, expn_state) + for child in expanded_expr.parameters)) + else: + return super(FunctionNameChanger, self).map_call( + expr, expn_state) + else: + return self.map_substitution(name, tag, expr.parameters, expn_state) + + def map_call_with_kwargs(self, expr, expn_state): + + if expr in self.calls_to_new_names: + return type(expr)( + ResolvedFunction(self.calls_to_new_names[expr]), + tuple(self.rec(child, expn_state) + for child in expr.parameters), + dict( + (key, self.rec(val, expn_state)) + for key, val in six.iteritems(expr.kw_parameters)) + ) + else: + return super(FunctionNameChanger, self).map_call_with_kwargs( + expr, expn_state) + + +def change_names_of_pymbolic_calls(kernel, pymbolic_calls_to_new_names): + """ + Returns a copy of *kernel* with the names of pymbolic calls changed + according to the mapping given by *pymbolic_calls_new_names*. + + :arg pymbolic_calls_to_new_names: A mapping from instances of + :class:`pymbolic.primitives.Call` to :class:`str`. + + **Example: ** + + - Given a *kernel* -- + + .. code:: + + ------------------------------------------------------------- + KERNEL: loopy_kernel + ------------------------------------------------------------- + ARGUMENTS: + x: type: , shape: (10), dim_tags: (N0:stride:1) + y: type: , shape: (10), dim_tags: (N0:stride:1) + ------------------------------------------------------------- + DOMAINS: + { [i] : 0 <= i <= 9 } + ------------------------------------------------------------- + INAME IMPLEMENTATION TAGS: + i: None + ------------------------------------------------------------- + INSTRUCTIONS: + for i + y[i] = ResolvedFunction('sin')(x[i]) + end i + ------------------------------------------------------------- + + - And given a *pymbolic_calls_to_new_names* -- + + .. 
code:: + + {Call(ResolvedFunction(Variable('sin')), (Subscript(Variable('x'), + Variable('i')),))": 'sin_1'} + + - The following *kernel* is returned -- + + .. code:: + + ------------------------------------------------------------- + KERNEL: loopy_kernel + ------------------------------------------------------------- + ARGUMENTS: + x: type: , shape: (10), dim_tags: (N0:stride:1) + y: type: , shape: (10), dim_tags: (N0:stride:1) + ------------------------------------------------------------- + DOMAINS: + { [i] : 0 <= i <= 9 } + ------------------------------------------------------------- + INAME IMPLEMENTATION TAGS: + i: None + ------------------------------------------------------------- + INSTRUCTIONS: + for i + y[i] = ResolvedFunction('sin_1')(x[i]) + end i + ------------------------------------------------------------- + """ + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + subst_expander = SubstitutionRuleExpander(kernel.substitutions) + name_changer = FunctionNameChanger(rule_mapping_context, + pymbolic_calls_to_new_names, subst_expander) + + return rule_mapping_context.finish_kernel( + name_changer.map_kernel(kernel)) + +# }}} + + # {{{ type inference mapper class TypeInferenceMapper(CombineMapper): @@ -276,7 +408,6 @@ class TypeInferenceMapper(CombineMapper): def map_call(self, expr, return_tuple=False): from pymbolic.primitives import Variable, CallWithKwargs, Call - from loopy.symbolic import ResolvedFunction if isinstance(expr, CallWithKwargs): kw_parameters = expr.kw_parameters @@ -862,9 +993,6 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, args=[new_arg_dict[arg.name] for arg in kernel.args], ) - # this has to be subsitutition - from loopy.kernel.function_interface import ( - change_names_of_pymbolic_calls) type_specialized_kernel = change_names_of_pymbolic_calls( pre_type_specialized_knl, old_calls_to_new_calls) -- GitLab From 
4f8ec6989ef1e515fa956214702f7ef11b300305 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 15 Aug 2018 10:42:01 +0530 Subject: [PATCH 27/80] added autofunction/class/methods --- loopy/kernel/function_interface.py | 13 +++ loopy/program.py | 143 +++++++++++++++++------------ 2 files changed, 96 insertions(+), 60 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 5efc44ad..e4e8c1d5 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -30,6 +30,19 @@ from loopy.diagnostic import LoopyError from loopy.kernel import LoopKernel +__doc__ = """ + +.. currentmodule:: loopy + +.. autoclass:: ValueArgDescriptor +.. autoclass:: ArrayArgDescriptor +.. autoclass:: InKernelCallable +.. autoclass:: CallableKernel +.. autoclass:: ScalarCallable +.. autoclass:: ManglerCallable + +""" + # {{{ argument descriptors diff --git a/loopy/program.py b/loopy/program.py index 90eb64e9..e5d033e0 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -42,7 +42,17 @@ from loopy.kernel import LoopKernel from collections import Counter from pymbolic.primitives import Call, CallWithKwargs -# FIXME: autofunction/autoclass?? ~KK +__doc__ = """ + +.. currentmodule:: loopy + +.. autoclass:: Program +.. autoclass:: ProgramCallablesInfo + +.. autofunction:: make_program_from_kernel +.. 
autofunction:: iterate_over_kernels_if_given_program + +""" class ResolvedFunctionMarker(RuleAwareIdentityMapper): @@ -114,8 +124,8 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): # resolved in-kernel callable self.program_callables_info, new_func_id = ( - self.program_callables_info.with_added_callable(expr.function, - in_knl_callable)) + self.program_callables_info.with_added_callable( + expr.function, in_knl_callable)) return type(expr)( ResolvedFunction(new_func_id), tuple(self.rec(child, expn_state) @@ -137,10 +147,21 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): self.program_callables_info, _ = ( self.program_callables_info.with_added_callable(func_id, in_knl_callable)) - # FIXME: where do you deal with the parameters? ~KK return super(ResolvedFunctionMarker, self).map_reduction(expr, expn_state) +def _default_func_id_to_kernel_callable_mappers(target): + """ + Returns a list of functions that are provided through *target* by deafault. + """ + # FIXME: the name -- scopers is no longer used!(change it) ~KK + + from loopy.library.function import loopy_specific_callable_scopers + return ( + [loopy_specific_callable_scopers] + ( + target.get_device_ast_builder().function_scopers())) + + def initialize_program_callables_info_from_kernel(kernel): """ Returns an instance of :class:`loopy.ProgramCallablesInfo`, by resolving @@ -148,7 +169,7 @@ def initialize_program_callables_info_from_kernel(kernel): """ # collect the default function resolvers func_id_to_kernel_callable_mappers = ( - default_func_id_to_kernel_callable_mappers(kernel.target)) + _default_func_id_to_kernel_callable_mappers(kernel.target)) program_callables_info = ProgramCallablesInfo({}) from loopy.symbolic import SubstitutionRuleMappingContext @@ -553,6 +574,9 @@ class ProgramCallablesInfo(ImmutableRecord): An instance of :class:`bool` which is intended to aid the working of :meth:`with_enter_edit_callables_mode`, :meth:`with_callable` and :meth:`with_exit_edit_callables_mode`. 
+ + .. automethod:: __init__ + .. automethod:: callables_count """ def __init__(self, resolved_functions, history=None, is_being_edited=False): @@ -580,6 +604,7 @@ class ProgramCallablesInfo(ImmutableRecord): update_persistent_hash = LoopKernel.update_persistent_hash + @property @memoize_method def callables_count(self): """ @@ -601,18 +626,36 @@ class ProgramCallablesInfo(ImmutableRecord): return callables_count - # {{{ interface to perfrom edits on callables + # {{{ interface to perform edits on callables def with_added_callable(self, function, in_kernel_callable): """ Returns a copy of *self* with the *function* associated with the *in_kernel_callable*. + + .. note:: + + - Always checks whether the + :attr:``loopy.ProgramCallablesInfo.resolved_functions` has + *in_kernel_callable*, does not introduce copies. + + - The difference between + :meth:`loopy.ProgramCallablesInfo.with_added_callable` + and :meth:`ProgramCallablesInfo.with_callable` being that + the former has no support for renaming the callable back i.e. + ``with_callable`` supports renaming from ``sin_0`` to ``sin``, + if possible, through the member method + ``loopy.ProgramCallablesInfo.with_exit_edit_callables_mode`` + + This subtle difference makes -- + + - :meth:`loopy.ProgramCallablesInfo.with_added_callable` suitable + for usage while resolving the functions first time, where no + renaming is needed. + + - :meth:`loopy.ProgramCallablesInfo.with_callable` suitable for + implementing edits in callables during inference-walks. """ - # FIXME: pleasse better docs.. ~KK - # note: this does not require the edit mode to be true. - # the reason for the edit mode is that we need to take care of the - # renaming that might be needed to be done - # PS: delete this note? 
# {{{ sanity checks @@ -627,7 +670,7 @@ class ProgramCallablesInfo(ImmutableRecord): if in_kernel_callable in self.resolved_functions.values(): # the callable already exists, implies return the function - # identifier corresposing to that callable. + # identifier corresponding to that callable. for func_id, in_knl_callable in self.resolved_functions.items(): if in_knl_callable == in_kernel_callable: history[func_id] = history[func_id] | frozenset([function.name]) @@ -659,7 +702,7 @@ class ProgramCallablesInfo(ImmutableRecord): if isinstance(in_kernel_callable, CallableKernel) and ( in_kernel_callable.subkernel.is_called_from_host): - # special treatment if the callable is the root kernel + # do not rename root kernel pass else: while unique_function_identifier in self.resolved_functions: @@ -671,10 +714,6 @@ class ProgramCallablesInfo(ImmutableRecord): updated_resolved_functions[unique_function_identifier] = ( in_kernel_callable) - if 'strongVolumeKernelR_0' in updated_resolved_functions: - import pudb - pudb.set_trace() - history[unique_function_identifier] = frozenset( [unique_function_identifier]) @@ -688,24 +727,26 @@ class ProgramCallablesInfo(ImmutableRecord): """ Initiates *self* for a walk traversal through all the callables. """ - # PS: I don't see a need for this method right now. - # This is just for validation purposes, maybe needs to disapper if you - # find a better solution? return self.copy( is_being_edited=True) def with_callable(self, function, in_kernel_callable): """ + Returns a copy of *self* with the *function* associated with the + *in_kernel_callable*. Also refer -- + :meth:`loopy.ProgramCallablesInfo.with_added_callable` + + :arg function: An instance of :class:`pymbolic.primitives.Variable` or :class:`loopy.library.reduction.ReductionOpFunction`. - :arg in_kernel_callables: An instance of + :arg in_kernel_callable: An instance of :class:`loopy.InKernelCallable`. .. 
note:: - Use :meth:`with_added_callable` if a callable is being resolved for the - first time. + first time. """ # {{{ non-edit mode @@ -714,7 +755,7 @@ class ProgramCallablesInfo(ImmutableRecord): if function.name in self.resolved_functions and ( self.resolved_functions[function.name] == in_kernel_callable): # if not being edited, check that the given function is - # equal to the the old version of the callable. + # equal to the old version of the callable. return self, function else: print('Old: ', self.resolved_functions[function.name]) @@ -764,7 +805,7 @@ class ProgramCallablesInfo(ImmutableRecord): if isinstance(in_kernel_callable, CallableKernel) and ( in_kernel_callable.subkernel.is_called_from_host): - # special treatment if the callable is the root kernel + # do not rename root kernel pass else: while unique_function_identifier in self.resolved_functions: @@ -776,10 +817,6 @@ class ProgramCallablesInfo(ImmutableRecord): updated_resolved_functions[unique_function_identifier] = ( in_kernel_callable) - if 'strongVolumeKernelR_0' in updated_resolved_functions: - import pudb - pudb.set_trace() - history[unique_function_identifier] = ( history[function.name] | frozenset([unique_function_identifier])) @@ -791,39 +828,38 @@ class ProgramCallablesInfo(ImmutableRecord): def with_exit_edit_callables_mode(self, old_callables_count): """ - Returns a copy of *self* with renaming of the callables done whenver + Returns a copy of *self* with renaming of the callables done whenever possible. *For example: * If all the ``sin`` got diverged as ``sin_0, sin_1``, - then all the renaming is done such that one of flavors of the function + then all the renaming is done such that one of flavors of the callable is renamed back to ``sin``. 
""" + assert self.is_being_edited + new_callables_count = self.callables_count() - history = self.history.copy() - renames_needed = {} - assert self.is_being_edited + # {{{ calculate the renames needed - # NOTE:(to self by KK) - # all we need to do is change the name of the variables that were seen - # in old_callables_count but are no longer available. - # Using these 2 figure out the renames needed. + renames_needed = {} for old_func_id in old_callables_count-new_callables_count: # this implies that all the function instances having the name # "func_id" have been renamed to something else. for new_func_id in ( six.viewkeys(new_callables_count)-six.viewkeys(renames_needed)): - if old_func_id in history[new_func_id]: + if old_func_id in self.history[new_func_id]: renames_needed[new_func_id] = old_func_id break + # }}} - resolved_functions = {} + new_resolved_functions = {} + new_history = {} for func_id in new_callables_count: in_knl_callable = self.resolved_functions[func_id] if isinstance(in_knl_callable, CallableKernel): - # If callable kernel, perform renames. + # if callable kernel, perform renames inside its expressions. old_subkernel = in_knl_callable.subkernel new_subkernel = rename_resolved_functions_in_a_single_kernel( old_subkernel, renames_needed) @@ -836,19 +872,18 @@ class ProgramCallablesInfo(ImmutableRecord): type(in_knl_callable).__name__) if func_id in renames_needed: - # If function name itself in renames change the key of the - # dict. 
- history.pop(func_id) - new_func_id = renames_needed[func_id] - resolved_functions[new_func_id] = ( + new_resolved_functions[new_func_id] = ( in_knl_callable) + new_history[new_func_id] = self.history[func_id] else: - resolved_functions[func_id] = in_knl_callable + new_resolved_functions[func_id] = in_knl_callable + new_history[func_id] = self.history[func_id] return self.copy( is_being_edited=False, - resolved_functions=resolved_functions) + resolved_functions=new_resolved_functions, + history=new_history) # }}} @@ -874,18 +909,6 @@ class ProgramCallablesInfo(ImmutableRecord): # }}} -def default_func_id_to_kernel_callable_mappers(target): - """ - Returns a list of functions that are provided through *target* by deafault. - """ - # FIXME: name scopers is confusing!(change it to something else.) - - from loopy.library.function import loopy_specific_callable_scopers - return ( - [loopy_specific_callable_scopers] + ( - target.get_device_ast_builder().function_scopers())) - - # {{{ helper functions def make_program_from_kernel(kernel): @@ -902,7 +925,7 @@ def make_program_from_kernel(kernel): name=kernel.name, program_callables_info=program_callables_info, func_id_to_in_knl_callable_mappers=( - default_func_id_to_kernel_callable_mappers(kernel.target)), + _default_func_id_to_kernel_callable_mappers(kernel.target)), target=kernel.target) return program -- GitLab From a28164f965eedd1611752e9d7540d108c2ae8d76 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 15 Aug 2018 10:43:14 +0530 Subject: [PATCH 28/80] made callables count a property. --- loopy/preprocess.py | 2 +- loopy/program.py | 2 +- loopy/statistics.py | 8 ++++---- loopy/type_inference.py | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 41674ed9..44653316 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2273,7 +2273,7 @@ def infer_arg_descr(program): callables. 
""" root_kernel_callable = program.program_callables_info[program.name] - old_callables_count = program.program_callables_info.callables_count() + old_callables_count = program.program_callables_info.callables_count program_callables_info = ( program.program_callables_info.with_edit_callables_mode()) root_kernel = program.root_kernel diff --git a/loopy/program.py b/loopy/program.py index e5d033e0..bdf40a1b 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -838,7 +838,7 @@ class ProgramCallablesInfo(ImmutableRecord): assert self.is_being_edited - new_callables_count = self.callables_count() + new_callables_count = self.callables_count # {{{ calculate the renames needed diff --git a/loopy/statistics.py b/loopy/statistics.py index 3799967b..71a62986 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1397,7 +1397,7 @@ def get_op_map(program, numpy_types=True, count_redundant_work=False, op_map = ToCountMap() callables_count = ( - program.program_callables_info.callables_count()) + program.program_callables_info.callables_count) for func_id, in_knl_callable in program.program_callables_info.items(): if isinstance(in_knl_callable, CallableKernel): @@ -1684,7 +1684,7 @@ def get_mem_access_map(program, numpy_types=True, count_redundant_work=False, access_map = ToCountMap() - callables_count = program.program_callables_info.callables_count() + callables_count = program.program_callables_info.callables_count for func_id, in_knl_callable in program.program_callables_info.items(): if isinstance(in_knl_callable, CallableKernel): @@ -1808,7 +1808,7 @@ def get_synchronization_map(program, subgroup_size=None): program = preprocess_program(program) sync_map = ToCountMap() - callables_count = program.program_callables_info.callables_count() + callables_count = program.program_callables_info.callables_count for func_id, in_knl_callable in program.program_callables_info.items(): if isinstance(in_knl_callable, CallableKernel): @@ -1884,7 +1884,7 @@ def 
gather_access_footprints(program, ignore_uncountable=False): write_footprints = [] read_footprints = [] - callables_count = program.program_callables_info.callables_count() + callables_count = program.program_callables_info.callables_count for func_id, in_knl_callable in program.program_callables_info.items(): if isinstance(in_knl_callable, CallableKernel): diff --git a/loopy/type_inference.py b/loopy/type_inference.py index e5c17886..d5df36bf 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -1017,7 +1017,7 @@ def infer_unknown_types(program, expect_completion=False): program_callables_info[program.name]) type_uninferred_root_kernel = type_uninferred_knl_callable.subkernel - old_callables_count = program_callables_info.callables_count() + old_callables_count = program_callables_info.callables_count program_callables_info = ( program.program_callables_info.with_edit_callables_mode()) root_kernel, program_callables_info = ( -- GitLab From 621ef9f8c05abe5f9ba64adc2ecbeae9cdd92e58 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 15 Aug 2018 10:56:22 +0530 Subject: [PATCH 29/80] docs cleanup for Program --- loopy/program.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index bdf40a1b..236bbc44 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -222,10 +222,13 @@ class Program(ImmutableRecord): .. note:: - - To create an instance of :class:`loopy.Program`, it is recommeneded to + - To create an instance of :class:`loopy.Program`, it is recommended to go through :method:`loopy.make_kernel`. - This data structure and its attributes should be considered immutable, any modifications should be done through :method:`copy`. + + .. automethod:: __init__ + .. 
automethod:: with_root_kernel """ def __init__(self, name, @@ -329,7 +332,7 @@ class Program(ImmutableRecord): def root_kernel(self): """ Returns an instance of :class:`loopy.LoopKernel` denoting the topmost - level kernel in codegeneration. + level kernel. .. note:: @@ -577,6 +580,10 @@ class ProgramCallablesInfo(ImmutableRecord): .. automethod:: __init__ .. automethod:: callables_count + .. automethod:: with_added_callable + .. automethod:: with_edit_callables_mode + .. automethod:: with_callable + .. automethod:: with_exit_edit_callables_mode """ def __init__(self, resolved_functions, history=None, is_being_edited=False): -- GitLab From 8e64c24f8d0669faaca742138a1982cda56c52cf Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 15 Aug 2018 12:07:20 +0530 Subject: [PATCH 30/80] small error in docs. --- doc/tutorial.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 71b8f438..4c67e3d3 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -334,7 +334,7 @@ that these dependencies show up there, too: .. doctest:: - >>> print(knl.stringify(with_dependencies=True)) + >>> print(knl.root_kernel.stringify(with_dependencies=True)) --------------------------------------------------------------------------- KERNEL: loopy_kernel --------------------------------------------------------------------------- -- GitLab From 3293f6ae0b24ce1206487835ac52aeb37a06a174 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 15 Aug 2018 12:16:30 +0530 Subject: [PATCH 31/80] callable kernel no longer has a name. 
--- loopy/transform/fusion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py index b0d67764..44e69ecf 100644 --- a/loopy/transform/fusion.py +++ b/loopy/transform/fusion.py @@ -439,7 +439,7 @@ def fuse_kernels(programs, suffixes=None, data_flow=None): # main_program_callables_info, because of renaming is # needed to be done in the callable kernels before registering. # Hence disabling it until required. - if in_knl_callable.name != prog.name: + if in_knl_callable.subkernel.name != prog.name: raise LoopyError("fuse_kernels cannot fuse programs with " "multiple callable kernels.") -- GitLab From 70ada3da326053a6023fa050008284aec9d277eb Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 15 Aug 2018 12:32:00 +0530 Subject: [PATCH 32/80] minor changes in docs --- doc/tutorial.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 4c67e3d3..8e20dbc2 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1207,7 +1207,8 @@ This happens due to the kernel splitting done by :mod:`loopy`. The splitting happens when the instruction schedule is generated. To see the schedule, we should call :func:`loopy.get_one_scheduled_kernel`: - >>> knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl)) + >>> knl = lp.preprocess_kernel(knl) + >>> knl = lp.get_one_scheduled_kernel(knl.root_kernel, knl.program_callables_info) >>> print(knl) --------------------------------------------------------------------------- KERNEL: rotate_v2 @@ -1237,9 +1238,8 @@ function adds instructions to the kernel without scheduling them. That means that :func:`loopy.get_one_scheduled_kernel` needs to be called one more time to put those instructions into the schedule. 
- >>> knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl)) >>> knl = lp.save_and_reload_temporaries(knl) - >>> knl = lp.get_one_scheduled_kernel(knl) # Schedule added instructions + >>> knl = lp.get_one_scheduled_kernel(knl.root_kernel, knl.program_callables_info) # Schedule added instructions >>> print(knl) --------------------------------------------------------------------------- KERNEL: rotate_v2 -- GitLab From 66b9f4275979426e6e6c9ced76f51c4fc84ebc3a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 15 Aug 2018 12:49:01 +0530 Subject: [PATCH 33/80] Pass docs. --- doc/tutorial.rst | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 8e20dbc2..597240cc 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1179,7 +1179,7 @@ Let us start with an example. Consider the kernel from above with a .. doctest:: - >>> knl = lp.make_kernel( + >>> prog = lp.make_kernel( ... "[n] -> {[i] : 0<=i>> knl = lp.split_iname(knl, "i", 16, inner_tag="l.0", outer_tag="g.0") + >>> prog = lp.split_iname(prog, "i", 16, inner_tag="l.0", outer_tag="g.0") Here is what happens when we try to generate code for the kernel: - >>> cgr = lp.generate_code_v2(knl) + >>> cgr = lp.generate_code_v2(prog) Traceback (most recent call last): ... loopy.diagnostic.MissingDefinitionError: temporary variable 'tmp' gets used in subkernel 'rotate_v2_0' without a definition (maybe you forgot to call loopy.save_and_reload_temporaries?) @@ -1207,9 +1207,10 @@ This happens due to the kernel splitting done by :mod:`loopy`. The splitting happens when the instruction schedule is generated. 
To see the schedule, we should call :func:`loopy.get_one_scheduled_kernel`: - >>> knl = lp.preprocess_kernel(knl) - >>> knl = lp.get_one_scheduled_kernel(knl.root_kernel, knl.program_callables_info) - >>> print(knl) + >>> prog = lp.preprocess_kernel(prog) + >>> knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.program_callables_info) + >>> prog = prog.with_root_kernel(knl) + >>> print(prog) --------------------------------------------------------------------------- KERNEL: rotate_v2 --------------------------------------------------------------------------- @@ -1238,9 +1239,10 @@ function adds instructions to the kernel without scheduling them. That means that :func:`loopy.get_one_scheduled_kernel` needs to be called one more time to put those instructions into the schedule. - >>> knl = lp.save_and_reload_temporaries(knl) - >>> knl = lp.get_one_scheduled_kernel(knl.root_kernel, knl.program_callables_info) # Schedule added instructions - >>> print(knl) + >>> prog = lp.save_and_reload_temporaries(prog) + >>> knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.program_callables_info) # Schedule added instructions + >>> prog = prog.with_root_kernel(knl) + >>> print(prog) --------------------------------------------------------------------------- KERNEL: rotate_v2 --------------------------------------------------------------------------- @@ -1279,7 +1281,7 @@ does in more detail: The kernel translates into two OpenCL kernels. 
- >>> cgr = lp.generate_code_v2(knl) + >>> cgr = lp.generate_code_v2(prog) >>> print(cgr.device_code()) #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) -- GitLab From fba32ca309e7ac03bd521816a08dc98d9695c1df Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 15 Aug 2018 21:11:09 +0530 Subject: [PATCH 34/80] change credits of program.py --- loopy/program.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/program.py b/loopy/program.py index 236bbc44..54d13343 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -1,6 +1,6 @@ from __future__ import division, absolute_import -__copyright__ = "Copyright (C) 2012 Andreas Kloeckner" +__copyright__ = "Copyright (C) 2018 Kaushik Kulkarni" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy -- GitLab From 2636fe29c3e574ff14fb1f66764c5f6b34cc54cd Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 28 Aug 2018 21:30:11 -0500 Subject: [PATCH 35/80] better function naming, no more usage of "scoped" terminology. --- doc/ref_call.rst | 2 +- loopy/library/function.py | 16 +++++++++++++--- loopy/library/reduction.py | 2 +- loopy/program.py | 6 +++--- loopy/target/__init__.py | 2 +- loopy/target/c/__init__.py | 4 ++-- loopy/target/cuda.py | 4 ++-- loopy/target/opencl.py | 4 ++-- loopy/target/pyopencl.py | 4 ++-- loopy/target/python.py | 4 ++-- 10 files changed, 29 insertions(+), 19 deletions(-) diff --git a/doc/ref_call.rst b/doc/ref_call.rst index 4ff1ef2f..147363a1 100644 --- a/doc/ref_call.rst +++ b/doc/ref_call.rst @@ -180,7 +180,7 @@ Changes on the target side to accommodate the new function interface -------------------------------------------------------------------- The earlier "function\_mangler" as a member method of the class -``lp.ASTBuilderBase`` will be replaced by ``function_scopers``. The +``lp.ASTBuilderBase`` will be replaced by ``function_id_in_knl_callable_mapper``. 
The function scopers would return a list of functions with the signature ``(target, identifier)->lp.InKernelCallable``. diff --git a/loopy/library/function.py b/loopy/library/function.py index 8338875d..f3fb5f8c 100644 --- a/loopy/library/function.py +++ b/loopy/library/function.py @@ -55,15 +55,25 @@ class IndexOfCallable(ScalarCallable): program_callables_info) -def loopy_specific_callable_scopers(target, identifier): +def loopy_specific_callable_func_id_to_knl_callable_mappers(target, identifier): + """ + Returns an instance of :class:`InKernelCallable` for the *idenitifer* + which is not present in *target*, but whose interface is given by + :mod:`loo.py`. Callables that fall in this category are -- + + - reductions leading to function calls like ``argmin``, ``argmax``. + - callables that have a predefined meaning in :mod:`loo.py` like + ``make_tuple``, ``index_of``, ``indexof_vec``. + """ if identifier == "make_tuple": return MakeTupleCallable(name="make_tuple") if identifier in ["indexof", "indexof_vec"]: return IndexOfCallable(name=identifier) - from loopy.library.reduction import reduction_scoper - return reduction_scoper(target, identifier) + from loopy.library.reduction import ( + reduction_func_id_to_in_knl_callable_mapper) + return reduction_func_id_to_in_knl_callable_mapper(target, identifier) # vim: foldmethod=marker diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index b3deba65..70df864d 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -502,7 +502,7 @@ class ReductionCallable(ScalarCallable): return -def reduction_scoper(target, identifier): +def reduction_func_id_to_in_knl_callable_mapper(target, identifier): if isinstance(identifier, ReductionOpFunction): return ReductionCallable(name=identifier) diff --git a/loopy/program.py b/loopy/program.py index 54d13343..fd4ae63f 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -156,10 +156,10 @@ def _default_func_id_to_kernel_callable_mappers(target): 
""" # FIXME: the name -- scopers is no longer used!(change it) ~KK - from loopy.library.function import loopy_specific_callable_scopers + from loopy.library.function import loopy_specific_callable_func_id_to_knl_callable_mappers return ( - [loopy_specific_callable_scopers] + ( - target.get_device_ast_builder().function_scopers())) + [loopy_specific_callable_func_id_to_knl_callable_mappers] + ( + target.get_device_ast_builder().function_id_in_knl_callable_mapper())) def initialize_program_callables_info_from_kernel(kernel): diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index e3b4853c..92ee2dc5 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -150,7 +150,7 @@ class ASTBuilderBase(object): # {{{ library - def function_scopers(self): + def function_id_in_knl_callable_mapper(self): """ Returns an instance of list of the functions of signature ``(target, identifiers)`` returning either an instance of diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 1579bb31..418ce025 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -484,9 +484,9 @@ class CASTBuilder(ASTBuilderBase): _preamble_generator, ]) - def function_scopers(self): + def function_id_in_knl_callable_mapper(self): return ( - super(CASTBuilder, self).function_scopers() + [ + super(CASTBuilder, self).function_id_in_knl_callable_mapper() + [ scope_c_math_functions]) # }}} diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 89cbfd03..e6abf73f 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -274,9 +274,9 @@ class CUDACASTBuilder(CASTBuilder): # {{{ library - def function_scopers(self): + def function_id_in_knl_callable_mapper(self): return [scope_cuda_functions] + ( - super(CUDACASTBuilder, self).function_scopers()) + super(CUDACASTBuilder, self).function_id_in_knl_callable_mapper()) # }}} diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 44bf9c4c..d8c195de 100644 --- 
a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -442,10 +442,10 @@ class OpenCLTarget(CTarget): class OpenCLCASTBuilder(CASTBuilder): # {{{ library - def function_scopers(self): + def function_id_in_knl_callable_mapper(self): return ( [scope_opencl_functions] + super( - OpenCLCASTBuilder, self).function_scopers()) + OpenCLCASTBuilder, self).function_id_in_knl_callable_mapper()) def symbol_manglers(self): return ( diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 03ba2693..0e955648 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -792,11 +792,11 @@ class PyOpenCLCASTBuilder(OpenCLCASTBuilder): # {{{ library - def function_scopers(self): + def function_id_in_knl_callable_mapper(self): from loopy.library.random123 import random123_function_scoper return ( [pyopencl_function_scoper, random123_function_scoper] + super( - PyOpenCLCASTBuilder, self).function_scopers()) + PyOpenCLCASTBuilder, self).function_id_in_knl_callable_mapper()) def preamble_generators(self): return ([ diff --git a/loopy/target/python.py b/loopy/target/python.py index cd6e6116..0dbecce2 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -180,10 +180,10 @@ class PythonASTBuilderBase(ASTBuilderBase): # {{{ code generation guts - def function_scopers(self): + def function_id_in_knl_callable_mapper(self): from loopy.target.c import scope_c_math_functions return ( - super(PythonASTBuilderBase, self).function_scopers() + + super(PythonASTBuilderBase, self).function_id_in_knl_callable_mapper() + [scope_c_math_functions]) def preamble_generators(self): -- GitLab From d923227ed2d2557e0b3dcdc505546ada4069a142 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 28 Aug 2018 21:34:07 -0500 Subject: [PATCH 36/80] flake8 fixes after `sed` --- loopy/program.py | 6 ++++-- loopy/target/python.py | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index fd4ae63f..a18d9076 100644 --- 
a/loopy/program.py +++ b/loopy/program.py @@ -156,10 +156,12 @@ def _default_func_id_to_kernel_callable_mappers(target): """ # FIXME: the name -- scopers is no longer used!(change it) ~KK - from loopy.library.function import loopy_specific_callable_func_id_to_knl_callable_mappers + from loopy.library.function import ( + loopy_specific_callable_func_id_to_knl_callable_mappers) return ( [loopy_specific_callable_func_id_to_knl_callable_mappers] + ( - target.get_device_ast_builder().function_id_in_knl_callable_mapper())) + target.get_device_ast_builder().function_id_in_knl_callable_mapper( + ))) def initialize_program_callables_info_from_kernel(kernel): diff --git a/loopy/target/python.py b/loopy/target/python.py index 0dbecce2..2e6712ec 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -183,7 +183,8 @@ class PythonASTBuilderBase(ASTBuilderBase): def function_id_in_knl_callable_mapper(self): from loopy.target.c import scope_c_math_functions return ( - super(PythonASTBuilderBase, self).function_id_in_knl_callable_mapper() + + super(PythonASTBuilderBase, + self).function_id_in_knl_callable_mapper() + [scope_c_math_functions]) def preamble_generators(self): -- GitLab From 906e1e2eb9a2ee0e850d28f57cccdb5e904ffd57 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 28 Aug 2018 21:35:03 -0500 Subject: [PATCH 37/80] replaces unnecessary old logic in unscoped_call_collector. --- loopy/check.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index ae5599bc..7033b62d 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -68,10 +68,6 @@ class UnscopedCallCollector(CombineMapper): :returns: An :class:`frozenset` of function names that are not scoped in the kernel. - - .. note:: - :class:`loopy.library.reduction.ArgExtOp` are ignored, as they are - never scoped in the pipeline. 
""" def combine(self, values): @@ -85,8 +81,7 @@ class UnscopedCallCollector(CombineMapper): kw_parameters={})) def map_call_with_kwargs(self, expr): - from loopy.library.reduction import ArgExtOp - if not isinstance(expr.function, (ResolvedFunction, ArgExtOp)): + if not isinstance(expr.function, ResolvedFunction): return (frozenset([expr.function.name]) | self.combine((self.rec(child) for child in expr.parameters + tuple(expr.kw_parameters.values())))) -- GitLab From eeae2d861228796110337b8b5ccacddf84b53543 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 28 Aug 2018 22:00:36 -0500 Subject: [PATCH 38/80] Comment rewording, scoper-> function_id_to_in_knl_callable_mapper --- doc/ref_call.rst | 6 +++--- loopy/check.py | 4 ++-- loopy/kernel/__init__.py | 2 +- loopy/kernel/function_interface.py | 2 +- loopy/library/random123.py | 2 +- loopy/target/pyopencl.py | 8 +++++--- 6 files changed, 13 insertions(+), 11 deletions(-) diff --git a/doc/ref_call.rst b/doc/ref_call.rst index 147363a1..ab810137 100644 --- a/doc/ref_call.rst +++ b/doc/ref_call.rst @@ -30,7 +30,7 @@ kernel, whose name has been resolved by the kernel. The process of matching a function idenitifier with the function definition is called "resolving". A pymbolic ``Call`` node can be converted to a ``ResolvedFunction`` if it -is "resolved" by one of the ``function_scoper`` in a +is "resolved" by one of the ``function_id_to_in_knl_callable_mapper`` in a :attr:`LoopKernel.scoped_functions` - Functions already registered by the target. Some examples include -- @@ -41,11 +41,11 @@ is "resolved" by one of the ``function_scoper`` in a - Functions registered as ``CallableKernels`` using ``lp.register_callable_kernel(...)``. - Functions that have been provided through - ``lp.register_function_scoper(...)`` + ``lp.register_function_id_to_in_knl_callable_mapper(...)`` - Functions that can be made known from the user through ``lp.register_function_mangler``. 
This is planned to be deprecated, as its functionality is superseded by - ``lp.register_function_scoper(...)``. + ``lp.register_function_id_to_in_knl_callable_mapper(...)``. Expressions after a function is scoped -------------------------------------- diff --git a/loopy/check.py b/loopy/check.py index 7033b62d..76a56c08 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -181,8 +181,8 @@ def check_loop_priority_inames_known(kernel): def _get_all_unique_iname_tags(kernel): - """Returns a set of all the iname tags used in *kernel* that - inherit from :class:`loopy.kernel.data.UniqueTag`. + """Returns an instance of :class:`set` of all the iname tags used in + *kernel* that inherit from :class:`loopy.kernel.data.UniqueTag`. """ from loopy.kernel.data import UniqueTag iname_tags = [kernel.iname_to_tag.get(iname) for iname in diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 8b2cf3dd..410f1332 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -223,7 +223,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): .. attribute:: is_called_from_host An instance of :class:`bool`. Will be set *False* for the kernel which - would be called from another top level kernels. Default value is + would be called from other top level kernels. Default value is *True*. """ diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index e4e8c1d5..c8b5a953 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -287,7 +287,7 @@ class InKernelCallable(ImmutableRecord): def with_hw_axes_sizes(self, local_size, global_size): """ Returns a copy of *self* with modifications to comply with the grid - sizes ``(local_size, global_size)`` of the kernel in which it is + sizes ``(local_size, global_size)`` of the program in which it is supposed to be called. :arg local_size: An instance of :class:`islpy.PwAff`. 
diff --git a/loopy/library/random123.py b/loopy/library/random123.py index 59ca72df..397e985b 100644 --- a/loopy/library/random123.py +++ b/loopy/library/random123.py @@ -231,7 +231,7 @@ class Random123Callable(ScalarCallable): return -def random123_function_scoper(target, identifier): +def random123_function_id_to_in_knl_callable_mapper(target, identifier): if identifier in FUNC_NAMES_TO_RNG: return Random123Callable(name=identifier) diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 0e955648..435a5e79 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -274,7 +274,7 @@ class PyOpenCLCallable(ScalarCallable): program_callables_info) -def pyopencl_function_scoper(target, identifier): +def pyopencl_function_id_to_in_knl_callable_mapper(target, identifier): if identifier in ["sqrt", "exp", "log", "sin", "cos", "tan", "sinh", "cosh", "tanh", "conj", "real", "imag", "abs"]: return PyOpenCLCallable(name=identifier) @@ -793,9 +793,11 @@ class PyOpenCLCASTBuilder(OpenCLCASTBuilder): # {{{ library def function_id_in_knl_callable_mapper(self): - from loopy.library.random123 import random123_function_scoper + from loopy.library.random123 import ( + random123_function_id_to_in_knl_callable_mapper) return ( - [pyopencl_function_scoper, random123_function_scoper] + super( + [pyopencl_function_id_to_in_knl_callable_mapper, + random123_function_id_to_in_knl_callable_mapper] + super( PyOpenCLCASTBuilder, self).function_id_in_knl_callable_mapper()) def preamble_generators(self): -- GitLab From 481573be0b9ebca023ce2994ed866c66cb85d6e3 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 28 Aug 2018 22:02:41 -0500 Subject: [PATCH 39/80] removes FIXME. 
--- loopy/program.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index a18d9076..161249e0 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -154,8 +154,6 @@ def _default_func_id_to_kernel_callable_mappers(target): """ Returns a list of functions that are provided through *target* by deafault. """ - # FIXME: the name -- scopers is no longer used!(change it) ~KK - from loopy.library.function import ( loopy_specific_callable_func_id_to_knl_callable_mappers) return ( -- GitLab From 46d1502bf2372803eaaa0483a07190d4cfef60cd Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 28 Aug 2018 22:34:27 -0500 Subject: [PATCH 40/80] adds a comment that the ref_call needs one more revamping, removed unnecessary fixme in type_inference, some other minor comment rewording. --- doc/ref_call.rst | 2 ++ loopy/program.py | 14 +++++++++----- loopy/statistics.py | 4 ++-- loopy/type_inference.py | 2 -- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/doc/ref_call.rst b/doc/ref_call.rst index ab810137..5a59e842 100644 --- a/doc/ref_call.rst +++ b/doc/ref_call.rst @@ -4,6 +4,8 @@ Calling Loopy Kernels and External Functions Goals of a function interface ----------------------------- +- *FIXME: * Needs to change after the new design of program. + - Must be able to have complete information of the function just through the epxression node. - Must adhere to :mod:`loopy` semantics of immutability. diff --git a/loopy/program.py b/loopy/program.py index 161249e0..7479ee04 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -556,6 +556,8 @@ def count_callables_in_kernel(kernel, program_callables_info): # {{{ program callables info class ProgramCallablesInfo(ImmutableRecord): + # FIXME: is CallablesTable a better name?(similar to symbol table in + # compilers.) """ Records the information of all the callables called in a :class:`loopy.Program`. 
@@ -637,8 +639,11 @@ class ProgramCallablesInfo(ImmutableRecord): def with_added_callable(self, function, in_kernel_callable): """ - Returns a copy of *self* with the *function* associated with the - *in_kernel_callable*. + Returns an instance of :class:`tuple` of ``(new_self, new_function)``. + ``new_self`` is a copy of *self* with the *function* associated with the + *in_kernel_callable*. ``new_function`` is the function identifier that + should be noted in the expression node so that it could be associated + with an instance of :class:`InKernelCallable`. .. note:: @@ -739,9 +744,8 @@ class ProgramCallablesInfo(ImmutableRecord): def with_callable(self, function, in_kernel_callable): """ - Returns a copy of *self* with the *function* associated with the - *in_kernel_callable*. Also refer -- - :meth:`loopy.ProgramCallablesInfo.with_added_callable` + Returns an instance of :class:`tuple` ``(new_self, new_function)``. + Also refer -- :meth:`loopy.ProgramCallablesInfo.with_added_callable` :arg function: An instance of :class:`pymbolic.primitives.Variable` or diff --git a/loopy/statistics.py b/loopy/statistics.py index 71a62986..000f651a 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -61,8 +61,8 @@ __doc__ = """ # FIXME: this is broken for the callable kernel design. -# Qns: -# - The variable name, what if multiple kernels use the same name? +# - The variable name, what if multiple kernels use the same name?(needs a +# different MemAccessInfo) # - We should also add the cumulative effect on the arguments of callee kernels # into the caller kernel # - Make changes to MemAccessInfo to include the effect of several kernels. 
diff --git a/loopy/type_inference.py b/loopy/type_inference.py index d5df36bf..a2174181 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -969,8 +969,6 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, if isinstance(insn, lp.MultiAssignmentBase): # just a dummy run over the expression, to pass over all the # functions - # FIXME: need a check over here which checks the instruction for - # unseen cases if _instruction_missed_during_inference(insn): type_inf_mapper(insn.expression, return_tuple=isinstance(insn, lp.CallInstruction), return_dtype_set=True) -- GitLab From e5b0303aea50dbbea889c0f16f2bea724c8c8fa1 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 31 Aug 2018 19:36:54 -0400 Subject: [PATCH 41/80] actually use `for_atomic` in the constructor --- loopy/kernel/array.py | 1 + 1 file changed, 1 insertion(+) diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py index 186597c6..6bf733a8 100644 --- a/loopy/kernel/array.py +++ b/loopy/kernel/array.py @@ -833,6 +833,7 @@ class ArrayBase(ImmutableRecord): dim_names=dim_names, order=order, alignment=alignment, + for_atomic=for_atomic, **kwargs) def __eq__(self, other): -- GitLab From 5137aded65aa9e7f55219eba4b66b86055a4f627 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 1 Sep 2018 07:33:45 -0500 Subject: [PATCH 42/80] actually update the dtype target, for array base sub-classes. 
--- loopy/preprocess.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index fc950c78..2afcd3db 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -53,7 +53,7 @@ def prepare_for_caching(kernel): for arg in kernel.args: dtype = arg.dtype if dtype is not None and dtype is not lp.auto and dtype.target is not tgt: - arg = arg.copy(dtype=dtype.with_target(kernel.target)) + arg = arg.copy(dtype=dtype.with_target(tgt), target=tgt) new_args.append(arg) @@ -61,7 +61,7 @@ def prepare_for_caching(kernel): for name, temp in six.iteritems(kernel.temporary_variables): dtype = temp.dtype if dtype is not None and dtype is not lp.auto and dtype.target is not tgt: - temp = temp.copy(dtype=dtype.with_target(tgt)) + temp = temp.copy(dtype=dtype.with_target(tgt), target=tgt) new_temporary_variables[name] = temp -- GitLab From d6b4b615ecf049314a75fc2662eef8068cf99f6a Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Mon, 3 Sep 2018 18:57:10 -0500 Subject: [PATCH 43/80] removed commented out code w/previous count granularity --- loopy/statistics.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index f8999367..c233ab09 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -715,7 +715,6 @@ class ExpressionOpCounter(CounterBase): return ToCountMap( {Op(dtype=self.type_inf(expr), name='func:'+str(expr.function), - #count_granularity=CountGranularity.WORKITEM): 1} count_granularity=CountGranularity.SUBGROUP): 1} ) + self.rec(expr.parameters) @@ -727,7 +726,6 @@ class ExpressionOpCounter(CounterBase): return ToCountMap( {Op(dtype=self.type_inf(expr), name='add', - #count_granularity=CountGranularity.WORKITEM): count_granularity=CountGranularity.SUBGROUP): len(expr.children)-1} ) + sum(self.rec(child) for child in expr.children) @@ -737,20 +735,17 @@ class ExpressionOpCounter(CounterBase): assert expr.children return 
sum(ToCountMap({Op(dtype=self.type_inf(expr), name='mul', - #count_granularity=CountGranularity.WORKITEM): 1}) count_granularity=CountGranularity.SUBGROUP): 1}) + self.rec(child) for child in expr.children if not is_zero(child + 1)) + \ ToCountMap({Op(dtype=self.type_inf(expr), name='mul', - #count_granularity=CountGranularity.WORKITEM): -1}) count_granularity=CountGranularity.SUBGROUP): -1}) def map_quotient(self, expr, *args): return ToCountMap({Op(dtype=self.type_inf(expr), name='div', - #count_granularity=CountGranularity.WORKITEM): 1}) \ count_granularity=CountGranularity.SUBGROUP): 1}) \ + self.rec(expr.numerator) \ + self.rec(expr.denominator) @@ -761,7 +756,6 @@ class ExpressionOpCounter(CounterBase): def map_power(self, expr): return ToCountMap({Op(dtype=self.type_inf(expr), name='pow', - #count_granularity=CountGranularity.WORKITEM): 1}) \ count_granularity=CountGranularity.SUBGROUP): 1}) \ + self.rec(expr.base) \ + self.rec(expr.exponent) @@ -769,7 +763,6 @@ class ExpressionOpCounter(CounterBase): def map_left_shift(self, expr): return ToCountMap({Op(dtype=self.type_inf(expr), name='shift', - #count_granularity=CountGranularity.WORKITEM): 1}) \ count_granularity=CountGranularity.SUBGROUP): 1}) \ + self.rec(expr.shiftee) \ + self.rec(expr.shift) @@ -779,14 +772,12 @@ class ExpressionOpCounter(CounterBase): def map_bitwise_not(self, expr): return ToCountMap({Op(dtype=self.type_inf(expr), name='bw', - #count_granularity=CountGranularity.WORKITEM): 1}) \ count_granularity=CountGranularity.SUBGROUP): 1}) \ + self.rec(expr.child) def map_bitwise_or(self, expr): return ToCountMap({Op(dtype=self.type_inf(expr), name='bw', - #count_granularity=CountGranularity.WORKITEM): count_granularity=CountGranularity.SUBGROUP): len(expr.children)-1}) \ + sum(self.rec(child) for child in expr.children) @@ -811,7 +802,6 @@ class ExpressionOpCounter(CounterBase): def map_min(self, expr): return ToCountMap({Op(dtype=self.type_inf(expr), name='maxmin', - 
#count_granularity=CountGranularity.WORKITEM): count_granularity=CountGranularity.SUBGROUP): len(expr.children)-1}) \ + sum(self.rec(child) for child in expr.children) @@ -928,7 +918,6 @@ class LocalMemAccessCounter(MemAccessCounter): sub_map[MemAccess( mtype='local', dtype=dtype, - #count_granularity=CountGranularity.WORKITEM) count_granularity=CountGranularity.SUBGROUP) ] = 1 return sub_map @@ -949,7 +938,6 @@ class LocalMemAccessCounter(MemAccessCounter): lid_strides=dict(sorted(six.iteritems(lid_strides))), gid_strides=dict(sorted(six.iteritems(gid_strides))), variable=name, - #count_granularity=CountGranularity.WORKITEM)] = 1 count_granularity=CountGranularity.SUBGROUP)] = 1 return sub_map -- GitLab From bd9973f33cbbbf9faf75d1fc71ec9ecbca36ed9a Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Mon, 3 Sep 2018 19:47:35 -0500 Subject: [PATCH 44/80] combined duplicate implementations of get_insn_count into single function --- loopy/statistics.py | 185 +++++++++++++++----------------------------- 1 file changed, 63 insertions(+), 122 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index c233ab09..194775db 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -32,7 +32,7 @@ from functools import reduce from loopy.kernel.data import ( MultiAssignmentBase, TemporaryVariable, AddressSpace) from loopy.diagnostic import warn_with_kernel, LoopyError -from pytools import Record +from pytools import Record, memoize_method __doc__ = """ @@ -1255,6 +1255,59 @@ def count_insn_runs(knl, insn, count_redundant_work, disregard_local_axes=False) else: return c + +@memoize_method +def _get_insn_count(knl, insn_id, subgroup_size, count_redundant_work, + count_granularity=CountGranularity.WORKITEM): + insn = knl.id_to_insn[insn_id] + + if count_granularity is None: + warn_with_kernel(knl, "get_insn_count_assumes_granularity", + "get_insn_count: No count granularity passed, " + "assuming %s granularity." 
+ % (CountGranularity.WORKITEM)) + count_granularity == CountGranularity.WORKITEM + + if count_granularity == CountGranularity.WORKITEM: + return count_insn_runs( + knl, insn, count_redundant_work=count_redundant_work, + disregard_local_axes=False) + + ct_disregard_local = count_insn_runs( + knl, insn, disregard_local_axes=True, + count_redundant_work=count_redundant_work) + + if count_granularity == CountGranularity.WORKGROUP: + return ct_disregard_local + elif count_granularity == CountGranularity.SUBGROUP: + # get the group size + from loopy.symbolic import aff_to_expr + _, local_size = knl.get_grid_size_upper_bounds() + workgroup_size = 1 + if local_size: + for size in local_size: + s = aff_to_expr(size) + if not isinstance(s, int): + raise LoopyError("Cannot count insn with %s granularity, " + "work-group size is not integer: %s" + % (CountGranularity.SUBGROUP, local_size)) + workgroup_size *= s + + warn_with_kernel(knl, "insn_count_subgroups_upper_bound", + "get_insn_count: when counting instruction %s with " + "count_granularity=%s, using upper bound for work-group size " + "(%d work-items) to compute sub-groups per work-group. When " + "multiple device programs present, actual sub-group count may be" + "lower." % (insn_id, CountGranularity.SUBGROUP, workgroup_size)) + + from pytools import div_ceil + return ct_disregard_local*div_ceil(workgroup_size, subgroup_size) + else: + # this should not happen since this is enforced in Op/MemAccess + raise ValueError("get_insn_count: count_granularity '%s' is" + "not allowed. count_granularity options: %s" + % (count_granularity, CountGranularity.ALL+[None])) + # }}} @@ -1360,77 +1413,18 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False, "must be integer, 'guess', or, if you're feeling " "lucky, None." 
% (subgroup_size)) - # ------------------------------ - #class CacheHolder(object): - # pass - - #cache_holder = CacheHolder() - #from pytools import memoize_in - - #@memoize_in(cache_holder, "insn_count") - def get_insn_count(knl, insn, count_granularity=CountGranularity.WORKITEM): - - if count_granularity is None: - warn_with_kernel(knl, "get_insn_count_assumes_granularity", - "get_insn_count: No count granularity passed for " - "Op, assuming %s granularity." - % (CountGranularity.WORKITEM)) - count_granularity == CountGranularity.WORKITEM - - if count_granularity == CountGranularity.WORKITEM: - return count_insn_runs( - knl, insn, count_redundant_work=count_redundant_work, - disregard_local_axes=False) - - ct_disregard_local = count_insn_runs( - knl, insn, disregard_local_axes=True, - count_redundant_work=count_redundant_work) - - if count_granularity == CountGranularity.WORKGROUP: - return ct_disregard_local - elif count_granularity == CountGranularity.SUBGROUP: - # get the group size - from loopy.symbolic import aff_to_expr - _, local_size = knl.get_grid_size_upper_bounds() - workgroup_size = 1 - if local_size: - for size in local_size: - s = aff_to_expr(size) - if not isinstance(s, int): - raise LoopyError("Cannot count insn with %s granularity, " - "work-group size is not integer: %s" - % (CountGranularity.SUBGROUP, local_size)) - workgroup_size *= s - - warn_with_kernel(knl, "insn_count_subgroups_upper_bound", - "get_insn_count: when counting instruction %s with " - "count_granularity=%s, using upper bound for work-group size " - "(%d work-items) to compute sub-groups per work-group. When " - "multiple device programs present, actual sub-group count may be" - "lower." 
% (insn, CountGranularity.SUBGROUP, workgroup_size)) - - from pytools import div_ceil - return ct_disregard_local*div_ceil(workgroup_size, subgroup_size) - else: - # this should not happen since this is enforced in Op - raise ValueError("get_insn_count: count_granularity '%s' is" - "not allowed. count_granularity options: %s" - % (count_granularity, CountGranularity.ALL+[None])) - # ------------------------------ - op_map = ToCountMap() op_counter = ExpressionOpCounter(knl) for insn in knl.instructions: if isinstance(insn, (CallInstruction, CInstruction, Assignment)): ops = op_counter(insn.assignee) + op_counter(insn.expression) - #op_map = op_map + ops*count_insn_runs( - # knl, insn, - # count_redundant_work=count_redundant_work) for key, val in six.iteritems(ops): op_map = ( op_map + ToCountMap({key: val}) - * get_insn_count(knl, insn, key.count_granularity)) + * _get_insn_count(knl, insn.id, subgroup_size, + count_redundant_work, + key.count_granularity)) elif isinstance(insn, (NoOpInstruction, BarrierInstruction)): pass @@ -1594,63 +1588,6 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, "must be integer, 'guess', or, if you're feeling " "lucky, None." % (subgroup_size)) - class CacheHolder(object): - pass - - cache_holder = CacheHolder() - from pytools import memoize_in - - @memoize_in(cache_holder, "insn_count") - def get_insn_count(knl, insn_id, count_granularity=CountGranularity.WORKITEM): - insn = knl.id_to_insn[insn_id] - - if count_granularity is None: - warn_with_kernel(knl, "get_insn_count_assumes_granularity", - "get_insn_count: No count granularity passed for " - "MemAccess, assuming %s granularity." 
- % (CountGranularity.WORKITEM)) - count_granularity == CountGranularity.WORKITEM - - if count_granularity == CountGranularity.WORKITEM: - return count_insn_runs( - knl, insn, count_redundant_work=count_redundant_work, - disregard_local_axes=False) - - ct_disregard_local = count_insn_runs( - knl, insn, disregard_local_axes=True, - count_redundant_work=count_redundant_work) - - if count_granularity == CountGranularity.WORKGROUP: - return ct_disregard_local - elif count_granularity == CountGranularity.SUBGROUP: - # get the group size - from loopy.symbolic import aff_to_expr - _, local_size = knl.get_grid_size_upper_bounds() - workgroup_size = 1 - if local_size: - for size in local_size: - s = aff_to_expr(size) - if not isinstance(s, int): - raise LoopyError("Cannot count insn with %s granularity, " - "work-group size is not integer: %s" - % (CountGranularity.SUBGROUP, local_size)) - workgroup_size *= s - - warn_with_kernel(knl, "insn_count_subgroups_upper_bound", - "get_insn_count: when counting instruction %s with " - "count_granularity=%s, using upper bound for work-group size " - "(%d work-items) to compute sub-groups per work-group. When " - "multiple device programs present, actual sub-group count may be" - "lower." % (insn_id, CountGranularity.SUBGROUP, workgroup_size)) - - from pytools import div_ceil - return ct_disregard_local*div_ceil(workgroup_size, subgroup_size) - else: - # this should not happen since this is enforced in MemAccess - raise ValueError("get_insn_count: count_granularity '%s' is" - "not allowed. 
count_granularity options: %s" - % (count_granularity, CountGranularity.ALL+[None])) - knl = infer_unknown_types(knl, expect_completion=True) knl = preprocess_kernel(knl) @@ -1679,14 +1616,18 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, access_map = ( access_map + ToCountMap({key: val}) - * get_insn_count(knl, insn.id, key.count_granularity)) + * _get_insn_count(knl, insn.id, subgroup_size, + count_redundant_work, + key.count_granularity)) for key, val in six.iteritems(access_assignee.count_map): access_map = ( access_map + ToCountMap({key: val}) - * get_insn_count(knl, insn.id, key.count_granularity)) + * _get_insn_count(knl, insn.id, subgroup_size, + count_redundant_work, + key.count_granularity)) elif isinstance(insn, (NoOpInstruction, BarrierInstruction)): pass -- GitLab From de02e453215cec23136b31bcb3741bbe881f6b04 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Mon, 3 Sep 2018 19:53:01 -0500 Subject: [PATCH 45/80] moved import statements in get_op_map and get_mem_access_map closer to where they are used --- loopy/statistics.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 194775db..02b79a81 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1375,13 +1375,6 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False, raise LoopyError("Kernel '%s': Using operation counting requires the option " "ignore_boostable_into to be set." 
% knl.name) - from loopy.preprocess import preprocess_kernel, infer_unknown_types - from loopy.kernel.instruction import ( - CallInstruction, CInstruction, Assignment, - NoOpInstruction, BarrierInstruction) - knl = infer_unknown_types(knl, expect_completion=True) - knl = preprocess_kernel(knl) - if not isinstance(subgroup_size, int): # try to find subgroup_size subgroup_size_guess = _find_subgroup_size_for_knl(knl) @@ -1413,8 +1406,17 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False, "must be integer, 'guess', or, if you're feeling " "lucky, None." % (subgroup_size)) + from loopy.preprocess import preprocess_kernel, infer_unknown_types + knl = infer_unknown_types(knl, expect_completion=True) + knl = preprocess_kernel(knl) + op_map = ToCountMap() op_counter = ExpressionOpCounter(knl) + + from loopy.kernel.instruction import ( + CallInstruction, CInstruction, Assignment, + NoOpInstruction, BarrierInstruction) + for insn in knl.instructions: if isinstance(insn, (CallInstruction, CInstruction, Assignment)): ops = op_counter(insn.assignee) + op_counter(insn.expression) @@ -1551,7 +1553,6 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, # (now use these counts to, e.g., predict performance) """ - from loopy.preprocess import preprocess_kernel, infer_unknown_types if not knl.options.ignore_boostable_into: raise LoopyError("Kernel '%s': Using operation counting requires the option " @@ -1588,6 +1589,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, "must be integer, 'guess', or, if you're feeling " "lucky, None." 
% (subgroup_size)) + from loopy.preprocess import preprocess_kernel, infer_unknown_types knl = infer_unknown_types(knl, expect_completion=True) knl = preprocess_kernel(knl) -- GitLab From 0cf111d21244988e976ac7d6a26591b22f60cd11 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Mon, 3 Sep 2018 20:23:06 -0500 Subject: [PATCH 46/80] combined duplicate code processing subgroup_size into single function --- loopy/statistics.py | 100 ++++++++++++++++++-------------------------- 1 file changed, 40 insertions(+), 60 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 02b79a81..f71e1d91 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1375,36 +1375,7 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False, raise LoopyError("Kernel '%s': Using operation counting requires the option " "ignore_boostable_into to be set." % knl.name) - if not isinstance(subgroup_size, int): - # try to find subgroup_size - subgroup_size_guess = _find_subgroup_size_for_knl(knl) - - if subgroup_size is None: - if subgroup_size_guess is None: - # 'guess' was not passed and either no target device found - # or get_simd_group_size returned None - raise ValueError("No sub-group size passed, no target device found. " - "Either (1) pass integer value for subgroup_size, " - "(2) ensure that kernel.target is PyOpenClTarget " - "and kernel.target.device is set, or (3) pass " - "subgroup_size='guess' and hope for the best.") - else: - subgroup_size = subgroup_size_guess - - elif subgroup_size == 'guess': - if subgroup_size_guess is None: - # unable to get subgroup_size from device, so guess - subgroup_size = 32 - warn_with_kernel(knl, "get_op_map_guessing_subgroup_size", - "get_op_map: 'guess' sub-group size " - "passed, no target device found, wildly guessing " - "that sub-group size is %d." % (subgroup_size)) - else: - subgroup_size = subgroup_size_guess - else: - raise ValueError("Invalid value for subgroup_size: %s. 
subgroup_size " - "must be integer, 'guess', or, if you're feeling " - "lucky, None." % (subgroup_size)) + subgroup_size = _process_subgroup_size(knl, subgroup_size) from loopy.preprocess import preprocess_kernel, infer_unknown_types knl = infer_unknown_types(knl, expect_completion=True) @@ -1465,6 +1436,44 @@ def _find_subgroup_size_for_knl(knl): return None +@memoize_method +def _process_subgroup_size(knl, subgroup_size_requested): + + if isinstance(subgroup_size_requested, int): + return subgroup_size_requested + else: + # try to find subgroup_size + subgroup_size_guess = _find_subgroup_size_for_knl(knl) + + if subgroup_size_requested is None: + if subgroup_size_guess is None: + # 'guess' was not passed and either no target device found + # or get_simd_group_size returned None + raise ValueError("No sub-group size passed, no target device found. " + "Either (1) pass integer value for subgroup_size, " + "(2) ensure that kernel.target is PyOpenClTarget " + "and kernel.target.device is set, or (3) pass " + "subgroup_size='guess' and hope for the best.") + else: + return subgroup_size_guess + + elif subgroup_size_requested == 'guess': + if subgroup_size_guess is None: + # unable to get subgroup_size from device, so guess + subgroup_size_guess = 32 + warn_with_kernel(knl, "get_x_map_guessing_subgroup_size", + "'guess' sub-group size passed, no target device " + "found, wildly guessing that sub-group size is %d." + % (subgroup_size_guess)) + return subgroup_size_guess + else: + return subgroup_size_guess + else: + raise ValueError("Invalid value for subgroup_size: %s. subgroup_size " + "must be integer, 'guess', or, if you're feeling " + "lucky, None." 
% (subgroup_size_requested)) + + # {{{ get_mem_access_map def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, @@ -1558,36 +1567,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, raise LoopyError("Kernel '%s': Using operation counting requires the option " "ignore_boostable_into to be set." % knl.name) - if not isinstance(subgroup_size, int): - # try to find subgroup_size - subgroup_size_guess = _find_subgroup_size_for_knl(knl) - - if subgroup_size is None: - if subgroup_size_guess is None: - # 'guess' was not passed and either no target device found - # or get_simd_group_size returned None - raise ValueError("No sub-group size passed, no target device found. " - "Either (1) pass integer value for subgroup_size, " - "(2) ensure that kernel.target is PyOpenClTarget " - "and kernel.target.device is set, or (3) pass " - "subgroup_size='guess' and hope for the best.") - else: - subgroup_size = subgroup_size_guess - - elif subgroup_size == 'guess': - if subgroup_size_guess is None: - # unable to get subgroup_size from device, so guess - subgroup_size = 32 - warn_with_kernel(knl, "get_mem_access_map_guessing_subgroup_size", - "get_mem_access_map: 'guess' sub-group size " - "passed, no target device found, wildly guessing " - "that sub-group size is %d." % (subgroup_size)) - else: - subgroup_size = subgroup_size_guess - else: - raise ValueError("Invalid value for subgroup_size: %s. subgroup_size " - "must be integer, 'guess', or, if you're feeling " - "lucky, None." 
% (subgroup_size)) + subgroup_size = _process_subgroup_size(knl, subgroup_size) from loopy.preprocess import preprocess_kernel, infer_unknown_types knl = infer_unknown_types(knl, expect_completion=True) -- GitLab From b9d8034bb415ca91a2d392db51d278d8b34de0c0 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Mon, 3 Sep 2018 20:30:02 -0500 Subject: [PATCH 47/80] added subgroup_size arg to get_op_map in tutorial --- doc/tutorial.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 71b8f438..3019a47a 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1551,7 +1551,7 @@ information provided. Now we will count the operations: .. doctest:: - >>> op_map = lp.get_op_map(knl) + >>> op_map = lp.get_op_map(knl, subgroup_size=32) >>> print(lp.stringify_stats_mapping(op_map)) Op(np:dtype('float32'), add, workitem) : ... -- GitLab From eb4cfc8ad59d84bbc21cf3c719680e8fdda3c857 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Mon, 3 Sep 2018 20:45:21 -0500 Subject: [PATCH 48/80] updated doctests to reflect count granularity change for local ops (workitem->subgroup) --- doc/tutorial.rst | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 3019a47a..1272d2a5 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1553,11 +1553,11 @@ information provided. Now we will count the operations: >>> op_map = lp.get_op_map(knl, subgroup_size=32) >>> print(lp.stringify_stats_mapping(op_map)) - Op(np:dtype('float32'), add, workitem) : ... + Op(np:dtype('float32'), add, subgroup) : ... Each line of output will look roughly like:: - Op(np:dtype('float32'), add, workitem) : [l, m, n] -> { l * m * n : l > 0 and m > 0 and n > 0 } + Op(np:dtype('float32'), add, subgroup) : [l, m, n] -> { l * m * n : l > 0 and m > 0 and n > 0 } :func:`loopy.get_op_map` returns a :class:`loopy.ToCountMap` of **{** :class:`loopy.Op` **:** :class:`islpy.PwQPolynomial` **}**. 
A @@ -1579,12 +1579,12 @@ One way to evaluate these polynomials is with :func:`islpy.eval_with_dict`: >>> param_dict = {'n': 256, 'm': 256, 'l': 8} >>> from loopy.statistics import CountGranularity as CG - >>> f32add = op_map[lp.Op(np.float32, 'add', CG.WORKITEM)].eval_with_dict(param_dict) - >>> f32div = op_map[lp.Op(np.float32, 'div', CG.WORKITEM)].eval_with_dict(param_dict) - >>> f32mul = op_map[lp.Op(np.float32, 'mul', CG.WORKITEM)].eval_with_dict(param_dict) - >>> f64add = op_map[lp.Op(np.float64, 'add', CG.WORKITEM)].eval_with_dict(param_dict) - >>> f64mul = op_map[lp.Op(np.float64, 'mul', CG.WORKITEM)].eval_with_dict(param_dict) - >>> i32add = op_map[lp.Op(np.int32, 'add', CG.WORKITEM)].eval_with_dict(param_dict) + >>> f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(param_dict) + >>> f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP)].eval_with_dict(param_dict) + >>> f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(param_dict) + >>> f64add = op_map[lp.Op(np.float64, 'add', CG.SUBGROUP)].eval_with_dict(param_dict) + >>> f64mul = op_map[lp.Op(np.float64, 'mul', CG.SUBGROUP)].eval_with_dict(param_dict) + >>> i32add = op_map[lp.Op(np.int32, 'add', CG.SUBGROUP)].eval_with_dict(param_dict) >>> print("%i\n%i\n%i\n%i\n%i\n%i" % ... 
(f32add, f32div, f32mul, f64add, f64mul, i32add)) 524288 -- GitLab From 6b34b8aa3a259b969514a9671fa91663a767c8a7 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Mon, 3 Sep 2018 21:05:36 -0500 Subject: [PATCH 49/80] passing subgroup_size to get_op_map --- test/test_numa_diff.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py index 6b578838..15d5ea7c 100644 --- a/test/test_numa_diff.py +++ b/test/test_numa_diff.py @@ -231,7 +231,7 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # noqa if 1: print("OPS") - op_map = lp.get_op_map(hsv) + op_map = lp.get_op_map(hsv, subgroup_size=32) print(lp.stringify_stats_mapping(op_map)) print("MEM") -- GitLab From 6e798d058cddda0ca5dcb6c9519f4c42c59db97b Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Mon, 3 Sep 2018 21:22:59 -0500 Subject: [PATCH 50/80] bug fix, calling iteritems on dict within ToCountMap, rather than ToCountMap itself --- loopy/statistics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index f71e1d91..3fecfb77 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1391,7 +1391,7 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False, for insn in knl.instructions: if isinstance(insn, (CallInstruction, CInstruction, Assignment)): ops = op_counter(insn.assignee) + op_counter(insn.expression) - for key, val in six.iteritems(ops): + for key, val in six.iteritems(ops.count_map): op_map = ( op_map + ToCountMap({key: val}) -- GitLab From aee564f107546e8db9d5b0186c26aca10f2d3b8a Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Thu, 6 Sep 2018 13:38:08 -0500 Subject: [PATCH 51/80] Fix and test generation of ISPC streaming stores --- loopy/target/ispc.py | 4 ++-- test/test_target.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 261475eb..9009b144 
100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -437,9 +437,9 @@ class ISPCASTBuilder(CASTBuilder): else: for dep in get_dependencies(term): if filter_iname_tags_by_type( - kernel.iname_to_tags[dep], LocalIndexTag): + kernel.iname_to_tags.get(dep, []), LocalIndexTag): tag, = filter_iname_tags_by_type( - kernel.iname_to_tags[dep], LocalIndexTag, 1) + kernel.iname_to_tags.get(dep, []), LocalIndexTag, 1) if tag.axis == 0: raise LoopyError( "streaming store must have stride 1 in " diff --git a/test/test_target.py b/test/test_target.py index eb94bdc8..a0011426 100644 --- a/test/test_target.py +++ b/test/test_target.py @@ -327,6 +327,35 @@ def test_target_invalid_type_cast(): lp.TypeCast(dtype, 1) +def test_ispc_streaming_stores(): + stream_dtype = np.float32 + index_dtype = np.int32 + + knl = lp.make_kernel( + "{[i]: 0<=i0") + knl = lp.split_iname( + knl, "i", 2**18, outer_tag="g.0", slabs=(0, 1)) + knl = lp.split_iname(knl, "i_inner", 8, inner_tag="l.0") + knl = lp.tag_instructions(knl, "!streaming_store") + + knl = lp.add_and_infer_dtypes(knl, { + var: stream_dtype + for var in vars + }) + + knl = lp.set_argument_order(knl, vars + ["n"]) + + knl = lp.preprocess_kernel(knl) + knl = lp.get_one_scheduled_kernel(knl) + lp.generate_code_v2(knl).all_code() + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) -- GitLab From 680842c5612a8422cba5e7a6286d37e20cdfdaf6 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 6 Sep 2018 20:59:39 -0500 Subject: [PATCH 52/80] split_iname handles within correctly --- loopy/transform/iname.py | 22 ++++++++++++++++++---- test/test_transform.py | 17 +++++++++++++++++ 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 2b618a46..65f1c2ec 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -177,6 +177,22 @@ def _split_iname_backend(kernel, split_iname, for syntax. 
""" + from loopy.match import parse_stack_match + within = parse_stack_match(within) + + # {{{ return the same kernel if no kernel matches + + def _do_not_transform_if_no_within_matches(): + for insn in kernel.instructions: + if within(kernel, insn, ()): + return + + return kernel + + _do_not_transform_if_no_within_matches() + + # }}} + existing_tags = kernel.iname_tags(split_iname) from loopy.kernel.data import ForceSequentialTag, filter_iname_tags_by_type if (do_tagged_check and existing_tags @@ -249,7 +265,8 @@ def _split_iname_backend(kernel, split_iname, new_insns = [] for insn in kernel.instructions: - if split_iname in insn.within_inames: + if split_iname in insn.within_inames and ( + within(kernel, insn, ())): new_within_inames = ( (insn.within_inames.copy() - frozenset([split_iname])) @@ -284,9 +301,6 @@ def _split_iname_backend(kernel, split_iname, applied_iname_rewrites=applied_iname_rewrites, loop_priority=frozenset(new_priorities)) - from loopy.match import parse_stack_match - within = parse_stack_match(within) - rule_mapping_context = SubstitutionRuleMappingContext( kernel.substitutions, kernel.get_var_name_generator()) ins = _InameSplitter(rule_mapping_context, within, diff --git a/test/test_transform.py b/test/test_transform.py index ed184fb5..394cf668 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -533,6 +533,23 @@ def test_uniquify_instruction_ids(): assert all(isinstance(id, str) for id in insn_ids) +def test_split_iname_only_if_in_within(): + knl = lp.make_kernel( + "{[i]: 0<=i<10}", + """ + c[i] = 3*d[i] {id=to_split} + a[i] = 2*b[i] {id=not_to_split} + """) + + knl = lp.split_iname(knl, "i", 4, within='id:to_split') + + for insn in knl.instructions: + if insn.id == 'to_split': + assert insn.within_inames == frozenset({'i_outer', 'i_inner'}) + if insn.id == 'not_to_split': + assert insn.within_inames == frozenset({'i'}) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) -- GitLab From 
796c89fcd819efe80e4c868996cd688559c482d9 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 6 Sep 2018 22:49:25 -0500 Subject: [PATCH 53/80] improve the logic of project_out --- loopy/transform/iname.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 65f1c2ec..fb3609b6 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -177,14 +177,15 @@ def _split_iname_backend(kernel, split_iname, for syntax. """ - from loopy.match import parse_stack_match - within = parse_stack_match(within) + from loopy.match import parse_stack_match, parse_match + stacked_within = parse_stack_match(within) + within = parse_match(within) # {{{ return the same kernel if no kernel matches def _do_not_transform_if_no_within_matches(): for insn in kernel.instructions: - if within(kernel, insn, ()): + if within(kernel, insn): return return kernel @@ -246,10 +247,15 @@ def _split_iname_backend(kernel, split_iname, name_dim_type, name_idx = space.get_var_dict()[split_iname] s = s.intersect(fixed_constraint_set) - if within is None: - s = s.project_out(name_dim_type, name_idx, 1) + def _project_out_only_if_all_instructions_in_within(): + for insn in kernel.instructions: + if split_iname in insn.within_inames and ( + not within(kernel, insn)): + return s + + return s.project_out(name_dim_type, name_idx, 1) - return s + return _project_out_only_if_all_instructions_in_within() new_domains = [process_set(dom) for dom in kernel.domains] @@ -266,7 +272,7 @@ def _split_iname_backend(kernel, split_iname, new_insns = [] for insn in kernel.instructions: if split_iname in insn.within_inames and ( - within(kernel, insn, ())): + within(kernel, insn)): new_within_inames = ( (insn.within_inames.copy() - frozenset([split_iname])) @@ -303,7 +309,7 @@ def _split_iname_backend(kernel, split_iname, rule_mapping_context = SubstitutionRuleMappingContext( kernel.substitutions, 
kernel.get_var_name_generator()) - ins = _InameSplitter(rule_mapping_context, within, + ins = _InameSplitter(rule_mapping_context, stacked_within, split_iname, outer_iname, inner_iname, new_loop_index) kernel = ins.map_kernel(kernel) -- GitLab From 190e41dbfc534c0126e2a5e73659c1120dbdb43d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 8 Sep 2018 11:05:01 -0500 Subject: [PATCH 54/80] corrects get_highlighte_code --- loopy/target/c/c_execution.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index 6b80bae2..f7622936 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -402,7 +402,7 @@ class CKernelExecutor(KernelExecutorBase): if self.kernel.options.write_cl: output = all_code if self.kernel.options.highlight_cl: - output = get_highlighted_code(code=output) + output = get_highlighted_code(output) if self.kernel.options.write_cl is True: print(output) -- GitLab From 2cac2bf1c91e87a97b673b907604146a91d2e696 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 10 Sep 2018 15:58:17 -0500 Subject: [PATCH 55/80] disables support for parse_stack_match in split_iname --- loopy/transform/iname.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index fb3609b6..ad1da3e7 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -139,8 +139,7 @@ class _InameSplitter(RuleAwareIdentityMapper): and self.split_iname not in expn_state.arg_context and self.within( expn_state.kernel, - expn_state.instruction, - expn_state.stack)): + expn_state.instruction)): new_inames = list(expr.inames) new_inames.remove(self.split_iname) new_inames.extend([self.outer_iname, self.inner_iname]) @@ -157,8 +156,7 @@ class _InameSplitter(RuleAwareIdentityMapper): and self.split_iname not in expn_state.arg_context and self.within( expn_state.kernel, - expn_state.instruction, - 
expn_state.stack)): + expn_state.instruction)): return self.replacement_index else: return super(_InameSplitter, self).map_variable(expr, expn_state) @@ -177,8 +175,7 @@ def _split_iname_backend(kernel, split_iname, for syntax. """ - from loopy.match import parse_stack_match, parse_match - stacked_within = parse_stack_match(within) + from loopy.match import parse_match within = parse_match(within) # {{{ return the same kernel if no kernel matches @@ -309,7 +306,7 @@ def _split_iname_backend(kernel, split_iname, rule_mapping_context = SubstitutionRuleMappingContext( kernel.substitutions, kernel.get_var_name_generator()) - ins = _InameSplitter(rule_mapping_context, stacked_within, + ins = _InameSplitter(rule_mapping_context, within, split_iname, outer_iname, inner_iname, new_loop_index) kernel = ins.map_kernel(kernel) @@ -349,7 +346,7 @@ def split_iname(kernel, split_iname, inner_length, :arg inner_tag: The iname tag (see :ref:`iname-tags`) to apply to *inner_iname*. :arg within: a stack match as understood by - :func:`loopy.match.parse_stack_match`. + :func:`loopy.match.parse_match`. 
""" def make_new_loop_index(inner, outer): return inner + outer*inner_length -- GitLab From eb42917a6d5b7a923384ae91902cb7cc89dc63ba Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 17 Sep 2018 11:50:31 -0500 Subject: [PATCH 56/80] fixes the statistics tests --- loopy/statistics.py | 39 ++++++++++++++++----------------------- 1 file changed, 16 insertions(+), 23 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 9894656b..5dddd49e 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1286,8 +1286,8 @@ def count_insn_runs(knl, program_callables_info, insn, count_redundant_work, @memoize_method -def _get_insn_count(knl, insn_id, subgroup_size, count_redundant_work, - count_granularity=CountGranularity.WORKITEM): +def _get_insn_count(knl, program_callables_info, insn_id, subgroup_size, + count_redundant_work, count_granularity=CountGranularity.WORKITEM): insn = knl.id_to_insn[insn_id] if count_granularity is None: @@ -1299,11 +1299,12 @@ def _get_insn_count(knl, insn_id, subgroup_size, count_redundant_work, if count_granularity == CountGranularity.WORKITEM: return count_insn_runs( - knl, insn, count_redundant_work=count_redundant_work, + knl, program_callables_info, insn, + count_redundant_work=count_redundant_work, disregard_local_axes=False) ct_disregard_local = count_insn_runs( - knl, insn, disregard_local_axes=True, + knl, program_callables_info, insn, disregard_local_axes=True, count_redundant_work=count_redundant_work) if count_granularity == CountGranularity.WORKGROUP: @@ -1311,7 +1312,7 @@ def _get_insn_count(knl, insn_id, subgroup_size, count_redundant_work, elif count_granularity == CountGranularity.SUBGROUP: # get the group size from loopy.symbolic import aff_to_expr - _, local_size = knl.get_grid_size_upper_bounds() + _, local_size = knl.get_grid_size_upper_bounds(program_callables_info) workgroup_size = 1 if local_size: for size in local_size: @@ -1353,12 +1354,8 @@ def get_op_map_for_single_kernel(knl, 
program_callables_info, subgroup_size = _process_subgroup_size(knl, subgroup_size) - from loopy.preprocess import preprocess_kernel, infer_unknown_types - knl = infer_unknown_types(knl, expect_completion=True) - knl = preprocess_kernel(knl) - op_map = ToCountMap() - op_counter = ExpressionOpCounter(knl) + op_counter = ExpressionOpCounter(knl, program_callables_info) from loopy.kernel.instruction import ( CallInstruction, CInstruction, Assignment, @@ -1371,9 +1368,9 @@ def get_op_map_for_single_kernel(knl, program_callables_info, op_map = ( op_map + ToCountMap({key: val}) - * _get_insn_count(knl, insn.id, subgroup_size, - count_redundant_work, - key.count_granularity)) + * _get_insn_count(knl, program_callables_info, insn.id, + subgroup_size, count_redundant_work, + key.count_granularity)) elif isinstance(insn, (NoOpInstruction, BarrierInstruction)): pass @@ -1547,10 +1544,6 @@ def get_mem_access_map_for_single_kernel(knl, program_callables_info, subgroup_size = _process_subgroup_size(knl, subgroup_size) - from loopy.preprocess import preprocess_kernel, infer_unknown_types - knl = infer_unknown_types(knl, expect_completion=True) - knl = preprocess_kernel(knl) - access_map = ToCountMap() access_counter_g = GlobalMemAccessCounter(knl, program_callables_info) access_counter_l = LocalMemAccessCounter(knl, program_callables_info) @@ -1576,18 +1569,18 @@ def get_mem_access_map_for_single_kernel(knl, program_callables_info, access_map = ( access_map + ToCountMap({key: val}) - * _get_insn_count(knl, insn.id, subgroup_size, - count_redundant_work, - key.count_granularity)) + * _get_insn_count(knl, program_callables_info, insn.id, + subgroup_size, count_redundant_work, + key.count_granularity)) for key, val in six.iteritems(access_assignee.count_map): access_map = ( access_map + ToCountMap({key: val}) - * _get_insn_count(knl, insn.id, subgroup_size, - count_redundant_work, - key.count_granularity)) + * _get_insn_count(knl, program_callables_info, insn.id, + subgroup_size, 
count_redundant_work, + key.count_granularity)) elif isinstance(insn, (NoOpInstruction, BarrierInstruction)): pass -- GitLab From 7389731759bb8b5d8978a7368a2236e7a9554631 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 17 Sep 2018 12:57:09 -0500 Subject: [PATCH 57/80] make the test adapt to the progam model --- test/test_target.py | 2 -- test/test_transform.py | 6 +++--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/test/test_target.py b/test/test_target.py index 0eee835c..a5186c71 100644 --- a/test/test_target.py +++ b/test/test_target.py @@ -347,8 +347,6 @@ def test_ispc_streaming_stores(): knl = lp.set_argument_order(knl, vars + ["n"]) - knl = lp.preprocess_kernel(knl) - knl = lp.get_one_scheduled_kernel(knl) lp.generate_code_v2(knl).all_code() diff --git a/test/test_transform.py b/test/test_transform.py index f67cb927..04162331 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -544,16 +544,16 @@ def test_uniquify_instruction_ids(): def test_split_iname_only_if_in_within(): - knl = lp.make_kernel( + prog = lp.make_kernel( "{[i]: 0<=i<10}", """ c[i] = 3*d[i] {id=to_split} a[i] = 2*b[i] {id=not_to_split} """) - knl = lp.split_iname(knl, "i", 4, within='id:to_split') + prog = lp.split_iname(prog, "i", 4, within='id:to_split') - for insn in knl.instructions: + for insn in prog.root_kernel.instructions: if insn.id == 'to_split': assert insn.within_inames == frozenset({'i_outer', 'i_inner'}) if insn.id == 'not_to_split': -- GitLab From ba27e5defa26d171e5039de2fa877fc1e1b144d0 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 14 Oct 2018 20:17:13 -0500 Subject: [PATCH 58/80] minor changes after the review --- examples/python/hello-loopy.py | 3 +-- loopy/auto_test.py | 2 +- loopy/check.py | 4 ++-- loopy/codegen/__init__.py | 11 +++++++++++ loopy/type_inference.py | 4 ++-- 5 files changed, 17 insertions(+), 7 deletions(-) diff --git a/examples/python/hello-loopy.py b/examples/python/hello-loopy.py index 
764cea0e..9098c544 100644 --- a/examples/python/hello-loopy.py +++ b/examples/python/hello-loopy.py @@ -16,8 +16,7 @@ a = cl.array.arange(queue, n, dtype=np.float32) # ------ knl = lp.make_kernel( "{ [i]: 0<=i Date: Sun, 14 Oct 2018 20:19:03 -0500 Subject: [PATCH 59/80] arg_is_output_only -> args_are_output_only --- loopy/kernel/creation.py | 4 ++-- loopy/kernel/function_interface.py | 4 ++-- loopy/kernel/tools.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index bc996d9c..685232c6 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -2166,8 +2166,8 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): check_for_duplicate_names(knl) check_written_variable_names(knl) - from loopy.kernel.tools import infer_arg_is_output_only - knl = infer_arg_is_output_only(knl) + from loopy.kernel.tools import infer_args_are_output_only + knl = infer_args_are_output_only(knl) from loopy.preprocess import prepare_for_caching knl = prepare_for_caching(knl) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index c8b5a953..323690af 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -111,8 +111,8 @@ def get_kw_pos_association(kernel): Returns a tuple of ``(kw_to_pos, pos_to_kw)`` for the arguments in *kernel*. 
""" - from loopy.kernel.tools import infer_arg_is_output_only - kernel = infer_arg_is_output_only(kernel) + from loopy.kernel.tools import infer_args_are_output_only + kernel = infer_args_are_output_only(kernel) kw_to_pos = {} pos_to_kw = {} diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 3c0c2443..3f4defc5 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -1866,7 +1866,7 @@ def find_aliasing_equivalence_classes(kernel): # {{{ direction helper tools -def infer_arg_is_output_only(kernel): +def infer_args_are_output_only(kernel): """ Returns a copy of *kernel* with the attribute ``is_output_only`` set. -- GitLab From 111a5eb42b33b3d080027175533a06f57d32283a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 14 Oct 2018 20:28:15 -0500 Subject: [PATCH 60/80] minor changes after review --- loopy/kernel/function_interface.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 323690af..268bdaa1 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -111,8 +111,6 @@ def get_kw_pos_association(kernel): Returns a tuple of ``(kw_to_pos, pos_to_kw)`` for the arguments in *kernel*. """ - from loopy.kernel.tools import infer_args_are_output_only - kernel = infer_args_are_output_only(kernel) kw_to_pos = {} pos_to_kw = {} @@ -136,7 +134,7 @@ class GridOverrideForCalleeKernel(ImmutableRecord): """ Helper class to set the :attr:`loopy.kernel.LoopKernel.override_get_grid_size_for_insn_ids` of the - callee kernels. Refer + callee kernels. Refer to :func:`loopy.kernel.function_interface.GridOverrideForCalleeKernel.__call__`, :func:`loopy.kernel.function_interface.CallbleKernel.with_hw_axes_sizes`. @@ -301,7 +299,8 @@ class InKernelCallable(ImmutableRecord): self.arg_id_to_descr is not None) def generate_preambles(self, target): - """ Yields the target specific preamble. 
+ """ + Yields the target specific preamble. """ raise NotImplementedError() -- GitLab From c194c74e22513140f9e0afd92a428c42ba3fcfb6 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 14 Oct 2018 20:30:27 -0500 Subject: [PATCH 61/80] program_callables_info, ProgramCallablesInfo -> callables_table, CallablesTable --- doc/tutorial.rst | 4 +- examples/python/global_barrier_removal.py | 2 +- loopy/check.py | 24 ++--- loopy/codegen/__init__.py | 28 +++--- loopy/codegen/control.py | 2 +- loopy/codegen/loop.py | 2 +- loopy/kernel/__init__.py | 16 +-- loopy/kernel/function_interface.py | 16 +-- loopy/kernel/tools.py | 12 +-- loopy/library/function.py | 12 +-- loopy/library/random123.py | 12 +-- loopy/library/reduction.py | 8 +- loopy/preprocess.py | 98 +++++++++---------- loopy/program.py | 114 +++++++++++----------- loopy/schedule/__init__.py | 18 ++-- loopy/statistics.py | 76 +++++++-------- loopy/target/__init__.py | 2 +- loopy/target/c/__init__.py | 14 +-- loopy/target/c/codegen/expression.py | 10 +- loopy/target/cuda.py | 14 +-- loopy/target/execution.py | 2 +- loopy/target/ispc.py | 4 +- loopy/target/opencl.py | 22 ++--- loopy/target/pyopencl.py | 20 ++-- loopy/target/python.py | 6 +- loopy/transform/buffer.py | 12 +-- loopy/transform/callable.py | 14 +-- loopy/transform/data.py | 12 +-- loopy/transform/fusion.py | 12 +-- loopy/transform/iname.py | 4 +- loopy/transform/instruction.py | 2 +- loopy/transform/precompute.py | 12 +-- loopy/transform/save.py | 12 +-- loopy/transform/subst.py | 2 +- loopy/type_inference.py | 80 +++++++-------- test/test_loopy.py | 14 +-- test/testlib.py | 10 +- 37 files changed, 362 insertions(+), 362 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 6a7a977a..25082f88 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1208,7 +1208,7 @@ happens when the instruction schedule is generated. 
To see the schedule, we should call :func:`loopy.get_one_scheduled_kernel`: >>> prog = lp.preprocess_kernel(prog) - >>> knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.program_callables_info) + >>> knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.callables_table) >>> prog = prog.with_root_kernel(knl) >>> print(prog) --------------------------------------------------------------------------- @@ -1240,7 +1240,7 @@ that :func:`loopy.get_one_scheduled_kernel` needs to be called one more time to put those instructions into the schedule. >>> prog = lp.save_and_reload_temporaries(prog) - >>> knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.program_callables_info) # Schedule added instructions + >>> knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.callables_table) # Schedule added instructions >>> prog = prog.with_root_kernel(knl) >>> print(prog) --------------------------------------------------------------------------- diff --git a/examples/python/global_barrier_removal.py b/examples/python/global_barrier_removal.py index cc4926fe..884fb0bd 100644 --- a/examples/python/global_barrier_removal.py +++ b/examples/python/global_barrier_removal.py @@ -24,7 +24,7 @@ from loopy.preprocess import preprocess_kernel knl = preprocess_kernel(knl) from loopy.schedule import get_one_scheduled_kernel -knl = get_one_scheduled_kernel(knl.root_kernel, knl.program_callables_info) +knl = get_one_scheduled_kernel(knl.root_kernel, knl.callables_table) # map schedule onto host or device print(knl) diff --git a/loopy/check.py b/loopy/check.py index bfcd7aa2..64cf80a4 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -206,7 +206,7 @@ def check_multiple_tags_allowed(kernel): "tags: {1}".format(iname, tags)) -def check_for_double_use_of_hw_axes(kernel, program_callables_info): +def check_for_double_use_of_hw_axes(kernel, callables_table): from loopy.kernel.data import UniqueTag from loopy.kernel.instruction import CallInstruction from 
loopy.kernel.function_interface import CallableKernel @@ -224,7 +224,7 @@ def check_for_double_use_of_hw_axes(kernel, program_callables_info): # check usage of iname tags in the callee kernel if isinstance(insn, CallInstruction): - in_knl_callable = program_callables_info[ + in_knl_callable = callables_table[ insn.expression.function.name] if isinstance(in_knl_callable, CallableKernel): # check for collision in iname_tag keys in the instruction @@ -712,13 +712,13 @@ def check_variable_access_ordered(kernel): # }}} -def pre_schedule_checks(kernel, program_callables_info): +def pre_schedule_checks(kernel, callables_table): try: logger.debug("%s: pre-schedule check: start" % kernel.name) check_for_duplicate_insn_ids(kernel) check_for_orphaned_user_hardware_axes(kernel) - check_for_double_use_of_hw_axes(kernel, program_callables_info) + check_for_double_use_of_hw_axes(kernel, callables_table) check_insn_attributes(kernel) check_loop_priority_inames_known(kernel) check_multiple_tags_allowed(kernel) @@ -746,7 +746,7 @@ def pre_schedule_checks(kernel, program_callables_info): # {{{ check for unused hw axes -def _check_for_unused_hw_axes_in_kernel_chunk(kernel, program_callables_info, +def _check_for_unused_hw_axes_in_kernel_chunk(kernel, callables_table, sched_index=None): from loopy.schedule import (CallKernel, RunInstruction, Barrier, EnterLoop, LeaveLoop, ReturnFromKernel, @@ -763,7 +763,7 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, program_callables_info, _, past_end_i = gather_schedule_block(kernel.schedule, sched_index) group_size, local_size = kernel.get_grid_sizes_for_insn_ids_as_exprs( get_insn_ids_for_block_at(kernel.schedule, sched_index), - program_callables_info) + callables_table) group_axes = set(ax for ax, length in enumerate(group_size)) local_axes = set(ax for ax, length in enumerate(local_size)) @@ -781,7 +781,7 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, program_callables_info, sched_item = kernel.schedule[i] if 
isinstance(sched_item, CallKernel): i = _check_for_unused_hw_axes_in_kernel_chunk(kernel, - program_callables_info, i) + callables_table, i) elif isinstance(sched_item, RunInstruction): insn = kernel.id_to_insn[sched_item.insn_id] @@ -832,10 +832,10 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, program_callables_info, return past_end_i -def check_for_unused_hw_axes_in_insns(kernel, program_callables_info): +def check_for_unused_hw_axes_in_insns(kernel, callables_table): if kernel.schedule: _check_for_unused_hw_axes_in_kernel_chunk(kernel, - program_callables_info) + callables_table) # }}} @@ -989,15 +989,15 @@ def check_that_shapes_and_strides_are_arguments(kernel): # }}} -def pre_codegen_checks(kernel, program_callables_info): +def pre_codegen_checks(kernel, callables_table): try: logger.debug("pre-codegen check %s: start" % kernel.name) - check_for_unused_hw_axes_in_insns(kernel, program_callables_info) + check_for_unused_hw_axes_in_insns(kernel, callables_table) check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel) check_that_temporaries_are_defined_in_subkernels_where_used(kernel) check_that_all_insns_are_scheduled(kernel) - kernel.target.pre_codegen_check(kernel, program_callables_info) + kernel.target.pre_codegen_check(kernel, callables_table) check_that_shapes_and_strides_are_arguments(kernel) logger.debug("pre-codegen check %s: done" % kernel.name) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index d0b19a1e..250e7215 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -192,16 +192,16 @@ class CodeGenerationState(object): .. attribute:: schedule_index_end - .. attribute:: program_callables_info + .. attribute:: callables_table - An instance of :class:`loopy.ProgramCallablesInfo`. + An instance of :class:`loopy.CallablesTable`. 
""" def __init__(self, kernel, implemented_data_info, implemented_domain, implemented_predicates, seen_dtypes, seen_functions, seen_atomic_dtypes, var_subst_map, allow_complex, - program_callables_info, + callables_table, vectorization_info=None, var_name_generator=None, is_generating_device_code=None, gen_program_name=None, @@ -215,7 +215,7 @@ class CodeGenerationState(object): self.seen_atomic_dtypes = seen_atomic_dtypes self.var_subst_map = var_subst_map.copy() self.allow_complex = allow_complex - self.program_callables_info = program_callables_info + self.callables_table = callables_table self.vectorization_info = vectorization_info self.var_name_generator = var_name_generator self.is_generating_device_code = is_generating_device_code @@ -263,7 +263,7 @@ class CodeGenerationState(object): seen_atomic_dtypes=self.seen_atomic_dtypes, var_subst_map=var_subst_map or self.var_subst_map, allow_complex=self.allow_complex, - program_callables_info=self.program_callables_info, + callables_table=self.callables_table, vectorization_info=vectorization_info, var_name_generator=self.var_name_generator, is_generating_device_code=is_generating_device_code, @@ -385,19 +385,19 @@ class PreambleInfo(ImmutableRecord): # {{{ main code generation entrypoint -def generate_code_for_a_single_kernel(kernel, program_callables_info): +def generate_code_for_a_single_kernel(kernel, callables_table): """ :returns: a :class:`CodeGenerationResult` :param kernel: An instance of :class:`loopy.LoopKernel`. - :param program_callables_info: An instance of - :class:`loopy.ProgramCallablesInfo`. + :param callables_table: An instance of + :class:`loopy.CallablesTable`. 
""" from loopy.kernel import KernelState if kernel.schedule is None: from loopy.schedule import get_one_scheduled_kernel - kernel = get_one_scheduled_kernel(kernel, program_callables_info) + kernel = get_one_scheduled_kernel(kernel, callables_table) if kernel.state != KernelState.SCHEDULED: raise LoopyError("cannot generate code for a kernel that has not been " @@ -419,7 +419,7 @@ def generate_code_for_a_single_kernel(kernel, program_callables_info): # }}} from loopy.check import pre_codegen_checks - pre_codegen_checks(kernel, program_callables_info) + pre_codegen_checks(kernel, callables_table) logger.info("%s: generate code: start" % kernel.name) @@ -479,7 +479,7 @@ def generate_code_for_a_single_kernel(kernel, program_callables_info): + kernel.name + kernel.target.host_program_name_suffix), schedule_index_end=len(kernel.schedule), - program_callables_info=program_callables_info) + callables_table=callables_table) from loopy.codegen.result import generate_host_or_device_program @@ -556,17 +556,17 @@ def generate_code_v2(program): codegen_results = {} - for func_id, in_knl_callable in program.program_callables_info.items(): + for func_id, in_knl_callable in program.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): codegen_results[func_id] = ( generate_code_for_a_single_kernel(in_knl_callable.subkernel, - program.program_callables_info)) + program.callables_table)) device_preambles = set() for cgr in codegen_results.values(): device_preambles.update(cgr.device_preambles) - for in_knl_callable in program.program_callables_info.values(): + for in_knl_callable in program.callables_table.values(): for preamble in in_knl_callable.generate_preambles(program.target): device_preambles.update([preamble]) diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 90bdbda3..81a672a1 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -116,7 +116,7 @@ def generate_code_for_sched_index(codegen_state, sched_index): 
glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs( get_insn_ids_for_block_at(kernel.schedule, sched_index), - codegen_state.program_callables_info) + codegen_state.callables_table) return merge_codegen_results(codegen_state, [ codegen_result, diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index 39cf20c7..c282de79 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -248,7 +248,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, return next_func(codegen_state) global_size, local_size = kernel.get_grid_sizes_for_insn_ids( - insn_ids_for_block, codegen_state.program_callables_info) + insn_ids_for_block, codegen_state.callables_table) hw_inames_left = hw_inames_left[:] iname = hw_inames_left.pop() diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 410f1332..70079d31 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1036,7 +1036,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): constants_only=True))) @memoize_method - def get_grid_sizes_for_insn_ids(self, insn_ids, program_callables_info, + def get_grid_sizes_for_insn_ids(self, insn_ids, callables_table, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of all instructions whose IDs are given @@ -1048,7 +1048,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): if self.overridden_get_grid_sizes_for_insn_ids: return self.overridden_get_grid_sizes_for_insn_ids( insn_ids, - program_callables_info, + callables_table, ignore_auto=ignore_auto) all_inames_by_insns = set() @@ -1135,7 +1135,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): @memoize_method def get_grid_sizes_for_insn_ids_as_exprs(self, insn_ids, - program_callables_info, ignore_auto=False): + callables_table, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of all instructions whose IDs are given in *insn_ids*. 
@@ -1146,7 +1146,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): """ grid_size, group_size = self.get_grid_sizes_for_insn_ids( - insn_ids, program_callables_info, ignore_auto) + insn_ids, callables_table, ignore_auto) def tup_to_exprs(tup): from loopy.symbolic import pw_aff_to_expr @@ -1154,7 +1154,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): return tup_to_exprs(grid_size), tup_to_exprs(group_size) - def get_grid_size_upper_bounds(self, program_callables_info, ignore_auto=False): + def get_grid_size_upper_bounds(self, callables_table, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of *all* instructions in the kernel. @@ -1162,10 +1162,10 @@ class LoopKernel(ImmutableRecordWithoutPickling): """ return self.get_grid_sizes_for_insn_ids( frozenset(insn.id for insn in self.instructions), - program_callables_info, + callables_table, ignore_auto=ignore_auto) - def get_grid_size_upper_bounds_as_exprs(self, program_callables_info, + def get_grid_size_upper_bounds_as_exprs(self, callables_table, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of *all* instructions in the kernel. 
@@ -1175,7 +1175,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): return self.get_grid_sizes_for_insn_ids_as_exprs( frozenset(insn.id for insn in self.instructions), - program_callables_info, + callables_table, ignore_auto=ignore_auto) # }}} diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 268bdaa1..362fbcef 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -157,7 +157,7 @@ class GridOverrideForCalleeKernel(ImmutableRecord): self.local_size = local_size self.global_size = global_size - def __call__(self, insn_ids, program_callables_info, ignore_auto=True): + def __call__(self, insn_ids, callables_table, ignore_auto=True): return self.local_size, self.global_size # }}} @@ -214,7 +214,7 @@ class InKernelCallable(ImmutableRecord): update_persistent_hash = LoopKernel.update_persistent_hash - def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info): + def with_types(self, arg_id_to_dtype, caller_kernel, callables_table): """ :arg arg_id_to_type: a mapping from argument identifiers (integers for positional arguments, names for keyword @@ -234,7 +234,7 @@ class InKernelCallable(ImmutableRecord): raise NotImplementedError() - def with_descrs(self, arg_id_to_descr, program_callables_info): + def with_descrs(self, arg_id_to_descr, callables_table): """ :arg arg_id_to_descr: a mapping from argument identifiers (integers for positional arguments, names for keyword @@ -363,16 +363,16 @@ class ScalarCallable(InKernelCallable): return (self.arg_id_to_dtype, self.arg_id_to_descr, self.name_in_target) - def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info): + def with_types(self, arg_id_to_dtype, caller_kernel, callables_table): raise LoopyError("No type inference information present for " "the function %s." 
% (self.name)) - def with_descrs(self, arg_id_to_descr, program_callables_info): + def with_descrs(self, arg_id_to_descr, callables_table): arg_id_to_descr[-1] = ValueArgDescriptor() return ( self.copy(arg_id_to_descr=arg_id_to_descr), - program_callables_info) + callables_table) def with_hw_axes_sizes(self, global_size, local_size): return self.copy() @@ -564,7 +564,7 @@ class ManglerCallable(ScalarCallable): return (self.name, self.function_mangler, self.arg_id_to_dtype, self.arg_id_to_descr, self.name_in_target) - def with_types(self, arg_id_to_dtype, kernel, program_callables_info): + def with_types(self, arg_id_to_dtype, kernel, callables_table): if self.arg_id_to_dtype is not None: # specializing an already specialized function. for arg_id, dtype in arg_id_to_dtype.items(): @@ -588,7 +588,7 @@ class ManglerCallable(ScalarCallable): return ( self.copy(name_in_target=mangle_result.target_name, arg_id_to_dtype=new_arg_id_to_dtype), - program_callables_info) + callables_table) else: # The function mangler does not agree with the arg id to dtypes # provided. Indicating that is illegal. 
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 3f4defc5..006ac6ba 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -755,7 +755,7 @@ def get_auto_axis_iname_ranking_by_stride(kernel, insn): # }}} -def assign_automatic_axes(kernel, program_callables_info, axis=0, local_size=None): +def assign_automatic_axes(kernel, callables_table, axis=0, local_size=None): logger.debug("%s: assign automatic axes" % kernel.name) # TODO: do the tag removal rigorously, might be easier after switching # to set() from tuple() @@ -769,7 +769,7 @@ def assign_automatic_axes(kernel, program_callables_info, axis=0, local_size=Non if local_size is None: _, local_size = kernel.get_grid_size_upper_bounds_as_exprs( - program_callables_info, ignore_auto=True) + callables_table, ignore_auto=True) # {{{ axis assignment helper function @@ -797,7 +797,7 @@ def assign_automatic_axes(kernel, program_callables_info, axis=0, local_size=Non return assign_automatic_axes( kernel.copy(iname_to_tags=new_iname_to_tags), - program_callables_info, + callables_table, axis=recursion_axis) if axis is None: @@ -849,7 +849,7 @@ def assign_automatic_axes(kernel, program_callables_info, axis=0, local_size=Non iname, inner_length=local_size[axis], outer_tag=None, inner_tag=new_tag, do_tagged_check=False), - program_callables_info=program_callables_info, + callables_table=callables_table, axis=recursion_axis, local_size=local_size) if not kernel.iname_tags_of_type(iname, AutoLocalIndexTagBase): @@ -871,7 +871,7 @@ def assign_automatic_axes(kernel, program_callables_info, axis=0, local_size=Non del new_iname_to_tags[iname] return assign_automatic_axes(kernel.copy(iname_to_tags=new_iname_to_tags), - program_callables_info, axis=recursion_axis, local_size=local_size) + callables_table, axis=recursion_axis, local_size=local_size) # }}} @@ -940,7 +940,7 @@ def assign_automatic_axes(kernel, program_callables_info, axis=0, local_size=Non return kernel else: return assign_automatic_axes(kernel, 
- program_callables_info=program_callables_info, axis=axis+1, + callables_table=callables_table, axis=axis+1, local_size=local_size) # }}} diff --git a/loopy/library/function.py b/loopy/library/function.py index f3fb5f8c..f225b62f 100644 --- a/loopy/library/function.py +++ b/loopy/library/function.py @@ -26,33 +26,33 @@ from loopy.kernel.function_interface import ScalarCallable class MakeTupleCallable(ScalarCallable): - def with_types(self, arg_id_to_dtype, kernel, program_callables_info): + def with_types(self, arg_id_to_dtype, kernel, callables_table): new_arg_id_to_dtype = arg_id_to_dtype.copy() for i in range(len(arg_id_to_dtype)): if i in arg_id_to_dtype and arg_id_to_dtype[i] is not None: new_arg_id_to_dtype[-i-1] = new_arg_id_to_dtype[i] return (self.copy(arg_id_to_dtype=new_arg_id_to_dtype, - name_in_target="loopy_make_tuple"), program_callables_info) + name_in_target="loopy_make_tuple"), callables_table) - def with_descrs(self, arg_id_to_descr, program_callables_info): + def with_descrs(self, arg_id_to_descr, callables_table): from loopy.kernel.function_interface import ValueArgDescriptor new_arg_id_to_descr = dict(((id, ValueArgDescriptor()), (-id-1, ValueArgDescriptor())) for id in arg_id_to_descr.keys()) return ( self.copy(arg_id_to_descr=new_arg_id_to_descr), - program_callables_info) + callables_table) class IndexOfCallable(ScalarCallable): - def with_types(self, arg_id_to_dtype, kernel, program_callables_info): + def with_types(self, arg_id_to_dtype, kernel, callables_table): new_arg_id_to_dtype = dict((i, dtype) for i, dtype in arg_id_to_dtype.items() if dtype is not None) new_arg_id_to_dtype[-1] = kernel.index_dtype return (self.copy(arg_id_to_dtype=new_arg_id_to_dtype), - program_callables_info) + callables_table) def loopy_specific_callable_func_id_to_knl_callable_mappers(target, identifier): diff --git a/loopy/library/random123.py b/loopy/library/random123.py index 397e985b..e59a892b 100644 --- a/loopy/library/random123.py +++ 
b/loopy/library/random123.py @@ -169,14 +169,14 @@ class Random123Callable(ScalarCallable): Records information about for the random123 functions. """ - def with_types(self, arg_id_to_dtype, kernel, program_callables_info): + def with_types(self, arg_id_to_dtype, kernel, callables_table): if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): # the types provided aren't mature enough to specialize the # callable return (self.copy(), - program_callables_info) + callables_table) name = self.name target = kernel.target @@ -195,7 +195,7 @@ class Random123Callable(ScalarCallable): return ( self.copy(arg_id_to_dtype=new_arg_id_to_dtype, name_in_target=fn+"_gen"), - program_callables_info) + callables_table) elif name == fn + "_f32": new_arg_id_to_dtype = {-1: target.vector_dtype(NumpyType(np.float32), @@ -203,7 +203,7 @@ class Random123Callable(ScalarCallable): -2: ctr_dtype, 0: ctr_dtype, 1: key_dtype} return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, - name_in_target=name), program_callables_info + name_in_target=name), callables_table elif name == fn + "_f64": new_arg_id_to_dtype = {-1: target.vector_dtype(NumpyType(np.float64), @@ -211,10 +211,10 @@ class Random123Callable(ScalarCallable): -2: ctr_dtype, 0: ctr_dtype, 1: key_dtype} return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, - name_in_target=name), program_callables_info + name_in_target=name), callables_table return (self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) def generate_preambles(self, target): rng_variant = FUNC_NAMES_TO_RNG[self.name] diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 70df864d..7c32d0be 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -424,7 +424,7 @@ def parse_reduction_op(name): # {{{ reduction specific callables class ReductionCallable(ScalarCallable): - def with_types(self, arg_id_to_dtype, kernel, 
program_callables_info): + def with_types(self, arg_id_to_dtype, kernel, callables_table): scalar_dtype = arg_id_to_dtype[0] index_dtype = arg_id_to_dtype[1] result_dtypes = self.name.reduction_op.result_dtypes(kernel, scalar_dtype, @@ -436,15 +436,15 @@ class ReductionCallable(ScalarCallable): index_dtype) + "_op" return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, - name_in_target=name_in_target), program_callables_info + name_in_target=name_in_target), callables_table - def with_descr(self, arg_id_to_descr, program_callables_info): + def with_descr(self, arg_id_to_descr, callables_table): from loopy.library.kernel.function_interface import ValueArgDescriptor new_arg_id_to_descr = arg_id_to_descr.copy() new_arg_id_to_descr[-1] = ValueArgDescriptor() return ( self.copy(arg_id_to_descr=arg_id_to_descr), - program_callables_info) + callables_table) def generate_preambles(self, target): if isinstance(self.name, ArgExtOp): diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 1042c857..85b0c6d4 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -890,7 +890,7 @@ def _insert_subdomain_into_domain_tree(kernel, domains, subdomain): # }}} -def realize_reduction_for_single_kernel(kernel, program_callables_info, +def realize_reduction_for_single_kernel(kernel, callables_table, insn_id_filter=None, unknown_types_ok=True, automagic_scans_ok=False, force_scan=False, force_outer_iname_for_scan=None): """Rewrites reductions into their imperative form. 
With *insn_id_filter* @@ -1012,7 +1012,7 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info, # {{{ sequential - def map_reduction_seq(expr, rec, program_callables_info, nresults, arg_dtypes, + def map_reduction_seq(expr, rec, callables_table, nresults, arg_dtypes, reduction_dtypes): outer_insn_inames = temp_kernel.insn_inames(insn) @@ -1130,7 +1130,7 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info, v[iname].lt_set(v[0] + ubound)).get_basic_sets() return bs - def map_reduction_local(expr, rec, program_callables_info, nresults, arg_dtypes, + def map_reduction_local(expr, rec, callables_table, nresults, arg_dtypes, reduction_dtypes): red_iname, = expr.inames @@ -1370,7 +1370,7 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info, # {{{ sequential scan - def map_scan_seq(expr, rec, program_callables_info, nresults, arg_dtypes, + def map_scan_seq(expr, rec, callables_table, nresults, arg_dtypes, reduction_dtypes, sweep_iname, scan_iname, sweep_min_value, scan_min_value, stride): outer_insn_inames = temp_kernel.insn_inames(insn) @@ -1459,7 +1459,7 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info, # {{{ local-parallel scan - def map_scan_local(expr, rec, program_callables_info, nresults, arg_dtypes, + def map_scan_local(expr, rec, callables_table, nresults, arg_dtypes, reduction_dtypes, sweep_iname, scan_iname, sweep_min_value, scan_min_value, stride): @@ -1468,7 +1468,7 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info, assert scan_size > 0 if scan_size == 1: - return map_reduction_seq(expr, rec, program_callables_info, + return map_reduction_seq(expr, rec, callables_table, nresults, arg_dtypes, reduction_dtypes) outer_insn_inames = temp_kernel.insn_inames(insn) @@ -1668,15 +1668,15 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info, # {{{ seq/par dispatch - def map_reduction(expr, rec, program_callables_info, nresults=1): + def 
map_reduction(expr, rec, callables_table, nresults=1): # Only expand one level of reduction at a time, going from outermost to # innermost. Otherwise we get the (iname + insn) dependencies wrong. from loopy.type_inference import ( infer_arg_and_reduction_dtypes_for_reduction_expression) - arg_dtypes, reduction_dtypes, program_callables_info = ( + arg_dtypes, reduction_dtypes, callables_table = ( infer_arg_and_reduction_dtypes_for_reduction_expression( - temp_kernel, expr, program_callables_info, unknown_types_ok)) + temp_kernel, expr, callables_table, unknown_types_ok)) outer_insn_inames = temp_kernel.insn_inames(insn) bad_inames = frozenset(expr.inames) & outer_insn_inames @@ -1785,7 +1785,7 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info, for tag in temp_kernel.iname_tags(sweep_iname)))) elif parallel: return map_scan_local( - expr, rec, program_callables_info, nresults, + expr, rec, callables_table, nresults, arg_dtypes, reduction_dtypes, sweep_iname, scan_param.scan_iname, scan_param.sweep_lower_bound, @@ -1793,7 +1793,7 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info, scan_param.stride) elif sequential: return map_scan_seq( - expr, rec, program_callables_info, nresults, + expr, rec, callables_table, nresults, arg_dtypes, reduction_dtypes, sweep_iname, scan_param.scan_iname, scan_param.sweep_lower_bound, @@ -1814,12 +1814,12 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info, if n_sequential: assert n_local_par == 0 - return map_reduction_seq(expr, rec, program_callables_info, + return map_reduction_seq(expr, rec, callables_table, nresults, arg_dtypes, reduction_dtypes) else: assert n_local_par > 0 return map_reduction_local( - expr, rec, program_callables_info, nresults, arg_dtypes, + expr, rec, callables_table, nresults, arg_dtypes, reduction_dtypes) # }}} @@ -1854,12 +1854,12 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info, from loopy.symbolic import 
Reduction if isinstance(insn.expression, Reduction) and nresults > 1: new_expressions = cb_mapper(insn.expression, - program_callables_info=program_callables_info, + callables_table=callables_table, nresults=nresults) else: new_expressions = ( cb_mapper(insn.expression, - program_callables_info=program_callables_info),) + callables_table=callables_table),) if generated_insns: # An expansion happened, so insert the generated stuff plus @@ -1952,10 +1952,10 @@ def realize_reduction(program, *args, **kwargs): assert isinstance(program, Program) new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): + for func_id, in_knl_callable in program.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): new_subkernel = realize_reduction_for_single_kernel( - in_knl_callable.subkernel, program.program_callables_info, + in_knl_callable.subkernel, program.callables_table, *args, **kwargs) in_knl_callable = in_knl_callable.copy( subkernel=new_subkernel) @@ -1968,9 +1968,9 @@ def realize_reduction(program, *args, **kwargs): new_resolved_functions[func_id] = in_knl_callable - new_program_callables_info = program.program_callables_info.copy( + new_callables_table = program.callables_table.copy( resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=new_program_callables_info) + return program.copy(callables_table=new_callables_table) # }}} @@ -2153,11 +2153,11 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): """ def __init__(self, rule_mapping_context, caller_kernel, - program_callables_info): + callables_table): super(ArgDescrInferenceMapper, self).__init__( rule_mapping_context) self.caller_kernel = caller_kernel - self.program_callables_info = program_callables_info + self.callables_table = callables_table def map_call(self, expr, expn_state, **kwargs): from pymbolic.primitives import Call, CallWithKwargs @@ -2193,12 +2193,12 @@ class 
ArgDescrInferenceMapper(RuleAwareIdentityMapper): combined_arg_id_to_descr.update(assignee_id_to_descr) # specializing the function according to the parameter description - in_knl_callable = self.program_callables_info[expr.function.name] - new_in_knl_callable, self.program_callables_info = ( + in_knl_callable = self.callables_table[expr.function.name] + new_in_knl_callable, self.callables_table = ( in_knl_callable.with_descrs( - combined_arg_id_to_descr, self.program_callables_info)) - self.program_callables_info, new_func_id = ( - self.program_callables_info.with_callable( + combined_arg_id_to_descr, self.callables_table)) + self.callables_table, new_func_id = ( + self.callables_table.with_callable( expr.function.function, new_in_knl_callable)) @@ -2242,7 +2242,7 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): return kernel.copy(instructions=new_insns) -def traverse_to_infer_arg_descr(kernel, program_callables_info): +def traverse_to_infer_arg_descr(kernel, callables_table): """ Returns a copy of *kernel* with the argument shapes and strides matching for scoped functions in the *kernel*. Refer @@ -2258,12 +2258,12 @@ def traverse_to_infer_arg_descr(kernel, program_callables_info): kernel.substitutions, kernel.get_var_name_generator()) arg_descr_inf_mapper = ArgDescrInferenceMapper(rule_mapping_context, - kernel, program_callables_info) + kernel, callables_table) descr_inferred_kernel = rule_mapping_context.finish_kernel( arg_descr_inf_mapper.map_kernel(kernel)) - return descr_inferred_kernel, arg_descr_inf_mapper.program_callables_info + return descr_inferred_kernel, arg_descr_inf_mapper.callables_table def infer_arg_descr(program): @@ -2272,23 +2272,23 @@ def infer_arg_descr(program): :attr:`loopy.InKernelCallable.arg_id_to_descr` inferred for all the callables. 
""" - root_kernel_callable = program.program_callables_info[program.name] - old_callables_count = program.program_callables_info.callables_count - program_callables_info = ( - program.program_callables_info.with_edit_callables_mode()) + root_kernel_callable = program.callables_table[program.name] + old_callables_count = program.callables_table.callables_count + callables_table = ( + program.callables_table.with_edit_callables_mode()) root_kernel = program.root_kernel - new_root_kernel, program_callables_info = traverse_to_infer_arg_descr( - root_kernel, program_callables_info) + new_root_kernel, callables_table = traverse_to_infer_arg_descr( + root_kernel, callables_table) new_root_kernel_callable = root_kernel_callable.copy( subkernel=new_root_kernel) - program_callables_info, _ = program_callables_info.with_callable(program.name, + callables_table, _ = callables_table.with_callable(program.name, new_root_kernel_callable) - program_callables_info = program_callables_info.with_exit_edit_callables_mode( + callables_table = callables_table.with_exit_edit_callables_mode( old_callables_count) - return program.copy(program_callables_info=program_callables_info) + return program.copy(callables_table=callables_table) # }}} @@ -2298,7 +2298,7 @@ preprocess_cache = WriteOncePersistentDict( key_builder=LoopyKeyBuilder()) -def preprocess_single_kernel(kernel, program_callables_info, device=None): +def preprocess_single_kernel(kernel, callables_table, device=None): from loopy.kernel import KernelState if kernel.state >= KernelState.PREPROCESSED: return kernel @@ -2356,7 +2356,7 @@ def preprocess_single_kernel(kernel, program_callables_info, device=None): # because it manipulates the depends_on field, which could prevent # defaults from being applied. 
kernel = realize_reduction_for_single_kernel(kernel, - program_callables_info, unknown_types_ok=False) + callables_table, unknown_types_ok=False) # Ordering restriction: # add_axes_to_temporaries_for_ilp because reduction accumulators @@ -2420,7 +2420,7 @@ def infer_hw_axes_sizes(program): resolved_function_with_hw_axes_sizes_inferred = {} for func_id, in_knl_callable in ( - program.program_callables_info.items()): + program.callables_table.items()): if func_id == program.name: resolved_function_with_hw_axes_sizes_inferred[func_id] = ( in_knl_callable) @@ -2428,11 +2428,11 @@ def infer_hw_axes_sizes(program): resolved_function_with_hw_axes_sizes_inferred[func_id] = ( in_knl_callable.with_hw_axes_sizes(local_size, global_size)) - new_program_callables_info = ( - program.program_callables_info.copy( + new_callables_table = ( + program.callables_table.copy( resolved_functions=resolved_function_with_hw_axes_sizes_inferred)) - return program.copy(program_callables_info=new_program_callables_info) + return program.copy(callables_table=new_callables_table) # }}} @@ -2451,16 +2451,16 @@ def preprocess_program(program, device=None): # Callable editing restrictions: # - # - should not edit program_callables_info in :meth:`preprocess_single_kernel` + # - should not edit callables_table in :meth:`preprocess_single_kernel` # as we are iterating over it.[1] # # [1] https://docs.python.org/3/library/stdtypes.html#dictionary-view-objects new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): + for func_id, in_knl_callable in program.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): new_subkernel = preprocess_single_kernel( - in_knl_callable.subkernel, program.program_callables_info, + in_knl_callable.subkernel, program.callables_table, device) in_knl_callable = in_knl_callable.copy( subkernel=new_subkernel) @@ -2472,9 +2472,9 @@ def preprocess_program(program, device=None): new_resolved_functions[func_id] = 
in_knl_callable - new_program_callables_info = program.program_callables_info.copy( + new_callables_table = program.callables_table.copy( resolved_functions=new_resolved_functions) - program = program.copy(program_callables_info=new_program_callables_info) + program = program.copy(callables_table=new_callables_table) # }}} diff --git a/loopy/program.py b/loopy/program.py index 7479ee04..f7c399c1 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -47,7 +47,7 @@ __doc__ = """ .. currentmodule:: loopy .. autoclass:: Program -.. autoclass:: ProgramCallablesInfo +.. autoclass:: CallablesTable .. autofunction:: make_program_from_kernel .. autofunction:: iterate_over_kernels_if_given_program @@ -73,11 +73,11 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): :arg function_ids: A container with instances of :class:`str` indicating the function identifiers to look for while scoping functions. """ - def __init__(self, rule_mapping_context, kernel, program_callables_info, + def __init__(self, rule_mapping_context, kernel, callables_table, function_id_to_in_knl_callable_mappers): super(ResolvedFunctionMarker, self).__init__(rule_mapping_context) self.kernel = kernel - self.program_callables_info = program_callables_info + self.callables_table = callables_table self.function_id_to_in_knl_callable_mappers = ( function_id_to_in_knl_callable_mappers) @@ -123,8 +123,8 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): # associate the newly created ResolvedFunction with the # resolved in-kernel callable - self.program_callables_info, new_func_id = ( - self.program_callables_info.with_added_callable( + self.callables_table, new_func_id = ( + self.callables_table.with_added_callable( expr.function, in_knl_callable)) return type(expr)( ResolvedFunction(new_func_id), @@ -144,8 +144,8 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): expr.operation.get_scalar_callables()): in_knl_callable = self.find_in_knl_callable_from_identifier(func_id) assert 
in_knl_callable is not None - self.program_callables_info, _ = ( - self.program_callables_info.with_added_callable(func_id, + self.callables_table, _ = ( + self.callables_table.with_added_callable(func_id, in_knl_callable)) return super(ResolvedFunctionMarker, self).map_reduction(expr, expn_state) @@ -162,37 +162,37 @@ def _default_func_id_to_kernel_callable_mappers(target): ))) -def initialize_program_callables_info_from_kernel(kernel): +def initialize_callables_table_from_kernel(kernel): """ - Returns an instance of :class:`loopy.ProgramCallablesInfo`, by resolving + Returns an instance of :class:`loopy.CallablesTable`, by resolving the functions based on :mod:`loopy`'s default function resolvers. """ # collect the default function resolvers func_id_to_kernel_callable_mappers = ( _default_func_id_to_kernel_callable_mappers(kernel.target)) - program_callables_info = ProgramCallablesInfo({}) + callables_table = CallablesTable({}) from loopy.symbolic import SubstitutionRuleMappingContext rule_mapping_context = SubstitutionRuleMappingContext( kernel.substitutions, kernel.get_var_name_generator()) resolved_function_marker = ResolvedFunctionMarker( - rule_mapping_context, kernel, program_callables_info, + rule_mapping_context, kernel, callables_table, func_id_to_kernel_callable_mappers) # mark the functions as "Resolved" in the expression nodes. 
kernel_with_functions_resolved = rule_mapping_context.finish_kernel( resolved_function_marker.map_kernel(kernel)) - # collect the update program_callables_info - program_callables_info = resolved_function_marker.program_callables_info + # collect the update callables_table + callables_table = resolved_function_marker.callables_table callable_kernel = CallableKernel(kernel_with_functions_resolved) - # add the callable kernel to the program_callables_info - program_callables_info, _ = program_callables_info.with_added_callable( + # add the callable kernel to the callables_table + callables_table, _ = callables_table.with_added_callable( Variable(kernel.name), callable_kernel) - return program_callables_info + return callables_table # {{{ program definition @@ -206,9 +206,9 @@ class Program(ImmutableRecord): An instance of :class:`str`, also the name of the top-most level :class:`loopy.LoopKernel`. - .. attribute:: program_callables_info + .. attribute:: callables_table - An instance of :class:`loopy.program.ProgramCallablesInfo`. + An instance of :class:`loopy.program.CallablesTable`. .. 
attribute:: target @@ -232,16 +232,16 @@ class Program(ImmutableRecord): """ def __init__(self, name, - program_callables_info, + callables_table, target, func_id_to_in_knl_callable_mappers): - assert isinstance(program_callables_info, ProgramCallablesInfo) + assert isinstance(callables_table, CallablesTable) - assert name in program_callables_info + assert name in callables_table super(Program, self).__init__( name=name, - program_callables_info=program_callables_info, + callables_table=callables_table, target=target, func_id_to_in_knl_callable_mappers=( func_id_to_in_knl_callable_mappers)) @@ -250,7 +250,7 @@ class Program(ImmutableRecord): hash_fields = ( "name", - "program_callables_info", + "callables_table", "target",) update_persistent_hash = LoopKernel.update_persistent_hash @@ -262,7 +262,7 @@ class Program(ImmutableRecord): new_self = super(Program, self).copy(**kwargs) new_resolved_functions = {} for func_id, in_knl_callable in ( - new_self.program_callables_info.items()): + new_self.callables_table.items()): if isinstance(in_knl_callable, CallableKernel): subkernel = in_knl_callable.subkernel new_resolved_functions[func_id] = in_knl_callable.copy( @@ -270,11 +270,11 @@ class Program(ImmutableRecord): else: new_resolved_functions[func_id] = in_knl_callable - program_callables_info = new_self.program_callables_info.copy( + callables_table = new_self.callables_table.copy( resolved_functions=new_resolved_functions) return super(Program, new_self).copy( - program_callables_info=program_callables_info) + callables_table=callables_table) else: return super(Program, self).copy(**kwargs) @@ -285,7 +285,7 @@ class Program(ImmutableRecord): *global_size* and *local_size* are :class:`islpy.PwAff` objects. 
""" return self.root_kernel.get_grid_size_upper_bounds( - self.program_callables_info, + self.callables_table, ignore_auto=ignore_auto) def get_grid_size_upper_bounds_as_exprs(self, ignore_auto=False): @@ -295,7 +295,7 @@ class Program(ImmutableRecord): *global_size* and *local_size* are :mod:`pymbolic` expressions """ return self.root_kernel.get_grid_size_upper_bounds_as_exprs( - self.program_callables_info, + self.callables_table, ignore_auto=ignore_auto) # {{{ implementation arguments @@ -338,7 +338,7 @@ class Program(ImmutableRecord): Syntactic sugar. """ - return self.program_callables_info[self.name].subkernel + return self.callables_table[self.name].subkernel @property def arg_dict(self): @@ -367,14 +367,14 @@ class Program(ImmutableRecord): Returns a copy of *self* with the topmost level kernel as *root_kernel*. """ - new_in_knl_callable = self.program_callables_info[ + new_in_knl_callable = self.callables_table[ self.name].copy(subkernel=root_kernel) new_resolved_functions = ( - self.program_callables_info.resolved_functions.copy()) + self.callables_table.resolved_functions.copy()) new_resolved_functions[self.name] = new_in_knl_callable return self.copy( - program_callables_info=self.program_callables_info.copy( + callables_table=self.callables_table.copy( resolved_functions=new_resolved_functions)) def __call__(self, *args, **kwargs): @@ -462,14 +462,14 @@ def rename_resolved_functions_in_a_single_kernel(kernel, class CallablesCountingMapper(CombineMapper): """ Returns an instance of :class:`collections.Counter` with the count of - callables registered in *program_callables_info*. + callables registered in *callables_table*. - .. attribute:: program_callables_info + .. attribute:: callables_table - An instance of :class:`loopy.program.ProgramCallablesInfo`. + An instance of :class:`loopy.program.CallablesTable`. 
""" - def __init__(self, program_callables_info): - self.program_callables_info = program_callables_info + def __init__(self, callables_table): + self.callables_table = callables_table def combine(self, values): return sum(values, Counter()) @@ -483,7 +483,7 @@ class CallablesCountingMapper(CombineMapper): kw_parameters = {} if isinstance(expr.function, (ResolvedFunction)): - in_knl_callable = self.program_callables_info[expr.function.name] + in_knl_callable = self.callables_table[expr.function.name] if isinstance(in_knl_callable, ScalarCallable): return (Counter([expr.function.name]) + self.combine((self.rec(child) for child in expr.parameters @@ -495,7 +495,7 @@ class CallablesCountingMapper(CombineMapper): callables_count_in_subkernel = ( count_callables_in_kernel( in_knl_callable.subkernel, - self.program_callables_info)) + self.callables_table)) return (Counter([expr.function.name]) + self.combine((self.rec(child) for child in expr.parameters @@ -525,16 +525,16 @@ class CallablesCountingMapper(CombineMapper): @memoize_method -def count_callables_in_kernel(kernel, program_callables_info): +def count_callables_in_kernel(kernel, callables_table): """ Returns an instance of :class:`collections.Counter` representing the number of callables in the *kernel* that are registered in - *program_callables_info*. + *callables_table*. """ assert isinstance(kernel, LoopKernel) callables_count = Counter() callables_counting_mapper = CallablesCountingMapper( - program_callables_info) + callables_table) subst_expander = SubstitutionRuleExpander(kernel.substitutions) for insn in kernel.instructions: @@ -555,7 +555,7 @@ def count_callables_in_kernel(kernel, program_callables_info): # {{{ program callables info -class ProgramCallablesInfo(ImmutableRecord): +class CallablesTable(ImmutableRecord): # FIXME: is CallablesTable a better name?(similar to symbol table in # compilers.) 
""" @@ -594,7 +594,7 @@ class ProgramCallablesInfo(ImmutableRecord): history = dict((func_id, frozenset([func_id])) for func_id in resolved_functions) - super(ProgramCallablesInfo, self).__init__( + super(CallablesTable, self).__init__( resolved_functions=resolved_functions, history=history, is_being_edited=is_being_edited) @@ -618,7 +618,7 @@ class ProgramCallablesInfo(ImmutableRecord): def callables_count(self): """ Returns an instance of :class:`collection.Counter` representing the number - of times the callables is called in program_callables_info. + of times the callables is called in callables_table. """ # should raise an error if there are more than one root kernels(which is # illegal) @@ -648,24 +648,24 @@ class ProgramCallablesInfo(ImmutableRecord): .. note:: - Always checks whether the - :attr:``loopy.ProgramCallablesInfo.resolved_functions` has + :attr:``loopy.CallablesTable.resolved_functions` has *in_kernel_callable*, does not introduce copies. - The difference between - :meth:`loopy.ProgramCallablesInfo.with_added_callable` - and :meth:`ProgramCallablesInfo.with_callable` being that + :meth:`loopy.CallablesTable.with_added_callable` + and :meth:`CallablesTable.with_callable` being that the former has no support for renaming the callable back i.e. ``with_callable`` supports renaming from ``sin_0`` to ``sin``, if possible, through the member method - ``loopy.ProgramCallablesInfo.with_exit_edit_callables_mode`` + ``loopy.CallablesTable.with_exit_edit_callables_mode`` This subtle difference makes -- - - :meth:`loopy.ProgramCallablesInfo.with_added_callable` suitable + - :meth:`loopy.CallablesTable.with_added_callable` suitable for usage while resolving the functions first time, where no renaming is needed. - - :meth:`loopy.ProgramCallablesInfo.with_callable` suitable for + - :meth:`loopy.CallablesTable.with_callable` suitable for implementing edits in callables during inference-walks. 
""" @@ -745,7 +745,7 @@ class ProgramCallablesInfo(ImmutableRecord): def with_callable(self, function, in_kernel_callable): """ Returns an instance of :class:`tuple` ``(new_self, new_function)``. - Also refer -- :meth:`loopy.ProgramCallablesInfo.with_added_callable` + Also refer -- :meth:`loopy.CallablesTable.with_added_callable` :arg function: An instance of :class:`pymbolic.primitives.Variable` or @@ -929,12 +929,12 @@ def make_program_from_kernel(kernel): """ # get the program callables info - program_callables_info = initialize_program_callables_info_from_kernel(kernel) + callables_table = initialize_callables_table_from_kernel(kernel) # get the program from program callables info program = Program( name=kernel.name, - program_callables_info=program_callables_info, + callables_table=callables_table, func_id_to_in_knl_callable_mappers=( _default_func_id_to_kernel_callable_mappers(kernel.target)), target=kernel.target) @@ -953,7 +953,7 @@ def iterate_over_kernels_if_given_program(transform_for_single_kernel): if isinstance(program_or_kernel, Program): program = program_or_kernel new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): + for func_id, in_knl_callable in program.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): new_subkernel = transform_for_single_kernel( in_knl_callable.subkernel, *args, **kwargs) @@ -968,9 +968,9 @@ def iterate_over_kernels_if_given_program(transform_for_single_kernel): new_resolved_functions[func_id] = in_knl_callable - new_program_callables_info = program.program_callables_info.copy( + new_callables_table = program.callables_table.copy( resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=new_program_callables_info) + return program.copy(callables_table=new_callables_table) else: assert isinstance(program_or_kernel, LoopKernel) kernel = program_or_kernel diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 
201bcc25..2b3f7a3b 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -1832,7 +1832,7 @@ class MinRecursionLimitForScheduling(MinRecursionLimit): # {{{ main scheduling entrypoint -def generate_loop_schedules(kernel, program_callables_info, debug_args={}): +def generate_loop_schedules(kernel, callables_table, debug_args={}): """ .. warning:: @@ -1846,18 +1846,18 @@ def generate_loop_schedules(kernel, program_callables_info, debug_args={}): with MinRecursionLimitForScheduling(kernel): for sched in generate_loop_schedules_inner(kernel, - program_callables_info, debug_args=debug_args): + callables_table, debug_args=debug_args): yield sched -def generate_loop_schedules_inner(kernel, program_callables_info, debug_args={}): +def generate_loop_schedules_inner(kernel, callables_table, debug_args={}): from loopy.kernel import KernelState if kernel.state not in (KernelState.PREPROCESSED, KernelState.SCHEDULED): raise LoopyError("cannot schedule a kernel that has not been " "preprocessed") from loopy.check import pre_schedule_checks - pre_schedule_checks(kernel, program_callables_info) + pre_schedule_checks(kernel, callables_table) schedule_count = 0 @@ -1971,7 +1971,7 @@ def generate_loop_schedules_inner(kernel, program_callables_info, debug_args={}) kernel, gen_sched) gsize, lsize = ( - kernel.get_grid_size_upper_bounds(program_callables_info)) + kernel.get_grid_size_upper_bounds(callables_table)) if (gsize or lsize): if not kernel.options.disable_global_barriers: @@ -2028,7 +2028,7 @@ schedule_cache = WriteOncePersistentDict( key_builder=LoopyKeyBuilder()) -def _get_one_scheduled_kernel_inner(kernel, program_callables_info): +def _get_one_scheduled_kernel_inner(kernel, callables_table): # This helper function exists to ensure that the generator chain is fully # out of scope after the function returns. 
This allows it to be # garbage-collected in the exit handler of the @@ -2038,10 +2038,10 @@ def _get_one_scheduled_kernel_inner(kernel, program_callables_info): # # See https://gitlab.tiker.net/inducer/sumpy/issues/31 for context. - return next(iter(generate_loop_schedules(kernel, program_callables_info))) + return next(iter(generate_loop_schedules(kernel, callables_table))) -def get_one_scheduled_kernel(kernel, program_callables_info): +def get_one_scheduled_kernel(kernel, callables_table): from loopy import CACHING_ENABLED sched_cache_key = kernel @@ -2060,7 +2060,7 @@ def get_one_scheduled_kernel(kernel, program_callables_info): with ProcessLogger(logger, "%s: schedule" % kernel.name): with MinRecursionLimitForScheduling(kernel): result = _get_one_scheduled_kernel_inner(kernel, - program_callables_info) + callables_table) if CACHING_ENABLED and not from_cache: schedule_cache.store_if_not_present(sched_cache_key, result) diff --git a/loopy/statistics.py b/loopy/statistics.py index 5dddd49e..d65387d1 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -648,11 +648,11 @@ class MemAccess(Record): # {{{ counter base class CounterBase(CombineMapper): - def __init__(self, knl, program_callables_info): + def __init__(self, knl, callables_table): self.knl = knl - self.program_callables_info = program_callables_info + self.callables_table = callables_table from loopy.type_inference import TypeInferenceMapper - self.type_inf = TypeInferenceMapper(knl, program_callables_info) + self.type_inf = TypeInferenceMapper(knl, callables_table) def combine(self, values): return sum(values) @@ -707,11 +707,11 @@ class CounterBase(CombineMapper): # {{{ ExpressionOpCounter class ExpressionOpCounter(CounterBase): - def __init__(self, knl, program_callables_info): + def __init__(self, knl, callables_table): self.knl = knl - self.program_callables_info = program_callables_info + self.callables_table = callables_table from loopy.type_inference import TypeInferenceMapper - 
self.type_inf = TypeInferenceMapper(knl, program_callables_info) + self.type_inf = TypeInferenceMapper(knl, callables_table) def combine(self, values): return sum(values) @@ -725,7 +725,7 @@ class ExpressionOpCounter(CounterBase): def map_call(self, expr): from loopy.symbolic import ResolvedFunction if isinstance(expr.function, ResolvedFunction): - function_identifier = self.program_callables_info[ + function_identifier = self.callables_table[ expr.function.name].name else: function_identifier = expr.function.name @@ -1111,7 +1111,7 @@ def count(kernel, set, space=None): from loopy.program import Program if isinstance(kernel, Program): if len([in_knl_callable for in_knl_callable in - kernel.program_callables_info.values() if isinstance(in_knl_callable, + kernel.callables_table.values() if isinstance(in_knl_callable, CallableKernel)]) != 1: raise NotImplementedError("Currently only supported for program with " "only one CallableKernel.") @@ -1216,10 +1216,10 @@ def count(kernel, set, space=None): return add_assumptions_guard(kernel, count) -def get_unused_hw_axes_factor(knl, program_callables_info, insn, +def get_unused_hw_axes_factor(knl, callables_table, insn, disregard_local_axes, space=None): # FIXME: Multi-kernel support - gsize, lsize = knl.get_grid_size_upper_bounds(program_callables_info) + gsize, lsize = knl.get_grid_size_upper_bounds(callables_table) g_used = set() l_used = set() @@ -1257,7 +1257,7 @@ def get_unused_hw_axes_factor(knl, program_callables_info, insn, return add_assumptions_guard(knl, result) -def count_insn_runs(knl, program_callables_info, insn, count_redundant_work, +def count_insn_runs(knl, callables_table, insn, count_redundant_work, disregard_local_axes=False): insn_inames = knl.insn_inames(insn) @@ -1278,7 +1278,7 @@ def count_insn_runs(knl, program_callables_info, insn, count_redundant_work, c = count(knl, domain, space=space) if count_redundant_work: - unused_fac = get_unused_hw_axes_factor(knl, program_callables_info, + unused_fac = 
get_unused_hw_axes_factor(knl, callables_table, insn, disregard_local_axes=disregard_local_axes, space=space) return c * unused_fac else: @@ -1286,7 +1286,7 @@ def count_insn_runs(knl, program_callables_info, insn, count_redundant_work, @memoize_method -def _get_insn_count(knl, program_callables_info, insn_id, subgroup_size, +def _get_insn_count(knl, callables_table, insn_id, subgroup_size, count_redundant_work, count_granularity=CountGranularity.WORKITEM): insn = knl.id_to_insn[insn_id] @@ -1299,12 +1299,12 @@ def _get_insn_count(knl, program_callables_info, insn_id, subgroup_size, if count_granularity == CountGranularity.WORKITEM: return count_insn_runs( - knl, program_callables_info, insn, + knl, callables_table, insn, count_redundant_work=count_redundant_work, disregard_local_axes=False) ct_disregard_local = count_insn_runs( - knl, program_callables_info, insn, disregard_local_axes=True, + knl, callables_table, insn, disregard_local_axes=True, count_redundant_work=count_redundant_work) if count_granularity == CountGranularity.WORKGROUP: @@ -1312,7 +1312,7 @@ def _get_insn_count(knl, program_callables_info, insn_id, subgroup_size, elif count_granularity == CountGranularity.SUBGROUP: # get the group size from loopy.symbolic import aff_to_expr - _, local_size = knl.get_grid_size_upper_bounds(program_callables_info) + _, local_size = knl.get_grid_size_upper_bounds(callables_table) workgroup_size = 1 if local_size: for size in local_size: @@ -1344,7 +1344,7 @@ def _get_insn_count(knl, program_callables_info, insn_id, subgroup_size, # {{{ get_op_map -def get_op_map_for_single_kernel(knl, program_callables_info, +def get_op_map_for_single_kernel(knl, callables_table, numpy_types=True, count_redundant_work=False, subgroup_size=None): @@ -1355,7 +1355,7 @@ def get_op_map_for_single_kernel(knl, program_callables_info, subgroup_size = _process_subgroup_size(knl, subgroup_size) op_map = ToCountMap() - op_counter = ExpressionOpCounter(knl, program_callables_info) + 
op_counter = ExpressionOpCounter(knl, callables_table) from loopy.kernel.instruction import ( CallInstruction, CInstruction, Assignment, @@ -1368,7 +1368,7 @@ def get_op_map_for_single_kernel(knl, program_callables_info, op_map = ( op_map + ToCountMap({key: val}) - * _get_insn_count(knl, program_callables_info, insn.id, + * _get_insn_count(knl, callables_table, insn.id, subgroup_size, count_redundant_work, key.count_granularity)) @@ -1458,13 +1458,13 @@ def get_op_map(program, numpy_types=True, count_redundant_work=False, op_map = ToCountMap() callables_count = ( - program.program_callables_info.callables_count) + program.callables_table.callables_count) - for func_id, in_knl_callable in program.program_callables_info.items(): + for func_id, in_knl_callable in program.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): knl = in_knl_callable.subkernel knl_op_map = get_op_map_for_single_kernel(knl, - program.program_callables_info, numpy_types, + program.callables_table, numpy_types, count_redundant_work, subgroup_size) for i in range(callables_count[func_id]): @@ -1535,7 +1535,7 @@ def _process_subgroup_size(knl, subgroup_size_requested): # {{{ get_mem_access_map -def get_mem_access_map_for_single_kernel(knl, program_callables_info, +def get_mem_access_map_for_single_kernel(knl, callables_table, numpy_types=True, count_redundant_work=False, subgroup_size=None): if not knl.options.ignore_boostable_into: @@ -1545,8 +1545,8 @@ def get_mem_access_map_for_single_kernel(knl, program_callables_info, subgroup_size = _process_subgroup_size(knl, subgroup_size) access_map = ToCountMap() - access_counter_g = GlobalMemAccessCounter(knl, program_callables_info) - access_counter_l = LocalMemAccessCounter(knl, program_callables_info) + access_counter_g = GlobalMemAccessCounter(knl, callables_table) + access_counter_l = LocalMemAccessCounter(knl, callables_table) from loopy.kernel.instruction import ( CallInstruction, CInstruction, Assignment, @@ -1569,7 +1569,7 
@@ def get_mem_access_map_for_single_kernel(knl, program_callables_info, access_map = ( access_map + ToCountMap({key: val}) - * _get_insn_count(knl, program_callables_info, insn.id, + * _get_insn_count(knl, callables_table, insn.id, subgroup_size, count_redundant_work, key.count_granularity)) @@ -1578,7 +1578,7 @@ def get_mem_access_map_for_single_kernel(knl, program_callables_info, access_map = ( access_map + ToCountMap({key: val}) - * _get_insn_count(knl, program_callables_info, insn.id, + * _get_insn_count(knl, callables_table, insn.id, subgroup_size, count_redundant_work, key.count_granularity)) @@ -1700,13 +1700,13 @@ def get_mem_access_map(program, numpy_types=True, count_redundant_work=False, access_map = ToCountMap() - callables_count = program.program_callables_info.callables_count + callables_count = program.callables_table.callables_count - for func_id, in_knl_callable in program.program_callables_info.items(): + for func_id, in_knl_callable in program.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): knl = in_knl_callable.subkernel knl_access_map = get_mem_access_map_for_single_kernel(knl, - program.program_callables_info, numpy_types, + program.callables_table, numpy_types, count_redundant_work, subgroup_size) # FIXME: didn't see any easy way to multiply @@ -1726,7 +1726,7 @@ def get_mem_access_map(program, numpy_types=True, count_redundant_work=False, # {{{ get_synchronization_map -def get_synchronization_map_for_single_kernel(knl, program_callables_info, +def get_synchronization_map_for_single_kernel(knl, callables_table, subgroup_size=None): """Count the number of synchronization events each work-item encounters in @@ -1772,7 +1772,7 @@ def get_synchronization_map_for_single_kernel(knl, program_callables_info, from loopy.schedule import (EnterLoop, LeaveLoop, Barrier, CallKernel, ReturnFromKernel, RunInstruction) from operator import mul - knl = lp.get_one_scheduled_kernel(knl, program_callables_info) + knl = 
lp.get_one_scheduled_kernel(knl, callables_table) iname_list = [] result = ToCountMap() @@ -1824,13 +1824,13 @@ def get_synchronization_map(program, subgroup_size=None): program = preprocess_program(program) sync_map = ToCountMap() - callables_count = program.program_callables_info.callables_count + callables_count = program.callables_table.callables_count - for func_id, in_knl_callable in program.program_callables_info.items(): + for func_id, in_knl_callable in program.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): knl = in_knl_callable.subkernel knl_sync_map = get_synchronization_map_for_single_kernel(knl, - program.program_callables_info, subgroup_size) + program.callables_table, subgroup_size) # FIXME: didn't see any easy way to multiply for i in range(callables_count[func_id]): @@ -1887,7 +1887,7 @@ def gather_access_footprints_for_single_kernel(kernel, ignore_uncountable=False) def gather_access_footprints(program, ignore_uncountable=False): # FIMXE: works only for one callable kernel till now. 
if len([in_knl_callable for in_knl_callable in - program.program_callables_info.values() if isinstance(in_knl_callable, + program.callables_table.values() if isinstance(in_knl_callable, CallableKernel)]) != 1: raise NotImplementedError("Currently only supported for program with " "only one CallableKernel.") @@ -1900,9 +1900,9 @@ def gather_access_footprints(program, ignore_uncountable=False): write_footprints = [] read_footprints = [] - callables_count = program.program_callables_info.callables_count + callables_count = program.callables_table.callables_count - for func_id, in_knl_callable in program.program_callables_info.items(): + for func_id, in_knl_callable in program.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): knl = in_knl_callable.subkernel knl_write_footprints, knl_read_footprints = ( diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 92ee2dc5..f27ee4e9 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -80,7 +80,7 @@ class TargetBase(object): def preprocess(self, kernel): return kernel - def pre_codegen_check(self, kernel, program_callables_info): + def pre_codegen_check(self, kernel, callables_table): pass # }}} diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 418ce025..9b5aaf8e 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -362,7 +362,7 @@ class CMathCallable(ScalarCallable): C-Target. 
""" - def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info): + def with_types(self, arg_id_to_dtype, caller_kernel, callables_table): name = self.name if name in ["abs", "min", "max"]: @@ -381,7 +381,7 @@ class CMathCallable(ScalarCallable): # callable return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) dtype = arg_id_to_dtype[0] dtype = dtype.numpy_dtype @@ -409,7 +409,7 @@ class CMathCallable(ScalarCallable): self.copy(name_in_target=name, arg_id_to_dtype={0: NumpyType(dtype), -1: NumpyType(dtype)}), - program_callables_info) + callables_table) # binary functions if name in ["fmax", "fmin"]: @@ -424,7 +424,7 @@ class CMathCallable(ScalarCallable): # callable return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) dtype = np.find_common_type( [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items() @@ -449,11 +449,11 @@ class CMathCallable(ScalarCallable): return ( self.copy(name_in_target=name, arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}), - program_callables_info) + callables_table) return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) def scope_c_math_functions(target, identifier): @@ -893,7 +893,7 @@ class CASTBuilder(ASTBuilderBase): ecm = codegen_state.expression_to_code_mapper func_id = insn.expression.function.name - in_knl_callable = codegen_state.program_callables_info[func_id] + in_knl_callable = codegen_state.callables_table[func_id] if isinstance(in_knl_callable, ScalarCallable) and ( in_knl_callable.name_in_target == 'loopy_make_tuple'): diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 65a8c202..289877d9 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -55,7 +55,7 @@ class ExpressionToCExpressionMapper(IdentityMapper): if type_inf_mapper is None: type_inf_mapper = TypeInferenceMapper(self.kernel, 
- self.codegen_state.program_callables_info) + self.codegen_state.callables_table) self.type_inf_mapper = type_inf_mapper self.allow_complex = codegen_state.allow_complex @@ -389,7 +389,7 @@ class ExpressionToCExpressionMapper(IdentityMapper): # {{{ implement indexof, indexof_vec identifier_name = ( - self.codegen_state.program_callables_info[expr.function.name].name) + self.codegen_state.callables_table[expr.function.name].name) if identifier_name in ["indexof", "indexof_vec"]: if len(expr.parameters) != 1: raise LoopyError("%s takes exactly one argument" % identifier_name) @@ -432,11 +432,11 @@ class ExpressionToCExpressionMapper(IdentityMapper): # }}} from loopy.kernel.function_interface import ManglerCallable - if isinstance(self.codegen_state.program_callables_info[expr.function.name], + if isinstance(self.codegen_state.callables_table[expr.function.name], ManglerCallable): from loopy.codegen import SeenFunction in_knl_callable = ( - self.codegen_state.program_callables_info[ + self.codegen_state.callables_table[ expr.function.name]) mangle_result = in_knl_callable.mangle_result(self.kernel) self.codegen_state.seen_functions.add( @@ -445,7 +445,7 @@ class ExpressionToCExpressionMapper(IdentityMapper): mangle_result.arg_dtypes)) return ( - self.codegen_state.program_callables_info[ + self.codegen_state.callables_table[ expr.function.name].emit_call( expression_to_code_mapper=self, expression=expr, diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index e6abf73f..32b810eb 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -123,7 +123,7 @@ _CUDA_SPECIFIC_FUNCTIONS = { class CudaCallable(ScalarCallable): def cuda_with_types(self, arg_id_to_dtype, caller_kernel, - program_callables_info): + callables_table): name = self.name @@ -138,7 +138,7 @@ class CudaCallable(ScalarCallable): # callable return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) dtype = arg_id_to_dtype[0] scalar_dtype, offset, field_name 
= dtype.numpy_dtype.fields["x"] @@ -146,7 +146,7 @@ class CudaCallable(ScalarCallable): self.copy(name_in_target=name, arg_id_to_dtype={-1: NumpyType(scalar_dtype), 0: dtype, 1: dtype}), - program_callables_info) + callables_table) if name in _CUDA_SPECIFIC_FUNCTIONS: num_args = _CUDA_SPECIFIC_FUNCTIONS[name] @@ -161,7 +161,7 @@ class CudaCallable(ScalarCallable): # callable return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) dtype = np.find_common_type( [], [dtype.numpy_dtype for id, dtype in @@ -177,11 +177,11 @@ class CudaCallable(ScalarCallable): return ( self.copy(name_in_target=name, arg_id_to_dtype=updated_arg_id_to_dtype), - program_callables_info) + callables_table) return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) def scope_cuda_functions(target, identifier): @@ -303,7 +303,7 @@ class CUDACASTBuilder(CASTBuilder): codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs( get_insn_ids_for_block_at( codegen_state.kernel.schedule, schedule_index), - codegen_state.program_callables_info) + codegen_state.callables_table) from loopy.symbolic import get_dependencies if not get_dependencies(local_grid_size): diff --git a/loopy/target/execution.py b/loopy/target/execution.py index 43963ddb..c067bc4b 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -763,7 +763,7 @@ class KernelExecutorBase(object): from loopy.schedule import get_one_scheduled_kernel program = program.with_root_kernel( get_one_scheduled_kernel(program.root_kernel, - program.program_callables_info)) + program.callables_table)) return program diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index f8c42ad6..94a81a65 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -172,9 +172,9 @@ class ISPCTarget(CTarget): host_program_name_suffix = "" device_program_name_suffix = "_inner" - def pre_codegen_check(self, kernel, program_callables_info): + def 
pre_codegen_check(self, kernel, callables_table): gsize, lsize = kernel.get_grid_size_upper_bounds_as_exprs( - program_callables_info) + callables_table) if len(lsize) > 1: for i, ls_i in enumerate(lsize[1:]): if ls_i != 1: diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index d8c195de..ea29665a 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -172,7 +172,7 @@ class OpenCLCallable(ScalarCallable): :class:`loopy.target.c.CMathCallable`. """ - def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info): + def with_types(self, arg_id_to_dtype, caller_kernel, callables_table): name = self.name if name in ["max", "min"]: @@ -182,7 +182,7 @@ class OpenCLCallable(ScalarCallable): if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype: return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) dtype = np.find_common_type( [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items() @@ -195,7 +195,7 @@ class OpenCLCallable(ScalarCallable): return ( self.copy(name_in_target=name, arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}), - program_callables_info) + callables_table) else: # Unsupported type. 
raise LoopyError("%s function not supported for the types %s" % @@ -212,14 +212,14 @@ class OpenCLCallable(ScalarCallable): # callable return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) dtype = arg_id_to_dtype[0] scalar_dtype, offset, field_name = dtype.numpy_dtype.fields["s0"] return ( self.copy(name_in_target=name, arg_id_to_dtype={-1: NumpyType(scalar_dtype), 0: dtype, 1: dtype}), - program_callables_info) + callables_table) if name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS: num_args = _CL_SIMPLE_MULTI_ARG_FUNCTIONS[name] @@ -234,7 +234,7 @@ class OpenCLCallable(ScalarCallable): # callable return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) dtype = np.find_common_type( [], [dtype.numpy_dtype for id, dtype in @@ -250,7 +250,7 @@ class OpenCLCallable(ScalarCallable): return ( self.copy(name_in_target=name, arg_id_to_dtype=updated_arg_id_to_dtype), - program_callables_info) + callables_table) if name in VECTOR_LITERAL_FUNCS: base_tp_name, dtype, count = VECTOR_LITERAL_FUNCS[name] @@ -266,7 +266,7 @@ class OpenCLCallable(ScalarCallable): # callable return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in range(count)) @@ -276,13 +276,13 @@ class OpenCLCallable(ScalarCallable): return ( self.copy(name_in_target="(%s%d) " % (base_tp_name, count), arg_id_to_dtype=updated_arg_id_to_dtype), - program_callables_info) + callables_table) # does not satisfy any of the conditions needed for specialization. # hence just returning a copy of the callable. 
return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) def scope_opencl_functions(target, identifier): @@ -479,7 +479,7 @@ class OpenCLCASTBuilder(CASTBuilder): _, local_sizes = codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs( get_insn_ids_for_block_at( codegen_state.kernel.schedule, schedule_index), - codegen_state.program_callables_info) + codegen_state.callables_table) from loopy.symbolic import get_dependencies if not get_dependencies(local_sizes): diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 435a5e79..d98b6cdd 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -134,7 +134,7 @@ def adjust_local_temp_var_storage(kernel, device): # {{{ check sizes against device properties -def check_sizes(kernel, program_callables_info, device): +def check_sizes(kernel, callables_table, device): import loopy as lp from loopy.diagnostic import LoopyAdvisory, LoopyError @@ -152,7 +152,7 @@ def check_sizes(kernel, program_callables_info, device): parameters[arg.name] = arg.approximately glens, llens = ( - kernel.get_grid_size_upper_bounds_as_exprs(program_callables_info)) + kernel.get_grid_size_upper_bounds_as_exprs(callables_table)) if (max(len(glens), len(llens)) > device.max_work_item_dimensions): @@ -207,7 +207,7 @@ class PyOpenCLCallable(ScalarCallable): Records information about the callables which are not covered by :class:`loopy.target.opencl.OpenCLCallable` """ - def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info): + def with_types(self, arg_id_to_dtype, caller_kernel, callables_table): name = self.name @@ -221,7 +221,7 @@ class PyOpenCLCallable(ScalarCallable): # callable return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) dtype = arg_id_to_dtype[0] @@ -238,7 +238,7 @@ class PyOpenCLCallable(ScalarCallable): self.copy(name_in_target="%s_%s" % (tpname, name), arg_id_to_dtype={0: dtype, -1: NumpyType( 
np.dtype(dtype.numpy_dtype.type(0).real))}), - program_callables_info) + callables_table) if name in ["sqrt", "exp", "log", "sin", "cos", "tan", @@ -256,7 +256,7 @@ class PyOpenCLCallable(ScalarCallable): return ( self.copy(name_in_target="%s_%s" % (tpname, name), arg_id_to_dtype={0: dtype, -1: dtype}), - program_callables_info) + callables_table) else: # function calls for floating parameters. numpy_dtype = dtype.numpy_dtype @@ -267,11 +267,11 @@ class PyOpenCLCallable(ScalarCallable): return ( self.copy(name_in_target=name, arg_id_to_dtype={0: dtype, -1: dtype}), - program_callables_info) + callables_table) return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) def pyopencl_function_id_to_in_knl_callable_mapper(target, identifier): @@ -397,8 +397,8 @@ class PyOpenCLTarget(OpenCLTarget): kernel = adjust_local_temp_var_storage(kernel, self.device) return kernel - def pre_codegen_check(self, kernel, program_callables_info): - check_sizes(kernel, program_callables_info, self.device) + def pre_codegen_check(self, kernel, callables_table): + check_sizes(kernel, callables_table, self.device) def get_host_ast_builder(self): return PyOpenCLPythonASTBuilder(self) diff --git a/loopy/target/python.py b/loopy/target/python.py index 2e6712ec..1f83112f 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -45,7 +45,7 @@ class ExpressionToPythonMapper(StringifyMapper): if type_inf_mapper is None: type_inf_mapper = TypeInferenceMapper(self.kernel, - self.codegen_state.program_callables_info) + self.codegen_state.callables_table) self.type_inf_mapper = type_inf_mapper def handle_unsupported_expression(self, victim, enclosing_prec): @@ -85,7 +85,7 @@ class ExpressionToPythonMapper(StringifyMapper): def map_call(self, expr, enclosing_prec): from pymbolic.mapper.stringifier import PREC_NONE - identifier_name = self.codegen_state.program_callables_info[ + identifier_name = self.codegen_state.callables_table[ 
expr.function.name].name if identifier_name in ["indexof", "indexof_vec"]: @@ -93,7 +93,7 @@ class ExpressionToPythonMapper(StringifyMapper): "indexof, indexof_vec not yet supported in Python") from loopy.kernel.function_interface import ManglerCallable - in_knl_callable = self.codegen_state.program_callables_info[ + in_knl_callable = self.codegen_state.callables_table[ expr.function.name] if isinstance(in_knl_callable, ManglerCallable): from loopy.codegen import SeenFunction diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py index 57c4397f..2519b6a1 100644 --- a/loopy/transform/buffer.py +++ b/loopy/transform/buffer.py @@ -133,7 +133,7 @@ buffer_array_cache = WriteOncePersistentDict( # Adding an argument? also add something to the cache_key below. -def buffer_array_for_single_kernel(kernel, program_callables_info, var_name, +def buffer_array_for_single_kernel(kernel, callables_table, var_name, buffer_inames, init_expression=None, store_expression=None, within=None, default_tag="l.auto", temporary_scope=None, temporary_is_local=None, fetch_bounding_box=False): @@ -534,7 +534,7 @@ def buffer_array_for_single_kernel(kernel, program_callables_info, var_name, kernel = tag_inames(kernel, new_iname_to_tag) from loopy.kernel.tools import assign_automatic_axes - kernel = assign_automatic_axes(kernel, program_callables_info) + kernel = assign_automatic_axes(kernel, callables_table) if CACHING_ENABLED: from loopy.preprocess import prepare_for_caching @@ -548,10 +548,10 @@ def buffer_array(program, *args, **kwargs): assert isinstance(program, Program) new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): + for func_id, in_knl_callable in program.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): new_subkernel = buffer_array_for_single_kernel( - in_knl_callable.subkernel, program.program_callables_info, + in_knl_callable.subkernel, program.callables_table, *args, **kwargs) in_knl_callable = 
in_knl_callable.copy( subkernel=new_subkernel) @@ -564,8 +564,8 @@ def buffer_array(program, *args, **kwargs): new_resolved_functions[func_id] = in_knl_callable - new_program_callables_info = program.program_callables_info.copy( + new_callables_table = program.callables_table.copy( resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=new_program_callables_info) + return program.copy(callables_table=new_callables_table) # vim: foldmethod=marker diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 90f53095..0013de1d 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -46,11 +46,11 @@ def _resolved_callables_from_function_lookup(program, ``(target, identifier)`` that returns either an instance of :class:`loopy.InKernelCallable` or *None*. """ - program_callables_info = program.program_callables_info + callables_table = program.callables_table callable_knls = dict( (func_id, in_knl_callable) for func_id, in_knl_callable in - program_callables_info.items() if isinstance(in_knl_callable, + callables_table.items() if isinstance(in_knl_callable, CallableKernel)) edited_callable_knls = {} @@ -62,28 +62,28 @@ def _resolved_callables_from_function_lookup(program, kernel.substitutions, kernel.get_var_name_generator()) resolved_function_marker = ResolvedFunctionMarker( - rule_mapping_context, kernel, program_callables_info, + rule_mapping_context, kernel, callables_table, [func_id_to_in_kernel_callable_mapper]) new_subkernel = rule_mapping_context.finish_kernel( resolved_function_marker.map_kernel(kernel)) - program_callables_info = resolved_function_marker.program_callables_info + callables_table = resolved_function_marker.callables_table edited_callable_knls[func_id] = in_knl_callable.copy( subkernel=new_subkernel) new_resolved_functions = {} - for func_id, in_knl_callable in program_callables_info.items(): + for func_id, in_knl_callable in callables_table.items(): if func_id in 
edited_callable_knls: new_resolved_functions[func_id] = edited_callable_knls[func_id] else: new_resolved_functions[func_id] = in_knl_callable - program_callables_info = program_callables_info.copy( + callables_table = callables_table.copy( resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=program_callables_info) + return program.copy(callables_table=callables_table) def register_function_id_to_in_knl_callable_mapper(program, diff --git a/loopy/transform/data.py b/loopy/transform/data.py index 5f4f2f2a..888bedc1 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -143,7 +143,7 @@ class _not_provided: # noqa: N801 pass -def add_prefetch_for_single_kernel(kernel, program_callables_info, var_name, +def add_prefetch_for_single_kernel(kernel, callables_table, var_name, sweep_inames=[], dim_arg_names=None, # "None" is a valid value here, distinct from the default. @@ -334,7 +334,7 @@ def add_prefetch_for_single_kernel(kernel, program_callables_info, var_name, # warning message. 
from loopy.transform.precompute import precompute_for_single_kernel - new_kernel = precompute_for_single_kernel(kernel, program_callables_info, + new_kernel = precompute_for_single_kernel(kernel, callables_table, subst_use, sweep_inames, precompute_inames=dim_arg_names, default_tag=default_tag, dtype=arg.dtype, fetch_bounding_box=fetch_bounding_box, @@ -373,10 +373,10 @@ def add_prefetch(program, *args, **kwargs): assert isinstance(program, Program) new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): + for func_id, in_knl_callable in program.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): new_subkernel = add_prefetch_for_single_kernel( - in_knl_callable.subkernel, program.program_callables_info, + in_knl_callable.subkernel, program.callables_table, *args, **kwargs) in_knl_callable = in_knl_callable.copy( subkernel=new_subkernel) @@ -389,9 +389,9 @@ def add_prefetch(program, *args, **kwargs): new_resolved_functions[func_id] = in_knl_callable - new_program_callables_info = program.program_callables_info.copy( + new_callables_table = program.callables_table.copy( resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=new_program_callables_info) + return program.copy(callables_table=new_callables_table) # }}} diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py index 44e69ecf..9b83f242 100644 --- a/loopy/transform/fusion.py +++ b/loopy/transform/fusion.py @@ -420,23 +420,23 @@ def fuse_kernels(programs, suffixes=None, data_flow=None): """ # all the resolved functions in programs must be registered in - # main_program_callables_info + # main_callables_table main_prog_callables_info = ( - programs[0].program_callables_info) + programs[0].callables_table) old_root_kernel_callable = ( - programs[0].program_callables_info[programs[0].name]) + programs[0].callables_table[programs[0].name]) kernels = [programs[0].root_kernel] # removing the callable 
collisions that maybe present for prog in programs[1:]: root_kernel = prog.root_kernel renames_needed = {} - for old_func_id, in_knl_callable in prog.program_callables_info.items(): + for old_func_id, in_knl_callable in prog.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): # Fusing programs with multiple callable kernels is tough. # Reason: Need to first figure out the order in which the # callable kernels must be resolved into - # main_program_callables_info, because of renaming is + # main_callables_table, because of renaming is # needed to be done in the callable kernels before registering. # Hence disabling it until required. if in_knl_callable.subkernel.name != prog.name: @@ -468,6 +468,6 @@ def fuse_kernels(programs, suffixes=None, data_flow=None): var(programs[0].name), new_root_kernel_callable) return programs[0].copy( - program_callables_info=main_prog_callables_info) + callables_table=main_prog_callables_info) # vim: foldmethod=marker diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index b6a0454e..fb6682f4 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -1095,7 +1095,7 @@ def get_iname_duplication_options_for_single_kernel(knl, use_boostable_into=Fals def get_iname_duplication_options(program, use_boostable_into=False): - for in_knl_callable in program.program_callables_info.values(): + for in_knl_callable in program.callables_table.values(): if isinstance(in_knl_callable, CallableKernel): for option in get_iname_duplication_options_for_single_kernel( in_knl_callable.subkernel, use_boostable_into): @@ -1121,7 +1121,7 @@ def has_schedulable_iname_nesting_for_single_kernel(knl): def has_schedulable_iname_nesting(program): return all(has_schedulable_iname_nesting_for_single_kernel( in_knl_callable.subkernel) for in_knl_callable in - program.program_callables_info.values() if isinstance(in_knl_callable, + program.callables_table.values() if isinstance(in_knl_callable, CallableKernel)) # }}} diff 
--git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index 93cf932b..f73110ec 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -42,7 +42,7 @@ def find_instructions_in_single_kernel(kernel, insn_match): def find_instructions(program, insn_match): assert isinstance(program, Program) insns = [] - for in_knl_callable in program.program_callables_info.values(): + for in_knl_callable in program.callables_table.values(): if isinstance(in_knl_callable, CallableKernel): insns += (find_instructions_in_single_kernel( in_knl_callable.subkernel, insn_match)) diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index 66c7114a..71b11fa2 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -261,7 +261,7 @@ class _not_provided(object): # noqa: N801 pass -def precompute_for_single_kernel(kernel, program_callables_info, subst_use, +def precompute_for_single_kernel(kernel, callables_table, subst_use, sweep_inames=[], within=None, storage_axes=None, temporary_name=None, precompute_inames=None, precompute_outer_inames=None, storage_axis_to_tag={}, @@ -1047,7 +1047,7 @@ def precompute_for_single_kernel(kernel, program_callables_info, subst_use, if filter_iname_tags_by_type(new_iname_to_tag.values(), AutoFitLocalIndexTag): from loopy.kernel.tools import assign_automatic_axes - kernel = assign_automatic_axes(kernel, program_callables_info) + kernel = assign_automatic_axes(kernel, callables_table) return kernel @@ -1056,10 +1056,10 @@ def precompute(program, *args, **kwargs): assert isinstance(program, Program) new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): + for func_id, in_knl_callable in program.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): new_subkernel = precompute_for_single_kernel( - in_knl_callable.subkernel, program.program_callables_info, + in_knl_callable.subkernel, program.callables_table, 
*args, **kwargs) in_knl_callable = in_knl_callable.copy( subkernel=new_subkernel) @@ -1072,8 +1072,8 @@ def precompute(program, *args, **kwargs): new_resolved_functions[func_id] = in_knl_callable - new_program_callables_info = program.program_callables_info.copy( + new_callables_table = program.callables_table.copy( resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=new_program_callables_info) + return program.copy(callables_table=new_callables_table) # vim: foldmethod=marker diff --git a/loopy/transform/save.py b/loopy/transform/save.py index 4b957b03..e463353e 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -235,9 +235,9 @@ class TemporarySaver(object): def new_shape(self): return self.hw_dims + self.non_hw_dims - def __init__(self, kernel, program_callables_info): + def __init__(self, kernel, callables_table): self.kernel = kernel - self.program_callables_info = program_callables_info + self.callables_table = callables_table self.var_name_gen = kernel.get_var_name_generator() self.insn_name_gen = kernel.get_instruction_id_generator() @@ -441,7 +441,7 @@ class TemporarySaver(object): group_sizes, local_sizes = ( self.kernel.get_grid_sizes_for_insn_ids_as_exprs(accessor_insn_ids, - self.program_callables_info)) + self.callables_table)) if temporary.address_space == lp.AddressSpace.LOCAL: # Elide local axes in the save slot for local temporaries. 
@@ -630,7 +630,7 @@ class TemporarySaver(object): kernel = lp.add_nosync(kernel, "global", source, sink) from loopy.kernel.tools import assign_automatic_axes - return assign_automatic_axes(kernel, self.program_callables_info) + return assign_automatic_axes(kernel, self.callables_table) def save(self, temporary, subkernel): self.save_or_reload_impl(temporary, subkernel, "save") @@ -754,12 +754,12 @@ def save_and_reload_temporaries(program): program = lp.preprocess_program(program) from loopy.schedule import get_one_scheduled_kernel knl = get_one_scheduled_kernel(program.root_kernel, - program.program_callables_info) + program.callables_table) assert knl.schedule is not None liveness = LivenessAnalysis(knl) - saver = TemporarySaver(knl, program.program_callables_info) + saver = TemporarySaver(knl, program.callables_table) from loopy.schedule.tools import ( temporaries_read_in_subkernel, temporaries_written_in_subkernel) diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py index afe3fec5..acdf5b2a 100644 --- a/loopy/transform/subst.py +++ b/loopy/transform/subst.py @@ -510,7 +510,7 @@ def find_rules_matching(knl, pattern): def find_one_rule_matching(program, pattern): rules = [] - for in_knl_callable in program.program_callables_info.values(): + for in_knl_callable in program.callables_table.values(): if isinstance(in_knl_callable, CallableKernel): knl = in_knl_callable.subkernel rules.extend(find_rules_matching(knl, pattern)) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 43986640..029381d8 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -35,7 +35,7 @@ from loopy.diagnostic import ( TypeInferenceFailure, DependencyTypeInferenceFailure) from loopy.kernel.instruction import _DataObliviousInstruction -from loopy.program import ProgramCallablesInfo +from loopy.program import CallablesTable from loopy.symbolic import ( LinearSubscript, parse_tagged_name, RuleAwareIdentityMapper, SubstitutionRuleExpander, 
ResolvedFunction, @@ -197,7 +197,7 @@ def change_names_of_pymbolic_calls(kernel, pymbolic_calls_to_new_names): # {{{ type inference mapper class TypeInferenceMapper(CombineMapper): - def __init__(self, kernel, program_callables_info, new_assignments=None): + def __init__(self, kernel, callables_table, new_assignments=None): """ :arg new_assignments: mapping from names to either :class:`loopy.kernel.data.TemporaryVariable` @@ -206,12 +206,12 @@ class TypeInferenceMapper(CombineMapper): instances """ self.kernel = kernel - assert isinstance(program_callables_info, ProgramCallablesInfo) + assert isinstance(callables_table, CallablesTable) if new_assignments is None: new_assignments = {} self.new_assignments = new_assignments self.symbols_with_unknown_types = set() - self.program_callables_info = program_callables_info + self.callables_table = callables_table self.old_calls_to_new_calls = {} def __call__(self, expr, return_tuple=False, return_dtype_set=False): @@ -245,16 +245,16 @@ class TypeInferenceMapper(CombineMapper): # /!\ Introduce caches with care--numpy.float32(x) and numpy.float64(x) # are Python-equal (for many common constants such as integers). 
- def copy(self, program_callables_info=None): - if program_callables_info is None: - program_callables_info = self.program_callables_info - return type(self)(self.kernel, program_callables_info, + def copy(self, callables_table=None): + if callables_table is None: + callables_table = self.callables_table + return type(self)(self.kernel, callables_table, self.new_assignments) def with_assignments(self, names_to_vars): new_ass = self.new_assignments.copy() new_ass.update(names_to_vars) - return type(self)(self.kernel, self.program_callables_info, new_ass) + return type(self)(self.kernel, self.callables_table, new_ass) @staticmethod def combine(dtype_sets): @@ -431,7 +431,7 @@ class TypeInferenceMapper(CombineMapper): # specializing the known function wrt type if isinstance(expr.function, ResolvedFunction): - in_knl_callable = self.program_callables_info[expr.function.name] + in_knl_callable = self.callables_table[expr.function.name] # {{{ checking that there is no overwriting of types of in_knl_callable @@ -465,17 +465,17 @@ class TypeInferenceMapper(CombineMapper): # }}} - in_knl_callable, self.program_callables_info = ( + in_knl_callable, self.callables_table = ( in_knl_callable.with_types( arg_id_to_dtype, self.kernel, - self.program_callables_info)) + self.callables_table)) in_knl_callable = in_knl_callable.with_target(self.kernel.target) # storing the type specialized function so that it can be used for # later use - self.program_callables_info, new_function_id = ( - self.program_callables_info.with_callable( + self.callables_table, new_function_id = ( + self.callables_table.with_callable( expr.function.function, in_knl_callable)) @@ -538,8 +538,8 @@ class TypeInferenceMapper(CombineMapper): in_knl_callable = ManglerCallable( identifier, function_mangler, arg_id_to_dtype, arg_id_to_descr, mangle_result.target_name) - self.program_callables_info, new_function_id = ( - self.program_callables_info.with_added_callable( + self.callables_table, new_function_id = ( + 
self.callables_table.with_added_callable( expr.function, in_knl_callable)) if isinstance(expr, Call): @@ -688,7 +688,7 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): if var_name in kernel.all_params(): return [kernel.index_dtype], [], {}, ( - type_inf_mapper.program_callables_info) + type_inf_mapper.callables_table) from functools import partial debug = partial(_debug, kernel) @@ -735,13 +735,13 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): if not dtype_sets: return ( None, type_inf_mapper.symbols_with_unknown_types, None, - type_inf_mapper.program_callables_info) + type_inf_mapper.callables_table) result = type_inf_mapper.combine(dtype_sets) return (result, type_inf_mapper.symbols_with_unknown_types, type_inf_mapper.old_calls_to_new_calls, - type_inf_mapper.program_callables_info) + type_inf_mapper.callables_table) # }}} @@ -768,7 +768,7 @@ class _DictUnionView: # {{{ infer_unknown_types -def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, +def infer_unknown_types_for_a_single_kernel(kernel, callables_table, expect_completion=False): """Infer types on temporaries and arguments.""" @@ -831,7 +831,7 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, new_temp_vars, new_arg_dict ]) - type_inf_mapper = TypeInferenceMapper(kernel, program_callables_info, + type_inf_mapper = TypeInferenceMapper(kernel, callables_table, item_lookup) from loopy.symbolic import SubstitutionRuleExpander @@ -867,11 +867,11 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, debug("inferring type for %s %s", type(item).__name__, item.name) (result, symbols_with_unavailable_types, - new_old_calls_to_new_calls, program_callables_info) = ( + new_old_calls_to_new_calls, callables_table) = ( _infer_var_type( kernel, item.name, type_inf_mapper, subst_expander)) type_inf_mapper = type_inf_mapper.copy( - program_callables_info=program_callables_info) + 
callables_table=callables_table) failed = not result if not failed: @@ -979,7 +979,7 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, raise NotImplementedError("Unknown instructions type %s." % ( type(insn).__name__)) - program_callables_info = type_inf_mapper.program_callables_info + callables_table = type_inf_mapper.callables_table old_calls_to_new_calls.update(type_inf_mapper.old_calls_to_new_calls) end_time = time.time() @@ -1003,39 +1003,39 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, from loopy.check import check_functions_are_resolved check_functions_are_resolved(type_specialized_kernel) - return type_specialized_kernel, program_callables_info + return type_specialized_kernel, callables_table def infer_unknown_types(program, expect_completion=False): """Infer types on temporaries and arguments.""" - program_callables_info = program.program_callables_info + callables_table = program.callables_table type_uninferred_knl_callable = ( - program_callables_info[program.name]) + callables_table[program.name]) type_uninferred_root_kernel = type_uninferred_knl_callable.subkernel - old_callables_count = program_callables_info.callables_count - program_callables_info = ( - program.program_callables_info.with_edit_callables_mode()) - root_kernel, program_callables_info = ( + old_callables_count = callables_table.callables_count + callables_table = ( + program.callables_table.with_edit_callables_mode()) + root_kernel, callables_table = ( infer_unknown_types_for_a_single_kernel( type_uninferred_root_kernel, - program_callables_info, expect_completion)) + callables_table, expect_completion)) type_inferred_knl_callable = type_uninferred_knl_callable.copy( subkernel=root_kernel) - program_callables_info, _ = ( - program_callables_info.with_callable( + callables_table, _ = ( + callables_table.with_callable( program.name, type_inferred_knl_callable)) - program_callables_info = ( - 
program_callables_info.with_exit_edit_callables_mode( + callables_table = ( + callables_table.with_exit_edit_callables_mode( old_callables_count)) - return program.copy(program_callables_info=program_callables_info) + return program.copy(callables_table=callables_table) # }}} @@ -1043,8 +1043,8 @@ def infer_unknown_types(program, expect_completion=False): # {{{ reduction expression helper def infer_arg_and_reduction_dtypes_for_reduction_expression( - kernel, expr, program_callables_info, unknown_types_ok): - type_inf_mapper = TypeInferenceMapper(kernel, program_callables_info) + kernel, expr, callables_table, unknown_types_ok): + type_inf_mapper = TypeInferenceMapper(kernel, callables_table) import loopy as lp if expr.is_tuple_typed: @@ -1076,7 +1076,7 @@ def infer_arg_and_reduction_dtypes_for_reduction_expression( for dt in reduction_dtypes) return tuple(arg_dtypes), reduction_dtypes, ( - type_inf_mapper.program_callables_info) + type_inf_mapper.callables_table) # }}} diff --git a/test/test_loopy.py b/test/test_loopy.py index 43371c8a..fa32ca04 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -416,7 +416,7 @@ def test_ilp_write_race_detection_global(ctx_factory): from warnings import catch_warnings with catch_warnings(record=True) as warn_list: list(lp.generate_loop_schedules(knl.root_kernel, - knl.program_callables_info)) + knl.callables_table)) assert any(isinstance(w.message, WriteRaceConditionWarning) for w in warn_list) @@ -1271,7 +1271,7 @@ def save_and_reload_temporaries_test(queue, prog, out_expect, debug=False): from loopy.transform.save import save_and_reload_temporaries prog = save_and_reload_temporaries(prog) prog = prog.with_root_kernel(lp.get_one_scheduled_kernel(prog.root_kernel, - prog.program_callables_info)) + prog.callables_table)) if debug: print(prog) @@ -2222,7 +2222,7 @@ def test_unscheduled_insn_detection(): "...") prog = lp.preprocess_kernel(prog) - knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.program_callables_info) 
+ knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.callables_table) prog = prog.with_root_kernel(knl) insn1, = lp.find_instructions(prog, "id:insn1") insns = prog.root_kernel.instructions[:] @@ -2392,7 +2392,7 @@ def test_barrier_insertion_near_top_of_loop(): prog = lp.set_temporary_scope(prog, "a", "local") prog = lp.set_temporary_scope(prog, "b", "local") prog = lp.preprocess_kernel(prog) - knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.program_callables_info) + knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.callables_table) print(knl) @@ -2420,7 +2420,7 @@ def test_barrier_insertion_near_bottom_of_loop(): prog = lp.set_temporary_scope(prog, "a", "local") prog = lp.set_temporary_scope(prog, "b", "local") prog = lp.preprocess_kernel(prog) - knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.program_callables_info) + knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.callables_table) print(knl) @@ -2479,7 +2479,7 @@ def test_multi_argument_reduction_type_inference(): allow_simultaneous=True) t_inf_mapper = TypeInferenceMapper(prog.root_kernel, - prog.program_callables_info) + prog.callables_table) assert ( t_inf_mapper(expr, return_tuple=True, return_dtype_set=True) @@ -2836,7 +2836,7 @@ def test_no_barriers_for_nonoverlapping_access(second_index, expect_barrier): prog = lp.preprocess_kernel(prog) knl = lp.get_one_scheduled_kernel(prog.root_kernel, - prog.program_callables_info) + prog.callables_table) assert barrier_between(knl, "first", "second") == expect_barrier diff --git a/test/testlib.py b/test/testlib.py index eebc792d..853e2584 100644 --- a/test/testlib.py +++ b/test/testlib.py @@ -9,9 +9,9 @@ class GridOverride(object): self.clean = clean self.vecsize = vecsize - def __call__(self, insn_ids, program_callables_info, ignore_auto=True): + def __call__(self, insn_ids, callables_table, ignore_auto=True): gsize, _ = self.clean.get_grid_sizes_for_insn_ids(insn_ids, - program_callables_info, ignore_auto) + callables_table, 
ignore_auto) return gsize, (self.vecsize,) # }}} @@ -139,14 +139,14 @@ class SeparateTemporariesPreambleTestPreambleGenerator( class Log2Callable(lp.ScalarCallable): - def with_types(self, arg_id_to_dtype, kernel, program_callables_info): + def with_types(self, arg_id_to_dtype, kernel, callables_table): if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: # the types provided aren't mature enough to specialize the # callable return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) dtype = arg_id_to_dtype[0].numpy_dtype @@ -168,7 +168,7 @@ class Log2Callable(lp.ScalarCallable): self.copy(name_in_target=name_in_target, arg_id_to_dtype={0: NumpyType(dtype), -1: NumpyType(dtype)}), - program_callables_info) + callables_table) def register_log2_lookup(target, identifier): -- GitLab From 17bba4838c931a59b539a4bcb5cd9fa09925cad7 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 15 Oct 2018 14:59:36 -0500 Subject: [PATCH 62/80] minor changes after review --- loopy/kernel/__init__.py | 11 ++--------- loopy/kernel/function_interface.py | 11 ++++++----- loopy/library/reduction.py | 12 ++++++------ loopy/program.py | 9 ++++----- loopy/tools.py | 11 +++++++++++ 5 files changed, 29 insertions(+), 25 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 70079d31..9f14dafc 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -38,7 +38,7 @@ import re from pytools import UniqueNameGenerator, generate_unique_names from loopy.diagnostic import CannotBranchDomainTree, LoopyError -from loopy.tools import natsorted +from loopy.tools import natsorted, update_persistent_hash from loopy.diagnostic import StaticValueFindingError from loopy.kernel.data import filter_iname_tags_by_type from warnings import warn @@ -1476,14 +1476,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): "symbol_manglers", ) - def update_persistent_hash(self, key_hash, key_builder): - """Custom hash computation 
function for use with - :class:`pytools.persistent_dict.PersistentDict`. - - Only works in conjunction with :class:`loopy.tools.KeyBuilder`. - """ - for field_name in self.hash_fields: - key_builder.rec(key_hash, getattr(self, field_name)) + update_persistent_hash = update_persistent_hash def __hash__(self): from loopy.tools import LoopyKeyBuilder diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 362fbcef..636d152d 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -28,7 +28,7 @@ from six.moves import zip from pytools import ImmutableRecord from loopy.diagnostic import LoopyError -from loopy.kernel import LoopKernel +from loopy.tools import update_persistent_hash __doc__ = """ @@ -49,7 +49,7 @@ __doc__ = """ class ValueArgDescriptor(ImmutableRecord): hash_fields = () - update_persistent_hash = LoopKernel.update_persistent_hash + update_persistent_hash = update_persistent_hash class ArrayArgDescriptor(ImmutableRecord): @@ -99,7 +99,7 @@ class ArrayArgDescriptor(ImmutableRecord): "address_space", "dim_tags") - update_persistent_hash = LoopKernel.update_persistent_hash + update_persistent_hash =update_persistent_hash # }}} @@ -171,7 +171,8 @@ class InKernelCallable(ImmutableRecord): .. attribute:: name - The name of the callable which can be encountered within a kernel. + The name of the callable which can be encountered within expressions in + a kernel. .. 
attribute:: arg_id_to_dtype @@ -212,7 +213,7 @@ class InKernelCallable(ImmutableRecord): def __getinitargs__(self): return (self.arg_id_to_dtype, self.arg_id_to_descr) - update_persistent_hash = LoopKernel.update_persistent_hash + update_persistent_hash = update_persistent_hash def with_types(self, arg_id_to_dtype, caller_kernel, callables_table): """ diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 7c32d0be..dd0e1e3e 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -31,7 +31,7 @@ import numpy as np from loopy.symbolic import FunctionIdentifier from loopy.diagnostic import LoopyError from loopy.types import NumpyType -from loopy.kernel import LoopKernel +from loopy.tools import update_persistent_hash class ReductionOperation(object): @@ -227,7 +227,7 @@ class ReductionOpFunction(FunctionIdentifier): hash_fields = ( "reduction_op",) - update_persistent_hash = LoopKernel.update_persistent_hash + update_persistent_hash = update_persistent_hash # }}} @@ -285,7 +285,7 @@ class SegmentedSumReductionOperation(_SegmentedScalarReductionOperation): "which", "op",) - update_persistent_hash = LoopKernel.update_persistent_hash + update_persistent_hash = update_persistent_hash class SegmentedProductReductionOperation(_SegmentedScalarReductionOperation): @@ -298,7 +298,7 @@ class SegmentedProductReductionOperation(_SegmentedScalarReductionOperation): "op", "base_reduction_class",) - update_persistent_hash = LoopKernel.update_persistent_hash + update_persistent_hash = update_persistent_hash # }}} @@ -354,7 +354,7 @@ class ArgMaxReductionOperation(_ArgExtremumReductionOperation): "update_comparison", "neutral_sign",) - update_persistent_hash = LoopKernel.update_persistent_hash + update_persistent_hash = update_persistent_hash class ArgMinReductionOperation(_ArgExtremumReductionOperation): @@ -366,7 +366,7 @@ class ArgMinReductionOperation(_ArgExtremumReductionOperation): "update_comparison", "neutral_sign",) - 
update_persistent_hash = LoopKernel.update_persistent_hash + update_persistent_hash = update_persistent_hash # }}} diff --git a/loopy/program.py b/loopy/program.py index f7c399c1..aee2378f 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -39,6 +39,7 @@ from loopy.diagnostic import LoopyError from loopy.library.reduction import ReductionOpFunction from loopy.kernel import LoopKernel +from loopy.tools import update_persistent_hash from collections import Counter from pymbolic.primitives import Call, CallWithKwargs @@ -253,7 +254,7 @@ class Program(ImmutableRecord): "callables_table", "target",) - update_persistent_hash = LoopKernel.update_persistent_hash + update_persistent_hash = update_persistent_hash def copy(self, **kwargs): if 'target' in kwargs: @@ -611,7 +612,7 @@ class CallablesTable(ImmutableRecord): self.is_being_edited )) - update_persistent_hash = LoopKernel.update_persistent_hash + update_persistent_hash = update_persistent_hash @property @memoize_method @@ -620,8 +621,6 @@ class CallablesTable(ImmutableRecord): Returns an instance of :class:`collection.Counter` representing the number of times the callables is called in callables_table. """ - # should raise an error if there are more than one root kernels(which is - # illegal) root_kernel_name, = [in_knl_callable.subkernel.name for in_knl_callable in self.values() if isinstance(in_knl_callable, CallableKernel) and @@ -737,7 +736,7 @@ class CallablesTable(ImmutableRecord): def with_edit_callables_mode(self): """ - Initiates *self* for a walk traversal through all the callables. + Returns a copy of *self* for a walk traversal through all the callables. 
""" return self.copy( is_being_edited=True) diff --git a/loopy/tools.py b/loopy/tools.py index 5eabe6c3..52fc7d3c 100644 --- a/loopy/tools.py +++ b/loopy/tools.py @@ -43,6 +43,17 @@ else: return isinstance(obj, (int, np.integer)) +def update_persistent_hash(obj, key_hash, key_builder): + """ + Custom hash computation function for use with + :class:`pytools.persistent_dict.PersistentDict`. + + Only works in conjunction with :class:`loopy.tools.KeyBuilder`. + """ + for field_name in obj.hash_fields: + key_builder.rec(key_hash, getattr(obj, field_name)) + + # {{{ custom KeyBuilder subclass class PersistentHashWalkMapper(LoopyWalkMapper, PersistentHashWalkMapperBase): -- GitLab From dc458ada6a51a10c6283f1b90087fd722f13d00f Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 14 Nov 2018 17:41:51 -0600 Subject: [PATCH 63/80] renaming: make_program_from_kernel -> make_program --- loopy/__init__.py | 4 ++-- loopy/codegen/__init__.py | 4 ++-- loopy/kernel/__init__.py | 4 ++-- loopy/kernel/creation.py | 12 ++++++------ loopy/program.py | 4 ++-- test/test_diff.py | 2 +- 6 files changed, 15 insertions(+), 15 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index 8ebd4d0e..9faa28bc 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -51,7 +51,7 @@ from loopy.kernel.data import ( from loopy.kernel.function_interface import ( CallableKernel, ScalarCallable) from loopy.program import ( - Program, make_program_from_kernel) + Program, make_program) from loopy.kernel import LoopKernel, KernelState, kernel_state from loopy.kernel.tools import ( @@ -175,7 +175,7 @@ __all__ = [ "ScalarCallable", "CallableKernel", - "Program", "make_program_from_kernel", + "Program", "make_program", "KernelArgument", "ValueArg", "ArrayArg", "GlobalArg", "ConstantArg", "ImageArg", diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 250e7215..55161ebb 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -541,10 +541,10 @@ def 
generate_code_v2(program): :param program: An instance of :class:`loopy.Program`. """ from loopy.kernel import LoopKernel - from loopy.program import make_program_from_kernel + from loopy.program import make_program if isinstance(program, LoopKernel): - program = make_program_from_kernel(program) + program = make_program(program) from loopy.kernel import KernelState if program.root_kernel.state == KernelState.INITIAL: diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 9f14dafc..dd7acf25 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1371,8 +1371,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): def __call__(self, *args, **kwargs): warn("Calling a LoopKernel is deprecated, call a Program " "instead.", DeprecationWarning, stacklevel=2) - from loopy.program import make_program_from_kernel - program = make_program_from_kernel(self) + from loopy.program import make_program + program = make_program(self) return program(*args, **kwargs) # }}} diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 685232c6..b794cfb8 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1954,7 +1954,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): target = kwargs.pop("target", None) seq_dependencies = kwargs.pop("seq_dependencies", False) fixed_parameters = kwargs.pop("fixed_parameters", {}) - make_program = kwargs.pop("make_program", True) + is_callee_kernel = kwargs.pop("is_callee_kernel", False) if defines: from warnings import warn @@ -2174,15 +2174,15 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): creation_plog.done() - if make_program: - from loopy.program import make_program_from_kernel - return make_program_from_kernel(knl) - else: + if is_callee_kernel: return knl + else: + from loopy.program import make_program + return make_program(knl) def make_kernel_function(*args, **kwargs): - kwargs['make_program'] = False + kwargs['is_callee_kernel'] = 
False return make_kernel(*args, **kwargs) # }}} diff --git a/loopy/program.py b/loopy/program.py index aee2378f..c8534f05 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -50,7 +50,7 @@ __doc__ = """ .. autoclass:: Program .. autoclass:: CallablesTable -.. autofunction:: make_program_from_kernel +.. autofunction:: make_program .. autofunction:: iterate_over_kernels_if_given_program """ @@ -921,7 +921,7 @@ class CallablesTable(ImmutableRecord): # {{{ helper functions -def make_program_from_kernel(kernel): +def make_program(kernel): """ Returns an instance of :class:`loopy.Program` with the *kernel* as the root kernel. diff --git a/test/test_diff.py b/test/test_diff.py index a7fd9298..49efc261 100644 --- a/test/test_diff.py +++ b/test/test_diff.py @@ -66,7 +66,7 @@ def test_diff(ctx_factory): from loopy.transform.diff import diff_kernel dknl, diff_map = diff_kernel(knl, "z", "x") - dknl = lp.make_program_from_kernel(dknl) + dknl = lp.make_program(dknl) dknl = lp.remove_unused_arguments(dknl) dknl = lp.add_inames_to_insn(dknl, "diff_i0", "writes:a_dx or writes:a") -- GitLab From eca2a3ed2dc9bcae43362dcbf7cf1f1ea3419a1f Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 14 Nov 2018 21:47:43 -0600 Subject: [PATCH 64/80] some changes after review --- loopy/__init__.py | 4 ++-- loopy/kernel/creation.py | 2 +- loopy/kernel/function_interface.py | 16 ++++++++++------ test/test_diff.py | 2 +- 4 files changed, 14 insertions(+), 10 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index 9faa28bc..c2ffe5bf 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -64,7 +64,7 @@ from loopy.kernel.tools import ( get_subkernels, get_subkernel_to_insn_id_map) from loopy.types import to_loopy_type -from loopy.kernel.creation import make_kernel, UniqueName, make_kernel_function +from loopy.kernel.creation import make_kernel, UniqueName, make_function from loopy.library.reduction import register_reduction_parser # {{{ import transforms @@ -184,7 +184,7 @@ 
__all__ = [ "SubstitutionRule", "CallMangleInfo", - "make_kernel", "UniqueName", "make_kernel_function", + "make_kernel", "UniqueName", "make_function", "register_reduction_parser", diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index b794cfb8..823fb1b3 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -2181,7 +2181,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): return make_program(knl) -def make_kernel_function(*args, **kwargs): +def make_function(*args, **kwargs): kwargs['is_callee_kernel'] = False return make_kernel(*args, **kwargs) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 636d152d..17057691 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -29,6 +29,7 @@ from pytools import ImmutableRecord from loopy.diagnostic import LoopyError from loopy.tools import update_persistent_hash +from loopy.kernel import LoopKernel __doc__ = """ @@ -99,7 +100,7 @@ class ArrayArgDescriptor(ImmutableRecord): "address_space", "dim_tags") - update_persistent_hash =update_persistent_hash + update_persistent_hash = update_persistent_hash # }}} @@ -176,18 +177,21 @@ class InKernelCallable(ImmutableRecord): .. attribute:: arg_id_to_dtype - A mapping which indicates the arguments types and result types it would - be handling. This would be set once the callable is type specialized. + A mapping which indicates the arguments types and result types of the + callable. .. attribute:: arg_id_to_descr A mapping which gives indicates the argument shape and ``dim_tags`` it - would be responsible for generating code. These parameters would be set, - once it is shape and stride(``dim_tags``) specialized. + would be responsible for generating code. .. 
note:: + - "``arg_id`` can either be an instance of :class:`int` integer + corresponding to the position of the argument or an instance of + :class:`str` corresponding to the name of keyword argument accepted + by the function. - Negative "id" values ``-i`` in the mapping attributes indicate + - Negative "arg_id" values ``-i`` in the mapping attributes indicate return value with (0-based) index *i*. .. automethod:: __init__ diff --git a/test/test_diff.py b/test/test_diff.py index 49efc261..d001233c 100644 --- a/test/test_diff.py +++ b/test/test_diff.py @@ -55,7 +55,7 @@ def test_diff(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - knl = lp.make_kernel_function( + knl = lp.make_function( """{ [i,j]: 0<=i,j a = 1/(1+sinh(x[i] + y[j])**2) -- GitLab From 8b04d088d54806652d3ffaf19364cac1e4aaba2c Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 15 Nov 2018 00:22:11 -0600 Subject: [PATCH 65/80] small fix to make the tests runnable again --- loopy/auto_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/auto_test.py b/loopy/auto_test.py index bee1b72f..7e23ef06 100644 --- a/loopy/auto_test.py +++ b/loopy/auto_test.py @@ -118,7 +118,7 @@ def make_ref_args(program, impl_arg_info, queue, parameters): shape = evaluate_shape(arg.unvec_shape, parameters) dtype = kernel_arg.dtype - is_output = arg.base_name in kernel_arg.is_output_only + is_output = kernel_arg.is_output_only if arg.arg_class is ImageArg: storage_array = ary = cl_array.empty( -- GitLab From 408bb384ec47af2cd464e303458f9017fdf40494 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 15 Nov 2018 09:21:32 -0600 Subject: [PATCH 66/80] asserts that callees do not generate host program --- loopy/codegen/__init__.py | 2 ++ loopy/codegen/control.py | 23 ++++++++++--------- loopy/codegen/result.py | 47 ++++++++++++++++++++++----------------- 3 files changed, 41 insertions(+), 31 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py 
index 55161ebb..3fd94aa2 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -561,6 +561,8 @@ def generate_code_v2(program): codegen_results[func_id] = ( generate_code_for_a_single_kernel(in_knl_callable.subkernel, program.callables_table)) + if not in_knl_callable.subkernel.is_called_from_host: + assert codegen_results[func_id].host_program is None device_preambles = set() for cgr in codegen_results.values(): diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 81a672a1..5dfd9cb4 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -117,16 +117,19 @@ def generate_code_for_sched_index(codegen_state, sched_index): glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs( get_insn_ids_for_block_at(kernel.schedule, sched_index), codegen_state.callables_table) - - return merge_codegen_results(codegen_state, [ - codegen_result, - - codegen_state.ast_builder.get_kernel_call( - codegen_state, - sched_item.kernel_name, - glob_grid, loc_grid, - extra_args), - ]) + if kernel.is_called_from_host: + return merge_codegen_results(codegen_state, [ + codegen_result, + + codegen_state.ast_builder.get_kernel_call( + codegen_state, + sched_item.kernel_name, + glob_grid, loc_grid, + extra_args), + ]) + else: + # do not generate host code for callee kernels + return codegen_result elif isinstance(sched_item, EnterLoop): tags = kernel.iname_tags(sched_item.iname) diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index 00f19d99..7950c56b 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -292,27 +292,32 @@ def generate_host_or_device_program(codegen_state, schedule_index): else: codegen_result = build_loop_nest(codegen_state, schedule_index) - codegen_result = merge_codegen_results( - codegen_state, - ast_builder.generate_top_of_body(codegen_state) - + temp_decls - + [codegen_result], - collapse=False) - - cur_prog = codegen_result.current_program(codegen_state) - body_ast = cur_prog.ast - 
fdecl_ast = ast_builder.get_function_declaration( - codegen_state, codegen_result, schedule_index) - - fdef_ast = ast_builder.get_function_definition( - codegen_state, codegen_result, - schedule_index, fdecl_ast, body_ast) - - codegen_result = codegen_result.with_new_program( - codegen_state, - cur_prog.copy( - ast=ast_builder.process_ast(fdef_ast), - body_ast=ast_builder.process_ast(body_ast))) + if (codegen_state.is_generating_device_code) or ( + codegen_state.kernel.is_called_from_host): + codegen_result = merge_codegen_results( + codegen_state, + ast_builder.generate_top_of_body(codegen_state) + + temp_decls + + [codegen_result], + collapse=False) + + cur_prog = codegen_result.current_program(codegen_state) + body_ast = cur_prog.ast + fdecl_ast = ast_builder.get_function_declaration( + codegen_state, codegen_result, schedule_index) + + fdef_ast = ast_builder.get_function_definition( + codegen_state, codegen_result, + schedule_index, fdecl_ast, body_ast) + + codegen_result = codegen_result.with_new_program( + codegen_state, + cur_prog.copy( + ast=ast_builder.process_ast(fdef_ast), + body_ast=ast_builder.process_ast(body_ast))) + else: + codegen_result = codegen_result.copy( + host_program=None) return codegen_result -- GitLab From 3f0d8b5461723c4b365a8ecc03784f8dcaf7c223 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 15 Nov 2018 09:52:28 -0600 Subject: [PATCH 67/80] store the fdecls in AST format --- loopy/codegen/__init__.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 3fd94aa2..00397906 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -568,20 +568,25 @@ def generate_code_v2(program): for cgr in codegen_results.values(): device_preambles.update(cgr.device_preambles) + # collecting the function declarations of callee kernels for in_knl_callable in program.callables_table.values(): for preamble in 
in_knl_callable.generate_preambles(program.target): device_preambles.update([preamble]) collective_device_program = codegen_results[program.name].device_programs[0] + callee_fdecls = [] + for func_id, callee_cgr in codegen_results.items(): if func_id != program.name: assert len(callee_cgr.device_programs) == 1 callee_prog_ast = callee_cgr.device_programs[0].ast collective_device_program = collective_device_program.copy( ast=Collection([callee_prog_ast, collective_device_program.ast])) + callee_fdecls.append(callee_prog_ast.fdecl) - device_preambles.update([('98_%s' % func_id, - str(callee_prog_ast.fdecl)), ]) + for callee_fdecl in callee_fdecls: + collective_device_program = collective_device_program.copy( + ast=Collection([callee_fdecl, collective_device_program.ast])) collective_device_programs = [collective_device_program] + ( codegen_results[program.name].device_programs[1:]) -- GitLab From d191d34ff87d44e7ad72f8f3b2f2324a28a399fe Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 15 Nov 2018 09:53:52 -0600 Subject: [PATCH 68/80] removes assymetry between host and device preambles --- loopy/codegen/result.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index 7950c56b..268a70b2 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -133,7 +133,7 @@ class CodeGenerationResult(ImmutableRecord): preamble_codes = process_preambles( getattr(self, "host_preambles", []) + - list(getattr(self, "device_preambles", [])) + getattr(self, "device_preambles", []) ) return ( -- GitLab From b2903df6c6227960e720ea35cff174df877d4dd7 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 20 Nov 2018 11:46:56 -0600 Subject: [PATCH 69/80] small typo, to re-enable making callee kernels --- loopy/kernel/creation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 823fb1b3..c7991873 100644 --- 
a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -2182,7 +2182,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): def make_function(*args, **kwargs): - kwargs['is_callee_kernel'] = False + kwargs['is_callee_kernel'] = True return make_kernel(*args, **kwargs) # }}} -- GitLab From 95ee6fed7549c36dd421b8eb9fcd768d53a139a5 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 20 Nov 2018 12:19:34 -0600 Subject: [PATCH 70/80] made device preambles list back again --- loopy/codegen/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 00397906..d8a7effc 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -564,14 +564,14 @@ def generate_code_v2(program): if not in_knl_callable.subkernel.is_called_from_host: assert codegen_results[func_id].host_program is None - device_preambles = set() + device_preambles = [] for cgr in codegen_results.values(): - device_preambles.update(cgr.device_preambles) + device_preambles.extend(cgr.device_preambles) # collecting the function declarations of callee kernels for in_knl_callable in program.callables_table.values(): for preamble in in_knl_callable.generate_preambles(program.target): - device_preambles.update([preamble]) + device_preambles.append(preamble) collective_device_program = codegen_results[program.name].device_programs[0] callee_fdecls = [] -- GitLab From c12c610978b2b1ecab1a6b619f64315b241bfa0e Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 20 Nov 2018 12:45:04 -0600 Subject: [PATCH 71/80] Merge 'master' into 'new_function_interface' --- .gitlab-ci.yml | 19 ++++++++++- LICENSE | 21 ++++++++++++ .../make-linux-build-docker-inner-part-2.sh | 4 +++ loopy/frontend/fortran/tree.py | 2 +- loopy/kernel/tools.py | 4 +-- loopy/schedule/__init__.py | 10 ++++-- loopy/statistics.py | 20 ++++++++---- loopy/symbolic.py | 2 +- loopy/target/cuda.py | 2 +- loopy/target/pyopencl.py 
| 3 +- requirements.txt | 5 +-- setup.cfg | 2 +- test/test_loopy.py | 19 +++++++++++ test/test_numa_diff.py | 2 +- test/test_reduction.py | 32 +++++++++++-------- test/test_statistics.py | 14 +++++--- test/test_target.py | 17 ++++++++++ 17 files changed, 137 insertions(+), 41 deletions(-) create mode 100644 LICENSE diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 1caef802..ea69114d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -12,6 +12,10 @@ Python 2.7 POCL: - pocl except: - tags + artifacts: + reports: + junit: test/pytest.xml + Python 2.7 with legacy PyOpenCL: script: @@ -29,6 +33,10 @@ Python 2.7 with legacy PyOpenCL: except: - tags retry: 2 + artifacts: + reports: + junit: test/pytest.xml + Python 3.6 POCL: script: @@ -43,6 +51,10 @@ Python 3.6 POCL: - pocl except: - tags + artifacts: + reports: + junit: test/pytest.xml + Python 3.6 POCL Twice With Cache: script: @@ -59,6 +71,10 @@ Python 3.6 POCL Twice With Cache: - pocl except: - tags + artifacts: + reports: + junit: test/pytest.xml + # PyPy POCL: # script: @@ -77,7 +93,7 @@ Python 3.6 POCL Examples: script: - export PY_EXE=python3.6 - export PYOPENCL_TEST=portable - - export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib jupyter nbconvert" + - export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib ipykernel nbconvert" - ". 
./build-py-project-and-run-examples.sh" tags: - python3.6 @@ -87,6 +103,7 @@ Python 3.6 POCL Examples: except: - tags + CentOS binary: script: - (cd build-helpers; ./make-linux-build-docker.sh --nodate) diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..601df74b --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2018 Andreas Klöckner and contributors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/build-helpers/make-linux-build-docker-inner-part-2.sh b/build-helpers/make-linux-build-docker-inner-part-2.sh index 1e35a1e1..035634b1 100755 --- a/build-helpers/make-linux-build-docker-inner-part-2.sh +++ b/build-helpers/make-linux-build-docker-inner-part-2.sh @@ -23,6 +23,10 @@ git clone --recursive git://github.com/inducer/loopy cd loopy grep -v pyopencl requirements.txt > myreq.txt + +# needed for pyinstaller package to be usable +echo packaging >> myreq.txt + pip install -r myreq.txt python setup.py install diff --git a/loopy/frontend/fortran/tree.py b/loopy/frontend/fortran/tree.py index b1df6e3d..6939bb6a 100644 --- a/loopy/frontend/fortran/tree.py +++ b/loopy/frontend/fortran/tree.py @@ -53,7 +53,7 @@ class FTreeWalkerBase(object): ENTITY_RE = re.compile( r"^(?P[_0-9a-zA-Z]+)" - "(\((?P[-+*0-9:a-zA-Z, \t]+)\))?$") + r"(\((?P[-+*0-9:a-zA-Z, \t]+)\))?$") def parse_dimension_specs(self, node, dim_decls): def parse_bounds(bounds_str): diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 006ac6ba..3aaa8d56 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -1253,7 +1253,7 @@ def draw_dependencies_as_unicode_arrows( for dep in insn.depends_on: reverse_deps.setdefault(dep, set()).add(insn.id) - # mapping of (from_id, to_id) tuples to column_index + # mapping of to_id tuples to column_index dep_to_column = {} # {{{ find column assignments @@ -1330,7 +1330,7 @@ def draw_dependencies_as_unicode_arrows( elif insn.id in starts: starts.remove(insn.id) - if starts: + if starts or pointed_at_insn_id not in processed_ids: # will continue downward row[col] = do_flag_downward(u"├", pointed_at_insn_id) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 2b3f7a3b..3dc1c0bb 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -794,9 +794,13 @@ def generate_loop_schedules_internal( if not is_ready: if debug_mode: - print("instruction '%s' is missing insn depedencies '%s'" % ( - 
format_insn(kernel, insn.id), ",".join( - insn.depends_on - sched_state.scheduled_insn_ids))) + # These are not that interesting when understanding scheduler + # failures. + + # print("instruction '%s' is missing insn depedencies '%s'" % ( + # format_insn(kernel, insn.id), ",".join( + # insn.depends_on - sched_state.scheduled_insn_ids))) + pass continue want = kernel.insn_inames(insn) - sched_state.parallel_inames diff --git a/loopy/statistics.py b/loopy/statistics.py index d65387d1..454cca18 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -707,9 +707,10 @@ class CounterBase(CombineMapper): # {{{ ExpressionOpCounter class ExpressionOpCounter(CounterBase): - def __init__(self, knl, callables_table): + def __init__(self, knl, callables_table, count_within_subscripts=True): self.knl = knl self.callables_table = callables_table + self.count_within_subscripts = count_within_subscripts from loopy.type_inference import TypeInferenceMapper self.type_inf = TypeInferenceMapper(knl, callables_table) @@ -737,7 +738,10 @@ class ExpressionOpCounter(CounterBase): ) + self.rec(expr.parameters) def map_subscript(self, expr): - return self.rec(expr.index) + if self.count_within_subscripts: + return self.rec(expr.index) + else: + return ToCountMap() def map_sum(self, expr): assert expr.children @@ -1343,10 +1347,9 @@ def _get_insn_count(knl, callables_table, insn_id, subgroup_size, # {{{ get_op_map - def get_op_map_for_single_kernel(knl, callables_table, numpy_types=True, count_redundant_work=False, - subgroup_size=None): + count_within_subscripts=True, subgroup_size=None): if not knl.options.ignore_boostable_into: raise LoopyError("Kernel '%s': Using operation counting requires the option " @@ -1394,7 +1397,7 @@ def get_op_map_for_single_kernel(knl, callables_table, def get_op_map(program, numpy_types=True, count_redundant_work=False, - subgroup_size=None): + count_within_subscripts=True, subgroup_size=None): """Count the number of operations in a loopy kernel. 
@@ -1410,6 +1413,9 @@ def get_op_map(program, numpy_types=True, count_redundant_work=False, (Likely desirable for performance modeling, but undesirable for code optimization.) + :arg count_within_subscripts: A :class:`bool` specifying whether to + count operations inside array indices. + :arg subgroup_size: (currently unused) An :class:`int`, :class:`str` ``'guess'``, or *None* that specifies the sub-group size. An OpenCL sub-group is an implementation-dependent grouping of work-items within @@ -1464,8 +1470,8 @@ def get_op_map(program, numpy_types=True, count_redundant_work=False, if isinstance(in_knl_callable, CallableKernel): knl = in_knl_callable.subkernel knl_op_map = get_op_map_for_single_kernel(knl, - program.callables_table, numpy_types, - count_redundant_work, subgroup_size) + program.callables_table, numpy_types, count_redundant_work, + count_within_subscripts, subgroup_size) for i in range(callables_count[func_id]): op_map += knl_op_map diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 92b209ac..04cf2d02 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -1696,7 +1696,7 @@ def get_access_range(domain, subscript, assumptions, shape=None, if shape is not None: try: shape_aff = guarded_aff_from_expr(access_map.space, shape[idim]) - except ExpressionToAffineConversionError as sub_err: + except ExpressionToAffineConversionError: pass if shape_aff is None: diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 32b810eb..6b4385bf 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -344,7 +344,7 @@ class CUDACASTBuilder(CASTBuilder): _VEC_AXES = "xyzw" def add_vector_access(self, access_expr, index): - return access_expr.a(self._VEC_AXES[index]) + return access_expr.attr(self._VEC_AXES[index]) def emit_barrier(self, synchronization_kind, mem_kind, comment): """ diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index d98b6cdd..5ef56457 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -125,7 
+125,8 @@ def adjust_local_temp_var_storage(kernel, device): new_storage_shape = storage_shape - new_temp_vars[temp_var.name] = temp_var.copy(storage_shape=new_storage_shape) + new_temp_vars[temp_var.name] = temp_var.copy( + storage_shape=tuple(new_storage_shape)) return kernel.copy(temporary_variables=new_temp_vars) diff --git a/requirements.txt b/requirements.txt index a3e88cfe..97c20247 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,4 @@ git+https://github.com/inducer/codepy.git git+https://github.com/inducer/f2py # Optional, needed for using the C preprocessor on Fortran -ply>=3.6 - -# This is needed for the pyinstaller executable to be usable. -packaging +ply>=3.6 \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index b939ce0c..eec3dfd1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [flake8] -ignore = E126,E127,E128,E123,E226,E241,E242,E265,N802,W503,E402,N814 +ignore = E126,E127,E128,E123,E226,E241,E242,E265,N802,W503,E402,N814,W504 max-line-length=85 exclude= loopy/target/c/compyte/ndarray, diff --git a/test/test_loopy.py b/test/test_loopy.py index fa32ca04..b770497f 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2890,6 +2890,25 @@ def test_dep_cycle_printing_and_error(): print(lp.generate_code(knl).device_code()) +def test_backwards_dep_printing_and_error(): + knl = lp.make_kernel( + "{[i]: 0<=i 1: exec(sys.argv[1]) diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py index 62f490ce..1ba44e77 100644 --- a/test/test_numa_diff.py +++ b/test/test_numa_diff.py @@ -47,8 +47,8 @@ __all__ = [ from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa -@pytest.mark.parametrize("Nq", [7]) @pytest.mark.parametrize("ilp_multiple", [1, 2]) +@pytest.mark.parametrize("Nq", [7]) @pytest.mark.parametrize("opt_level", [11]) def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # noqa ctx = ctx_factory() diff --git a/test/test_reduction.py b/test/test_reduction.py index 96dab405..aaf11ee2 
100644 --- a/test/test_reduction.py +++ b/test/test_reduction.py @@ -219,32 +219,38 @@ def test_local_parallel_reduction(ctx_factory, size): def test_global_parallel_reduction(ctx_factory, size): ctx = ctx_factory() - prog = lp.make_kernel( + knl = lp.make_kernel( "{[i]: 0 <= i < n }", """ # Using z[0] instead of z works around a bug in ancient PyOpenCL. - z[0] = sum(i, i/13) + z[0] = sum(i, a[i]) """) - ref_prog = prog + knl = lp.add_and_infer_dtypes(knl, {"a": np.float32}) + ref_knl = knl gsize = 128 - prog = lp.split_iname(prog, "i", gsize * 20) - prog = lp.split_iname(prog, "i_inner", gsize, outer_tag="l.0") - prog = lp.split_reduction_inward(prog, "i_inner_inner") - prog = lp.split_reduction_inward(prog, "i_inner_outer") + knl = lp.split_iname(knl, "i", gsize * 20) + knl = lp.split_iname(knl, "i_inner", gsize, inner_tag="l.0") + knl = lp.split_reduction_outward(knl, "i_outer") + knl = lp.split_reduction_inward(knl, "i_inner_outer") from loopy.transform.data import reduction_arg_to_subst_rule - prog = reduction_arg_to_subst_rule(prog, "i_outer") - prog = lp.precompute(prog, "red_i_outer_arg", "i_outer", + knl = reduction_arg_to_subst_rule(knl, "i_outer") + + knl = lp.precompute(knl, "red_i_outer_arg", "i_outer", temporary_scope=lp.temp_var_scope.GLOBAL, default_tag="l.auto") - prog = lp.realize_reduction(prog) - prog = lp.add_dependency( - prog, "writes:acc_i_outer", + knl = lp.realize_reduction(knl) + knl = lp.tag_inames(knl, "i_outer_0:g.0") + + # Keep the i_outer accumulator on the correct (lower) side of the barrier, + # otherwise there will be useless save/reload code generated. 
+ knl = lp.add_dependency( + knl, "writes:acc_i_outer", "id:red_i_outer_arg_barrier") lp.auto_test_vs_ref( - ref_prog, ctx, prog, parameters={"n": size}, + ref_knl, ctx, knl, parameters={"n": size}, print_ref_code=True) diff --git a/test/test_statistics.py b/test/test_statistics.py index 3f236652..41b44b5a 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -57,7 +57,8 @@ def test_op_counter_basic(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True) + op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True, + count_within_subscripts=True) n_workgroups = 1 group_size = 1 subgroups_per_group = div_ceil(group_size, SGS) @@ -161,7 +162,8 @@ def test_op_counter_specialops(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True) + op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True, + count_within_subscripts=True) n_workgroups = 1 group_size = 1 subgroups_per_group = div_ceil(group_size, SGS) @@ -206,7 +208,8 @@ def test_op_counter_bitwise(): a=np.int32, b=np.int32, g=np.int64, h=np.int64)) - op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True) + op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True, + count_within_subscripts=False) n_workgroups = 1 group_size = 1 subgroups_per_group = div_ceil(group_size, SGS) @@ -226,7 +229,7 @@ def test_op_counter_bitwise(): i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift', CG.SUBGROUP) ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups - assert i32add == n*m+n*m*ell*n_subgroups + assert i32add == n*m*ell*n_subgroups assert i32bw == 2*n*m*ell*n_subgroups assert i64bw == 2*n*m*n_subgroups assert i64add == i64mul == n*m*n_subgroups @@ -1153,7 +1156,8 @@ def test_summations_and_filters(): 
assert f32lall == (3*n*m*ell)*n_subgroups assert f64lall == (2*n*m)*n_subgroups - op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True) + op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True, + count_within_subscripts=True) #for k, v in op_map.items(): # print(type(k), "\n", k.name, k.dtype, type(k.dtype), " :\n", v) diff --git a/test/test_target.py b/test/test_target.py index a5186c71..095bf093 100644 --- a/test/test_target.py +++ b/test/test_target.py @@ -350,6 +350,23 @@ def test_ispc_streaming_stores(): lp.generate_code_v2(knl).all_code() +def test_cuda_short_vector(): + knl = lp.make_kernel( + "{ [i]: 0<=i 1: exec(sys.argv[1]) -- GitLab From cb151a4bdae8a1a9643ce6a6c93da80e5b5e56de Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 20 Nov 2018 13:23:59 -0600 Subject: [PATCH 72/80] another one of ArrayBase erros --- loopy/kernel/array.py | 1 + 1 file changed, 1 insertion(+) diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py index 6bf733a8..0ed1f940 100644 --- a/loopy/kernel/array.py +++ b/loopy/kernel/array.py @@ -834,6 +834,7 @@ class ArrayBase(ImmutableRecord): order=order, alignment=alignment, for_atomic=for_atomic, + target=target, **kwargs) def __eq__(self, other): -- GitLab From a385bd0632e26896a55978e4064a145fbf24a93b Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 21 Nov 2018 05:27:09 -0600 Subject: [PATCH 73/80] import changes from statistics to count within subscripts --- loopy/statistics.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 454cca18..88aa49bb 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1358,7 +1358,8 @@ def get_op_map_for_single_kernel(knl, callables_table, subgroup_size = _process_subgroup_size(knl, subgroup_size) op_map = ToCountMap() - op_counter = ExpressionOpCounter(knl, callables_table) + op_counter = ExpressionOpCounter(knl, callables_table, + count_within_subscripts) from 
loopy.kernel.instruction import ( CallInstruction, CInstruction, Assignment, -- GitLab From dc0f57d8bb1fee4ed9fd4a7f6ccb39dc9a81d502 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 21 Nov 2018 09:06:27 -0600 Subject: [PATCH 74/80] Some more merge leftovers from new_function_interface --- loopy/kernel/__init__.py | 67 ++++++++++++++++++++++++++++++++----- loopy/kernel/creation.py | 7 +++- loopy/transform/callable.py | 64 ++++++++++++++++++----------------- 3 files changed, 97 insertions(+), 41 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 928eed26..26db6ec4 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1036,20 +1036,17 @@ class LoopKernel(ImmutableRecordWithoutPickling): constants_only=True))) @memoize_method - def get_grid_sizes_for_insn_ids(self, insn_ids, callables_table, - ignore_auto=False): + def get_grid_sizes_for_insn_ids_as_dicts(self, insn_ids, + callables_table, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of all instructions whose IDs are given in *insn_ids*. + :arg insn_ids: a :class:`frozenset` of instruction IDs - *global_size* and *local_size* are :class:`islpy.PwAff` objects. - """ - if self.overridden_get_grid_sizes_for_insn_ids: - return self.overridden_get_grid_sizes_for_insn_ids( - insn_ids, - callables_table, - ignore_auto=ignore_auto) + *global_size* and *local_size* are instances of :class:`dict` with + mapping of the form from ``axis`` to :class:`islpy.PwAff` objects. + """ # {{{ collecting the callee kernels in insn_ids @@ -1124,6 +1121,58 @@ class LoopKernel(ImmutableRecordWithoutPickling): return global_sizes, local_sizes @memoize_method + def get_grid_sizes_for_insn_ids(self, insn_ids, callables_table, + ignore_auto=False): + """Return a tuple (global_size, local_size) containing a grid that + could accommodate execution of all instructions whose IDs are given + in *insn_ids*. 
+ + :arg insn_ids: a :class:`frozenset` of instruction IDs + + *global_size* and *local_size* are :class:`islpy.PwAff` objects. + """ + + if self.overridden_get_grid_sizes_for_insn_ids: + return self.overridden_get_grid_sizes_for_insn_ids( + insn_ids, + callables_table=callables_table, + ignore_auto=ignore_auto) + + assert self.is_called_from_host, ("Callee kernels do not have sufficient " + "information to compute grid sizes.") + + global_sizes, local_sizes = self.get_grid_sizes_for_insn_ids_as_dicts( + insn_ids, callables_table, ignore_auto=ignore_auto) + + def to_dim_tuple(size_dict, which, forced_sizes={}): + forced_sizes = forced_sizes.copy() + + size_list = [] + sorted_axes = sorted(six.iterkeys(size_dict)) + + while sorted_axes or forced_sizes: + if sorted_axes: + cur_axis = sorted_axes.pop(0) + else: + cur_axis = None + + if len(size_list) in forced_sizes: + size_list.append(forced_sizes.pop(len(size_list))) + continue + + assert cur_axis is not None + + if cur_axis > len(size_list): + raise LoopyError("%s axis %d unused for %s" % ( + which, len(size_list), self.name)) + + size_list.append(size_dict[cur_axis]) + + return tuple(size_list) + + return (to_dim_tuple(global_sizes, "global"), + to_dim_tuple(local_sizes, "local", forced_sizes=self.local_sizes)) + def get_grid_sizes_for_insn_ids_as_exprs(self, insn_ids, callables_table, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 060b5d76..52e299b6 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -2146,7 +2146,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): # {{{ handle kernel language version - if is_callee_kernel: + if not is_callee_kernel: from loopy.version import LANGUAGE_VERSION_SYMBOLS version_to_symbol = dict( @@ -2353,6 +2353,11 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): def make_function(*args, **kwargs): + 
lang_version = kwargs.pop('lang_version', None) + if lang_version: + raise LoopyError("lang_version should be set for program, not " + "functions.") + kwargs['is_callee_kernel'] = True return make_kernel(*args, **kwargs) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 532f6021..e293543f 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -173,7 +173,7 @@ def register_callable_kernel(program, callee_kernel): expected_num_assignees = len([arg for arg in callee_kernel.args if arg.is_output_only]) expected_num_parameters = len(callee_kernel.args) - expected_num_assignees - for in_knl_callable in program.program_callables_info.values(): + for in_knl_callable in program.callables_table.values(): if isinstance(in_knl_callable, CallableKernel): caller_kernel = in_knl_callable.subkernel for insn in caller_kernel.instructions: @@ -211,8 +211,9 @@ def register_callable_kernel(program, callee_kernel): # take the function resolvers from the Program and resolve the functions in # the callee kernel - program_callables_info = ( - program.program_callables_info.with_edit_callables_mode()) + old_callables_count = program.callables_table.callables_count + callables_table = ( + program.callables_table.with_edit_callables_mode()) from loopy.symbolic import SubstitutionRuleMappingContext rule_mapping_context = SubstitutionRuleMappingContext( @@ -220,16 +221,17 @@ def register_callable_kernel(program, callee_kernel): callee_kernel.get_var_name_generator()) resolved_function_marker = ResolvedFunctionMarker( - rule_mapping_context, callee_kernel, program_callables_info, + rule_mapping_context, callee_kernel, callables_table, program.func_id_to_in_knl_callable_mappers) callee_kernel = rule_mapping_context.finish_kernel( resolved_function_marker.map_kernel(callee_kernel)) - program_callables_info = resolved_function_marker.program_callables_info + callables_table = resolved_function_marker.callables_table - program_callables_info = ( - 
program_callables_info.with_exit_edit_callables_mode()) - program = program.copy(program_callables_info=program_callables_info) + callables_table = ( + callables_table.with_exit_edit_callables_mode( + old_callables_count)) + program = program.copy(callables_table=callables_table) # making the target of the child kernel to be same as the target of parent # kernel. @@ -492,26 +494,26 @@ def _inline_call_instruction(caller_kernel, callee_knl, instruction): # {{{ inline callable kernel def _inline_single_callable_kernel(caller_kernel, function_name, - program_callables_info): + callables_table): old_insns = caller_kernel.instructions for insn in old_insns: if isinstance(insn, CallInstruction): # FIXME This seems to use identifiers across namespaces. Why not # check whether the function is a scoped function first? ~AK - if insn.expression.function.name in program_callables_info: - history_of_identifier = program_callables_info.history[ + if insn.expression.function.name in callables_table: + history_of_identifier = callables_table.history[ insn.expression.function.name] if function_name in history_of_identifier: - in_knl_callable = program_callables_info[ + in_knl_callable = callables_table[ insn.expression.function.name] assert isinstance(in_knl_callable, CallableKernel) caller_kernel = _inline_call_instruction( caller_kernel, in_knl_callable.subkernel, insn) - program_callables_info = ( - program_callables_info.with_deleted_callable( + callables_table = ( + callables_table.with_deleted_callable( insn.expression.function.name, - program_callables_info.num_times_callables_called[ + callables_table.num_times_callables_called[ caller_kernel.name])) elif isinstance(insn, (MultiAssignmentBase, CInstruction, _DataObliviousInstruction)): @@ -521,7 +523,7 @@ def _inline_single_callable_kernel(caller_kernel, function_name, "Unknown instruction type %s" % type(insn).__name__) - return caller_kernel, program_callables_info + return caller_kernel, callables_table # FIXME This 
should take a 'within' parameter to be able to only inline @@ -533,33 +535,33 @@ def inline_callable_kernel(program, function_name): """ from loopy.preprocess import infer_arg_descr program = infer_arg_descr(program) - program_callables_info = program.program_callables_info - old_program_callables_info = program_callables_info.copy() + callables_table = program.callables_table + old_callables_table = callables_table.copy() edited_callable_kernels = {} - for func_id, in_knl_callable in old_program_callables_info.items(): - if function_name not in old_program_callables_info.history[func_id] and ( + for func_id, in_knl_callable in old_callables_table.items(): + if function_name not in old_callables_table.history[func_id] and ( isinstance(in_knl_callable, CallableKernel)): caller_kernel = in_knl_callable.subkernel - caller_kernel, program_callables_info = ( + caller_kernel, callables_table = ( _inline_single_callable_kernel(caller_kernel, function_name, - program_callables_info)) + callables_table)) edited_callable_kernels[func_id] = in_knl_callable.copy( subkernel=caller_kernel) new_resolved_functions = {} - for func_id, in_knl_callable in program_callables_info.items(): + for func_id, in_knl_callable in callables_table.items(): if func_id in edited_callable_kernels: new_resolved_functions[func_id] = edited_callable_kernels[func_id] else: new_resolved_functions[func_id] = in_knl_callable - program_callables_info = program_callables_info.copy( + callables_table = callables_table.copy( resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=program_callables_info) + return program.copy(callables_table=callables_table) # }}} @@ -719,20 +721,20 @@ def _match_caller_callee_argument_dimension_(program, callee_function_name): callee_function_name).map_kernel caller_knl, = [in_knl_callable.subkernel for in_knl_callable in - program.program_callables_info.values() if isinstance(in_knl_callable, + program.callables_table.values() if 
isinstance(in_knl_callable, CallableKernel) and is_invoking_callee(in_knl_callable.subkernel)] - old_callee_knl = program.program_callables_info[ + old_callee_knl = program.callables_table[ callee_function_name].subkernel new_callee_kernel = _match_caller_callee_argument_dimension_for_single_kernel( caller_knl, old_callee_knl) - new_program_callables_info = program.program_callables_info.copy() - new_program_callables_info.resolved_functions[callee_function_name] = ( - new_program_callables_info[callee_function_name].copy( + new_callables_table = program.callables_table.copy() + new_callables_table.resolved_functions[callee_function_name] = ( + new_callables_table[callee_function_name].copy( subkernel=new_callee_kernel)) - return program.copy(program_callables_info=new_program_callables_info) + return program.copy(callables_table=new_callables_table) # }}} -- GitLab From 20371326ee0fad5ad62217231bb35e7aa65fe11b Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 21 Nov 2018 10:03:36 -0600 Subject: [PATCH 75/80] some more program_callables_info -> callables_table --- loopy/transform/callable.py | 46 ++++++++++++------------- loopy/transform/pack_and_unpack_args.py | 14 ++++---- 2 files changed, 29 insertions(+), 31 deletions(-) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index e293543f..f812b8ea 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -31,7 +31,7 @@ from loopy.kernel import LoopKernel from pytools import ImmutableRecord from loopy.diagnostic import LoopyError from loopy.kernel.instruction import (CallInstruction, MultiAssignmentBase, - CInstruction, _DataObliviousInstruction) + Assignment, CInstruction, _DataObliviousInstruction) from loopy.symbolic import IdentityMapper, SubstitutionMapper, CombineMapper from loopy.isl_helpers import simplify_via_aff from loopy.kernel.function_interface import (get_kw_pos_association, @@ -211,26 +211,19 @@ def register_callable_kernel(program, callee_kernel): # 
take the function resolvers from the Program and resolve the functions in # the callee kernel - old_callables_count = program.callables_table.callables_count - callables_table = ( - program.callables_table.with_edit_callables_mode()) - from loopy.symbolic import SubstitutionRuleMappingContext rule_mapping_context = SubstitutionRuleMappingContext( callee_kernel.substitutions, callee_kernel.get_var_name_generator()) resolved_function_marker = ResolvedFunctionMarker( - rule_mapping_context, callee_kernel, callables_table, + rule_mapping_context, callee_kernel, program.callables_table, program.func_id_to_in_knl_callable_mappers) callee_kernel = rule_mapping_context.finish_kernel( resolved_function_marker.map_kernel(callee_kernel)) - callables_table = resolved_function_marker.callables_table + callables_table = resolved_function_marker.callables_table.copy() - callables_table = ( - callables_table.with_exit_edit_callables_mode( - old_callables_count)) program = program.copy(callables_table=callables_table) # making the target of the child kernel to be same as the target of parent @@ -462,15 +455,25 @@ def _inline_call_instruction(caller_kernel, callee_knl, instruction): type(atomicity)(var_map[p.Variable(atomicity.var_name)].name) for atomicity in insn.atomicity) - insn = insn.copy( - id=insn_id[insn.id], - within_inames=within_inames, - # TODO: probaby need to keep priority in callee kernel - priority=instruction.priority, - depends_on=depends_on, - tags=insn.tags | instruction.tags, - atomicity=new_atomicity - ) + if isinstance(insn, Assignment): + insn = insn.copy( + id=insn_id[insn.id], + within_inames=within_inames, + # TODO: probaby need to keep priority in callee kernel + priority=instruction.priority, + depends_on=depends_on, + tags=insn.tags | instruction.tags, + atomicity=new_atomicity + ) + else: + insn = insn.copy( + id=insn_id[insn.id], + within_inames=within_inames, + # TODO: probaby need to keep priority in callee kernel + priority=instruction.priority, + 
depends_on=depends_on, + tags=insn.tags | instruction.tags, + ) inner_insns.append(insn) inner_insns.append(noop_end) @@ -510,11 +513,6 @@ def _inline_single_callable_kernel(caller_kernel, function_name, assert isinstance(in_knl_callable, CallableKernel) caller_kernel = _inline_call_instruction( caller_kernel, in_knl_callable.subkernel, insn) - callables_table = ( - callables_table.with_deleted_callable( - insn.expression.function.name, - callables_table.num_times_callables_called[ - caller_kernel.name])) elif isinstance(insn, (MultiAssignmentBase, CInstruction, _DataObliviousInstruction)): pass diff --git a/loopy/transform/pack_and_unpack_args.py b/loopy/transform/pack_and_unpack_args.py index 73407257..e5ed850c 100644 --- a/loopy/transform/pack_and_unpack_args.py +++ b/loopy/transform/pack_and_unpack_args.py @@ -37,7 +37,7 @@ __doc__ = """ def pack_and_unpack_args_for_call_for_single_kernel(kernel, - program_callables_info, call_name, args_to_pack=None, + callables_table, call_name, args_to_pack=None, args_to_unpack=None): """ Returns a a copy of *kernel* with instructions appended to copy the @@ -63,10 +63,10 @@ def pack_and_unpack_args_for_call_for_single_kernel(kernel, if not isinstance(insn, CallInstruction): # pack and unpack call only be done for CallInstructions. 
continue - if insn.expression.function.name not in program_callables_info: + if insn.expression.function.name not in callables_table: continue - in_knl_callable = program_callables_info[ + in_knl_callable = callables_table[ insn.expression.function.name] if in_knl_callable.name != call_name: @@ -324,10 +324,10 @@ def pack_and_unpack_args_for_call(program, *args, **kwargs): assert isinstance(program, Program) new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): + for func_id, in_knl_callable in program.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): new_subkernel = pack_and_unpack_args_for_call_for_single_kernel( - in_knl_callable.subkernel, program.program_callables_info, + in_knl_callable.subkernel, program.callables_table, *args, **kwargs) in_knl_callable = in_knl_callable.copy( subkernel=new_subkernel) @@ -340,8 +340,8 @@ def pack_and_unpack_args_for_call(program, *args, **kwargs): new_resolved_functions[func_id] = in_knl_callable - new_program_callables_info = program.program_callables_info.copy( + new_callables_table = program.callables_table.copy( resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=new_program_callables_info) + return program.copy(callables_table=new_callables_table) # vim: foldmethod=marker -- GitLab From 600f9d1bdcf3f9f46fb7a56cd9c5fc00ce84a555 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 21 Nov 2018 10:42:01 -0600 Subject: [PATCH 76/80] re-adds some missing checks --- loopy/check.py | 4 ++-- loopy/target/c/__init__.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index 82b99a43..659e210f 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -729,8 +729,8 @@ def pre_schedule_checks(kernel, callables_table): check_for_data_dependent_parallel_bounds(kernel) check_bounds(kernel) check_write_destinations(kernel) - # check_has_schedulable_iname_nesting(kernel) - # 
check_variable_access_ordered(kernel) + check_has_schedulable_iname_nesting(kernel) + check_variable_access_ordered(kernel) logger.debug("%s: pre-schedule check: done" % kernel.name) except KeyboardInterrupt: diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index ca4d6b00..ac3dec32 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -435,7 +435,7 @@ class CMathCallable(ScalarCallable): if dtype.kind == "c": raise LoopyTypeError("%s does not support complex numbers") - elif dtype.kind == "f" and name in ["fmax", "fmin"]: + elif dtype.kind == "f" or name in ["fmax", "fmin"]: from loopy.target.opencl import OpenCLTarget if not isinstance(caller_kernel.target, OpenCLTarget): if dtype == np.float64: -- GitLab From 1d48377532bc8092bbc613fa09a63f166047ef10 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 22 Nov 2018 04:17:28 -0600 Subject: [PATCH 77/80] reverted the changes in type inference --- loopy/target/c/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index ac3dec32..58051e42 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -435,7 +435,7 @@ class CMathCallable(ScalarCallable): if dtype.kind == "c": raise LoopyTypeError("%s does not support complex numbers") - elif dtype.kind == "f" or name in ["fmax", "fmin"]: + elif dtype.kind == "f": from loopy.target.opencl import OpenCLTarget if not isinstance(caller_kernel.target, OpenCLTarget): if dtype == np.float64: -- GitLab From a840eed1fed2dd3f0ba636f7f2cd9ae446d55531 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 22 Nov 2018 05:55:49 -0600 Subject: [PATCH 78/80] minor changes to relax type inference --- loopy/statistics.py | 5 +++++ loopy/type_inference.py | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/loopy/statistics.py b/loopy/statistics.py index 965c164e..c621ea72 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -34,6 
+34,8 @@ from loopy.kernel.data import ( from loopy.diagnostic import warn_with_kernel, LoopyError from pytools import Record, memoize_method from loopy.kernel.function_interface import ScalarCallable, CallableKernel +from loopy.kernel import LoopKernel +from loopy.program import make_program __doc__ = """ @@ -1458,6 +1460,9 @@ def get_op_map(program, numpy_types=True, count_redundant_work=False, """ + if isinstance(program, LoopKernel): + program = make_program(program) + from loopy.preprocess import preprocess_program, infer_unknown_types program = infer_unknown_types(program, expect_completion=True) program = preprocess_program(program) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 4137709e..5047dcc2 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -457,6 +457,10 @@ class TypeInferenceMapper(CombineMapper): np.int64): continue + if np.can_cast(arg_id_to_dtype[id].dtype.type, + in_knl_callable.arg_id_to_dtype[id].dtype.type): + continue + # }}} raise LoopyError("Overwriting a specialized function " -- GitLab From 237b7ef44125410dd3d7a23f75fa3a838331e560 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 22 Nov 2018 06:04:25 -0600 Subject: [PATCH 79/80] some more leftover program_callables_info -> callables_table --- examples/python/call-external.py | 6 +++--- loopy/kernel/function_interface.py | 16 ++++++++-------- loopy/kernel/tools.py | 6 +++--- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/examples/python/call-external.py b/examples/python/call-external.py index 68618a7e..c13d99bd 100644 --- a/examples/python/call-external.py +++ b/examples/python/call-external.py @@ -7,14 +7,14 @@ from loopy.target.c import CTarget # {{{ blas callable class BLASCallable(lp.ScalarCallable): - def with_types(self, arg_id_to_dtype, kernel, program_callables_info): + def with_types(self, arg_id_to_dtype, kernel, callables_table): for i in range(0, 2): if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: # 
the types provided aren't mature enough to specialize the # callable return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) mat_dtype = arg_id_to_dtype[0].numpy_dtype vec_dtype = arg_id_to_dtype[1].numpy_dtype @@ -34,7 +34,7 @@ class BLASCallable(lp.ScalarCallable): from loopy.types import NumpyType return self.copy(name_in_target=name_in_target, arg_id_to_dtype={0: NumpyType(vec_dtype), 1: NumpyType(vec_dtype), - -1: NumpyType(vec_dtype)}), program_callables_info + -1: NumpyType(vec_dtype)}), callables_table def emit_call_insn(self, insn, target, expression_to_code_mapper): assert self.is_ready_for_codegen() diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index fa7a87fe..3e628f5c 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -532,7 +532,7 @@ class CallableKernel(InKernelCallable): return self.subkernel.name def with_types(self, arg_id_to_dtype, caller_kernel, - program_callables_info): + callables_table): kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) new_args = [] @@ -555,10 +555,10 @@ class CallableKernel(InKernelCallable): # infer the types of the written variables based on the knowledge # of the types of the arguments supplied - specialized_kernel, program_callables_info = ( + specialized_kernel, callables_table = ( infer_unknown_types_for_a_single_kernel( pre_specialized_subkernel, - program_callables_info, + callables_table, expect_completion=True)) new_arg_id_to_dtype = {} @@ -571,9 +571,9 @@ class CallableKernel(InKernelCallable): # Return the kernel call with specialized subkernel and the corresponding # new arg_id_to_dtype return self.copy(subkernel=specialized_kernel, - arg_id_to_dtype=new_arg_id_to_dtype), program_callables_info + arg_id_to_dtype=new_arg_id_to_dtype), callables_table - def with_descrs(self, arg_id_to_descr, program_callables_info): + def with_descrs(self, arg_id_to_descr, callables_table): # tune 
the subkernel so that we have the matching shapes and # dim_tags @@ -602,15 +602,15 @@ class CallableKernel(InKernelCallable): type(descr)) descriptor_specialized_knl = self.subkernel.copy(args=new_args) from loopy.preprocess import traverse_to_infer_arg_descr - descriptor_specialized_knl, program_callables_info = ( + descriptor_specialized_knl, callables_table = ( traverse_to_infer_arg_descr(descriptor_specialized_knl, - program_callables_info)) + callables_table)) return ( self.copy( subkernel=descriptor_specialized_knl, arg_id_to_descr=arg_id_to_descr), - program_callables_info) + callables_table) def with_packing_for_args(self): from loopy.kernel.data import AddressSpace diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 125577c9..26856d64 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -1866,7 +1866,7 @@ def find_aliasing_equivalence_classes(kernel): # {{{ callee kernel tools -def get_direct_callee_kernels(kernel, program_callables_info, insn_ids=None,): +def get_direct_callee_kernels(kernel, callables_table, insn_ids=None,): """ Returns an instance of :class:`frozenset` of all the callee kernels called in instructions in the *kernel* whose IDs are given in *insn_ids*. 
@@ -1892,8 +1892,8 @@ def get_direct_callee_kernels(kernel, program_callables_info, insn_ids=None,): from loopy.kernel.instruction import (CallInstruction, MultiAssignmentBase, CInstruction, _DataObliviousInstruction) if isinstance(insn, CallInstruction): - if insn.expression.function.name in program_callables_info: - in_knl_callable = program_callables_info[ + if insn.expression.function.name in callables_table: + in_knl_callable = callables_table[ insn.expression.function.name] if isinstance(in_knl_callable, CallableKernel): return in_knl_callable.subkernel -- GitLab From 608ac4016fdba92e87a7df384560dac9d2979eb4 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 22 Nov 2018 06:29:06 -0600 Subject: [PATCH 80/80] ArrayArg->GlobalArg --- doc/tutorial.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index c134e4fb..25082f88 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1145,7 +1145,7 @@ the right by 1 in parallel: ... end ... """, ... [ - ... lp.ArrayArg("arr", shape=("n",), dtype=np.int32), + ... lp.GlobalArg("arr", shape=("n",), dtype=np.int32), ... "...", ... ], ... name="rotate_v1", @@ -1189,7 +1189,7 @@ Let us start with an example. Consider the kernel from above with a ... end ... """, ... [ - ... lp.ArrayArg("arr", shape=("n",), dtype=np.int32), + ... lp.GlobalArg("arr", shape=("n",), dtype=np.int32), ... "...", ... ], ... name="rotate_v2", @@ -1323,8 +1323,8 @@ tagged, as in the following example:: "{ [i]: 0<=i