diff --git a/loopy/check.py b/loopy/check.py index e9ff2425e2c4627dcf75591bdea4e112f1497643..3f7d99076745e545b1ecc4faa913b561dfd3e96c 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -8,8 +8,8 @@ from __future__ import division def check_for_unused_hw_axes_in_insns(kernel): group_size, local_size = kernel.get_grid_sizes_as_exprs() - group_axes = set(range(len(group_size))) - local_axes = set(range(len(local_size))) + group_axes = set(ax for ax, length in enumerate(group_size) if length != 1) + local_axes = set(ax for ax, length in enumerate(local_size) if length != 1) from loopy.kernel import LocalIndexTag, AutoLocalIndexTagBase, GroupIndexTag for insn in kernel.instructions: diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py index bbd6ab8bcc7830763f6485db1af7316b5d61820f..88a64e7b270f4c1077f645710d5ba9a8111f4357 100644 --- a/loopy/codegen/bounds.py +++ b/loopy/codegen/bounds.py @@ -149,7 +149,6 @@ def wrap_in_for_from_constraints(ccm, iname, constraint_bset, stmt): constraints = constraint_bset.get_constraints() - from pymbolic import expand from pymbolic.mapper.constant_folder import CommutativeConstantFoldingMapper cfm = CommutativeConstantFoldingMapper() @@ -174,7 +173,7 @@ def wrap_in_for_from_constraints(ccm, iname, constraint_bset, stmt): from pymbolic import var rhs += iname_coeff*var(iname) end_conds.append("%s >= 0" % - ccm(cfm(expand(rhs)))) + ccm(cfm(rhs))) else: # iname_coeff > 0 kind, bound = solve_constraint_for_bound(cns, iname) assert kind == ">=" diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index ade8d9fe5a79ed1bce3a70a398375b52b56695e0..3d81a5d6a29f181f2f1f0f43a4787c15bce14ed1 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -29,9 +29,6 @@ def get_simple_loop_bounds(kernel, sched_index, iname, implemented_domain): # {{{ conditional-minimizing slab decomposition def get_slab_decomposition(kernel, iname, sched_index, codegen_state): - space = kernel.space - tag = kernel.iname_to_tag.get(iname) - lb_cns_orig, ub_cns_orig = get_simple_loop_bounds(kernel, sched_index, iname, codegen_state.implemented_domain) @@ -39,7 +36,6 @@ def get_slab_decomposition(kernel, iname, sched_index, codegen_state): iname_tp, iname_idx = kernel.iname_to_dim[iname] - constraints = [lb_cns_orig] if lower_incr or upper_incr: bounds = kernel.get_iname_bounds(iname) @@ -118,8 +114,6 @@ def get_slab_decomposition(kernel, iname, sched_index, codegen_state): # {{{ unrolled loops def generate_unroll_loop(kernel, sched_index, codegen_state): - ccm = codegen_state.c_code_mapper - space = kernel.space iname = kernel.schedule[sched_index].iname tag = kernel.iname_to_tag.get(iname) @@ -167,7 +161,9 @@ def set_up_hw_parallel_loops(kernel, sched_index, codegen_state, hw_inames_left= global_size, local_size = kernel.get_grid_sizes() + hw_inames_left = hw_inames_left[:] iname = hw_inames_left.pop() + tag = kernel.iname_to_tag.get(iname) assert isinstance(tag, UniqueTag) @@ -205,8 +201,6 @@ def set_up_hw_parallel_loops(kernel, sched_index, codegen_state, hw_inames_left= raise RuntimeError("cannot do slab decomposition on inames that share " "a tag with other inames") - ccm = codegen_state.c_code_mapper - result = [] from loopy.codegen import add_comment @@ -230,9 +224,7 @@ def set_up_hw_parallel_loops(kernel, sched_index, codegen_state, hw_inames_left= def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state): ccm = codegen_state.c_code_mapper - space = kernel.space iname = kernel.schedule[sched_index].iname - tag = kernel.iname_to_tag.get(iname) slabs = get_slab_decomposition( kernel, iname, sched_index, codegen_state) diff --git a/loopy/compiled.py b/loopy/compiled.py index 0c6ec7580ee3a29de72aab6c307e7404f3970311..9b976aa186600ac6f511e72d7fcc58342566ea69 100644 --- a/loopy/compiled.py +++ b/loopy/compiled.py @@ -1,4 +1,5 @@ from __future__ import division +import pyopencl as cl diff --git a/loopy/kernel.py b/loopy/kernel.py index ad27038e65aa050e3266644806b89658535f0225..04bce3bad9da15dd14c20d5e63c3276c86750fbc 100644 --- a/loopy/kernel.py +++ b/loopy/kernel.py @@ -772,6 +772,8 @@ class LoopKernel(Record): size_list = [] sorted_axes = sorted(size_dict.iterkeys()) + zero_aff = isl.Aff.zero_on_domain(self.space.params()) + while sorted_axes or forced_sizes: if sorted_axes: cur_axis = sorted_axes.pop(0) @@ -781,8 +783,7 @@ class LoopKernel(Record): if len(size_list) in forced_sizes: size_list.append( isl.PwAff.from_aff( - isl.Aff.zero_on_domain(self.space.params()) - + forced_sizes.pop(len(size_list)))) + zero_aff + forced_sizes.pop(len(size_list)))) continue assert cur_axis is not None @@ -792,7 +793,7 @@ class LoopKernel(Record): from warnings import warn warn("%s axis %d unassigned--assuming length 1" % ( which, len(size_list)), LoopyAdvisory) - size_list.append(1) + size_list.append(zero_aff + 1) size_list.append(size_dict[cur_axis]) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index b25dd296528c9ca8d816e69376cf27e36a8ae8c5..8f28afb33c1e45f0cbec609ed907567d2ee745fd 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -441,8 +441,6 @@ def pw_aff_to_expr(pw_aff): return aff_to_expr(aff) def aff_from_expr(space, expr): - n = space.dim(dim_type.set) - zero = isl.Aff.zero_on_domain(isl.LocalSpace.from_space(space)) context = {} for name, (dt, pos) in space.get_var_dict().iteritems(): @@ -452,7 +450,7 @@ def aff_from_expr(space, expr): context[name] = zero.set_coefficient(dt, pos, 1) from pymbolic import evaluate - return evaluate(expr, context) + return zero + evaluate(expr, context) # }}}