diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 058af59bcc59593189948a0fbd15c7d09349f05c..59df3af620050eef712881e955098af6678cea79 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -28,8 +28,6 @@ from loopy.diagnostic import LoopyError, warn from pytools import Record import islpy as isl -import numpy as np - from pytools.persistent_dict import PersistentDict from loopy.tools import LoopyKeyBuilder from loopy.version import DATA_MODEL_VERSION @@ -205,6 +203,8 @@ class CodeGenerationState(object): set of :class:`SeenFunction` instances + .. attribute:: seen_atomic_dtypes + .. attribute:: var_subst_map .. attribute:: allow_complex @@ -215,17 +215,19 @@ class CodeGenerationState(object): """ def __init__(self, kernel, implemented_domain, implemented_predicates, - seen_dtypes, seen_functions, var_subst_map, + seen_dtypes, seen_functions, seen_atomic_dtypes, var_subst_map, allow_complex, - vectorization_info=None): + vectorization_info=None, var_name_generator=None): self.kernel = kernel self.implemented_domain = implemented_domain self.implemented_predicates = implemented_predicates self.seen_dtypes = seen_dtypes self.seen_functions = seen_functions + self.seen_atomic_dtypes = seen_atomic_dtypes self.var_subst_map = var_subst_map.copy() self.allow_complex = allow_complex self.vectorization_info = vectorization_info + self.var_name_generator = var_name_generator # {{{ copy helpers @@ -245,9 +247,11 @@ class CodeGenerationState(object): implemented_predicates or self.implemented_predicates), seen_dtypes=self.seen_dtypes, seen_functions=self.seen_functions, + seen_atomic_dtypes=self.seen_atomic_dtypes, var_subst_map=var_subst_map or self.var_subst_map, allow_complex=self.allow_complex, - vectorization_info=vectorization_info) + vectorization_info=vectorization_info, + var_name_generator=self.var_name_generator) def copy_and_assign(self, name, value): """Make a copy of self with variable *name* fixed to *value*.""" @@ -347,7 
+351,8 @@ class POD(Declarator): """ def __init__(self, target, dtype, name): - dtype = np.dtype(dtype) + from loopy.types import LoopyType + assert isinstance(dtype, LoopyType) self.target = target self.ctype = target.dtype_to_typename(dtype) @@ -528,6 +533,7 @@ def generate_code(kernel, device=None): seen_dtypes = set() seen_functions = set() + seen_atomic_dtypes = set() initial_implemented_domain = isl.BasicSet.from_params(kernel.assumptions) codegen_state = CodeGenerationState( @@ -536,8 +542,10 @@ def generate_code(kernel, device=None): implemented_predicates=frozenset(), seen_dtypes=seen_dtypes, seen_functions=seen_functions, + seen_atomic_dtypes=seen_atomic_dtypes, var_subst_map={}, - allow_complex=allow_complex) + allow_complex=allow_complex, + var_name_generator=kernel.get_var_name_generator()) code_str, implemented_domains = kernel.target.generate_code( kernel, codegen_state, impl_arg_info) @@ -555,10 +563,22 @@ def generate_code(kernel, device=None): preambles = kernel.preambles[:] + from pytools import Record + + class PreambleInfo(Record): + pass + + preamble_info = PreambleInfo( + kernel=kernel, + seen_dtypes=seen_dtypes, + seen_functions=seen_functions, + # a set of LoopyTypes (!) 
+ seen_atomic_dtypes=seen_atomic_dtypes) + preamble_generators = (kernel.preamble_generators + kernel.target.preamble_generators()) for prea_gen in preamble_generators: - preambles.extend(prea_gen(kernel, seen_dtypes, seen_functions)) + preambles.extend(prea_gen(preamble_info)) seen_preamble_tags = set() dedup_preambles = [] diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py index 871ed10a7b4c12aa799002ae5213d400a2ef961f..f4c48443f9062bf362e9a27681d2967b8f82807d 100644 --- a/loopy/codegen/instruction.py +++ b/loopy/codegen/instruction.py @@ -92,6 +92,8 @@ def generate_expr_instruction_code(kernel, insn, codegen_state): from loopy.expression import dtype_to_type_context, VectorizabilityChecker + # {{{ vectorization handling + if codegen_state.vectorization_info: if insn.atomicity: raise Unvectorizable("atomic operation") @@ -111,18 +113,49 @@ def generate_expr_instruction_code(kernel, insn, codegen_state): del lhs_is_vector del rhs_is_vector - expr = insn.expression + # }}} (assignee_var_name, assignee_indices), = insn.assignees_and_indices() - target_dtype = kernel.get_var_descriptor(assignee_var_name).dtype + lhs_dtype = kernel.get_var_descriptor(assignee_var_name).dtype + + if insn.atomicity is not None: + lhs_atomicity = [ + a for a in insn.atomicity if a.var_name == assignee_var_name] + assert len(lhs_atomicity) <= 1 + if lhs_atomicity: + lhs_atomicity, = lhs_atomicity + else: + lhs_atomicity = None + else: + lhs_atomicity = None + + from loopy.kernel.data import AtomicInit, AtomicUpdate - from cgen import Assign lhs_code = ecm(insn.assignee, prec=PREC_NONE, type_context=None) - result = Assign( - lhs_code, - ecm(expr, prec=PREC_NONE, - type_context=dtype_to_type_context(kernel.target, target_dtype), - needed_dtype=target_dtype)) + rhs_type_context = dtype_to_type_context(kernel.target, lhs_dtype) + if lhs_atomicity is None: + from cgen import Assign + result = Assign( + lhs_code, + ecm(insn.expression, prec=PREC_NONE, + 
type_context=rhs_type_context, + needed_dtype=lhs_dtype)) + + elif isinstance(lhs_atomicity, AtomicInit): + raise NotImplementedError("atomic init") + + elif isinstance(lhs_atomicity, AtomicUpdate): + codegen_state.seen_atomic_dtypes.add(lhs_dtype) + result = kernel.target.generate_atomic_update( + kernel, codegen_state, lhs_atomicity, + insn.assignee, insn.expression, + lhs_dtype, rhs_type_context) + + else: + raise ValueError("unexpected lhs atomicity type: %s" + % type(lhs_atomicity).__name__) + + # {{{ tracing if kernel.options.trace_assignments or kernel.options.trace_assignment_values: if codegen_state.vectorization_info and is_vector: @@ -179,6 +212,8 @@ def generate_expr_instruction_code(kernel, insn, codegen_state): # print first, execute later -> helps find segfaults result = Block([printf_insn, result]) + # }}} + return result diff --git a/loopy/expression.py b/loopy/expression.py index e550886b768dda5e3a42e37af6c5acb29225dc06..16c09b82821044f880e0f459af980da13526211b 100644 --- a/loopy/expression.py +++ b/loopy/expression.py @@ -76,12 +76,19 @@ class TypeInferenceMapper(CombineMapper): # /!\ Introduce caches with care--numpy.float32(x) and numpy.float64(x) # are Python-equal (for many common constants such as integers). 
def generate_atomic_update(self, kernel, codegen_state, lhs_atomicity,
        lhs_expr, rhs_expr, lhs_dtype, rhs_type_context=None):
    """Generate target code for an atomic update of *lhs_expr*.

    This base-class version is a placeholder that always raises; targets
    that support atomic updates override it.

    :arg rhs_type_context: accepted so that this signature agrees with the
        call made from ``generate_expr_instruction_code``, which passes it
        as a seventh positional argument. It defaults to *None* so callers
        using the shorter argument list keep working.

    :raises NotImplementedError: always.
    """
    # Without the trailing parameter, the seven-argument call would fail
    # with a TypeError about the argument count instead of reporting that
    # atomic updates are unsupported on this target.
    raise NotImplementedError("atomic update")
class DTypeRegistryWrapper(object):
    """Adapter that lets a dtype registry be called with
    :class:`loopy.types.LoopyType` instances.

    A :class:`loopy.types.NumpyType` is unwrapped to its ``.dtype`` before
    being handed to *wrapped_registry*; any other
    :class:`loopy.types.LoopyType` is rejected with a
    :class:`loopy.diagnostic.LoopyError`.
    """

    def __init__(self, wrapped_registry):
        # The underlying registry; subclasses read this attribute directly.
        self.wrapped_registry = wrapped_registry

    def get_or_register_dtype(self, names, dtype=None):
        """Forward to the wrapped registry's ``get_or_register_dtype``.

        :arg dtype: *None* (pure lookup by *names*) or a
            :class:`loopy.types.LoopyType`.
        :raises LoopyError: if *dtype* is a LoopyType other than NumpyType.
        """
        if dtype is None:
            # Lookup only: nothing to unwrap.
            return self.wrapped_registry.get_or_register_dtype(names, dtype)

        from loopy.types import LoopyType, NumpyType
        assert isinstance(dtype, LoopyType)

        if isinstance(dtype, NumpyType):
            return self.wrapped_registry.get_or_register_dtype(
                    names, dtype.dtype)

        raise LoopyError(
                "unable to get or register type '%s'"
                % dtype)

    def dtype_to_ctype(self, dtype):
        """Return the wrapped registry's C type name for *dtype*.

        :arg dtype: a :class:`loopy.types.LoopyType`.
        :raises LoopyError: if *dtype* is a LoopyType other than NumpyType.
        """
        from loopy.types import LoopyType, NumpyType
        assert isinstance(dtype, LoopyType)

        if isinstance(dtype, NumpyType):
            # Pass the underlying dtype, exactly as get_or_register_dtype
            # does, rather than the NumpyType wrapper itself.
            # NOTE(review): presumably the wrapped registry keys on numpy
            # dtypes -- confirm against the registry implementation.
            return self.wrapped_registry.dtype_to_ctype(dtype.dtype)

        raise LoopyError(
                "unable to convert type '%s' to C"
                % dtype)
class DTypeRegistryWrapperWithAtomics(DTypeRegistryWrapper):
    """Registry wrapper that also accepts
    :class:`loopy.types.AtomicNumpyType` in :meth:`get_or_register_dtype`.
    """

    def get_or_register_dtype(self, names, dtype=None):
        """Like the base-class method, but an AtomicNumpyType is first
        replaced by a plain ``NumpyType`` built from its ``.dtype``.
        """
        if dtype is not None:
            from loopy.types import AtomicNumpyType, NumpyType
            if isinstance(dtype, AtomicNumpyType):
                dtype = NumpyType(dtype.dtype)

        # Two-argument super() takes (class, instance). The previous code
        # passed the *result* of a registry call as super()'s only argument,
        # so the base-class method was never invoked.
        base = super(DTypeRegistryWrapperWithAtomics, self)
        return base.get_or_register_dtype(names, dtype)


class DTypeRegistryWrapperWithCL1Atomics(DTypeRegistryWrapperWithAtomics):
    """Registry wrapper that renders atomic types as ``volatile``-qualified
    C types.
    """

    def dtype_to_ctype(self, dtype):
        """Return ``"volatile " + <ctype>`` for an AtomicNumpyType and defer
        to the base class for everything else.
        """
        from loopy.types import AtomicNumpyType

        if isinstance(dtype, AtomicNumpyType):
            # Pass the underlying dtype, as get_or_register_dtype above
            # does when it unwraps ``dtype.dtype``.
            # NOTE(review): presumably the wrapped registry keys on numpy
            # dtypes -- confirm against the registry implementation.
            return "volatile " + self.wrapped_registry.dtype_to_ctype(
                    dtype.dtype)

        base = super(DTypeRegistryWrapperWithCL1Atomics, self)
        return base.dtype_to_ctype(dtype)
def __init__(self, atomics_flavor="cl1"):
    """
    :arg atomics_flavor: one of ``"cl2"`` (C11-style atomics from OpenCL 2.0),
        ``"cl1"`` (OpenCL 1.1 atomics, using bit-for-bit compare-and-swap
        for floating point), ``"cl1-exch"`` (OpenCL 1.1 atomics, using
        double-exchange for floating point--not yet supported).

    :raises ValueError: if *atomics_flavor* is neither ``"cl1"`` nor
        ``"cl2"`` (this currently includes ``"cl1-exch"``).
    """
    super(OpenCLTarget, self).__init__()

    if atomics_flavor not in ["cl1", "cl2"]:
        raise ValueError("unsupported atomics flavor: %s" % atomics_flavor)

    self.atomics_flavor = atomics_flavor


def generate_atomic_update(self, kernel, codegen_state, lhs_atomicity,
        lhs_expr, rhs_expr, lhs_dtype, rhs_type_context):
    """Return a :mod:`cgen` block performing ``lhs_expr = rhs_expr``
    atomically through a compare-and-swap retry loop.

    The generated block declares two temporaries (old and new value) and a
    do-while loop whose body reads the current value of *lhs_expr* into the
    old-value temporary and evaluates *rhs_expr* -- with occurrences of
    *lhs_expr* replaced by that temporary -- into the new-value temporary.
    The loop repeats while the ``cmpxchg`` call returns something other
    than the old value.

    *kernel* and *lhs_atomicity* are not used by this implementation.

    :raises NotImplementedError: if *lhs_dtype* is not a ``NumpyType`` whose
        numpy dtype is one of int32, int64, float32, float64.
    :raises LoopyError: if the item size is neither 4 nor 8 bytes.
    """
    from pymbolic.mapper.stringifier import PREC_NONE

    # FIXME: Could detect operations, generate atomic_{add,...} when
    # appropriate.

    if isinstance(lhs_dtype, NumpyType) and lhs_dtype.numpy_dtype in [
            np.int32, np.int64, np.float32, np.float64]:
        from cgen import Block, DoWhile, Assign
        from loopy.codegen import POD
        old_val_var = codegen_state.var_name_generator("loopy_old_val")
        new_val_var = codegen_state.var_name_generator("loopy_new_val")

        # Make the two fresh temporaries known to the expression mapper so
        # that expressions mentioning them can be typed.
        from loopy.kernel.data import TemporaryVariable
        ecm = codegen_state.expression_to_code_mapper.with_assignments(
                {
                    old_val_var: TemporaryVariable(old_val_var, lhs_dtype),
                    new_val_var: TemporaryVariable(new_val_var, lhs_dtype),
                    })

        lhs_expr_code = ecm(lhs_expr, prec=PREC_NONE, type_context=None)

        from pymbolic.mapper.substitutor import make_subst_func
        from pymbolic import var
        from loopy.symbolic import SubstitutionMapper

        # The right-hand side must be computed from the value captured in
        # the old-value temporary, not from a second read of the target.
        subst = SubstitutionMapper(
                make_subst_func({lhs_expr: var(old_val_var)}))
        rhs_expr_code = ecm(subst(rhs_expr), prec=PREC_NONE,
                type_context=rhs_type_context,
                needed_dtype=lhs_dtype)

        # The 4-byte and 8-byte cases use differently named functions.
        if lhs_dtype.numpy_dtype.itemsize == 4:
            func_name = "atomic_cmpxchg"
        elif lhs_dtype.numpy_dtype.itemsize == 8:
            func_name = "atom_cmpxchg"
        else:
            raise LoopyError("unexpected atomic size")

        cast_str = ""
        old_val = old_val_var
        new_val = new_val_var

        if lhs_dtype.numpy_dtype.kind == "f":
            # Floating-point values are compared and swapped through an
            # integer type of the same size.
            if lhs_dtype.numpy_dtype == np.float32:
                ctype = "int"
            elif lhs_dtype.numpy_dtype == np.float64:
                ctype = "long"
            else:
                assert False

            old_val = "*(%s *) &" % ctype + old_val
            new_val = "*(%s *) &" % ctype + new_val
            # NOTE(review): the address space is hard-coded to __global
            # here -- confirm this is never reached for non-global targets.
            cast_str = "(__global %s *) " % ctype

        return Block([
            POD(self, NumpyType(lhs_dtype.dtype), old_val_var),
            POD(self, NumpyType(lhs_dtype.dtype), new_val_var),
            DoWhile(
                "%(func_name)s("
                "%(cast_str)s&(%(lhs_expr)s), "
                "%(old_val)s, "
                "%(new_val)s"
                ") != %(old_val)s"
                % {
                    "func_name": func_name,
                    "cast_str": cast_str,
                    "lhs_expr": lhs_expr_code,
                    "old_val": old_val,
                    "new_val": new_val,
                    },
                Block([
                    Assign(old_val_var, lhs_expr_code),
                    Assign(new_val_var, rhs_expr_code),
                    ])
                )
            ])
    else:
        raise NotImplementedError("atomic update for '%s'" % lhs_dtype)
atomics not supported on device") + knl = lp.make_kernel( "{ [i]: 0<=i<n }", "out[i%20] = out[i%20] + 2*a[i] {atomic}", [ - lp.GlobalArg("out", np.float32, shape=lp.auto, - for_atomic=True), - lp.GlobalArg("a", np.float32, shape=lp.auto), + lp.GlobalArg("out", dtype, shape=lp.auto, for_atomic=True), + lp.GlobalArg("a", dtype, shape=lp.auto), "..." ], assumptions="n>0") ref_knl = knl - knl = lp.chunk_iname(knl, "i", 3, inner_tag="l.0") - knl = lp.set_loop_priority(knl, "i_outer, i_inner") - lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=130)) + knl = lp.split_iname(knl, "i", 512) + knl = lp.split_iname(knl, "i_inner", 128, outer_tag="unr", inner_tag="g.0") + lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=10000)) if __name__ == "__main__":