Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • tasmith4/loopy
  • ben_sepanski/loopy
  • arghdos/loopy
  • inducer/loopy
  • wence-/loopy
  • isuruf/loopy
  • fikl2/loopy
  • xywei/loopy
  • kaushikcfd/loopy
  • zweiner2/loopy
10 results
Show changes
Showing
with 5399 additions and 2662 deletions
from __future__ import annotations
__copyright__ = "Copyright (C) 2017 Nick Curtis"
__license__ = """
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
import ctypes
import logging
import os
import tempfile
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Callable, ClassVar, Sequence
import numpy as np
from codepy.jit import compile_from_string
from codepy.toolchain import GCCToolchain, ToolchainGuessError, guess_toolchain
from pytools import memoize_method
from pytools.codegen import CodeGenerator, Indentation
from pytools.prefork import ExecError
from loopy.kernel.array import ArrayBase
from loopy.target.execution import (
ExecutionWrapperGeneratorBase,
ExecutorBase,
get_highlighted_code,
)
from loopy.types import LoopyType
if TYPE_CHECKING:
from constantdict import constantdict
from loopy.codegen.result import GeneratedProgram
from loopy.kernel import LoopKernel
from loopy.kernel.data import ArrayArg
from loopy.schedule.tools import KernelArgInfo
from loopy.translation_unit import TranslationUnit
from loopy.typing import Expression
logger = logging.getLogger(__name__)
# Python source of the exact-division helpers that are added verbatim to the
# preamble of every generated invoker.  The text is executed as a module
# body, so it must be valid, properly indented Python on its own.
DEF_EVEN_DIV_FUNCTION = """
def _lpy_even_div(a, b):
    result, remdr = divmod(a, b)
    if remdr != 0:
        # FIXME: This error message is kind of crummy.
        raise ValueError("expected even division")
    return result

def _lpy_even_div_none(a, b):
    if a is None:
        return None

    result, remdr = divmod(a, b)
    if remdr != 0:
        # FIXME: This error message is kind of crummy.
        raise ValueError("expected even division")
    return result
"""
# {{{ CExecutionWrapperGenerator
class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
    """
    Specialized form of the :class:`ExecutionWrapperGeneratorBase` for
    C execution.

    The generated invoker receives the compiled kernels through the
    ``_lpy_c_kernels`` system argument and calls each of them in turn.
    """

    def __init__(self):
        system_args = ["_lpy_c_kernels"]
        super().__init__(system_args)

    def python_dtype_str_inner(self, dtype):
        """
        Return source text that rebuilds *dtype* inside the generated
        invoker, where numpy is available as ``_lpy_np``.

        :raises Exception: if *dtype* is not a builtin numpy dtype.
        """
        if np.dtype(str(dtype)).isbuiltin:
            name = dtype.name
            if dtype.name == "bool":
                # the scalar type is spelled ``bool_`` in the numpy namespace
                name = "bool_"
            return f"_lpy_np.dtype(_lpy_np.{name})"
        raise Exception(f"dtype: {dtype} not recognized")

    # {{{ handle non numpy arguments

    def handle_non_numpy_arg(self, gen, arg):
        """Emit nothing: no conversion code is generated for the C target."""
        pass

    # }}}

    # {{{ handle allocation of unspecified arguments

    def handle_alloc(
            self, gen: CodeGenerator, arg: ArrayArg,
            strify: Callable[[Expression | tuple[Expression]], str],
            skip_arg_checks: bool) -> None:
        """
        Handle allocation of non-specified arguments for C-execution

        Emits invoker code that computes the shape and byte strides of
        *arg*, allocates it with ``_lpy_np.empty`` and, unless
        *skip_arg_checks* is set, asserts that the strides are positive and
        that the allocated array has the expected strides.

        :arg gen: code generator receiving the emitted lines.
        :arg arg: the array argument to allocate.
        :arg strify: turns an expression (or tuple of them) into source text.
        :arg skip_arg_checks: if true, no ``assert`` lines are emitted.
        """
        from pymbolic import var

        from loopy.kernel.array import get_strides

        assert isinstance(arg.shape, tuple)
        assert isinstance(arg.dtype, LoopyType)

        for i, axis_len in enumerate(arg.shape):
            gen("_lpy_shape_%d = %s" % (i, strify(axis_len)))

        # strides are in elements; the emitted values are in bytes
        strides = get_strides(arg)
        num_axes = len(strides)

        itemsize = arg.dtype.numpy_dtype.itemsize
        for i in range(num_axes):
            gen("_lpy_strides_%d = %s" % (i, strify(
                itemsize*strides[i])))

        if not skip_arg_checks:
            for i in range(num_axes):
                gen("assert _lpy_strides_%d > 0, "
                        "\"'%s' has negative stride in axis %d\""
                        % (i, arg.name, i))

        sym_strides = tuple(
                var("_lpy_strides_%d" % i)
                for i in range(num_axes))

        sym_shape = tuple(
                var("_lpy_shape_%d" % i)
                for i in range(num_axes))

        # find order of array: unit stride in the last axis means C order
        order = "'C'" if (arg.shape == () or strides[-1] == 1) else "'F'"

        gen(f"{arg.name} = _lpy_np.empty({strify(sym_shape)}, "
            f"{self.python_dtype_str(gen, arg.dtype.numpy_dtype)}, "
            f"order={order})")

        expected_strides = tuple(
                var("_lpy_expected_strides_%s" % i)
                for i in range(num_axes))

        gen("{} = {}.strides".format(strify(expected_strides), arg.name))

        # check strides
        if not skip_arg_checks:
            strides_check_expr = self.get_strides_check_expr(
                    [strify(s) for s in sym_shape],
                    [strify(s) for s in sym_strides],
                    [strify(s) for s in expected_strides])
            gen(f"assert {strides_check_expr}, "
                    f"'Strides of loopy created array {arg.name}, "
                    "do not match expected.'")

        # drop the per-axis temporaries from the invoker's namespace
        for i in range(num_axes):
            gen("del _lpy_shape_%d" % i)
            gen("del _lpy_strides_%d" % i)

        gen("")

    # }}}

    def target_specific_preamble(self, gen):
        """
        Add default C-imports to preamble
        """
        gen.add_to_preamble("import numpy as _lpy_np")
        gen.add_to_preamble(DEF_EVEN_DIV_FUNCTION)

    def initialize_system_args(self, gen):
        """
        Initializes possibly empty system arguments
        """
        pass

    # {{{ generate invocation

    def generate_invocation(self, gen: CodeGenerator, kernel: LoopKernel,
            kai: KernelArgInfo, host_program_name: str, args: Sequence[str]) -> None:
        """Emit a loop calling every entry of ``_lpy_c_kernels`` with *args*."""
        gen("for knl in _lpy_c_kernels:")
        with Indentation(gen):
            gen(f"knl({', '.join(args)})")

    # }}}

    # {{{ generate output handler

    def generate_output_handler(self, gen: CodeGenerator,
            kernel: LoopKernel, kai: KernelArgInfo) -> None:
        """
        Emit the invoker's ``return`` statement.

        The first returned element is always ``None``; the second holds the
        passed arguments flagged ``is_output``, as a dict keyed by name when
        ``kernel.options.return_dict`` is set and as a tuple otherwise.
        """
        out_names = [arg_name for arg_name in kai.passed_arg_names
                if kernel.arg_dict[arg_name].is_output]

        if kernel.options.return_dict:
            gen("return None, {%s}"
                    % ", ".join(f'"{arg_name}": {arg_name}'
                        for arg_name in out_names))
        elif out_names:
            gen(f"return None, ({', '.join(out_names)},)")
        else:
            gen("return None, ()")

    # }}}

    def generate_host_code(self, gen, codegen_result):
        # "host" code for C is embedded in the same file as the "device" code
        # this will enable a logical jumping off point for global barriers for
        # OpenMP, etc.
        pass

    def get_arg_pass(self, arg):
        """Return the expression used to pass *arg*: its plain name."""
        return arg.name
# }}}
# {{{ CCompiler
class CCompiler:
    """
    The compiler module handles invocation of compilers to generate a shared lib
    using codepy, which can subsequently be loaded via ctypes.

    The general strategy here is as follows:

    1.  A :class:`codepy.Toolchain` is guessed from distutils.
        The user may override any flags obtained therein by passing in arguments
        to cc, cflags, etc.

    2.  The kernel source is built into an object first, then made into a shared
        library using :meth:`codepy.jit.compile_from_string`, which additionally
        handles caching

    3.  The resulting shared library is turned into a :class:`ctypes.CDLL`
        to enable calling by the invoker generated by, e.g.,
        :class:`CExecutionWrapperGenerator`
    """

    def __init__(self, toolchain=None,
                 cc="gcc", cflags=None,
                 ldflags=None, libraries=None,
                 include_dirs=None, library_dirs=None, defines=None,
                 source_suffix="c"):
        """
        :arg toolchain: a ready-made toolchain; when given it is used as is
            and *cc*, *cflags*, etc. are ignored.
        :arg cc: compiler executable.
        :arg cflags: compile flags, defaults to ``-std=c99 -O3 -fPIC``.
        :arg ldflags: link flags, defaults to ``-shared``.
        :arg source_suffix: file extension of the source file handed to the
            compiler.
        """
        if cflags is None:
            cflags = "-std=c99 -O3 -fPIC".split()
        if ldflags is None:
            ldflags = "-shared".split()
        if libraries is None:
            libraries = []
        if include_dirs is None:
            include_dirs = []
        if library_dirs is None:
            library_dirs = []
        if defines is None:
            defines = []

        # try to get a default toolchain
        # or subclass supplied version if available
        self.toolchain = toolchain
        if toolchain is None:
            try:
                self.toolchain = guess_toolchain()
            except (ToolchainGuessError, ExecError):
                # missing compiler python was built with (likely, Conda)
                # use a default GCCToolchain
                # NOTE: Logger.warn() was removed in Python 3.13.
                logger.warning("Default toolchain guessed from python config "
                        "not found, replacing with default GCCToolchain.")
                # this is ugly, but I'm not sure there's a clean way to copy the
                # default args
                self.toolchain = GCCToolchain(
                    cc="gcc",
                    ld="ld",
                    cflags="-std=c99 -O3 -fPIC".split(),
                    ldflags="-shared".split(),
                    libraries=[],
                    library_dirs=[],
                    defines=[],
                    undefines=[],
                    so_ext=".so",
                    o_ext=".o",
                    include_dirs=[],
                    features=set())

            # copy in all differing values
            diff = {"cc": cc,
                    "cflags": cflags,
                    "ldflags": ldflags,
                    "libraries": libraries,
                    "include_dirs": include_dirs,
                    "library_dirs": library_dirs,
                    "defines": defines}
            # filter empty and those equal to toolchain defaults
            diff = {k: v for k, v in diff.items()
                    if v and (not hasattr(self.toolchain, k) or
                              getattr(self.toolchain, k) != v)}
            self.toolchain = self.toolchain.copy(**diff)

        # NOTE(review): this directory is never removed by this class.
        self.tempdir = tempfile.mkdtemp(prefix="tmp_loopy")
        self.source_suffix = source_suffix

    def _tempname(self, name):
        """Build temporary filename path in tempdir."""
        return os.path.join(self.tempdir, name)

    def build(self, name, code, debug=False, wait_on_error=None,
                     debug_recompile=True, extra_build_options: Sequence[str] = ()):
        """Compile code, build and load shared library.

        :arg name: kernel name, forwarded to
            :func:`codepy.jit.compile_from_string` and used in log messages.
        :arg code: the source text to compile.
        :arg wait_on_error: accepted but not used by this method.
        :arg extra_build_options: flags appended to the toolchain's
            ``cflags`` for this build only.
        :returns: the built library loaded as a :class:`ctypes.CDLL`.
        """
        logger.debug(code)
        c_fname = self._tempname("code." + self.source_suffix)

        # build object
        _, _mod_name, ext_file, recompiled = \
            compile_from_string(
                self.toolchain.copy(
                    cflags=[*self.toolchain.cflags, *extra_build_options]),
                name, code,
                source_name=c_fname,
                cache_dir=self.tempdir,
                debug=debug,
                debug_recompile=debug_recompile,
                object=False)

        if recompiled:
            logger.debug("Kernel %s compiled from source", name)
        else:
            logger.debug("Kernel %s retrieved from cache", name)

        # and return compiled
        return ctypes.CDLL(ext_file)
# }}}
# {{{ CPlusPlusCompiler
class CPlusPlusCompiler(CCompiler):
    """A :class:`CCompiler` whose defaults select ``g++`` and ``cpp`` sources."""

    def __init__(self, toolchain=None,
                 cc="g++", cflags=None,
                 ldflags=None, libraries=None,
                 include_dirs=None, library_dirs=None, defines=None,
                 source_suffix="cpp"):
        # Everything is forwarded unchanged; only the defaults of *cc* and
        # *source_suffix* differ from the base class.
        forwarded = {
            "toolchain": toolchain,
            "cc": cc,
            "cflags": cflags,
            "ldflags": ldflags,
            "libraries": libraries,
            "include_dirs": include_dirs,
            "library_dirs": library_dirs,
            "defines": defines,
            "source_suffix": source_suffix,
        }
        super().__init__(**forwarded)
# }}}
# {{{ placeholder till ctypes fixes: https://github.com/python/cpython/issues/61103
class Complex64(ctypes.Structure):
    """Two C ``float`` fields, ``real`` followed by ``imag``."""

    _fields_: ClassVar = [
        ("real", ctypes.c_float),
        ("imag", ctypes.c_float),
    ]


class Complex128(ctypes.Structure):
    """Two C ``double`` fields, ``real`` followed by ``imag``."""

    _fields_: ClassVar = [
        ("real", ctypes.c_double),
        ("imag", ctypes.c_double),
    ]


class Complex256(ctypes.Structure):
    """Two C ``long double`` fields, ``real`` followed by ``imag``."""

    _fields_: ClassVar = [
        ("real", ctypes.c_longdouble),
        ("imag", ctypes.c_longdouble),
    ]


# numpy complex scalar type -> ctypes structure standing in for it
_NUMPY_COMPLEX_TYPE_TO_CTYPE = {
    np.complex64: Complex64,
    np.complex128: Complex128,
}

# ``complex256`` is only present in some numpy builds
if hasattr(np, "complex256"):
    _NUMPY_COMPLEX_TYPE_TO_CTYPE[np.complex256] = Complex256
# }}}
# {{{ _args_to_ctypes
def _args_to_ctypes(kernel: LoopKernel, passed_names: Sequence[str]):
    """Return the ctypes type of each argument in *passed_names*, in order.

    Arguments that are :class:`ArrayBase` instances are represented by a
    pointer to their element type; all others by the element type itself.
    """

    def _dtype_to_ctype(dtype):
        """Map an argument dtype to the ctypes type of a single element."""
        if not dtype.is_complex():
            return np.ctypeslib.as_ctypes_type(dtype)

        # complex ctypes aren't exposed, use the structure placeholders
        return _NUMPY_COMPLEX_TYPE_TO_CTYPE[dtype.numpy_dtype.type]

    ctypes_of_args = []
    for name in passed_names:
        kernel_arg = kernel.arg_dict[name]
        element_type = _dtype_to_ctype(kernel_arg.dtype)
        ctypes_of_args.append(
            ctypes.POINTER(element_type)
            if isinstance(kernel_arg, ArrayBase)
            else element_type)

    return ctypes_of_args
# }}}
# {{{ CompiledCKernel
class CompiledCKernel:
    """
    A CompiledCKernel wraps a loopy kernel, compiling it and loading the
    result as a shared library, and provides access to the kernel as a
    ctypes function object, wrapped by the __call__ method, which attempts
    to automatically map argument types.
    """

    def __init__(self, kernel: LoopKernel, devprog: GeneratedProgram,
                 passed_names: Sequence[str], dev_code: str,
                 comp: CCompiler | None = None):
        """
        :arg kernel: supplies the build options and the argument types.
        :arg devprog: its ``name`` is the symbol looked up in the library.
        :arg passed_names: names of the arguments of the C function, in order.
        :arg dev_code: the source text to compile.
        :arg comp: compiler to build with; a default :class:`CCompiler`
            is created when omitted.
        """
        # get code and build
        self.code = dev_code
        self.comp = comp if comp is not None else CCompiler()
        self.dll = self.comp.build(devprog.name, self.code,
                           extra_build_options=kernel.options.build_options)

        # get the function declaration for interface with ctypes
        self._fn = getattr(self.dll, devprog.name)
        # kernels are void by defn.
        self._fn.restype = None
        self._fn.argtypes = _args_to_ctypes(kernel, passed_names)

    def __call__(self, *args):
        """Execute kernel with given args mapped to ctypes equivalents.

        Objects exposing a ``ctypes`` attribute (numpy arrays) are passed by
        data pointer; everything else is converted by calling the declared
        argument type on it.
        """
        args_ = []
        for arg, arg_t in zip(args, self._fn.argtypes):
            if hasattr(arg, "ctypes"):
                if arg.size == 0:
                    # TODO eliminate unused arguments from kernel
                    # An empty array has no data to point at.  Calling the
                    # argument type without a value yields a NULL pointer
                    # (or a zero scalar); ``arg_t(0.0)`` raises TypeError
                    # for pointer types.
                    arg_ = arg_t()
                else:
                    arg_ = arg.ctypes.data_as(arg_t)
            else:
                arg_ = arg_t(arg)
            args_.append(arg_)
        self._fn(*args_)
# }}}
@dataclass(frozen=True)
class _KernelInfo:
    """Immutable record of what is needed to run one typed translation unit."""

    # the translation unit the code was generated from
    t_unit: TranslationUnit
    # one compiled kernel per generated device program
    c_kernels: Sequence[CompiledCKernel]
    # generated Python wrapper, called with the compiled kernels first
    invoker: Callable[..., Any]
# {{{ CExecutor
class CExecutor(ExecutorBase):
    """An object connecting a kernel to a :class:`CompiledCKernel`
    for execution.

    .. automethod:: __init__
    .. automethod:: __call__
    """

    def __init__(self, program, entrypoint, compiler: CCompiler | None = None):
        """
        :arg program: forwarded unchanged to the base class constructor.
        :arg entrypoint: forwarded unchanged to the base class constructor;
            also used to look up the kernel to compile.
        :arg compiler: the :class:`CCompiler` used to build the generated
            code. A default :class:`CCompiler` is created when *None*.
        """
        # ``is not None`` (rather than truthiness) so that a supplied
        # compiler is never silently replaced, matching CompiledCKernel.
        self.compiler = compiler if compiler is not None else CCompiler()
        super().__init__(program, entrypoint)

    def get_invoker_uncached(self, kernel, entrypoint, codegen_result):
        """Generate the invoker with a fresh :class:`CExecutionWrapperGenerator`."""
        generator = CExecutionWrapperGenerator()
        return generator(kernel, entrypoint, codegen_result)

    def get_wrapper_generator(self):
        """Return a new :class:`CExecutionWrapperGenerator`."""
        return CExecutionWrapperGenerator()

    @memoize_method
    def translation_unit_info(self,
            arg_to_dtype: constantdict[str, LoopyType] | None = None) -> _KernelInfo:
        """
        Generate, optionally print/write/edit, and compile the code for the
        entrypoint, memoized per *arg_to_dtype*.

        :returns: a :class:`_KernelInfo` with the typed translation unit,
            one :class:`CompiledCKernel` per device program and the invoker.
        """
        t_unit = self.get_typed_and_scheduled_translation_unit(arg_to_dtype)
        kernel = t_unit[self.entrypoint]
        options = kernel.options

        from loopy.codegen import generate_code_v2
        codegen_result = generate_code_v2(t_unit)

        dev_code = codegen_result.device_code()
        host_code = codegen_result.host_code()
        all_code = "\n".join([dev_code, "", host_code])

        if options.write_code:
            output = all_code
            if options.allow_terminal_colors:
                output = get_highlighted_code(output)

            # ``write_code is True`` means print; any other truthy value
            # is used as the name of the file to write
            if options.write_code is True:
                print(output)
            else:
                with open(options.write_code, "w") as outf:
                    outf.write(output)

        if options.edit_code:
            from pytools import invoke_editor
            dev_code = invoke_editor(dev_code, "code.c")
            # update code from editor
            all_code = "\n".join([dev_code, "", host_code])

        from loopy.schedule.tools import get_kernel_arg_info
        kai = get_kernel_arg_info(kernel)

        # every device program is built from the same combined source
        c_kernels = [
            CompiledCKernel(kernel, dp, kai.passed_names, all_code,
                            self.compiler)
            for dp in codegen_result.device_programs]

        return _KernelInfo(
                t_unit=t_unit,
                c_kernels=c_kernels,
                invoker=self.get_invoker(t_unit, self.entrypoint, codegen_result))

    def __call__(self, *args, **kwargs):
        """
        :returns: ``(None, output)`` the output is a tuple of output arguments
            (arguments that are written as part of the kernel). The order is given
            by the order of kernel arguments. If this order is unspecified
            (such as when kernel arguments are inferred automatically),
            enable :attr:`loopy.Options.return_dict` to make *output* a
            :class:`dict` instead, with keys of argument names and values
            of the returned arrays.
        """

        if __debug__:
            self.check_for_required_array_arguments(kwargs.keys())

        if self.packing_controller is not None:
            kwargs = self.packing_controller(kwargs)

        program_info = self.translation_unit_info(self.arg_to_dtype(kwargs))

        return program_info.invoker(
                program_info.c_kernels, *args, **kwargs)
# }}}
# vim: foldmethod=marker
from __future__ import division, absolute_import from __future__ import annotations
__copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
...@@ -23,36 +24,73 @@ THE SOFTWARE. ...@@ -23,36 +24,73 @@ THE SOFTWARE.
""" """
from six.moves import range, zip from typing import TYPE_CHECKING
import numpy as np import numpy as np
from pymbolic.mapper import RecursiveMapper, IdentityMapper
from pymbolic.mapper.stringifier import (PREC_NONE, PREC_CALL, PREC_PRODUCT,
PREC_POWER,
PREC_UNARY, PREC_LOGICAL_OR, PREC_LOGICAL_AND)
import islpy as isl import islpy as isl
import pymbolic.primitives as p import pymbolic.primitives as p
from pymbolic import var from pymbolic import var
from pymbolic.mapper import IdentityMapper, Mapper
from pymbolic.mapper.stringifier import (
PREC_BITWISE_AND,
PREC_BITWISE_OR,
PREC_BITWISE_XOR,
PREC_CALL,
PREC_LOGICAL_AND,
PREC_LOGICAL_OR,
PREC_NONE,
PREC_PRODUCT,
PREC_SHIFT,
PREC_UNARY,
)
from loopy.diagnostic import LoopyError
from loopy.expression import dtype_to_type_context
from loopy.target.c import CExpression
from loopy.type_inference import TypeInferenceMapper, TypeReader
from loopy.types import LoopyType
from loopy.typing import Expression, is_integer
from loopy.expression import dtype_to_type_context if TYPE_CHECKING:
from loopy.type_inference import TypeInferenceMapper from loopy.codegen import CodeGenerationState
from loopy.symbolic import TypeCast
from loopy.diagnostic import LoopyError, LoopyWarning
from loopy.tools import is_integer __doc__ = """
from loopy.types import LoopyType .. currentmodule:: loopy.target.c.codegen.expression
.. autoclass:: ExpressionToCExpressionMapper
"""
# {{{ Loopy expression to C expression mapper # {{{ Loopy expression to C expression mapper
class ExpressionToCExpressionMapper(IdentityMapper): class ExpressionToCExpressionMapper(IdentityMapper):
def __init__(self, codegen_state, fortran_abi=False, type_inf_mapper=None): """
Mapper that converts a loopy-semantic expression to a C-semantic expression
with typecasts, appropriate arithmetic semantic mapping, etc.
.. note::
- All mapper methods take in an extra argument called *type_context*.
The purpose of *type_context* is to inform the method about the
expected type for untyped expressions such as python scalars. The
type of the expressions takes precedence over *type_context*.
"""
def __init__(self,
codegen_state: CodeGenerationState,
fortran_abi: bool = False,
type_inf_mapper: TypeInferenceMapper | None = None
) -> None:
self.kernel = codegen_state.kernel self.kernel = codegen_state.kernel
self.codegen_state = codegen_state self.codegen_state = codegen_state
if type_inf_mapper is None: if type_inf_mapper is None:
type_inf_mapper = TypeInferenceMapper(self.kernel) type_inf_mapper = TypeReader(self.kernel,
self.codegen_state.callables_table)
self.type_inf_mapper = type_inf_mapper self.type_inf_mapper = type_inf_mapper
self.allow_complex = codegen_state.allow_complex self.allow_complex = codegen_state.allow_complex
...@@ -65,7 +103,7 @@ class ExpressionToCExpressionMapper(IdentityMapper): ...@@ -65,7 +103,7 @@ class ExpressionToCExpressionMapper(IdentityMapper):
type_inf_mapper = self.type_inf_mapper.with_assignments(names_to_vars) type_inf_mapper = self.type_inf_mapper.with_assignments(names_to_vars)
return type(self)(self.codegen_state, self.fortran_abi, type_inf_mapper) return type(self)(self.codegen_state, self.fortran_abi, type_inf_mapper)
def infer_type(self, expr): def infer_type(self, expr: Expression) -> LoopyType:
result = self.type_inf_mapper(expr) result = self.type_inf_mapper(expr)
assert isinstance(result, LoopyType) assert isinstance(result, LoopyType)
...@@ -88,29 +126,29 @@ class ExpressionToCExpressionMapper(IdentityMapper): ...@@ -88,29 +126,29 @@ class ExpressionToCExpressionMapper(IdentityMapper):
return ary return ary
def wrap_in_typecast(self, actual_type, needed_dtype, s): def wrap_in_typecast(self, actual_type: LoopyType, needed_type: LoopyType, s):
if (actual_type.is_complex() and needed_dtype.is_complex() if actual_type != needed_type:
and actual_type != needed_dtype): registry = self.codegen_state.ast_builder.target.get_dtype_registry()
return var("%s_cast" % self.complex_type_name(needed_dtype))(s) cast = var("(%s) " % registry.dtype_to_ctype(needed_type))
elif not actual_type.is_complex() and needed_dtype.is_complex(): return cast(s)
return var("%s_fromreal" % self.complex_type_name(needed_dtype))(s)
else: return s
return s
def rec(self, expr, type_context=None, needed_dtype=None): def rec(self, expr, type_context=None, needed_type: LoopyType | None = None): # type: ignore[override]
if needed_dtype is None: result = super().rec(expr, type_context)
return RecursiveMapper.rec(self, expr, type_context)
return self.wrap_in_typecast( if needed_type is None:
self.infer_type(expr), needed_dtype, return result
RecursiveMapper.rec(self, expr, type_context)) else:
return self.wrap_in_typecast(
self.infer_type(expr), needed_type,
result)
def __call__(self, expr, prec=None, type_context=None, needed_dtype=None): def __call__(self, expr, prec=None, type_context=None, needed_dtype=None):
if prec is None: if prec is None:
prec = PREC_NONE prec = PREC_NONE
assert prec == PREC_NONE assert prec == PREC_NONE
from loopy.target.c import CExpression
return CExpression( return CExpression(
self.codegen_state.ast_builder.get_c_expression_to_code_mapper(), self.codegen_state.ast_builder.get_c_expression_to_code_mapper(),
self.rec(expr, type_context, needed_dtype)) self.rec(expr, type_context, needed_dtype))
...@@ -118,13 +156,15 @@ class ExpressionToCExpressionMapper(IdentityMapper): ...@@ -118,13 +156,15 @@ class ExpressionToCExpressionMapper(IdentityMapper):
# }}} # }}}
def map_variable(self, expr, type_context): def map_variable(self, expr, type_context):
from loopy.kernel.data import AddressSpace, ValueArg
def postproc(x): def postproc(x):
return x return x
if expr.name in self.codegen_state.var_subst_map: if expr.name in self.codegen_state.var_subst_map:
if self.kernel.options.annotate_inames: if self.kernel.options.annotate_inames:
return var( return var(
"/* %s */ %s" % ( "/* {} */ {}".format(
expr.name, expr.name,
self.rec(self.codegen_state.var_subst_map[expr.name], self.rec(self.codegen_state.var_subst_map[expr.name],
type_context))) type_context)))
...@@ -137,20 +177,29 @@ class ExpressionToCExpressionMapper(IdentityMapper): ...@@ -137,20 +177,29 @@ class ExpressionToCExpressionMapper(IdentityMapper):
if isinstance(arg, ArrayBase): if isinstance(arg, ArrayBase):
if arg.shape == (): if arg.shape == ():
if arg.offset: if arg.offset:
# FIXME
raise NotImplementedError("in-memory scalar with offset") from loopy.kernel.array import _apply_offset
from loopy.symbolic import simplify_using_aff
subscript = _apply_offset(0, arg)
result = self.make_subscript(
arg,
var(expr.name),
simplify_using_aff(
self.kernel, self.rec(subscript, "i")))
return result
else: else:
return var(expr.name)[0] return var(expr.name)[0]
else: else:
raise RuntimeError("unsubscripted reference to array '%s'" raise RuntimeError("unsubscripted reference to array '%s'"
% expr.name) % expr.name)
from loopy.kernel.data import ValueArg
if isinstance(arg, ValueArg) and self.fortran_abi: if isinstance(arg, ValueArg) and self.fortran_abi:
postproc = lambda x: x[0] # noqa postproc = lambda x: x[0] # noqa
elif expr.name in self.kernel.temporary_variables: elif expr.name in self.kernel.temporary_variables:
temporary = self.kernel.temporary_variables[expr.name] temporary = self.kernel.temporary_variables[expr.name]
if temporary.base_storage: if (temporary.base_storage
or temporary.address_space == AddressSpace.GLOBAL):
postproc = lambda x: x[0] # noqa postproc = lambda x: x[0] # noqa
result = self.kernel.mangle_symbol(self.codegen_state.ast_builder, expr.name) result = self.kernel.mangle_symbol(self.codegen_state.ast_builder, expr.name)
...@@ -163,14 +212,19 @@ class ExpressionToCExpressionMapper(IdentityMapper): ...@@ -163,14 +212,19 @@ class ExpressionToCExpressionMapper(IdentityMapper):
def map_tagged_variable(self, expr, type_context): def map_tagged_variable(self, expr, type_context):
return var(expr.name) return var(expr.name)
def map_sub_array_ref(self, expr, type_context):
from loopy.symbolic import get_start_subscript_from_sar
return var("&")(self.rec(get_start_subscript_from_sar(expr, self.kernel),
type_context))
def map_subscript(self, expr, type_context): def map_subscript(self, expr, type_context):
def base_impl(expr, type_context): def base_impl(expr, type_context):
return self.rec(expr.aggregate, type_context)[self.rec(expr.index, 'i')] return self.rec(expr.aggregate, type_context)[self.rec(expr.index, "i")]
def make_var(name): def make_var(name):
from loopy import TaggedVariable from loopy import TaggedVariable
if isinstance(expr.aggregate, TaggedVariable): if isinstance(expr.aggregate, TaggedVariable):
return TaggedVariable(name, expr.aggregate.tag) return TaggedVariable(name, expr.aggregate.tags)
else: else:
return var(name) return var(name)
...@@ -180,19 +234,18 @@ class ExpressionToCExpressionMapper(IdentityMapper): ...@@ -180,19 +234,18 @@ class ExpressionToCExpressionMapper(IdentityMapper):
ary = self.find_array(expr) ary = self.find_array(expr)
from loopy.kernel.array import get_access_info
from pymbolic import evaluate from pymbolic import evaluate
from loopy.kernel.array import get_access_info
from loopy.symbolic import simplify_using_aff from loopy.symbolic import simplify_using_aff
index_tuple = tuple( index_tuple = tuple(
simplify_using_aff(self.kernel, idx) for idx in expr.index_tuple) simplify_using_aff(self.kernel, idx) for idx in expr.index_tuple)
access_info = get_access_info(self.kernel.target, ary, index_tuple, access_info = get_access_info(self.kernel, ary, index_tuple,
lambda expr: evaluate(expr, self.codegen_state.var_subst_map), lambda expr: evaluate(expr, self.codegen_state.var_subst_map),
self.codegen_state.vectorization_info) self.codegen_state.vectorization_info)
from loopy.kernel.data import ( from loopy.kernel.data import ArrayArg, ConstantArg, ImageArg, TemporaryVariable
ImageArg, GlobalArg, TemporaryVariable, ConstantArg)
if isinstance(ary, ImageArg): if isinstance(ary, ImageArg):
extra_axes = 0 extra_axes = 0
...@@ -213,7 +266,7 @@ class ExpressionToCExpressionMapper(IdentityMapper): ...@@ -213,7 +266,7 @@ class ExpressionToCExpressionMapper(IdentityMapper):
base_access = var("read_imagef")( base_access = var("read_imagef")(
var(ary.name), var(ary.name),
var("loopy_sampler"), var("loopy_sampler"),
var("(%s)" % idx_vec_type)(*self.rec(idx_tuple, 'i'))) var("(%s)" % idx_vec_type)(*self.rec(idx_tuple, "i")))
if ary.dtype.numpy_dtype == np.float32: if ary.dtype.numpy_dtype == np.float32:
return base_access.attr("x") return base_access.attr("x")
...@@ -225,13 +278,16 @@ class ExpressionToCExpressionMapper(IdentityMapper): ...@@ -225,13 +278,16 @@ class ExpressionToCExpressionMapper(IdentityMapper):
raise NotImplementedError( raise NotImplementedError(
"non-floating-point images not supported for now") "non-floating-point images not supported for now")
elif isinstance(ary, (GlobalArg, TemporaryVariable, ConstantArg)): elif isinstance(ary, (ArrayArg, TemporaryVariable, ConstantArg)):
if len(access_info.subscripts) == 0: if len(access_info.subscripts) == 0:
if ( if (
(isinstance(ary, (ConstantArg, GlobalArg)) or isinstance(ary, (ConstantArg, ArrayArg)) or
(isinstance(ary, TemporaryVariable) and ary.base_storage))): (isinstance(ary, TemporaryVariable) and ary.base_storage)):
# unsubscripted global args are pointers # unsubscripted global args are pointers
result = make_var(access_info.array_name)[0] result = self.make_subscript(
ary,
make_var(access_info.array_name),
(0,))
else: else:
# unsubscripted temp vars are scalars # unsubscripted temp vars are scalars
...@@ -240,7 +296,11 @@ class ExpressionToCExpressionMapper(IdentityMapper): ...@@ -240,7 +296,11 @@ class ExpressionToCExpressionMapper(IdentityMapper):
else: else:
subscript, = access_info.subscripts subscript, = access_info.subscripts
result = make_var(access_info.array_name)[self.rec(subscript, 'i')] result = self.make_subscript(
ary,
make_var(access_info.array_name),
simplify_using_aff(
self.kernel, self.rec(subscript, "i")))
if access_info.vector_index is not None: if access_info.vector_index is not None:
return self.codegen_state.ast_builder.add_vector_access( return self.codegen_state.ast_builder.add_vector_access(
...@@ -249,13 +309,13 @@ class ExpressionToCExpressionMapper(IdentityMapper): ...@@ -249,13 +309,13 @@ class ExpressionToCExpressionMapper(IdentityMapper):
return result return result
else: else:
assert False raise AssertionError()
def map_linear_subscript(self, expr, type_context): def map_linear_subscript(self, expr, type_context):
from pymbolic.primitives import Variable from pymbolic.primitives import Variable
if not isinstance(expr.aggregate, Variable): if not isinstance(expr.aggregate, Variable):
raise RuntimeError("linear indexing on non-variable: %s" raise RuntimeError("linear indexing on non-variable: %s"
% expr) % expr)
if expr.aggregate.name in self.kernel.arg_dict: if expr.aggregate.name in self.kernel.arg_dict:
arg = self.kernel.arg_dict[expr.aggregate.name] arg = self.kernel.arg_dict[expr.aggregate.name]
...@@ -272,8 +332,10 @@ class ExpressionToCExpressionMapper(IdentityMapper): ...@@ -272,8 +332,10 @@ class ExpressionToCExpressionMapper(IdentityMapper):
else: else:
offset = 0 offset = 0
return var(expr.aggregate.name)[ return self.make_subscript(
self.rec(offset + expr.index, 'i')] arg,
var(expr.aggregate.name),
self.rec(offset + expr.index, "i"))
elif expr.aggregate.name in self.kernel.temporary_variables: elif expr.aggregate.name in self.kernel.temporary_variables:
raise RuntimeError("linear indexing is not supported on temporaries: %s" raise RuntimeError("linear indexing is not supported on temporaries: %s"
...@@ -283,7 +345,10 @@ class ExpressionToCExpressionMapper(IdentityMapper): ...@@ -283,7 +345,10 @@ class ExpressionToCExpressionMapper(IdentityMapper):
raise RuntimeError( raise RuntimeError(
"nothing known about variable '%s'" % expr.aggregate.name) "nothing known about variable '%s'" % expr.aggregate.name)
def map_floor_div(self, expr, type_context): def make_subscript(self, array, base_expr, subscript):
return base_expr[subscript]
def _map_integer_div_operator(self, base_func_name, op_func, expr, type_context):
from loopy.symbolic import get_dependencies from loopy.symbolic import get_dependencies
iname_deps = get_dependencies(expr) & self.kernel.all_inames() iname_deps = get_dependencies(expr) & self.kernel.all_inames()
domain = self.kernel.get_inames_domain(iname_deps) domain = self.kernel.get_inames_domain(iname_deps)
...@@ -292,40 +357,68 @@ class ExpressionToCExpressionMapper(IdentityMapper): ...@@ -292,40 +357,68 @@ class ExpressionToCExpressionMapper(IdentityMapper):
assumptions, domain = isl.align_two(assumption_non_param, domain) assumptions, domain = isl.align_two(assumption_non_param, domain)
domain = domain & assumptions domain = domain & assumptions
num_type = self.infer_type(expr.numerator)
den_type = self.infer_type(expr.denominator)
if not num_type.is_integral() or not den_type.is_integral():
raise NotImplementedError("remainder and floordiv "
"for floating-point types")
from loopy.isl_helpers import is_nonnegative from loopy.isl_helpers import is_nonnegative
num_nonneg = is_nonnegative(expr.numerator, domain) num_nonneg = is_nonnegative(expr.numerator, domain) \
den_nonneg = is_nonnegative(expr.denominator, domain) or num_type.numpy_dtype.kind == "u"
den_nonneg = is_nonnegative(expr.denominator, domain) \
or den_type.numpy_dtype.kind == "u"
result_dtype = self.infer_type(expr)
suffix = result_dtype.numpy_dtype.type.__name__
def seen_func(name): def seen_func(name):
idt = self.kernel.index_dtype
from loopy.codegen import SeenFunction from loopy.codegen import SeenFunction
self.codegen_state.seen_functions.add( self.codegen_state.seen_functions.add(
SeenFunction(name, name, (idt, idt))) SeenFunction(
name, f"{name}_{suffix}",
(result_dtype, result_dtype),
(result_dtype,)))
if den_nonneg: if den_nonneg:
if num_nonneg: if num_nonneg:
# parenthesize to avoid negative signs being dragged in from the return op_func(
# outside by associativity self.rec(expr.numerator, "i"),
return ( self.rec(expr.denominator, "i"))
self.rec(expr.numerator, type_context)
//
self.rec(expr.denominator, type_context))
else: else:
seen_func("int_floor_div_pos_b") seen_func(f"{base_func_name}_pos_b")
return var("int_floor_div_pos_b")( return var(f"{base_func_name}_pos_b_{suffix}")(
self.rec(expr.numerator, 'i'), self.rec(expr.numerator, "i"),
self.rec(expr.denominator, 'i')) self.rec(expr.denominator, "i"))
else: else:
seen_func("int_floor_div") seen_func(base_func_name)
return var("int_floor_div")( return var(f"{base_func_name}_{suffix}")(
self.rec(expr.numerator, 'i'), self.rec(expr.numerator, "i"),
self.rec(expr.denominator, 'i')) self.rec(expr.denominator, "i"))
def map_floor_div(self, expr, type_context):
import operator
return self._map_integer_div_operator(
"loopy_floor_div", operator.floordiv, expr, type_context)
def map_remainder(self, expr, type_context):
tgt_dtype = self.infer_type(expr)
if tgt_dtype.is_complex():
raise RuntimeError("complex remainder not defined")
import operator
return self._map_integer_div_operator(
"loopy_mod", operator.mod, expr, type_context)
def map_if(self, expr, type_context): def map_if(self, expr, type_context):
from loopy.types import to_loopy_type
result_type = self.infer_type(expr)
return type(expr)( return type(expr)(
self.rec(expr.condition, "i"), self.rec(expr.condition, type_context,
self.rec(expr.then, type_context), to_loopy_type(np.bool_)),
self.rec(expr.else_, type_context), self.rec(expr.then, type_context, result_type),
self.rec(expr.else_, type_context, result_type),
) )
def map_comparison(self, expr, type_context): def map_comparison(self, expr, type_context):
...@@ -338,333 +431,146 @@ class ExpressionToCExpressionMapper(IdentityMapper): ...@@ -338,333 +431,146 @@ class ExpressionToCExpressionMapper(IdentityMapper):
expr.operator, expr.operator,
self.rec(expr.right, inner_type_context)) self.rec(expr.right, inner_type_context))
def map_type_cast(self, expr: TypeCast, type_context: str):
return self.rec(expr.child, type_context, expr.type)
def map_constant(self, expr, type_context): def map_constant(self, expr, type_context):
from loopy.symbolic import Literal
if isinstance(expr, (complex, np.complexfloating)): if isinstance(expr, (complex, np.complexfloating)):
try: real = self.rec(expr.real, type_context)
dtype = expr.dtype imag = self.rec(expr.imag, type_context)
except AttributeError: iota = p.Variable("I" if "I" not in self.kernel.all_variable_names()
# (COMPLEX_GUESS_LOGIC) else "_Complex_I")
# This made it through type 'guessing' above, and it return real + imag*iota
# was concluded above (search for COMPLEX_GUESS_LOGIC), elif np.isnan(expr):
# that nothing was lost by using single precision. from warnings import warn
cast_type = "cfloat" warn("Encountered 'bare' floating point NaN value. Since NaN != NaN,"
" this leads to problems with cache retrieval."
" Consider using `pymbolic.primitives.NaN` instead of `math.nan`."
" The generated code will be equivalent with the added benefit"
" of sound pickling/unpickling of kernel objects.", stacklevel=1)
from pymbolic.primitives import NaN
data_type = expr.dtype.type if isinstance(expr, np.generic) else None
return self.map_nan(NaN(data_type), type_context)
elif np.isneginf(expr):
return -p.Variable("INFINITY")
elif np.isinf(expr):
return p.Variable("INFINITY")
elif isinstance(expr, np.generic):
# Explicitly typed: Generated code must reflect type exactly.
# FIXME: This assumes a 32-bit architecture.
if isinstance(expr, np.float32):
return Literal(repr(float(expr))+"f")
elif isinstance(expr, np.float64):
return Literal(repr(float(expr)))
# Disabled for now, possibly should be a subtarget.
# elif isinstance(expr, np.float128):
# return Literal(repr(expr)+"l")
elif isinstance(expr, np.integer):
suffix = ""
iinfo = np.iinfo(expr)
if iinfo.min == 0:
suffix += "u"
if iinfo.max > (2**31-1):
suffix += "l"
return Literal(repr(int(expr))+suffix)
elif isinstance(expr, np.bool_):
return Literal("true") if expr else Literal("false")
else: else:
if dtype == np.complex128: raise LoopyError("do not know how to generate code for "
cast_type = "cdouble" "constant of numpy type '%s'" % type(expr).__name__)
elif dtype == np.complex64:
cast_type = "cfloat"
else:
raise RuntimeError("unsupported complex type in expression "
"generation: %s" % type(expr))
return var("%s_new" % cast_type)(expr.real, expr.imag) elif np.isfinite(expr):
else:
from loopy.symbolic import Literal
if type_context == "f": if type_context == "f":
return Literal(repr(float(expr))+"f") return Literal(repr(float(expr))+"f")
elif type_context == "d": elif type_context == "d":
return Literal(repr(float(expr))) return Literal(repr(float(expr)))
elif type_context == "i": elif type_context in ["i", "b"]:
return int(expr) return int(expr)
else: else:
if is_integer(expr): if is_integer(expr):
return int(expr) return int(expr)
raise RuntimeError("don't know how to generate code " raise RuntimeError("don't know how to generate code "
"for constant '%s'" % expr) "for constant '%s'" % expr)
def map_call(self, expr, type_context):
from pymbolic.primitives import Variable, Subscript
identifier = expr.function
# {{{ implement indexof, indexof_vec
if identifier.name in ["indexof", "indexof_vec"]:
if len(expr.parameters) != 1:
raise LoopyError("%s takes exactly one argument" % identifier.name)
arg, = expr.parameters
if not isinstance(arg, Subscript):
raise LoopyError(
"argument to %s must be a subscript" % identifier.name)
ary = self.find_array(arg)
from loopy.kernel.array import get_access_info
from pymbolic import evaluate
access_info = get_access_info(self.kernel.target, ary, arg.index,
lambda expr: evaluate(expr, self.codegen_state.var_subst_map),
self.codegen_state.vectorization_info)
from loopy.kernel.data import ImageArg
if isinstance(ary, ImageArg):
raise LoopyError("%s does not support images" % identifier.name)
if identifier.name == "indexof":
return access_info.subscripts[0]
elif identifier.name == "indexof_vec":
from loopy.kernel.array import VectorArrayDimTag
ivec = None
for iaxis, dim_tag in enumerate(ary.dim_tags):
if isinstance(dim_tag, VectorArrayDimTag):
ivec = iaxis
if ivec is None:
return access_info.subscripts[0]
else:
return (
access_info.subscripts[0]*ary.shape[ivec]
+ access_info.vector_index)
else:
raise RuntimeError("should not get here")
# }}}
if isinstance(identifier, Variable):
identifier = identifier.name
par_dtypes = tuple(self.infer_type(par) for par in expr.parameters)
processed_parameters = None
mangle_result = self.kernel.mangle_function(
identifier, par_dtypes,
ast_builder=self.codegen_state.ast_builder)
if mangle_result is None:
raise RuntimeError("function '%s' unknown--"
"maybe you need to register a function mangler?"
% identifier)
if len(mangle_result.result_dtypes) != 1:
raise LoopyError("functions with more or fewer than one return value "
"may not be used in an expression")
if mangle_result.arg_dtypes is not None:
processed_parameters = tuple(
self.rec(par,
dtype_to_type_context(self.kernel.target, tgt_dtype),
tgt_dtype)
for par, par_dtype, tgt_dtype in zip(
expr.parameters, par_dtypes, mangle_result.arg_dtypes))
else: else:
# /!\ FIXME For some functions (e.g. 'sin'), it makes sense to raise LoopyError("don't know how to generate code "
# propagate the type context here. But for many others, it does "for constant '%s'" % expr)
# not. Using the inferred type as a stopgap for now.
processed_parameters = tuple(
self.rec(par,
type_context=dtype_to_type_context(
self.kernel.target, par_dtype))
for par, par_dtype in zip(expr.parameters, par_dtypes))
from warnings import warn def map_call(self, expr, type_context):
warn("Calling function '%s' with unknown C signature--" return (
"return CallMangleInfo.arg_dtypes" self.codegen_state.callables_table[
% identifier, LoopyWarning) expr.function.name].emit_call(
expression_to_code_mapper=self,
from loopy.codegen import SeenFunction expression=expr,
self.codegen_state.seen_functions.add( target=self.kernel.target))
SeenFunction(identifier,
mangle_result.target_name,
mangle_result.arg_dtypes or par_dtypes))
return var(mangle_result.target_name)(*processed_parameters)
# {{{ deal with complex-valued variables # {{{ deal with complex-valued variables
def complex_type_name(self, dtype):
from loopy.types import NumpyType
if not isinstance(dtype, NumpyType):
raise LoopyError("'%s' is not a complex type" % dtype)
if dtype.dtype == np.complex64:
return "cfloat"
if dtype.dtype == np.complex128:
return "cdouble"
else:
raise RuntimeError
def map_sum(self, expr, type_context):
def base_impl(expr, type_context):
return super(ExpressionToCExpressionMapper, self).map_sum(
expr, type_context)
# I've added 'type_context == "i"' because of the following
# idiotic corner case: Code generation for subscripts comes
# through here, and it may involve variables that we know
# nothing about (offsets and such). If we fall into the allow_complex
# branch, we'll try to do type inference on these variables,
# and stuff breaks. This band-aid works around that. -AK
if not self.allow_complex or type_context == "i":
return base_impl(expr, type_context)
tgt_dtype = self.infer_type(expr)
is_complex = tgt_dtype.is_complex()
if not is_complex:
return base_impl(expr, type_context)
else:
tgt_name = self.complex_type_name(tgt_dtype)
reals = []
complexes = []
for child in expr.children:
if self.infer_type(child).is_complex():
complexes.append(child)
else:
reals.append(child)
real_sum = p.flattened_sum([self.rec(r, type_context) for r in reals])
complex_sum = self.rec(complexes[0], type_context, tgt_dtype)
for child in complexes[1:]:
complex_sum = var("%s_add" % tgt_name)(
complex_sum,
self.rec(child, type_context, tgt_dtype))
if real_sum:
return var("%s_radd" % tgt_name)(real_sum, complex_sum)
else:
return complex_sum
def map_product(self, expr, type_context):
def base_impl(expr, type_context):
return super(ExpressionToCExpressionMapper, self).map_product(
expr, type_context)
# I've added 'type_context == "i"' because of the following
# idiotic corner case: Code generation for subscripts comes
# through here, and it may involve variables that we know
# nothing about (offsets and such). If we fall into the allow_complex
# branch, we'll try to do type inference on these variables,
# and stuff breaks. This band-aid works around that. -AK
if not self.allow_complex or type_context == "i":
return base_impl(expr, type_context)
tgt_dtype = self.infer_type(expr)
is_complex = tgt_dtype.is_complex()
if not is_complex:
return base_impl(expr, type_context)
else:
tgt_name = self.complex_type_name(tgt_dtype)
reals = []
complexes = []
for child in expr.children:
if self.infer_type(child).is_complex():
complexes.append(child)
else:
reals.append(child)
real_prd = p.flattened_product(
[self.rec(r, type_context) for r in reals])
complex_prd = self.rec(complexes[0], type_context, tgt_dtype)
for child in complexes[1:]:
complex_prd = var("%s_mul" % tgt_name)(
complex_prd,
self.rec(child, type_context, tgt_dtype))
if real_prd:
return var("%s_rmul" % tgt_name)(real_prd, complex_prd)
else:
return complex_prd
def map_quotient(self, expr, type_context): def map_quotient(self, expr, type_context):
def base_impl(expr, type_context, num_tgt_dtype=None):
num = self.rec(expr.numerator, type_context, num_tgt_dtype)
# analogous to ^{-1}
denom = self.rec(expr.denominator, type_context)
if (n_dtype.kind not in "fc"
and d_dtype.kind not in "fc"):
# must both be integers
if type_context == "f":
num = var("(float) ")(num)
denom = var("(float) ")(denom)
elif type_context == "d":
num = var("(double) ")(num)
denom = var("(double) ")(denom)
return type(expr)(num, denom)
n_dtype = self.infer_type(expr.numerator).numpy_dtype n_dtype = self.infer_type(expr.numerator).numpy_dtype
d_dtype = self.infer_type(expr.denominator).numpy_dtype d_dtype = self.infer_type(expr.denominator).numpy_dtype
if not self.allow_complex: num = self.rec(expr.numerator, type_context)
return base_impl(expr, type_context)
n_complex = 'c' == n_dtype.kind
d_complex = 'c' == d_dtype.kind
tgt_dtype = self.infer_type(expr)
if not (n_complex or d_complex): # analogous to ^{-1}
return base_impl(expr, type_context) denom = self.rec(expr.denominator, type_context)
elif n_complex and not d_complex:
return var("%s_divider" % self.complex_type_name(tgt_dtype))(
self.rec(expr.numerator, type_context, tgt_dtype),
self.rec(expr.denominator, type_context))
elif not n_complex and d_complex:
return var("%s_rdivide" % self.complex_type_name(tgt_dtype))(
self.rec(expr.numerator, type_context),
self.rec(expr.denominator, type_context, tgt_dtype))
else:
return var("%s_divide" % self.complex_type_name(tgt_dtype))(
self.rec(expr.numerator, type_context, tgt_dtype),
self.rec(expr.denominator, type_context, tgt_dtype))
def map_remainder(self, expr, type_context): if (n_dtype.kind not in "fc"
tgt_dtype = self.infer_type(expr) and d_dtype.kind not in "fc"):
if tgt_dtype.is_complex(): # must both be integers
raise RuntimeError("complex remainder not defined") if type_context == "f":
num = var("(float) ")(num)
denom = var("(float) ")(denom)
elif type_context == "d":
num = var("(double) ")(num)
denom = var("(double) ")(denom)
return super(ExpressionToCExpressionMapper, self).map_remainder( return type(expr)(num, denom)
expr, type_context)
def map_power(self, expr, type_context): def map_power(self, expr, type_context):
def base_impl(expr, type_context):
from pymbolic.primitives import is_constant, is_zero
if is_constant(expr.exponent):
if is_zero(expr.exponent):
return 1
elif is_zero(expr.exponent - 1):
return self.rec(expr.base, type_context)
elif is_zero(expr.exponent - 2):
return self.rec(expr.base*expr.base, type_context)
return type(expr)(
self.rec(expr.base, type_context),
self.rec(expr.exponent, type_context))
if not self.allow_complex:
return base_impl(expr, type_context)
tgt_dtype = self.infer_type(expr) tgt_dtype = self.infer_type(expr)
if tgt_dtype.is_complex(): base_dtype = self.infer_type(expr.base)
if expr.exponent in [2, 3, 4]: exponent_dtype = self.infer_type(expr.exponent)
value = expr.base
for i in range(expr.exponent-1): from pymbolic.primitives import is_constant, is_zero
value = value * expr.base if is_constant(expr.exponent):
return self.rec(value, type_context) if is_zero(expr.exponent):
else: return 1
b_complex = self.infer_type(expr.base).is_complex() elif is_zero(expr.exponent - 1):
e_complex = self.infer_type(expr.exponent).is_complex() return self.rec(expr.base, type_context)
elif is_zero(expr.exponent - 2):
if b_complex and not e_complex: return self.rec(expr.base*expr.base, type_context)
return var("%s_powr" % self.complex_type_name(tgt_dtype))(
self.rec(expr.base, type_context, tgt_dtype), if exponent_dtype.is_integral():
self.rec(expr.exponent, type_context)) from loopy.codegen import SeenFunction
else: func_name = ("loopy_pow_"
return var("%s_pow" % self.complex_type_name(tgt_dtype))( f"{tgt_dtype.numpy_dtype}_{exponent_dtype.numpy_dtype}")
self.rec(expr.base, type_context, tgt_dtype),
self.rec(expr.exponent, type_context, tgt_dtype))
return base_impl(expr, type_context) self.codegen_state.seen_functions.add(
SeenFunction(
"int_pow", func_name,
(tgt_dtype, exponent_dtype),
(tgt_dtype, )))
# FIXME: This need some more callables to be registered.
return var(func_name)(self.rec(expr.base, type_context),
self.rec(expr.exponent, type_context))
else:
from loopy.codegen import SeenFunction
clbl = self.codegen_state.ast_builder.known_callables["pow"]
clbl = clbl.with_types({0: tgt_dtype, 1: exponent_dtype},
self.codegen_state.callables_table)[0]
self.codegen_state.seen_functions.add(
SeenFunction(
clbl.name, clbl.name_in_target,
(base_dtype, exponent_dtype),
(tgt_dtype,)))
return var(clbl.name_in_target)(self.rec(expr.base, type_context),
self.rec(expr.exponent, type_context))
# }}} # }}}
...@@ -674,12 +580,42 @@ class ExpressionToCExpressionMapper(IdentityMapper): ...@@ -674,12 +580,42 @@ class ExpressionToCExpressionMapper(IdentityMapper):
def map_local_hw_index(self, expr, type_context): def map_local_hw_index(self, expr, type_context):
raise LoopyError("plain C does not have local hw axes") raise LoopyError("plain C does not have local hw axes")
def map_nan(self, expr, type_context):
from loopy.types import NumpyType
if expr.data_type is None:
if type_context == "f":
return p.Variable("NAN")
elif type_context == "d":
registry = self.codegen_state.ast_builder.target.get_dtype_registry()
lpy_type = NumpyType(np.dtype(np.float32))
cast = var("(%s)" % registry.dtype_to_ctype(lpy_type))
return cast(p.Variable("NAN"))
else:
raise NotImplementedError("lowering NaN with type context"
f" '{type_context}'.")
else:
if isinstance(expr.data_type(float("nan")), np.float32):
return p.Variable("NAN")
elif isinstance(expr.data_type(float("nan")), np.floating):
registry = self.codegen_state.ast_builder.target.get_dtype_registry()
lpy_type = NumpyType(np.dtype(expr.data_type))
cast = var("(%s)" % registry.dtype_to_ctype(lpy_type))
return cast(p.Variable("NAN"))
elif isinstance(expr.data_type(float("nan")), np.complexfloating):
real_dtype = np.empty(0, dtype=expr.data_type).real.dtype.type
return self.map_constant(real_dtype("nan") + expr.data_type(1j),
type_context)
else:
raise NotImplementedError(f"{type(self.kernel.target)} does not"
f" support NaNs of type {expr.data_type}.")
# }}} # }}}
# {{{ C expression to code mapper # {{{ C expression to code mapper
class CExpressionToCodeMapper(RecursiveMapper): class CExpressionToCodeMapper(Mapper):
# {{{ helpers # {{{ helpers
def parenthesize_if_needed(self, s, enclosing_prec, my_prec): def parenthesize_if_needed(self, s, enclosing_prec, my_prec):
...@@ -688,10 +624,21 @@ class CExpressionToCodeMapper(RecursiveMapper): ...@@ -688,10 +624,21 @@ class CExpressionToCodeMapper(RecursiveMapper):
else: else:
return s return s
def join_rec(self, joiner, iterable, prec): def join_rec(self, joiner, iterable, prec, force_parens_around=()):
f = joiner.join("%s" for i in iterable) f = joiner.join("%s" for i in iterable)
return f % tuple( return f % tuple(
self.rec(i, prec) for i in iterable) self.rec_with_force_parens_around(
i, prec, force_parens_around=force_parens_around)
for i in iterable)
def rec_with_force_parens_around(
self, expr, enclosing_prec, force_parens_around=()):
result = self.rec(expr, enclosing_prec)
if isinstance(expr, force_parens_around):
result = "(%s)" % result
return result
def join(self, joiner, iterable): def join(self, joiner, iterable):
f = joiner.join("%s" for i in iterable) f = joiner.join("%s" for i in iterable)
...@@ -700,18 +647,30 @@ class CExpressionToCodeMapper(RecursiveMapper): ...@@ -700,18 +647,30 @@ class CExpressionToCodeMapper(RecursiveMapper):
# }}} # }}}
def map_constant(self, expr, prec): def map_constant(self, expr, prec):
return repr(expr) if isinstance(expr, np.generic):
if isinstance(expr, np.integer):
# FIXME: Add type suffixes?
return repr(int(expr))
elif isinstance(expr, np.float32):
return f"{float(expr)!r}f"
elif isinstance(expr, np.float64):
return repr(float(expr))
else:
raise NotImplementedError(
f"unimplemented numpy-to-C conversion: {type(expr)}")
else:
return repr(expr)
def map_call(self, expr, enclosing_prec): def map_call(self, expr, enclosing_prec):
from pymbolic.mapper.stringifier import PREC_CALL, PREC_NONE
from pymbolic.primitives import Variable from pymbolic.primitives import Variable
from pymbolic.mapper.stringifier import PREC_NONE, PREC_CALL
if isinstance(expr.function, Variable): if isinstance(expr.function, Variable):
func = expr.function.name func = expr.function.name
else: else:
func = self.rec(expr.function, PREC_CALL) func = self.rec(expr.function, PREC_CALL+1)
return self.parenthesize_if_needed( return self.parenthesize_if_needed(
"%s(%s)" % ( "{}({})".format(
func, func,
self.join_rec(", ", expr.parameters, PREC_NONE)), self.join_rec(", ", expr.parameters, PREC_NONE)),
enclosing_prec, PREC_CALL) enclosing_prec, PREC_CALL)
...@@ -727,25 +686,17 @@ class CExpressionToCodeMapper(RecursiveMapper): ...@@ -727,25 +686,17 @@ class CExpressionToCodeMapper(RecursiveMapper):
def map_lookup(self, expr, enclosing_prec): def map_lookup(self, expr, enclosing_prec):
return self.parenthesize_if_needed( return self.parenthesize_if_needed(
"%s.%s" % ( "{}.{}".format(
self.rec(expr.aggregate, PREC_CALL), expr.name), self.rec(expr.aggregate, PREC_CALL), expr.name),
enclosing_prec, PREC_CALL) enclosing_prec, PREC_CALL)
def map_subscript(self, expr, enclosing_prec): def map_subscript(self, expr, enclosing_prec):
return self.parenthesize_if_needed( return self.parenthesize_if_needed(
"%s[%s]" % ( "{}[{}]".format(
self.rec(expr.aggregate, PREC_CALL), self.rec(expr.aggregate, PREC_CALL+1),
self.rec(expr.index, PREC_NONE)), self.rec(expr.index, PREC_NONE)),
enclosing_prec, PREC_CALL) enclosing_prec, PREC_CALL)
def map_floor_div(self, expr, enclosing_prec):
# parenthesize to avoid negative signs being dragged in from the
# outside by associativity
return "(%s / %s)" % (
self.rec(expr.numerator, PREC_PRODUCT),
# analogous to ^{-1}
self.rec(expr.denominator, PREC_POWER))
def map_min(self, expr, enclosing_prec): def map_min(self, expr, enclosing_prec):
what = type(expr).__name__.lower() what = type(expr).__name__.lower()
...@@ -753,7 +704,7 @@ class CExpressionToCodeMapper(RecursiveMapper): ...@@ -753,7 +704,7 @@ class CExpressionToCodeMapper(RecursiveMapper):
result = self.rec(children.pop(), PREC_NONE) result = self.rec(children.pop(), PREC_NONE)
while children: while children:
result = "%s(%s, %s)" % (what, result = "{}({}, {})".format(what,
self.rec(children.pop(), PREC_NONE), self.rec(children.pop(), PREC_NONE),
result) result)
...@@ -762,9 +713,13 @@ class CExpressionToCodeMapper(RecursiveMapper): ...@@ -762,9 +713,13 @@ class CExpressionToCodeMapper(RecursiveMapper):
map_max = map_min map_max = map_min
def map_if(self, expr, enclosing_prec): def map_if(self, expr, enclosing_prec):
from pymbolic.mapper.stringifier import PREC_NONE from pymbolic.mapper.stringifier import PREC_CALL, PREC_NONE
return "(%s ? %s : %s)" % ( return "({} ? {} : {})".format(
self.rec(expr.condition, PREC_NONE), # Force parentheses around the condition to prevent compiler
# warnings regarding precedence (e.g. with POCL 1.8/LLVM 12):
# "warning: pocl-cache/tempfile_BYDWne.cl:96:2241: operator '?:'
# has lower precedence than '*'; '*' will be evaluated first"
self.rec(expr.condition, PREC_CALL),
self.rec(expr.then, PREC_NONE), self.rec(expr.then, PREC_NONE),
self.rec(expr.else_, PREC_NONE), self.rec(expr.else_, PREC_NONE),
) )
...@@ -773,7 +728,7 @@ class CExpressionToCodeMapper(RecursiveMapper): ...@@ -773,7 +728,7 @@ class CExpressionToCodeMapper(RecursiveMapper):
from pymbolic.mapper.stringifier import PREC_COMPARISON from pymbolic.mapper.stringifier import PREC_COMPARISON
return self.parenthesize_if_needed( return self.parenthesize_if_needed(
"%s %s %s" % ( "{} {} {}".format(
self.rec(expr.left, PREC_COMPARISON), self.rec(expr.left, PREC_COMPARISON),
expr.operator, expr.operator,
self.rec(expr.right, PREC_COMPARISON)), self.rec(expr.right, PREC_COMPARISON)),
...@@ -782,6 +737,16 @@ class CExpressionToCodeMapper(RecursiveMapper): ...@@ -782,6 +737,16 @@ class CExpressionToCodeMapper(RecursiveMapper):
def map_literal(self, expr, enclosing_prec): def map_literal(self, expr, enclosing_prec):
return expr.s return expr.s
def map_left_shift(self, expr, enclosing_prec):
return self.parenthesize_if_needed(
self.join_rec(" << ", (expr.shiftee, expr.shift), PREC_SHIFT),
enclosing_prec, PREC_SHIFT)
def map_right_shift(self, expr, enclosing_prec):
return self.parenthesize_if_needed(
self.join_rec(" >> ", (expr.shiftee, expr.shift), PREC_SHIFT),
enclosing_prec, PREC_SHIFT)
def map_logical_not(self, expr, enclosing_prec): def map_logical_not(self, expr, enclosing_prec):
return self.parenthesize_if_needed( return self.parenthesize_if_needed(
"!" + self.rec(expr.child, PREC_UNARY), "!" + self.rec(expr.child, PREC_UNARY),
...@@ -807,6 +772,26 @@ class CExpressionToCodeMapper(RecursiveMapper): ...@@ -807,6 +772,26 @@ class CExpressionToCodeMapper(RecursiveMapper):
result = "(%s)" % result result = "(%s)" % result
return result return result
def map_bitwise_not(self, expr, enclosing_prec):
return self.parenthesize_if_needed(
"~" + self.rec(expr.child, PREC_UNARY),
enclosing_prec, PREC_UNARY)
def map_bitwise_and(self, expr, enclosing_prec):
return self.parenthesize_if_needed(
self.join_rec(" & ", expr.children, PREC_BITWISE_AND),
enclosing_prec, PREC_BITWISE_AND)
def map_bitwise_or(self, expr, enclosing_prec):
return self.parenthesize_if_needed(
self.join_rec(" | ", expr.children, PREC_BITWISE_OR),
enclosing_prec, PREC_BITWISE_OR)
def map_bitwise_xor(self, expr, enclosing_prec):
return self.parenthesize_if_needed(
self.join_rec(" ^ ", expr.children, PREC_BITWISE_XOR),
enclosing_prec, PREC_BITWISE_XOR)
def map_sum(self, expr, enclosing_prec): def map_sum(self, expr, enclosing_prec):
from pymbolic.mapper.stringifier import PREC_SUM from pymbolic.mapper.stringifier import PREC_SUM
...@@ -814,38 +799,43 @@ class CExpressionToCodeMapper(RecursiveMapper): ...@@ -814,38 +799,43 @@ class CExpressionToCodeMapper(RecursiveMapper):
self.join_rec(" + ", expr.children, PREC_SUM), self.join_rec(" + ", expr.children, PREC_SUM),
enclosing_prec, PREC_SUM) enclosing_prec, PREC_SUM)
multiplicative_primitives = (p.Product, p.Quotient, p.FloorDiv, p.Remainder)
def map_product(self, expr, enclosing_prec): def map_product(self, expr, enclosing_prec):
# Spaces prevent '**z' (times dereference z), which force_parens_around = (p.Quotient, p.FloorDiv, p.Remainder)
# is hard to read.
# Spaces prevent '**z' (times dereference z), which is hard to read.
return self.parenthesize_if_needed( return self.parenthesize_if_needed(
self.join_rec(" * ", expr.children, PREC_PRODUCT), self.join_rec(" * ", expr.children, PREC_PRODUCT,
force_parens_around=force_parens_around),
enclosing_prec, PREC_PRODUCT) enclosing_prec, PREC_PRODUCT)
def map_quotient(self, expr, enclosing_prec): def _map_division_operator(self, operator, expr, enclosing_prec):
num = self.rec(expr.numerator, PREC_PRODUCT) num_s = self.rec_with_force_parens_around(expr.numerator, PREC_PRODUCT,
force_parens_around=self.multiplicative_primitives)
# analogous to ^{-1} denom_s = self.rec_with_force_parens_around(expr.denominator, PREC_PRODUCT,
denom = self.rec(expr.denominator, PREC_POWER) force_parens_around=self.multiplicative_primitives)
return self.parenthesize_if_needed( return self.parenthesize_if_needed(
"%s / %s" % ( f"{num_s} {operator} {denom_s}",
# Space is necessary--otherwise '/*' # Space is necessary--otherwise '/*'
# (i.e. divide-dererference) becomes # (i.e. divide-dererference) becomes
# start-of-comment in C. # start-of-comment in C.
num,
denom),
enclosing_prec, PREC_PRODUCT) enclosing_prec, PREC_PRODUCT)
def map_quotient(self, expr, enclosing_prec):
return self._map_division_operator("/", expr, enclosing_prec)
def map_floor_div(self, expr, enclosing_prec):
return self._map_division_operator("/", expr, enclosing_prec)
def map_remainder(self, expr, enclosing_prec): def map_remainder(self, expr, enclosing_prec):
return "(%s %% %s)" % ( return self._map_division_operator("%", expr, enclosing_prec)
self.rec(expr.numerator, PREC_PRODUCT),
# PREC_POWER analogous to ^{-1}
self.rec(expr.denominator, PREC_POWER))
def map_power(self, expr, enclosing_prec): def map_power(self, expr, enclosing_prec):
return "pow(%s, %s)" % ( raise RuntimeError(f"'{expr}' should have been transformed to 'Call'"
self.rec(expr.base, PREC_NONE), " expression node.")
self.rec(expr.exponent, PREC_NONE))
def map_array_literal(self, expr, enclosing_prec): def map_array_literal(self, expr, enclosing_prec):
return "{ %s }" % self.join_rec(", ", expr.children, PREC_NONE) return "{ %s }" % self.join_rec(", ", expr.children, PREC_NONE)
......
Subproject commit 11dc00352423cddd71f09e809d0a22ab1c3ea7a5 Subproject commit 955160ac2f504dabcd8641471a56146fa1afe35d
"""CUDA target independent of PyCUDA.""" """CUDA target independent of PyCUDA."""
from __future__ import annotations
from __future__ import division, absolute_import
__copyright__ = "Copyright (C) 2015 Andreas Kloeckner" __copyright__ = "Copyright (C) 2015 Andreas Kloeckner"
...@@ -24,16 +24,32 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ...@@ -24,16 +24,32 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. THE SOFTWARE.
""" """
from typing import TYPE_CHECKING, Sequence
import numpy as np import numpy as np
from cgen import Const, Declarator, Generable, Pointer
from pymbolic import var
from pytools import memoize_method from pytools import memoize_method
from loopy.target.c import CTarget, CASTBuilder from loopy.diagnostic import LoopyError, LoopyTypeError
from loopy.kernel.array import ArrayBase, FixedStrideArrayDimTag, VectorArrayDimTag
from loopy.kernel.data import (
AddressSpace,
ArrayArg,
ConstantArg,
ImageArg,
TemporaryVariable,
)
from loopy.kernel.function_interface import ScalarCallable
from loopy.target.c import CFamilyASTBuilder, CFamilyTarget
from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper
from loopy.diagnostic import LoopyError
from loopy.types import NumpyType from loopy.types import NumpyType
from loopy.kernel.data import temp_var_scope
from pymbolic import var
if TYPE_CHECKING:
from loopy.codegen import CodeGenerationState
from loopy.codegen.result import CodeGenerationResult
# {{{ vector types # {{{ vector types
...@@ -45,7 +61,8 @@ class vec: # noqa ...@@ -45,7 +61,8 @@ class vec: # noqa
def _create_vector_types(): def _create_vector_types():
field_names = ["x", "y", "z", "w"] field_names = ["x", "y", "z", "w"]
if tuple.__itemsize__ * 8 == 32: import sys
if sys.maxsize <= 2**33:
long_dtype = np.int32 long_dtype = np.int32
ulong_dtype = np.uint32 ulong_dtype = np.uint32
else: else:
...@@ -57,18 +74,18 @@ def _create_vector_types(): ...@@ -57,18 +74,18 @@ def _create_vector_types():
vec.type_to_scalar_and_count = {} vec.type_to_scalar_and_count = {}
for base_name, base_type, counts in [ for base_name, base_type, counts in [
('char', np.int8, [1, 2, 3, 4]), ("char", np.int8, [1, 2, 3, 4]),
('uchar', np.uint8, [1, 2, 3, 4]), ("uchar", np.uint8, [1, 2, 3, 4]),
('short', np.int16, [1, 2, 3, 4]), ("short", np.int16, [1, 2, 3, 4]),
('ushort', np.uint16, [1, 2, 3, 4]), ("ushort", np.uint16, [1, 2, 3, 4]),
('int', np.int32, [1, 2, 3, 4]), ("int", np.int32, [1, 2, 3, 4]),
('uint', np.uint32, [1, 2, 3, 4]), ("uint", np.uint32, [1, 2, 3, 4]),
('long', long_dtype, [1, 2, 3, 4]), ("long", long_dtype, [1, 2, 3, 4]),
('ulong', ulong_dtype, [1, 2, 3, 4]), ("ulong", ulong_dtype, [1, 2, 3, 4]),
('longlong', np.int64, [1, 2]), ("longlong", np.int64, [1, 2]),
('ulonglong', np.uint64, [1, 2]), ("ulonglong", np.uint64, [1, 2]),
('float', np.float32, [1, 2, 3, 4]), ("float", np.float32, [1, 2, 3, 4]),
('double', np.float64, [1, 2]), ("double", np.float64, [1, 2]),
]: ]:
for count in counts: for count in counts:
name = "%s%d" % (base_name, count) name = "%s%d" % (base_name, count)
...@@ -80,10 +97,10 @@ def _create_vector_types(): ...@@ -80,10 +97,10 @@ def _create_vector_types():
titles.extend((len(names)-len(titles))*[None]) titles.extend((len(names)-len(titles))*[None])
try: try:
dtype = np.dtype(dict( dtype = np.dtype({
names=names, "names": names,
formats=[base_type]*count, "formats": [base_type]*count,
titles=titles)) "titles": titles})
except NotImplementedError: except NotImplementedError:
try: try:
dtype = np.dtype([((n, title), base_type) dtype = np.dtype([((n, title), base_type)
...@@ -110,31 +127,82 @@ def _register_vector_types(dtype_registry): ...@@ -110,31 +127,82 @@ def _register_vector_types(dtype_registry):
# }}} # }}}
# {{{ function mangler # {{{ function scoper
_CUDA_SPECIFIC_FUNCTIONS = {
"rsqrt": 1,
"atan2": 2,
}
class CudaCallable(ScalarCallable):
def cuda_function_mangler(kernel, name, arg_dtypes): def cuda_with_types(self, arg_id_to_dtype, callables_table):
if not isinstance(name, str):
return None
if name in ["max", "min"] and len(arg_dtypes) == 2: name = self.name
dtype = np.find_common_type([], arg_dtypes)
if dtype.kind == "c": if name in _CUDA_SPECIFIC_FUNCTIONS:
raise RuntimeError("min/max do not support complex numbers") num_args = _CUDA_SPECIFIC_FUNCTIONS[name]
if dtype.kind == "f": # {{{ sanity checks
name = "f" + name
return dtype, name for id, dtype in arg_id_to_dtype.items():
if not -1 <= id < num_args:
raise LoopyError("%s can take only %d arguments." % (name,
num_args))
if name in "atan2" and len(arg_dtypes) == 2: if dtype is not None and dtype.kind == "c":
return arg_dtypes[0], name raise LoopyTypeError(
f"'{name}' does not support complex arguments.")
if name == "dot": # }}}
scalar_dtype, offset, field_name = arg_dtypes[0].fields["x"]
return scalar_dtype, name
return None for i in range(num_args):
if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None:
# the types provided aren't mature enough to specialize the
# callable
return (
self.copy(arg_id_to_dtype=arg_id_to_dtype),
callables_table)
dtype = np.result_type(*[
dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items()
if id >= 0])
updated_arg_id_to_dtype = {id: NumpyType(dtype)
for id in range(-1, num_args)}
return (
self.copy(name_in_target=name,
arg_id_to_dtype=updated_arg_id_to_dtype),
callables_table)
if name == "dot":
# CUDA dot function:
# Performs dot product. Input types: vector and return type: scalar.
for i in range(2):
if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None:
# the types provided aren't mature enough to specialize the
# callable
return (
self.copy(arg_id_to_dtype=arg_id_to_dtype),
callables_table)
input_dtype = arg_id_to_dtype[0]
scalar_dtype, _offset, _field_name = input_dtype.fields["x"]
return_dtype = scalar_dtype
return self.copy(arg_id_to_dtype={0: input_dtype, 1: input_dtype,
-1: return_dtype})
return (
self.copy(arg_id_to_dtype=arg_id_to_dtype),
callables_table)
def get_cuda_callables():
cuda_func_ids = {"dot"} | set(_CUDA_SPECIFIC_FUNCTIONS)
return {id_: CudaCallable(name=id_) for id_ in cuda_func_ids}
# }}} # }}}
...@@ -144,31 +212,18 @@ def cuda_function_mangler(kernel, name, arg_dtypes): ...@@ -144,31 +212,18 @@ def cuda_function_mangler(kernel, name, arg_dtypes):
class ExpressionToCudaCExpressionMapper(ExpressionToCExpressionMapper): class ExpressionToCudaCExpressionMapper(ExpressionToCExpressionMapper):
_GRID_AXES = "xyz" _GRID_AXES = "xyz"
@staticmethod
def _get_index_ctype(kernel):
if kernel.index_dtype.numpy_dtype == np.int32:
return "int32_t"
elif kernel.index_dtype.numpy_dtype == np.int64:
return "int64_t"
else:
raise LoopyError("unexpected index type")
def map_group_hw_index(self, expr, type_context): def map_group_hw_index(self, expr, type_context):
return var("((%s) blockIdx.%s)" % ( return var(f"bIdx({self._GRID_AXES[expr.axis]})")
self._get_index_ctype(self.kernel),
self._GRID_AXES[expr.axis]))
def map_local_hw_index(self, expr, type_context): def map_local_hw_index(self, expr, type_context):
return var("((%s) threadIdx.%s)" % ( return var(f"tIdx({self._GRID_AXES[expr.axis]})")
self._get_index_ctype(self.kernel),
self._GRID_AXES[expr.axis]))
# }}} # }}}
# {{{ target # {{{ target
class CudaTarget(CTarget): class CudaTarget(CFamilyTarget):
"""A target for Nvidia's CUDA GPU programming language.""" """A target for Nvidia's CUDA GPU programming language."""
def __init__(self, extern_c=True): def __init__(self, extern_c=True):
...@@ -178,7 +233,10 @@ class CudaTarget(CTarget): ...@@ -178,7 +233,10 @@ class CudaTarget(CTarget):
""" """
self.extern_c = extern_c self.extern_c = extern_c
super(CudaTarget, self).__init__() super().__init__()
def split_kernel_at_global_barriers(self):
return True
def get_device_ast_builder(self): def get_device_ast_builder(self):
return CUDACASTBuilder(self) return CUDACASTBuilder(self)
...@@ -187,11 +245,13 @@ class CudaTarget(CTarget): ...@@ -187,11 +245,13 @@ class CudaTarget(CTarget):
@memoize_method @memoize_method
def get_dtype_registry(self): def get_dtype_registry(self):
from loopy.target.c.compyte.dtypes import (DTypeRegistry, from loopy.target.c.compyte.dtypes import (
fill_registry_with_opencl_c_types) DTypeRegistry,
fill_registry_with_c_types,
)
result = DTypeRegistry() result = DTypeRegistry()
fill_registry_with_opencl_c_types(result) fill_registry_with_c_types(result, respect_windows=True)
# no complex number support--needs PyOpenCLTarget # no complex number support--needs PyOpenCLTarget
...@@ -204,33 +264,80 @@ class CudaTarget(CTarget): ...@@ -204,33 +264,80 @@ class CudaTarget(CTarget):
and dtype.numpy_dtype in list(vec.types.values())) and dtype.numpy_dtype in list(vec.types.values()))
def vector_dtype(self, base, count): def vector_dtype(self, base, count):
return NumpyType( return NumpyType(vec.types[base.numpy_dtype, count])
vec.types[base.numpy_dtype, count],
target=self)
# }}} # }}}
# }}} # }}}
# {{{ preamble generator
def cuda_preamble_generator(preamble_info):
from loopy.types import AtomicNumpyType
seen_64_bit_atomics = any(
isinstance(dtype, AtomicNumpyType) and dtype.numpy_dtype.itemsize == 8
for dtype in preamble_info.seen_atomic_dtypes)
if seen_64_bit_atomics:
# Source:
# docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions
yield ("00_enable_64bit_atomics", """
#if __CUDA_ARCH__ < 600
__device__ double atomicAdd(double* address, double val)
{
unsigned long long int* address_as_ull =
(unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__double_as_longlong(val +
__longlong_as_double(assumed)));
} while (assumed != old);
return __longlong_as_double(old);
}
#endif
""")
from loopy.tools import remove_common_indentation
kernel = preamble_info.kernel
idx_ctype = kernel.target.dtype_to_typename(kernel.index_dtype)
yield ("00_declare_gid_lid",
remove_common_indentation(f"""
#define bIdx(N) (({idx_ctype}) blockIdx.N)
#define tIdx(N) (({idx_ctype}) threadIdx.N)
"""))
# }}}
# {{{ ast builder # {{{ ast builder
class CUDACASTBuilder(CASTBuilder): class CUDACASTBuilder(CFamilyASTBuilder):
preamble_function_qualifier = "inline __device__"
# {{{ library # {{{ library
def function_manglers(self): @property
return ( def known_callables(self):
super(CUDACASTBuilder, self).function_manglers() + [ callables = super().known_callables
cuda_function_mangler callables.update(get_cuda_callables())
]) return callables
# }}} # }}}
# {{{ top-level codegen # {{{ top-level codegen
def get_function_declaration(self, codegen_state, codegen_result, def get_function_declaration(
schedule_index): self, codegen_state: CodeGenerationState,
fdecl = super(CUDACASTBuilder, self).get_function_declaration( codegen_result: CodeGenerationResult, schedule_index: int
) -> tuple[Sequence[tuple[str, str]], Generable]:
preambles, fdecl = super().get_function_declaration(
codegen_state, codegen_result, schedule_index) codegen_state, codegen_result, schedule_index)
from loopy.target.c import FunctionDeclarationWrapper from loopy.target.c import FunctionDeclarationWrapper
...@@ -245,10 +352,12 @@ class CUDACASTBuilder(CASTBuilder): ...@@ -245,10 +352,12 @@ class CUDACASTBuilder(CASTBuilder):
fdecl = Extern("C", fdecl) fdecl = Extern("C", fdecl)
from loopy.schedule import get_insn_ids_for_block_at from loopy.schedule import get_insn_ids_for_block_at
assert codegen_state.kernel.linearization is not None
_, local_grid_size = \ _, local_grid_size = \
codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs( codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs(
get_insn_ids_for_block_at( get_insn_ids_for_block_at(
codegen_state.kernel.schedule, schedule_index)) codegen_state.kernel.linearization, schedule_index),
codegen_state.callables_table)
from loopy.symbolic import get_dependencies from loopy.symbolic import get_dependencies
if not get_dependencies(local_grid_size): if not get_dependencies(local_grid_size):
...@@ -259,25 +368,12 @@ class CUDACASTBuilder(CASTBuilder): ...@@ -259,25 +368,12 @@ class CUDACASTBuilder(CASTBuilder):
fdecl = CudaLaunchBounds(nthreads, fdecl) fdecl = CudaLaunchBounds(nthreads, fdecl)
return FunctionDeclarationWrapper(fdecl) return preambles, FunctionDeclarationWrapper(fdecl)
def generate_code(self, kernel, codegen_state, impl_arg_info): def preamble_generators(self):
code, implemented_domains = (
super(CudaTarget, self).generate_code(
kernel, codegen_state, impl_arg_info))
return code, implemented_domains return (
[*super().preamble_generators(), cuda_preamble_generator])
def generate_body(self, kernel, codegen_state):
body, implemented_domains = (
super(CudaTarget, self).generate_body(kernel, codegen_state))
from loopy.kernel.data import ImageArg
if any(isinstance(arg, ImageArg) for arg in kernel.args):
raise NotImplementedError("not yet: texture arguments in CUDA")
return body, implemented_domains
# }}} # }}}
...@@ -289,64 +385,217 @@ class CUDACASTBuilder(CASTBuilder): ...@@ -289,64 +385,217 @@ class CUDACASTBuilder(CASTBuilder):
_VEC_AXES = "xyzw" _VEC_AXES = "xyzw"
def add_vector_access(self, access_expr, index): def add_vector_access(self, access_expr, index):
return access_expr.a(self._VEC_AXES[index]) return access_expr.attr(self._VEC_AXES[index])
def emit_barrier(self, kind, comment): def emit_barrier(self, synchronization_kind, mem_kind, comment):
""" """
:arg kind: ``"local"`` or ``"global"`` :arg kind: ``"local"`` or ``"global"``
:arg memkind: unused
:return: a :class:`loopy.codegen.GeneratedInstruction`. :return: a :class:`loopy.codegen.GeneratedInstruction`.
""" """
if kind == "local": if synchronization_kind == "local":
if comment: if comment:
comment = " /* %s */" % comment comment = " /* %s */" % comment
from cgen import Statement from cgen import Statement
return Statement("__syncthreads()%s" % comment) return Statement("__syncthreads()%s" % comment)
elif kind == "global": elif synchronization_kind == "global":
raise LoopyError("CUDA does not have global barriers") raise LoopyError("CUDA does not have global barriers")
else: else:
raise LoopyError("unknown barrier kind") raise LoopyError("unknown barrier kind")
def wrap_temporary_decl(self, decl, scope): # }}}
if scope == temp_var_scope.LOCAL:
from cgen.cuda import CudaShared # {{{ declarators
def wrap_decl_for_address_space(
self, decl: Declarator, address_space: AddressSpace) -> Declarator:
from cgen.cuda import CudaGlobal, CudaShared
if address_space == AddressSpace.GLOBAL:
return CudaGlobal(decl)
if address_space == AddressSpace.LOCAL:
return CudaShared(decl) return CudaShared(decl)
elif scope == temp_var_scope.PRIVATE: elif address_space == AddressSpace.PRIVATE:
return decl return decl
else: else:
raise ValueError("unexpected temporary variable scope: %s" raise ValueError("unexpected address_space: %s"
% scope) % address_space)
def wrap_global_constant(self, decl): def wrap_global_constant(self, decl: Declarator) -> Declarator:
from cgen.cuda import CudaConstant from cgen.cuda import CudaConstant, CudaGlobal
assert isinstance(decl, CudaGlobal)
decl = decl.subdecl
return CudaConstant(decl) return CudaConstant(decl)
def get_global_arg_decl(self, name, shape, dtype, is_written): # duplicated in OpenCL, update there if updating here
from loopy.target.c import POD # uses the correct complex type def get_array_base_declarator(self, ary: ArrayBase) -> Declarator:
from cgen import Const dtype = ary.dtype
from cgen.cuda import CudaRestrictPointer
vec_size = ary.vector_length()
if vec_size > 1:
dtype = self.target.vector_dtype(dtype, vec_size)
if ary.dim_tags:
for dim_tag in ary.dim_tags:
if isinstance(dim_tag, (FixedStrideArrayDimTag, VectorArrayDimTag)):
# we're OK with those
pass
else:
raise NotImplementedError(
f"{type(self).__name__} does not understand axis tag "
f"'{type(dim_tag)}.")
from loopy.target.c import POD
return POD(self, dtype, ary.name)
arg_decl = CudaRestrictPointer(POD(self, dtype, name)) def get_array_arg_declarator(
self, arg: ArrayArg, is_written: bool) -> Declarator:
from cgen.cuda import CudaRestrictPointer
arg_decl: Declarator = CudaRestrictPointer(
self.get_array_base_declarator(arg))
if not is_written: if not is_written:
arg_decl = Const(arg_decl) arg_decl = Const(arg_decl)
return arg_decl return arg_decl
def get_image_arg_decl(self, name, shape, num_target_axes, dtype, is_written): def get_constant_arg_declarator(self, arg: ConstantArg) -> Declarator:
from cgen import RestrictPointer
from cgen.cuda import CudaConstant
# constant *is* an address space as far as CUDA is concerned, do not re-wrap
return CudaConstant(RestrictPointer(self.get_array_base_declarator(arg)))
def get_image_arg_declarator(
self, arg: ImageArg, is_written: bool) -> Declarator:
raise NotImplementedError("not yet: texture arguments in CUDA") raise NotImplementedError("not yet: texture arguments in CUDA")
def get_constant_arg_decl(self, name, shape, dtype, is_written): def emit_temp_var_decl_for_tv_with_base_storage(self,
from loopy.target.c import POD # uses the correct complex type codegen_state: CodeGenerationState,
from cgen import RestrictPointer, Const tv: TemporaryVariable) -> Generable:
from cgen.cuda import CudaConstant from cgen import Initializer
arg_decl = RestrictPointer(POD(self, dtype, name)) from loopy.target.c import POD, _ConstPointer, _ConstRestrictPointer
if not is_written: assert tv.base_storage is not None
arg_decl = Const(arg_decl) ecm = codegen_state.expression_to_code_mapper
return CudaConstant(arg_decl) cast_decl: Declarator = POD(self, tv.dtype, "")
temp_var_decl: Declarator = POD(self, tv.dtype, tv.name)
if tv._base_storage_access_may_be_aliasing:
ptrtype: type[Pointer] = _ConstPointer
else:
# The 'restrict' part of this is a complete lie--of course
# all these temporaries are aliased. But we're promising to
# not use them to shovel data from one representation to the
# other. That counts, right?
ptrtype = _ConstRestrictPointer
cast_decl = ptrtype(cast_decl)
temp_var_decl = ptrtype(temp_var_decl)
cast_tp, cast_d = cast_decl.get_decl_pair()
return Initializer(
temp_var_decl,
"({} {}) ({} + {})".format(
" ".join(cast_tp), cast_d, tv.base_storage, ecm(tv.offset)
),
)
# }}}
# {{{ atomics
def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var,
lhs_expr, rhs_expr, lhs_dtype, rhs_type_context):
from cgen import Statement
from pymbolic.mapper.stringifier import PREC_NONE
from pymbolic.primitives import Sum
if isinstance(lhs_dtype, NumpyType) and lhs_dtype.numpy_dtype in [
np.int32, np.int64, np.float32, np.float64]:
# atomicAdd
if isinstance(rhs_expr, Sum):
ecm = self.get_expression_to_code_mapper(codegen_state)
new_rhs_expr = Sum(tuple(c for c in rhs_expr.children
if c != lhs_expr))
lhs_expr_code = ecm(lhs_expr)
rhs_expr_code = ecm(new_rhs_expr)
return Statement("atomicAdd(&{}, {})".format(
lhs_expr_code, rhs_expr_code))
else:
from cgen import Assign, Block, DoWhile
from loopy.target.c import POD
old_val_var = codegen_state.var_name_generator("loopy_old_val")
new_val_var = codegen_state.var_name_generator("loopy_new_val")
from loopy.kernel.data import TemporaryVariable
ecm = codegen_state.expression_to_code_mapper.with_assignments(
{
old_val_var: TemporaryVariable(old_val_var, lhs_dtype),
new_val_var: TemporaryVariable(new_val_var, lhs_dtype),
})
lhs_expr_code = ecm(lhs_expr, prec=PREC_NONE, type_context=None)
from pymbolic import var
from pymbolic.mapper.substitutor import make_subst_func
from loopy.symbolic import SubstitutionMapper
subst = SubstitutionMapper(
make_subst_func({lhs_expr: var(old_val_var)}))
rhs_expr_code = ecm(subst(rhs_expr), prec=PREC_NONE,
type_context=rhs_type_context,
needed_dtype=lhs_dtype)
cast_str = ""
old_val = old_val_var
new_val = new_val_var
if lhs_dtype.numpy_dtype.kind == "f":
if lhs_dtype.numpy_dtype == np.float32:
ctype = "int"
elif lhs_dtype.numpy_dtype == np.float64:
ctype = "long"
else:
raise AssertionError()
old_val = "*(%s *) &" % ctype + old_val
new_val = "*(%s *) &" % ctype + new_val
cast_str = "(%s *) " % (ctype)
return Block([
POD(self, NumpyType(lhs_dtype.dtype),
old_val_var),
POD(self, NumpyType(lhs_dtype.dtype),
new_val_var),
DoWhile(
"atomicCAS("
"%(cast_str)s&(%(lhs_expr)s), "
"%(old_val)s, "
"%(new_val)s"
") != %(old_val)s"
% {
"cast_str": cast_str,
"lhs_expr": lhs_expr_code,
"old_val": old_val,
"new_val": new_val,
},
Block([
Assign(old_val_var, lhs_expr_code),
Assign(new_val_var, rhs_expr_code),
])
)
])
else:
raise NotImplementedError("atomic update for '%s'" % lhs_dtype)
# }}} # }}}
......
from __future__ import annotations
__copyright__ = "Copyright (C) 2012-17 Andreas Kloeckner, Nick Curtis"
__license__ = """
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
import logging
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import (
TYPE_CHECKING,
Any,
Callable,
Mapping,
Sequence,
cast,
)
from constantdict import constantdict
from pymbolic import Variable, var
from pytools.codegen import CodeGenerator, Indentation
from pytools.py_codegen import PythonFunctionGenerator
from loopy.diagnostic import LoopyError
logger = logging.getLogger(__name__)
from pytools.persistent_dict import WriteOncePersistentDict
from loopy.kernel import KernelState, LoopKernel
from loopy.kernel.data import ArrayArg, _ArraySeparationInfo, auto
from loopy.tools import LoopyKeyBuilder, caches
from loopy.types import LoopyType, NumpyType
from loopy.typing import Expression, integer_expr_or_err
from loopy.version import DATA_MODEL_VERSION
if TYPE_CHECKING:
from loopy.schedule.tools import KernelArgInfo
from loopy.translation_unit import TranslationUnit
# {{{ object array argument packing
class SeparateArrayPackingController:
    """Unpack and re-key object-array arguments that were split into sub-arrays.

    Some argument arrays have axes tagged to be implemented as separate
    arrays. Callers may still pass all of the sub-arrays in a single object
    array under the original (un-split) argument name; calling an instance of
    this class on the keyword arguments replaces that one entry by one entry
    per sub-array, keyed by the sub-array's own name.
    """

    def __init__(self, packing_info: dict[str, _ArraySeparationInfo]) -> None:
        """Record, per packed argument name, a map from index to sub-array name.

        :arg packing_info: maps the un-split argument name to an object whose
            ``subarray_names`` maps index tuples to sub-array names.
        """
        unpack_maps: dict[str, dict[int | tuple[int, ...], str]] = {}

        for packed_name, separation in packing_info.items():
            index_to_name: dict[int | tuple[int, ...], str] = {}
            for multi_index, subary_name in separation.subarray_names.items():
                # Length-1 index tuples are reduced to the bare integer so
                # that they also work to index plain tuples in the 1D case.
                if len(multi_index) == 1:
                    key: int | tuple[int, ...] = multi_index[0]
                else:
                    key = multi_index
                index_to_name[key] = subary_name
            unpack_maps[packed_name] = index_to_name

        self.packing_info = unpack_maps

    def __call__(self, kernel_kwargs: dict[str, Any]) -> dict[str, Any]:
        """Return a copy of *kernel_kwargs* with packed arguments unpacked.

        The input dictionary itself is not modified. Packed names that are
        absent from *kernel_kwargs* are ignored.
        """
        unpacked = kernel_kwargs.copy()

        for packed_name, index_to_name in self.packing_info.items():
            if packed_name not in unpacked:
                continue

            packed_ary = unpacked[packed_name]
            for index, subary_name in index_to_name.items():
                # A sub-array may not be passed both packed and by itself.
                assert subary_name not in unpacked
                unpacked[subary_name] = packed_ary[index]

            del unpacked[packed_name]

        return unpacked
# }}}
# {{{ ExecutionWrapperGeneratorBase
def _str_to_expr(name_or_expr: str | Expression) -> Expression:
if isinstance(name_or_expr, str):
return var(name_or_expr)
else:
return name_or_expr
@dataclass(frozen=True)
class _ArgFindingEquation:
    """One equation ``lhs == rhs`` used to infer the value of a kernel argument.

    Instances are created and consumed by
    ``ExecutionWrapperGeneratorBase.generate_integer_arg_finding_from_array_data``.
    """

    # Expression evaluated in the generated wrapper code, e.g. an entry of a
    # passed array's ``shape`` or ``strides``, or a constant default.
    lhs: Expression
    # Expression taken from the kernel's argument declaration. The unknown
    # being solved for is the single variable this expression depends on.
    rhs: Expression

    # Arg finding code is sorted by priority, all equations (across all unknowns)
    # of lowest priority first.
    order: int

    # Names of the arguments whose data *lhs* reads. The generated assignment
    # is guarded by "<name> is not None" for each of them; an empty set means
    # the equation is unconditional.
    based_on_names: frozenset[str]
class ExecutionWrapperGeneratorBase(ABC):
"""
A set of common methods for generating a wrapper
for execution
"""
def __init__(self, system_args):
    """Store a copy of *system_args* and set up dtype-name bookkeeping."""
    from pytools import UniqueNameGenerator

    # Maps a dtype source string to the generated name already assigned to it.
    self.dtype_str_to_name = {}
    self.dtype_name_generator = UniqueNameGenerator(forced_prefix="_lpy_dtype_")

    # Slice copy, so this object does not share the caller's sequence.
    self.system_args = system_args[:]
@abstractmethod
def python_dtype_str_inner(self, dtype):
    """Return Python source text describing *dtype*.

    Abstract. :meth:`python_dtype_str` places the returned string inside
    ``_lpy_np.dtype(...)`` in the generated preamble and uses it as the key
    for reusing already-generated dtype names.
    """
    pass
def python_dtype_str(self, gen: CodeGenerator, numpy_dtype):
    """Return the name of a preamble variable holding *numpy_dtype*.

    The first request for a given dtype string draws a fresh name, adds
    ``<name> = _lpy_np.dtype(<dtype string>)`` to the preamble of *gen* and
    remembers the name; later requests for the same string reuse it.
    """
    dtype_str = self.python_dtype_str_inner(numpy_dtype)

    known_names = self.dtype_str_to_name
    if dtype_str in known_names:
        return known_names[dtype_str]

    fresh_name = self.dtype_name_generator()
    gen.add_to_preamble(f"{fresh_name} = _lpy_np.dtype({dtype_str})")
    known_names[dtype_str] = fresh_name
    return fresh_name
# {{{ invoker generation
# /!\ This code runs in a namespace controlled by the user.
# Prefix all auxiliary variables with "_lpy".
# {{{ integer arg finding from array data
def generate_integer_arg_finding_from_array_data(
        self, gen: CodeGenerator, kernel: LoopKernel, kai: KernelArgInfo
        ) -> None:
    """Emit wrapper code that infers missing integer arguments from arrays.

    For every name in ``kai.passed_arg_names`` that refers to an
    :class:`~loopy.kernel.data.ArrayArg`, equations are collected relating an
    expression evaluated on the passed array (its ``shape``, ``strides`` or
    ``offset``) to the matching expression of the kernel's argument
    declaration. Equations whose declaration-side expression depends on
    exactly one variable are turned into generated code of the form
    ``if <unknown> is None: ...`` that assigns that variable.

    :arg gen: receives the generated wrapper source.
    :arg kernel: its ``arg_dict`` supplies the argument declarations.
    :arg kai: its ``passed_arg_names`` lists the arguments to inspect.
    """
    from loopy.diagnostic import ParameterFinderWarning
    from loopy.kernel.array import get_strides
    from loopy.kernel.data import ArrayArg
    from loopy.symbolic import DependencyMapper, StringifyMapper

    dep_map: DependencyMapper[[]] = DependencyMapper()

    # {{{ find equations

    equations: list[_ArgFindingEquation] = []

    for arg_name in kai.passed_arg_names:
        arg = kernel.arg_dict[arg_name]
        assert arg.dtype is not None

        if isinstance(arg, ArrayArg):
            assert arg.shape is not auto
            if isinstance(arg.shape, tuple):
                # <array>.shape[axis] == declared shape of that axis
                for axis_nr, shape_i in enumerate(arg.shape):
                    if shape_i is not None:
                        equations.append(
                            _ArgFindingEquation(
                                lhs=var(arg.name).attr("shape")[axis_nr],
                                rhs=shape_i,
                                order=0,
                                based_on_names=frozenset({arg.name})))

            strides = get_strides(arg)
            for axis_nr, stride_i in enumerate(strides):
                if stride_i is not None:
                    # The array's stride divided by the item size equals the
                    # declared stride. ``_lpy_even_div`` is only referenced
                    # by name here; it has to exist in the generated code's
                    # namespace.
                    equations.append(
                            _ArgFindingEquation(
                                lhs=var("_lpy_even_div")(
                                    var(arg.name).attr("strides")[axis_nr],
                                    arg.dtype.itemsize),
                                rhs=_str_to_expr(stride_i),
                                order=0,
                                based_on_names=frozenset({arg.name}),
                                ))

                    if not arg.is_input and isinstance(arg.shape, tuple):
                        # If no value was found by other means, provide
                        # C-contiguous default strides for output-only
                        # arguments.
                        equations.append(
                                _ArgFindingEquation(
                                    lhs=(integer_expr_or_err(strides[axis_nr + 1])
                                        * integer_expr_or_err(
                                            arg.shape[axis_nr + 1]))
                                    if axis_nr + 1 < len(strides)
                                    else 1,
                                    rhs=_str_to_expr(stride_i),
                                    # Find strides from last dim to first,
                                    # starting at order=1 so that shape
                                    # parameters (found above) are
                                    # available.
                                    order=len(strides) - axis_nr,
                                    based_on_names=frozenset(),
                                    ))

            if arg.offset is not None:
                # The generated code reads the array's "offset" attribute via
                # getattr with a None default, so arrays without that
                # attribute do not raise AttributeError here.
                equations.append(
                        _ArgFindingEquation(
                            lhs=var("_lpy_even_div_none")(
                                var("getattr")(
                                    var(arg.name), var('"offset"'), var("None")),
                                arg.dtype.itemsize),
                            rhs=_str_to_expr(arg.offset),
                            order=0,
                            based_on_names=frozenset([arg.name]),
                            ))

                # If no value was found by other means, default to zero.
                equations.append(
                        _ArgFindingEquation(
                            lhs=0,
                            rhs=_str_to_expr(arg.offset),
                            order=1,
                            based_on_names=frozenset(),
                            ))

    # }}}

    # {{{ regroup equations by unknown

    # order -> name of the unknown -> equations that can determine it
    order_to_unknown_to_equations: \
            dict[int, dict[str, list[_ArgFindingEquation]]] = {}

    for eqn in equations:
        deps = dep_map(eqn.rhs)

        if len(deps) == 1:
            unknown_var, = deps
            order_to_unknown_to_equations \
                    .setdefault(eqn.order, {}) \
                    .setdefault(cast("Variable", unknown_var).name, []) \
                    .append(eqn)
        else:
            # Zero deps: nothing to determine, forget about it.
            # 2+ deps: not implemented
            pass

    del equations

    # }}}

    # {{{ generate arg finding code

    from pymbolic.algorithm import solve_affine_equations_for
    from pytools.codegen import CodeGenerator

    gen("# {{{ find integer arguments from array data")
    gen("")

    # Sorted iteration over orders and over unknown names makes the order of
    # the emitted code independent of dictionary insertion order.
    for order_value in sorted(order_to_unknown_to_equations):
        for unknown_name in sorted(order_to_unknown_to_equations[order_value]):
            unk_equations = sorted(
                    order_to_unknown_to_equations[order_value][unknown_name],
                    key=lambda eqn: eqn.order)
            # Code for this unknown is collected separately so that the
            # enclosing "if <unknown> is None:" is only emitted if there is
            # something to put inside it.
            subgen = CodeGenerator()

            seen_based_on_names: set[frozenset[str]] = set()

            # First emitted alternative uses "if", all later ones "elif".
            if_or_elif = "if"

            for eqn in unk_equations:
                if eqn.rhs == Variable(unknown_name):
                    # Some of the expressions above are non-affine. Let's not
                    # get carried away by trying to solve a much more complex
                    # problem than needed.
                    value_expr = eqn.lhs

                else:
                    try:
                        # overkill :)
                        value_expr = solve_affine_equations_for(
                                [unknown_name],
                                [(eqn.lhs, eqn.rhs)]
                                )[Variable(unknown_name)]
                    except Exception as e:
                        # went wrong? oh well
                        from warnings import warn
                        warn("Unable to generate code to automatically "
                                f"find '{unknown_name}' "
                                f"from '{', '.join(eqn.based_on_names)}':\n"
                                f"{e}", ParameterFinderWarning, stacklevel=1)
                        continue

                # Do not use more than one bit of data from each of the
                # 'based_on_names' to find each value, i.e. if a value can be
                # found via shape and strides, only one of them suffices.
                # This also helps because strides can be unreliable in the
                # face of zero-length axes.
                if eqn.based_on_names in seen_based_on_names:
                    continue
                seen_based_on_names.add(eqn.based_on_names)

                if eqn.based_on_names:
                    # Only use this equation if every array it reads was
                    # actually passed.
                    condition = " and ".join(
                            f"{ary_name} is not None"
                            for ary_name in eqn.based_on_names)
                else:
                    condition = "True"

                subgen(f"{if_or_elif} {condition}:")
                with Indentation(subgen):
                    subgen(
                            f"{unknown_name} = {StringifyMapper()(value_expr)}")

                if_or_elif = "elif"

            subgen("")

            if subgen.code:
                # Never overwrite a value that is already known.
                gen(f"if {unknown_name} is None:")
                with Indentation(gen):
                    gen.extend(subgen)

    gen("# }}}")
    gen("")
# }}}
# }}}
# {{{ check that value args are present
def generate_value_arg_check(
        self, gen: CodeGenerator, kernel: LoopKernel, kai: KernelArgInfo
        ) -> None:
    """Emit wrapper code that rejects value arguments still set to ``None``.

    Nothing at all is emitted when ``kernel.options.skip_arg_checks`` is set.
    Otherwise, each passed argument that is a ``ValueArg`` gets a generated
    ``if <name> is None: raise TypeError(...)`` check, bracketed by fold
    markers.
    """
    if kernel.options.skip_arg_checks:
        return

    from loopy.kernel.data import ValueArg

    # Source text of the generated raise statement; '%s' is the argument name.
    raise_stmt_template = (
        "raise TypeError(\"value argument '%s' "
        "was not given and could not be automatically "
        'determined")')

    gen("# {{{ check that value args are present")
    gen("")

    for passed_name in kai.passed_arg_names:
        candidate = kernel.arg_dict[passed_name]
        if isinstance(candidate, ValueArg):
            gen("if %s is None:" % candidate.name)
            with Indentation(gen):
                gen(raise_stmt_template % candidate.name)

    gen("# }}}")
    gen("")
# }}}
# {{{ handle non numpy arguments
def handle_non_numpy_arg(self, gen: CodeGenerator, arg):
    """Emit code into *gen* dealing with *arg* when numpy handling is enabled.

    Not implemented in this base class. :meth:`generate_arg_setup` calls this
    once per array argument unless ``kernel.options.no_numpy`` is set.

    :raises NotImplementedError: always.
    """
    raise NotImplementedError()
# }}}
# {{{ handle allocation of unspecified arguments
def handle_alloc(
        self, gen: CodeGenerator, arg: ArrayArg,
        strify: Callable[[Expression | tuple[Expression]], str],
        skip_arg_checks: bool) -> None:
    """
    Handle allocation of non-specified arguments for C-execution

    Not implemented in this base class. :meth:`generate_arg_setup` calls this
    while emitting the body of the generated ``if <arg> is None:`` block for
    an output argument whose shape is fully specified.

    :arg gen: receives the generated allocation code.
    :arg arg: the argument to allocate.
    :arg strify: turns an expression (or tuple of expressions) into source
        text.
    :arg skip_arg_checks: value of the kernel option of the same name.
    :raises NotImplementedError: always.
    """
    raise NotImplementedError()
# }}}
def get_arg_pass(self, arg):
    """Return the entry to record for *arg* in the list of call arguments.

    Not implemented in this base class. :meth:`generate_arg_setup` appends
    the return value to the argument list it returns, for arguments that are
    ``ArrayArg`` or ``ConstantArg`` instances.

    :raises NotImplementedError: always.
    """
    raise NotImplementedError()
def get_strides_check_expr(self, shape, strides, expected_strides):
    """Return source text of a boolean expression that validates strides.

    All three arguments are equally long sequences of strings, one entry per
    axis: the axis length, the actual stride, and the expected stride.

    An axis passes if its stride matches or if its length is 1; strides along
    length-1 axes are never used, since the only valid index there is 0. If
    any axis has length zero, the array is empty and the whole check passes
    regardless of strides. For zero axes the result is ``"True"``.
    """
    assert len(shape) == len(strides) == len(expected_strides)

    per_axis_checks = [
        f"({extent} == 1 or {actual} == {expected})"
        for extent, actual, expected in zip(shape, strides, expected_strides)
    ]
    match_expr = " and ".join(per_axis_checks) if per_axis_checks else "True"

    if not shape:
        return match_expr

    # One escape hatch per axis: a zero-length axis makes strides irrelevant.
    empty_array_escapes = "".join(f" or not {extent}" for extent in shape)
    return f"({match_expr}){empty_array_escapes}"
# {{{ arg setup
def generate_arg_setup(
        self, gen: CodeGenerator, kernel: LoopKernel, kai: KernelArgInfo,
        ) -> Sequence[str]:
    """Emit wrapper code that checks and, where needed, allocates array args.

    For every name in ``kai.passed_arg_names`` that refers to an array
    argument, code is generated into *gen* that

    * raises if a required argument was not supplied,
    * allocates output arrays of fully known shape (via
      :meth:`handle_alloc`) when they were passed as ``None``, and
    * unless ``kernel.options.skip_arg_checks`` is set, verifies dtype,
      shape, strides and the absence of an unsupported offset.

    :returns: one entry per passed argument, in order: the argument's name,
        or, for ``ArrayArg``/``ConstantArg``, the result of
        :meth:`get_arg_pass`.
    :raises LoopyError: if an output argument that would be allocated does
        not have a :class:`~loopy.types.NumpyType` dtype.
    """
    options = kernel.options
    import loopy as lp

    from loopy.kernel.array import ArrayBase
    from loopy.kernel.data import ImageArg
    from loopy.symbolic import StringifyMapper
    from loopy.types import NumpyType

    gen("# {{{ set up array arguments")
    gen("")

    if not options.no_numpy:
        # Flags in the generated code, initialized here; this method does not
        # itself emit code that changes them.
        gen("_lpy_encountered_numpy = False")
        gen("_lpy_encountered_dev = False")
        gen("")

    args = []

    strify = StringifyMapper()

    for arg_name in kai.passed_arg_names:
        arg = kernel.arg_dict[arg_name]
        is_written = arg.name in kernel.get_written_variables()

        if not isinstance(arg, ArrayBase):
            # Non-array arguments need no setup and are passed by name.
            args.append(arg.name)
            continue

        gen("# {{{ process %s" % arg.name)
        gen("")

        if not options.no_numpy:
            self.handle_non_numpy_arg(gen, arg)

        if not options.skip_arg_checks and arg.is_input:
            gen("if %s is None:" % arg.name)
            with Indentation(gen):
                gen("raise RuntimeError(\"input argument '%s' must "
                        'be supplied")' % arg.name)
                gen("")

        if (is_written and isinstance(arg, ImageArg)
                and not options.skip_arg_checks):
            gen("if %s is None:" % arg.name)
            with Indentation(gen):
                gen("raise RuntimeError(\"written image '%s' must "
                        'be supplied")' % arg.name)
                gen("")

        if is_written and arg.shape is None and not options.skip_arg_checks:
            gen("if %s is None:" % arg.name)
            with Indentation(gen):
                gen("raise RuntimeError(\"written argument '%s' has "
                        'unknown shape and must be supplied")' % arg.name)
                gen("")

        # Whether the generated code may have allocated this argument itself,
        # in which case the checks below are skipped at run time.
        possibly_made_by_loopy = False

        # {{{ allocate written arrays, if needed

        if arg.is_output \
                and isinstance(arg, (lp.ArrayArg, lp.ConstantArg)) \
                and arg.shape is not None \
                and all(si is not None for si in arg.shape):

            if not isinstance(arg.dtype, NumpyType):
                raise LoopyError("do not know how to pass arg of type '%s'"
                        % arg.dtype)

            possibly_made_by_loopy = True
            gen("_lpy_made_by_loopy = False")
            gen("")

            gen("if %s is None:" % arg.name)
            with Indentation(gen):
                self.handle_alloc(
                        gen, arg, strify, options.skip_arg_checks)
                gen("_lpy_made_by_loopy = True")
                gen("")

        # }}}

        # {{{ argument checking

        if isinstance(arg, (lp.ArrayArg, lp.ConstantArg)) \
                and not options.skip_arg_checks:
            if possibly_made_by_loopy:
                gen("if not _lpy_made_by_loopy:")
            else:
                # Keeps the generated checks at the same indentation level in
                # both cases.
                gen("if True:")

            with Indentation(gen):
                gen("if %s.dtype != %s:"
                        % (arg.name, self.python_dtype_str(
                            gen, arg.dtype.numpy_dtype)))
                with Indentation(gen):
                    gen("raise TypeError(\"dtype mismatch on argument '%s' "
                            '(got: %%s, expected: %s)" %% %s.dtype)'
                            % (arg.name, arg.dtype, arg.name))

                # {{{ generate shape checking code

                def strify_allowing_none(shape_axis):
                    # Source text for one shape entry; None stays "None".
                    if shape_axis is None:
                        return "None"
                    else:
                        return strify(shape_axis)

                def strify_tuple(t: tuple[Expression, ...] | None) -> str:
                    # Source text of a tuple literal; the trailing comma keeps
                    # a one-element result a tuple.
                    if t is None:
                        return "None"
                    if len(t) == 0:
                        return "()"
                    else:
                        return "(%s,)" % ", ".join(
                                strify_allowing_none(sa)
                                for sa in t)

                shape_mismatch_msg = (
                        "raise ValueError(\"shape mismatch on argument '%s' "
                        '(got: %%s, expected: %%s)" '
                        "%% (%s.shape, %s))"
                        % (arg.name, arg.name, strify_tuple(arg.shape)))

                if arg.shape is None:
                    pass

                elif any(shape_axis is None for shape_axis in arg.shape):
                    # Partially known shape: check the number of axes, then
                    # each known axis individually.
                    assert isinstance(arg.shape, tuple)
                    gen("if len(%s.shape) != %s:"
                            % (arg.name, len(arg.shape)))
                    with Indentation(gen):
                        gen(shape_mismatch_msg)

                    for i, shape_axis in enumerate(arg.shape):
                        if shape_axis is None:
                            continue

                        gen("if %s.shape[%d] != %s:"
                                % (arg.name, i, strify(shape_axis)))
                        with Indentation(gen):
                            gen(shape_mismatch_msg)

                else: # not None, no Nones in tuple
                    gen("if %s.shape != %s:"
                            % (arg.name, strify(arg.shape)))
                    with Indentation(gen):
                        gen(shape_mismatch_msg)

                # }}}

                from loopy.kernel.array import get_strides
                strides = get_strides(arg)

                if strides and arg.dim_tags and arg.shape is not None:
                    assert isinstance(arg.shape, tuple)
                    itemsize = arg.dtype.numpy_dtype.itemsize
                    # Expected strides, scaled by the item size to match the
                    # array's ``.strides``.
                    sym_strides = tuple(itemsize*s_i for s_i in strides)

                    ndim = len(arg.shape)
                    shape = ["_lpy_shape_%d" % i for i in range(ndim)]
                    # From here on, 'strides' holds names of variables in the
                    # generated code, no longer the symbolic strides.
                    strides = ["_lpy_stride_%d" % i for i in range(ndim)]

                    gen("({},) = {}.shape".format(", ".join(shape), arg.name))
                    gen("({},) = {}.strides".format(
                        ", ".join(strides), arg.name))

                    gen("if not (%s):"
                            % self.get_strides_check_expr(
                                shape, strides,
                                [strify(s) for s in sym_strides]))
                    with Indentation(gen):
                        gen(f"_lpy_got = {arg.name}.strides")
                        gen(f"_lpy_expected = {strify_tuple(sym_strides)}")

                        gen('raise ValueError("strides mismatch on '
                                "argument '%s' "
                                '(got: %%s, expected: %%s)" '
                                "%% (_lpy_got, _lpy_expected))"
                                % arg.name)

                if not arg.offset:
                    # The argument declares no offset, so an array carrying a
                    # nonzero one is rejected.
                    gen("if hasattr({}, 'offset') and {}.offset:".format(
                            arg.name, arg.name))
                    with Indentation(gen):
                        gen("raise ValueError(\"Argument '%s' does not "
                                "allow arrays with offsets. Try passing "
                                "default_offset=loopy.auto to make_kernel()."
                                '")' % arg.name)
                        gen("")

        # }}}

        if possibly_made_by_loopy and not options.skip_arg_checks:
            gen("del _lpy_made_by_loopy")
            gen("")

        if isinstance(arg, (lp.ArrayArg, lp.ConstantArg)):
            args.append(self.get_arg_pass(arg))
        else:
            args.append(arg.name)

        gen("")

        gen("# }}}")
        gen("")

    gen("# }}}")
    gen("")

    return args
# }}}
def target_specific_preamble(self, gen):
    """Add the target-specific imports to the preamble of *gen*.

    Abstract hook: this base version has no implementation and always
    raises :exc:`NotImplementedError`.

    :arg gen: the function generator whose preamble is to be extended
    """
    raise NotImplementedError
def initialize_system_args(self, gen):
    """Emit code into *gen* that initializes any default system args.

    Abstract hook meant to be overridden: this base version always raises
    :exc:`NotImplementedError`.

    :arg gen: the function generator receiving the initialization code
    """
    raise NotImplementedError
# {{{ generate invocation
def generate_invocation(
        self,
        gen: CodeGenerator,
        kernel: LoopKernel,
        kai: KernelArgInfo,
        host_program_name: str,
        args: Sequence[str],
        ) -> None:
    """Emit into *gen* the code that invokes the host program.

    Abstract hook: this base version always raises
    :exc:`NotImplementedError`.

    :arg gen: the code generator for the invoker being built
    :arg kernel: the entrypoint kernel being wrapped
    :arg kai: argument information for *kernel*
    :arg host_program_name: name of the host program to invoke
    :arg args: argument expressions, as strings, to pass along
    """
    raise NotImplementedError
# }}}
# {{{ output
def generate_output_handler(
        self,
        gen: CodeGenerator,
        kernel: LoopKernel,
        kai: KernelArgInfo,
        ) -> None:
    """Emit into *gen* the code that handles the invoker's outputs.

    Abstract hook: this base version always raises
    :exc:`NotImplementedError`.

    :arg gen: the code generator for the invoker being built
    :arg kernel: the entrypoint kernel being wrapped
    :arg kai: argument information for *kernel*
    """
    raise NotImplementedError
# }}}
def generate_host_code(self, gen, codegen_result):
    """Emit the host code belonging to *codegen_result* into *gen*.

    Abstract hook: this base version always raises
    :exc:`NotImplementedError`.

    :arg gen: the function generator for the invoker being built
    :arg codegen_result: the result of code generation
    """
    raise NotImplementedError()
def __call__(self, program, entrypoint, codegen_result):
    """
    Generate the wrapping Python invoker for this execution target.

    :arg program: the object holding the kernels; it is indexed with
        *entrypoint* to obtain the kernel being wrapped
    :arg entrypoint: name of the entrypoint kernel within *program*
    :arg codegen_result: the loopy :class:`CodeGenerationResult` created
        by code generation

    :returns: the picklable function obtained from the generator once all
        parts of the invoker have been emitted
    """
    kernel = program[entrypoint]

    from loopy.schedule.tools import get_kernel_arg_info
    kai = get_kernel_arg_info(kernel)

    # System args come first; every passed kernel argument follows as a
    # keyword that defaults to None.
    invoker_params = self.system_args + [
        f"{arg_name}=None" for arg_name in kai.passed_arg_names]
    gen = PythonFunctionGenerator(
        f"invoke_{entrypoint}_loopy_kernel", invoker_params)

    # Preamble: target imports, then the host code, separated by blank lines.
    self.target_specific_preamble(gen)
    gen.add_to_preamble("")
    self.generate_host_code(gen, codegen_result)
    gen.add_to_preamble("")

    # Function body, emitted in execution order.
    self.initialize_system_args(gen)
    self.generate_integer_arg_finding_from_array_data(gen, kernel, kai)
    self.generate_value_arg_check(gen, kernel, kai)
    args = self.generate_arg_setup(gen, kernel, kai)

    # FIXME: should we make this as a dict as well.
    host_program_name = codegen_result.host_programs[entrypoint].name

    self.generate_invocation(gen, kernel, kai, host_program_name, args)
    self.generate_output_handler(gen, kernel, kai)

    options = kernel.options
    if options.write_wrapper:
        wrapper_source = gen.get()
        if options.allow_terminal_colors:
            wrapper_source = get_highlighted_python_code(wrapper_source)

        # write_wrapper is either exactly True (print the wrapper) or
        # something to be opened as an output file.
        if options.write_wrapper is True:
            print(wrapper_source)
        else:
            with open(options.write_wrapper, "w") as outf:
                outf.write(wrapper_source)

    return gen.get_picklable_function()
# }}}
# }}}
# Cache for typed-and-scheduled translation units. ExecutorBase keys it by
# (executor class name, translation unit, argument-dtype mapping or None).
typed_and_scheduled_cache: WriteOncePersistentDict[
    tuple[str, TranslationUnit, Mapping[str, LoopyType] | None],
    TranslationUnit
] = WriteOncePersistentDict(
    f"loopy-typed-and-scheduled-cache-v1-{DATA_MODEL_VERSION}",
    key_builder=LoopyKeyBuilder(),
    safe_sync=False,
)
caches.append(typed_and_scheduled_cache)

# Cache for generated invokers, looked up in ExecutorBase.get_invoker().
invoker_cache: WriteOncePersistentDict[
    tuple[str, TranslationUnit, str],
    str
] = WriteOncePersistentDict(
    f"loopy-invoker-cache-v10-{DATA_MODEL_VERSION}",
    key_builder=LoopyKeyBuilder(),
    safe_sync=False,
)
caches.append(invoker_cache)
# {{{ kernel executor
class ExecutorBase:
    """An object allowing the execution of an entrypoint of a
    :class:`~loopy.TranslationUnit`. Create these objects using
    :meth:`loopy.TranslationUnit.executor`.

    .. automethod:: __call__
    """
    # Set in __init__: a packing controller when the entrypoint kernel has
    # separation info, otherwise None.
    packing_controller: SeparateArrayPackingController | None

    def __init__(self, t_unit: TranslationUnit, entrypoint: str):
        """
        :arg t_unit: the translation unit containing the kernel to execute
        :arg entrypoint: name of the kernel within *t_unit* to execute
        """
        self.t_unit = t_unit
        self.entrypoint = entrypoint

        kernel = self.t_unit[entrypoint]
        self.output_names = {arg.name for arg in kernel.args if arg.is_output}

        from loopy import ArrayArg
        self.input_array_names = {
            arg.name for arg in kernel.args
            if arg.is_input and isinstance(arg, ArrayArg)}

        # True if at least one argument only gets its dtype at call time.
        self.has_runtime_typed_args = any(
            arg.dtype is None for arg in kernel.args)

        # We're doing this ahead of time to learn about array separation.
        # This will be done again as part of preprocessing below, and we're
        # betting that it happens consistently both times. (No reason it
        # wouldn't, but it is done redundantly.) We can't *use* the result of
        # this because we need to do the 'official' array separation after
        # type inference has completed.
        from loopy.preprocess import make_arrays_for_sep_arrays
        self.separated_entry_knl = make_arrays_for_sep_arrays(
            self.t_unit[self.entrypoint])

        self.sep_info = self.separated_entry_knl._separation_info()
        if self.sep_info:
            self.packing_controller = SeparateArrayPackingController(
                self.sep_info)
        else:
            self.packing_controller = None

    def check_for_required_array_arguments(self, input_args):
        """Raise if a required input array is absent from *input_args*.

        :arg input_args: an iterable of the argument names that were passed
            (e.g. the keyword-argument dict of a call)
        :raises LoopyError: if any name in :attr:`input_array_names` is
            missing from *input_args*
        """
        # Formerly, the first exception raised when a required argument is not
        # passed was often at type inference. This exists to raise a more
        # meaningful message in such scenarios. Since type inference precedes
        # compilation, this check cannot be deferred to the generated invoker
        # code.
        # See discussion at
        # https://github.com/inducer/loopy/pull/160#issuecomment-867761204
        # and links therein for context.
        missing_args = self.input_array_names - set(input_args)
        if missing_args:
            kernel = self.t_unit[self.entrypoint]
            # Sorted so that the message does not depend on set ordering.
            raise LoopyError(
                f"Kernel {kernel.name}() missing required array input arguments: "
                f"{', '.join(sorted(missing_args))}. "
                "If this is a surprise, maybe you need to add is_input=False to "
                "your argument.")

    def get_typed_and_scheduled_translation_unit_uncached(
            self, arg_to_dtype: constantdict[str, LoopyType] | None
            ) -> TranslationUnit:
        """Apply *arg_to_dtype* (if any), infer the remaining types, then
        preprocess and linearize the translation unit as far as its state
        requires, without consulting any cache.

        :arg arg_to_dtype: mapping from argument name to dtype, or *None*
        :returns: the resulting translation unit
        """
        t_unit = self.t_unit

        if arg_to_dtype:
            entry_knl = t_unit[self.entrypoint]

            # FIXME: This is not so nice. This transfers types from the
            # subarrays of sep-tagged arrays to the 'main' array, because
            # type inference fails otherwise.
            mm = arg_to_dtype.mutate()
            for name, sep_info in self.sep_info.items():
                if entry_knl.arg_dict[name].dtype is None:
                    for sep_name in sep_info.subarray_names.values():
                        if sep_name in arg_to_dtype:
                            mm[name] = arg_to_dtype[sep_name]
                            del mm[sep_name]

            arg_to_dtype = mm.finish()

            from loopy.kernel.tools import add_dtypes
            t_unit = t_unit.with_kernel(add_dtypes(entry_knl, arg_to_dtype))

        from loopy.type_inference import infer_unknown_types
        t_unit = infer_unknown_types(t_unit, expect_completion=True)

        if t_unit.state < KernelState.PREPROCESSED:
            from loopy.preprocess import preprocess_program
            t_unit = preprocess_program(t_unit)

        if t_unit.state < KernelState.LINEARIZED:
            from loopy.schedule import linearize
            t_unit = linearize(t_unit)

        return t_unit

    def get_typed_and_scheduled_translation_unit(
            self, arg_to_dtype: constantdict[str, LoopyType] | None
            ) -> TranslationUnit:
        """Cached front end for
        :meth:`get_typed_and_scheduled_translation_unit_uncached`.

        The cache is only consulted and filled when ``loopy.CACHING_ENABLED``
        is true.
        """
        from loopy import CACHING_ENABLED

        cache_key = (type(self).__name__, self.t_unit, arg_to_dtype)

        if CACHING_ENABLED:
            try:
                return typed_and_scheduled_cache[cache_key]
            except KeyError:
                pass

        # Arguments are handed to the logger so that formatting only happens
        # if the record is actually emitted.
        logger.debug("%s: typed-and-scheduled cache miss",
                     self.t_unit.entrypoints)

        t_unit = self.get_typed_and_scheduled_translation_unit_uncached(
            arg_to_dtype)

        if CACHING_ENABLED:
            typed_and_scheduled_cache.store_if_not_present(cache_key, t_unit)

        return t_unit

    def arg_to_dtype(self, kwargs) -> constantdict[str, LoopyType] | None:
        """Collect dtypes for arguments whose type is only known at call time.

        :arg kwargs: mapping from argument name to the value passed for it
        :returns: *None* if no argument is runtime-typed; otherwise a mapping
            from argument name to :class:`NumpyType` for each passed value
            that has a ``dtype`` attribute and whose argument has no dtype yet
        """
        if not self.has_runtime_typed_args:
            return None

        arg_dict = self.separated_entry_knl.arg_dict

        arg_to_dtype = {}
        for arg_name, val in kwargs.items():
            arg = arg_dict[arg_name]

            if arg.dtype is None and val is not None:
                try:
                    dtype = val.dtype
                except AttributeError:
                    # Value carries no dtype: leave this argument untyped here.
                    pass
                else:
                    arg_to_dtype[arg_name] = NumpyType(dtype)

        return constantdict(arg_to_dtype)

    # {{{ debugging aids

    def get_highlighted_code(self, entrypoint, arg_to_dtype=None, code=None):
        """Return *code* run through the module-level highlighter; if *code*
        is *None*, it is first obtained from :meth:`get_code`.
        """
        if code is None:
            code = self.get_code(entrypoint, arg_to_dtype)
        return get_highlighted_code(code)

    def get_code(
            self, entrypoint: str,
            arg_to_dtype: constantdict[str, LoopyType] | None = None) -> str:
        """Return the device code generated for the typed and scheduled
        translation unit.

        Note that *entrypoint* is accepted but not used by this
        implementation.
        """
        kernel = self.get_typed_and_scheduled_translation_unit(arg_to_dtype)

        from loopy.codegen import generate_code_v2
        code = generate_code_v2(kernel)
        return code.device_code()

    def get_invoker_uncached(self, program, entrypoint, *args):
        """Build the invoker without caching. Must be provided by
        subclasses; this base version always raises
        :exc:`NotImplementedError`.
        """
        raise NotImplementedError()

    def get_invoker(self, t_unit, entrypoint, *args):
        """Cached front end for :meth:`get_invoker_uncached`.

        The cache is only consulted and filled when ``loopy.CACHING_ENABLED``
        is true. *args* are forwarded to the uncached builder but are not
        part of the cache key.
        """
        from loopy import CACHING_ENABLED

        # NOTE(review): this key nests (t_unit, entrypoint) in an inner tuple,
        # whereas the annotation on invoker_cache declares a flat
        # (str, TranslationUnit, str) key. Left unchanged so that existing
        # cache entries stay reachable -- confirm which shape is intended.
        cache_key = (self.__class__.__name__, (t_unit, entrypoint))

        if CACHING_ENABLED:
            try:
                return invoker_cache[cache_key]
            except KeyError:
                pass

        logger.debug("%s: invoker cache miss", entrypoint)

        invoker = self.get_invoker_uncached(t_unit, entrypoint, *args)

        if CACHING_ENABLED:
            invoker_cache.store_if_not_present(cache_key, invoker)

        return invoker

    # }}}

    # {{{ call and info generator

    def __call__(self, queue, **kwargs):
        """Execute the entrypoint. Must be provided by subclasses; this base
        version always raises :exc:`NotImplementedError`.
        """
        raise NotImplementedError()

    # }}}
# }}}
# {{{ code highlighters
def get_highlighted_code(text, python=False):
    """Return *text* with terminal syntax highlighting applied.

    If :mod:`pygments` cannot be imported, *text* is returned unchanged.

    :arg text: the source code to highlight
    :arg python: if true, use the Python lexer; otherwise use the C lexer
    """
    try:
        from pygments import highlight
    except ImportError:
        # Highlighting is optional: fall back to the plain text.
        return text

    from pygments.formatters import TerminalFormatter
    from pygments.lexers import CLexer, PythonLexer

    lexer = PythonLexer() if python else CLexer()
    return highlight(text, lexer, TerminalFormatter())
def get_highlighted_python_code(text):
    """Return *text* highlighted as Python source.

    Shorthand for :func:`get_highlighted_code` with the Python lexer
    selected.
    """
    return get_highlighted_code(text, python=True)
# }}}
# vim: foldmethod=marker
"""Target for Intel ISPC.""" """Target for Intel ISPC."""
from __future__ import annotations
from __future__ import division, absolute_import
__copyright__ = "Copyright (C) 2015 Andreas Kloeckner" __copyright__ = "Copyright (C) 2015 Andreas Kloeckner"
...@@ -24,19 +24,79 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ...@@ -24,19 +24,79 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. THE SOFTWARE.
""" """
import operator
from functools import reduce
from typing import TYPE_CHECKING, Iterable, Sequence, cast
import numpy as np
from typing_extensions import Never
import numpy as np # noqa
from loopy.target.c import CTarget, CASTBuilder
from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper
from loopy.diagnostic import LoopyError
from loopy.symbolic import Literal
from pymbolic import var
import pymbolic.primitives as p import pymbolic.primitives as p
from loopy.kernel.data import temp_var_scope from cgen import Collection, Const, Declarator, Generable
from pymbolic import var
from pymbolic.mapper.stringifier import PREC_NONE from pymbolic.mapper.stringifier import PREC_NONE
from pymbolic.mapper.substitutor import make_subst_func
from pytools import memoize_method from pytools import memoize_method
from loopy.diagnostic import LoopyError
from loopy.kernel.data import AddressSpace, ArrayArg, LocalInameTag, TemporaryVariable
from loopy.symbolic import (
CoefficientCollector,
CombineMapper,
GroupHardwareAxisIndex,
Literal,
LocalHardwareAxisIndex,
SubstitutionMapper,
flatten,
)
from loopy.target.c import CFamilyASTBuilder, CFamilyTarget
from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper
if TYPE_CHECKING:
from loopy.codegen import CodeGenerationState
from loopy.codegen.result import CodeGenerationResult
from loopy.kernel import LoopKernel
from loopy.kernel.instruction import Assignment
from loopy.schedule import CallKernel
from loopy.types import LoopyType
from loopy.typing import Expression
class IsVaryingMapper(CombineMapper[bool, []]):
    """Map an expression to a boolean 'is varying' flag.

    A variable counts as varying if it names an iname of the kernel that
    carries a :class:`LocalInameTag`; constants never do. Results of
    subexpressions are combined with logical *or*.
    """
    # FIXME: Update this if/when ispc reduction support is added.

    def __init__(self, kernel: LoopKernel) -> None:
        """
        :arg kernel: the kernel whose inames are consulted
        """
        self.kernel = kernel
        super().__init__()

    def combine(self, values: Iterable[bool]) -> bool:
        """Return the *or* of all *values* (*False* if there are none)."""
        is_varying = False
        for value in values:
            is_varying = is_varying | value
        return is_varying

    def map_constant(self, expr):
        """Constants are never varying."""
        return False

    def map_group_hw_index(self, expr: GroupHardwareAxisIndex) -> Never:
        # Hardware-axis indices only exist for a brief blip in time inside
        # the expr-to-cexpr mapper. We should never see them.
        raise AssertionError()

    def map_local_hw_index(self, expr: LocalHardwareAxisIndex) -> Never:
        # Hardware-axis indices only exist for a brief blip in time inside
        # the expr-to-cexpr mapper. We should never see them.
        raise AssertionError()

    def map_variable(self, expr: p.Variable) -> bool:
        """Return *True* iff *expr* names an iname carrying a local tag."""
        iname = self.kernel.inames.get(expr.name)
        if iname is None:
            # Not an iname of this kernel.
            return False

        ltags = iname.tags_of_type(LocalInameTag)
        if not ltags:
            return False

        # Exactly one local tag is expected, and it must be on axis 0.
        ltag, = ltags
        assert ltag.axis == 0
        return True
# {{{ expression mapper # {{{ expression mapper
...@@ -69,10 +129,10 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper): ...@@ -69,10 +129,10 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper):
elif type_context == "d": elif type_context == "d":
# Keepin' the good ideas flowin' since '66. # Keepin' the good ideas flowin' since '66.
return Literal(repr(float(expr))+"d") return Literal(repr(float(expr))+"d")
elif type_context == "i": elif type_context in ["i", "b"]:
return expr return expr
else: else:
from loopy.tools import is_integer from loopy.typing import is_integer
if is_integer(expr): if is_integer(expr):
return expr return expr
...@@ -82,19 +142,19 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper): ...@@ -82,19 +142,19 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper):
def map_variable(self, expr, type_context): def map_variable(self, expr, type_context):
tv = self.kernel.temporary_variables.get(expr.name) tv = self.kernel.temporary_variables.get(expr.name)
if tv is not None and tv.scope == temp_var_scope.PRIVATE: if tv is not None and tv.address_space == AddressSpace.PRIVATE:
# FIXME: This is a pretty coarse way of deciding what # FIXME: This is a pretty coarse way of deciding what
# private temporaries get duplicated. Refine? (See also # private temporaries get duplicated. Refine? (See also
# below in decl generation) # below in decl generation)
gsize, lsize = self.kernel.get_grid_size_upper_bounds_as_exprs() _gsize, lsize = self.kernel.get_grid_size_upper_bounds_as_exprs(
self.codegen_state.callables_table)
if lsize: if lsize:
return expr[var("programIndex")] return expr[var("programIndex")]
else: else:
return expr return expr
else: else:
return super(ExprToISPCExprMapper, self).map_variable( return super().map_variable(expr, type_context)
expr, type_context)
def map_subscript(self, expr, type_context): def map_subscript(self, expr, type_context):
from loopy.kernel.data import TemporaryVariable from loopy.kernel.data import TemporaryVariable
...@@ -102,22 +162,23 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper): ...@@ -102,22 +162,23 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper):
ary = self.find_array(expr) ary = self.find_array(expr)
if (isinstance(ary, TemporaryVariable) if (isinstance(ary, TemporaryVariable)
and ary.scope == temp_var_scope.PRIVATE): and ary.address_space == AddressSpace.PRIVATE):
# generate access code for acccess to private-index temporaries # generate access code for access to private-index temporaries
gsize, lsize = self.kernel.get_grid_size_upper_bounds_as_exprs() _gsize, lsize = self.kernel.get_grid_size_upper_bounds_as_exprs()
if lsize: if lsize:
lsize, = lsize lsize, = lsize
from loopy.kernel.array import get_access_info
from pymbolic import evaluate from pymbolic import evaluate
access_info = get_access_info(self.kernel.target, ary, expr.index, from loopy.kernel.array import get_access_info
access_info = get_access_info(self.kernel, ary, expr.index,
lambda expr: evaluate(expr, self.codegen_state.var_subst_map), lambda expr: evaluate(expr, self.codegen_state.var_subst_map),
self.codegen_state.vectorization_info) self.codegen_state.vectorization_info)
subscript, = access_info.subscripts subscript, = access_info.subscripts
result = var(access_info.array_name)[ result = var(access_info.array_name)[
var("programIndex") + self.rec(lsize*subscript, 'i')] var("programIndex") + self.rec(lsize*subscript, "i")]
if access_info.vector_index is not None: if access_info.vector_index is not None:
return self.kernel.target.add_vector_access( return self.kernel.target.add_vector_access(
...@@ -125,16 +186,39 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper): ...@@ -125,16 +186,39 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper):
else: else:
return result return result
return super(ExprToISPCExprMapper, self).map_subscript( return super().map_subscript(
expr, type_context) expr, type_context)
def wrap_in_typecast(self, actual_type: LoopyType, needed_type: LoopyType, s):
raise NotImplementedError("wrap_in_typecast needs uniform-ness information "
"for ispc")
def rec(self, expr, type_context=None, needed_type: LoopyType | None = None): # type: ignore[override]
result = super().rec(expr, type_context)
if needed_type is None:
return result
else:
actual_type = self.infer_type(expr)
if actual_type != needed_type:
# FIXME: problematic: potential quadratic complexity
is_varying = IsVaryingMapper(self.kernel)(expr)
registry = self.codegen_state.ast_builder.target.get_dtype_registry()
cast = var("("
f"{'varying' if is_varying else 'uniform'} "
f"{registry.dtype_to_ctype(needed_type)}"
") ")
return cast(result)
return result
# }}} # }}}
# {{{ type registry # {{{ type registry
def fill_registry_with_ispc_types(reg, respect_windows, include_bool=True): def fill_registry_with_ispc_types(reg, respect_windows, include_bool=True):
reg.get_or_register_dtype("bool", np.bool) reg.get_or_register_dtype("bool", bool)
reg.get_or_register_dtype(["int8", "signed char", "char"], np.int8) reg.get_or_register_dtype(["int8", "signed char", "char"], np.int8)
reg.get_or_register_dtype(["uint8", "unsigned char"], np.uint8) reg.get_or_register_dtype(["uint8", "unsigned char"], np.uint8)
...@@ -154,28 +238,20 @@ def fill_registry_with_ispc_types(reg, respect_windows, include_bool=True): ...@@ -154,28 +238,20 @@ def fill_registry_with_ispc_types(reg, respect_windows, include_bool=True):
# }}} # }}}
class ISPCTarget(CTarget): class ISPCTarget(CFamilyTarget):
"""A code generation target for Intel's `ISPC <https://ispc.github.io/>`_ """A code generation target for Intel's `ISPC <https://ispc.github.io/>`_
SPMD programming language, to target Intel's Knight's hardware and modern SPMD programming language, to target Intel's Knight's hardware and modern
Intel CPUs with wide vector units. Intel CPUs with wide vector units.
""" """
def __init__(self, occa_mode=False):
"""
:arg occa_mode: Whether to modify the generated call signature to
be compatible with OCCA
"""
self.occa_mode = occa_mode
super(ISPCTarget, self).__init__()
host_program_name_suffix = "" host_program_name_suffix = ""
device_program_name_suffix = "_inner" device_program_name_suffix = "_inner"
def pre_codegen_check(self, kernel): def pre_codegen_entrypoint_check(self, kernel, callables_table):
gsize, lsize = kernel.get_grid_size_upper_bounds_as_exprs() _gsize, lsize = kernel.get_grid_size_upper_bounds_as_exprs(
callables_table)
if len(lsize) > 1: if len(lsize) > 1:
for i, ls_i in enumerate(lsize[1:]): for ls_i in lsize[1:]:
if ls_i != 1: if ls_i != 1:
raise LoopyError("local axis %d (0-based) " raise LoopyError("local axis %d (0-based) "
"has length > 1, which is unsupported " "has length > 1, which is unsupported "
...@@ -200,47 +276,42 @@ class ISPCTarget(CTarget): ...@@ -200,47 +276,42 @@ class ISPCTarget(CTarget):
# }}} # }}}
class ISPCASTBuilder(CASTBuilder): class ISPCASTBuilder(CFamilyASTBuilder):
def _arg_names_and_decls(self, codegen_state):
implemented_data_info = codegen_state.implemented_data_info
arg_names = [iai.name for iai in implemented_data_info]
arg_decls = [
self.idi_to_cgen_declarator(codegen_state.kernel, idi)
for idi in implemented_data_info]
# {{{ occa compatibility hackery
from cgen import Value
if self.target.occa_mode:
from cgen import ArrayOf, Const
from cgen.ispc import ISPCUniform
arg_decls = [
Const(ISPCUniform(ArrayOf(Value("int", "loopy_dims")))),
Const(ISPCUniform(Value("int", "o1"))),
Const(ISPCUniform(Value("int", "o2"))),
Const(ISPCUniform(Value("int", "o3"))),
] + arg_decls
arg_names = ["loopy_dims", "o1", "o2", "o3"] + arg_names
# }}}
return arg_names, arg_decls
# {{{ top-level codegen # {{{ top-level codegen
def get_function_declaration(self, codegen_state, codegen_result, def get_function_declaration(
schedule_index): self, codegen_state: CodeGenerationState,
codegen_result: CodeGenerationResult, schedule_index: int
) -> tuple[Sequence[tuple[str, str]], Generable]:
name = codegen_result.current_program(codegen_state).name name = codegen_result.current_program(codegen_state).name
kernel = codegen_state.kernel
from cgen import (FunctionDeclaration, Value) assert codegen_state.kernel.linearization is not None
subkernel_name = cast(
"CallKernel",
codegen_state.kernel.linearization[schedule_index]
).kernel_name
from cgen import FunctionDeclaration, Value
from cgen.ispc import ISPCExport, ISPCTask from cgen.ispc import ISPCExport, ISPCTask
arg_names, arg_decls = self._arg_names_and_decls(codegen_state) if codegen_state.is_entrypoint:
# subkernel launches occur only as part of entrypoint kernels for now
from loopy.schedule.tools import get_subkernel_arg_info
skai = get_subkernel_arg_info(codegen_state.kernel, subkernel_name)
passed_names = skai.passed_names
written_names = skai.written_names
else:
passed_names = [arg.name for arg in kernel.args]
written_names = kernel.get_written_variables()
arg_decls = [self.arg_to_cgen_declarator(
kernel, arg_name,
is_written=arg_name in written_names)
for arg_name in passed_names]
if codegen_state.is_generating_device_code: if codegen_state.is_generating_device_code:
result = ISPCTask( result: Declarator = ISPCTask(
FunctionDeclaration( FunctionDeclaration(
Value("void", name), Value("void", name),
arg_decls)) arg_decls))
...@@ -251,35 +322,45 @@ class ISPCASTBuilder(CASTBuilder): ...@@ -251,35 +322,45 @@ class ISPCASTBuilder(CASTBuilder):
arg_decls)) arg_decls))
from loopy.target.c import FunctionDeclarationWrapper from loopy.target.c import FunctionDeclarationWrapper
return FunctionDeclarationWrapper(result) return [], FunctionDeclarationWrapper(result)
# }}} def get_kernel_call(self, codegen_state: CodeGenerationState,
subkernel_name: str,
def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args): gsize: tuple[Expression, ...],
lsize: tuple[Expression, ...]) -> Generable:
kernel = codegen_state.kernel
ecm = self.get_expression_to_code_mapper(codegen_state) ecm = self.get_expression_to_code_mapper(codegen_state)
from pymbolic.mapper.stringifier import PREC_NONE from pymbolic.mapper.stringifier import PREC_NONE
result = [] result = []
from cgen import Statement as S, Block from cgen import Block, Statement as S
if lsize: if lsize:
result.append( result.append(
S( S(
"assert(programCount == (%s))" "assert(programCount == (%s))"
% ecm(lsize[0], PREC_NONE))) % ecm(lsize[0], PREC_NONE)))
arg_names, arg_decls = self._arg_names_and_decls(codegen_state) if codegen_state.is_entrypoint:
# subkernel launches occur only as part of entrypoint kernels for now
from loopy.schedule.tools import get_subkernel_arg_info
skai = get_subkernel_arg_info(codegen_state.kernel, subkernel_name)
passed_names = skai.passed_names
else:
passed_names = [arg.name for arg in kernel.args]
from cgen.ispc import ISPCLaunch from cgen.ispc import ISPCLaunch
result.append( result.append(
ISPCLaunch( ISPCLaunch(
tuple(ecm(gs_i, PREC_NONE) for gs_i in gsize), tuple(ecm(gs_i, PREC_NONE) for gs_i in gsize),
"%s(%s)" % ( "{}({})".format(
name, subkernel_name,
", ".join(arg_names) ", ".join(passed_names)
))) )))
return Block(result) return Block(result)
# }}}
# {{{ code generation guts # {{{ code generation guts
def get_expression_to_c_expression_mapper(self, codegen_state): def get_expression_to_c_expression_mapper(self, codegen_state):
...@@ -288,31 +369,57 @@ class ISPCASTBuilder(CASTBuilder): ...@@ -288,31 +369,57 @@ class ISPCASTBuilder(CASTBuilder):
def add_vector_access(self, access_expr, index): def add_vector_access(self, access_expr, index):
return access_expr[index] return access_expr[index]
def emit_barrier(self, kind, comment): def emit_barrier(self, synchronization_kind, mem_kind, comment):
from cgen import Comment, Statement from cgen import Comment, Statement
assert comment assert comment
if kind == "local": if synchronization_kind == "local":
return Comment("local barrier: %s" % comment) return Comment("local barrier: %s" % comment)
elif kind == "global": elif synchronization_kind == "global":
return Statement("sync; /* %s */" % comment) return Statement("sync; /* %s */" % comment)
else: else:
raise LoopyError("unknown barrier kind") raise LoopyError("unknown barrier kind")
def get_temporary_decl(self, codegen_state, sched_index, temp_var, decl_info): # }}}
from loopy.target.c import POD # uses the correct complex type
temp_var_decl = POD(self, decl_info.dtype, decl_info.name) # {{{ declarators
shape = decl_info.shape def get_value_arg_declarator(
self, name: str, dtype: LoopyType, is_written: bool) -> Declarator:
from cgen.ispc import ISPCUniform
return ISPCUniform(super().get_value_arg_declarator(
name, dtype, is_written))
if temp_var.scope == temp_var_scope.PRIVATE: def get_array_arg_declarator(
self, arg: ArrayArg, is_written: bool) -> Declarator:
# FIXME restrict?
from cgen.ispc import ISPCUniform, ISPCUniformPointer
decl: Declarator = ISPCUniform(
ISPCUniformPointer(self.get_array_base_declarator(arg)))
if not is_written:
decl = Const(decl)
return decl
def get_temporary_var_declarator(self,
codegen_state: CodeGenerationState,
temp_var: TemporaryVariable) -> Declarator:
temp_var_decl = self.get_array_base_declarator(temp_var)
shape = temp_var.shape
assert isinstance(shape, tuple)
if temp_var.address_space == AddressSpace.PRIVATE:
# FIXME: This is a pretty coarse way of deciding what # FIXME: This is a pretty coarse way of deciding what
# private temporaries get duplicated. Refine? (See also # private temporaries get duplicated. Refine? (See also
# above in expr to code mapper) # above in expr to code mapper)
_, lsize = codegen_state.kernel.get_grid_size_upper_bounds_as_exprs() _, lsize = codegen_state.kernel.get_grid_size_upper_bounds_as_exprs(
codegen_state.callables_table)
shape = lsize + shape shape = lsize + shape
if shape: if shape:
...@@ -325,44 +432,15 @@ class ISPCASTBuilder(CASTBuilder): ...@@ -325,44 +432,15 @@ class ISPCASTBuilder(CASTBuilder):
return temp_var_decl return temp_var_decl
def wrap_temporary_decl(self, decl, scope): # }}}
from cgen.ispc import ISPCUniform
return ISPCUniform(decl)
def get_global_arg_decl(self, name, shape, dtype, is_written):
from loopy.target.c import POD # uses the correct complex type
from cgen import Const
from cgen.ispc import ISPCUniformPointer, ISPCUniform
arg_decl = ISPCUniformPointer(POD(self, dtype, name))
if not is_written:
arg_decl = Const(arg_decl)
arg_decl = ISPCUniform(arg_decl)
return arg_decl
def get_value_arg_decl(self, name, shape, dtype, is_written):
result = super(ISPCASTBuilder, self).get_value_arg_decl(
name, shape, dtype, is_written)
from cgen import Reference, Const
was_const = isinstance(result, Const)
if was_const:
result = result.subdecl
if self.target.occa_mode:
result = Reference(result)
if was_const:
result = Const(result)
from cgen.ispc import ISPCUniform # {{{ emit_...
return ISPCUniform(result)
def emit_assignment(self, codegen_state, insn): def emit_assignment(
self,
codegen_state: CodeGenerationState,
insn: Assignment
):
kernel = codegen_state.kernel kernel = codegen_state.kernel
ecm = codegen_state.expression_to_code_mapper ecm = codegen_state.expression_to_code_mapper
...@@ -374,9 +452,10 @@ class ISPCASTBuilder(CASTBuilder): ...@@ -374,9 +452,10 @@ class ISPCASTBuilder(CASTBuilder):
if insn.atomicity: if insn.atomicity:
raise NotImplementedError("atomic ops in ISPC") raise NotImplementedError("atomic ops in ISPC")
from loopy.expression import dtype_to_type_context
from pymbolic.mapper.stringifier import PREC_NONE from pymbolic.mapper.stringifier import PREC_NONE
from loopy.expression import dtype_to_type_context
rhs_type_context = dtype_to_type_context(kernel.target, lhs_dtype) rhs_type_context = dtype_to_type_context(kernel.target, lhs_dtype)
rhs_code = ecm(insn.expression, prec=PREC_NONE, rhs_code = ecm(insn.expression, prec=PREC_NONE,
type_context=rhs_type_context, type_context=rhs_type_context,
...@@ -386,86 +465,69 @@ class ISPCASTBuilder(CASTBuilder): ...@@ -386,86 +465,69 @@ class ISPCASTBuilder(CASTBuilder):
# {{{ handle streaming stores # {{{ handle streaming stores
if "!streaming_store" in insn.tags: from loopy.kernel.instruction import UseStreamingStoreTag
if UseStreamingStoreTag() in insn.tags:
ary = ecm.find_array(lhs) ary = ecm.find_array(lhs)
from loopy.kernel.array import get_access_info
from pymbolic import evaluate from pymbolic import evaluate
from loopy.kernel.array import get_access_info
from loopy.symbolic import simplify_using_aff from loopy.symbolic import simplify_using_aff
if not isinstance(lhs, p.Subscript):
raise LoopyError("streaming store must have a subscript as argument")
from loopy.kernel.data import ArrayArg, TemporaryVariable
if not isinstance(ary, (ArrayArg, TemporaryVariable)):
raise LoopyError("array type not supported in ISPC: %s"
% type(ary).__name)
index_tuple = tuple( index_tuple = tuple(
simplify_using_aff(kernel, idx) for idx in lhs.index_tuple) simplify_using_aff(kernel, idx) for idx in lhs.index_tuple)
access_info = get_access_info(kernel.target, ary, index_tuple, access_info = get_access_info(kernel, ary, index_tuple,
lambda expr: evaluate(expr, self.codegen_state.var_subst_map), lambda expr: cast("int",
evaluate(expr, codegen_state.var_subst_map)),
codegen_state.vectorization_info) codegen_state.vectorization_info)
from loopy.kernel.data import GlobalArg, TemporaryVariable l0_inames = {
iname for iname in insn.within_inames
if not isinstance(ary, (GlobalArg, TemporaryVariable)): if kernel.inames[iname].tags_of_type(LocalInameTag)}
raise LoopyError("array type not supported in ISPC: %s"
% type(ary).__name)
if len(access_info.subscripts) != 1: if len(access_info.subscripts) != 1:
raise LoopyError("streaming stores must have a subscript") raise LoopyError("streaming stores must have a subscript")
subscript, = access_info.subscripts subscript, = access_info.subscripts
from pymbolic.primitives import Sum, flattened_sum, Variable if l0_inames:
if isinstance(subscript, Sum): l0_iname, = l0_inames
terms = subscript.children coeffs = CoefficientCollector([l0_iname])(subscript)
else: if coeffs[p.Variable(l0_iname)] != 1:
terms = (subscript.children,) raise ValueError("coefficient of streaming store index "
"in l.0 variable must be 1")
new_terms = []
from loopy.kernel.data import LocalIndexTag
from loopy.symbolic import get_dependencies
saw_l0 = False
for term in terms:
if (isinstance(term, Variable)
and isinstance(
kernel.iname_to_tag.get(term.name), LocalIndexTag)
and kernel.iname_to_tag.get(term.name).axis == 0):
if saw_l0:
raise LoopyError("streaming store must have stride 1 "
"in local index, got: %s" % subscript)
saw_l0 = True
continue
else:
for dep in get_dependencies(term):
if (
isinstance(
kernel.iname_to_tag.get(dep), LocalIndexTag)
and kernel.iname_to_tag.get(dep).axis == 0):
raise LoopyError("streaming store must have stride 1 "
"in local index, got: %s" % subscript)
new_terms.append(term)
if not saw_l0: subscript = flatten(
raise LoopyError("streaming store must have stride 1 in " SubstitutionMapper(make_subst_func({l0_iname: 0}))(subscript))
"local index, got: %s" % subscript) del l0_iname
if access_info.vector_index is not None: if access_info.vector_index is not None:
raise LoopyError("streaming store may not use a short-vector " raise LoopyError("streaming store may not use a short-vector "
"data type") "data type")
rhs_has_programindex = any( if (l0_inames
isinstance( and not IsVaryingMapper(codegen_state.kernel)(insn.expression)):
kernel.iname_to_tag.get(dep), LocalIndexTag) # rhs is uniform, must be cast to varying in order for streaming_store
and kernel.iname_to_tag.get(dep).axis == 0 # to perform a vector store.
for dep in get_dependencies(insn.expression)) registry = codegen_state.ast_builder.target.get_dtype_registry()
rhs_code = var("(varying "
if not rhs_has_programindex: f"{registry.dtype_to_ctype(lhs_dtype)}"
rhs_code = "broadcast(%s, 0)" % rhs_code f") ({rhs_code})")
from cgen import Statement from cgen import Statement
return Statement( return Statement(
"streaming_store(%s + %s, %s)" "streaming_store(%s + %s, %s)"
% ( % (
access_info.array_name, access_info.array_name,
ecm(flattened_sum(new_terms), PREC_NONE, 'i'), ecm(subscript, PREC_NONE, "i"),
rhs_code)) rhs_code))
# }}} # }}}
...@@ -474,17 +536,16 @@ class ISPCASTBuilder(CASTBuilder): ...@@ -474,17 +536,16 @@ class ISPCASTBuilder(CASTBuilder):
return Assign(ecm(lhs, prec=PREC_NONE, type_context=None), rhs_code) return Assign(ecm(lhs, prec=PREC_NONE, type_context=None), rhs_code)
def emit_sequential_loop(self, codegen_state, iname, iname_dtype, def emit_sequential_loop(self, codegen_state, iname, iname_dtype,
lbound, ubound, inner): lbound, ubound, inner, hints):
ecm = codegen_state.expression_to_code_mapper ecm = codegen_state.expression_to_code_mapper
from loopy.target.c import POD
from pymbolic.mapper.stringifier import PREC_NONE
from cgen import For, InlineInitializer from cgen import For, InlineInitializer
from cgen.ispc import ISPCUniform from cgen.ispc import ISPCUniform
from pymbolic.mapper.stringifier import PREC_NONE
from loopy.target.c import POD
return For( loop = For(
InlineInitializer( InlineInitializer(
ISPCUniform(POD(self, iname_dtype, iname)), ISPCUniform(POD(self, iname_dtype, iname)),
ecm(lbound, PREC_NONE, "i")), ecm(lbound, PREC_NONE, "i")),
...@@ -493,6 +554,12 @@ class ISPCASTBuilder(CASTBuilder): ...@@ -493,6 +554,12 @@ class ISPCASTBuilder(CASTBuilder):
PREC_NONE, "i"), PREC_NONE, "i"),
"++%s" % iname, "++%s" % iname,
inner) inner)
if hints:
return Collection([*list(hints), loop])
else:
return loop
# }}} # }}}
......
"""Numba targets: plain-Python (JIT) and CUDA code generation."""
from __future__ import division, absolute_import
__copyright__ = "Copyright (C) 2016 Andreas Kloeckner"
__license__ = """
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
from pytools import memoize_method
from loopy.target.python import ExpressionToPythonMapper, PythonASTBuilderBase
from loopy.target import TargetBase, DummyHostASTBuilder
from loopy.diagnostic import LoopyWarning
# {{{ base numba
def _base_numba_preamble_generator(preamble_info):
yield ("06_numba_imports", """
import numba as _lpy_numba
""")
class NumbaBaseASTBuilder(PythonASTBuilderBase):
    """Python AST builder shared by the Numba targets.

    Adds the Numba import preamble and emits function definitions whose
    decorators are supplied by :meth:`get_python_function_decorators`.
    """

    def preamble_generators(self):
        """Return the inherited preamble generators plus the Numba import."""
        inherited = super(NumbaBaseASTBuilder, self).preamble_generators()
        return inherited + [_base_numba_preamble_generator]

    def get_function_definition(self, codegen_state, codegen_result,
            schedule_index,
            function_decl, function_body):
        """Build the :class:`genpy.Function` for the current program.

        The argument list consists of the names in
        ``codegen_state.implemented_data_info``; *function_decl* must be
        *None*.
        """
        assert function_decl is None

        from genpy import Function

        program_name = codegen_result.current_program(codegen_state).name
        arg_names = [idi.name for idi in codegen_state.implemented_data_info]
        return Function(
                program_name,
                arg_names,
                function_body,
                decorators=self.get_python_function_decorators())

    def get_python_function_decorators(self):
        """Return the decorator strings for generated functions (none here)."""
        return ()

    def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args):
        """Return a statement of the form ``name[gsize, lsize](args)``."""
        from pymbolic.mapper.stringifier import PREC_NONE
        from genpy import Statement

        ecm = self.get_expression_to_code_mapper(codegen_state)
        implemented_data_info = codegen_state.implemented_data_info

        grid_code = ecm(gsize, PREC_NONE)
        block_code = ecm(lsize, PREC_NONE)
        arg_list = ", ".join(idi.name for idi in implemented_data_info)

        return Statement(
                "%s[%s, %s](%s)" % (name, grid_code, block_code, arg_list))
class NumbaJITASTBuilder(NumbaBaseASTBuilder):
    """AST builder that decorates generated functions with ``numba.jit``."""

    def get_python_function_decorators(self):
        """Return the ``jit`` decorator, spelled via the ``_lpy_numba`` alias."""
        decorators = ("@_lpy_numba.jit",)
        return decorators
class NumbaTarget(TargetBase):
    """A target for plain Python as understood by Numba, without any
    parallel extensions.
    """

    def __init__(self):
        # Instantiating the target always warns: it is known to be incomplete.
        import warnings
        warnings.warn("The Numba targets are not yet feature-complete",
                LoopyWarning, stacklevel=2)

    def split_kernel_at_global_barriers(self):
        return False

    def get_host_ast_builder(self):
        return DummyHostASTBuilder(self)

    def get_device_ast_builder(self):
        return NumbaJITASTBuilder(self)

    # {{{ types

    @memoize_method
    def get_dtype_registry(self):
        """Return a wrapped dtype registry filled with the C types."""
        from loopy.target.c import DTypeRegistryWrapper
        from loopy.target.c.compyte.dtypes import (
                DTypeRegistry, fill_registry_with_c_types)

        registry = DTypeRegistry()
        fill_registry_with_c_types(registry, respect_windows=False,
                include_bool=True)
        return DTypeRegistryWrapper(registry)

    def is_vector_dtype(self, dtype):
        # This target declares no vector dtypes.
        return False

    def get_vector_dtype(self, base, count):
        # No vector dtype exists for any (base, count) pair.
        raise KeyError()

    def get_or_register_dtype(self, names, dtype=None):
        # These kind of shouldn't be here.
        registry = self.get_dtype_registry()
        return registry.get_or_register_dtype(names, dtype)

    def dtype_to_typename(self, dtype):
        # These kind of shouldn't be here.
        registry = self.get_dtype_registry()
        return registry.dtype_to_ctype(dtype)

    # }}}
# }}}
# {{{ numba.cuda
class NumbaCudaExpressionToPythonMapper(ExpressionToPythonMapper):
    """Expression mapper that renders hardware indices as ``numba.cuda``
    ``blockIdx``/``threadIdx`` attribute accesses.
    """

    # Attribute names, indexed by hardware axis number.
    _GRID_AXES = "xyz"

    def map_group_hw_index(self, expr, enclosing_prec):
        axis_name = self._GRID_AXES[expr.axis]
        return "_lpy_ncu.blockIdx.%s" % axis_name

    def map_local_hw_index(self, expr, enclosing_prec):
        axis_name = self._GRID_AXES[expr.axis]
        return "_lpy_ncu.threadIdx.%s" % axis_name
def _cuda_numba_preamble_generator(preamble_info):
yield ("06_import_numba_cuda", """
import numba.cuda as _lpy_ncu
""")
class NumbaCudaASTBuilder(NumbaBaseASTBuilder):
    """AST builder for ``numba.cuda`` device functions."""

    def preamble_generators(self):
        """Return the inherited preamble generators plus the CUDA import."""
        inherited = super(NumbaCudaASTBuilder, self).preamble_generators()
        return inherited + [_cuda_numba_preamble_generator]

    def get_python_function_decorators(self):
        """Return the ``jit`` decorator, spelled via the ``_lpy_ncu`` alias."""
        decorators = ("@_lpy_ncu.jit",)
        return decorators

    def get_expression_to_code_mapper(self, codegen_state):
        mapper = NumbaCudaExpressionToPythonMapper(codegen_state)
        return mapper
class NumbaCudaTarget(TargetBase):
    """A target for Numba with CUDA extensions.
    """

    host_program_name_suffix = ""
    device_program_name_suffix = "_inner"

    def __init__(self):
        # Instantiating the target always warns: it is known to be incomplete.
        import warnings
        warnings.warn("The Numba target is not yet feature-complete",
                LoopyWarning, stacklevel=2)

    def split_kernel_at_global_barriers(self):
        return True

    def get_host_ast_builder(self):
        return NumbaBaseASTBuilder(self)

    def get_device_ast_builder(self):
        return NumbaCudaASTBuilder(self)

    # {{{ types

    @memoize_method
    def get_dtype_registry(self):
        """Return a wrapped dtype registry filled with the C types."""
        from loopy.target.c import DTypeRegistryWrapper
        from loopy.target.c.compyte.dtypes import (
                DTypeRegistry, fill_registry_with_c_types)

        registry = DTypeRegistry()
        fill_registry_with_c_types(registry, respect_windows=False,
                include_bool=True)
        return DTypeRegistryWrapper(registry)

    def is_vector_dtype(self, dtype):
        # This target declares no vector dtypes.
        return False

    def get_vector_dtype(self, base, count):
        # No vector dtype exists for any (base, count) pair.
        raise KeyError()

    def get_or_register_dtype(self, names, dtype=None):
        # These kind of shouldn't be here.
        registry = self.get_dtype_registry()
        return registry.get_or_register_dtype(names, dtype)

    def dtype_to_typename(self, dtype):
        # These kind of shouldn't be here.
        registry = self.get_dtype_registry()
        return registry.dtype_to_ctype(dtype)

    # }}}
# }}}
# vim: foldmethod=marker
"""OpenCL target independent of PyOpenCL.""" """OpenCL target independent of PyOpenCL."""
from __future__ import annotations
from __future__ import division, absolute_import
__copyright__ = "Copyright (C) 2015 Andreas Kloeckner" __copyright__ = "Copyright (C) 2015 Andreas Kloeckner"
...@@ -24,30 +24,59 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ...@@ -24,30 +24,59 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. THE SOFTWARE.
""" """
from typing import TYPE_CHECKING, Literal, Sequence
import numpy as np import numpy as np
from constantdict import constantdict
from loopy.target.c import CTarget, CASTBuilder from pymbolic import var
from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper
from pytools import memoize_method from pytools import memoize_method
from loopy.diagnostic import LoopyError
from loopy.diagnostic import LoopyError, LoopyTypeError
from loopy.kernel.array import ArrayBase, FixedStrideArrayDimTag, VectorArrayDimTag
from loopy.kernel.data import AddressSpace, ConstantArg, ImageArg
from loopy.kernel.function_interface import ScalarCallable
from loopy.target.c import CFamilyASTBuilder, CFamilyTarget, DTypeRegistryWrapper
from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper
from loopy.types import NumpyType from loopy.types import NumpyType
from loopy.target.c import DTypeRegistryWrapper
from loopy.kernel.data import temp_var_scope, CallMangleInfo
from pymbolic import var if TYPE_CHECKING:
from cgen import Declarator, Generable
from loopy.codegen import CodeGenerationState
from loopy.codegen.result import CodeGenerationResult
# {{{ dtype registry wrappers # {{{ dtype registry wrappers
class DTypeRegistryWrapperWithInt8ForBool(DTypeRegistryWrapper):
    """
    A dtype registry wrapper that reports the C type of ``int8`` whenever
    it is asked for the C type of a ``bool_`` :class:`loopy.types.NumpyType`.

    .. note::

        This sub-class is needed because compyte's type registry does
        not support type aliases.
    """

    def dtype_to_ctype(self, dtype):
        """Return the C type name for *dtype*, substituting int8 for bool_."""
        from loopy.types import NumpyType

        is_numpy_bool = (
                isinstance(dtype, NumpyType) and dtype.dtype == np.bool_)
        lookup_dtype = NumpyType(np.int8) if is_numpy_bool else dtype
        return self.wrapped_registry.dtype_to_ctype(lookup_dtype)
class DTypeRegistryWrapperWithAtomics(DTypeRegistryWrapper): class DTypeRegistryWrapperWithAtomics(DTypeRegistryWrapper):
def get_or_register_dtype(self, names, dtype=None): def get_or_register_dtype(self, names, dtype=None):
if dtype is not None: if dtype is not None:
from loopy.types import AtomicNumpyType, NumpyType from loopy.types import AtomicNumpyType, NumpyType
if isinstance(dtype, AtomicNumpyType): if isinstance(dtype, AtomicNumpyType):
return super(self.wrapped_registry.get_or_register_dtype( return self.wrapped_registry.get_or_register_dtype(
names, NumpyType(dtype.dtype))) names, NumpyType(dtype.dtype))
return super(DTypeRegistryWrapperWithAtomics, self).get_or_register_dtype( return self.wrapped_registry.get_or_register_dtype(names, dtype)
names, dtype)
class DTypeRegistryWrapperWithCL1Atomics(DTypeRegistryWrapperWithAtomics): class DTypeRegistryWrapperWithCL1Atomics(DTypeRegistryWrapperWithAtomics):
...@@ -57,8 +86,7 @@ class DTypeRegistryWrapperWithCL1Atomics(DTypeRegistryWrapperWithAtomics): ...@@ -57,8 +86,7 @@ class DTypeRegistryWrapperWithCL1Atomics(DTypeRegistryWrapperWithAtomics):
if isinstance(dtype, AtomicNumpyType): if isinstance(dtype, AtomicNumpyType):
return "volatile " + self.wrapped_registry.dtype_to_ctype(dtype) return "volatile " + self.wrapped_registry.dtype_to_ctype(dtype)
else: else:
return super(DTypeRegistryWrapperWithCL1Atomics, self).dtype_to_ctype( return self.wrapped_registry.dtype_to_ctype(dtype)
dtype)
# }}} # }}}
...@@ -79,16 +107,16 @@ def _create_vector_types(): ...@@ -79,16 +107,16 @@ def _create_vector_types():
counts = [2, 3, 4, 8, 16] counts = [2, 3, 4, 8, 16]
for base_name, base_type in [ for base_name, base_type in [
('char', np.int8), ("char", np.int8),
('uchar', np.uint8), ("uchar", np.uint8),
('short', np.int16), ("short", np.int16),
('ushort', np.uint16), ("ushort", np.uint16),
('int', np.int32), ("int", np.int32),
('uint', np.uint32), ("uint", np.uint32),
('long', np.int64), ("long", np.int64),
('ulong', np.uint64), ("ulong", np.uint64),
('float', np.float32), ("float", np.float32),
('double', np.float64), ("double", np.float64),
]: ]:
for count in counts: for count in counts:
name = "%s%d" % (base_name, count) name = "%s%d" % (base_name, count)
...@@ -107,10 +135,10 @@ def _create_vector_types(): ...@@ -107,10 +135,10 @@ def _create_vector_types():
titles.extend((len(names)-len(titles))*[None]) titles.extend((len(names)-len(titles))*[None])
try: try:
dtype = np.dtype(dict( dtype = np.dtype({
names=names, "names": names,
formats=[base_type]*padded_count, "formats": [base_type]*padded_count,
titles=titles)) "titles": titles})
except NotImplementedError: except NotImplementedError:
try: try:
dtype = np.dtype([((n, title), base_type) dtype = np.dtype([((n, title), base_type)
...@@ -140,96 +168,288 @@ def _register_vector_types(dtype_registry): ...@@ -140,96 +168,288 @@ def _register_vector_types(dtype_registry):
# {{{ function mangler # {{{ function mangler
_CL_SIMPLE_MULTI_ARG_FUNCTIONS = { _CL_SIMPLE_MULTI_ARG_FUNCTIONS = {
"rsqrt": 1,
"clamp": 3, "clamp": 3,
"atan2": 2, "atan2": 2,
} }
VECTOR_LITERAL_FUNCS = dict( VECTOR_LITERAL_FUNCS = {
("make_%s%d" % (name, count), (name, dtype, count)) "make_%s%d" % (name, count): (name, dtype, count)
for name, dtype in [ for name, dtype in [
('char', np.int8), ("char", np.int8),
('uchar', np.uint8), ("uchar", np.uint8),
('short', np.int16), ("short", np.int16),
('ushort', np.uint16), ("ushort", np.uint16),
('int', np.int32), ("int", np.int32),
('uint', np.uint32), ("uint", np.uint32),
('long', np.int64), ("long", np.int64),
('ulong', np.uint64), ("ulong", np.uint64),
('float', np.float32), ("float", np.float32),
('double', np.float64), ("double", np.float64),
] ]
for count in [2, 3, 4, 8, 16] for count in [2, 3, 4, 8, 16]
) }
def opencl_function_mangler(kernel, name, arg_dtypes): class OpenCLCallable(ScalarCallable):
if not isinstance(name, str): """
return None Records information about OpenCL functions which are not covered by
:class:`loopy.target.c.CMathCallable`.
"""
def with_types(self, arg_id_to_dtype, callables_table):
    """Specialize this callable for the argument types in *arg_id_to_dtype*.

    :arg arg_id_to_dtype: a mapping from argument id to dtype. Ids
        ``0, 1, ...`` are the positional arguments and ``-1`` is the
        result. An entry may be missing or *None* when that type is not
        known yet.
    :arg callables_table: handed back unchanged as the second element of
        the return value.
    :returns: a tuple ``(callable, callables_table)``. If :attr:`name` is
        not handled here, or the types known so far do not suffice,
        *callable* is a copy of *self* that only records
        *arg_id_to_dtype*. Otherwise the copy additionally carries
        ``name_in_target`` and an ``arg_id_to_dtype`` covering the result
        and every argument.
    :raises LoopyError: if *arg_id_to_dtype* has an id outside the range
        the function accepts, or the types are unsupported for
        ``min``/``max`` and the simple multi-argument functions.
    :raises LoopyTypeError: if the argument type is unsupported for the
        remaining functions (e.g. complex input, non-float ``pow``).
    """
    name = self.name

    def unspecialized():
        # Not enough type information to choose a target function yet:
        # only record what is known and leave 'name_in_target' alone.
        return (
                self.copy(arg_id_to_dtype=constantdict(arg_id_to_dtype)),
                callables_table)

    # {{{ unary functions

    if name == "abs":
        for arg_id in arg_id_to_dtype:
            if not -1 <= arg_id <= 0:
                raise LoopyError(f"'{name}' can take only one argument.")

        if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None:
            return unspecialized()

        dtype = arg_id_to_dtype[0].numpy_dtype
        if dtype.kind in ("u", "i"):
            # OpenCL C 2.2, Section 6.13.3: abs returns *u*gentype
            from loopy.types import to_unsigned_dtype
            return (self.copy(name_in_target=name,
                              arg_id_to_dtype=constantdict({
                                  0: NumpyType(dtype),
                                  -1: NumpyType(to_unsigned_dtype(dtype))
                                  })),
                    callables_table)
        elif dtype.kind == "f":
            # Floating-point abs is handled as 'fabs' by the branch below.
            name = "fabs"
        else:
            raise LoopyTypeError(f"'{name}' does not support type {dtype}")

    # deliberately not elif: abs branch above may end up taking this.
    if name in ["fabs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh",
                "tan", "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor",
                "erf", "erfc"]:
        for arg_id in arg_id_to_dtype:
            if not -1 <= arg_id <= 0:
                raise LoopyError(f"'{name}' can take only one argument.")

        if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None:
            return unspecialized()

        dtype = arg_id_to_dtype[0].numpy_dtype

        if dtype.kind in ("u", "i"):
            # ints and unsigned casted to float32
            dtype = np.float32
        elif dtype.kind == "c":
            raise LoopyTypeError(f"{name} does not support type {dtype}")

        return (
                self.copy(name_in_target=name,
                          arg_id_to_dtype=constantdict({
                              0: NumpyType(dtype),
                              -1: NumpyType(dtype)
                              })),
                callables_table)

    # }}}

    # binary functions
    elif name in ["fmax", "fmin", "atan2", "copysign"]:
        for arg_id in arg_id_to_dtype:
            if not -1 <= arg_id <= 1:
                # FIXME: Do we need to raise here?:
                #   The pattern we generally follow is that if we don't find
                #   a function, then we just return None
                raise LoopyError("%s can take only two arguments." % name)

        if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or (
                arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None):
            return unspecialized()

        dtype = np.result_type(*[
            arg_dtype.numpy_dtype
            for arg_id, arg_dtype in arg_id_to_dtype.items()
            if arg_id >= 0])

        if dtype.kind == "c":
            raise LoopyTypeError(f"'{name}' does not support complex numbers")

        dtype = NumpyType(dtype)
        return (
                self.copy(name_in_target=name,
                          arg_id_to_dtype=constantdict({
                              -1: dtype, 0: dtype, 1: dtype
                              })),
                callables_table)

    elif name in ["max", "min"]:
        for arg_id in arg_id_to_dtype:
            if not -1 <= arg_id <= 1:
                raise LoopyError("%s can take only 2 arguments." % name)

        if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype:
            return unspecialized()

        common_dtype = np.result_type(*[
            arg_dtype.numpy_dtype
            for arg_id, arg_dtype in arg_id_to_dtype.items()
            if (arg_id >= 0 and arg_dtype is not None)])

        if common_dtype.kind in ["u", "i", "f"]:
            if common_dtype.kind == "f":
                # Floating-point min/max map to fmin/fmax.
                name = "f"+name

            dtype = NumpyType(common_dtype)
            return (
                    self.copy(name_in_target=name,
                              arg_id_to_dtype=constantdict({
                                  -1: dtype, 0: dtype, 1: dtype
                                  })),
                    callables_table)
        else:
            # Unsupported type.
            raise LoopyError("%s function not supported for the types %s" %
                             (name, common_dtype))

    elif name == "dot":
        for arg_id in arg_id_to_dtype:
            if not -1 <= arg_id <= 1:
                raise LoopyError(f"'{name}' can take only 2 arguments.")

        if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or (
                arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None):
            # the types provided aren't mature enough to specialize the
            # callable
            return unspecialized()

        dtype = arg_id_to_dtype[0]
        # The result is the type of the vector's first component, "s0".
        scalar_dtype, _offset, _field_name = dtype.numpy_dtype.fields["s0"]
        return (
                self.copy(name_in_target=name, arg_id_to_dtype=constantdict({
                    -1: NumpyType(scalar_dtype), 0: dtype, 1: dtype
                    })),
                callables_table)

    elif name == "pow":
        for arg_id in arg_id_to_dtype:
            if not -1 <= arg_id <= 1:
                raise LoopyError(f"'{name}' can take only 2 arguments.")

        common_dtype = np.result_type(*[
            arg_dtype.numpy_dtype
            for arg_id, arg_dtype in arg_id_to_dtype.items()
            if (arg_id >= 0 and arg_dtype is not None)])

        if common_dtype == np.float64:
            name = "powf64"
        elif common_dtype == np.float32:
            name = "powf32"
        else:
            # Report the dtype that was actually rejected. The previous
            # message referenced a name that is not bound in this branch
            # and therefore raised NameError instead of LoopyTypeError.
            raise LoopyTypeError(
                    f"'pow' does not support type {common_dtype}.")

        result_dtype = NumpyType(common_dtype)

        # NOTE(review): the argument entries are stored as the raw numpy
        # dtype, while every other branch stores NumpyType instances --
        # confirm whether this is intentional.
        return (
                self.copy(name_in_target=name,
                          arg_id_to_dtype=constantdict({
                              -1: result_dtype,
                              0: common_dtype,
                              1: common_dtype
                              })),
                callables_table)

    elif name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS:
        num_args = _CL_SIMPLE_MULTI_ARG_FUNCTIONS[name]
        for arg_id in arg_id_to_dtype:
            if not -1 <= arg_id < num_args:
                raise LoopyError("%s can take only %d arguments." % (name,
                        num_args))

        for i in range(num_args):
            if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None:
                # the types provided aren't mature enough to specialize the
                # callable
                return unspecialized()

        dtype = np.result_type(*[
            arg_dtype.numpy_dtype
            for arg_id, arg_dtype in arg_id_to_dtype.items()
            if arg_id >= 0])

        if dtype.kind == "c":
            raise LoopyError("%s does not support complex numbers"
                    % name)

        updated_arg_id_to_dtype = constantdict({
            arg_id: NumpyType(dtype) for arg_id in range(-1, num_args)
            })

        return (
                self.copy(name_in_target=name,
                          arg_id_to_dtype=updated_arg_id_to_dtype),
                callables_table)

    elif name in VECTOR_LITERAL_FUNCS:
        base_tp_name, dtype, count = VECTOR_LITERAL_FUNCS[name]

        for arg_id in arg_id_to_dtype:
            if not -1 <= arg_id < count:
                # The accepted argument count here is 'count'; 'num_args'
                # is only bound in the branch above, so using it raised
                # NameError instead of the intended LoopyError.
                raise LoopyError("%s can take only %d arguments." % (name,
                        count))

        for i in range(count):
            if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None:
                # the types provided aren't mature enough to specialize the
                # callable
                return unspecialized()

        updated_arg_id_to_dtype = {
            arg_id: NumpyType(dtype) for arg_id in range(count)}
        updated_arg_id_to_dtype[-1] = OpenCLTarget().vector_dtype(
                    NumpyType(dtype), count)

        # The target "name" is a cast to the vector type, e.g. "(float4) ".
        return (
                self.copy(
                    name_in_target="(%s%d) " % (base_tp_name, count),
                    arg_id_to_dtype=constantdict(updated_arg_id_to_dtype)),
                callables_table)

    # does not satisfy any of the conditions needed for specialization.
    # hence just returning a copy of the callable.
    return unspecialized()
if (name == "abs" def get_opencl_callables():
and len(arg_dtypes) == 1 """
and arg_dtypes[0].numpy_dtype.kind == "f"): Returns an instance of :class:`InKernelCallable` if the function defined by
return CallMangleInfo( *identifier* is known in OpenCL.
target_name="fabs", """
result_dtypes=arg_dtypes, opencl_function_ids = (
arg_dtypes=arg_dtypes) {"dot", "abs",
"cos", "cosh", "sin", "sinh", "tan", "tanh",
if name in ["max", "min"] and len(arg_dtypes) == 2: "acos", "acosh", "asin", "asinh", "atan", "atanh", "atan2",
dtype = np.find_common_type( "pow", "exp", "log", "log10", "sqrt", "ceil", "floor",
[], [dtype.numpy_dtype for dtype in arg_dtypes]) "max", "min", "fmax", "fmin",
"fabs", "erf", "erfc"}
if dtype.kind == "c": | set(_CL_SIMPLE_MULTI_ARG_FUNCTIONS)
raise RuntimeError("min/max do not support complex numbers") | set(VECTOR_LITERAL_FUNCS))
if dtype.kind == "f": return {id_: OpenCLCallable(name=id_) for id_ in
name = "f" + name opencl_function_ids}
result_dtype = NumpyType(dtype)
return CallMangleInfo(
target_name=name,
result_dtypes=(result_dtype,),
arg_dtypes=2*(result_dtype,))
if name == "dot":
scalar_dtype, offset, field_name = arg_dtypes[0].numpy_dtype.fields["s0"]
return CallMangleInfo(
target_name=name,
result_dtypes=(NumpyType(scalar_dtype),),
arg_dtypes=(arg_dtypes[0],)*2)
if name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS:
num_args = _CL_SIMPLE_MULTI_ARG_FUNCTIONS[name]
if len(arg_dtypes) != num_args:
raise LoopyError("%s takes %d arguments (%d received)"
% (name, num_args, len(arg_dtypes)))
dtype = np.find_common_type(
[], [dtype.numpy_dtype for dtype in arg_dtypes])
if dtype.kind == "c":
raise LoopyError("%s does not support complex numbers"
% name)
result_dtype = NumpyType(dtype)
return CallMangleInfo(
target_name=name,
result_dtypes=(result_dtype,),
arg_dtypes=(result_dtype,)*num_args)
if name in VECTOR_LITERAL_FUNCS:
base_tp_name, dtype, count = VECTOR_LITERAL_FUNCS[name]
if count != len(arg_dtypes):
return None
return CallMangleInfo(
target_name="(%s%d) " % (base_tp_name, count),
result_dtypes=(kernel.target.vector_dtype(
NumpyType(dtype), count),),
arg_dtypes=(NumpyType(dtype),)*count)
return None
# }}} # }}}
...@@ -253,6 +473,8 @@ def opencl_symbol_mangler(kernel, name): ...@@ -253,6 +473,8 @@ def opencl_symbol_mangler(kernel, name):
return NumpyType(np.dtype(np.int32)), name return NumpyType(np.dtype(np.int32)), name
elif name.startswith("LONG_"): elif name.startswith("LONG_"):
return NumpyType(np.dtype(np.int64)), name return NumpyType(np.dtype(np.int64)), name
elif name == "HUGE_VAL":
return NumpyType(np.dtype(np.float64)), name
else: else:
return None return None
...@@ -289,12 +511,26 @@ def opencl_preamble_generator(preamble_info): ...@@ -289,12 +511,26 @@ def opencl_preamble_generator(preamble_info):
from loopy.tools import remove_common_indentation from loopy.tools import remove_common_indentation
kernel = preamble_info.kernel kernel = preamble_info.kernel
idx_ctype = kernel.target.dtype_to_typename(kernel.index_dtype)
yield ("00_declare_gid_lid", yield ("00_declare_gid_lid",
remove_common_indentation(""" remove_common_indentation(f"""
#define lid(N) ((%(idx_ctype)s) get_local_id(N)) #define lid(N) (({idx_ctype}) get_local_id(N))
#define gid(N) ((%(idx_ctype)s) get_group_id(N)) #define gid(N) (({idx_ctype}) get_group_id(N))
""" % dict(idx_ctype=kernel.target.dtype_to_typename( """))
kernel.index_dtype))))
for func in preamble_info.seen_functions:
if func.name == "pow" and func.c_name == "powf32":
yield ("08_clpowf32", """
inline float powf32(float x, float y) {
return pow(x, y);
}""")
if func.name == "pow" and func.c_name == "powf64":
yield ("08_clpowf64", """
inline double powf64(double x, double y) {
return pow(x, y);
}""")
# }}} # }}}
...@@ -302,6 +538,15 @@ def opencl_preamble_generator(preamble_info): ...@@ -302,6 +538,15 @@ def opencl_preamble_generator(preamble_info):
# {{{ expression mapper # {{{ expression mapper
class ExpressionToOpenCLCExpressionMapper(ExpressionToCExpressionMapper): class ExpressionToOpenCLCExpressionMapper(ExpressionToCExpressionMapper):
def wrap_in_typecast(self, actual_type, needed_dtype, s):
    """Convert *s* from *actual_type* to *needed_dtype*.

    A float-to-bool conversion is expressed as the comparison ``s != 0``;
    every other conversion is delegated to the superclass.
    """
    if not (needed_dtype.dtype.kind == "b"
            and actual_type.dtype.kind == "f"):
        return super().wrap_in_typecast(actual_type, needed_dtype, s)

    # CL does not perform implicit conversion from float-type to a bool.
    from pymbolic.primitives import Comparison
    return Comparison(s, "!=", 0)
def map_group_hw_index(self, expr, type_context): def map_group_hw_index(self, expr, type_context):
return var("gid")(expr.axis) return var("gid")(expr.axis)
...@@ -313,18 +558,21 @@ class ExpressionToOpenCLCExpressionMapper(ExpressionToCExpressionMapper): ...@@ -313,18 +558,21 @@ class ExpressionToOpenCLCExpressionMapper(ExpressionToCExpressionMapper):
# {{{ target # {{{ target
class OpenCLTarget(CTarget): class OpenCLTarget(CFamilyTarget):
"""A target for the OpenCL C heterogeneous compute programming language. """A target for the OpenCL C heterogeneous compute programming language.
""" """
def __init__(self, atomics_flavor=None): def __init__(self, atomics_flavor=None, use_int8_for_bool=True):
""" """
:arg atomics_flavor: one of ``"cl1"`` (C11-style atomics from OpenCL 2.0), :arg atomics_flavor: one of ``"cl1"`` (C11-style atomics from OpenCL 2.0),
``"cl1"`` (OpenCL 1.1 atomics, using bit-for-bit compare-and-swap ``"cl1"`` (OpenCL 1.1 atomics, using bit-for-bit compare-and-swap
for floating point), ``"cl1-exch"`` (OpenCL 1.1 atomics, using for floating point), ``"cl1-exch"`` (OpenCL 1.1 atomics, using
double-exchange for floating point--not yet supported). double-exchange for floating point--not yet supported).
:arg use_int8_for_bool: Size of *bool* is undefined as per
OpenCL spec, if *True* all bool_ variables would be treated
as int8's.
""" """
super(OpenCLTarget, self).__init__() super().__init__()
if atomics_flavor is None: if atomics_flavor is None:
atomics_flavor = "cl1" atomics_flavor = "cl1"
...@@ -333,6 +581,7 @@ class OpenCLTarget(CTarget): ...@@ -333,6 +581,7 @@ class OpenCLTarget(CTarget):
raise ValueError("unsupported atomics flavor: %s" % atomics_flavor) raise ValueError("unsupported atomics flavor: %s" % atomics_flavor)
self.atomics_flavor = atomics_flavor self.atomics_flavor = atomics_flavor
self.use_int8_for_bool = use_int8_for_bool
def split_kernel_at_global_barriers(self): def split_kernel_at_global_barriers(self):
return True return True
...@@ -342,8 +591,10 @@ class OpenCLTarget(CTarget): ...@@ -342,8 +591,10 @@ class OpenCLTarget(CTarget):
@memoize_method @memoize_method
def get_dtype_registry(self): def get_dtype_registry(self):
from loopy.target.c.compyte.dtypes import (DTypeRegistry, from loopy.target.c.compyte.dtypes import (
fill_registry_with_opencl_c_types) DTypeRegistry,
fill_registry_with_opencl_c_types,
)
result = DTypeRegistry() result = DTypeRegistry()
fill_registry_with_opencl_c_types(result) fill_registry_with_opencl_c_types(result)
...@@ -353,70 +604,78 @@ class OpenCLTarget(CTarget): ...@@ -353,70 +604,78 @@ class OpenCLTarget(CTarget):
_register_vector_types(result) _register_vector_types(result)
if self.atomics_flavor == "cl1": if self.atomics_flavor == "cl1":
return DTypeRegistryWrapperWithCL1Atomics(result) result = DTypeRegistryWrapperWithCL1Atomics(result)
else: else:
raise NotImplementedError("atomics flavor: %s" % self.atomics_flavor) raise NotImplementedError("atomics flavor: %s" % self.atomics_flavor)
if self.use_int8_for_bool:
result = DTypeRegistryWrapperWithInt8ForBool(result)
return result
def is_vector_dtype(self, dtype): def is_vector_dtype(self, dtype):
return (isinstance(dtype, NumpyType) return (isinstance(dtype, NumpyType)
and dtype.numpy_dtype in list(vec.types.values())) and dtype.numpy_dtype in list(vec.types.values()))
def vector_dtype(self, base, count): def vector_dtype(self, base, count):
return NumpyType( return NumpyType(vec.types[base.numpy_dtype, count])
vec.types[base.numpy_dtype, count],
target=self)
# }}}
# }}} # }}}
# {{{ ast builder # {{{ ast builder
class OpenCLCASTBuilder(CASTBuilder): class OpenCLCASTBuilder(CFamilyASTBuilder):
# {{{ library # {{{ library
def function_manglers(self): @property
return ( def known_callables(self):
super(OpenCLCASTBuilder, self).function_manglers() + [ callables = super().known_callables
opencl_function_mangler callables.update(get_opencl_callables())
]) return callables
def symbol_manglers(self): def symbol_manglers(self):
return ( return (
super(OpenCLCASTBuilder, self).symbol_manglers() + [ [*super().symbol_manglers(), opencl_symbol_mangler])
opencl_symbol_mangler
])
def preamble_generators(self): def preamble_generators(self):
from loopy.library.reduction import reduction_preamble_generator
return ( return (
super(OpenCLCASTBuilder, self).preamble_generators() + [ [*super().preamble_generators(), opencl_preamble_generator])
opencl_preamble_generator,
reduction_preamble_generator,
])
# }}} # }}}
# {{{ top-level codegen # {{{ top-level codegen
def get_function_declaration(self, codegen_state, codegen_result, def get_function_declaration(
schedule_index): self, codegen_state: CodeGenerationState,
fdecl = super(OpenCLCASTBuilder, self).get_function_declaration( codegen_result: CodeGenerationResult, schedule_index: int
) -> tuple[Sequence[tuple[str, str]], Generable]:
preambles, fdecl = super().get_function_declaration(
codegen_state, codegen_result, schedule_index) codegen_state, codegen_result, schedule_index)
from loopy.target.c import FunctionDeclarationWrapper from loopy.target.c import FunctionDeclarationWrapper
assert isinstance(fdecl, FunctionDeclarationWrapper) assert isinstance(fdecl, FunctionDeclarationWrapper)
fdecl = fdecl.subdecl if not codegen_state.is_entrypoint:
# auxiliary kernels need not mention opencl specific qualifiers
# for a functions signature
return preambles, fdecl
return preambles, FunctionDeclarationWrapper(
self._wrap_kernel_decl(codegen_state, schedule_index, fdecl.subdecl))
def _wrap_kernel_decl(
self, codegen_state: CodeGenerationState, schedule_index: int,
fdecl: Declarator) -> Declarator:
from cgen.opencl import CLKernel, CLRequiredWorkGroupSize from cgen.opencl import CLKernel, CLRequiredWorkGroupSize
fdecl = CLKernel(fdecl) fdecl = CLKernel(fdecl)
from loopy.schedule import get_insn_ids_for_block_at from loopy.schedule import get_insn_ids_for_block_at
assert codegen_state.kernel.linearization is not None
_, local_sizes = codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs( _, local_sizes = codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs(
get_insn_ids_for_block_at( get_insn_ids_for_block_at(
codegen_state.kernel.schedule, schedule_index)) codegen_state.kernel.linearization, schedule_index),
codegen_state.callables_table)
from loopy.symbolic import get_dependencies from loopy.symbolic import get_dependencies
if not get_dependencies(local_sizes): if not get_dependencies(local_sizes):
...@@ -425,12 +684,12 @@ class OpenCLCASTBuilder(CASTBuilder): ...@@ -425,12 +684,12 @@ class OpenCLCASTBuilder(CASTBuilder):
fdecl = CLRequiredWorkGroupSize(local_sizes, fdecl) fdecl = CLRequiredWorkGroupSize(local_sizes, fdecl)
return FunctionDeclarationWrapper(fdecl) return fdecl
def generate_top_of_body(self, codegen_state): def generate_top_of_body(self, codegen_state):
from loopy.kernel.data import ImageArg from loopy.kernel.data import ImageArg
if any(isinstance(arg, ImageArg) for arg in codegen_state.kernel.args): if any(isinstance(arg, ImageArg) for arg in codegen_state.kernel.args):
from cgen import Value, Const, Initializer from cgen import Const, Initializer, Value
return [ return [
Initializer(Const(Value("sampler_t", "loopy_sampler")), Initializer(Const(Value("sampler_t", "loopy_sampler")),
"CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP " "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP "
...@@ -441,8 +700,6 @@ class OpenCLCASTBuilder(CASTBuilder): ...@@ -441,8 +700,6 @@ class OpenCLCASTBuilder(CASTBuilder):
# }}} # }}}
# {{{ code generation guts
def get_expression_to_c_expression_mapper(self, codegen_state): def get_expression_to_c_expression_mapper(self, codegen_state):
return ExpressionToOpenCLCExpressionMapper(codegen_state) return ExpressionToOpenCLCExpressionMapper(codegen_state)
...@@ -450,64 +707,102 @@ class OpenCLCASTBuilder(CASTBuilder): ...@@ -450,64 +707,102 @@ class OpenCLCASTBuilder(CASTBuilder):
# The 'int' avoids an 'L' suffix for long ints. # The 'int' avoids an 'L' suffix for long ints.
return access_expr.attr("s%s" % hex(int(index))[2:]) return access_expr.attr("s%s" % hex(int(index))[2:])
def emit_barrier(self, kind, comment): def emit_barrier(self, synchronization_kind, mem_kind, comment):
""" """
:arg kind: ``"local"`` or ``"global"`` :arg kind: ``"local"`` or ``"global"``
:return: a :class:`loopy.codegen.GeneratedInstruction`. :return: a :class:`loopy.codegen.GeneratedInstruction`.
""" """
if kind == "local": if synchronization_kind == "local":
if comment: if comment:
comment = " /* %s */" % comment comment = " /* %s */" % comment
mem_kind = mem_kind.upper()
from cgen import Statement from cgen import Statement
return Statement("barrier(CLK_LOCAL_MEM_FENCE)%s" % comment) return Statement(f"barrier(CLK_{mem_kind}_MEM_FENCE){comment}")
elif kind == "global": elif synchronization_kind == "global":
raise LoopyError("OpenCL does not have global barriers") raise LoopyError("OpenCL does not have global barriers")
else: else:
raise LoopyError("unknown barrier kind") raise LoopyError("unknown barrier kind")
def wrap_temporary_decl(self, decl, scope): # {{{ declarators
if scope == temp_var_scope.LOCAL:
from cgen.opencl import CLLocal def wrap_decl_for_address_space(
self, decl: Declarator, address_space: AddressSpace) -> Declarator:
from cgen.opencl import CLGlobal, CLLocal
if address_space == AddressSpace.GLOBAL:
return CLGlobal(decl)
elif address_space == AddressSpace.LOCAL:
return CLLocal(decl) return CLLocal(decl)
elif scope == temp_var_scope.PRIVATE: elif address_space == AddressSpace.PRIVATE:
return decl return decl
else: else:
raise ValueError("unexpected temporary variable scope: %s" raise ValueError("unexpected temporary variable address space: %s"
% scope) % address_space)
def wrap_global_constant(self, decl: Declarator) -> Declarator:
from cgen.opencl import CLConstant, CLGlobal
assert isinstance(decl, CLGlobal)
decl = decl.subdecl
def wrap_global_constant(self, decl):
from cgen.opencl import CLConstant
return CLConstant(decl) return CLConstant(decl)
def get_global_arg_decl(self, name, shape, dtype, is_written): # duplicated in CUDA, update there if updating here
from cgen.opencl import CLGlobal def get_array_base_declarator(self, ary: ArrayBase) -> Declarator:
dtype = ary.dtype
return CLGlobal(super(OpenCLCASTBuilder, self).get_global_arg_decl( vec_size = ary.vector_length()
name, shape, dtype, is_written)) if vec_size > 1:
dtype = self.target.vector_dtype(dtype, vec_size)
def get_image_arg_decl(self, name, shape, num_target_axes, dtype, is_written): if ary.dim_tags:
if is_written: for dim_tag in ary.dim_tags:
mode = "w" if isinstance(dim_tag, (FixedStrideArrayDimTag, VectorArrayDimTag)):
else: # we're OK with those
mode = "r" pass
from cgen.opencl import CLImage else:
return CLImage(num_target_axes, mode, name) raise NotImplementedError(
f"{type(self).__name__} does not understand axis tag "
f"'{type(dim_tag)}.")
def get_constant_arg_decl(self, name, shape, dtype, is_written): from loopy.target.c import POD
from loopy.target.c import POD # uses the correct complex type return POD(self, dtype, ary.name)
from cgen import RestrictPointer, Const
def get_constant_arg_declarator(self, arg: ConstantArg) -> Declarator:
from cgen import RestrictPointer
from cgen.opencl import CLConstant from cgen.opencl import CLConstant
arg_decl = RestrictPointer(POD(self, dtype, name)) # constant *is* an address space as far as CL is concerned, do not re-wrap
return CLConstant(RestrictPointer(self.get_array_base_declarator(
arg)))
def get_image_arg_declarator(
self, arg: ImageArg, is_written: bool) -> Declarator:
from cgen.opencl import CLImage
mode: Literal["r", "w"] = "w" if is_written else "r"
return CLImage(arg.num_target_axes(), mode, arg.name)
# }}}
if not is_written: # {{{ atomics
arg_decl = Const(arg_decl)
return CLConstant(arg_decl) def emit_atomic_init(self, codegen_state, lhs_atomicity, lhs_var,
lhs_expr, rhs_expr, lhs_dtype, rhs_type_context):
# for the CL1 flavor, this is as simple as a regular update with whatever
# the RHS value is...
# {{{ code generation for atomic update return self.emit_atomic_update(codegen_state, lhs_atomicity, lhs_var,
lhs_expr, rhs_expr, lhs_dtype, rhs_type_context)
def emit_unroll_hint(self, value):
# See https://man.opencl.org/attributes-loopUnroll.html
from cgen import Line
if value:
return Line(f"__attribute__((opencl_unroll_hint({value})))")
else:
return Line("__attribute__((opencl_unroll_hint))")
def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var, def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var,
lhs_expr, rhs_expr, lhs_dtype, rhs_type_context): lhs_expr, rhs_expr, lhs_dtype, rhs_type_context):
...@@ -518,22 +813,26 @@ class OpenCLCASTBuilder(CASTBuilder): ...@@ -518,22 +813,26 @@ class OpenCLCASTBuilder(CASTBuilder):
if isinstance(lhs_dtype, NumpyType) and lhs_dtype.numpy_dtype in [ if isinstance(lhs_dtype, NumpyType) and lhs_dtype.numpy_dtype in [
np.int32, np.int64, np.float32, np.float64]: np.int32, np.int64, np.float32, np.float64]:
from cgen import Block, DoWhile, Assign from cgen import Assign, Block, DoWhile
from loopy.target.c import POD from loopy.target.c import POD
old_val_var = codegen_state.var_name_generator("loopy_old_val") old_val_var = codegen_state.var_name_generator("loopy_old_val")
new_val_var = codegen_state.var_name_generator("loopy_new_val") new_val_var = codegen_state.var_name_generator("loopy_new_val")
from loopy.kernel.data import TemporaryVariable, temp_var_scope from loopy.kernel.data import AddressSpace, TemporaryVariable
ecm = codegen_state.expression_to_code_mapper.with_assignments( ecm = codegen_state.expression_to_code_mapper.with_assignments(
{ {
old_val_var: TemporaryVariable(old_val_var, lhs_dtype), old_val_var: TemporaryVariable(old_val_var, lhs_dtype,
new_val_var: TemporaryVariable(new_val_var, lhs_dtype), shape=()),
new_val_var: TemporaryVariable(new_val_var, lhs_dtype,
shape=()),
}) })
lhs_expr_code = ecm(lhs_expr, prec=PREC_NONE, type_context=None) lhs_expr_code = ecm(lhs_expr, prec=PREC_NONE, type_context=None)
from pymbolic.mapper.substitutor import make_subst_func
from pymbolic import var from pymbolic import var
from pymbolic.mapper.substitutor import make_subst_func
from loopy.symbolic import SubstitutionMapper from loopy.symbolic import SubstitutionMapper
subst = SubstitutionMapper( subst = SubstitutionMapper(
...@@ -559,32 +858,40 @@ class OpenCLCASTBuilder(CASTBuilder): ...@@ -559,32 +858,40 @@ class OpenCLCASTBuilder(CASTBuilder):
elif lhs_dtype.numpy_dtype == np.float64: elif lhs_dtype.numpy_dtype == np.float64:
ctype = "long" ctype = "long"
else: else:
assert False raise AssertionError()
from loopy.kernel.data import TemporaryVariable, GlobalArg from loopy.kernel.data import ArrayArg, TemporaryVariable
if isinstance(lhs_var, GlobalArg): if (
isinstance(lhs_var, ArrayArg)
and
lhs_var.address_space == AddressSpace.GLOBAL):
var_kind = "__global" var_kind = "__global"
elif (
isinstance(lhs_var, ArrayArg)
and
lhs_var.address_space == AddressSpace.LOCAL):
var_kind = "__local"
elif ( elif (
isinstance(lhs_var, TemporaryVariable) isinstance(lhs_var, TemporaryVariable)
and lhs_var.scope == temp_var_scope.LOCAL): and lhs_var.address_space == AddressSpace.LOCAL):
var_kind = "__local" var_kind = "__local"
elif ( elif (
isinstance(lhs_var, TemporaryVariable) isinstance(lhs_var, TemporaryVariable)
and lhs_var.scope == temp_var_scope.GLOBAL): and lhs_var.address_space == AddressSpace.GLOBAL):
var_kind = "__global" var_kind = "__global"
else: else:
raise LoopyError("unexpected kind of variable '%s' in " raise LoopyError("unexpected kind of variable '%s' in "
"atomic operation: " "atomic operation: '%s'"
% (lhs_var.name, type(lhs_var).__name__)) % (lhs_var.name, type(lhs_var).__name__))
old_val = "*(%s *) &" % ctype + old_val old_val = "*(%s *) &" % ctype + old_val
new_val = "*(%s *) &" % ctype + new_val new_val = "*(%s *) &" % ctype + new_val
cast_str = "(%s %s *) " % (var_kind, ctype) cast_str = f"({var_kind} {ctype} *) "
return Block([ return Block([
POD(self, NumpyType(lhs_dtype.dtype, target=self.target), POD(self, NumpyType(lhs_dtype.dtype),
old_val_var), old_val_var),
POD(self, NumpyType(lhs_dtype.dtype, target=self.target), POD(self, NumpyType(lhs_dtype.dtype),
new_val_var), new_val_var),
DoWhile( DoWhile(
"%(func_name)s(" "%(func_name)s("
...@@ -610,7 +917,44 @@ class OpenCLCASTBuilder(CASTBuilder): ...@@ -610,7 +917,44 @@ class OpenCLCASTBuilder(CASTBuilder):
# }}} # }}}
# }}} # }}}
# {{{ volatile mem access target
class VolatileMemExpressionToOpenCLCExpressionMapper(
        ExpressionToOpenCLCExpressionMapper):
    """Expression mapper that emits array subscripts through a pointer cast
    carrying the ``volatile`` qualifier.
    """

    def make_subscript(self, array, base_expr, subscript):
        """Return *base_expr*, cast to a volatile pointer, indexed by
        *subscript*.

        The cast text has the form ``(<ctype> volatile <address space> *)``,
        where ``<ctype>`` comes from the target's dtype registry and the
        address-space qualifier is chosen from ``array.address_space``.

        :raises ValueError: if ``array.address_space`` is none of
            ``AddressSpace.GLOBAL``, ``AddressSpace.LOCAL`` or
            ``AddressSpace.PRIVATE``.
        """
        dtype_registry = (
            self.codegen_state.ast_builder.target.get_dtype_registry())

        from loopy.kernel.data import AddressSpace

        # Qualifier text (including its trailing blank) per address space;
        # private memory carries no qualifier at all.
        qualifier_table = (
            (AddressSpace.GLOBAL, "__global "),
            (AddressSpace.LOCAL, "__local "),
            (AddressSpace.PRIVATE, ""),
        )
        for candidate, text in qualifier_table:
            if array.address_space == candidate:
                qualifier = text
                break
        else:
            raise ValueError("unexpected value of address space")

        from pymbolic import var

        ctype = dtype_registry.dtype_to_ctype(array.dtype)
        volatile_cast = var(f"({ctype} volatile {qualifier} *) ")
        return volatile_cast(base_expr)[subscript]
class VolatileMemOpenCLCASTBuilder(OpenCLCASTBuilder):
    """OpenCL AST builder that hands out the volatile-access expression
    mapper instead of the regular OpenCL one.
    """

    def get_expression_to_c_expression_mapper(self, codegen_state):
        """Build the volatile-access expression mapper for *codegen_state*."""
        mapper = VolatileMemExpressionToOpenCLCExpressionMapper(codegen_state)
        return mapper
class VolatileMemOpenCLTarget(OpenCLTarget):
    """OpenCL target whose device code is generated by
    :class:`VolatileMemOpenCLCASTBuilder`.
    """

    def get_device_ast_builder(self):
        """Return a fresh :class:`VolatileMemOpenCLCASTBuilder` bound to
        this target.
        """
        builder = VolatileMemOpenCLCASTBuilder(self)
        return builder
# }}} # }}}
......
"""OpenCL target integrated with PyOpenCL.""" from __future__ import annotations
from __future__ import division, absolute_import
import sys """OpenCL target integrated with PyOpenCL."""
__copyright__ = "Copyright (C) 2015 Andreas Kloeckner" __copyright__ = "Copyright (C) 2015 Andreas Kloeckner"
...@@ -26,206 +25,166 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ...@@ -26,206 +25,166 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. THE SOFTWARE.
""" """
import six import logging
from six.moves import range from typing import TYPE_CHECKING, Any, Sequence, cast
from warnings import warn
import numpy as np import numpy as np
from constantdict import constantdict
from loopy.kernel.data import CallMangleInfo
from loopy.target.opencl import OpenCLTarget, OpenCLCASTBuilder import pymbolic.primitives as p
from cgen import (
Block,
Collection,
Const,
Declarator,
FunctionBody,
Generable,
Initializer,
Line,
Pointer,
)
from cgen.opencl import CLGlobal
from loopy.diagnostic import LoopyError, LoopyTypeError
from loopy.kernel.data import (
ArrayArg,
ConstantArg,
ImageArg,
TemporaryVariable,
ValueArg,
)
from loopy.kernel.function_interface import ScalarCallable
from loopy.schedule import CallKernel
from loopy.target.opencl import (
ExpressionToOpenCLCExpressionMapper,
OpenCLCASTBuilder,
OpenCLTarget,
)
from loopy.target.python import PythonASTBuilderBase from loopy.target.python import PythonASTBuilderBase
from loopy.types import NumpyType from loopy.types import NumpyType
from loopy.diagnostic import LoopyError, warn_with_kernel
from warnings import warn
import logging
logger = logging.getLogger(__name__)
# {{{ temp storage adjust for bank conflict logger = logging.getLogger(__name__)
def adjust_local_temp_var_storage(kernel, device): if TYPE_CHECKING:
import genpy
import pyopencl as cl import pyopencl as cl
import pyopencl.characterize as cl_char
logger.debug("%s: adjust temp var storage" % kernel.name)
new_temp_vars = {}
from loopy.kernel.data import temp_var_scope
lmem_size = cl_char.usable_local_mem_size(device)
for temp_var in six.itervalues(kernel.temporary_variables):
if temp_var.scope != temp_var_scope.LOCAL:
new_temp_vars[temp_var.name] = \
temp_var.copy(storage_shape=temp_var.shape)
continue
other_loctemp_nbytes = [
tv.nbytes
for tv in six.itervalues(kernel.temporary_variables)
if tv.scope == temp_var_scope.LOCAL
and tv.name != temp_var.name]
storage_shape = temp_var.storage_shape
if storage_shape is None:
storage_shape = temp_var.shape
storage_shape = list(storage_shape)
# sizes of all dims except the last one, which we may change
# below to avoid bank conflicts
from pytools import product
if device.local_mem_type == cl.device_local_mem_type.GLOBAL:
# FIXME: could try to avoid cache associativity disasters
new_storage_shape = storage_shape
elif device.local_mem_type == cl.device_local_mem_type.LOCAL:
min_mult = cl_char.local_memory_bank_count(device)
good_incr = None
new_storage_shape = storage_shape
min_why_not = None
for increment in range(storage_shape[-1]//2):
test_storage_shape = storage_shape[:]
test_storage_shape[-1] = test_storage_shape[-1] + increment
new_mult, why_not = cl_char.why_not_local_access_conflict_free(
device, temp_var.dtype.itemsize,
temp_var.shape, test_storage_shape)
# will choose smallest increment 'automatically'
if new_mult < min_mult:
new_lmem_use = (sum(other_loctemp_nbytes)
+ temp_var.dtype.itemsize*product(test_storage_shape))
if new_lmem_use < lmem_size:
new_storage_shape = test_storage_shape
min_mult = new_mult
min_why_not = why_not
good_incr = increment
if min_mult != 1:
from warnings import warn
from loopy.diagnostic import LoopyAdvisory
warn("could not find a conflict-free mem layout "
"for local variable '%s' "
"(currently: %dx conflict, increment: %s, reason: %s)"
% (temp_var.name, min_mult, good_incr, min_why_not),
LoopyAdvisory)
else:
from warnings import warn
warn("unknown type of local memory")
new_storage_shape = storage_shape from loopy.codegen import CodeGenerationState
from loopy.codegen.result import CodeGenerationResult
from loopy.kernel import LoopKernel
from loopy.target.pyopencl_execution import PyOpenCLExecutor
from loopy.translation_unit import FunctionIdT, TranslationUnit
from loopy.typing import Expression
new_temp_vars[temp_var.name] = temp_var.copy(storage_shape=new_storage_shape)
return kernel.copy(temporary_variables=new_temp_vars) # {{{ pyopencl function scopers
# }}} class PyOpenCLCallable(ScalarCallable):
"""
Records information about the callables which are not covered by
:class:`loopy.target.opencl.OpenCLCallable`
"""
def with_types(self, arg_id_to_dtype, callables_table):
    """Specialize this callable for the argument types in *arg_id_to_dtype*.

    :arg arg_id_to_dtype: mapping from argument id to dtype; only the ids
        ``0`` (the argument) and ``-1`` (the result) are accepted.
    :arg callables_table: passed through unchanged in the returned pair.
    :returns: a ``(callable, callables_table)`` pair.
    :raises LoopyError: if an id other than ``0`` or ``-1`` is present.
    :raises LoopyTypeError: if a complex argument is neither ``complex64``
        nor ``complex128``.
    """
    name = self.name

    # Every function handled here is unary, so only the ids -1 and 0 are
    # meaningful.
    for arg_id in arg_id_to_dtype:
        if not -1 <= arg_id <= 0:
            raise LoopyError(f"{name} can only take one argument")

    dtype = arg_id_to_dtype[0] if 0 in arg_id_to_dtype else None
    if dtype is None:
        # The argument type is not known yet: record what we were given
        # and leave the callable unspecialized.
        return (
            self.copy(arg_id_to_dtype=constantdict(arg_id_to_dtype)),
            callables_table)

    def complex_prefix():
        # Name prefix of the complex helper family matching *dtype*.
        if dtype.numpy_dtype == np.complex64:
            return "cfloat"
        if dtype.numpy_dtype == np.complex128:
            return "cdouble"
        raise LoopyTypeError(f"unexpected complex type '{dtype}'")

    def specialized(name_in_target, result_dtype):
        # Copy of this callable bound to *name_in_target*, taking *dtype*
        # and returning *result_dtype*.
        return (
            self.copy(
                name_in_target=name_in_target,
                arg_id_to_dtype=constantdict({0: dtype, -1: result_dtype})),
            callables_table)

    # Complex argument, real-valued result.
    if name in ("real", "imag", "abs") and dtype.is_complex():
        return specialized(
            f"{complex_prefix()}_{name}",
            NumpyType(np.dtype(dtype.numpy_dtype.type(0).real)))

    # Real argument to a function that is normally applied to complex
    # values: dispatch to a per-type ``_lpy_`` helper.
    if name in ("real", "imag", "conj") and not dtype.is_complex():
        return specialized(
            f"_lpy_{name}_{dtype.numpy_dtype.type.__name__}", dtype)

    # Complex argument, complex result.
    if (name in ("sqrt", "exp", "log",
                 "sin", "cos", "tan",
                 "sinh", "cosh", "tanh",
                 "conj")
            and dtype.is_complex()):
        return specialized(f"{complex_prefix()}_{name}", dtype)

    # fall back to pure OpenCL for real-valued arguments
    from loopy.target.opencl import OpenCLCallable

    fallback = OpenCLCallable(
        name,
        arg_id_to_dtype=self.arg_id_to_dtype,
        arg_id_to_descr=self.arg_id_to_descr,
        name_in_target=self.name_in_target)
    return fallback.with_types(arg_id_to_dtype, callables_table)
def generate_preambles(self, target):
name = self.name_in_target
if (name.startswith("_lpy_real")
or name.startswith("_lpy_conj")
or name.startswith("_lpy_imag")):
if name.startswith("_lpy_real") or name.startswith("_lpy_conj"):
ret = "x"
else:
ret = "0"
dtype = self.arg_id_to_dtype[-1]
ctype = target.dtype_to_typename(dtype)
# {{{ check sizes against device properties yield (f"40_{name}", f"""
static inline {ctype} {name}({ctype} x) {{
return {ret};
}}
""")
def check_sizes(kernel, device):
import loopy as lp
from loopy.diagnostic import LoopyAdvisory, LoopyError def get_pyopencl_callables():
pyopencl_ids = ["sqrt", "exp", "log", "sin", "cos", "tan", "sinh", "cosh",
if device is None: "tanh", "conj", "real", "imag", "abs"]
warn_with_kernel(kernel, "no_device_in_pre_codegen_checks", return {id_: PyOpenCLCallable(name=id_) for id_ in pyopencl_ids}
"No device parameter was passed to the PyOpenCLTarget. "
"Perhaps you want to pass a device to benefit from "
"additional checking.", LoopyAdvisory)
return
parameters = {}
for arg in kernel.args:
if isinstance(arg, lp.ValueArg) and arg.approximately is not None:
parameters[arg.name] = arg.approximately
glens, llens = kernel.get_grid_size_upper_bounds_as_exprs()
if (max(len(glens), len(llens))
> device.max_work_item_dimensions):
raise LoopyError("too many work item dimensions")
from pymbolic import evaluate
from pymbolic.mapper.evaluator import UnknownVariableError
try:
glens = evaluate(glens, parameters)
llens = evaluate(llens, parameters)
except UnknownVariableError as name:
from warnings import warn
warn("could not check axis bounds because no value "
"for variable '%s' was passed to check_kernels()"
% name, LoopyAdvisory)
else:
for i in range(len(llens)):
if llens[i] > device.max_work_item_sizes[i]:
raise LoopyError("group axis %d too big" % i)
from pytools import product
if product(llens) > device.max_work_group_size:
raise LoopyError("work group too big")
local_mem_use = kernel.local_mem_use()
from pyopencl.characterize import usable_local_mem_size
import numbers
if isinstance(local_mem_use, numbers.Integral):
if local_mem_use > usable_local_mem_size(device):
raise LoopyError("using too much local memory")
else:
warn_with_kernel(kernel, "non_constant_local_mem",
"The amount of local memory used by the kernel "
"is not a constant. This will likely cause problems.")
from loopy.kernel.data import ConstantArg
const_arg_count = sum(
1 for arg in kernel.args
if isinstance(arg, ConstantArg))
if const_arg_count > device.max_constant_args:
raise LoopyError("too many constant arguments")
# }}} # }}}
def pyopencl_function_mangler(target, name, arg_dtypes):
    """Mangle unary calls on complex arguments to their ``cfloat_``/
    ``cdouble_`` counterparts.

    :arg target: not used by this function.
    :arg name: the function identifier; only :class:`str` names are handled.
    :arg arg_dtypes: sequence of argument dtypes; only single-argument calls
        are handled.
    :returns: a :class:`CallMangleInfo` for a recognized function applied to
        a ``complex64``/``complex128`` argument, otherwise *None*.
    :raises RuntimeError: if the single argument is complex but neither
        ``complex64`` nor ``complex128``.
    """
    if len(arg_dtypes) != 1 or not isinstance(name, str):
        return None

    arg_dtype = arg_dtypes[0]
    if not arg_dtype.is_complex():
        return None

    if arg_dtype.numpy_dtype == np.complex64:
        tpname = "cfloat"
    elif arg_dtype.numpy_dtype == np.complex128:
        tpname = "cdouble"
    else:
        raise RuntimeError("unexpected complex type '%s'" % arg_dtype)

    complex_valued = ("sqrt", "exp", "log",
                      "sin", "cos", "tan",
                      "sinh", "cosh", "tanh",
                      "conj")
    real_valued = ("real", "imag", "abs")

    if name in complex_valued:
        # Result has the same complex type as the argument.
        result_dtype = arg_dtype
    elif name in real_valued:
        # Result is the real type underlying the complex argument.
        result_dtype = NumpyType(
            np.dtype(arg_dtype.numpy_dtype.type(0).real))
    else:
        return None

    return CallMangleInfo(
        target_name="%s_%s" % (tpname, name),
        result_dtypes=(result_dtype,),
        arg_dtypes=(arg_dtype,))
# {{{ preamble generator # {{{ preamble generator
def pyopencl_preamble_generator(preamble_info): def pyopencl_preamble_generator(preamble_info):
...@@ -244,29 +203,301 @@ def pyopencl_preamble_generator(preamble_info): ...@@ -244,29 +203,301 @@ def pyopencl_preamble_generator(preamble_info):
if has_double: if has_double:
yield ("10_include_complex_header", """ yield ("10_include_complex_header", """
#define PYOPENCL_DEFINE_CDOUBLE #define PYOPENCL_DEFINE_CDOUBLE
#ifndef PYOPENCL_COMPLEX_ENABLE_EXTENDED_ALIGNMENT
#define PYOPENCL_COMPLEX_ENABLE_EXTENDED_ALIGNMENT 1
#endif
#include <pyopencl-complex.h> #include <pyopencl-complex.h>
""") """)
else: else:
yield ("10_include_complex_header", """ yield ("10_include_complex_header", """
#ifndef PYOPENCL_COMPLEX_ENABLE_EXTENDED_ALIGNMENT
#define PYOPENCL_COMPLEX_ENABLE_EXTENDED_ALIGNMENT 1
#endif
#include <pyopencl-complex.h> #include <pyopencl-complex.h>
""") """)
# }}} # }}}
# {{{ pyopencl tools # {{{ expression mapper
class ExpressionToPyOpenCLCExpressionMapper(ExpressionToOpenCLCExpressionMapper):
def complex_type_name(self, dtype):
    """Return the type-name prefix used for the complex type *dtype*.

    :arg dtype: the type to look up; must be a
        :class:`loopy.types.NumpyType`.
    :returns: ``"cfloat"`` for ``complex64``, ``"cdouble"`` for
        ``complex128``.
    :raises LoopyError: if *dtype* is not a :class:`loopy.types.NumpyType`.
    :raises RuntimeError: if ``dtype.dtype`` is neither ``complex64`` nor
        ``complex128``.
    """
    from loopy.types import NumpyType

    if not isinstance(dtype, NumpyType):
        raise LoopyError("'%s' is not a complex type" % dtype)

    if dtype.dtype == np.complex64:
        return "cfloat"
    if dtype.dtype == np.complex128:
        return "cdouble"

    # Same exception type as before, but now it says what went wrong
    # instead of surfacing as a bare, message-less RuntimeError.
    raise RuntimeError(
        f"unsupported complex type '{dtype}': "
        "expected complex64 or complex128")
def wrap_in_typecast(self, actual_type, needed_type, s):
    """Wrap the expression *s* so that it has type *needed_type*.

    Conversions to a complex type are expressed as calls to
    ``<prefix>_cast`` (complex to a different complex type) or
    ``<prefix>_fromreal`` (real to complex), where ``<prefix>`` comes from
    :meth:`complex_type_name`. Every other case is delegated to the
    superclass.
    """
    source_is_complex = actual_type.is_complex()
    target_is_complex = needed_type.is_complex()

    if target_is_complex:
        if not source_is_complex:
            conversion = "fromreal"
        elif actual_type != needed_type:
            conversion = "cast"
        else:
            conversion = None

        if conversion is not None:
            prefix = self.complex_type_name(needed_type)
            return p.Variable(f"{prefix}_{conversion}")(s)

    return super().wrap_in_typecast(actual_type, needed_type, s)
def map_sum(self, expr, type_context):
    """Generate code for a sum, using the ``<prefix>_add`` family of
    helper functions when the result is complex.

    Real-valued sums, sums in integer (``"i"``) type context, and all sums
    when complex support is disabled are delegated to the superclass.
    """
    # The 'type_context == "i"' test exists because code generation for
    # subscripts also comes through here and may involve variables we know
    # nothing about (offsets and such). Running type inference on those in
    # the complex branch breaks, so that case is sent straight to the
    # superclass. (Band-aid from the original author, -AK.)
    if not self.allow_complex or type_context == "i":
        return super().map_sum(expr, type_context)

    tgt_dtype = self.infer_type(expr)
    if not tgt_dtype.is_complex():
        return super().map_sum(expr, type_context)

    if not self.kernel.options.allow_fp_reordering:
        # Reordering is not permitted: fold the terms strictly left to
        # right, picking the helper by the complexity of both operands.
        if not expr.children:
            # NOTE(review): this calls the inferred type object with 0;
            # presumably it is callable -- confirm.
            return tgt_dtype(0)

        tgt_name = self.complex_type_name(tgt_dtype)

        acc = None
        acc_is_complex = False
        for term in expr.children:
            term_is_complex = self.infer_type(term).is_complex()
            if term_is_complex:
                term_code = self.rec(term, type_context, tgt_dtype)
            else:
                term_code = self.rec(term, type_context)

            if acc is None:
                acc = term_code
            elif acc_is_complex:
                helper = "add" if term_is_complex else "addr"
                acc = p.Variable(f"{tgt_name}_{helper}")(acc, term_code)
            elif term_is_complex:
                acc = p.Variable(f"{tgt_name}_radd")(acc, term_code)
            else:
                acc = p.Sum((acc, term_code))

            acc_is_complex = acc_is_complex or term_is_complex

        return acc

    # Reordering is permitted: add up the real terms on their own and
    # combine the complex terms pairwise in a balanced tree.
    tgt_name = self.complex_type_name(tgt_dtype)

    reals = []
    complexes = []
    for term in expr.children:
        bucket = complexes if self.infer_type(term).is_complex() else reals
        bucket.append(term)

    real_sum = p.flattened_sum([self.rec(r, type_context) for r in reals])
    c_applied = [self.rec(c, type_context, tgt_dtype) for c in complexes]

    mul_name = f"{tgt_name}_mul"
    fma = p.Variable(f"{tgt_name}_fma")
    add = p.Variable(f"{tgt_name}_add")

    def is_complex_mul(node):
        # True if *node* is a call to this type's complex multiply helper.
        return (
            isinstance(node, p.Call)
            and isinstance(node.function, p.Variable)
            and node.function.name == mul_name)

    def binary_tree_add(start, end):
        if end - start == 1:
            return c_applied[start]

        mid = (start + end)//2
        lsum = binary_tree_add(start, mid)
        rsum = binary_tree_add(mid, end)

        # FMAs should ideally be recognized by the compiler, but some
        # compilers fail to do so. For example,
        #
        #     res = complex_add(c, complex_mul(a, b))
        #
        # ends up, because of the temporary produced by
        # ``complex_mul(a, b)``, looking like
        #
        #     tmp.real = a.real * b.real - a.imag * b.imag
        #     tmp.imag = a.real * b.imag + a.imag * b.real
        #     res.real = c.real + tmp.real
        #     res.imag = c.imag + tmp.imag
        #
        # clang can fuse across statements like these with
        # -ffp-contract=fast (the default for PTX codegen), yet for some
        # unknown reason fails to see the FMAs here.
        #
        # This is only needed for complex values, since only they involve
        # temporaries. For reals the generated code is
        #
        #     res = c + a * b
        #
        # and clang does produce an FMA for that.
        if is_complex_mul(lsum):
            return fma(*lsum.parameters, rsum)
        if is_complex_mul(rsum):
            return fma(*rsum.parameters, lsum)
        return add(lsum, rsum)

    complex_sum = binary_tree_add(0, len(c_applied))

    if reals:
        return p.Variable(f"{tgt_name}_radd")(real_sum, complex_sum)
    return complex_sum
def map_product(self, expr, type_context):
# I've added 'type_context == "i"' because of the following
# idiotic corner case: Code generation for subscripts comes
# through here, and it may involve variables that we know
# nothing about (offsets and such). If we fall into the allow_complex
# branch, we'll try to do type inference on these variables,
# and stuff breaks. This band-aid works around that. -AK
if not self.allow_complex or type_context == "i":
return super().map_product(expr, type_context)
tgt_dtype = self.infer_type(expr)
is_complex = tgt_dtype.is_complex()
if not is_complex:
return super().map_product(expr, type_context)
elif not self.kernel.options.allow_fp_reordering:
tgt_name = self.complex_type_name(tgt_dtype)
result = None
lhs_is_complex = False
for child in expr.children:
rhs_is_complex = self.infer_type(child).is_complex()
if rhs_is_complex:
child_val = self.rec(child, type_context, tgt_dtype)
else:
child_val = self.rec(child, type_context)
if result is None:
result = child_val
elif lhs_is_complex and rhs_is_complex:
result = p.Variable(f"{tgt_name}_mul")(result, child_val)
elif lhs_is_complex and not rhs_is_complex:
result = p.Variable(f"{tgt_name}_mulr")(result, child_val)
elif not lhs_is_complex and rhs_is_complex:
result = p.Variable(f"{tgt_name}_rmul")(result, child_val)
else:
result = p.Product((result, child_val))
lhs_is_complex = lhs_is_complex or rhs_is_complex
return result
else:
tgt_name = self.complex_type_name(tgt_dtype)
class _LegacyTypeRegistryStub(object): reals = []
"""Adapts legacy PyOpenCL type registry to be usable with PyOpenCLTarget.""" complexes = []
for child in expr.children:
if self.infer_type(child).is_complex():
complexes.append(child)
else:
reals.append(child)
def get_or_register_dtype(self, names, dtype=None): real_prd = p.flattened_product(
from pyopencl.compyte.dtypes import get_or_register_dtype [self.rec(r, type_context) for r in reals])
return get_or_register_dtype(names, dtype)
def dtype_to_ctype(self, dtype): c_applied = [self.rec(c, type_context, tgt_dtype) for c in complexes]
from pyopencl.compyte.dtypes import dtype_to_ctype
return dtype_to_ctype(dtype) def binary_tree_mul(start, end):
if start + 1 == end:
return c_applied[start]
mid = (start + end)//2
lsum = binary_tree_mul(start, mid)
rsum = binary_tree_mul(mid, end)
return p.Variable("%s_mul" % tgt_name)(lsum, rsum)
complex_prd = binary_tree_mul(0, len(complexes))
if reals:
return p.Variable("%s_rmul" % tgt_name)(real_prd, complex_prd)
else:
return complex_prd
def map_quotient(self, expr, type_context):
    """Generate code for a quotient, using ``<prefix>_divider``,
    ``<prefix>_rdivide`` or ``<prefix>_divide`` when the numerator, the
    denominator, or both are complex.

    Purely real quotients, and all quotients when complex support is
    disabled, are delegated to the superclass.
    """
    n_dtype = self.infer_type(expr.numerator).numpy_dtype
    d_dtype = self.infer_type(expr.denominator).numpy_dtype
    tgt_dtype = self.infer_type(expr)

    n_complex = n_dtype.kind == "c"
    d_complex = d_dtype.kind == "c"

    if not self.allow_complex or not (n_complex or d_complex):
        return super().map_quotient(expr, type_context)

    # Pick the helper and decide which operands are generated with the
    # complex target type.
    if n_complex and not d_complex:
        helper, num_as_tgt, den_as_tgt = "divider", True, False
    elif d_complex and not n_complex:
        helper, num_as_tgt, den_as_tgt = "rdivide", False, True
    else:
        helper, num_as_tgt, den_as_tgt = "divide", True, True

    func = p.Variable("%s_%s" % (self.complex_type_name(tgt_dtype), helper))

    if num_as_tgt:
        num_code = self.rec(expr.numerator, type_context, tgt_dtype)
    else:
        num_code = self.rec(expr.numerator, type_context)

    if den_as_tgt:
        den_code = self.rec(expr.denominator, type_context, tgt_dtype)
    else:
        den_code = self.rec(expr.denominator, type_context)

    return func(num_code, den_code)
def map_constant(self, expr, type_context):
    """Generate code for a constant, emitting ``cfloat_new``/``cdouble_new``
    calls for complex values.

    Non-complex constants are delegated to the superclass.

    :raises RuntimeError: if the constant has a ``dtype`` that is neither
        ``complex64`` nor ``complex128``.
    """
    if not isinstance(expr, (complex, np.complexfloating)):
        return super().map_constant(expr, type_context)

    try:
        dtype = expr.dtype
    except AttributeError:
        # (COMPLEX_GUESS_LOGIC) A value without a dtype made it through
        # type 'guessing' in type inference, where it was concluded
        # (search for COMPLEX_GUESS_LOGIC in loopy.type_inference) that
        # no accuracy is lost by using single precision.
        cast_type = "cfloat"
    else:
        if dtype == np.complex64:
            cast_type = "cfloat"
        elif dtype == np.complex128:
            cast_type = "cdouble"
        else:
            raise RuntimeError(
                "unsupported complex type in expression generation: %s"
                % type(expr))

    constructor = p.Variable("%s_new" % cast_type)
    real_code = self.rec(expr.real, type_context)
    imag_code = self.rec(expr.imag, type_context)
    return constructor(real_code, imag_code)
def map_power(self, expr, type_context):
    """Generate code for a power whose result is complex.

    Exponents 2, 3 and 4 are expanded into repeated multiplication of the
    base. Anything else becomes a call to ``<prefix>_powr`` (complex base,
    non-complex exponent) or ``<prefix>_pow``. Non-complex results, and all
    powers when complex support is disabled, are delegated to the
    superclass.
    """
    tgt_dtype = self.infer_type(expr)
    base_dtype = self.infer_type(expr.base)
    exponent_dtype = self.infer_type(expr.exponent)

    if not self.allow_complex or not tgt_dtype.is_complex():
        return super().map_power(expr, type_context)

    if expr.exponent in (2, 3, 4):
        # Small integer power: build base*base*... and generate code for
        # the resulting product instead.
        product = expr.base
        for _ in range(expr.exponent - 1):
            product = product * expr.base
        return self.rec(product, type_context)

    b_complex = base_dtype.is_complex()
    e_complex = exponent_dtype.is_complex()
    prefix = self.complex_type_name(tgt_dtype)

    if b_complex and not e_complex:
        base_code = self.rec(expr.base, type_context, tgt_dtype)
        exponent_code = self.rec(expr.exponent, type_context)
        return p.Variable("%s_powr" % prefix)(base_code, exponent_code)

    base_code = self.rec(expr.base, type_context, tgt_dtype)
    exponent_code = self.rec(expr.exponent, type_context, tgt_dtype)
    return p.Variable("%s_pow" % prefix)(base_code, exponent_code)
# }}} # }}}
...@@ -279,68 +510,56 @@ class PyOpenCLTarget(OpenCLTarget): ...@@ -279,68 +510,56 @@ class PyOpenCLTarget(OpenCLTarget):
warnings) and support for complex numbers. warnings) and support for complex numbers.
""" """
# FIXME make prefixes conform to naming rules
# (see Reference: Loopy's Model of a Kernel)
host_program_name_prefix = "_lpy_host_" host_program_name_prefix = "_lpy_host_"
host_program_name_suffix = "" host_program_name_suffix = ""
def __init__(self, device=None, pyopencl_module_name="_lpy_cl", # FIXME Not yet complete
atomics_flavor=None): limit_arg_size_nbytes: int | None
# This ensures the dtype registry is populated. pointer_size_nbytes: int
import pyopencl.tools # noqa
super(PyOpenCLTarget, self).__init__(
atomics_flavor=atomics_flavor)
self.device = device def __init__(
self.pyopencl_module_name = pyopencl_module_name self, device=None, *, pyopencl_module_name: str = "_lpy_cl",
atomics_flavor=None, use_int8_for_bool: bool = True,
limit_arg_size_nbytes: int | None = None,
pointer_size_nbytes: int | None = None
) -> None:
# This ensures the dtype registry is populated.
import pyopencl.tools
comparison_fields = ["device"] super().__init__(
atomics_flavor=atomics_flavor,
use_int8_for_bool=use_int8_for_bool)
def update_persistent_hash(self, key_hash, key_builder): import pyopencl.version
super(PyOpenCLTarget, self).update_persistent_hash(key_hash, key_builder) if pyopencl.version.VERSION < (2021, 2):
key_builder.rec(key_hash, getattr(self.device, "persistent_unique_id", None)) raise RuntimeError("The version of loopy you have installed "
"generates invoker code that requires PyOpenCL 2021.2 "
"or newer.")
def __getstate__(self): if device is not None:
dev_id = None warn("Passing device is deprecated, it will stop working in 2022.",
if self.device is not None: DeprecationWarning, stacklevel=2)
dev_id = self.device.persistent_unique_id
return { self.pyopencl_module_name = pyopencl_module_name
"device_id": dev_id,
"atomics_flavor": self.atomics_flavor,
"fortran_abi": self.fortran_abi,
"pyopencl_module_name": self.pyopencl_module_name,
}
def __setstate__(self, state): if pointer_size_nbytes is None:
self.atomics_flavor = state["atomics_flavor"] pointer_size_nbytes = tuple.__itemsize__
self.fortran_abi = state["fortran_abi"]
self.pyopencl_module_name = state["pyopencl_module_name"]
dev_id = state["device_id"] self.limit_arg_size_nbytes = limit_arg_size_nbytes
if dev_id is None: self.pointer_size_nbytes = pointer_size_nbytes
self.device = None
else:
import pyopencl as cl
matches = [
dev
for plat in cl.get_platforms()
for dev in plat.get_devices()
if dev.persistent_unique_id == dev_id]
if matches:
self.device = matches[0]
else:
raise LoopyError(
"cannot unpickle device '%s': not found"
% dev_id)
def preprocess(self, kernel): @property
if self.device is not None: def device(self):
kernel = adjust_local_temp_var_storage(kernel, self.device) warn("PyOpenCLTarget.device is deprecated, it will stop working in 2022.",
return kernel DeprecationWarning, stacklevel=2)
return None
def pre_codegen_check(self, kernel): # NB: Not including 'device', as that is handled specially here.
check_sizes(kernel, self.device) hash_fields = (*OpenCLTarget.hash_fields, "pyopencl_module_name")
comparison_fields = (*OpenCLTarget.comparison_fields, "pyopencl_module_name")
def get_host_ast_builder(self): def get_host_ast_builder(self):
return PyOpenCLPythonASTBuilder(self) return PyOpenCLPythonASTBuilder(self)
...@@ -351,19 +570,22 @@ class PyOpenCLTarget(OpenCLTarget): ...@@ -351,19 +570,22 @@ class PyOpenCLTarget(OpenCLTarget):
# {{{ types # {{{ types
def get_dtype_registry(self): def get_dtype_registry(self):
try: from pyopencl.compyte.dtypes import TYPE_REGISTRY
from pyopencl.compyte.dtypes import TYPE_REGISTRY result = TYPE_REGISTRY
except ImportError:
result = _LegacyTypeRegistryStub()
else:
result = TYPE_REGISTRY
from loopy.target.opencl import DTypeRegistryWrapperWithCL1Atomics from loopy.target.opencl import (
DTypeRegistryWrapperWithCL1Atomics,
DTypeRegistryWrapperWithInt8ForBool,
)
result = DTypeRegistryWrapperWithInt8ForBool(result)
if self.atomics_flavor == "cl1": if self.atomics_flavor == "cl1":
return DTypeRegistryWrapperWithCL1Atomics(result) result = DTypeRegistryWrapperWithCL1Atomics(result)
else: else:
raise NotImplementedError("atomics flavor: %s" % self.atomics_flavor) raise NotImplementedError("atomics flavor: %s" % self.atomics_flavor)
return result
def is_vector_dtype(self, dtype): def is_vector_dtype(self, dtype):
try: try:
import pyopencl.cltypes as cltypes import pyopencl.cltypes as cltypes
...@@ -383,9 +605,7 @@ class PyOpenCLTarget(OpenCLTarget): ...@@ -383,9 +605,7 @@ class PyOpenCLTarget(OpenCLTarget):
from pyopencl.array import vec from pyopencl.array import vec
vec_types = vec.types vec_types = vec.types
return NumpyType( return NumpyType(vec_types[base.numpy_dtype, count])
vec_types[base.numpy_dtype, count],
target=self)
def alignment_requirement(self, type_decl): def alignment_requirement(self, type_decl):
import struct import struct
...@@ -399,115 +619,81 @@ class PyOpenCLTarget(OpenCLTarget): ...@@ -399,115 +619,81 @@ class PyOpenCLTarget(OpenCLTarget):
# }}} # }}}
def get_kernel_executor_cache_key(self, queue, **kwargs): def get_kernel_executor_cache_key(self, queue, **kwargs):
return queue.context return (queue.context, kwargs["entrypoint"])
# type-ignore because we're making things from *args: Any more concrete,
# and mypy doesn't like it.
def get_kernel_executor(self, t_unit: TranslationUnit, # type: ignore[override]
queue_or_context: cl.CommandQueue | cl.Context,
*args: Any, entrypoint: FunctionIdT, **kwargs: Any
) -> PyOpenCLExecutor:
from pyopencl import CommandQueue
if isinstance(queue_or_context, CommandQueue):
context = queue_or_context.context
else:
context = queue_or_context
def get_kernel_executor(self, kernel, queue, **kwargs): from loopy.target.pyopencl_execution import PyOpenCLExecutor
from loopy.target.pyopencl_execution import PyOpenCLKernelExecutor return PyOpenCLExecutor(context, t_unit, entrypoint=entrypoint)
return PyOpenCLKernelExecutor(queue.context, kernel)
# }}} # }}}
# {{{ host code: value arg setup # {{{ host code: value arg setup
def generate_value_arg_setup(kernel, devices, implemented_data_info): def generate_value_arg_setup(
kernel: LoopKernel, passed_names: Sequence[str]
) -> genpy.Suite:
options = kernel.options options = kernel.options
from genpy import If, Raise, Statement as S, Suite
import loopy as lp import loopy as lp
from loopy.kernel.array import ArrayBase from loopy.kernel.array import ArrayBase
# {{{ arg counting bug handling result: list[genpy.Generable] = []
gen = result.append
# For example:
# https://github.com/pocl/pocl/issues/197
# (but Apple CPU has a similar bug)
work_around_arg_count_bug = False
warn_about_arg_count_bug = False
try:
from pyopencl.characterize import has_struct_arg_count_bug
except ImportError:
count_bug_per_dev = [False]*len(devices)
else: buf_indices_and_args = []
count_bug_per_dev = [ buf_pack_indices_and_args = []
has_struct_arg_count_bug(dev)
if dev is not None else False
for dev in devices]
if any(dev is None for dev in devices): from pyopencl.invoker import BUF_PACK_TYPECHARS
warn("{knl_name}: device not supplied to PyOpenCLTarget--"
"workarounds for broken OpenCL implementations "
"(such as those relating to complex numbers) "
"may not be enabled when needed"
.format(knl_name=kernel.name))
if any(count_bug_per_dev): def add_buf_arg(arg_idx, typechar, expr_str):
if all(count_bug_per_dev): if typechar in BUF_PACK_TYPECHARS:
work_around_arg_count_bug = True buf_pack_indices_and_args.append(arg_idx)
buf_pack_indices_and_args.append(repr(typechar.encode()))
buf_pack_indices_and_args.append(expr_str)
else: else:
warn_about_arg_count_bug = True buf_indices_and_args.append(arg_idx)
buf_indices_and_args.append(f"pack('{typechar}', {expr_str})")
# }}}
cl_arg_idx = 0
arg_idx_to_cl_arg_idx = {}
fp_arg_count = 0
from genpy import ( for arg_idx, passed_name in enumerate(passed_names):
Comment, Line, If, Raise, Assign, Statement as S, Suite) if passed_name in kernel.all_inames():
add_buf_arg(arg_idx, kernel.index_dtype.numpy_dtype.char, passed_name)
result = [] continue
gen = result.append
for arg_idx, idi in enumerate(implemented_data_info):
arg_idx_to_cl_arg_idx[arg_idx] = cl_arg_idx
if not issubclass(idi.arg_class, lp.ValueArg): var_descr = kernel.get_var_descriptor(passed_name)
assert issubclass(idi.arg_class, ArrayBase) assert var_descr.dtype is not None
# assume each of those generates exactly one... if not isinstance(var_descr, lp.ValueArg):
cl_arg_idx += 1 assert isinstance(var_descr, ArrayBase)
continue continue
gen(Comment("{{{ process %s" % idi.name))
gen(Line())
if not options.skip_arg_checks: if not options.skip_arg_checks:
gen(If("%s is None" % idi.name, gen(If(f"{passed_name} is None",
Raise('RuntimeError("input argument \'{name}\' ' Raise('RuntimeError("input argument \'{var_descr.name}\' '
'must be supplied")'.format(name=idi.name)))) 'must be supplied")')))
if idi.dtype.is_integral():
gen(Comment("cast to Python int to avoid trouble "
"with struct packing or Boost.Python"))
if sys.version_info < (3,):
py_type = "long"
else:
py_type = "int"
gen(Assign(idi.name, "%s(%s)" % (py_type, idi.name)))
gen(Line())
if idi.dtype.is_composite():
gen(S("_lpy_knl.set_arg(%d, %s)" % (cl_arg_idx, idi.name)))
cl_arg_idx += 1
elif idi.dtype.is_complex(): if var_descr.dtype.is_composite():
assert isinstance(idi.dtype, NumpyType) buf_indices_and_args.append(arg_idx)
buf_indices_and_args.append(f"{passed_name}")
dtype = idi.dtype elif var_descr.dtype.is_complex():
assert isinstance(var_descr.dtype, NumpyType)
if warn_about_arg_count_bug: dtype = var_descr.dtype
warn("{knl_name}: arguments include complex numbers, and "
"some (but not all) of the target devices mishandle "
"struct kernel arguments (hence the workaround is "
"disabled".format(
knl_name=kernel.name))
if dtype.numpy_dtype == np.complex64: if dtype.numpy_dtype == np.complex64:
arg_char = "f" arg_char = "f"
...@@ -516,77 +702,59 @@ def generate_value_arg_setup(kernel, devices, implemented_data_info): ...@@ -516,77 +702,59 @@ def generate_value_arg_setup(kernel, devices, implemented_data_info):
else: else:
raise TypeError("unexpected complex type: %s" % dtype) raise TypeError("unexpected complex type: %s" % dtype)
if (work_around_arg_count_bug buf_indices_and_args.append(arg_idx)
and dtype.numpy_dtype == np.complex128 buf_indices_and_args.append(
and fp_arg_count + 2 <= 8): f"_lpy_pack('{arg_char}{arg_char}', "
gen(Assign( f"{passed_name}.real, {passed_name}.imag)")
"_lpy_buf",
"_lpy_pack('{arg_char}', {arg_var}.real)"
.format(arg_char=arg_char, arg_var=idi.name)))
gen(S(
"_lpy_knl.set_arg({cl_arg_idx}, _lpy_buf)"
.format(cl_arg_idx=cl_arg_idx)))
cl_arg_idx += 1
gen(Assign(
"_lpy_buf",
"_lpy_pack('{arg_char}', {arg_var}.imag)"
.format(arg_char=arg_char, arg_var=idi.name)))
gen(S(
"_lpy_knl.set_arg({cl_arg_idx}, _lpy_buf)"
.format(cl_arg_idx=cl_arg_idx)))
cl_arg_idx += 1
else:
gen(Assign(
"_lpy_buf",
"_lpy_pack('{arg_char}{arg_char}', "
"{arg_var}.real, {arg_var}.imag)"
.format(arg_char=arg_char, arg_var=idi.name)))
gen(S(
"_lpy_knl.set_arg({cl_arg_idx}, _lpy_buf)"
.format(cl_arg_idx=cl_arg_idx)))
cl_arg_idx += 1
fp_arg_count += 2
elif isinstance(idi.dtype, NumpyType):
if idi.dtype.dtype.kind == "f":
fp_arg_count += 1
gen(S(
"_lpy_knl.set_arg(%d, _lpy_pack('%s', %s))"
% (cl_arg_idx, idi.dtype.dtype.char, idi.name)))
cl_arg_idx += 1 elif isinstance(var_descr.dtype, NumpyType):
add_buf_arg(arg_idx, var_descr.dtype.dtype.char, passed_name)
else: else:
raise LoopyError("do not know how to pass argument of type '%s'" raise LoopyError("do not know how to pass argument of type '%s'"
% idi.dtype) % var_descr.dtype)
for arg_kind, args_and_indices, entry_length in [
("_buf", buf_indices_and_args, 2),
("_buf_pack", buf_pack_indices_and_args, 3),
]:
assert len(args_and_indices) % entry_length == 0
if args_and_indices:
gen(S(f"_lpy_knl._set_arg{arg_kind}_multi("
f"({', '.join(str(i) for i in args_and_indices)},), "
")"))
gen(Line()) return Suite(result)
gen(Comment("}}}"))
gen(Line())
return Suite(result), arg_idx_to_cl_arg_idx, cl_arg_idx
# }}} # }}}
def generate_array_arg_setup(kernel, implemented_data_info, arg_idx_to_cl_arg_idx): def generate_array_arg_setup(
from loopy.kernel.array import ArrayBase kernel: LoopKernel, passed_names: Sequence[str],
) -> genpy.Generable:
from genpy import Statement as S, Suite from genpy import Statement as S, Suite
result = [] from loopy.kernel.array import ArrayBase
result: list[genpy.Generable] = []
gen = result.append gen = result.append
for arg_idx, arg in enumerate(implemented_data_info): cl_indices_and_args: list[int | str] = []
if not issubclass(arg.arg_class, ArrayBase): for arg_idx, passed_name in enumerate(passed_names):
if passed_name in kernel.all_inames():
continue continue
cl_arg_idx = arg_idx_to_cl_arg_idx[arg_idx] var_descr = kernel.get_var_descriptor(passed_name)
if isinstance(var_descr, ArrayBase):
cl_indices_and_args.append(arg_idx)
cl_indices_and_args.append(passed_name)
if cl_indices_and_args:
assert len(cl_indices_and_args) % 2 == 0
gen(S("_lpy_knl.set_arg(%d, %s)" % (cl_arg_idx, arg.name))) gen(S(f"_lpy_knl._set_arg_multi("
f"({', '.join(str(i) for i in cl_indices_and_args)},)"
")"))
return Suite(result) return Suite(result)
...@@ -599,82 +767,105 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase): ...@@ -599,82 +767,105 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase):
# {{{ code generation guts # {{{ code generation guts
def get_function_definition(self, codegen_state, codegen_result, def get_function_definition(
schedule_index, function_decl, function_body): self, codegen_state, codegen_result,
from loopy.kernel.data import TemporaryVariable schedule_index: int, function_decl, function_body: genpy.Generable
) -> genpy.Function:
assert schedule_index == 0
from loopy.schedule.tools import get_kernel_arg_info
kai = get_kernel_arg_info(codegen_state.kernel)
args = ( args = (
["_lpy_cl_kernels", "queue"] ["_lpy_cl_kernels", "queue", *kai.passed_arg_names,
+ [idi.name for idi in codegen_state.implemented_data_info "wait_for=None", "allocator=None"])
if not issubclass(idi.arg_class, TemporaryVariable)]
+ ["wait_for=None", "allocator=None"])
from genpy import (For, Function, Suite, Import, ImportAs, Return, from genpy import For, Function, Line, Return, Statement as S, Suite
FromImport, If, Assign, Line, Statement as S)
return Function( return Function(
codegen_result.current_program(codegen_state).name, codegen_result.current_program(codegen_state).name,
args, args,
Suite([ Suite([
FromImport("struct", ["pack as _lpy_pack"]),
ImportAs("pyopencl", "_lpy_cl"),
Import("pyopencl.tools"),
Line(),
If("allocator is None",
Assign(
"allocator",
"_lpy_cl_tools.DeferredAllocator(queue.context)")),
Line(), Line(),
] + [ ] + [
Line(), Line(),
function_body, function_body,
Line(), Line(),
] + [ ] + ([
For("_tv", "_global_temporaries", For("_tv", "_global_temporaries",
# free global temporaries # Free global temporaries.
S("_tv.release()")) # Zero-size temporaries allocate as None, tolerate that.
] + [ # https://documen.tician.de/pyopencl/tools.html#pyopencl.tools.ImmediateAllocator
S("if _tv is not None: _tv.release()"))
] if self._get_global_temporaries(codegen_state) else []
) + [
Line(), Line(),
Return("_lpy_evt"), Return("_lpy_evt"),
])) ]))
def get_function_declaration(self, codegen_state, codegen_result, def get_function_declaration(
schedule_index): self, codegen_state: CodeGenerationState,
codegen_result: CodeGenerationResult, schedule_index: int
) -> tuple[Sequence[tuple[str, str]], genpy.Generable | None]:
# no such thing in Python # no such thing in Python
return None return [], None
def get_temporary_decls(self, codegen_state, schedule_state): def _get_global_temporaries(self, codegen_state):
from genpy import Assign, Comment, Line from loopy.kernel.data import AddressSpace
def alloc_nbytes(tv):
from six.moves import reduce
from operator import mul
return tv.dtype.numpy_dtype.itemsize * reduce(mul, tv.shape, 1)
from loopy.kernel.data import temp_var_scope return sorted(
(tv for tv in codegen_state.kernel.temporary_variables.values()
global_temporaries = sorted( if tv.address_space == AddressSpace.GLOBAL),
(tv for tv in six.itervalues(codegen_state.kernel.temporary_variables)
if tv.scope == temp_var_scope.GLOBAL),
key=lambda tv: tv.name) key=lambda tv: tv.name)
def get_temporary_decls(self, codegen_state, schedule_index):
from genpy import Assign, Comment, Line
from pymbolic.mapper.stringifier import PREC_NONE from pymbolic.mapper.stringifier import PREC_NONE
ecm = self.get_expression_to_code_mapper(codegen_state) ecm = self.get_expression_to_code_mapper(codegen_state)
global_temporaries = self._get_global_temporaries(codegen_state)
if not global_temporaries: if not global_temporaries:
return [Assign("_global_temporaries", "[]"), Line()] return []
return [ allocated_var_names = []
Comment("{{{ allocate global temporaries"), code_lines = []
Line()] + [ code_lines.append(Line())
Assign(tv.name, "allocator(%s)" % code_lines.append(Comment("{{{ allocate global temporaries"))
ecm(alloc_nbytes(tv), PREC_NONE, "i")) code_lines.append(Line())
for tv in global_temporaries] + [
Assign("_global_temporaries", "[{tvs}]".format(tvs=", ".join( for tv in global_temporaries:
tv.name for tv in global_temporaries)))] + [ if not tv.base_storage:
Line(), if tv.nbytes:
Comment("}}}"), # NB: This does not prevent all zero-size allocations,
Line()] # as sizes are parametric, and allocation size
# could turn out to be zero at runtime.
nbytes_str = ecm(tv.nbytes, PREC_NONE, "i")
allocated_var_names.append(tv.name)
code_lines.append(Assign(tv.name,
f"allocator({nbytes_str})"))
else:
code_lines.append(Assign(tv.name, "None"))
code_lines.append(Assign("_global_temporaries", "[{tvs}]".format(
tvs=", ".join(tv for tv in allocated_var_names))))
code_lines.append(Line())
code_lines.append(Comment("}}}"))
code_lines.append(Line())
return code_lines
def get_kernel_call(
self, codegen_state: CodeGenerationState,
subkernel_name: str,
gsize: tuple[Expression, ...], lsize: tuple[Expression, ...]
) -> genpy.Suite:
from genpy import Assert, Assign, Comment, Line, Suite
kernel = codegen_state.kernel
from loopy.schedule.tools import get_subkernel_arg_info
skai = get_subkernel_arg_info(kernel, subkernel_name)
def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args):
ecm = self.get_expression_to_code_mapper(codegen_state) ecm = self.get_expression_to_code_mapper(codegen_state)
if not gsize: if not gsize:
...@@ -682,37 +873,100 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase): ...@@ -682,37 +873,100 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase):
if not lsize: if not lsize:
lsize = (1,) lsize = (1,)
all_args = codegen_state.implemented_data_info + extra_args assert isinstance(kernel.target, PyOpenCLTarget)
regular_arg_names, struct_overflow_arg_names = split_args_for_overflow(
kernel, skai.passed_names,
limit_arg_size_nbytes=kernel.target.limit_arg_size_nbytes,
pointer_size_nbytes=kernel.target.pointer_size_nbytes)
value_arg_code = generate_value_arg_setup(
codegen_state.kernel, regular_arg_names)
array_arg_code = generate_array_arg_setup(
codegen_state.kernel, regular_arg_names)
if struct_overflow_arg_names:
regular_arg_names_set = frozenset(regular_arg_names)
struct_overflow_arg_names_set = frozenset(
struct_overflow_arg_names)
py_passed_args = []
struct_pack_types: list[str] = []
struct_pack_args = []
for arg_name in skai.passed_names:
if arg_name in regular_arg_names_set:
py_passed_args.append(arg_name)
else:
assert arg_name in struct_overflow_arg_names_set
arg = kernel.get_var_descriptor(arg_name)
assert arg.dtype is not None
if isinstance(arg, ValueArg):
struct_pack_types.append(arg.dtype.numpy_dtype.char)
struct_pack_args.append(arg_name)
elif isinstance(arg, (ArrayArg, ConstantArg, TemporaryVariable)):
struct_pack_types.append("P")
struct_pack_args.append(f"{arg_name}.svm_ptr")
elif isinstance(arg, ImageArg):
raise AssertionError()
else:
raise ValueError(f"unrecognized arg type: '{type(arg)}'")
cl_arg_count = len(regular_arg_names)
overflow_args_code = Suite([
# It's important for _lpy_overflow_args_buf to be in a variable.
# Otherwise, no reference to it will survive until the kernel
# launch and the buffer may be released.
Assign("_lpy_overflow_args_buf",
"_lpy_cl.Buffer(queue.context, "
"_lpy_cl.mem_flags.READ_ONLY "
"| _lpy_cl.mem_flags.COPY_HOST_PTR, "
"hostbuf="
f"_lpy_pack({''.join(struct_pack_types)!r}, "
f"{', '.join(struct_pack_args)}))"),
Line(f"_lpy_knl.set_arg({cl_arg_count}, _lpy_overflow_args_buf)")
])
cl_arg_count += 1
value_arg_code, arg_idx_to_cl_arg_idx, cl_arg_count = \ else:
generate_value_arg_setup( cl_arg_count = len(skai.passed_names)
codegen_state.kernel, overflow_args_code = Suite([])
[self.target.device],
all_args)
arry_arg_code = generate_array_arg_setup(
codegen_state.kernel,
all_args,
arg_idx_to_cl_arg_idx)
from genpy import Suite, Assign, Assert, Line, Comment import pyopencl.version as cl_ver
from pymbolic.mapper.stringifier import PREC_NONE from pymbolic.mapper.stringifier import PREC_NONE
if cl_ver.VERSION < (2020, 2):
from warnings import warn
warn("Your kernel invocation will likely fail because your "
"version of PyOpenCL does not support allow_empty_ndrange. "
"Please upgrade to version 2020.2 or newer.", stacklevel=2)
# TODO: Generate finer-grained dependency structure # TODO: Generate finer-grained dependency structure
return Suite([ return Suite([
Comment("{{{ enqueue %s" % name), Comment("{{{ enqueue %s" % subkernel_name),
Line(), Line(),
Assign("_lpy_knl", "_lpy_cl_kernels."+name), Assign("_lpy_knl", "_lpy_cl_kernels."+subkernel_name),
Assert("_lpy_knl.num_args == %d" % cl_arg_count), Assert(f"_lpy_knl.num_args == {cl_arg_count}, "
f"f'Kernel \"{subkernel_name}\" "
f"invoker argument count ({cl_arg_count}) does not match the "
# No f"" here since {_lpy_knl.num_args} needs to be evaluated
# at runtime, not here.
"argument count of the kernel ({_lpy_knl.num_args}).'"),
Line(), Line(),
value_arg_code, value_arg_code,
arry_arg_code, array_arg_code,
Assign("_lpy_evt", "%(pyopencl_module_name)s.enqueue_nd_range_kernel(" overflow_args_code,
"queue, _lpy_knl, " Assign("_lpy_evt",
"%(gsize)s, %(lsize)s, wait_for=wait_for, g_times_l=True)" f"{self.target.pyopencl_module_name}.enqueue_nd_range_kernel("
% dict( "queue, _lpy_knl, "
pyopencl_module_name=self.target.pyopencl_module_name, f"{ecm(gsize, prec=PREC_NONE, type_context='i')}, "
gsize=ecm(gsize, prec=PREC_NONE, type_context="i"), f"{ecm(lsize, prec=PREC_NONE, type_context='i')}, "
lsize=ecm(lsize, prec=PREC_NONE, type_context="i"))), # using positional args because pybind is slow with kwargs
"None, " # offset
"wait_for, "
"True, " # g_times_l
"True, " # allow_empty_ndrange
")"),
Assign("wait_for", "[_lpy_evt]"), Assign("wait_for", "[_lpy_evt]"),
Line(), Line(),
Comment("}}}"), Comment("}}}"),
...@@ -724,32 +978,258 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase): ...@@ -724,32 +978,258 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase):
# }}} # }}}
# {{{ split_args_for_overflow
def split_args_for_overflow(
kernel: LoopKernel, passed_names: Sequence[str],
*, limit_arg_size_nbytes: int | None, pointer_size_nbytes: int
) -> tuple[Sequence[str], Sequence[str]]:
if limit_arg_size_nbytes is None:
return passed_names, []
regular_arg_names = []
overflow_arg_names = []
# Consider that the pointer to the arg overflow struct also occupies
# argument space.
running_arg_size = pointer_size_nbytes
for arg_name in passed_names:
arg = kernel.get_var_descriptor(arg_name)
if isinstance(arg, (ValueArg, ArrayArg, ConstantArg, TemporaryVariable)):
if isinstance(arg, ValueArg):
assert arg.dtype is not None
arg_size = arg.dtype.numpy_dtype.itemsize
else:
arg_size = pointer_size_nbytes
if running_arg_size + arg_size > limit_arg_size_nbytes:
overflow_arg_names.append(arg_name)
else:
regular_arg_names.append(arg_name)
running_arg_size += arg_size
elif isinstance(arg, ImageArg):
regular_arg_names.append(arg_name)
else:
raise ValueError(f"unrecognized arg type: '{type(arg)}'")
return regular_arg_names, overflow_arg_names
# }}}
# {{{ device ast builder # {{{ device ast builder
class PyOpenCLCASTBuilder(OpenCLCASTBuilder): class PyOpenCLCASTBuilder(OpenCLCASTBuilder):
"""A C device AST builder for integration with PyOpenCL. """A C device AST builder for integration with PyOpenCL.
""" """
# {{{ function decl/def, with arg overflow handling
def get_function_definition(
self,
codegen_state: CodeGenerationState,
codegen_result: CodeGenerationResult,
schedule_index: int,
function_decl: Generable,
function_body: Generable,
) -> Generable:
assert isinstance(function_body, Block)
kernel = codegen_state.kernel
assert kernel.linearization is not None
subkernel_name = cast("CallKernel",
kernel.linearization[schedule_index]).kernel_name
result = []
from loopy.kernel.data import AddressSpace
# We only need to write declarations for global variables with
# the first device program. `is_first_dev_prog` determines
# whether this is the first device program in the schedule.
is_first_dev_prog = codegen_state.is_generating_device_code
for i in range(schedule_index):
if isinstance(kernel.linearization[i], CallKernel):
is_first_dev_prog = False
break
if is_first_dev_prog:
for tv in sorted(
kernel.temporary_variables.values(),
key=lambda key_tv: key_tv.name):
if tv.address_space == AddressSpace.GLOBAL and (
tv.initializer is not None):
assert tv.read_only
decl = self.wrap_global_constant(
self.get_temporary_var_declarator(codegen_state, tv))
if tv.initializer is not None:
from loopy.target.c import generate_array_literal
init_decl = Initializer(decl, generate_array_literal(
codegen_state, tv, tv.initializer))
else:
init_decl = decl
result.append(init_decl)
# {{{ unpack overflow args
if codegen_state.is_entrypoint:
from loopy.schedule.tools import get_subkernel_arg_info
skai = get_subkernel_arg_info(kernel, subkernel_name)
_, struct_overflow_arg_names = split_args_for_overflow(
kernel, skai.passed_names,
limit_arg_size_nbytes=self.target.limit_arg_size_nbytes,
pointer_size_nbytes=self.target.pointer_size_nbytes)
arg_unpack_code = [
Initializer(
self.arg_to_cgen_declarator(
kernel, arg_name,
is_written=arg_name in skai.written_names),
f"_lpy_overflow_args->{arg_name}")
for arg_name in struct_overflow_arg_names
] + ([Line()] if struct_overflow_arg_names else [])
function_body = Block(arg_unpack_code + function_body.contents)
# }}}
from loopy.target.c import FunctionDeclarationWrapper
assert isinstance(function_decl, FunctionDeclarationWrapper)
if not isinstance(function_body, Block):
function_body = Block([function_body])
fbody = FunctionBody(function_decl, function_body)
if not result:
return fbody
else:
return Collection([*result, Line(), fbody])
def get_function_declaration(
self, codegen_state: CodeGenerationState,
codegen_result: CodeGenerationResult, schedule_index: int
) -> tuple[Sequence[tuple[str, str]], Generable]:
kernel = codegen_state.kernel
assert codegen_state.kernel.linearization is not None
subkernel_name = cast(
"CallKernel",
codegen_state.kernel.linearization[schedule_index]
).kernel_name
from cgen import FunctionDeclaration, Struct, Value
name_str = codegen_result.current_program(codegen_state).name
if self.target.fortran_abi:
name_str += "_"
from loopy.target.c import FunctionDeclarationWrapper
if codegen_state.is_entrypoint:
name = Value("void", name_str)
# subkernel launches occur only as part of entrypoint kernels for now
from loopy.schedule.tools import get_subkernel_arg_info
skai = get_subkernel_arg_info(kernel, subkernel_name)
passed_names = skai.passed_names
written_names = skai.written_names
regular_arg_names, struct_overflow_arg_names = split_args_for_overflow(
kernel, passed_names,
limit_arg_size_nbytes=self.target.limit_arg_size_nbytes,
pointer_size_nbytes=self.target.pointer_size_nbytes)
arg_overflow_struct_name = f"_lpy_arg_struct_{subkernel_name}"
arg_overflow_struct = Struct(
arg_overflow_struct_name, [
self.arg_to_cgen_declarator(
kernel, arg_name,
is_written=arg_name in written_names)
for arg_name in struct_overflow_arg_names])
if struct_overflow_arg_names:
logger.info(f"overflowing arguments into SVM buffer: "
f"{len(regular_arg_names)} regular/"
f"{len(struct_overflow_arg_names)} in buffer "
f"for '{subkernel_name}'")
arg_struct_preambles = [
(f"declare-{arg_overflow_struct_name}",
str(arg_overflow_struct))
] if struct_overflow_arg_names else []
arg_struct_args: list[Declarator] = [CLGlobal(Const(Pointer(Value(
f"struct {arg_overflow_struct_name}",
"_lpy_overflow_args"))))]
else:
arg_struct_preambles = []
arg_struct_args = []
return arg_struct_preambles, FunctionDeclarationWrapper(
self._wrap_kernel_decl(
codegen_state, schedule_index,
FunctionDeclaration(
name,
[self.arg_to_cgen_declarator(
kernel, arg_name,
is_written=arg_name in written_names)
for arg_name in regular_arg_names]
+ arg_struct_args
)))
else:
name = Value("static void", name_str)
passed_names = [arg.name for arg in kernel.args]
written_names = kernel.get_written_variables()
return [], FunctionDeclarationWrapper(
FunctionDeclaration(
name,
[self.arg_to_cgen_declarator(
kernel, arg_name,
is_written=arg_name in written_names)
for arg_name in passed_names]))
# }}}
# {{{ library # {{{ library
def function_manglers(self): @property
from loopy.library.random123 import random123_function_mangler def known_callables(self):
return ( from loopy.library.random123 import get_random123_callables
super(PyOpenCLCASTBuilder, self).function_manglers() + [
pyopencl_function_mangler, # order matters: e.g. prefer our abs() over that of the
random123_function_mangler # superclass
]) callables = super().known_callables
callables.update(get_pyopencl_callables())
callables.update(get_random123_callables(self.target))
return callables
def preamble_generators(self): def preamble_generators(self):
from loopy.library.random123 import random123_preamble_generator return ([pyopencl_preamble_generator, *super().preamble_generators()])
return ([
pyopencl_preamble_generator,
random123_preamble_generator,
] + super(PyOpenCLCASTBuilder, self).preamble_generators())
# }}} # }}}
def get_expression_to_c_expression_mapper(self, codegen_state):
return ExpressionToPyOpenCLCExpressionMapper(codegen_state)
# }}} # }}}
# {{{ volatile mem access target
class VolatileMemPyOpenCLCASTBuilder(PyOpenCLCASTBuilder):
def get_expression_to_c_expression_mapper(self, codegen_state):
from loopy.target.opencl import VolatileMemExpressionToOpenCLCExpressionMapper
return VolatileMemExpressionToOpenCLCExpressionMapper(codegen_state)
class VolatileMemPyOpenCLTarget(PyOpenCLTarget):
def get_device_ast_builder(self):
return VolatileMemPyOpenCLCASTBuilder(self)
# }}}
# vim: foldmethod=marker # vim: foldmethod=marker
from __future__ import division, with_statement, absolute_import from __future__ import annotations
__copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
...@@ -22,604 +23,272 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ...@@ -22,604 +23,272 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. THE SOFTWARE.
""" """
import six
from six.moves import range, zip
import numpy as np
from pytools import ImmutableRecord, memoize_method
from loopy.diagnostic import ParameterFinderWarning
from pytools.py_codegen import (
Indentation, PythonFunctionGenerator)
from loopy.diagnostic import LoopyError
from loopy.types import NumpyType
from loopy.execution import KernelExecutorBase
import logging import logging
logger = logging.getLogger(__name__) from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Callable, Sequence
# {{{ invoker generation
# /!\ This code runs in a namespace controlled by the user.
# Prefix all auxiliary variables with "_lpy".
def python_dtype_str(dtype):
import pyopencl.tools as cl_tools
if dtype.isbuiltin:
return "_lpy_np."+dtype.name
else:
return ("_lpy_cl_tools.get_or_register_dtype(\"%s\")"
% cl_tools.dtype_to_ctype(dtype))
# {{{ integer arg finding from shapes
def generate_integer_arg_finding_from_shapes(gen, kernel, implemented_data_info):
# a mapping from integer argument names to a list of tuples
# (arg_name, expression), where expression is a
# unary function of kernel.arg_dict[arg_name]
# returning the desired integer argument.
iarg_to_sources = {}
from loopy.kernel.data import GlobalArg
from loopy.symbolic import DependencyMapper, StringifyMapper
dep_map = DependencyMapper()
from pymbolic import var
for arg in implemented_data_info:
if arg.arg_class is GlobalArg:
sym_shape = var(arg.name).attr("shape")
for axis_nr, shape_i in enumerate(arg.shape):
if shape_i is None:
continue
deps = dep_map(shape_i)
if len(deps) == 1:
integer_arg_var, = deps
if kernel.arg_dict[integer_arg_var.name].dtype.is_integral():
from pymbolic.algorithm import solve_affine_equations_for
try:
# friggin' overkill :)
iarg_expr = solve_affine_equations_for(
[integer_arg_var.name],
[(shape_i, sym_shape.index(axis_nr))]
)[integer_arg_var]
except Exception as e:
#from traceback import print_exc
#print_exc()
# went wrong? oh well
from warnings import warn
warn("Unable to generate code to automatically "
"find '%s' from the shape of '%s':\n%s"
% (integer_arg_var.name, arg.name, str(e)),
ParameterFinderWarning)
else:
iarg_to_sources.setdefault(integer_arg_var.name, []) \
.append((arg.name, iarg_expr))
gen("# {{{ find integer arguments from shapes")
gen("")
for iarg_name, sources in six.iteritems(iarg_to_sources):
gen("if %s is None:" % iarg_name)
with Indentation(gen):
if_stmt = "if"
for arg_name, value_expr in sources:
gen("%s %s is not None:" % (if_stmt, arg_name))
with Indentation(gen):
gen("%s = %s"
% (iarg_name, StringifyMapper()(value_expr)))
if_stmt = "elif"
gen("")
gen("# }}}")
gen("")
# }}}
# {{{ integer arg finding from offsets import numpy as np
def generate_integer_arg_finding_from_offsets(gen, kernel, implemented_data_info):
    """Emit wrapper code that derives unset offset arguments from arrays.

    For each entry of *implemented_data_info* whose ``offset_for_name`` is
    not ``None``, the generated code sets the offset argument (when it was
    passed as ``None``) to zero if the array itself is ``None``, and
    otherwise to the array's byte offset divided by the dtype item size.
    Unless ``kernel.options.skip_arg_checks`` is set, the generated code
    also asserts that the division leaves no remainder.

    :arg gen: the code generator; called with one line of code at a time.
    :arg kernel: provides ``options`` and ``impl_arg_to_arg``.
    :arg implemented_data_info: iterable of entries with ``name`` and
        ``offset_for_name`` attributes.
    """
    opts = kernel.options

    gen("# {{{ find integer arguments from offsets")
    gen("")

    for arg in implemented_data_info:
        array_name = arg.offset_for_name
        if array_name is None:
            continue

        gen("if %s is None:" % arg.name)
        with Indentation(gen):
            gen("if %s is None:" % array_name)
            with Indentation(gen):
                gen("# Output variable, we'll be allocating "
                        "it, with zero offset.")
                gen("%s = 0" % arg.name)
            gen("else:")
            with Indentation(gen):
                if opts.no_numpy:
                    gen("_lpy_offset = %s.offset" % array_name)
                else:
                    gen("_lpy_offset = getattr(%s, \"offset\", 0)"
                            % array_name)

                itemsize = kernel.impl_arg_to_arg[array_name].dtype.itemsize

                if opts.skip_arg_checks:
                    gen("%s = _lpy_offset // %d" % (arg.name, itemsize))
                else:
                    gen("%s, _lpy_remdr = divmod(_lpy_offset, %d)"
                            % (arg.name, itemsize))
                    gen("assert _lpy_remdr == 0, \"Offset of array '%s' is "
                            "not divisible by its dtype itemsize\""
                            % array_name)
                    gen("del _lpy_remdr")
                    gen("del _lpy_offset")

    gen("# }}}")
    gen("")
# }}}
# {{{ integer arg finding from strides
def generate_integer_arg_finding_from_strides(gen, kernel, implemented_data_info): from pytools import memoize_method
options = kernel.options from pytools.codegen import CodeGenerator, Indentation
gen("# {{{ find integer arguments from strides") from loopy.kernel.data import ArrayArg
gen("") from loopy.target.execution import ExecutionWrapperGeneratorBase, ExecutorBase
from loopy.typing import Expression, integer_expr_or_err
for arg in implemented_data_info:
if arg.stride_for_name_and_axis is not None:
impl_array_name, stride_impl_axis = arg.stride_for_name_and_axis
gen("if %s is None:" % arg.name) logger = logging.getLogger(__name__)
with Indentation(gen):
if not options.skip_arg_checks:
gen("if %s is None:" % impl_array_name)
with Indentation(gen):
gen("raise RuntimeError(\"required stride '%s' for "
"argument '%s' not given or deducible from "
"passed array\")"
% (arg.name, impl_array_name))
base_arg = kernel.impl_arg_to_arg[impl_array_name]
if not options.skip_arg_checks: if TYPE_CHECKING:
gen("%s, _lpy_remdr = divmod(%s.strides[%d], %d)" from constantdict import constantdict
% (arg.name, impl_array_name, stride_impl_axis,
base_arg.dtype.dtype.itemsize))
gen("assert _lpy_remdr == 0, \"Stride %d of array '%s' is " import pyopencl as cl
"not divisible by its dtype itemsize\""
% (stride_impl_axis, impl_array_name))
gen("del _lpy_remdr")
else:
gen("%s = _lpy_offset // %d"
% (arg.name, base_arg.dtype.itemsize))
gen("# }}}") from loopy.codegen.result import CodeGenerationResult
gen("") from loopy.kernel import LoopKernel
from loopy.schedule.tools import KernelArgInfo
from loopy.types import LoopyType
# }}}
# {{{ invoker generation
# {{{ check that value args are present # /!\ This code runs in a namespace controlled by the user.
# Prefix all auxiliary variables with "_lpy".
def generate_value_arg_check(gen, kernel, implemented_data_info):
if kernel.options.skip_arg_checks:
return
from loopy.kernel.data import ValueArg class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
"""
Specialized form of the :class:`ExecutionWrapperGeneratorBase` for
pyopencl execution
"""
gen("# {{{ check that value args are present") def __init__(self) -> None:
gen("") system_args = [
"_lpy_cl_kernels", "queue", "allocator=None", "wait_for=None",
# ignored if options.no_numpy
"out_host=None"
]
super().__init__(system_args)
def python_dtype_str_inner(self, dtype: np.dtype) -> str:
    """Return Python source text that evaluates to *dtype* in the wrapper.

    Boolean and numeric numpy types are spelled via ``_lpy_np``; every
    other dtype is looked up by its C type name via
    ``_lpy_cl_tools.get_or_register_dtype``.
    """
    import pyopencl.tools as cl_tools

    # Test for types built into numpy. dtype.isbuiltin does not work:
    # https://github.com/numpy/numpy/issues/4317
    # Guided by https://numpy.org/doc/stable/reference/arrays.scalars.html
    if not issubclass(dtype.type, (np.bool_, np.number)):
        ctype_name = cl_tools.dtype_to_ctype(dtype)
        return '_lpy_cl_tools.get_or_register_dtype("%s")' % ctype_name

    # The boolean type is emitted under the attribute name "bool_"
    # rather than dtype.name.
    np_attr = "bool_" if dtype.type == np.bool_ else dtype.name
    return f"_lpy_np.dtype(_lpy_np.{np_attr})"
for arg in implemented_data_info: # {{{ handle non-numpy args
if not issubclass(arg.arg_class, ValueArg):
continue
gen("if %s is None:" % arg.name) def handle_non_numpy_arg(self, gen: CodeGenerator, arg: ArrayArg) -> None:
gen("if isinstance(%s, _lpy_np.ndarray):" % arg.name)
with Indentation(gen): with Indentation(gen):
gen("raise TypeError(\"value argument '%s' " gen("# retain originally passed array")
"was not given and could not be automatically " gen(f"_lpy_{arg.name}_np_input = {arg.name}")
"determined\")" % arg.name) gen("# synchronous, nothing to worry about")
gen("%s = _lpy_cl_array.to_device("
gen("# }}}") "queue, %s, allocator=allocator)"
gen("") % (arg.name, arg.name))
gen("_lpy_encountered_numpy = True")
# }}} gen("elif %s is not None:" % arg.name)
with Indentation(gen):
gen("_lpy_encountered_dev = True")
# {{{ arg setup gen("_lpy_%s_np_input = None" % arg.name)
gen("else:")
def generate_arg_setup(gen, kernel, implemented_data_info, options): with Indentation(gen):
import loopy as lp gen("_lpy_%s_np_input = None" % arg.name)
from loopy.kernel.data import KernelArgument
from loopy.kernel.array import ArrayBase
from loopy.symbolic import StringifyMapper
from pymbolic import var
gen("# {{{ set up array arguments")
gen("")
if not options.no_numpy:
gen("_lpy_encountered_numpy = False")
gen("_lpy_encountered_dev = False")
gen("") gen("")
args = [] # }}}
strify = StringifyMapper()
expect_no_more_arguments = False
for arg_idx, arg in enumerate(implemented_data_info):
is_written = arg.base_name in kernel.get_written_variables()
kernel_arg = kernel.impl_arg_to_arg.get(arg.name)
if not issubclass(arg.arg_class, KernelArgument):
expect_no_more_arguments = True
continue
if expect_no_more_arguments:
raise LoopyError("Further arguments encountered after arg info "
"describing a global temporary variable")
if not issubclass(arg.arg_class, ArrayBase): # {{{ handle allocation of unspecified arguments
args.append(arg.name)
continue
gen("# {{{ process %s" % arg.name) def handle_alloc(
self, gen: CodeGenerator, arg: ArrayArg,
strify: Callable[[Expression], str],
skip_arg_checks: bool) -> None:
"""
Handle allocation of non-specified arguments for pyopencl execution
"""
from pymbolic import var
from loopy.kernel.array import get_strides
strides = get_strides(arg)
num_axes = len(strides)
assert arg.dtype is not None
itemsize = arg.dtype.numpy_dtype.itemsize
for i in range(num_axes):
gen("_lpy_ustrides_%d = %s" % (i, strify(strides[i])))
if not skip_arg_checks:
for i in range(num_axes):
gen("assert _lpy_ustrides_%d >= 0, "
"\"'%s' has negative stride in axis %d\""
% (i, arg.name, i))
assert isinstance(arg.shape, tuple)
sym_ustrides = tuple(
var("_lpy_ustrides_%d" % i)
for i in range(num_axes))
sym_shape = tuple(arg.shape[i] for i in range(num_axes))
size_expr = 1 + sum(
integer_expr_or_err(astrd)*(integer_expr_or_err(alen)-1)
for alen, astrd in zip(sym_shape, sym_ustrides)
)
gen("_lpy_size = %s" % strify(size_expr))
sym_strides = tuple(itemsize*s_i for s_i in sym_ustrides)
dtype_name = self.python_dtype_str(gen, arg.dtype.numpy_dtype)
gen(f"{arg.name} = _lpy_cl_array.Array(None, {strify(sym_shape)}, "
f"{dtype_name}, strides={strify(sym_strides)}, "
f"data=allocator({strify(itemsize * var('_lpy_size'))}), "
"allocator=allocator, "
"_fast=True, _size=_lpy_size, "
"_context=queue.context, _queue=queue)")
for i in range(num_axes):
gen("del _lpy_ustrides_%d" % i)
gen("del _lpy_size")
gen("") gen("")
if not options.no_numpy: # }}}
gen("if isinstance(%s, _lpy_np.ndarray):" % arg.name)
with Indentation(gen):
gen("# synchronous, nothing to worry about")
gen("%s = _lpy_cl_array.to_device("
"queue, %s, allocator=allocator)"
% (arg.name, arg.name))
gen("_lpy_encountered_numpy = True")
gen("elif %s is not None:" % arg.name)
with Indentation(gen):
gen("_lpy_encountered_dev = True")
gen("")
if not options.skip_arg_checks and not is_written:
gen("if %s is None:" % arg.name)
with Indentation(gen):
gen("raise RuntimeError(\"input argument '%s' must "
"be supplied\")" % arg.name)
gen("")
if (is_written
and arg.arg_class is lp.ImageArg
and not options.skip_arg_checks):
gen("if %s is None:" % arg.name)
with Indentation(gen):
gen("raise RuntimeError(\"written image '%s' must "
"be supplied\")" % arg.name)
gen("")
if is_written and arg.shape is None and not options.skip_arg_checks:
gen("if %s is None:" % arg.name)
with Indentation(gen):
gen("raise RuntimeError(\"written argument '%s' has "
"unknown shape and must be supplied\")" % arg.name)
gen("")
possibly_made_by_loopy = False
# {{{ allocate written arrays, if needed def target_specific_preamble(self, gen: CodeGenerator) -> None:
"""
Add default pyopencl imports to preamble
"""
gen.add_to_preamble("import numpy as _lpy_np")
gen.add_to_preamble("import pyopencl as _lpy_cl")
gen.add_to_preamble("import pyopencl.array as _lpy_cl_array")
gen.add_to_preamble("import pyopencl.tools as _lpy_cl_tools")
gen.add_to_preamble("from struct import pack as _lpy_pack")
from loopy.target.c.c_execution import DEF_EVEN_DIV_FUNCTION
gen.add_to_preamble(DEF_EVEN_DIV_FUNCTION)
def initialize_system_args(self, gen: CodeGenerator) -> None:
    """
    Emit wrapper code that fills in a default for the ``allocator``
    system argument when the caller passed ``None``.
    """
    default_allocator = "_lpy_cl_tools.DeferredAllocator(queue.context)"

    gen("if allocator is None:")
    with Indentation(gen):
        gen("allocator = " + default_allocator)
    gen("")
if is_written and arg.arg_class in [lp.GlobalArg, lp.ConstantArg] \ # {{{ generate invocation
and arg.shape is not None:
if not isinstance(arg.dtype, NumpyType): def generate_invocation(self, gen: CodeGenerator, kernel: LoopKernel,
raise LoopyError("do not know how to pass arg of type '%s'" kai: KernelArgInfo, host_program_name: str, args: Sequence[str]) -> None:
% arg.dtype) if kernel.options.cl_exec_manage_array_events:
gen("""
if wait_for is None:
wait_for = []
""")
possibly_made_by_loopy = True
gen("_lpy_made_by_loopy = False")
gen("") gen("")
for arg_name in kai.passed_arg_names:
arg = kernel.arg_dict[arg_name]
if isinstance(arg, ArrayArg):
gen(
"wait_for.extend({arg_name}.events)"
.format(arg_name=arg.name))
gen("if %s is None:" % arg.name)
with Indentation(gen):
num_axes = len(arg.strides)
for i in range(num_axes):
gen("_lpy_shape_%d = %s" % (i, strify(arg.unvec_shape[i])))
itemsize = kernel_arg.dtype.numpy_dtype.itemsize
for i in range(num_axes):
gen("_lpy_strides_%d = %s" % (i, strify(
itemsize*arg.unvec_strides[i])))
if not options.skip_arg_checks:
for i in range(num_axes):
gen("assert _lpy_strides_%d > 0, "
"\"'%s' has negative stride in axis %d\""
% (i, arg.name, i))
sym_strides = tuple(
var("_lpy_strides_%d" % i)
for i in range(num_axes))
sym_shape = tuple(
var("_lpy_shape_%d" % i)
for i in range(num_axes))
alloc_size_expr = (sum(astrd*(alen-1)
for alen, astrd in zip(sym_shape, sym_strides))
+ itemsize)
gen("_lpy_alloc_size = %s" % strify(alloc_size_expr))
gen("%(name)s = _lpy_cl_array.Array(queue, %(shape)s, "
"%(dtype)s, strides=%(strides)s, "
"data=allocator(_lpy_alloc_size), allocator=allocator)"
% dict(
name=arg.name,
shape=strify(sym_shape),
strides=strify(sym_strides),
dtype=python_dtype_str(kernel_arg.dtype.numpy_dtype)))
if not options.skip_arg_checks:
for i in range(num_axes):
gen("del _lpy_shape_%d" % i)
gen("del _lpy_strides_%d" % i)
gen("del _lpy_alloc_size")
gen("")
gen("_lpy_made_by_loopy = True")
gen("")
# }}}
# {{{ argument checking
if arg.arg_class in [lp.GlobalArg, lp.ConstantArg] \
and not options.skip_arg_checks:
if possibly_made_by_loopy:
gen("if not _lpy_made_by_loopy:")
else:
gen("if True:")
with Indentation(gen):
gen("if %s.dtype != %s:"
% (arg.name, python_dtype_str(kernel_arg.dtype.numpy_dtype)))
with Indentation(gen):
gen("raise TypeError(\"dtype mismatch on argument '%s' "
"(got: %%s, expected: %s)\" %% %s.dtype)"
% (arg.name, arg.dtype, arg.name))
# {{{ generate shape checking code
def strify_allowing_none(shape_axis):
    """Stringify *shape_axis* with ``strify``, mapping ``None`` to ``"None"``."""
    return "None" if shape_axis is None else strify(shape_axis)
def strify_tuple(t):
    """Return source text for a tuple whose entries are the stringified
    items of *t*; an empty *t* gives ``"()"``.
    """
    if len(t) != 0:
        entries = [strify_allowing_none(sa) for sa in t]
        return "(%s,)" % ", ".join(entries)

    return "()"
shape_mismatch_msg = (
"raise TypeError(\"shape mismatch on argument '%s' "
"(got: %%s, expected: %%s)\" "
"%% (%s.shape, %s))"
% (arg.name, arg.name, strify_tuple(arg.unvec_shape)))
if kernel_arg.shape is None:
pass
elif any(shape_axis is None for shape_axis in kernel_arg.shape):
gen("if len(%s.shape) != %s:"
% (arg.name, len(arg.unvec_shape)))
with Indentation(gen):
gen(shape_mismatch_msg)
for i, shape_axis in enumerate(arg.unvec_shape):
if shape_axis is None:
continue
gen("if %s.shape[%d] != %s:"
% (arg.name, i, strify(shape_axis)))
with Indentation(gen):
gen(shape_mismatch_msg)
else: # not None, no Nones in tuple
gen("if %s.shape != %s:"
% (arg.name, strify(arg.unvec_shape)))
with Indentation(gen):
gen(shape_mismatch_msg)
# }}}
if arg.unvec_strides and kernel_arg.dim_tags:
itemsize = kernel_arg.dtype.numpy_dtype.itemsize
sym_strides = tuple(
itemsize*s_i for s_i in arg.unvec_strides)
gen("if %s.strides != %s:"
% (arg.name, strify(sym_strides)))
with Indentation(gen):
gen("raise TypeError(\"strides mismatch on "
"argument '%s' (got: %%s, expected: %%s)\" "
"%% (%s.strides, %s))"
% (arg.name, arg.name, strify(sym_strides)))
if not arg.allows_offset:
gen("if %s.offset:" % arg.name)
with Indentation(gen):
gen("raise ValueError(\"Argument '%s' does not "
"allow arrays with offsets. Try passing "
"default_offset=loopy.auto to make_kernel()."
"\")" % arg.name)
gen("")
# }}}
if possibly_made_by_loopy and not options.skip_arg_checks:
gen("del _lpy_made_by_loopy")
gen("") gen("")
if arg.arg_class in [lp.GlobalArg, lp.ConstantArg]: arg_list = (["_lpy_cl_kernels", "queue", *args,
args.append("%s.base_data" % arg.name) "wait_for=wait_for", "allocator=allocator"])
else: gen(f"_lpy_evt = {host_program_name}({', '.join(arg_list)})")
args.append("%s" % arg.name)
gen("")
gen("# }}}")
gen("")
gen("# }}}")
gen("")
return args
# }}}
def generate_invoker(kernel, codegen_result):
options = kernel.options
implemented_data_info = codegen_result.implemented_data_info
host_code = codegen_result.host_code()
system_args = [
"_lpy_cl_kernels", "queue", "allocator=None", "wait_for=None",
# ignored if options.no_numpy
"out_host=None"
]
from loopy.kernel.data import KernelArgument
gen = PythonFunctionGenerator(
"invoke_%s_loopy_kernel" % kernel.name,
system_args + [
"%s=None" % idi.name
for idi in implemented_data_info
if issubclass(idi.arg_class, KernelArgument)
])
gen.add_to_preamble("from __future__ import division")
gen.add_to_preamble("")
gen.add_to_preamble("import pyopencl as _lpy_cl")
gen.add_to_preamble("import pyopencl.array as _lpy_cl_array")
gen.add_to_preamble("import pyopencl.tools as _lpy_cl_tools")
gen.add_to_preamble("import numpy as _lpy_np")
gen.add_to_preamble("")
gen.add_to_preamble(host_code)
gen.add_to_preamble("")
gen("if allocator is None:")
with Indentation(gen):
gen("allocator = _lpy_cl_tools.DeferredAllocator(queue.context)")
gen("")
generate_integer_arg_finding_from_shapes(gen, kernel, implemented_data_info)
generate_integer_arg_finding_from_offsets(gen, kernel, implemented_data_info)
generate_integer_arg_finding_from_strides(gen, kernel, implemented_data_info)
generate_value_arg_check(gen, kernel, implemented_data_info)
args = generate_arg_setup(gen, kernel, implemented_data_info, options)
# {{{ generate invocation
gen("_lpy_evt = {kernel_name}({args})" if kernel.options.cl_exec_manage_array_events:
.format( gen("")
kernel_name=codegen_result.host_program.name, for arg_name in kai.passed_arg_names:
args=", ".join( arg = kernel.arg_dict[arg_name]
["_lpy_cl_kernels", "queue"] if (isinstance(arg, ArrayArg)
+ args and arg.name in kernel.get_written_variables()):
+ ["wait_for=wait_for"]))) gen(f"{arg.name}.add_event(_lpy_evt)")
# }}} # }}}
# {{{ output # {{{ generate_output_handler
if not options.no_numpy: def generate_output_handler(self, gen: CodeGenerator,
gen("if out_host is None and (_lpy_encountered_numpy " kernel: LoopKernel, kai: KernelArgInfo) -> None:
"and not _lpy_encountered_dev):") options = kernel.options
with Indentation(gen):
gen("out_host = True")
gen("if out_host:") if not options.no_numpy:
with Indentation(gen): gen("if out_host is None and (_lpy_encountered_numpy "
gen("pass") # if no outputs (?!) "and not _lpy_encountered_dev):")
for arg in implemented_data_info: with Indentation(gen):
if not issubclass(arg.arg_class, KernelArgument): gen("out_host = True")
continue
is_written = arg.base_name in kernel.get_written_variables() for arg_name in kai.passed_arg_names:
if is_written: arg = kernel.arg_dict[arg_name]
gen("%s = %s.get(queue=queue)" % (arg.name, arg.name)) if arg.is_output:
np_name = "_lpy_%s_np_input" % arg.name
gen("if out_host or %s is not None:" % np_name)
with Indentation(gen):
gen("%s = %s.get(queue=queue, ary=%s)"
% (arg.name, arg.name, np_name))
gen("") gen("")
if options.return_dict: if options.return_dict:
gen("return _lpy_evt, {%s}" gen("return _lpy_evt, {%s}"
% ", ".join("\"%s\": %s" % (arg.name, arg.name) % ", ".join(f'"{arg_name}": {arg_name}'
for arg in implemented_data_info for arg_name in kai.passed_arg_names
if issubclass(arg.arg_class, KernelArgument) if kernel.arg_dict[arg_name].is_output))
if arg.base_name in kernel.get_written_variables()))
else:
out_args = [arg
for arg in implemented_data_info
if issubclass(arg.arg_class, KernelArgument)
if arg.base_name in kernel.get_written_variables()]
if out_args:
gen("return _lpy_evt, (%s,)"
% ", ".join(arg.name for arg in out_args))
else: else:
gen("return _lpy_evt, ()") passed_arg_names_set = frozenset(kai.passed_arg_names)
out_names = [
# Must ensure that these occur in the same order as in
# kernel.args.
arg.name
for arg in kernel.args
if arg.name in passed_arg_names_set
if arg.is_output]
if out_names:
gen("return _lpy_evt, (%s,)"
% ", ".join(out_names))
else:
gen("return _lpy_evt, ()")
# }}} # }}}
if options.write_wrapper: def generate_host_code(
output = gen.get() self, gen: CodeGenerator, codegen_result: CodeGenerationResult
if options.highlight_wrapper: ) -> None:
output = get_highlighted_python_code(output) gen.add_to_preamble(codegen_result.host_code())
if options.write_wrapper is True:
print(output)
else:
with open(options.write_wrapper, "w") as outf:
outf.write(output)
return gen.get_function()
def get_arg_pass(self, arg):
    """Return the expression used to pass *arg* to the host program:
    the ``base_data`` attribute of the variable named ``arg.name``.
    """
    return f"{arg.name}.base_data"
# }}} # }}}
# {{{ kernel executor @dataclass(frozen=True)
class _KernelInfo:
class _CLKernelInfo(ImmutableRecord): cl_kernels: _Kernels
pass invoker: Callable[..., Any]
class _CLKernels(object): class _Kernels:
pass pass
class PyOpenCLKernelExecutor(KernelExecutorBase): # {{{ kernel executor
class PyOpenCLExecutor(ExecutorBase):
"""An object connecting a kernel to a :class:`pyopencl.Context` """An object connecting a kernel to a :class:`pyopencl.Context`
for execution. for execution.
...@@ -627,94 +296,71 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): ...@@ -627,94 +296,71 @@ class PyOpenCLKernelExecutor(KernelExecutorBase):
.. automethod:: __call__ .. automethod:: __call__
""" """
def __init__(self, context, kernel): def __init__(self, context: cl.Context, t_unit, entrypoint):
""" super().__init__(t_unit, entrypoint)
:arg context: a :class:`pyopencl.Context`
:arg kernel: may be a loopy.LoopKernel, a generator returning kernels
(a warning will be issued if more than one is returned). If the
kernel has not yet been loop-scheduled, that is done, too, with no
specific arguments.
"""
super(PyOpenCLKernelExecutor, self).__init__(kernel)
self.context = context self.context = context
from loopy.target.pyopencl import PyOpenCLTarget def get_invoker_uncached(self, t_unit, entrypoint, codegen_result):
if isinstance(kernel.target, PyOpenCLTarget): generator = PyOpenCLExecutionWrapperGenerator()
self.kernel = kernel.copy(target=PyOpenCLTarget(context.devices[0])) return generator(t_unit, entrypoint, codegen_result)
def get_wrapper_generator(self):
    """Return a newly created :class:`PyOpenCLExecutionWrapperGenerator`."""
    wrapper_generator = PyOpenCLExecutionWrapperGenerator()
    return wrapper_generator
@memoize_method @memoize_method
def cl_kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): def translation_unit_info(
kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype_set) self,
arg_to_dtype: constantdict[str, LoopyType] | None = None) -> _KernelInfo:
t_unit = self.get_typed_and_scheduled_translation_unit(arg_to_dtype)
# FIXME: now just need to add the types to the arguments
from loopy.codegen import generate_code_v2 from loopy.codegen import generate_code_v2
codegen_result = generate_code_v2(kernel) from loopy.target.execution import get_highlighted_code
codegen_result = generate_code_v2(t_unit)
dev_code = codegen_result.device_code() dev_code = codegen_result.device_code()
if self.kernel.options.write_cl: if t_unit[self.entrypoint].options.write_code:
# FIXME: redirect to "translation unit" level option as well.
output = dev_code output = dev_code
if self.kernel.options.highlight_cl: if self.t_unit[self.entrypoint].options.allow_terminal_colors:
output = get_highlighted_cl_code(output) output = get_highlighted_code(output)
if self.kernel.options.write_cl is True: if self.t_unit[self.entrypoint].options.write_code is True:
print(output) print(output)
else: else:
with open(self.kernel.options.write_cl, "w") as outf: with open(
self.t_unit[self.entrypoint].options.write_code, "w"
) as outf:
outf.write(output) outf.write(output)
if self.kernel.options.edit_cl: if t_unit[self.entrypoint].options.edit_code:
# FIXME: redirect to "translation unit" level option as well.
from pytools import invoke_editor from pytools import invoke_editor
dev_code = invoke_editor(dev_code, "code.cl") dev_code = invoke_editor(dev_code, "code.cl")
import pyopencl as cl import pyopencl as cl
# FIXME: redirect to "translation unit" level option as well.
cl_program = ( cl_program = (
cl.Program(self.context, dev_code) cl.Program(self.context, dev_code)
.build(options=kernel.options.cl_build_options)) .build(options=t_unit[self.entrypoint].options.build_options))
cl_kernels = _CLKernels() cl_kernels = _Kernels()
for dp in codegen_result.device_programs: for dp in cl_program.kernel_names.split(";"):
setattr(cl_kernels, dp.name, getattr(cl_program, dp.name)) setattr(cl_kernels, dp, getattr(cl_program, dp))
return _CLKernelInfo( return _KernelInfo(
kernel=kernel,
cl_kernels=cl_kernels, cl_kernels=cl_kernels,
implemented_data_info=codegen_result.implemented_data_info, invoker=self.get_invoker(t_unit, self.entrypoint, codegen_result))
invoker=generate_invoker(kernel, codegen_result))
# {{{ debugging aids
def get_code(self, arg_to_dtype=None):
    """Return the device code generated for this executor's kernel.

    :arg arg_to_dtype: ``None``, or a mapping from argument names to dtypes.
        Numpy scalar types and :class:`numpy.dtype` instances are converted
        to :class:`NumpyType`; other values are passed through unchanged.
    """
    def as_loopy_type(dtype):
        # numpy scalar classes (e.g. np.float64) become np.dtype first
        if isinstance(dtype, type) and issubclass(dtype, np.generic):
            dtype = np.dtype(dtype)

        if isinstance(dtype, np.dtype):
            return NumpyType(dtype, self.kernel.target)

        return dtype

    if arg_to_dtype is not None:
        # A frozenset of (name, type) pairs is what gets handed to
        # get_typed_and_scheduled_kernel.
        arg_to_dtype = frozenset(
                (name, as_loopy_type(dtype))
                for name, dtype in arg_to_dtype.items())

    typed_kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype)

    from loopy.codegen import generate_code_v2
    return generate_code_v2(typed_kernel).device_code()
def get_highlighted_code(self, arg_to_dtype=None):
    """Return the result of :meth:`get_code` for *arg_to_dtype*, passed
    through :func:`get_highlighted_cl_code`.
    """
    device_code = self.get_code(arg_to_dtype)
    return get_highlighted_cl_code(device_code)
# }}} def __call__(self, queue, *,
allocator=None, wait_for=None, out_host=None,
def __call__(self, queue, **kwargs): **kwargs):
""" """
:arg allocator: a callable passed a byte count and returning :arg allocator: a callable passed a byte count and returning
a :class:`pyopencl.Buffer`. A :class:`pyopencl` allocator a :class:`pyopencl.Buffer`. A :mod:`pyopencl` allocator
maybe. maybe.
:arg wait_for: A list of :class:`pyopencl.Event` instances :arg wait_for: A list of :class:`pyopencl.Event` instances
for which to wait. for which to wait.
...@@ -738,43 +384,18 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): ...@@ -738,43 +384,18 @@ class PyOpenCLKernelExecutor(KernelExecutorBase):
of the returned arrays. of the returned arrays.
""" """
allocator = kwargs.pop("allocator", None) if __debug__:
wait_for = kwargs.pop("wait_for", None) self.check_for_required_array_arguments(kwargs.keys())
out_host = kwargs.pop("out_host", None)
kwargs = self.packing_controller.unpack(kwargs) if self.packing_controller is not None:
kwargs = self.packing_controller(kwargs)
kernel_info = self.cl_kernel_info(self.arg_to_dtype_set(kwargs)) translation_unit_info = self.translation_unit_info(self.arg_to_dtype(kwargs))
return kernel_info.invoker( return translation_unit_info.invoker(
kernel_info.cl_kernels, queue, allocator, wait_for, translation_unit_info.cl_kernels, queue, allocator, wait_for,
out_host, **kwargs) out_host, **kwargs)
# }}} # }}}
def get_highlighted_python_code(text):
    """Return *text* highlighted as Python source for terminal output.

    If :mod:`pygments` cannot be imported, *text* is returned unchanged.
    """
    try:
        from pygments import highlight
    except ImportError:
        # pygments missing: fall back to the plain text
        return text

    from pygments.formatters import TerminalFormatter
    from pygments.lexers import PythonLexer

    return highlight(text, PythonLexer(), TerminalFormatter())
def get_highlighted_cl_code(text):
    """Return *text* highlighted with the pygments C lexer for terminal
    output.

    If :mod:`pygments` cannot be imported, *text* is returned unchanged.
    """
    try:
        from pygments import highlight
    except ImportError:
        # pygments missing: fall back to the plain text
        return text

    from pygments.formatters import TerminalFormatter
    from pygments.lexers import CLexer

    return highlight(text, CLexer(), TerminalFormatter())
# vim: foldmethod=marker # vim: foldmethod=marker
"""Python host AST builder for integration with PyOpenCL.""" """Python host AST builder for integration with PyOpenCL."""
from __future__ import annotations
from __future__ import division, absolute_import
__copyright__ = "Copyright (C) 2016 Andreas Kloeckner" __copyright__ = "Copyright (C) 2016 Andreas Kloeckner"
...@@ -24,16 +24,23 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ...@@ -24,16 +24,23 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. THE SOFTWARE.
""" """
import six from typing import TYPE_CHECKING, Sequence
import numpy as np import numpy as np
from genpy import Collection, Generable, Suite
from pymbolic.mapper import Mapper from pymbolic.mapper import Mapper
from pymbolic.mapper.stringifier import StringifyMapper from pymbolic.mapper.stringifier import StringifyMapper
from loopy.type_inference import TypeInferenceMapper
from loopy.diagnostic import LoopyError
from loopy.kernel.data import ValueArg from loopy.kernel.data import ValueArg
from loopy.diagnostic import LoopyError # noqa
from loopy.target import ASTBuilderBase from loopy.target import ASTBuilderBase
from genpy import Suite from loopy.type_inference import TypeReader
if TYPE_CHECKING:
from loopy.codegen import CodeGenerationState
from loopy.codegen.result import CodeGenerationResult
# {{{ expression to code # {{{ expression to code
...@@ -44,19 +51,24 @@ class ExpressionToPythonMapper(StringifyMapper): ...@@ -44,19 +51,24 @@ class ExpressionToPythonMapper(StringifyMapper):
self.codegen_state = codegen_state self.codegen_state = codegen_state
if type_inf_mapper is None: if type_inf_mapper is None:
type_inf_mapper = TypeInferenceMapper(self.kernel) type_inf_mapper = TypeReader(self.kernel,
self.codegen_state.callables_table)
self.type_inf_mapper = type_inf_mapper self.type_inf_mapper = type_inf_mapper
def handle_unsupported_expression(self, victim, enclosing_prec): def handle_unsupported_expression(self, victim, enclosing_prec):
return Mapper.handle_unsupported_expression(self, victim, enclosing_prec) return Mapper.handle_unsupported_expression(self, victim, enclosing_prec)
def rec(self, expr, prec, type_context=None, needed_dtype=None): def rec(self, expr, prec, type_context=None, needed_dtype=None):
return super(ExpressionToPythonMapper, self).rec(expr, prec) return super().rec(expr, prec)
__call__ = rec # FIXME: Fix once mappers are precisely typed
__call__ = rec # type: ignore[assignment]
def map_constant(self, expr, enclosing_prec): def map_constant(self, expr, enclosing_prec):
return repr(expr) if isinstance(expr, np.generic):
return repr(expr).replace("np.", "_lpy_np.")
else:
return repr(expr)
def map_variable(self, expr, enclosing_prec): def map_variable(self, expr, enclosing_prec):
if expr.name in self.codegen_state.var_subst_map: if expr.name in self.codegen_state.var_subst_map:
...@@ -66,63 +78,46 @@ class ExpressionToPythonMapper(StringifyMapper): ...@@ -66,63 +78,46 @@ class ExpressionToPythonMapper(StringifyMapper):
enclosing_prec)) enclosing_prec))
if expr.name in self.kernel.all_inames(): if expr.name in self.kernel.all_inames():
return super(ExpressionToPythonMapper, self).map_variable( return super().map_variable(
expr, enclosing_prec) expr, enclosing_prec)
var_descr = self.kernel.get_var_descriptor(expr.name) var_descr = self.kernel.get_var_descriptor(expr.name)
if isinstance(var_descr, ValueArg): if isinstance(var_descr, ValueArg):
return super(ExpressionToPythonMapper, self).map_variable( return super().map_variable(
expr, enclosing_prec) expr, enclosing_prec)
return super(ExpressionToPythonMapper, self).map_variable( return super().map_variable(
expr, enclosing_prec) expr, enclosing_prec)
def map_subscript(self, expr, enclosing_prec): def map_subscript(self, expr, enclosing_prec):
return super(ExpressionToPythonMapper, self).map_subscript( return super().map_subscript(
expr, enclosing_prec) expr, enclosing_prec)
def map_call(self, expr, enclosing_prec): def map_call(self, expr, enclosing_prec):
from pymbolic.primitives import Variable
from pymbolic.mapper.stringifier import PREC_NONE from pymbolic.mapper.stringifier import PREC_NONE
identifier = expr.function identifier_name = self.codegen_state.callables_table[
expr.function.name].name
if identifier.name in ["indexof", "indexof_vec"]: if identifier_name in ["indexof", "indexof_vec"]:
raise LoopyError( raise LoopyError(
"indexof, indexof_vec not yet supported in Python") "indexof, indexof_vec not yet supported in Python")
if isinstance(identifier, Variable): clbl = self.codegen_state.callables_table[
identifier = identifier.name expr.function.name]
par_dtypes = tuple(self.type_inf_mapper(par) for par in expr.parameters)
str_parameters = None str_parameters = None
number_of_assignees = len([key for key in
clbl.arg_id_to_dtype.keys() if key < 0])
mangle_result = self.kernel.mangle_function( if number_of_assignees != 1:
identifier, par_dtypes,
ast_builder=self.codegen_state.ast_builder)
if mangle_result is None:
raise RuntimeError("function '%s' unknown--"
"maybe you need to register a function mangler?"
% identifier)
if len(mangle_result.result_dtypes) != 1:
raise LoopyError("functions with more or fewer than one return value " raise LoopyError("functions with more or fewer than one return value "
"may not be used in an expression") "may not be used in an expression")
str_parameters = [ str_parameters = [self.rec(par, PREC_NONE) for par in expr.parameters]
self.rec(par, PREC_NONE)
for par, par_dtype, tgt_dtype in zip(
expr.parameters, par_dtypes, mangle_result.arg_dtypes)]
from loopy.codegen import SeenFunction return "{}({})".format(clbl.name_in_target,
self.codegen_state.seen_functions.add( ", ".join(str_parameters))
SeenFunction(identifier,
mangle_result.target_name,
mangle_result.arg_dtypes or par_dtypes))
return "%s(%s)" % (mangle_result.target_name, ", ".join(str_parameters))
def map_group_hw_index(self, expr, enclosing_prec): def map_group_hw_index(self, expr, enclosing_prec):
raise LoopyError("plain Python does not have group hw axes") raise LoopyError("plain Python does not have group hw axes")
...@@ -148,34 +143,8 @@ class ExpressionToPythonMapper(StringifyMapper): ...@@ -148,34 +143,8 @@ class ExpressionToPythonMapper(StringifyMapper):
# }}} # }}}
# {{{ genpy extensions
class Collection(Suite):
    """A :class:`Suite` whose :meth:`generate` yields the lines generated
    by each entry of ``self.contents``, in order.
    """

    def generate(self):
        for child in self.contents:
            yield from child.generate()
# }}}
# {{{ ast builder # {{{ ast builder
def _numpy_single_arg_function_mangler(kernel, name, arg_dtypes):
    """Mangle a one-argument call to a function that exists in :mod:`numpy`.

    Returns ``None`` unless *name* is a string naming an attribute of numpy
    and exactly one argument dtype is given. Otherwise returns a
    ``CallMangleInfo`` targeting ``_lpy_np.<name>`` whose result dtype is
    the argument dtype.
    """
    is_single_arg_numpy_func = (
            isinstance(name, str)
            and hasattr(np, name)
            and len(arg_dtypes) == 1)
    if not is_single_arg_numpy_func:
        return None

    from loopy.kernel.data import CallMangleInfo

    (only_dtype,) = arg_dtypes
    return CallMangleInfo(
            target_name="_lpy_np."+name,
            result_dtypes=(only_dtype,),
            arg_dtypes=arg_dtypes)
def _base_python_preamble_generator(preamble_info): def _base_python_preamble_generator(preamble_info):
yield ("00_future", "from __future__ import division, print_function\n") yield ("00_future", "from __future__ import division, print_function\n")
yield ("05_numpy_import", """ yield ("05_numpy_import", """
...@@ -183,27 +152,33 @@ def _base_python_preamble_generator(preamble_info): ...@@ -183,27 +152,33 @@ def _base_python_preamble_generator(preamble_info):
""") """)
class PythonASTBuilderBase(ASTBuilderBase): class PythonASTBuilderBase(ASTBuilderBase[Generable]):
"""A Python host AST builder for integration with PyOpenCL. """A Python host AST builder for integration with PyOpenCL.
""" """
# {{{ code generation guts @property
def known_callables(self):
def function_manglers(self): from loopy.target.c import get_c_callables
return ( callables = super().known_callables
super(PythonASTBuilderBase, self).function_manglers() + [ callables.update(get_c_callables())
_numpy_single_arg_function_mangler, return callables
])
def preamble_generators(self): def preamble_generators(self):
return ( return (
super(PythonASTBuilderBase, self).preamble_generators() + [ [*super().preamble_generators(), _base_python_preamble_generator])
_base_python_preamble_generator
])
def get_function_declaration(self, codegen_state, codegen_result, # {{{ code generation guts
schedule_index):
return None @property
def ast_module(self):
import genpy
return genpy
def get_function_declaration(
self, codegen_state: CodeGenerationState,
codegen_result: CodeGenerationResult, schedule_index: int
) -> tuple[Sequence[tuple[str, str]], Generable | None]:
return [], None
def get_function_definition(self, codegen_state, codegen_result, def get_function_definition(self, codegen_state, codegen_result,
schedule_index, schedule_index,
...@@ -223,12 +198,12 @@ class PythonASTBuilderBase(ASTBuilderBase): ...@@ -223,12 +198,12 @@ class PythonASTBuilderBase(ASTBuilderBase):
result = [] result = []
from pymbolic.mapper.stringifier import PREC_NONE
from genpy import Assign from genpy import Assign
from pymbolic.mapper.stringifier import PREC_NONE
for tv in sorted( for tv in sorted(
six.itervalues(kernel.temporary_variables), kernel.temporary_variables.values(),
key=lambda tv: tv.name): key=lambda key_tv: key_tv.name):
if tv.shape: if tv.shape:
result.append( result.append(
Assign( Assign(
...@@ -236,7 +211,10 @@ class PythonASTBuilderBase(ASTBuilderBase): ...@@ -236,7 +211,10 @@ class PythonASTBuilderBase(ASTBuilderBase):
"_lpy_np.empty(%s, dtype=%s)" "_lpy_np.empty(%s, dtype=%s)"
% ( % (
ecm(tv.shape, PREC_NONE, "i"), ecm(tv.shape, PREC_NONE, "i"),
"_lpy_np."+tv.dtype.numpy_dtype.name "_lpy_np."+(
tv.dtype.numpy_dtype.name
if tv.dtype.numpy_dtype.name != "bool"
else "bool_")
))) )))
return result return result
...@@ -256,11 +234,14 @@ class PythonASTBuilderBase(ASTBuilderBase): ...@@ -256,11 +234,14 @@ class PythonASTBuilderBase(ASTBuilderBase):
return Collection return Collection
def emit_sequential_loop(self, codegen_state, iname, iname_dtype, def emit_sequential_loop(self, codegen_state, iname, iname_dtype,
lbound, ubound, inner): lbound, ubound, inner, hints):
ecm = codegen_state.expression_to_code_mapper ecm = codegen_state.expression_to_code_mapper
from pymbolic.mapper.stringifier import PREC_NONE, PREC_SUM
from genpy import For from genpy import For
from pymbolic.mapper.stringifier import PREC_NONE, PREC_SUM
if hints:
raise ValueError("hints for python loops not supported")
return For( return For(
(iname,), (iname,),
...@@ -283,6 +264,14 @@ class PythonASTBuilderBase(ASTBuilderBase): ...@@ -283,6 +264,14 @@ class PythonASTBuilderBase(ASTBuilderBase):
from genpy import Comment from genpy import Comment
return Comment(s) return Comment(s)
def emit_noop_with_comment(self, s):
    """Return a line of the form ``pass #<s>``: a no-op carrying *s* as a
    trailing comment.
    """
    from cgen import Line

    noop_text = f"pass #{s}"
    return Line(noop_text)
@property
def can_implement_conditionals(self):
    """Always *True* for this AST builder (see :meth:`emit_if`)."""
    return True
def emit_if(self, condition_str, ast): def emit_if(self, condition_str, ast):
from genpy import If from genpy import If
return If(condition_str, ast) return If(condition_str, ast)
...@@ -293,8 +282,8 @@ class PythonASTBuilderBase(ASTBuilderBase): ...@@ -293,8 +282,8 @@ class PythonASTBuilderBase(ASTBuilderBase):
if insn.atomicity: if insn.atomicity:
raise NotImplementedError("atomic ops in Python") raise NotImplementedError("atomic ops in Python")
from pymbolic.mapper.stringifier import PREC_NONE
from genpy import Assign from genpy import Assign
from pymbolic.mapper.stringifier import PREC_NONE
return Assign( return Assign(
ecm(insn.assignee, prec=PREC_NONE, type_context=None), ecm(insn.assignee, prec=PREC_NONE, type_context=None),
......
from __future__ import division, absolute_import from __future__ import annotations
import six
__copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
...@@ -23,35 +23,42 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ...@@ -23,35 +23,42 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. THE SOFTWARE.
""" """
import collections.abc as abc
import logging
from functools import cached_property
from sys import intern
import numpy as np import numpy as np
from pytools.persistent_dict import KeyBuilder as KeyBuilderBase from constantdict import constantdict
from loopy.symbolic import WalkMapper as LoopyWalkMapper
from pymbolic.mapper.persistent_hash import (
PersistentHashWalkMapper as PersistentHashWalkMapperBase)
import six # noqa
from six.moves import intern
import islpy as isl
from pytools import ProcessLogger, memoize_method
from pytools.persistent_dict import (
KeyBuilder as KeyBuilderBase,
WriteOncePersistentDict,
)
if six.PY2: from .symbolic import (
def is_integer(obj): RuleAwareIdentityMapper,
return isinstance(obj, (int, long, np.integer)) # noqa )
else: from .typing import is_integer # noqa: F401
def is_integer(obj):
return isinstance(obj, (int, np.integer))
# {{{ custom KeyBuilder subclass logger = logging.getLogger(__name__)
class PersistentHashWalkMapper(LoopyWalkMapper, PersistentHashWalkMapperBase):
"""A subclass of :class:`loopy.symbolic.WalkMapper` for constructing def update_persistent_hash(obj, key_hash, key_builder):
persistent hash keys for use with """
Custom hash computation function for use with
:class:`pytools.persistent_dict.PersistentDict`. :class:`pytools.persistent_dict.PersistentDict`.
See also :meth:`LoopyKeyBuilder.update_for_pymbolic_expression`. Only works in conjunction with :class:`loopy.tools.KeyBuilder`.
""" """
for field_name in obj.hash_fields:
key_builder.rec(key_hash, getattr(obj, field_name))
# <empty implementation>
# {{{ custom KeyBuilder subclass
class LoopyKeyBuilder(KeyBuilderBase): class LoopyKeyBuilder(KeyBuilderBase):
"""A custom :class:`pytools.persistent_dict.KeyBuilder` subclass """A custom :class:`pytools.persistent_dict.KeyBuilder` subclass
...@@ -63,10 +70,8 @@ class LoopyKeyBuilder(KeyBuilderBase): ...@@ -63,10 +70,8 @@ class LoopyKeyBuilder(KeyBuilderBase):
update_for_list = KeyBuilderBase.update_for_tuple update_for_list = KeyBuilderBase.update_for_tuple
update_for_set = KeyBuilderBase.update_for_frozenset update_for_set = KeyBuilderBase.update_for_frozenset
def update_for_dict(self, key_hash, key): update_for_dict = KeyBuilderBase.update_for_constantdict
# Order matters for the hash--insert in sorted order. update_for_defaultdict = KeyBuilderBase.update_for_constantdict
for dict_key in sorted(six.iterkeys(key)):
self.rec(key_hash, (dict_key, key[dict_key]))
def update_for_BasicSet(self, key_hash, key): # noqa def update_for_BasicSet(self, key_hash, key): # noqa
from islpy import Printer from islpy import Printer
...@@ -74,41 +79,65 @@ class LoopyKeyBuilder(KeyBuilderBase): ...@@ -74,41 +79,65 @@ class LoopyKeyBuilder(KeyBuilderBase):
getattr(prn, "print_"+key._base_name)(key) getattr(prn, "print_"+key._base_name)(key)
key_hash.update(prn.get_str().encode("utf8")) key_hash.update(prn.get_str().encode("utf8"))
def update_for_type(self, key_hash, key): def update_for_Map(self, key_hash, key): # noqa
try: if isinstance(key, isl.Map):
method = getattr(self, "update_for_type_"+key.__name__) self.update_for_BasicSet(key_hash, key)
except AttributeError:
pass
else: else:
method(key_hash, key) raise AssertionError()
return
raise TypeError("unsupported type for persistent hash keying: %s" # }}}
% type(key))
def update_for_type_auto(self, key_hash, key):
key_hash.update("auto".encode("utf8"))
def update_for_pymbolic_expression(self, key_hash, key): # {{{ eq key builder
if key is None:
self.update_for_NoneType(key_hash, key)
else:
PersistentHashWalkMapper(key_hash)(key)
class LoopyEqKeyBuilder:
"""Unlike :class:`loopy.tools.LoopyKeyBuilder`, this builds keys for use in
equality comparison, such that `key(a) == key(b)` if and only if `a == b`.
The types of objects being compared should satisfy structural equality.
class PymbolicExpressionHashWrapper(object): The output is suitable for use with :class:`loopy.tools.LoopyKeyBuilder`
def __init__(self, expression): provided all fields are persistent hashable.
self.expression = expression
def __eq__(self, other): As an optimization, top-level pymbolic expression fields are stringified for
return (type(self) == type(other) faster comparisons / hash calculations.
and self.expression == other.expression)
def __ne__(self, other): Usage::
return not self.__eq__(other)
def update_persistent_hash(self, key_hash, key_builder): kb = LoopyEqKeyBuilder()
key_builder.update_for_pymbolic_expression(key_hash, self.expression) kb.update_for_class(insn.__class__)
kb.update_for_field("field", insn.field)
...
key = kb.key()
"""
def __init__(self):
self.field_dict = {}
def update_for_class(self, class_):
self.class_ = class_
def update_for_field(self, field_name, value):
self.field_dict[field_name] = value
def key(self):
"""A key suitable for equality comparison."""
return (self.class_.__name__.encode("utf-8"), self.field_dict)
@memoize_method
def hash_key(self):
"""A key suitable for hashing.
"""
# To speed up any calculations that repeatedly use the return value,
# this method returns a hash.
kb = LoopyKeyBuilder()
# Build the key. For faster hashing, avoid hashing field names.
key = (
(self.class_.__name__.encode("utf-8"),
*(self.field_dict[k] for k in sorted(self.field_dict.keys()))))
return kb(key)
# }}} # }}}
...@@ -121,7 +150,9 @@ def remove_common_indentation(code, require_leading_newline=True, ...@@ -121,7 +150,9 @@ def remove_common_indentation(code, require_leading_newline=True,
return code return code
# accommodate pyopencl-ish syntax highlighting # accommodate pyopencl-ish syntax highlighting
code = code.lstrip("//CL//") cl_prefix = "//CL//"
if code.startswith(cl_prefix):
code = code[len(cl_prefix):]
if require_leading_newline and not code.startswith("\n"): if require_leading_newline and not code.startswith("\n"):
return code return code
...@@ -136,11 +167,11 @@ def remove_common_indentation(code, require_leading_newline=True, ...@@ -136,11 +167,11 @@ def remove_common_indentation(code, require_leading_newline=True,
test_line = None test_line = None
if ignore_lines_starting_with: if ignore_lines_starting_with:
for l in lines: for line in lines:
strip_l = l.lstrip() strip_l = line.lstrip()
if (strip_l if (strip_l
and not strip_l.startswith(ignore_lines_starting_with)): and not strip_l.startswith(ignore_lines_starting_with)):
test_line = l test_line = line
break break
else: else:
...@@ -168,16 +199,29 @@ def remove_common_indentation(code, require_leading_newline=True, ...@@ -168,16 +199,29 @@ def remove_common_indentation(code, require_leading_newline=True,
# }}} # }}}
# {{{ remove_lines_with_only_spaces
def remove_lines_with_only_spaces(code):
    """Return *code* without the lines that consist solely of space characters.

    Empty lines and lines containing any other character (tabs included)
    are kept unchanged.
    """
    kept_lines = []
    for text_line in code.split("\n"):
        # A line is dropped only if it is non-empty and made up of " " alone.
        is_spaces_only = bool(text_line) and not text_line.strip(" ")
        if not is_spaces_only:
            kept_lines.append(text_line)
    return "\n".join(kept_lines)
# }}}
# {{{ build_ispc_shared_lib # {{{ build_ispc_shared_lib
# DO NOT RELY ON THESE: THEY WILL GO AWAY # DO NOT RELY ON THESE: THEY WILL GO AWAY
def build_ispc_shared_lib( def build_ispc_shared_lib(
cwd, ispc_sources, cxx_sources, cwd, ispc_sources, cxx_sources,
ispc_options=[], cxx_options=[], ispc_options=None, cxx_options=None,
ispc_bin="ispc", ispc_bin="ispc",
cxx_bin="g++", cxx_bin="g++",
quiet=True): quiet=True):
if ispc_options is None:
ispc_options = []
if cxx_options is None:
cxx_options = []
from os.path import join from os.path import join
ispc_source_names = [] ispc_source_names = []
...@@ -196,25 +240,14 @@ def build_ispc_shared_lib( ...@@ -196,25 +240,14 @@ def build_ispc_shared_lib(
from subprocess import check_call from subprocess import check_call
ispc_cmd = ([ispc_bin, ispc_cmd = ([ispc_bin, "--pic", "-o", "ispc.o", *ispc_options, *ispc_source_names])
"--pic",
"-o", "ispc.o"]
+ ispc_options
+ list(ispc_source_names))
if not quiet: if not quiet:
print(" ".join(ispc_cmd)) print(" ".join(ispc_cmd))
check_call(ispc_cmd, cwd=cwd) check_call(ispc_cmd, cwd=cwd)
cxx_cmd = ([ cxx_cmd = ([cxx_bin, "-shared", "-Wl,--export-dynamic", "-fPIC", "-oshared.so",
cxx_bin, "ispc.o", *cxx_options, *cxx_source_names])
"-shared", "-Wl,--export-dynamic",
"-fPIC",
"-oshared.so",
"ispc.o",
]
+ cxx_options
+ list(cxx_source_names))
check_call(cxx_cmd, cwd=cwd) check_call(cxx_cmd, cwd=cwd)
...@@ -233,7 +266,7 @@ def address_from_numpy(obj): ...@@ -233,7 +266,7 @@ def address_from_numpy(obj):
if ary_intf is None: if ary_intf is None:
raise RuntimeError("no array interface") raise RuntimeError("no array interface")
buf_base, is_read_only = ary_intf["data"] buf_base, _is_read_only = ary_intf["data"]
return buf_base + ary_intf.get("offset", 0) return buf_base + ary_intf.get("offset", 0)
...@@ -243,8 +276,8 @@ def cptr_from_numpy(obj): ...@@ -243,8 +276,8 @@ def cptr_from_numpy(obj):
# https://github.com/hgomersall/pyFFTW/blob/master/pyfftw/utils.pxi#L172 # https://github.com/hgomersall/pyFFTW/blob/master/pyfftw/utils.pxi#L172
def empty_aligned(shape, dtype, order='C', n=64): def empty_aligned(shape, dtype, order="C", n=64):
'''empty_aligned(shape, dtype='float64', order='C', n=None) """empty_aligned(shape, dtype='float64', order="C", n=None)
Function that returns an empty numpy array that is n-byte aligned, Function that returns an empty numpy array that is n-byte aligned,
where ``n`` is determined by inspecting the CPU if it is not where ``n`` is determined by inspecting the CPU if it is not
provided. provided.
...@@ -252,7 +285,7 @@ def empty_aligned(shape, dtype, order='C', n=64): ...@@ -252,7 +285,7 @@ def empty_aligned(shape, dtype, order='C', n=64):
``n`` is not provided then this function will inspect the CPU to ``n`` is not provided then this function will inspect the CPU to
determine alignment. The rest of the arguments are as per determine alignment. The rest of the arguments are as per
:func:`numpy.empty`. :func:`numpy.empty`.
''' """
itemsize = np.dtype(dtype).itemsize itemsize = np.dtype(dtype).itemsize
# Apparently there is an issue with numpy.prod wrapping around on 32-bits # Apparently there is an issue with numpy.prod wrapping around on 32-bits
...@@ -270,10 +303,10 @@ def empty_aligned(shape, dtype, order='C', n=64): ...@@ -270,10 +303,10 @@ def empty_aligned(shape, dtype, order='C', n=64):
# We now need to know how to offset base_ary # We now need to know how to offset base_ary
# so it is correctly aligned # so it is correctly aligned
_array_aligned_offset = (n-address_from_numpy(base_ary)) % n array_aligned_offset = (n-address_from_numpy(base_ary)) % n
array = np.frombuffer( array = np.frombuffer(
base_ary[_array_aligned_offset:_array_aligned_offset-n].data, base_ary[array_aligned_offset:array_aligned_offset-n].data,
dtype=dtype).reshape(shape, order=order) dtype=dtype).reshape(shape, order=order)
return array return array
...@@ -281,82 +314,19 @@ def empty_aligned(shape, dtype, order='C', n=64): ...@@ -281,82 +314,19 @@ def empty_aligned(shape, dtype, order='C', n=64):
# }}} # }}}
# {{{ compute SCCs with Tarjan's algorithm # {{{ pickled container value
def compute_sccs(graph):
to_search = set(graph.keys())
visit_order = {}
scc_root = {}
sccs = []
while to_search:
top = next(iter(to_search))
call_stack = [(top, iter(graph[top]), None)]
visit_stack = []
visiting = set()
scc = []
while call_stack:
top, children, last_popped_child = call_stack.pop()
if top not in visiting:
# Unvisited: mark as visited, initialize SCC root.
count = len(visit_order)
visit_stack.append(top)
visit_order[top] = count
scc_root[top] = count
visiting.add(top)
to_search.discard(top)
# Returned from a recursion, update SCC.
if last_popped_child is not None:
scc_root[top] = min(
scc_root[top],
scc_root[last_popped_child])
for child in children:
if child not in visit_order:
# Recurse.
call_stack.append((top, children, child))
call_stack.append((child, iter(graph[child]), None))
break
if child in visiting:
scc_root[top] = min(
scc_root[top],
visit_order[child])
else:
if scc_root[top] == visit_order[top]:
scc = []
while visit_stack[-1] != top:
scc.append(visit_stack.pop())
scc.append(visit_stack.pop())
for item in scc:
visiting.remove(item)
sccs.append(scc)
return sccs
# }}}
# {{{ lazily unpickling dictionary class _PickledObject:
"""A class meant to wrap a pickled value (for :class:`LazilyUnpicklingDict` and
:class:`LazilyUnpicklingList`).
class _PickledObjectWrapper(object):
"""
A class meant to wrap a pickled value (for :class:`LazilyUnpicklingDictionary`).
""" """
@classmethod def __init__(self, obj):
def from_object(cls, obj): if isinstance(obj, _PickledObject):
if isinstance(obj, cls): self.objstring = obj.objstring
return obj else:
from pickle import dumps from pickle import dumps
return cls(dumps(obj)) self.objstring = dumps(obj)
def __init__(self, objstring):
self.objstring = objstring
def unpickle(self): def unpickle(self):
from pickle import loads from pickle import loads
...@@ -365,13 +335,39 @@ class _PickledObjectWrapper(object): ...@@ -365,13 +335,39 @@ class _PickledObjectWrapper(object):
def __getstate__(self): def __getstate__(self):
return {"objstring": self.objstring} return {"objstring": self.objstring}
def __repr__(self) -> str:
return type(self).__name__ + "(" + repr(self.unpickle()) + ")"
import collections class _PickledObjectWithEqAndPersistentHashKeys(_PickledObject):
"""Like :class:`_PickledObject`, with two additional attributes:
* `eq_key`
* `persistent_hash_key`
class LazilyUnpicklingDictionary(collections.MutableMapping): This allows for comparison and for persistent hashing without unpickling.
""" """
A dictionary-like object which lazily unpickles its values.
def __init__(self, obj, eq_key, persistent_hash_key):
_PickledObject.__init__(self, obj)
self.eq_key = eq_key
self.persistent_hash_key = persistent_hash_key
def update_persistent_hash(self, key_hash, key_builder):
key_builder.rec(key_hash, self.persistent_hash_key)
def __getstate__(self):
return {"objstring": self.objstring,
"eq_key": self.eq_key,
"persistent_hash_key": self.persistent_hash_key}
# }}}
# {{{ lazily unpickling dictionary
class LazilyUnpicklingDict(abc.MutableMapping):
"""A dictionary-like object which lazily unpickles its values.
""" """
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
...@@ -379,7 +375,7 @@ class LazilyUnpicklingDictionary(collections.MutableMapping): ...@@ -379,7 +375,7 @@ class LazilyUnpicklingDictionary(collections.MutableMapping):
def __getitem__(self, key): def __getitem__(self, key):
value = self._map[key] value = self._map[key]
if isinstance(value, _PickledObjectWrapper): if isinstance(value, _PickledObject):
value = self._map[key] = value.unpickle() value = self._map[key] = value.unpickle()
return value return value
...@@ -396,13 +392,201 @@ class LazilyUnpicklingDictionary(collections.MutableMapping): ...@@ -396,13 +392,201 @@ class LazilyUnpicklingDictionary(collections.MutableMapping):
return iter(self._map) return iter(self._map)
def __getstate__(self): def __getstate__(self):
return {"_map": dict( return {"_map": {
(key, _PickledObjectWrapper.from_object(val)) key: _PickledObject(val)
for key, val in six.iteritems(self._map))} for key, val in self._map.items()}}
def __repr__(self) -> str:
return type(self).__name__ + "(" + repr(self._map) + ")"
# }}} # }}}
# {{{ lazily unpickling list
class LazilyUnpicklingList(abc.MutableSequence):
    """A list-like container whose entries are unpickled on first access.

    Entries stored as :class:`_PickledObject` are replaced in place by their
    unpickled value the first time they are read through indexing. When the
    container itself is pickled, every entry is wrapped in a
    :class:`_PickledObject`.
    """

    def __init__(self, *args, **kwargs):
        self._list = list(*args, **kwargs)

    def __getitem__(self, key):
        entry = self._list[key]
        if not isinstance(entry, _PickledObject):
            return entry

        # First access to a still-pickled entry: unpickle and cache it.
        unpickled = entry.unpickle()
        self._list[key] = unpickled
        return unpickled

    def __setitem__(self, key, value):
        self._list[key] = value

    def __delitem__(self, key):
        del self._list[key]

    def __len__(self):
        return len(self._list)

    def insert(self, key, value):
        self._list.insert(key, value)

    def __getstate__(self):
        wrapped = []
        for val in self._list:
            wrapped.append(_PickledObject(val))
        return {"_list": wrapped}

    def __add__(self, other):
        # Operates on the underlying storage; the result is a plain list.
        return self._list + other

    def __mul__(self, other):
        # Operates on the underlying storage; the result is a plain list.
        return self._list * other

    def __repr__(self) -> str:
        return f"{type(self).__name__}({self._list!r})"
class LazilyUnpicklingListWithEqAndPersistentHashing(LazilyUnpicklingList):
    """A list which lazily unpickles its values, and supports equality comparison
    and persistent hashing without unpickling.

    Persistent hashing only works in conjunction with :class:`LoopyKeyBuilder`.

    Equality comparison and persistent hashing are implemented by supplying
    functions `eq_key_getter` and `persistent_hash_key_getter` to the
    constructor. These functions should return keys that can be used in place of
    the original object for the respective purposes of equality comparison and
    persistent hashing.

    Both getters are required keyword arguments; all remaining arguments are
    forwarded to :class:`LazilyUnpicklingList`.
    """

    def __init__(self, *args, **kwargs):
        self.eq_key_getter = kwargs.pop("eq_key_getter")
        self.persistent_hash_key_getter = kwargs.pop("persistent_hash_key_getter")
        LazilyUnpicklingList.__init__(self, *args, **kwargs)

    def update_persistent_hash(self, key_hash, key_builder):
        key_builder.update_for_list(key_hash, self._list)

    def _get_eq_key(self, obj):
        """Return the equality key of *obj*, using the stored key if *obj* is
        still pickled."""
        if isinstance(obj, _PickledObjectWithEqAndPersistentHashKeys):
            return obj.eq_key
        return self.eq_key_getter(obj)

    def _get_persistent_hash_key(self, obj):
        """Return the persistent hash key of *obj*, using the stored key if
        *obj* is still pickled."""
        if isinstance(obj, _PickledObjectWithEqAndPersistentHashKeys):
            return obj.persistent_hash_key
        return self.persistent_hash_key_getter(obj)

    def __eq__(self, other):
        if not isinstance(other, (list, LazilyUnpicklingList)):
            return NotImplemented

        if isinstance(other, LazilyUnpicklingList):
            other = other._list

        if len(self) != len(other):
            return False

        # Compare via eq keys so that no entry has to be unpickled.
        return not any(
            self._get_eq_key(a) != self._get_eq_key(b)
            for a, b in zip(self._list, other))

    def __ne__(self, other):
        result = self.__eq__(other)
        # Do not negate NotImplemented: "not NotImplemented" is False (and
        # deprecated), which would report unrelated objects as *equal*.
        if result is NotImplemented:
            return NotImplemented
        return not result

    def __getstate__(self):
        return {"_list": [
            _PickledObjectWithEqAndPersistentHashKeys(
                val,
                self._get_eq_key(val),
                self._get_persistent_hash_key(val))
            for val in self._list],
            "eq_key_getter": self.eq_key_getter,
            "persistent_hash_key_getter": self.persistent_hash_key_getter}
# }}}
# {{{ optional object
class _no_value:  # noqa
    """Sentinel type: passed as a value, it means "no value present"."""


class Optional:
    """A wrapper for an optionally present object.

    .. attribute:: has_value

        *True* if and only if this object contains a value.

    .. attribute:: value

        The value, if present. Accessing it on an empty instance raises
        :exc:`AttributeError`.
    """

    __slots__ = ("_value", "has_value")

    def __init__(self, value=_no_value):
        self.has_value = value is not _no_value
        if self.has_value:
            self._value = value

    def __str__(self):
        if not self.has_value:
            return "Optional()"
        # Wrap in a 1-tuple: a bare tuple value would otherwise be unpacked
        # as the "%" format arguments.
        return "Optional(%s)" % (self._value,)

    def __repr__(self):
        if not self.has_value:
            return "Optional()"
        # See __str__ for why the value is wrapped in a 1-tuple.
        return "Optional(%r)" % (self._value,)

    def __getstate__(self):
        if not self.has_value:
            return _no_value

        return (self._value,)

    def __setstate__(self, state):
        if state is _no_value:
            self.has_value = False
            return

        self.has_value = True
        self._value, = state

    def __eq__(self, other):
        # Let Python fall back to its default handling for foreign types
        # instead of failing on a missing 'has_value' attribute.
        if not isinstance(other, Optional):
            return NotImplemented

        if not self.has_value:
            return not other.has_value

        return self.value == other.value if other.has_value else False

    def __ne__(self, other):
        result = self.__eq__(other)
        if result is NotImplemented:
            return NotImplemented
        return not result

    # Former (misspelled) name of __ne__, kept for any explicit callers.
    __neq__ = __ne__

    @property
    def value(self):
        if not self.has_value:
            raise AttributeError("optional value not present")
        return self._value

    def update_persistent_hash(self, key_hash, key_builder):
        key_builder.rec(
                key_hash,
                (self._value,) if self.has_value else ())

    def __hash__(self):
        if not self.has_value:
            return hash((type(self), False))
        else:
            return hash((self.has_value, self._value))
# }}}
def unpickles_equally(obj):
    """Return the result of comparing a pickle round trip of *obj* with *obj*."""
    import pickle

    roundtripped = pickle.loads(pickle.dumps(obj))
    return roundtripped == obj
def is_interned(s): def is_interned(s):
return s is None or intern(s) is s return s is None or intern(s) is s
...@@ -411,4 +595,313 @@ def intern_frozenset_of_ids(fs): ...@@ -411,4 +595,313 @@ def intern_frozenset_of_ids(fs):
return frozenset(intern(s) for s in fs) return frozenset(intern(s) for s in fs)
# vim: foldmethod=marker # {{{ t_unit_to_python
def _is_generated_t_unit_the_same(python_code, var_name, ref_t_unit):
    """
    Helper for :func:`t_unit_to_python`. Executes *python_code* in a fresh
    namespace and returns the result of comparing *ref_t_unit* with the
    object that the code bound to the name *var_name*.

    .. note::

        *python_code* is run with :func:`exec`; only pass code generated by
        this module.
    """
    namespace = {}
    exec(python_code, namespace)

    return ref_t_unit == namespace[var_name]
# {{{ CallablesUnresolver
class _CallablesUnresolver(RuleAwareIdentityMapper):
    """Mapper that replaces calls to resolved functions by calls to the
    wrapped ``function`` attribute of the resolved function.

    Raises :exc:`NotImplementedError` for a resolved function whose name is
    not among :attr:`known_callables`.
    """

    def __init__(self, rule_mapping_context, callables_table, target):
        super().__init__(rule_mapping_context)
        self.callables_table = callables_table
        self.target = target

    @cached_property
    def known_callables(self):
        """Names known to the target's device AST builder, plus the names of
        all :class:`~loopy.kernel.function_interface.CallableKernel` entries
        of the callables table.
        """
        from loopy.kernel.function_interface import CallableKernel

        target_callables = frozenset(
            self.target.get_device_ast_builder().known_callables)
        callable_kernel_names = {
            name
            for name, clbl in self.callables_table.items()
            if isinstance(clbl, CallableKernel)}

        return target_callables | callable_kernel_names

    def map_call(self, expr, expn_state):
        from loopy.symbolic import ResolvedFunction

        if not isinstance(expr.function, ResolvedFunction):
            return super().map_call(expr, expn_state)

        if expr.function.name not in self.known_callables:
            raise NotImplementedError("User-provided scalar callables not"
                                      " supported yet.")

        from pymbolic.primitives import Call

        unresolved_function = expr.function.function
        new_parameters = tuple(self.rec(par, expn_state)
                               for par in expr.parameters)
        return Call(unresolved_function, new_parameters)
def _unresolve_callables(kernel, callables_table):
    """Run :class:`_CallablesUnresolver` over *kernel* and return a copy of the
    result whose state is reset to ``KernelState.INITIAL``.
    """
    from loopy.kernel import KernelState
    from loopy.symbolic import SubstitutionRuleMappingContext

    vng = kernel.get_var_name_generator()
    rule_mapping_context = SubstitutionRuleMappingContext(
        kernel.substitutions, vng)

    unresolver = _CallablesUnresolver(
        rule_mapping_context, callables_table, kernel.target)

    mapped_kernel = unresolver.map_kernel(kernel)
    finished_kernel = rule_mapping_context.finish_kernel(mapped_kernel)
    return finished_kernel.copy(state=KernelState.INITIAL)
# }}}
def _kernel_to_python(kernel, is_entrypoint=False, var_name="kernel"):
"""Render *kernel* as Python source text that rebuilds it through :mod:`loopy`.

The generated code assigns the result of ``lp.make_kernel`` (when
*is_entrypoint* is true) or ``lp.make_function`` (otherwise) to the
variable *var_name*, followed by one ``lp.tag_inames`` call per iname tag.

:returns: the rendered Mako template as a :class:`str`, post-processed by
:func:`remove_common_indentation` and
:func:`remove_lines_with_only_spaces`.
"""
from mako.template import Template
from loopy.kernel.instruction import BarrierInstruction, MultiAssignmentBase
options = {} # options: mapping from insn_id to str of options
# Build the comma-separated option string of every instruction; each piece
# ends in ", " so the final two characters can be cut off below.
for insn in kernel.instructions:
option = f"id={insn.id}, "
if insn.depends_on:
option += ("dep="+":".join(insn.depends_on)+", ")
if insn.tags:
option += ("tags="+":".join(insn.tags)+", ")
if insn.within_inames is not None:
# Non-final inames are emitted with a leading "+".
if insn.within_inames_is_final:
option += ("inames="+":".join(insn.within_inames)+", ")
else:
option += ("inames=+"+":".join(insn.within_inames)+", ")
if isinstance(insn, MultiAssignmentBase):
if insn.atomicity:
option += "atomic, "
elif isinstance(insn, BarrierInstruction):
option += (f"mem_kind={insn.mem_kind}, ")
options[insn.id] = option[:-2] # get rid of the trailing ", "
make_kernel = "make_kernel" if is_entrypoint else "make_function"
# Mako template of the generated script. It is rendered below with
# strict_undefined=True, i.e. undefined template names are errors.
python_code = r"""
<%! import loopy as lp %>
<%! tv_aspace = {0: 'lp.AddressSpace.PRIVATE', 1: 'lp.AddressSpace.LOCAL',
2: 'lp.AddressSpace.GLOBAL', lp.auto: 'lp.auto' } %>
${var_name} = lp.${make_kernel}(
[
% for dom in kernel.domains:
"${str(dom)}",
% endfor
],
'''
% for name, rule in sorted(kernel.substitutions.items(), key=lambda x: x[0]):
${name}(${", ".join(rule.arguments)}) := ${str(rule.expression)}
%endfor
% for id, opts in options.items():
<% insn = kernel.id_to_insn[id] %>
% if isinstance(insn, lp.MultiAssignmentBase):
${','.join([str(a) for a in insn.assignees])} = ${insn.expression} {${opts}}
% elif isinstance(insn, lp.BarrierInstruction):
... ${insn.synchronization_kind[0]}barrier {${opts}}
% elif isinstance(insn, lp.NoOpInstruction):
... nop {${opts}}
% else:
<% raise NotImplementedError(f"Not implemented for {type(insn)}.")%>
% endif
%endfor
''', [
% for arg in kernel.args:
% if isinstance(arg, lp.ValueArg):
lp.ValueArg(
name="${arg.name}",
dtype=${('np.'+arg.dtype.numpy_dtype.name
if arg.dtype else 'None')}),
% else:
lp.GlobalArg(
name="${arg.name}", dtype=${('np.'+arg.dtype.numpy_dtype.name
if arg.dtype else 'None')},
shape=${arg.shape}, for_atomic=${arg.for_atomic}),
% endif
% endfor
% for tv in kernel.temporary_variables.values():
lp.TemporaryVariable(
name="${tv.name}",
dtype=${'np.'+tv.dtype.numpy_dtype.name if tv.dtype else 'lp.auto'},
shape=${tv.shape}, for_atomic=${tv.for_atomic},
address_space=${tv_aspace[tv.address_space]},
read_only=${tv.read_only},
% if tv.initializer is not None:
initializer=${"np."+repr(tv.initializer)},
% endif
),
% endfor
],
lang_version=${lp.MOST_RECENT_LANGUAGE_VERSION},
% if kernel.iname_slab_increments:
iname_slab_increments=${repr(kernel.iname_slab_increments)},
% endif
% if kernel.applied_iname_rewrites:
applied_iname_rewrites=${repr(kernel.applied_iname_rewrites)},
% endif
% if kernel.name != "loopy_kernel":
name="${kernel.name}",
% endif
)
% for iname in kernel.inames.values():
% for tag in iname.tags:
${var_name} = lp.tag_inames(${var_name}, "${"%s:%s" %(iname.name, tag)}")
% endfor
% endfor
"""
python_code = Template(python_code,
strict_undefined=True).render(options=options,
kernel=kernel,
make_kernel=make_kernel,
var_name=var_name)
# Clean up the rendered text before handing it back.
python_code = remove_lines_with_only_spaces(
remove_common_indentation(python_code))
return python_code
def t_unit_to_python(t_unit, var_name="t_unit",
                     return_preamble_and_body_separately=False):
    """
    Returns a :class:`str` of Python code that instantiates *t_unit*.

    :arg t_unit: The translation unit to convert (presumably a
        :class:`loopy.TranslationUnit`); its ``callables_table`` and
        ``entrypoints`` are read.
    :arg var_name: A :class:`str` of the variable name the merged translation
        unit is bound to in the generated Python script.
    :arg return_preamble_and_body_separately: A :class:`bool`.
        If *True* returns ``(preamble, body)``, where ``preamble`` includes the
        import statements and ``body`` includes the kernel, translation unit
        instantiation code.

    .. note::

        The implementation is partially complete and an :class:`AssertionError`
        is raised if the returned Python script does not exactly reproduce
        *t_unit*. Contributions are welcome to fill in the missing voids.
    """
    from loopy.kernel.function_interface import CallableKernel

    # Only callable kernels are carried over, with their callables unresolved.
    new_callables = {}
    for name, clbl in t_unit.callables_table.items():
        if isinstance(clbl, CallableKernel):
            new_callables[name] = CallableKernel(
                _unresolve_callables(clbl.subkernel, t_unit.callables_table))

    t_unit = t_unit.copy(callables_table=constantdict(new_callables))

    # One generated snippet per callable, each bound to "<name>_knl".
    knl_python_code_srcs = []
    for name, clbl in t_unit.callables_table.items():
        knl_python_code_srcs.append(
            _kernel_to_python(clbl.subkernel,
                              name in t_unit.entrypoints,
                              f"{name}_knl"))

    knl_args = ", ".join([f"{name}_knl" for name in t_unit.callables_table])
    merge_stmt = f"{var_name} = lp.merge([{knl_args}])"

    preamble_lines = [
        "import loopy as lp",
        "import numpy as np",
        "from pymbolic.primitives import *",
        "from constantdict import constantdict",
        ]
    preamble_str = "\n".join(preamble_lines)
    body_str = "\n".join([*knl_python_code_srcs, "\n", merge_stmt])

    python_code = "\n".join([preamble_str, "\n", body_str])
    assert _is_generated_t_unit_the_same(python_code, var_name, t_unit)

    if not return_preamble_and_body_separately:
        return python_code

    return preamble_str, body_str
# }}}
# {{{ cache management
caches: list[WriteOncePersistentDict] = []
def clear_in_mem_caches() -> None:
    """Call ``clear_in_mem_cache()`` on every entry of the module-level
    ``caches`` list.
    """
    for persistent_dict in caches:
        persistent_dict.clear_in_mem_cache()
# }}}
# {{{ memoize_on_disk
def memoize_on_disk(func, key_builder_t=LoopyKeyBuilder):
    """Decorate *func* so that its results are memoized in a
    :class:`pytools.persistent_dict.WriteOncePersistentDict`.

    The cache key is ``(func.__qualname__, func.__name__, args, kwargs)``,
    hashed by an instance of *key_builder_t*. The cache is also appended to
    the module-level ``caches`` list (see :func:`clear_in_mem_caches`).

    The wrapper bypasses the cache and calls *func* directly if
    ``loopy.CACHING_ENABLED`` is false or if the caller passes
    ``_no_memoize_on_disk=True``. That keyword argument is consumed by the
    wrapper and never forwarded to *func*.

    :arg func: the function to wrap.
    :arg key_builder_t: the key builder class, called without arguments.
    :returns: the wrapping function.
    """
    from functools import wraps

    from pytools.persistent_dict import WriteOncePersistentDict

    from loopy.kernel import LoopKernel
    from loopy.translation_unit import TranslationUnit
    from loopy.version import DATA_MODEL_VERSION

    transform_cache = WriteOncePersistentDict(
        ("loopy-memoize-cache-"
         f"{func.__name__}-"
         f"{key_builder_t.__qualname__}.{key_builder_t.__name__}"
         f"-v0-{DATA_MODEL_VERSION}"),
        key_builder=key_builder_t(),
        safe_sync=False)
    caches.append(transform_cache)

    @wraps(func)
    def wrapper(*args, **kwargs):
        from loopy import CACHING_ENABLED

        # Pop the control flag unconditionally: with a short-circuiting
        # "not CACHING_ENABLED or kwargs.pop(...)" it would stay in *kwargs*
        # whenever caching is disabled and be passed on to *func*.
        no_memoize = kwargs.pop("_no_memoize_on_disk", False)
        if not CACHING_ENABLED or no_memoize:
            return func(*args, **kwargs)

        cache_key = (func.__qualname__, func.__name__, args, kwargs)

        try:
            result = transform_cache[cache_key]
        except KeyError:
            logger.debug(f"Function {func.__name__} not present"
                         " on disk.")
        else:
            logger.debug(f"Function {func.__name__} returned from"
                         " memoized result on disk.")
            return result

        # Cache miss: describe the call for the process log, then compute.
        if args and isinstance(args[0], LoopKernel):
            proc_log_str = f"{func.__name__} on '{args[0].name}'"
        elif args and isinstance(args[0], TranslationUnit):
            entrypoints_str = ", ".join(args[0].entrypoints)
            proc_log_str = f"{func.__name__} on '{entrypoints_str}'"
        else:
            proc_log_str = f"{func.__name__}"

        with ProcessLogger(logger, proc_log_str):
            result = func(*args, **kwargs)

        transform_cache.store_if_not_present(cache_key, result)
        return result

    return wrapper
# }}}
def is_hashable(o: object) -> bool:
    """Return *True* if ``hash(o)`` succeeds and *False* if it raises
    :class:`TypeError`.
    """
    try:
        hash(o)
    except TypeError:
        hashable = False
    else:
        hashable = True
    return hashable
# vim: fdm=marker
from __future__ import division, absolute_import from __future__ import annotations
__copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
......
from __future__ import annotations
__copyright__ = "Copyright (C) 2017 Kaushik Kulkarni"
__license__ = """
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
from loopy.kernel import LoopKernel
from loopy.kernel.instruction import BarrierInstruction
from loopy.match import parse_match
from loopy.transform.instruction import add_dependency
from loopy.translation_unit import for_each_kernel
__doc__ = """
.. currentmodule:: loopy
.. autofunction:: add_barrier
"""
# {{{ add_barrier
@for_each_kernel
def add_barrier(kernel, insn_before="", insn_after="", id_based_on=None,
                tags=None, synchronization_kind="global", mem_kind=None,
                within_inames=None):
    """Return a copy of *kernel* with a barrier instruction appended.

    The barrier is made to depend on the instructions matched by
    *insn_before*, and the instructions matched by *insn_after* are made to
    depend on the barrier. Both match expressions can be any inputs that are
    understood by :func:`loopy.match.parse_match`.

    :arg insn_before: String expression that specifies the instruction(s)
        before the barrier which is to be added. If *None*, no dependencies
        will be added to the barrier.
    :arg insn_after: String expression that specifies the instruction(s)
        after the barrier which is to be added. If *None*, no dependencies
        on the barrier will be added.
    :arg id_based_on: String on which the id of the barrier is based. If
        *None*, the first character of *synchronization_kind* followed by
        ``"_barrier"`` is used.
    :arg tags: The tag of the group to which the barrier must be added.
    :arg synchronization_kind: Kind of barrier to be added. May be "global"
        or "local".
    :arg mem_kind: Type of memory to be synchronized. May be "global" or
        "local". Ignored for "global" barriers. If not supplied, defaults to
        *synchronization_kind*.
    :arg within_inames: A :class:`frozenset` of inames identifying the loops
        within which the barrier will be executed.
    :returns: The new kernel containing the barrier.
    """
    assert isinstance(kernel, LoopKernel)

    if mem_kind is None:
        mem_kind = synchronization_kind

    if id_based_on is None:
        id_based_on = synchronization_kind[0] + "_barrier"
    # Named 'barrier_id' rather than 'id' so the builtin is not shadowed.
    barrier_id = kernel.make_unique_instruction_id(based_on=id_based_on)

    if insn_before is not None:
        match = parse_match(insn_before)
        insns_before = frozenset(
            insn.id for insn in kernel.instructions if match(kernel, insn))
    else:
        insns_before = None

    barrier_to_add = BarrierInstruction(
        depends_on=insns_before,
        depends_on_is_final=True,
        id=barrier_id,
        within_inames=within_inames,
        tags=tags,
        synchronization_kind=synchronization_kind,
        mem_kind=mem_kind)

    new_kernel = kernel.copy(
        instructions=[*kernel.instructions, barrier_to_add])

    if insn_after is not None:
        new_kernel = add_dependency(
            new_kernel,
            insn_match=insn_after,
            depends_on="id:" + barrier_id)

    return new_kernel
# }}}
# vim: foldmethod=marker
from __future__ import division, absolute_import from __future__ import annotations
__copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
...@@ -23,13 +24,14 @@ THE SOFTWARE. ...@@ -23,13 +24,14 @@ THE SOFTWARE.
""" """
import six
from loopy.diagnostic import LoopyError from loopy.diagnostic import LoopyError
from loopy.kernel import LoopKernel
from loopy.translation_unit import for_each_kernel
# {{{ fold constants # {{{ fold constants
@for_each_kernel
def fold_constants(kernel): def fold_constants(kernel):
from loopy.symbolic import ConstantFoldingMapper from loopy.symbolic import ConstantFoldingMapper
cfm = ConstantFoldingMapper() cfm = ConstantFoldingMapper()
...@@ -38,10 +40,10 @@ def fold_constants(kernel): ...@@ -38,10 +40,10 @@ def fold_constants(kernel):
insn.with_transformed_expressions(cfm) insn.with_transformed_expressions(cfm)
for insn in kernel.instructions] for insn in kernel.instructions]
new_substs = dict( new_substs = {
(sub.name, sub.name:
sub.copy(expression=cfm(sub.expression))) sub.copy(expression=cfm(sub.expression))
for sub in six.itervalues(kernel.substitutions)) for sub in kernel.substitutions.values()}
return kernel.copy( return kernel.copy(
instructions=new_insns, instructions=new_insns,
...@@ -53,7 +55,9 @@ def fold_constants(kernel): ...@@ -53,7 +55,9 @@ def fold_constants(kernel):
# {{{ collect_common_factors_on_increment # {{{ collect_common_factors_on_increment
# thus far undocumented # thus far undocumented
@for_each_kernel
def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()): def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()):
assert isinstance(kernel, LoopKernel)
# FIXME: Does not understand subst rules for now # FIXME: Does not understand subst rules for now
if kernel.substitutions: if kernel.substitutions:
from loopy.transform.subst import expand_subst from loopy.transform.subst import expand_subst
...@@ -74,9 +78,9 @@ def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()): ...@@ -74,9 +78,9 @@ def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()):
from loopy.kernel.array import ArrayBase from loopy.kernel.array import ArrayBase
if isinstance(var_descr, ArrayBase): if isinstance(var_descr, ArrayBase):
if var_descr.dim_names is not None: if var_descr.dim_names is not None:
name_to_index = dict( name_to_index = {
(name, idx) name: idx
for idx, name in enumerate(var_descr.dim_names)) for idx, name in enumerate(var_descr.dim_names)}
else: else:
name_to_index = {} name_to_index = {}
...@@ -85,7 +89,7 @@ def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()): ...@@ -85,7 +89,7 @@ def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()):
try: try:
return name_to_index[ax] return name_to_index[ax]
except KeyError: except KeyError:
raise LoopyError("axis name '%s' not understood " % ax) raise LoopyError("axis name '%s' not understood " % ax) from None
else: else:
return ax return ax
...@@ -102,10 +106,21 @@ def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()): ...@@ -102,10 +106,21 @@ def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()):
# }}} # }}}
from pymbolic.mapper.substitutor import make_subst_func from pymbolic.mapper.substitutor import make_subst_func
from pymbolic.primitives import (Sum, Product, is_zero, from pymbolic.primitives import (
flattened_sum, flattened_product, Subscript, Variable) Product,
from loopy.symbolic import (get_dependencies, SubstitutionMapper, Subscript,
UnidirectionalUnifier) Sum,
Variable,
flattened_product,
flattened_sum,
is_zero,
)
from loopy.symbolic import (
SubstitutionMapper,
UnidirectionalUnifier,
get_dependencies,
)
# {{{ common factor key list maintenance # {{{ common factor key list maintenance
...@@ -113,7 +128,7 @@ def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()): ...@@ -113,7 +128,7 @@ def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()):
common_factors = [] common_factors = []
def find_unifiable_cf_index(index_key): def find_unifiable_cf_index(index_key):
for i, (key, val) in enumerate(common_factors): for i, (key, _val) in enumerate(common_factors):
unif = UnidirectionalUnifier( unif = UnidirectionalUnifier(
lhs_mapping_candidates=get_dependencies(key)) lhs_mapping_candidates=get_dependencies(key))
...@@ -140,8 +155,7 @@ def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()): ...@@ -140,8 +155,7 @@ def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()):
def iterate_as(cls, expr): def iterate_as(cls, expr):
if isinstance(expr, cls): if isinstance(expr, cls):
for ch in expr.children: yield from expr.children
yield ch
else: else:
yield expr yield expr
...@@ -216,9 +230,9 @@ def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()): ...@@ -216,9 +230,9 @@ def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()):
product_parts = set(iterate_as(Product, term)) product_parts = set(iterate_as(Product, term))
my_common_factors = set( my_common_factors = {
cf for cf in my_common_factors cf for cf in my_common_factors
if unif_subst_map(cf) in product_parts) if unif_subst_map(cf) in product_parts}
common_factors[cf_index] = (index_key, my_common_factors) common_factors[cf_index] = (index_key, my_common_factors)
...@@ -263,9 +277,9 @@ def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()): ...@@ -263,9 +277,9 @@ def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()):
unif_subst_map = SubstitutionMapper( unif_subst_map = SubstitutionMapper(
make_subst_func(unif_result.lmap)) make_subst_func(unif_result.lmap))
mapped_my_common_factors = set( mapped_my_common_factors = {
unif_subst_map(cf) unif_subst_map(cf)
for cf in my_common_factors) for cf in my_common_factors}
new_sum_terms = [] new_sum_terms = []
......
from __future__ import division, absolute_import from __future__ import annotations
from six.moves import range, zip
__copyright__ = "Copyright (C) 2012-2015 Andreas Kloeckner" __copyright__ = "Copyright (C) 2012-2015 Andreas Kloeckner"
...@@ -23,16 +23,28 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ...@@ -23,16 +23,28 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. THE SOFTWARE.
""" """
from abc import ABC, abstractmethod
from dataclasses import dataclass, replace
from typing import TYPE_CHECKING, Any, Callable, Sequence
from typing_extensions import Self
import islpy as isl import islpy as isl
from islpy import dim_type from islpy import dim_type
from loopy.symbolic import (get_dependencies, SubstitutionMapper) from pymbolic import ArithmeticExpression, var
from pymbolic.mapper.substitutor import make_subst_func from pymbolic.mapper.substitutor import make_subst_func
from pytools import memoize_method
from pytools import ImmutableRecord, memoize_method from loopy.symbolic import SubstitutionMapper, get_dependencies
from pymbolic import var
class AccessDescriptor(ImmutableRecord): if TYPE_CHECKING:
from loopy.typing import Expression
@dataclass(frozen=True)
class AccessDescriptor:
""" """
.. attribute:: identifier .. attribute:: identifier
...@@ -40,10 +52,11 @@ class AccessDescriptor(ImmutableRecord): ...@@ -40,10 +52,11 @@ class AccessDescriptor(ImmutableRecord):
to the access that generated it. Any Python value. to the access that generated it. Any Python value.
""" """
__slots__ = [ identifier: Any = None
"identifier", storage_axis_exprs: Sequence[ArithmeticExpression] | None = None
"storage_axis_exprs",
] def copy(self, **kwargs) -> Self:
return replace(self, **kwargs)
def to_parameters_or_project_out(param_inames, set_inames, set): def to_parameters_or_project_out(param_inames, set_inames, set):
...@@ -64,9 +77,12 @@ def to_parameters_or_project_out(param_inames, set_inames, set): ...@@ -64,9 +77,12 @@ def to_parameters_or_project_out(param_inames, set_inames, set):
# {{{ construct storage->sweep map # {{{ construct storage->sweep map
def build_per_access_storage_to_domain_map(storage_axis_exprs, domain, def build_per_access_storage_to_domain_map(
storage_axis_names, storage_axis_exprs: Sequence[Expression],
prime_sweep_inames): domain: isl.BasicSet,
storage_axis_names: Sequence[str],
prime_sweep_inames: Callable[[Expression], Expression]
) -> isl.BasicMap:
map_space = domain.space map_space = domain.space
stor_dim = len(storage_axis_names) stor_dim = len(storage_axis_names)
...@@ -89,12 +105,12 @@ def build_per_access_storage_to_domain_map(storage_axis_exprs, domain, ...@@ -89,12 +105,12 @@ def build_per_access_storage_to_domain_map(storage_axis_exprs, domain,
stor2sweep = None stor2sweep = None
from loopy.symbolic import aff_from_expr from loopy.symbolic import guarded_aff_from_expr
for saxis, sa_expr in zip(storage_axis_names, storage_axis_exprs): for saxis, sa_expr in zip(storage_axis_names, storage_axis_exprs):
cns = isl.Constraint.equality_from_aff( cns_expr = var(saxis+"'") - prime_sweep_inames(sa_expr)
aff_from_expr(set_space, cns_aff = guarded_aff_from_expr(set_space, cns_expr)
var(saxis+"'") - prime_sweep_inames(sa_expr))) cns = isl.Constraint.equality_from_aff(cns_aff)
cns_map = isl.BasicMap.from_constraint(cns) cns_map = isl.BasicMap.from_constraint(cns)
if stor2sweep is None: if stor2sweep is None:
...@@ -126,10 +142,8 @@ def move_to_par_from_out(s2smap, except_inames): ...@@ -126,10 +142,8 @@ def move_to_par_from_out(s2smap, except_inames):
return s2smap return s2smap
def build_global_storage_to_sweep_map(kernel, access_descriptors, def build_global_storage_to_sweep_map(access_descriptors,
domain_dup_sweep, dup_sweep_index, domain_dup_sweep, storage_axis_names, prime_sweep_inames):
storage_axis_names,
sweep_inames, primed_sweep_inames, prime_sweep_inames):
# The storage map goes from storage axes to the domain. # The storage map goes from storage axes to the domain.
# The first len(arg_names) storage dimensions are the rule's arguments. # The first len(arg_names) storage dimensions are the rule's arguments.
...@@ -194,7 +208,23 @@ def compute_bounds(kernel, domain, stor2sweep, ...@@ -194,7 +208,23 @@ def compute_bounds(kernel, domain, stor2sweep,
# {{{ array-to-buffer map # {{{ array-to-buffer map
class ArrayToBufferMap(object): class ArrayToBufferMapBase(ABC):
non1_storage_axis_names: tuple[str, ...]
storage_base_indices: tuple[ArithmeticExpression, ...]
non1_storage_shape: tuple[ArithmeticExpression, ...]
non1_storage_axis_flags: tuple[ArithmeticExpression, ...]
@abstractmethod
def is_access_descriptor_in_footprint(self, accdesc: AccessDescriptor) -> bool:
...
@abstractmethod
def augment_domain_with_sweep(self, domain, new_non1_storage_axis_names,
boxify_sweep=False):
...
class ArrayToBufferMap(ArrayToBufferMapBase):
def __init__(self, kernel, domain, sweep_inames, access_descriptors, def __init__(self, kernel, domain, sweep_inames, access_descriptors,
storage_axis_count): storage_axis_count):
self.kernel = kernel self.kernel = kernel
...@@ -217,16 +247,16 @@ class ArrayToBufferMap(object): ...@@ -217,16 +247,16 @@ class ArrayToBufferMap(object):
self.primed_sweep_inames) self.primed_sweep_inames)
self.prime_sweep_inames = SubstitutionMapper(make_subst_func( self.prime_sweep_inames = SubstitutionMapper(make_subst_func(
dict((sin, var(psin)) {sin: var(psin)
for sin, psin in zip(sweep_inames, self.primed_sweep_inames)))) for sin, psin in zip(sweep_inames, self.primed_sweep_inames)}))
# # }}} # # }}}
self.stor2sweep = build_global_storage_to_sweep_map( self.stor2sweep = build_global_storage_to_sweep_map(
kernel, access_descriptors, access_descriptors,
domain_dup_sweep, dup_sweep_index, domain_dup_sweep,
storage_axis_names, storage_axis_names,
sweep_inames, self.primed_sweep_inames, self.prime_sweep_inames) self.prime_sweep_inames)
storage_base_indices, storage_shape = compute_bounds( storage_base_indices, storage_shape = compute_bounds(
kernel, domain, self.stor2sweep, self.primed_sweep_inames, kernel, domain, self.stor2sweep, self.primed_sweep_inames,
...@@ -239,14 +269,13 @@ class ArrayToBufferMap(object): ...@@ -239,14 +269,13 @@ class ArrayToBufferMap(object):
non1_storage_axis_flags = [] non1_storage_axis_flags = []
non1_storage_shape = [] non1_storage_shape = []
for saxis, bi, l in zip( for saxis_len in storage_shape:
storage_axis_names, storage_base_indices, storage_shape): has_length_non1 = saxis_len != 1
has_length_non1 = l != 1
non1_storage_axis_flags.append(has_length_non1) non1_storage_axis_flags.append(has_length_non1)
if has_length_non1: if has_length_non1:
non1_storage_shape.append(l) non1_storage_shape.append(saxis_len)
# }}} # }}}
...@@ -301,7 +330,7 @@ class ArrayToBufferMap(object): ...@@ -301,7 +330,7 @@ class ArrayToBufferMap(object):
self.non1_storage_axis_flags = non1_storage_axis_flags self.non1_storage_axis_flags = non1_storage_axis_flags
self.aug_domain = aug_domain self.aug_domain = aug_domain
self.storage_base_indices = storage_base_indices self.storage_base_indices = storage_base_indices
self.non1_storage_shape = non1_storage_shape self.non1_storage_shape = tuple(non1_storage_shape)
def augment_domain_with_sweep(self, domain, new_non1_storage_axis_names, def augment_domain_with_sweep(self, domain, new_non1_storage_axis_names,
boxify_sweep=False): boxify_sweep=False):
...@@ -332,14 +361,15 @@ class ArrayToBufferMap(object): ...@@ -332,14 +361,15 @@ class ArrayToBufferMap(object):
domain = domain & renamed_aug_domain domain = domain & renamed_aug_domain
from loopy.isl_helpers import convexify, boxify from loopy.isl_helpers import boxify, convexify
if boxify_sweep: if boxify_sweep:
return boxify(self.kernel.cache_manager, domain, return boxify(self.kernel.cache_manager, domain,
new_non1_storage_axis_names, self.kernel.assumptions) new_non1_storage_axis_names, self.kernel.assumptions)
else: else:
return convexify(domain) return convexify(domain)
def is_access_descriptor_in_footprint(self, accdesc): def is_access_descriptor_in_footprint(self, accdesc: AccessDescriptor) -> bool:
assert accdesc.storage_axis_exprs is not None
return self._is_access_descriptor_in_footprint_inner( return self._is_access_descriptor_in_footprint_inner(
tuple(accdesc.storage_axis_exprs)) tuple(accdesc.storage_axis_exprs))
...@@ -352,13 +382,11 @@ class ArrayToBufferMap(object): ...@@ -352,13 +382,11 @@ class ArrayToBufferMap(object):
self.stor2sweep, self.stor2sweep,
except_inames=frozenset(self.primed_sweep_inames)).domain() except_inames=frozenset(self.primed_sweep_inames)).domain()
arg_inames = ( arg_inames = set(global_s2s_par_dom.get_var_names(dim_type.param))
set(global_s2s_par_dom.get_var_names(dim_type.param))
& self.kernel.all_inames())
for arg in storage_axis_exprs: for arg in storage_axis_exprs:
arg_inames.update(get_dependencies(arg)) arg_inames.update(get_dependencies(arg))
arg_inames = frozenset(arg_inames) arg_inames = frozenset(arg_inames & self.kernel.all_inames())
from loopy.kernel import CannotBranchDomainTree from loopy.kernel import CannotBranchDomainTree
try: try:
...@@ -391,30 +419,33 @@ class ArrayToBufferMap(object): ...@@ -391,30 +419,33 @@ class ArrayToBufferMap(object):
except_inames=frozenset(self.primed_sweep_inames)) except_inames=frozenset(self.primed_sweep_inames))
s2s_domain = stor2sweep.domain() s2s_domain = stor2sweep.domain()
s2s_domain, aligned_g_s2s_parm_dom = isl.align_two( s2s_domain, aligned_g_s2s_param_dom = isl.align_two(
s2s_domain, global_s2s_par_dom) s2s_domain, global_s2s_par_dom)
arg_restrictions = ( arg_restrictions = (
aligned_g_s2s_parm_dom aligned_g_s2s_param_dom
.eliminate(dim_type.set, 0, .eliminate(dim_type.set, 0,
aligned_g_s2s_parm_dom.dim(dim_type.set)) aligned_g_s2s_param_dom.dim(dim_type.set))
.remove_divs()) .remove_divs())
return (arg_restrictions & s2s_domain).is_subset( return (arg_restrictions & s2s_domain).is_subset(
aligned_g_s2s_parm_dom) aligned_g_s2s_param_dom)
class NoOpArrayToBufferMap(object): class NoOpArrayToBufferMap(ArrayToBufferMapBase):
non1_storage_axis_names = () non1_storage_axis_names = ()
storage_base_indices = () storage_base_indices = ()
non1_storage_shape = () non1_storage_shape = ()
def is_access_descriptor_in_footprint(self, accdesc): def is_access_descriptor_in_footprint(self, accdesc: AccessDescriptor) -> bool:
# no index dependencies--every reference to the subst rule # no index dependencies--every reference to the subst rule
# is necessarily in the footprint. # is necessarily in the footprint.
return True return True
def augment_domain_with_sweep(self, domain, new_non1_storage_axis_names,
boxify_sweep=False):
return domain
# }}} # }}}
# vim: foldmethod=marker # vim: foldmethod=marker
from __future__ import division, absolute_import from __future__ import annotations
__copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
...@@ -23,12 +24,13 @@ THE SOFTWARE. ...@@ -23,12 +24,13 @@ THE SOFTWARE.
""" """
import six
from loopy.symbolic import (RuleAwareIdentityMapper, SubstitutionRuleMappingContext)
from loopy.kernel.data import ValueArg, GlobalArg
import islpy as isl import islpy as isl
from loopy.kernel.data import ArrayArg, ValueArg
from loopy.symbolic import RuleAwareIdentityMapper, SubstitutionRuleMappingContext
from loopy.translation_unit import for_each_kernel
__doc__ = """ __doc__ = """
.. currentmodule:: loopy .. currentmodule:: loopy
...@@ -38,10 +40,24 @@ __doc__ = """ ...@@ -38,10 +40,24 @@ __doc__ = """
# {{{ to_batched # {{{ to_batched
def temp_needs_batching_if_not_sequential(tv, batch_varying_args):
    """Return whether the temporary *tv* needs batching.

    :arg tv: A temporary variable; its ``name``, ``initializer``,
        ``read_only`` and ``address_space`` attributes are inspected.
    :arg batch_varying_args: Container of names that vary per batch.
    :returns: *True* if ``tv.name`` is in *batch_varying_args*. Otherwise
        *False* for read-only temporaries that have an initializer and for
        temporaries in the private address space, and *True* for all others.
    """
    from loopy.kernel.data import AddressSpace

    # An explicit request always wins.
    if tv.name in batch_varying_args:
        return True

    # Read-only initialized temporaries and private temporaries are not
    # batched unless they were listed in `batch_varying_args`.
    is_read_only_constant = tv.initializer is not None and tv.read_only
    return not (
        is_read_only_constant
        or tv.address_space == AddressSpace.PRIVATE)
class _BatchVariableChanger(RuleAwareIdentityMapper): class _BatchVariableChanger(RuleAwareIdentityMapper):
def __init__(self, rule_mapping_context, kernel, batch_varying_args, def __init__(self, rule_mapping_context, kernel, batch_varying_args,
batch_iname_expr, sequential): batch_iname_expr, sequential):
super(_BatchVariableChanger, self).__init__(rule_mapping_context) super().__init__(rule_mapping_context)
self.kernel = kernel self.kernel = kernel
self.batch_varying_args = batch_varying_args self.batch_varying_args = batch_varying_args
...@@ -50,28 +66,31 @@ class _BatchVariableChanger(RuleAwareIdentityMapper): ...@@ -50,28 +66,31 @@ class _BatchVariableChanger(RuleAwareIdentityMapper):
def needs_batch_subscript(self, name): def needs_batch_subscript(self, name):
tv = self.kernel.temporary_variables.get(name) tv = self.kernel.temporary_variables.get(name)
return (
(not self.sequential if name in self.batch_varying_args:
and (tv is not None return True
and not ( if not self.sequential:
tv.initializer is not None if tv is None:
and tv.read_only))) return False
or if not temp_needs_batching_if_not_sequential(tv,
name in self.batch_varying_args) self.batch_varying_args):
return False
return True
def map_subscript(self, expr, expn_state): def map_subscript(self, expr, expn_state):
if not self.needs_batch_subscript(expr.aggregate.name): if not self.needs_batch_subscript(expr.aggregate.name):
return super(_BatchVariableChanger, self).map_subscript(expr, expn_state) return super().map_subscript(expr, expn_state)
idx = self.rec(expr.index, expn_state) idx = self.rec(expr.index, expn_state)
if not isinstance(idx, tuple): if not isinstance(idx, tuple):
idx = (idx,) idx = (idx,)
return type(expr)(expr.aggregate, (self.batch_iname_expr,) + idx) return type(expr)(expr.aggregate, (self.batch_iname_expr, *idx))
def map_variable(self, expr, expn_state): def map_variable(self, expr, expn_state):
if not self.needs_batch_subscript(expr.name): if not self.needs_batch_subscript(expr.name):
return super(_BatchVariableChanger, self).map_variable(expr, expn_state) return super().map_variable(expr, expn_state)
return expr[self.batch_iname_expr] return expr[self.batch_iname_expr]
...@@ -82,14 +101,20 @@ def _add_unique_dim_name(name, dim_names): ...@@ -82,14 +101,20 @@ def _add_unique_dim_name(name, dim_names):
from pytools import UniqueNameGenerator from pytools import UniqueNameGenerator
ng = UniqueNameGenerator(set(dim_names)) ng = UniqueNameGenerator(set(dim_names))
return (ng(name),) + tuple(dim_names) return (ng(name), *tuple(dim_names))
def to_batched(knl, nbatches, batch_varying_args, batch_iname_prefix="ibatch", @for_each_kernel
def to_batched(kernel, nbatches, batch_varying_args, batch_iname_prefix="ibatch",
sequential=False): sequential=False):
"""Takes in a kernel that carries out an operation and returns a kernel """Takes in a kernel that carries out an operation and returns a kernel
that carries out a batch of these operations. that carries out a batch of these operations.
.. note::
For temporaries in a kernel that are private or read only
globals and if `sequential=True`, loopy does not batch these
variables unless explicitly mentioned in `batch_varying_args`.
:arg nbatches: the number of batches. May be a constant non-negative :arg nbatches: the number of batches. May be a constant non-negative
integer or a string, which will be added as an integer argument. integer or a string, which will be added as an integer argument.
:arg batch_varying_args: a list of argument names that vary per-batch. :arg batch_varying_args: a list of argument names that vary per-batch.
...@@ -101,70 +126,70 @@ def to_batched(knl, nbatches, batch_varying_args, batch_iname_prefix="ibatch", ...@@ -101,70 +126,70 @@ def to_batched(knl, nbatches, batch_varying_args, batch_iname_prefix="ibatch",
from pymbolic import var from pymbolic import var
vng = knl.get_var_name_generator() vng = kernel.get_var_name_generator()
batch_iname = vng(batch_iname_prefix) batch_iname = vng(batch_iname_prefix)
batch_iname_expr = var(batch_iname) batch_iname_expr = var(batch_iname)
new_args = [] new_args = []
batch_dom_str = "{[%(iname)s]: 0 <= %(iname)s < %(nbatches)s}" % { batch_dom_str = "{{[{iname}]: 0 <= {iname} < {nbatches}}}".format(
"iname": batch_iname, iname=batch_iname,
"nbatches": nbatches, nbatches=nbatches,
} )
if not isinstance(nbatches, int): if not isinstance(nbatches, int):
batch_dom_str = "[%s] -> " % nbatches + batch_dom_str batch_dom_str = "[%s] -> " % nbatches + batch_dom_str
new_args.append(ValueArg(nbatches, dtype=knl.index_dtype)) new_args.append(ValueArg(nbatches, dtype=kernel.index_dtype))
nbatches_expr = var(nbatches) nbatches_expr = var(nbatches)
else: else:
nbatches_expr = nbatches nbatches_expr = nbatches
batch_domain = isl.BasicSet(batch_dom_str) batch_domain = isl.BasicSet(batch_dom_str)
new_domains = [batch_domain] + knl.domains new_domains = [batch_domain, *kernel.domains]
for arg in knl.args: for arg in kernel.args:
if arg.name in batch_varying_args: if arg.name in batch_varying_args:
if isinstance(arg, ValueArg): if isinstance(arg, ValueArg):
arg = GlobalArg(arg.name, arg.dtype, shape=(nbatches_expr,), arg = ArrayArg(arg.name, arg.dtype, shape=(nbatches_expr,),
dim_tags="c") dim_tags="c")
else: else:
arg = arg.copy( arg = arg.copy(
shape=(nbatches_expr,) + arg.shape, shape=(nbatches_expr, *arg.shape),
dim_tags=("c",) * (len(arg.shape) + 1), dim_tags=("c",) * (len(arg.shape) + 1),
dim_names=_add_unique_dim_name("ibatch", arg.dim_names)) dim_names=_add_unique_dim_name("ibatch", arg.dim_names))
new_args.append(arg) new_args.append(arg)
knl = knl.copy( kernel = kernel.copy(
domains=new_domains, domains=new_domains,
args=new_args) args=new_args)
if not sequential: if not sequential:
new_temps = {} new_temps = {}
for temp in six.itervalues(knl.temporary_variables): for temp in kernel.temporary_variables.values():
if temp.initializer is not None and temp.read_only: if temp_needs_batching_if_not_sequential(temp, batch_varying_args):
new_temps[temp.name] = temp
else:
new_temps[temp.name] = temp.copy( new_temps[temp.name] = temp.copy(
shape=(nbatches_expr,) + temp.shape, shape=(nbatches_expr, *temp.shape),
dim_tags=("c",) * (len(temp.shape) + 1), dim_tags=("c",) * (len(temp.shape) + 1),
dim_names=_add_unique_dim_name("ibatch", temp.dim_names)) dim_names=_add_unique_dim_name("ibatch", temp.dim_names))
else:
new_temps[temp.name] = temp
knl = knl.copy(temporary_variables=new_temps) kernel = kernel.copy(temporary_variables=new_temps)
else: else:
import loopy as lp import loopy as lp
from loopy.kernel.data import ForceSequentialTag from loopy.kernel.data import ForceSequentialTag
knl = lp.tag_inames(knl, [(batch_iname, ForceSequentialTag())]) kernel = lp.tag_inames(kernel, [(batch_iname, ForceSequentialTag())])
rule_mapping_context = SubstitutionRuleMappingContext( rule_mapping_context = SubstitutionRuleMappingContext(
knl.substitutions, vng) kernel.substitutions, vng)
bvc = _BatchVariableChanger(rule_mapping_context, bvc = _BatchVariableChanger(rule_mapping_context,
knl, batch_varying_args, batch_iname_expr, kernel, batch_varying_args, batch_iname_expr,
sequential=sequential) sequential=sequential)
kernel = rule_mapping_context.finish_kernel( kernel = rule_mapping_context.finish_kernel(
bvc.map_kernel(knl)) bvc.map_kernel(kernel))
batch_iname_set = frozenset([batch_iname]) batch_iname_set = frozenset([batch_iname])
kernel = kernel.copy( kernel = kernel.copy(
......
from __future__ import division, absolute_import from __future__ import annotations
from six.moves import range
__copyright__ = "Copyright (C) 2012-2015 Andreas Kloeckner" __copyright__ = "Copyright (C) 2012-2015 Andreas Kloeckner"
...@@ -23,20 +23,31 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ...@@ -23,20 +23,31 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. THE SOFTWARE.
""" """
from loopy.transform.array_buffer_map import (ArrayToBufferMap, NoOpArrayToBufferMap, import logging
AccessDescriptor)
from loopy.symbolic import (get_dependencies, from constantdict import constantdict
RuleAwareIdentityMapper, SubstitutionRuleMappingContext,
SubstitutionMapper) from pymbolic import var
from pymbolic.mapper.substitutor import make_subst_func from pymbolic.mapper.substitutor import make_subst_func
from pytools.persistent_dict import PersistentDict
from loopy.tools import LoopyKeyBuilder, PymbolicExpressionHashWrapper
from loopy.version import DATA_MODEL_VERSION
from loopy.diagnostic import LoopyError from loopy.diagnostic import LoopyError
from loopy.kernel import LoopKernel
from loopy.kernel.function_interface import CallableKernel, ScalarCallable
from loopy.symbolic import (
RuleAwareIdentityMapper,
SubstitutionMapper,
SubstitutionRuleMappingContext,
get_dependencies,
)
from loopy.tools import memoize_on_disk
from loopy.transform.array_buffer_map import (
AccessDescriptor,
ArrayToBufferMap,
NoOpArrayToBufferMap,
)
from loopy.translation_unit import TranslationUnit
from pymbolic import var
import logging
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -45,7 +56,7 @@ logger = logging.getLogger(__name__) ...@@ -45,7 +56,7 @@ logger = logging.getLogger(__name__)
class ArrayAccessReplacer(RuleAwareIdentityMapper): class ArrayAccessReplacer(RuleAwareIdentityMapper):
def __init__(self, rule_mapping_context, def __init__(self, rule_mapping_context,
var_name, within, array_base_map, buf_var): var_name, within, array_base_map, buf_var):
super(ArrayAccessReplacer, self).__init__(rule_mapping_context) super().__init__(rule_mapping_context)
self.within = within self.within = within
...@@ -65,7 +76,7 @@ class ArrayAccessReplacer(RuleAwareIdentityMapper): ...@@ -65,7 +76,7 @@ class ArrayAccessReplacer(RuleAwareIdentityMapper):
result = self.map_array_access((), expn_state) result = self.map_array_access((), expn_state)
if result is None: if result is None:
return super(ArrayAccessReplacer, self).map_variable(expr, expn_state) return super().map_variable(expr, expn_state)
else: else:
self.modified_insn_ids.add(expn_state.insn_id) self.modified_insn_ids.add(expn_state.insn_id)
return result return result
...@@ -79,7 +90,7 @@ class ArrayAccessReplacer(RuleAwareIdentityMapper): ...@@ -79,7 +90,7 @@ class ArrayAccessReplacer(RuleAwareIdentityMapper):
result = self.map_array_access(expr.index_tuple, expn_state) result = self.map_array_access(expr.index_tuple, expn_state)
if result is None: if result is None:
return super(ArrayAccessReplacer, self).map_subscript(expr, expn_state) return super().map_subscript(expr, expn_state)
else: else:
self.modified_insn_ids.add(expn_state.insn_id) self.modified_insn_ids.add(expn_state.insn_id)
return result return result
...@@ -104,7 +115,7 @@ class ArrayAccessReplacer(RuleAwareIdentityMapper): ...@@ -104,7 +115,7 @@ class ArrayAccessReplacer(RuleAwareIdentityMapper):
continue continue
ax_index = index[i] ax_index = index[i]
from loopy.isl_helpers import simplify_via_aff from loopy.symbolic import simplify_via_aff
ax_index = simplify_via_aff( ax_index = simplify_via_aff(
ax_index - abm.storage_base_indices[i]) ax_index - abm.storage_base_indices[i])
...@@ -117,26 +128,21 @@ class ArrayAccessReplacer(RuleAwareIdentityMapper): ...@@ -117,26 +128,21 @@ class ArrayAccessReplacer(RuleAwareIdentityMapper):
# Can't possibly be nested, but recurse anyway to # Can't possibly be nested, but recurse anyway to
# make sure substitution rules referenced below here # make sure substitution rules referenced below here
# do not get thrown away. # do not get thrown away.
self.rec(result, expn_state.copy(arg_context={})) self.rec(result, expn_state.copy(arg_context=constantdict()))
return result return result
# }}} # }}}
buffer_array_cache = PersistentDict("loopy-buffer-array-cache-"+DATA_MODEL_VERSION, def buffer_array_for_single_kernel(kernel, callables_table, var_name,
key_builder=LoopyKeyBuilder()) buffer_inames, init_expression=None, store_expression=None,
within=None, default_tag="l.auto", temporary_scope=None,
# Adding an argument? also add something to the cache_key below.
def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
store_expression=None, within=None, default_tag="l.auto",
temporary_scope=None, temporary_is_local=None,
fetch_bounding_box=False): fetch_bounding_box=False):
"""Replace accesses to *var_name* with ones to a temporary, which is """Replace accesses to *var_name* with ones to a temporary, which is
created and acts as a buffer. To perform this transformation, the access created and acts as a buffer. To perform this transformation, the access
footprint to *var_name* is determined and a temporary of a suitable footprint to *var_name* is determined and a temporary of a suitable
:class:`loopy.temp_var_scope` and shape is created. :class:`loopy.AddressSpace` and shape is created.
By default, the value of the buffered cells in *var_name* are read prior to By default, the value of the buffered cells in *var_name* are read prior to
any (read/write) use, and the modified values are written out after use has any (read/write) use, and the modified values are written out after use has
...@@ -158,8 +164,8 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, ...@@ -158,8 +164,8 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
:arg within: If not None, limit the action of the transformation to :arg within: If not None, limit the action of the transformation to
matching contexts. See :func:`loopy.match.parse_stack_match` matching contexts. See :func:`loopy.match.parse_stack_match`
for syntax. for syntax.
:arg temp_var_scope: If given, override the choice of :class:`temp_var_scope` :arg temporary_scope: If given, override the choice of
for the created temporary. :class:`AddressSpace` for the created temporary.
:arg default_tag: The default :ref:`iname-tags` to be assigned to the :arg default_tag: The default :ref:`iname-tags` to be assigned to the
inames used for fetching and storing inames used for fetching and storing
:arg fetch_bounding_box: If the access footprint is non-convex :arg fetch_bounding_box: If the access footprint is non-convex
...@@ -168,26 +174,19 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, ...@@ -168,26 +174,19 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
fetched. fetched.
""" """
# {{{ unify temporary_scope / temporary_is_local if isinstance(kernel, TranslationUnit):
kernel_names = [i for i, clbl in
kernel.callables_table.items() if isinstance(clbl,
CallableKernel)]
if len(kernel_names) != 1:
raise LoopyError()
from loopy.kernel.data import temp_var_scope return kernel.with_kernel(buffer_array(kernel[kernel_names[0]],
if temporary_is_local is not None: var_name, buffer_inames, init_expression, store_expression, within,
from warnings import warn default_tag, temporary_scope,
warn("temporary_is_local is deprecated. Use temporary_scope instead", fetch_bounding_box, kernel.callables_table))
DeprecationWarning, stacklevel=2)
if temporary_scope is not None: assert isinstance(kernel, LoopKernel)
raise LoopyError("may not specify both temporary_is_local and "
"temporary_scope")
if temporary_is_local:
temporary_scope = temp_var_scope.LOCAL
else:
temporary_scope = temp_var_scope.PRIVATE
del temporary_is_local
# }}}
# {{{ process arguments # {{{ process arguments
...@@ -233,36 +232,16 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, ...@@ -233,36 +232,16 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
# }}} # }}}
# {{{ caching
from loopy import CACHING_ENABLED
from loopy.preprocess import prepare_for_caching
key_kernel = prepare_for_caching(kernel)
cache_key = (key_kernel, var_name, tuple(buffer_inames),
PymbolicExpressionHashWrapper(init_expression),
PymbolicExpressionHashWrapper(store_expression), within,
default_tag, temporary_scope, fetch_bounding_box)
if CACHING_ENABLED:
try:
result = buffer_array_cache[cache_key]
logger.info("%s: buffer_array cache hit" % kernel.name)
return result
except KeyError:
pass
# }}}
var_name_gen = kernel.get_var_name_generator() var_name_gen = kernel.get_var_name_generator()
within_inames = set() within_inames = set()
access_descriptors = [] access_descriptors = []
for insn in kernel.instructions: for insn in kernel.instructions:
if not within(kernel, insn.id, ()): if not within(kernel, insn, ()):
continue continue
from pymbolic.primitives import Variable, Subscript from pymbolic.primitives import Subscript, Variable
from loopy.symbolic import LinearSubscript from loopy.symbolic import LinearSubscript
for assignee in insn.assignees: for assignee in insn.assignees:
...@@ -275,7 +254,9 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, ...@@ -275,7 +254,9 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
index = assignee.index_tuple index = assignee.index_tuple
elif isinstance(assignee, LinearSubscript): elif isinstance(assignee, LinearSubscript):
if assignee.aggregate.name == var_name: assignee_name = assignee.aggregate.name
index = ()
if assignee_name == var_name:
raise LoopyError("buffer_array may not be applied in the " raise LoopyError("buffer_array may not be applied in the "
"presence of linear write indexing into '%s'" % var_name) "presence of linear write indexing into '%s'" % var_name)
...@@ -302,8 +283,8 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, ...@@ -302,8 +283,8 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
if isinstance(var_descr, ArrayBase) and var_descr.dim_names is not None: if isinstance(var_descr, ArrayBase) and var_descr.dim_names is not None:
dim_name = var_descr.dim_names[i] dim_name = var_descr.dim_names[i]
init_iname = var_name_gen("%s_init_%s" % (var_name, dim_name)) init_iname = var_name_gen(f"{var_name}_init_{dim_name}")
store_iname = var_name_gen("%s_store_%s" % (var_name, dim_name)) store_iname = var_name_gen(f"{var_name}_store_{dim_name}")
new_iname_to_tag[init_iname] = default_tag new_iname_to_tag[init_iname] = default_tag
new_iname_to_tag[store_iname] = default_tag new_iname_to_tag[store_iname] = default_tag
...@@ -377,7 +358,7 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, ...@@ -377,7 +358,7 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
dtype=var_descr.dtype, dtype=var_descr.dtype,
base_indices=(0,)*len(abm.non1_storage_shape), base_indices=(0,)*len(abm.non1_storage_shape),
shape=tuple(abm.non1_storage_shape), shape=tuple(abm.non1_storage_shape),
scope=temporary_scope) address_space=temporary_scope)
new_temporary_variables[buf_var_name] = temp_var new_temporary_variables[buf_var_name] = temp_var
...@@ -513,26 +494,53 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, ...@@ -513,26 +494,53 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
new_insns.append(init_instruction) new_insns.append(init_instruction)
if did_write: if did_write:
new_insns.append(store_instruction) # new_insns_with_redirected_deps: if an insn depends on a modified
# insn, then it should also depend on the store insn.
new_insns_with_redirected_deps = [
insn.copy(depends_on=(insn.depends_on | {store_instruction.id}))
if insn.depends_on & aar.modified_insn_ids
else insn
for insn in new_insns
] + [store_instruction]
else: else:
for iname in store_inames: for iname in store_inames:
del new_iname_to_tag[iname] del new_iname_to_tag[iname]
new_insns_with_redirected_deps = new_insns
kernel = kernel.copy( kernel = kernel.copy(
domains=new_kernel_domains, domains=new_kernel_domains,
instructions=new_insns, instructions=new_insns_with_redirected_deps,
temporary_variables=new_temporary_variables) temporary_variables=new_temporary_variables)
from loopy import tag_inames from loopy import tag_inames
kernel = tag_inames(kernel, new_iname_to_tag) kernel = tag_inames(kernel, new_iname_to_tag)
from loopy.kernel.tools import assign_automatic_axes from loopy.kernel.tools import assign_automatic_axes
kernel = assign_automatic_axes(kernel) kernel = assign_automatic_axes(kernel, callables_table)
if CACHING_ENABLED:
from loopy.preprocess import prepare_for_caching
buffer_array_cache[cache_key] = prepare_for_caching(kernel)
return kernel return kernel
@memoize_on_disk
def buffer_array(program, *args, **kwargs):
    """Run :func:`buffer_array_for_single_kernel` on each kernel in *program*.

    Every :class:`CallableKernel` entry of ``program.callables_table`` has its
    ``subkernel`` replaced by the result of
    :func:`buffer_array_for_single_kernel`, which is called with the
    subkernel, the program's callables table and the remaining *args* and
    *kwargs*. :class:`ScalarCallable` entries are carried over unchanged.

    :arg program: a :class:`TranslationUnit`.
    :returns: a copy of *program* whose callables table holds the
        transformed callables.
    :raises NotImplementedError: if the callables table contains an entry
        that is neither a :class:`CallableKernel` nor a
        :class:`ScalarCallable`.
    """
    assert isinstance(program, TranslationUnit)

    def _buffered(callable_):
        # Only kernels carry a subkernel that can be transformed.
        if isinstance(callable_, CallableKernel):
            return callable_.copy(
                subkernel=buffer_array_for_single_kernel(
                    callable_.subkernel, program.callables_table,
                    *args, **kwargs))
        if isinstance(callable_, ScalarCallable):
            return callable_
        raise NotImplementedError()

    return program.copy(callables_table=constantdict({
        func_id: _buffered(callable_)
        for func_id, callable_ in program.callables_table.items()
    }))
# vim: foldmethod=marker # vim: foldmethod=marker