diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a6b56f8db65e7a1856d82db1e005c788a272d3e5..8123bf042251efad613755f0184f42d7682e293c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -22,7 +22,7 @@ jobs: - name: "Main Script" run: | curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-flake8.sh - . ./prepare-and-run-flake8.sh ./loopy ./test + . ./prepare-and-run-flake8.sh "$(basename $GITHUB_REPOSITORY)" ./test examples pylint: name: Pylint @@ -35,7 +35,7 @@ jobs: CONDA_ENVIRONMENT=.test-conda-env.yml USE_CONDA_BUILD=1 curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-pylint.sh - . ./prepare-and-run-pylint.sh loopy test/test_*.py + . ./prepare-and-run-pylint.sh "$(basename $GITHUB_REPOSITORY)" test/test_*.py pytest3: name: Conda Pytest diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 515473e6ab76577a5c47aca6d39ce2f241a5e795..851caaebd4334ac9421c42d60dcaca8d57a812ca 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -89,7 +89,7 @@ Pylint: - export PY_EXE=python3 - EXTRA_INSTALL="pybind11 numpy mako matplotlib ipykernel ply fparser" - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-pylint.sh - - ". ./prepare-and-run-pylint.sh loopy test/test_*.py" + - . ./prepare-and-run-pylint.sh "$CI_PROJECT_NAME" test/test_*.py tags: - python3 except: @@ -106,7 +106,7 @@ Documentation: Flake8: script: - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-flake8.sh - - ". ./prepare-and-run-flake8.sh loopy test" + - . ./prepare-and-run-flake8.sh "$CI_PROJECT_NAME" test examples tags: - python3 except: diff --git a/examples/python/global_barrier_removal.py b/examples/python/global_barrier_removal.py index 7ab049cd1906f703b0efc39808ff68a63b91ff37..7f80175ebe82b8412a38708a5b1d32042d8061fe 100644 --- a/examples/python/global_barrier_removal.py +++ b/examples/python/global_barrier_removal.py @@ -1,7 +1,5 @@ import numpy as np import loopy as lp -import pyopencl as cl -import pyopencl.array knl = lp.make_kernel( "{ [i,k]: 0<=i 200: op = op[:200] + "..." @@ -498,7 +498,7 @@ def get_dot_dependency_graph(kernel, iname_cluster=True, use_insn_id=False): insn_label = op tooltip = insn.id - lines.append("\"%s\" [label=\"%s\",shape=\"box\",tooltip=\"%s\"];" + lines.append('"%s" [label="%s",shape="box",tooltip="%s"];' % ( insn.id, repr(insn_label)[1:-1], @@ -542,7 +542,7 @@ def get_dot_dependency_graph(kernel, iname_cluster=True, use_insn_id=False): for sched_item in kernel.schedule: if isinstance(sched_item, EnterLoop): - lines.append("subgraph cluster_%s { label=\"%s\"" + lines.append('subgraph cluster_%s { label="%s"' % (sched_item.iname, sched_item.iname)) elif isinstance(sched_item, LeaveLoop): lines.append("}") @@ -1546,8 +1546,8 @@ def stringify_instruction_list(kernel): options.append("no_sync_with=%s" % ":".join( "%s@%s" % entry for entry in sorted(insn.no_sync_with))) if isinstance(insn, lp.BarrierInstruction) and \ - insn.synchronization_kind == 'local': - options.append('mem_kind=%s' % insn.mem_kind) + insn.synchronization_kind == "local": + options.append("mem_kind=%s" % insn.mem_kind) if lhs: core = "%s = %s" % ( diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index aacce544b35a31359cb535dfeacc46d6e7e2acda..7e1b7af5a818663f1c6e7d56fa93c90bc73ad26c 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -164,11 +164,11 @@ def get_le_neutral(dtype): elif dtype.numpy_dtype.kind == "i": # OpenCL 1.1, section 6.11.3 if dtype.numpy_dtype.itemsize == 4: - #32 bit integer + # 32 bit integer return var("INT_MAX") elif dtype.numpy_dtype.itemsize == 8: - #64 bit integer - return var('LONG_MAX') + # 64 bit integer + return var("LONG_MAX") else: raise NotImplementedError("less") @@ -182,11 +182,11 @@ def get_ge_neutral(dtype): elif dtype.numpy_dtype.kind == "i": # OpenCL 1.1, section 6.11.3 if dtype.numpy_dtype.itemsize == 4: - #32 bit integer + # 32 bit integer return var("INT_MIN") elif dtype.numpy_dtype.itemsize == 8: - #64 bit integer - return var('LONG_MIN') + # 64 bit integer + return var("LONG_MIN") else: raise NotImplementedError("less") diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 6ce9d0a9972754d12e249d3ad41d5bdd2746c75c..4cc4fcd23c620b9866aefe0ed481d58bdd28b471 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -447,9 +447,9 @@ def format_insn(kernel, insn_id): Fore.MAGENTA, str(insn.expression), Style.RESET_ALL, format_insn_id(kernel, insn_id)) elif isinstance(insn, BarrierInstruction): - mem_kind = '' - if insn.synchronization_kind == 'local': - mem_kind = '{mem_kind=%s}' % insn.mem_kind + mem_kind = "" + if insn.synchronization_kind == "local": + mem_kind = "{mem_kind=%s}" % insn.mem_kind return "[%s] %s... %sbarrier%s%s" % ( format_insn_id(kernel, insn_id), diff --git a/loopy/statistics.py b/loopy/statistics.py index 2a005a73105a675c45c5781d44d0a0a0e816650e..46904aeea417492e38e30b22c897ad13565a06b5 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -108,7 +108,7 @@ class GuardedPwQPolynomial(object): @staticmethod def zero(): - p = isl.PwQPolynomial('{ 0 }') + p = isl.PwQPolynomial("{ 0 }") return GuardedPwQPolynomial(p, isl.Set.universe(p.domain().space)) def __str__(self): @@ -221,10 +221,10 @@ class ToCountMap(object): # (first create loopy kernel and specify array data types) - params = {'n': 512, 'm': 256, 'l': 128} + params = {"n": 512, "m": 256, "l": 128} mem_map = lp.get_mem_access_map(knl) - filtered_map = mem_map.filter_by(direction=['load'], - variable=['a','g']) + filtered_map = mem_map.filter_by(direction=["load"], + variable=["a","g"]) tot_loads_a_g = filtered_map.eval_and_sum(params) # (now use these counts to, e.g., predict performance) @@ -234,8 +234,8 @@ class ToCountMap(object): result_map = ToCountMap(val_type=self.val_type) from loopy.types import to_loopy_type - if 'dtype' in kwargs.keys(): - kwargs['dtype'] = [to_loopy_type(d) for d in kwargs['dtype']] + if "dtype" in kwargs.keys(): + kwargs["dtype"] = [to_loopy_type(d) for d in kwargs["dtype"]] # for each item in self.count_map for self_key, self_val in self.items(): @@ -267,7 +267,7 @@ class ToCountMap(object): # (first create loopy kernel and specify array data types) - params = {'n': 512, 'm': 256, 'l': 128} + params = {"n": 512, "m": 256, "l": 128} mem_map = lp.get_mem_access_map(knl) def filter_func(key): return key.lid_strides[0] > 1 and key.lid_strides[0] <= 4: @@ -302,29 +302,29 @@ class ToCountMap(object): # (first create loopy kernel and specify array data types) - params = {'n': 512, 'm': 256, 'l': 128} + params = {"n": 512, "m": 256, "l": 128} mem_map = get_mem_access_map(knl) - grouped_map = mem_map.group_by('mtype', 'dtype', 'direction') + grouped_map = mem_map.group_by("mtype", "dtype", "direction") - f32_global_ld = grouped_map[MemAccess(mtype='global', + f32_global_ld = grouped_map[MemAccess(mtype="global", dtype=np.float32, - direction='load') + direction="load") ].eval_with_dict(params) - f32_global_st = grouped_map[MemAccess(mtype='global', + f32_global_st = grouped_map[MemAccess(mtype="global", dtype=np.float32, - direction='store') + direction="store") ].eval_with_dict(params) - f32_local_ld = grouped_map[MemAccess(mtype='local', + f32_local_ld = grouped_map[MemAccess(mtype="local", dtype=np.float32, - direction='load') + direction="load") ].eval_with_dict(params) - f32_local_st = grouped_map[MemAccess(mtype='local', + f32_local_st = grouped_map[MemAccess(mtype="local", dtype=np.float32, - direction='store') + direction="store") ].eval_with_dict(params) op_map = get_op_map(knl) - ops_dtype = op_map.group_by('dtype') + ops_dtype = op_map.group_by("dtype") f32ops = ops_dtype[Op(dtype=np.float32)].eval_with_dict(params) f64ops = ops_dtype[Op(dtype=np.float64)].eval_with_dict(params) @@ -372,20 +372,20 @@ class ToCountMap(object): # (first create loopy kernel and specify array data types) bytes_map = get_mem_access_map(knl).to_bytes() - params = {'n': 512, 'm': 256, 'l': 128} + params = {"n": 512, "m": 256, "l": 128} s1_g_ld_byt = bytes_map.filter_by( - mtype=['global'], lid_strides={0: 1}, - direction=['load']).eval_and_sum(params) + mtype=["global"], lid_strides={0: 1}, + direction=["load"]).eval_and_sum(params) s2_g_ld_byt = bytes_map.filter_by( - mtype=['global'], lid_strides={0: 2}, - direction=['load']).eval_and_sum(params) + mtype=["global"], lid_strides={0: 2}, + direction=["load"]).eval_and_sum(params) s1_g_st_byt = bytes_map.filter_by( - mtype=['global'], lid_strides={0: 1}, - direction=['store']).eval_and_sum(params) + mtype=["global"], lid_strides={0: 1}, + direction=["store"]).eval_and_sum(params) s2_g_st_byt = bytes_map.filter_by( - mtype=['global'], lid_strides={0: 2}, - direction=['store']).eval_and_sum(params) + mtype=["global"], lid_strides={0: 2}, + direction=["store"]).eval_and_sum(params) # (now use these counts to, e.g., predict performance) @@ -438,10 +438,10 @@ class ToCountMap(object): # (first create loopy kernel and specify array data types) - params = {'n': 512, 'm': 256, 'l': 128} + params = {"n": 512, "m": 256, "l": 128} mem_map = lp.get_mem_access_map(knl) - filtered_map = mem_map.filter_by(direction=['load'], - variable=['a', 'g']) + filtered_map = mem_map.filter_by(direction=["load"], + variable=["a", "g"]) tot_loads_a_g = filtered_map.eval_and_sum(params) # (now use these counts to, e.g., predict performance) @@ -507,7 +507,7 @@ class Op(Record): once per *work-item*, *sub-group*, or *work-group*. The granularities allowed can be found in :class:`CountGranularity`, and may be accessed, e.g., as ``CountGranularity.WORKITEM``. A work-item is a single instance - of computation executing on a single processor (think 'thread'), a + of computation executing on a single processor (think "thread"), a collection of which may be grouped together into a work-group. Each work-group executes on a single compute unit with all work-items within the work-group sharing local memory. A sub-group is an @@ -593,7 +593,7 @@ class MemAccess(Record): once per *work-item*, *sub-group*, or *work-group*. The granularities allowed can be found in :class:`CountGranularity`, and may be accessed, e.g., as ``CountGranularity.WORKITEM``. A work-item is a single instance - of computation executing on a single processor (think 'thread'), a + of computation executing on a single processor (think "thread"), a collection of which may be grouped together into a work-group. Each work-group executes on a single compute unit with all work-items within the work-group sharing local memory. A sub-group is an @@ -725,7 +725,7 @@ class ExpressionOpCounter(CounterBase): def map_call(self, expr): return ToCountMap( {Op(dtype=self.type_inf(expr), - name='func:'+str(expr.function), + name="func:"+str(expr.function), count_granularity=CountGranularity.SUBGROUP): 1} ) + self.rec(expr.parameters) @@ -739,7 +739,7 @@ class ExpressionOpCounter(CounterBase): assert expr.children return ToCountMap( {Op(dtype=self.type_inf(expr), - name='add', + name="add", count_granularity=CountGranularity.SUBGROUP): len(expr.children)-1} ) + sum(self.rec(child) for child in expr.children) @@ -748,18 +748,18 @@ class ExpressionOpCounter(CounterBase): from pymbolic.primitives import is_zero assert expr.children return sum(ToCountMap({Op(dtype=self.type_inf(expr), - name='mul', + name="mul", count_granularity=CountGranularity.SUBGROUP): 1}) + self.rec(child) for child in expr.children if not is_zero(child + 1)) + \ ToCountMap({Op(dtype=self.type_inf(expr), - name='mul', + name="mul", count_granularity=CountGranularity.SUBGROUP): -1}) def map_quotient(self, expr, *args): return ToCountMap({Op(dtype=self.type_inf(expr), - name='div', + name="div", count_granularity=CountGranularity.SUBGROUP): 1}) \ + self.rec(expr.numerator) \ + self.rec(expr.denominator) @@ -769,14 +769,14 @@ class ExpressionOpCounter(CounterBase): def map_power(self, expr): return ToCountMap({Op(dtype=self.type_inf(expr), - name='pow', + name="pow", count_granularity=CountGranularity.SUBGROUP): 1}) \ + self.rec(expr.base) \ + self.rec(expr.exponent) def map_left_shift(self, expr): return ToCountMap({Op(dtype=self.type_inf(expr), - name='shift', + name="shift", count_granularity=CountGranularity.SUBGROUP): 1}) \ + self.rec(expr.shiftee) \ + self.rec(expr.shift) @@ -785,13 +785,13 @@ class ExpressionOpCounter(CounterBase): def map_bitwise_not(self, expr): return ToCountMap({Op(dtype=self.type_inf(expr), - name='bw', + name="bw", count_granularity=CountGranularity.SUBGROUP): 1}) \ + self.rec(expr.child) def map_bitwise_or(self, expr): return ToCountMap({Op(dtype=self.type_inf(expr), - name='bw', + name="bw", count_granularity=CountGranularity.SUBGROUP): len(expr.children)-1}) \ + sum(self.rec(child) for child in expr.children) @@ -815,7 +815,7 @@ class ExpressionOpCounter(CounterBase): def map_min(self, expr): return ToCountMap({Op(dtype=self.type_inf(expr), - name='maxmin', + name="maxmin", count_granularity=CountGranularity.SUBGROUP): len(expr.children)-1}) \ + sum(self.rec(child) for child in expr.children) @@ -958,7 +958,7 @@ class LocalMemAccessCounter(MemAccessCounter): if index is None: # no subscript sub_map[MemAccess( - mtype='local', + mtype="local", dtype=dtype, count_granularity=CountGranularity.SUBGROUP) ] = 1 @@ -975,7 +975,7 @@ class LocalMemAccessCounter(MemAccessCounter): self.knl, array, index_tuple) sub_map[MemAccess( - mtype='local', + mtype="local", dtype=dtype, lid_strides=dict(sorted(six.iteritems(lid_strides))), gid_strides=dict(sorted(six.iteritems(gid_strides))), @@ -1015,7 +1015,7 @@ class GlobalMemAccessCounter(MemAccessCounter): # this array is not in global memory return ToCountMap() - return ToCountMap({MemAccess(mtype='global', + return ToCountMap({MemAccess(mtype="global", dtype=self.type_inf(expr), lid_strides={}, gid_strides={}, variable=name, count_granularity=CountGranularity.WORKITEM): 1} @@ -1050,7 +1050,7 @@ class GlobalMemAccessCounter(MemAccessCounter): ) else CountGranularity.SUBGROUP return ToCountMap({MemAccess( - mtype='global', + mtype="global", dtype=self.type_inf(expr), lid_strides=dict(sorted(six.iteritems(lid_strides))), gid_strides=dict(sorted(six.iteritems(gid_strides))), @@ -1381,14 +1381,14 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False, count operations inside array indices. :arg subgroup_size: (currently unused) An :class:`int`, :class:`str` - ``'guess'``, or *None* that specifies the sub-group size. An OpenCL + ``"guess"``, or *None* that specifies the sub-group size. An OpenCL sub-group is an implementation-dependent grouping of work-items within a work-group, analagous to an NVIDIA CUDA warp. subgroup_size is used, e.g., when counting a :class:`MemAccess` whose count_granularity specifies that it should only be counted once per sub-group. If set to *None* an attempt to find the sub-group size using the device will be made, if this fails an error will be raised. If a :class:`str` - ``'guess'`` is passed as the subgroup_size, get_mem_access_map will + ``"guess"`` is passed as the subgroup_size, get_mem_access_map will attempt to find the sub-group size using the device and, if unsuccessful, will make a wild guess. @@ -1407,13 +1407,13 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False, # (first create loopy kernel and specify array data types) op_map = get_op_map(knl) - params = {'n': 512, 'm': 256, 'l': 128} + params = {"n": 512, "m": 256, "l": 128} f32add = op_map[Op(np.float32, - 'add', + "add", count_granularity=CountGranularity.WORKITEM) ].eval_with_dict(params) f32mul = op_map[Op(np.float32, - 'mul', + "mul", count_granularity=CountGranularity.WORKITEM) ].eval_with_dict(params) @@ -1493,7 +1493,7 @@ def _process_subgroup_size(knl, subgroup_size_requested): if subgroup_size_requested is None: if subgroup_size_guess is None: - # 'guess' was not passed and either no target device found + # "guess" was not passed and either no target device found # or get_simd_group_size returned None raise ValueError("No sub-group size passed, no target device found. " "Either (1) pass integer value for subgroup_size, " @@ -1503,7 +1503,7 @@ def _process_subgroup_size(knl, subgroup_size_requested): else: return subgroup_size_guess - elif subgroup_size_requested == 'guess': + elif subgroup_size_requested == "guess": if subgroup_size_guess is None: # unable to get subgroup_size from device, so guess subgroup_size_guess = 32 @@ -1539,14 +1539,14 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, (Likely desirable for performance modeling, but undesirable for code optimization.) - :arg subgroup_size: An :class:`int`, :class:`str` ``'guess'``, or + :arg subgroup_size: An :class:`int`, :class:`str` ``"guess"``, or *None* that specifies the sub-group size. An OpenCL sub-group is an implementation-dependent grouping of work-items within a work-group, analagous to an NVIDIA CUDA warp. subgroup_size is used, e.g., when counting a :class:`MemAccess` whose count_granularity specifies that it should only be counted once per sub-group. If set to *None* an attempt to find the sub-group size using the device will be made, if this fails - an error will be raised. If a :class:`str` ``'guess'`` is passed as + an error will be raised. If a :class:`str` ``"guess"`` is passed as the subgroup_size, get_mem_access_map will attempt to find the sub-group size using the device and, if unsuccessful, will make a wild guess. @@ -1565,43 +1565,43 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, # (first create loopy kernel and specify array data types) - params = {'n': 512, 'm': 256, 'l': 128} + params = {"n": 512, "m": 256, "l": 128} mem_map = get_mem_access_map(knl) f32_s1_g_ld_a = mem_map[MemAccess( - mtype='global', + mtype="global", dtype=np.float32, lid_strides={0: 1}, gid_strides={0: 256}, - direction='load', - variable='a', + direction="load", + variable="a", count_granularity=CountGranularity.WORKITEM) ].eval_with_dict(params) f32_s1_g_st_a = mem_map[MemAccess( - mtype='global', + mtype="global", dtype=np.float32, lid_strides={0: 1}, gid_strides={0: 256}, - direction='store', - variable='a', + direction="store", + variable="a", count_granularity=CountGranularity.WORKITEM) ].eval_with_dict(params) f32_s1_l_ld_x = mem_map[MemAccess( - mtype='local', + mtype="local", dtype=np.float32, lid_strides={0: 1}, gid_strides={0: 256}, - direction='load', - variable='x', + direction="load", + variable="x", count_granularity=CountGranularity.WORKITEM) ].eval_with_dict(params) f32_s1_l_st_x = mem_map[MemAccess( - mtype='local', + mtype="local", dtype=np.float32, lid_strides={0: 1}, gid_strides={0: 256}, - direction='store', - variable='x', + direction="store", + variable="x", count_granularity=CountGranularity.WORKITEM) ].eval_with_dict(params) @@ -1691,14 +1691,14 @@ def get_synchronization_map(knl, subgroup_size=None): :arg knl: A :class:`loopy.LoopKernel` whose barriers are to be counted. :arg subgroup_size: (currently unused) An :class:`int`, :class:`str` - ``'guess'``, or *None* that specifies the sub-group size. An OpenCL + ``"guess"``, or *None* that specifies the sub-group size. An OpenCL sub-group is an implementation-dependent grouping of work-items within a work-group, analagous to an NVIDIA CUDA warp. subgroup_size is used, e.g., when counting a :class:`MemAccess` whose count_granularity specifies that it should only be counted once per sub-group. If set to *None* an attempt to find the sub-group size using the device will be made, if this fails an error will be raised. If a :class:`str` - ``'guess'`` is passed as the subgroup_size, get_mem_access_map will + ``"guess"`` is passed as the subgroup_size, get_mem_access_map will attempt to find the sub-group size using the device and, if unsuccessful, will make a wild guess. @@ -1714,8 +1714,8 @@ def get_synchronization_map(knl, subgroup_size=None): # (first create loopy kernel and specify array data types) sync_map = get_synchronization_map(knl) - params = {'n': 512, 'm': 256, 'l': 128} - barrier_ct = sync_map['barrier_local'].eval_with_dict(params) + params = {"n": 512, "m": 256, "l": 128} + barrier_ct = sync_map["barrier_local"].eval_with_dict(params) # (now use this count to, e.g., predict performance) @@ -1732,7 +1732,7 @@ def get_synchronization_map(knl, subgroup_size=None): result = ToCountMap() - one = isl.PwQPolynomial('{ 1 }') + one = isl.PwQPolynomial("{ 1 }") def get_count_poly(iname_list): if iname_list: # (if iname_list is not empty) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 802cc7044bf73d51567e23a6eaac791982709d51..fe9717765d2e7f4f207be3e198170f26997cd022 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -1064,7 +1064,7 @@ def generate_header(kernel, codegen_result=None): if not isinstance(kernel.target, CFamilyTarget): raise LoopyError( - 'Header generation for non C-based languages are not implemented') + "Header generation for non C-based languages are not implemented") if codegen_result is None: from loopy.codegen import generate_code_v2 diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index 45fad014316511bb852cbf18173fa389e75732c5..9c147b6339317e3f5bcd5df2eb4a6474d3c64874 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -54,7 +54,7 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): def python_dtype_str(self, dtype): if np.dtype(str(dtype)).isbuiltin: return "_lpy_np."+dtype.name - raise Exception('dtype: {0} not recognized'.format(dtype)) + raise Exception("dtype: {0} not recognized".format(dtype)) # {{{ handle non numpy arguements @@ -149,7 +149,7 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): kernel, implemented_data_info): gen("for knl in _lpy_c_kernels:") with Indentation(gen): - gen('knl({args})'.format( + gen("knl({args})".format( args=", ".join(args))) # }}} @@ -163,7 +163,7 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): if options.return_dict: gen("return None, {%s}" - % ", ".join("\"%s\": %s" % (arg.name, arg.name) + % ", ".join('"%s": %s' % (arg.name, arg.name) for arg in implemented_data_info if issubclass(arg.arg_class, KernelArgument) if arg.base_name in kernel.get_written_variables())) @@ -211,10 +211,10 @@ class CCompiler(object): """ def __init__(self, toolchain=None, - cc='gcc', cflags='-std=c99 -O3 -fPIC'.split(), - ldflags='-shared'.split(), libraries=[], + cc="gcc", cflags="-std=c99 -O3 -fPIC".split(), + ldflags="-shared".split(), libraries=[], include_dirs=[], library_dirs=[], defines=[], - source_suffix='c'): + source_suffix="c"): # try to get a default toolchain # or subclass supplied version if available self.toolchain = toolchain @@ -225,32 +225,32 @@ class CCompiler(object): # missing compiler python was built with (likely, Conda) # use a default GCCToolchain logger = logging.getLogger(__name__) - logger.warn('Default toolchain guessed from python config ' - 'not found, replacing with default GCCToolchain.') + logger.warn("Default toolchain guessed from python config " + "not found, replacing with default GCCToolchain.") # this is ugly, but I'm not sure there's a clean way to copy the # default args self.toolchain = GCCToolchain( - cc='gcc', - cflags='-std=c99 -O3 -fPIC'.split(), - ldflags='-shared'.split(), + cc="gcc", + cflags="-std=c99 -O3 -fPIC".split(), + ldflags="-shared".split(), libraries=[], library_dirs=[], defines=[], undefines=[], - source_suffix='c', - so_ext='.so', - o_ext='.o', + source_suffix="c", + so_ext=".so", + o_ext=".o", include_dirs=[]) if toolchain is None: # copy in all differing values - diff = {'cc': cc, - 'cflags': cflags, - 'ldflags': ldflags, - 'libraries': libraries, - 'include_dirs': include_dirs, - 'library_dirs': library_dirs, - 'defines': defines} + diff = {"cc": cc, + "cflags": cflags, + "ldflags": ldflags, + "libraries": libraries, + "include_dirs": include_dirs, + "library_dirs": library_dirs, + "defines": defines} # filter empty and those equal to toolchain defaults diff = dict((k, v) for k, v in six.iteritems(diff) if v and (not hasattr(self.toolchain, k) or @@ -267,7 +267,7 @@ class CCompiler(object): debug_recompile=True): """Compile code, build and load shared library.""" logger.debug(code) - c_fname = self._tempname('code.' + self.source_suffix) + c_fname = self._tempname("code." + self.source_suffix) # build object _, mod_name, ext_file, recompiled = \ @@ -276,9 +276,9 @@ class CCompiler(object): debug_recompile, False) if recompiled: - logger.debug('Kernel {0} compiled from source'.format(name)) + logger.debug("Kernel {0} compiled from source".format(name)) else: - logger.debug('Kernel {0} retrieved from cache'.format(name)) + logger.debug("Kernel {0} retrieved from cache".format(name)) # and return compiled return ctypes.CDLL(ext_file) @@ -288,10 +288,10 @@ class CPlusPlusCompiler(CCompiler): """Subclass of CCompiler to invoke a C++ compiler.""" def __init__(self, toolchain=None, - cc='g++', cflags='-std=c++98 -O3 -fPIC'.split(), + cc="g++", cflags="-std=c++98 -O3 -fPIC".split(), ldflags=[], libraries=[], include_dirs=[], library_dirs=[], defines=[], - source_suffix='cpp'): + source_suffix="cpp"): super(CPlusPlusCompiler, self).__init__( toolchain=toolchain, cc=cc, cflags=cflags, ldflags=ldflags, @@ -322,8 +322,8 @@ class IDIToCDLL(object): def _dtype_to_ctype(self, dtype, pointer=False): """Map NumPy dtype to equivalent ctypes type.""" typename = self.registry.dtype_to_ctype(dtype) - typename = {'unsigned': 'uint'}.get(typename, typename) - basetype = getattr(ctypes, 'c_' + typename) + typename = {"unsigned": "uint"}.get(typename, typename) + basetype = getattr(ctypes, "c_" + typename) if pointer: return ctypes.POINTER(basetype) return basetype @@ -359,7 +359,7 @@ class CompiledCKernel(object): """Execute kernel with given args mapped to ctypes equivalents.""" args_ = [] for arg, arg_t in zip(args, self._fn.argtypes): - if hasattr(arg, 'ctypes'): + if hasattr(arg, "ctypes"): if arg.size == 0: # TODO eliminate unused arguments from kernel arg_ = arg_t(0.0) @@ -406,7 +406,7 @@ class CKernelExecutor(KernelExecutorBase): dev_code = codegen_result.device_code() host_code = codegen_result.host_code() - all_code = '\n'.join([dev_code, '', host_code]) + all_code = "\n".join([dev_code, "", host_code]) if self.kernel.options.write_cl: output = all_code @@ -423,7 +423,7 @@ class CKernelExecutor(KernelExecutorBase): from pytools import invoke_editor dev_code = invoke_editor(dev_code, "code.c") # update code from editor - all_code = '\n'.join([dev_code, '', host_code]) + all_code = "\n".join([dev_code, "", host_code]) c_kernels = [] for dp in codegen_result.device_programs: diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index bb9ab6355c2b15ae1435da510567d20643ac4792..df49679a6390ee8ab43041527f10682b1967235d 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -178,7 +178,7 @@ class ExpressionToCExpressionMapper(IdentityMapper): def map_subscript(self, expr, type_context): def base_impl(expr, type_context): - return self.rec(expr.aggregate, type_context)[self.rec(expr.index, 'i')] + return self.rec(expr.aggregate, type_context)[self.rec(expr.index, "i")] def make_var(name): from loopy import TaggedVariable @@ -226,7 +226,7 @@ class ExpressionToCExpressionMapper(IdentityMapper): base_access = var("read_imagef")( var(ary.name), var("loopy_sampler"), - var("(%s)" % idx_vec_type)(*self.rec(idx_tuple, 'i'))) + var("(%s)" % idx_vec_type)(*self.rec(idx_tuple, "i"))) if ary.dtype.numpy_dtype == np.float32: return base_access.attr("x") @@ -260,7 +260,7 @@ class ExpressionToCExpressionMapper(IdentityMapper): ary, make_var(access_info.array_name), simplify_using_aff( - self.kernel, self.rec(subscript, 'i'))) + self.kernel, self.rec(subscript, "i"))) if access_info.vector_index is not None: return self.codegen_state.ast_builder.add_vector_access( @@ -295,7 +295,7 @@ class ExpressionToCExpressionMapper(IdentityMapper): return self.make_subscript( arg, var(expr.aggregate.name), - self.rec(offset + expr.index, 'i')) + self.rec(offset + expr.index, "i")) elif expr.aggregate.name in self.kernel.temporary_variables: raise RuntimeError("linear indexing is not supported on temporaries: %s" @@ -339,13 +339,13 @@ class ExpressionToCExpressionMapper(IdentityMapper): else: seen_func("%s_pos_b" % base_func_name) return var("%s_pos_b_%s" % (base_func_name, suffix))( - self.rec(expr.numerator, 'i'), - self.rec(expr.denominator, 'i')) + self.rec(expr.numerator, "i"), + self.rec(expr.denominator, "i")) else: seen_func(base_func_name) return var("%s_%s" % (base_func_name, suffix))( - self.rec(expr.numerator, 'i'), - self.rec(expr.denominator, 'i')) + self.rec(expr.numerator, "i"), + self.rec(expr.denominator, "i")) def map_floor_div(self, expr, type_context): import operator @@ -684,8 +684,8 @@ class ExpressionToCExpressionMapper(IdentityMapper): if not self.allow_complex: return base_impl(expr, type_context) - n_complex = 'c' == n_dtype.kind - d_complex = 'c' == d_dtype.kind + n_complex = "c" == n_dtype.kind + d_complex = "c" == d_dtype.kind tgt_dtype = self.infer_type(expr) diff --git a/loopy/target/c/compyte b/loopy/target/c/compyte index 7e48e1166a13cfbb7b60f909b071f088034ffda1..d1f993daecc03947d9e6e3e60d2a5145ecbf3786 160000 --- a/loopy/target/c/compyte +++ b/loopy/target/c/compyte @@ -1 +1 @@ -Subproject commit 7e48e1166a13cfbb7b60f909b071f088034ffda1 +Subproject commit d1f993daecc03947d9e6e3e60d2a5145ecbf3786 diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 50fd1026f7bd15ce72915d0d5d5e60f6da4e264c..27422abce85cc6adb329bae9f30e4e36dd9bc06b 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -58,18 +58,18 @@ def _create_vector_types(): vec.type_to_scalar_and_count = {} for base_name, base_type, counts in [ - ('char', np.int8, [1, 2, 3, 4]), - ('uchar', np.uint8, [1, 2, 3, 4]), - ('short', np.int16, [1, 2, 3, 4]), - ('ushort', np.uint16, [1, 2, 3, 4]), - ('int', np.int32, [1, 2, 3, 4]), - ('uint', np.uint32, [1, 2, 3, 4]), - ('long', long_dtype, [1, 2, 3, 4]), - ('ulong', ulong_dtype, [1, 2, 3, 4]), - ('longlong', np.int64, [1, 2]), - ('ulonglong', np.uint64, [1, 2]), - ('float', np.float32, [1, 2, 3, 4]), - ('double', np.float64, [1, 2]), + ("char", np.int8, [1, 2, 3, 4]), + ("uchar", np.uint8, [1, 2, 3, 4]), + ("short", np.int16, [1, 2, 3, 4]), + ("ushort", np.uint16, [1, 2, 3, 4]), + ("int", np.int32, [1, 2, 3, 4]), + ("uint", np.uint32, [1, 2, 3, 4]), + ("long", long_dtype, [1, 2, 3, 4]), + ("ulong", ulong_dtype, [1, 2, 3, 4]), + ("longlong", np.int64, [1, 2]), + ("ulonglong", np.uint64, [1, 2]), + ("float", np.float32, [1, 2, 3, 4]), + ("double", np.float64, [1, 2]), ]: for count in counts: name = "%s%d" % (base_name, count) diff --git a/loopy/target/execution.py b/loopy/target/execution.py index c5ccc54f148d704d560d7fe2e61863c215bb2489..a503475d095baf15d644484dd0acac17d7577574 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -234,7 +234,7 @@ class ExecutionWrapperGeneratorBase(object): gen("else:") with Indentation(gen): if not options.no_numpy: - gen("_lpy_offset = getattr(%s, \"offset\", 0)" + gen('_lpy_offset = getattr(%s, "offset", 0)' % impl_array_name) else: gen("_lpy_offset = %s.offset" % impl_array_name) @@ -246,7 +246,7 @@ class ExecutionWrapperGeneratorBase(object): % (arg.name, base_arg.dtype.itemsize)) gen("assert _lpy_remdr == 0, \"Offset of array '%s' is " - "not divisible by its dtype itemsize\"" + 'not divisible by its dtype itemsize"' % impl_array_name) gen("del _lpy_remdr") else: @@ -281,7 +281,7 @@ class ExecutionWrapperGeneratorBase(object): with Indentation(gen): gen("raise RuntimeError(\"required stride '%s' for " "argument '%s' not given or deducible from " - "passed array\")" + 'passed array")' % (arg.name, impl_array_name)) base_arg = kernel.impl_arg_to_arg[impl_array_name] @@ -292,7 +292,7 @@ class ExecutionWrapperGeneratorBase(object): base_arg.dtype.dtype.itemsize)) gen("assert _lpy_remdr == 0, \"Stride %d of array '%s' " - " is not divisible by its dtype itemsize\"" + ' is not divisible by its dtype itemsize"' % (stride_impl_axis, impl_array_name)) gen("del _lpy_remdr") else: @@ -324,7 +324,7 @@ class ExecutionWrapperGeneratorBase(object): with Indentation(gen): gen("raise TypeError(\"value argument '%s' " "was not given and could not be automatically " - "determined\")" % arg.name) + 'determined")' % arg.name) gen("# }}}") gen("") @@ -409,7 +409,7 @@ class ExecutionWrapperGeneratorBase(object): gen("if %s is None:" % arg.name) with Indentation(gen): gen("raise RuntimeError(\"input argument '%s' must " - "be supplied\")" % arg.name) + 'be supplied")' % arg.name) gen("") if (is_written @@ -418,14 +418,14 @@ class ExecutionWrapperGeneratorBase(object): gen("if %s is None:" % arg.name) with Indentation(gen): gen("raise RuntimeError(\"written image '%s' must " - "be supplied\")" % arg.name) + 'be supplied")' % arg.name) gen("") if is_written and arg.shape is None and not options.skip_arg_checks: gen("if %s is None:" % arg.name) with Indentation(gen): gen("raise RuntimeError(\"written argument '%s' has " - "unknown shape and must be supplied\")" % arg.name) + 'unknown shape and must be supplied")' % arg.name) gen("") possibly_made_by_loopy = False @@ -468,7 +468,7 @@ class ExecutionWrapperGeneratorBase(object): kernel_arg.dtype.numpy_dtype))) with Indentation(gen): gen("raise TypeError(\"dtype mismatch on argument '%s' " - "(got: %%s, expected: %s)\" %% %s.dtype)" + '(got: %%s, expected: %s)" %% %s.dtype)' % (arg.name, arg.dtype, arg.name)) # {{{ generate shape checking code @@ -489,7 +489,7 @@ class ExecutionWrapperGeneratorBase(object): shape_mismatch_msg = ( "raise TypeError(\"shape mismatch on argument '%s' " - "(got: %%s, expected: %%s)\" " + '(got: %%s, expected: %%s)" ' "%% (%s.shape, %s))" % (arg.name, arg.name, strify_tuple(arg.unvec_shape))) @@ -545,10 +545,10 @@ class ExecutionWrapperGeneratorBase(object): "if dim > 1)" % (arg.name, strify_tuple(sym_strides))) - gen("raise TypeError(\"strides mismatch on " + gen('raise TypeError("strides mismatch on ' "argument '%s' " "(after removing unit length dims, " - "got: %%s, expected: %%s)\" " + 'got: %%s, expected: %%s)" ' "%% (_lpy_got, _lpy_expected))" % arg.name) @@ -559,7 +559,7 @@ class ExecutionWrapperGeneratorBase(object): gen("raise ValueError(\"Argument '%s' does not " "allow arrays with offsets. Try passing " "default_offset=loopy.auto to make_kernel()." - "\")" % arg.name) + '")' % arg.name) gen("") # }}} diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index eb0157bf86d478901fb5a07bbac28aa7a11bcec9..322c771b653a2fd28977538f81e64a63e8984784 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -117,7 +117,7 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper): subscript, = access_info.subscripts result = var(access_info.array_name)[ - var("programIndex") + self.rec(lsize*subscript, 'i')] + var("programIndex") + self.rec(lsize*subscript, "i")] if access_info.vector_index is not None: return self.kernel.target.add_vector_access( @@ -475,7 +475,7 @@ class ISPCASTBuilder(CFamilyASTBuilder): "streaming_store(%s + %s, %s)" % ( access_info.array_name, - ecm(flattened_sum(new_terms), PREC_NONE, 'i'), + ecm(flattened_sum(new_terms), PREC_NONE, "i"), rhs_code)) # }}} diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 4569be50367b3063999656bcd1de9d76f98e8c0a..f81c05a396c30bb9043d55416f157182beb5085b 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -81,16 +81,16 @@ def _create_vector_types(): counts = [2, 3, 4, 8, 16] for base_name, base_type in [ - ('char', np.int8), - ('uchar', np.uint8), - ('short', np.int16), - ('ushort', np.uint16), - ('int', np.int32), - ('uint', np.uint32), - ('long', np.int64), - ('ulong', np.uint64), - ('float', np.float32), - ('double', np.float64), + ("char", np.int8), + ("uchar", np.uint8), + ("short", np.int16), + ("ushort", np.uint16), + ("int", np.int32), + ("uint", np.uint32), + ("long", np.int64), + ("ulong", np.uint64), + ("float", np.float32), + ("double", np.float64), ]: for count in counts: name = "%s%d" % (base_name, count) @@ -151,16 +151,16 @@ _CL_SIMPLE_MULTI_ARG_FUNCTIONS = { VECTOR_LITERAL_FUNCS = dict( ("make_%s%d" % (name, count), (name, dtype, count)) for name, dtype in [ - ('char', np.int8), - ('uchar', np.uint8), - ('short', np.int16), - ('ushort', np.uint16), - ('int', np.int32), - ('uint', np.uint32), - ('long', np.int64), - ('ulong', np.uint64), - ('float', np.float32), - ('double', np.float64), + ("char", np.int8), + ("uchar", np.uint8), + ("short", np.int16), + ("ushort", np.uint16), + ("int", np.int32), + ("uint", np.uint32), + ("long", np.int64), + ("ulong", np.uint64), + ("float", np.float32), + ("double", np.float64), ] for count in [2, 3, 4, 8, 16] ) diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index 24b724c6022dade0eba682539096fda1156e0b5c..7ede6e7605652308be676af9b2a069a7495eaf38 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -57,7 +57,7 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): if dtype.isbuiltin: return "_lpy_np."+dtype.name else: - return ("_lpy_cl_tools.get_or_register_dtype(\"%s\")" + return ('_lpy_cl_tools.get_or_register_dtype("%s")' % cl_tools.dtype_to_ctype(dtype)) # {{{ handle non-numpy args @@ -222,7 +222,7 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): if options.return_dict: gen("return _lpy_evt, {%s}" - % ", ".join("\"%s\": %s" % (arg.name, arg.name) + % ", ".join('"%s": %s' % (arg.name, arg.name) for arg in implemented_data_info if issubclass(arg.arg_class, KernelArgument) if arg.base_name in kernel.get_written_variables())) diff --git a/loopy/tools.py b/loopy/tools.py index a1cd5e108a45ba60c71b3bb7a51f779b84172065..a93b918f4cf749db55a656acb8522c7daf9d06af 100644 --- a/loopy/tools.py +++ b/loopy/tools.py @@ -40,12 +40,8 @@ import six # noqa from six.moves import intern -if six.PY2: - def is_integer(obj): - return isinstance(obj, (int, long, np.integer)) # noqa pylint:disable=undefined-variable -else: - def is_integer(obj): - return isinstance(obj, (int, np.integer)) +def is_integer(obj): + return isinstance(obj, (int, np.integer)) # {{{ custom KeyBuilder subclass @@ -317,8 +313,8 @@ def cptr_from_numpy(obj): # https://github.com/hgomersall/pyFFTW/blob/master/pyfftw/utils.pxi#L172 -def empty_aligned(shape, dtype, order='C', n=64): - '''empty_aligned(shape, dtype='float64', order='C', n=None) +def empty_aligned(shape, dtype, order="C", n=64): + """empty_aligned(shape, dtype='float64', order="C", n=None) Function that returns an empty numpy array that is n-byte aligned, where ``n`` is determined by inspecting the CPU if it is not provided. @@ -326,7 +322,7 @@ def empty_aligned(shape, dtype, order='C', n=64): ``n`` is not provided then this function will inspect the CPU to determine alignment. The rest of the arguments are as per :func:`numpy.empty`. - ''' + """ itemsize = np.dtype(dtype).itemsize # Apparently there is an issue with numpy.prod wrapping around on 32-bits diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 6c7cb3365991cf92db4c0fa2a56a07e9ad07f66d..905c1e64ab96b039bce5e451da71ce1f73792e0e 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -723,7 +723,7 @@ def tag_inames(kernel, iname_to_tag, force=False, ignore_nonexistent=False): from loopy.match import re_from_glob new_iname_to_tag = {} for iname, new_tag in iname_to_tag: - if '*' in iname or '?' in iname: + if "*" in iname or "?" in iname: match_re = re_from_glob(iname) for sub_iname in all_inames: if match_re.match(sub_iname): diff --git a/setup.cfg b/setup.cfg index a0d95746e1a399d6a2d7c315bffc9b834d2f5487..9495d106cf389d485037db16a35a14b4aaf6c873 100644 --- a/setup.cfg +++ b/setup.cfg @@ -4,3 +4,7 @@ max-line-length=85 exclude= loopy/target/c/compyte/ndarray, loopy/target/c/compyte/array.py + +inline-quotes = " +docstring-quotes = """ +multiline-quotes = """ diff --git a/setup.py b/setup.py index c041ba2dad331d44ae34ea7959df32de05ec807b..497fa60ba09acc34a24e77bd17bf56a69b9490e7 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ finally: version_file.close() os.environ["AKPYTHON_EXEC_IMPORT_UNAVAILABLE"] = "1" -exec(compile(version_file_contents, "loopy/version.py", 'exec'), ver_dic) +exec(compile(version_file_contents, "loopy/version.py", "exec"), ver_dic) # {{{ capture git revision at install time @@ -56,7 +56,7 @@ def write_git_revision(package_name): git_rev = find_git_revision(dn) with open(join(dn, package_name, "_git_rev.py"), "w") as outf: - outf.write("GIT_REVISION = %s\n" % repr(git_rev)) + outf.write('GIT_REVISION = "%s"\n' % git_rev) write_git_revision("loopy") @@ -69,20 +69,20 @@ setup(name="loo.py", description="A code generator for array-based code on CPUs and GPUs", long_description=open("README.rst", "rt").read(), classifiers=[ - 'Development Status :: 4 - Beta', - 'Intended Audience :: Developers', - 'Intended Audience :: Other Audience', - 'Intended Audience :: Science/Research', - 'License :: OSI Approved :: MIT License', - 'Natural Language :: English', - 'Programming Language :: Python', - 'Programming Language :: Python :: 3', - 'Topic :: Scientific/Engineering', - 'Topic :: Scientific/Engineering :: Information Analysis', - 'Topic :: Scientific/Engineering :: Mathematics', - 'Topic :: Scientific/Engineering :: Visualization', - 'Topic :: Software Development :: Libraries', - 'Topic :: Utilities', + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "Intended Audience :: Other Audience", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Natural Language :: English", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: Information Analysis", + "Topic :: Scientific/Engineering :: Mathematics", + "Topic :: Scientific/Engineering :: Visualization", + "Topic :: Software Development :: Libraries", + "Topic :: Utilities", ], python_requires="~=3.6", diff --git a/test/test_apps.py b/test/test_apps.py index f7eeb756e735ffb4d5ab6ab747c6bb792c690668..ed5e5ce3d25bd37c92d400021422b362c3a5e28f 100644 --- a/test/test_apps.py +++ b/test/test_apps.py @@ -47,7 +47,7 @@ from loopy.diagnostic import LoopyError __all__ = [ "pytest_generate_tests", - "cl" # 'cl.create_some_context' + "cl" # "cl.create_some_context" ] @@ -102,10 +102,10 @@ def test_convolution(ctx_factory): knl = lp.split_iname(knl, "im_y", 16, outer_tag="g.1", inner_tag="l.1") knl = lp.tag_inames(knl, dict(ifeat="g.2")) knl = lp.add_prefetch(knl, "f[ifeat,:,:,:]", - fetch_outer_inames='im_x_outer, im_y_outer, ifeat', + fetch_outer_inames="im_x_outer, im_y_outer, ifeat", default_tag="l.auto") knl = lp.add_prefetch(knl, "img", "im_x_inner, im_y_inner, f_x, f_y", - fetch_outer_inames='iimg, im_x_outer, im_y_outer, ifeat, icolor', + fetch_outer_inames="iimg, im_x_outer, im_y_outer, ifeat, icolor", default_tag="l.auto") return knl @@ -592,12 +592,12 @@ def test_poisson_fem(ctx_factory): knl = lp.prioritize_loops(knl, ["c", "j", "i", "k"]) def variant_1(knl): - knl = lp.precompute(knl, "dpsi", "i,k,ell", default_tag='for') + knl = lp.precompute(knl, "dpsi", "i,k,ell", default_tag="for") knl = lp.prioritize_loops(knl, "c,i,j") return knl def variant_2(knl): - knl = lp.precompute(knl, "dpsi", "i,ell", default_tag='for') + knl = lp.precompute(knl, "dpsi", "i,ell", default_tag="for") knl = lp.prioritize_loops(knl, "c,i,j") return knl @@ -633,10 +633,10 @@ def test_domain_tree_nesting(): TV = lp.TemporaryVariable # noqa - knl = lp.make_kernel(['{[i]: 0 <= i < 12}', - '{[j]: 0 <= j < 100}', - '{[a_count]: 0 <= a_count < a_end}', - '{[b_count]: 0 <= b_count < b_end}'], + knl = lp.make_kernel(["{[i]: 0 <= i < 12}", + "{[j]: 0 <= j < 100}", + "{[a_count]: 0 <= a_count < a_end}", + "{[b_count]: 0 <= b_count < b_end}"], """ for j for i @@ -655,15 +655,15 @@ def test_domain_tree_nesting(): end """, [ - TV('out_map', initializer=out_map, read_only=True, address_space=AS.PRIVATE), - TV('if_val', initializer=if_val, read_only=True, address_space=AS.PRIVATE), - TV('vals', initializer=vals, read_only=True, address_space=AS.PRIVATE), - TV('num_vals', initializer=num_vals, read_only=True, + TV("out_map", initializer=out_map, read_only=True, address_space=AS.PRIVATE), + TV("if_val", initializer=if_val, read_only=True, address_space=AS.PRIVATE), + TV("vals", initializer=vals, read_only=True, address_space=AS.PRIVATE), + TV("num_vals", initializer=num_vals, read_only=True, address_space=AS.PRIVATE), - TV('num_vals_offset', initializer=num_vals_offset, read_only=True, + TV("num_vals_offset", initializer=num_vals_offset, read_only=True, address_space=AS.PRIVATE), - lp.GlobalArg('B', shape=(100, 31), dtype=np.float64), - lp.GlobalArg('out', shape=(100, 12), dtype=np.float64)]) + lp.GlobalArg("B", shape=(100, 31), dtype=np.float64), + lp.GlobalArg("out", shape=(100, 12), dtype=np.float64)]) parents_per_domain = knl.parents_per_domain() diff --git a/test/test_c_execution.py b/test/test_c_execution.py index b0ca7ade25d3077c7f868f366cb9ff6bb011af33..53fb80be784b4b89c2740daa80004d59ae0f2e97 100644 --- a/test/test_c_execution.py +++ b/test/test_c_execution.py @@ -63,29 +63,29 @@ def test_c_target(): def test_c_target_strides(): from loopy.target.c import ExecutableCTarget - def __get_kernel(order='C'): + def __get_kernel(order="C"): return lp.make_kernel( "{ [i,j]: 0<=i,j Tcond = T[k] < 0.5 @@ -1789,7 +1789,7 @@ def test_ilp_and_conditionals(ctx_factory): ref_knl = knl - knl = lp.split_iname(knl, 'k', 2, inner_tag='ilp') + knl = lp.split_iname(knl, "k", 2, inner_tag="ilp") lp.auto_test_vs_ref(ref_knl, ctx, knl) @@ -1797,7 +1797,7 @@ def test_ilp_and_conditionals(ctx_factory): def test_unr_and_conditionals(ctx_factory): ctx = ctx_factory() - knl = lp.make_kernel('{[k]: 0<=k Tcond[k] = T[k] < 0.5 @@ -1812,7 +1812,7 @@ def test_unr_and_conditionals(ctx_factory): ref_knl = knl - knl = lp.split_iname(knl, 'k', 2, inner_tag='unr') + knl = lp.split_iname(knl, "k", 2, inner_tag="unr") lp.auto_test_vs_ref(ref_knl, ctx, knl) @@ -1820,7 +1820,7 @@ def test_unr_and_conditionals(ctx_factory): def test_constant_array_args(ctx_factory): ctx = ctx_factory() - knl = lp.make_kernel('{[k]: 0<=k Tcond[k] = T[k] < 0.5 @@ -1829,8 +1829,8 @@ def test_constant_array_args(ctx_factory): end end """, - [lp.ConstantArg('T', shape=(200,), dtype=np.float32), - '...']) + [lp.ConstantArg("T", shape=(200,), dtype=np.float32), + "..."]) knl = lp.fix_parameters(knl, n=200) @@ -1892,33 +1892,33 @@ def test_const_temp_with_initializer_not_saved(): def test_header_extract(): - knl = lp.make_kernel('{[k]: 0<=kind = indirect(offsets[i], offsets[i + 1], 1) out[i] = data[ind] end """, - [lp.GlobalArg('out', shape=('n',)), + [lp.GlobalArg("out", shape=("n",)), lp.TemporaryVariable( - 'offsets', shape=(offsets.size,), initializer=offsets, + "offsets", shape=(offsets.size,), initializer=offsets, address_space=lp.AddressSpace.GLOBAL, read_only=True), - lp.GlobalArg('data', shape=(data.size,), dtype=np.float64)], + lp.GlobalArg("data", shape=(data.size,), dtype=np.float64)], ) # fixt params, and add manglers / preamble @@ -2564,13 +2564,13 @@ def test_preamble_with_separate_temporaries(ctx_factory): SeparateTemporariesPreambleTestPreambleGenerator, ) func_info = dict( - func_name='indirect', + func_name="indirect", func_arg_dtypes=(np.int32, np.int32, np.int32), func_result_dtypes=(np.int32,), arr=lookup ) - kernel = lp.fix_parameters(kernel, **{'n': n}) + kernel = lp.fix_parameters(kernel, **{"n": n}) kernel = lp.register_preamble_generators( kernel, [SeparateTemporariesPreambleTestPreambleGenerator(**func_info)]) kernel = lp.register_function_manglers( @@ -2582,7 +2582,7 @@ def test_preamble_with_separate_temporaries(ctx_factory): queue = cl.CommandQueue(ctx) # check that it actually performs the lookup correctly assert np.allclose(kernel( - queue, data=data.flatten('C'))[1][0], data[offsets[:-1] + 1]) + queue, data=data.flatten("C"))[1][0], data[offsets[:-1] + 1]) def test_arg_inference_for_predicates(): @@ -2715,7 +2715,7 @@ def test_dep_cycle_printing_and_error(): # https://gitlab.tiker.net/inducer/loopy/issues/140 # This kernel has two dep cycles. - knl = lp.make_kernel('{[i,j,k]: 0 <= i,j,k < 12}', + knl = lp.make_kernel("{[i,j,k]: 0 <= i,j,k < 12}", """ for j for i @@ -2735,11 +2735,11 @@ def test_dep_cycle_printing_and_error(): end end """, - [lp.GlobalArg('a', shape=(12, 12), dtype=np.int32)]) + [lp.GlobalArg("a", shape=(12, 12), dtype=np.int32)]) - knl = lp.split_iname(knl, 'j', 4, inner_tag='vec') - knl = lp.split_array_axis(knl, 'a', 1, 4) - knl = lp.tag_array_axes(knl, 'a', 'N1,N0,vec') + knl = lp.split_iname(knl, "j", 4, inner_tag="vec") + knl = lp.split_array_axis(knl, "a", 1, 4) + knl = lp.tag_array_axes(knl, "a", "N1,N0,vec") knl = lp.preprocess_kernel(knl) from loopy.diagnostic import DependencyCycleFound @@ -2758,7 +2758,7 @@ def test_backwards_dep_printing_and_error(): d[i] = 7*a[i ] {id=insn5, dep=insn4} a[i] = a[i] + d[i] {id=insn6, dep=insn5} """, [ - lp.GlobalArg('a, b', dtype=np.float64), + lp.GlobalArg("a, b", dtype=np.float64), "..." ]) @@ -2837,9 +2837,9 @@ def test_shape_mismatch_check(ctx_factory): def test_array_arg_extra_kwargs_persis_hash(): from loopy.tools import LoopyKeyBuilder - a = lp.ArrayArg('a', shape=(10, ), dtype=np.float64, + a = lp.ArrayArg("a", shape=(10, ), dtype=np.float64, address_space=lp.AddressSpace.LOCAL) - not_a = lp.ArrayArg('a', shape=(10, ), dtype=np.float64, + not_a = lp.ArrayArg("a", shape=(10, ), dtype=np.float64, address_space=lp.AddressSpace.PRIVATE) key_builder = LoopyKeyBuilder() @@ -2852,7 +2852,7 @@ def test_non_integral_array_idx_raises(): """ out[j] = 0 {id=init} out[i] = a[1.94**i-1] {dep=init} - """, [lp.GlobalArg('a', np.float64), '...']) + """, [lp.GlobalArg("a", np.float64), "..."]) from loopy.diagnostic import LoopyError with pytest.raises(LoopyError): diff --git a/test/test_nbody.py b/test/test_nbody.py index 6016c2f1c9955d3bd58d52ad33a3fa95ed63cff8..f7f88889b63f5811eb30a9d2b591e522d048ff01 100644 --- a/test/test_nbody.py +++ b/test/test_nbody.py @@ -78,7 +78,7 @@ def test_nbody(ctx_factory): knl = lp.split_iname(knl, "j", 256) knl = lp.add_prefetch(knl, "x[j,k]", ["j_inner", "k"], ["x_fetch_j", "x_fetch_k"], - fetch_outer_inames='i_outer, j_outer', default_tag=None) + fetch_outer_inames="i_outer, j_outer", default_tag=None) knl = lp.tag_inames(knl, dict(x_fetch_k="unr", x_fetch_j="l.0")) knl = lp.add_prefetch(knl, "x[i,k]", ["k"], default_tag=None) knl = lp.prioritize_loops(knl, ["j_outer", "j_inner"]) diff --git a/test/test_sem_reagan.py b/test/test_sem_reagan.py index fff2b5356e75f414356ea1c61c2dd54753186d26..1291de2b0e3e5f62676dcea30cce220a966d98ab 100644 --- a/test/test_sem_reagan.py +++ b/test/test_sem_reagan.py @@ -82,7 +82,7 @@ def test_tim2d(ctx_factory): def variant_orig(knl): knl = lp.tag_inames(knl, dict(i="l.0", j="l.1", e="g.0")) - knl = lp.add_prefetch(knl, "D[:,:]", fetch_outer_inames='e', + knl = lp.add_prefetch(knl, "D[:,:]", fetch_outer_inames="e", default_tag="l.auto") knl = lp.add_prefetch(knl, "u[e, :, :]", default_tag="l.auto") diff --git a/test/test_statistics.py b/test/test_statistics.py index 41a88b3864166b81d60ec0468cf9e5fbd07c227c..33565ef0007dff2b1ebf671dc0a55341d09c5053 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -66,13 +66,13 @@ def test_op_counter_basic(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} - f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params) - f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params) - f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP)].eval_with_dict(params) - f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul', CG.SUBGROUP) + params = {"n": n, "m": m, "ell": ell} + f32add = op_map[lp.Op(np.float32, "add", CG.SUBGROUP)].eval_with_dict(params) + f32mul = op_map[lp.Op(np.float32, "mul", CG.SUBGROUP)].eval_with_dict(params) + f32div = op_map[lp.Op(np.float32, "div", CG.SUBGROUP)].eval_with_dict(params) + f64mul = op_map[lp.Op(np.dtype(np.float64), "mul", CG.SUBGROUP) ].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP) + i32add = op_map[lp.Op(np.dtype(np.int32), "add", CG.SUBGROUP) ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups assert f32add == f32mul == f32div == n*m*ell*n_subgroups @@ -98,14 +98,14 @@ def test_op_counter_reduction(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} - f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params) - f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul', CG.SUBGROUP) + params = {"n": n, "m": m, "ell": ell} + f32add = op_map[lp.Op(np.float32, "add", CG.SUBGROUP)].eval_with_dict(params) + f32mul = op_map[lp.Op(np.dtype(np.float32), "mul", CG.SUBGROUP) ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups assert f32add == f32mul == n*m*ell*n_subgroups - op_map_dtype = op_map.group_by('dtype') + op_map_dtype = op_map.group_by("dtype") f32 = op_map_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params) assert f32 == f32add + f32mul @@ -133,12 +133,12 @@ def test_op_counter_logic(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} - f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params) - f64add = op_map[lp.Op(np.float64, 'add', CG.SUBGROUP)].eval_with_dict(params) - f64div = op_map[lp.Op(np.dtype(np.float64), 'div', CG.SUBGROUP) + params = {"n": n, "m": m, "ell": ell} + f32mul = op_map[lp.Op(np.float32, "mul", CG.SUBGROUP)].eval_with_dict(params) + f64add = op_map[lp.Op(np.float64, "add", CG.SUBGROUP)].eval_with_dict(params) + f64div = op_map[lp.Op(np.dtype(np.float64), "div", CG.SUBGROUP) ].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP) + i32add = op_map[lp.Op(np.dtype(np.int32), "add", CG.SUBGROUP) ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups assert f32mul == n*m*n_subgroups @@ -171,18 +171,18 @@ def test_op_counter_specialops(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} - f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params) - f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP)].eval_with_dict(params) - f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params) - f64pow = op_map[lp.Op(np.float64, 'pow', CG.SUBGROUP)].eval_with_dict(params) - f64add = op_map[lp.Op(np.dtype(np.float64), 'add', CG.SUBGROUP) + params = {"n": n, "m": m, "ell": ell} + f32mul = op_map[lp.Op(np.float32, "mul", CG.SUBGROUP)].eval_with_dict(params) + f32div = op_map[lp.Op(np.float32, "div", CG.SUBGROUP)].eval_with_dict(params) + f32add = op_map[lp.Op(np.float32, "add", CG.SUBGROUP)].eval_with_dict(params) + f64pow = op_map[lp.Op(np.float64, "pow", CG.SUBGROUP)].eval_with_dict(params) + f64add = op_map[lp.Op(np.dtype(np.float64), "add", CG.SUBGROUP) ].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP) + i32add = op_map[lp.Op(np.dtype(np.int32), "add", CG.SUBGROUP) ].eval_with_dict(params) - f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt', CG.SUBGROUP) + f64rsq = op_map[lp.Op(np.dtype(np.float64), "func:rsqrt", CG.SUBGROUP) ].eval_with_dict(params) - f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin', CG.SUBGROUP) + f64sin = op_map[lp.Op(np.dtype(np.float64), "func:sin", CG.SUBGROUP) ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups assert f32div == 2*n*m*ell*n_subgroups @@ -217,16 +217,16 @@ def test_op_counter_bitwise(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} - i32add = op_map[lp.Op(np.int32, 'add', CG.SUBGROUP)].eval_with_dict(params) - i32bw = op_map[lp.Op(np.int32, 'bw', CG.SUBGROUP)].eval_with_dict(params) - i64bw = op_map[lp.Op(np.dtype(np.int64), 'bw', CG.SUBGROUP) + params = {"n": n, "m": m, "ell": ell} + i32add = op_map[lp.Op(np.int32, "add", CG.SUBGROUP)].eval_with_dict(params) + i32bw = op_map[lp.Op(np.int32, "bw", CG.SUBGROUP)].eval_with_dict(params) + i64bw = op_map[lp.Op(np.dtype(np.int64), "bw", CG.SUBGROUP) ].eval_with_dict(params) - i64mul = op_map[lp.Op(np.dtype(np.int64), 'mul', CG.SUBGROUP) + i64mul = op_map[lp.Op(np.dtype(np.int64), "mul", CG.SUBGROUP) ].eval_with_dict(params) - i64add = op_map[lp.Op(np.dtype(np.int64), 'add', CG.SUBGROUP) + i64add = op_map[lp.Op(np.dtype(np.int64), "add", CG.SUBGROUP) ].eval_with_dict(params) - i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift', CG.SUBGROUP) + i64shift = op_map[lp.Op(np.dtype(np.int64), "shift", CG.SUBGROUP) ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups assert i32add == n*m*ell*n_subgroups @@ -261,7 +261,7 @@ def test_op_counter_triangular_domain(): knl, subgroup_size=SGS, count_redundant_work=True - )[lp.Op(np.float64, 'mul', CG.SUBGROUP)] + )[lp.Op(np.float64, "mul", CG.SUBGROUP)] value_dict = dict(m=13, n=200) flops = op_map.eval_with_dict(value_dict) @@ -297,31 +297,31 @@ def test_mem_access_counter_basic(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} + params = {"n": n, "m": m, "ell": ell} n_workgroups = 1 group_size = 1 subgroups_per_group = div_ceil(group_size, SGS) n_subgroups = n_workgroups*subgroups_per_group - f32l = mem_map[lp.MemAccess('global', np.float32, + f32l = mem_map[lp.MemAccess("global", np.float32, lid_strides={}, gid_strides={}, - direction='load', variable='a', + direction="load", variable="a", count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - f32l += mem_map[lp.MemAccess('global', np.float32, + f32l += mem_map[lp.MemAccess("global", np.float32, lid_strides={}, gid_strides={}, - direction='load', variable='b', + direction="load", variable="b", count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - f64l = mem_map[lp.MemAccess('global', np.float64, + f64l = mem_map[lp.MemAccess("global", np.float64, lid_strides={}, gid_strides={}, - direction='load', variable='g', + direction="load", variable="g", count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - f64l += mem_map[lp.MemAccess('global', np.float64, + f64l += mem_map[lp.MemAccess("global", np.float64, lid_strides={}, gid_strides={}, - direction='load', variable='h', + direction="load", variable="h", count_granularity=CG.SUBGROUP) ].eval_with_dict(params) @@ -329,14 +329,14 @@ def test_mem_access_counter_basic(): assert f32l == (3*n*m*ell)*n_subgroups assert f64l == (2*n*m)*n_subgroups - f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), + f32s = mem_map[lp.MemAccess("global", np.dtype(np.float32), lid_strides={}, gid_strides={}, - direction='store', variable='c', + direction="store", variable="c", count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - f64s = mem_map[lp.MemAccess('global', np.dtype(np.float64), + f64s = mem_map[lp.MemAccess("global", np.dtype(np.float64), lid_strides={}, gid_strides={}, - direction='store', variable='e', + direction="store", variable="e", count_granularity=CG.SUBGROUP) ].eval_with_dict(params) @@ -361,39 +361,39 @@ def test_mem_access_counter_reduction(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} + params = {"n": n, "m": m, "ell": ell} n_workgroups = 1 group_size = 1 subgroups_per_group = div_ceil(group_size, SGS) n_subgroups = n_workgroups*subgroups_per_group - f32l = mem_map[lp.MemAccess('global', np.float32, + f32l = mem_map[lp.MemAccess("global", np.float32, lid_strides={}, gid_strides={}, - direction='load', variable='a', + direction="load", variable="a", count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - f32l += mem_map[lp.MemAccess('global', np.float32, + f32l += mem_map[lp.MemAccess("global", np.float32, lid_strides={}, gid_strides={}, - direction='load', variable='b', + direction="load", variable="b", count_granularity=CG.SUBGROUP) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_subgroups assert f32l == (2*n*m*ell)*n_subgroups - f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), + f32s = mem_map[lp.MemAccess("global", np.dtype(np.float32), lid_strides={}, gid_strides={}, - direction='store', variable='c', + direction="store", variable="c", count_granularity=CG.SUBGROUP) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_subgroups assert f32s == (n*ell)*n_subgroups - ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'] + ld_bytes = mem_map.filter_by(mtype=["global"], direction=["load"] ).to_bytes().eval_and_sum(params) - st_bytes = mem_map.filter_by(mtype=['global'], direction=['store'] + st_bytes = mem_map.filter_by(mtype=["global"], direction=["store"] ).to_bytes().eval_and_sum(params) assert ld_bytes == 4*f32l assert st_bytes == 4*f32s @@ -419,23 +419,23 @@ def test_mem_access_counter_logic(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} + params = {"n": n, "m": m, "ell": ell} n_workgroups = 1 group_size = 1 subgroups_per_group = div_ceil(group_size, SGS) n_subgroups = n_workgroups*subgroups_per_group - reduced_map = mem_map.group_by('mtype', 'dtype', 'direction') + reduced_map = mem_map.group_by("mtype", "dtype", "direction") - f32_g_l = reduced_map[lp.MemAccess('global', to_loopy_type(np.float32), - direction='load') + f32_g_l = reduced_map[lp.MemAccess("global", to_loopy_type(np.float32), + direction="load") ].eval_with_dict(params) - f64_g_l = reduced_map[lp.MemAccess('global', to_loopy_type(np.float64), - direction='load') + f64_g_l = reduced_map[lp.MemAccess("global", to_loopy_type(np.float64), + direction="load") ].eval_with_dict(params) - f64_g_s = reduced_map[lp.MemAccess('global', to_loopy_type(np.float64), - direction='store') + f64_g_s = reduced_map[lp.MemAccess("global", to_loopy_type(np.float64), + direction="store") ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_subgroups @@ -464,31 +464,31 @@ def test_mem_access_counter_specialops(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} + params = {"n": n, "m": m, "ell": ell} n_workgroups = 1 group_size = 1 subgroups_per_group = div_ceil(group_size, SGS) n_subgroups = n_workgroups*subgroups_per_group - f32 = mem_map[lp.MemAccess('global', np.float32, + f32 = mem_map[lp.MemAccess("global", np.float32, lid_strides={}, gid_strides={}, - direction='load', variable='a', + direction="load", variable="a", count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - f32 += mem_map[lp.MemAccess('global', np.float32, + f32 += mem_map[lp.MemAccess("global", np.float32, lid_strides={}, gid_strides={}, - direction='load', variable='b', + direction="load", variable="b", count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - f64 = mem_map[lp.MemAccess('global', np.dtype(np.float64), + f64 = mem_map[lp.MemAccess("global", np.dtype(np.float64), lid_strides={}, gid_strides={}, - direction='load', variable='g', + direction="load", variable="g", count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - f64 += mem_map[lp.MemAccess('global', np.dtype(np.float64), + f64 += mem_map[lp.MemAccess("global", np.dtype(np.float64), lid_strides={}, gid_strides={}, - direction='load', variable='h', + direction="load", variable="h", count_granularity=CG.SUBGROUP) ].eval_with_dict(params) @@ -496,14 +496,14 @@ def test_mem_access_counter_specialops(): assert f32 == (2*n*m*ell)*n_subgroups assert f64 == (2*n*m)*n_subgroups - f32 = mem_map[lp.MemAccess('global', np.float32, + f32 = mem_map[lp.MemAccess("global", np.float32, lid_strides={}, gid_strides={}, - direction='store', variable='c', + direction="store", variable="c", count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - f64 = mem_map[lp.MemAccess('global', np.float64, + f64 = mem_map[lp.MemAccess("global", np.float64, lid_strides={}, gid_strides={}, - direction='store', variable='e', + direction="store", variable="e", count_granularity=CG.SUBGROUP) ].eval_with_dict(params) @@ -511,7 +511,7 @@ def test_mem_access_counter_specialops(): assert f32 == (n*m*ell)*n_subgroups assert f64 == (n*m)*n_subgroups - filtered_map = mem_map.filter_by(direction=['load'], variable=['a', 'g'], + filtered_map = mem_map.filter_by(direction=["load"], variable=["a", "g"], count_granularity=CG.SUBGROUP) tot = filtered_map.eval_and_sum(params) @@ -541,45 +541,45 @@ def test_mem_access_counter_bitwise(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} + params = {"n": n, "m": m, "ell": ell} n_workgroups = 1 group_size = 1 subgroups_per_group = div_ceil(group_size, SGS) n_subgroups = n_workgroups*subgroups_per_group - i32 = mem_map[lp.MemAccess('global', np.int32, + i32 = mem_map[lp.MemAccess("global", np.int32, lid_strides={}, gid_strides={}, - direction='load', variable='a', + direction="load", variable="a", count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - i32 += mem_map[lp.MemAccess('global', np.int32, + i32 += mem_map[lp.MemAccess("global", np.int32, lid_strides={}, gid_strides={}, - direction='load', variable='b', + direction="load", variable="b", count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - i32 += mem_map[lp.MemAccess('global', np.int32, + i32 += mem_map[lp.MemAccess("global", np.int32, lid_strides={}, gid_strides={}, - direction='load', variable='g', + direction="load", variable="g", count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32), + i32 += mem_map[lp.MemAccess("global", np.dtype(np.int32), lid_strides={}, gid_strides={}, - direction='load', variable='h', + direction="load", variable="h", count_granularity=CG.SUBGROUP) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_subgroups assert i32 == (4*n*m+2*n*m*ell)*n_subgroups - i32 = mem_map[lp.MemAccess('global', np.int32, + i32 = mem_map[lp.MemAccess("global", np.int32, lid_strides={}, gid_strides={}, - direction='store', variable='c', + direction="store", variable="c", count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - i32 += mem_map[lp.MemAccess('global', np.int32, + i32 += mem_map[lp.MemAccess("global", np.int32, lid_strides={}, gid_strides={}, - direction='store', variable='e', + direction="store", variable="e", count_granularity=CG.SUBGROUP) ].eval_with_dict(params) @@ -610,7 +610,7 @@ def test_mem_access_counter_mixed(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} + params = {"n": n, "m": m, "ell": ell} n_workgroups = div_ceil(ell, group_size_0) group_size = group_size_0 @@ -619,33 +619,33 @@ def test_mem_access_counter_mixed(): mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=SGS) - f64uniform = mem_map[lp.MemAccess('global', np.float64, + f64uniform = mem_map[lp.MemAccess("global", np.float64, lid_strides={}, gid_strides={}, - direction='load', variable='g', + direction="load", variable="g", count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - f64uniform += mem_map[lp.MemAccess('global', np.float64, + f64uniform += mem_map[lp.MemAccess("global", np.float64, lid_strides={}, gid_strides={}, - direction='load', variable='h', + direction="load", variable="h", count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - f32uniform = mem_map[lp.MemAccess('global', np.float32, + f32uniform = mem_map[lp.MemAccess("global", np.float32, lid_strides={}, gid_strides={}, - direction='load', variable='x', + direction="load", variable="x", count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), - lid_strides={0: Variable('m')}, - gid_strides={0: Variable('m')*group_size_0}, - direction='load', - variable='a', + f32nonconsec = mem_map[lp.MemAccess("global", np.dtype(np.float32), + lid_strides={0: Variable("m")}, + gid_strides={0: Variable("m")*group_size_0}, + direction="load", + variable="a", count_granularity=CG.WORKITEM) ].eval_with_dict(params) - f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32), - lid_strides={0: Variable('m')}, - gid_strides={0: Variable('m')*group_size_0}, - direction='load', - variable='b', + f32nonconsec += mem_map[lp.MemAccess("global", np.dtype(np.float32), + lid_strides={0: Variable("m")}, + gid_strides={0: Variable("m")*group_size_0}, + direction="load", + variable="b", count_granularity=CG.WORKITEM) ].eval_with_dict(params) @@ -670,16 +670,16 @@ def test_mem_access_counter_mixed(): else: assert f32nonconsec == 3*n*m*ell - f64uniform = mem_map[lp.MemAccess('global', np.float64, + f64uniform = mem_map[lp.MemAccess("global", np.float64, lid_strides={}, gid_strides={}, - direction='store', variable='e', + direction="store", variable="e", count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - f32nonconsec = mem_map[lp.MemAccess('global', np.float32, - lid_strides={0: Variable('m')}, - gid_strides={0: Variable('m')*group_size_0}, - direction='store', - variable='c', + f32nonconsec = mem_map[lp.MemAccess("global", np.float32, + lid_strides={0: Variable("m")}, + gid_strides={0: Variable("m")*group_size_0}, + direction="store", + variable="c", count_granularity=CG.WORKITEM) ].eval_with_dict(params) @@ -717,52 +717,52 @@ def test_mem_access_counter_nonconsec(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} - f64nonconsec = mem_map[lp.MemAccess('global', np.float64, - lid_strides={0: Variable('m')}, - gid_strides={0: Variable('m')*lsize0}, - direction='load', - variable='g', + params = {"n": n, "m": m, "ell": ell} + f64nonconsec = mem_map[lp.MemAccess("global", np.float64, + lid_strides={0: Variable("m")}, + gid_strides={0: Variable("m")*lsize0}, + direction="load", + variable="g", count_granularity=CG.WORKITEM) ].eval_with_dict(params) - f64nonconsec += mem_map[lp.MemAccess('global', np.float64, - lid_strides={0: Variable('m')}, - gid_strides={0: Variable('m')*lsize0}, - direction='load', - variable='h', + f64nonconsec += mem_map[lp.MemAccess("global", np.float64, + lid_strides={0: Variable("m")}, + gid_strides={0: Variable("m")*lsize0}, + direction="load", + variable="h", count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess( - 'global', np.dtype(np.float32), - lid_strides={0: Variable('m')*Variable('ell')}, - gid_strides={0: Variable('m')*Variable('ell')*lsize0}, - direction='load', variable='a', + "global", np.dtype(np.float32), + lid_strides={0: Variable("m")*Variable("ell")}, + gid_strides={0: Variable("m")*Variable("ell")*lsize0}, + direction="load", variable="a", count_granularity=CG.WORKITEM ) ].eval_with_dict(params) f32nonconsec += mem_map[lp.MemAccess( - 'global', np.dtype(np.float32), - lid_strides={0: Variable('m')*Variable('ell')}, - gid_strides={0: Variable('m')*Variable('ell')*lsize0}, - direction='load', variable='b', + "global", np.dtype(np.float32), + lid_strides={0: Variable("m")*Variable("ell")}, + gid_strides={0: Variable("m")*Variable("ell")*lsize0}, + direction="load", variable="b", count_granularity=CG.WORKITEM ) ].eval_with_dict(params) assert f64nonconsec == 2*n*m assert f32nonconsec == 3*n*m*ell - f64nonconsec = mem_map[lp.MemAccess('global', np.float64, - lid_strides={0: Variable('m')}, - gid_strides={0: Variable('m')*lsize0}, - direction='store', - variable='e', + f64nonconsec = mem_map[lp.MemAccess("global", np.float64, + lid_strides={0: Variable("m")}, + gid_strides={0: Variable("m")*lsize0}, + direction="store", + variable="e", count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess( - 'global', np.float32, - lid_strides={0: Variable('m')*Variable('ell')}, - gid_strides={0: Variable('m')*Variable('ell')*lsize0}, - direction='store', variable='c', + "global", np.float32, + lid_strides={0: Variable("m")*Variable("ell")}, + gid_strides={0: Variable("m")*Variable("ell")*lsize0}, + direction="store", variable="c", count_granularity=CG.WORKITEM ) ].eval_with_dict(params) @@ -772,37 +772,37 @@ def test_mem_access_counter_nonconsec(): mem_map64 = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=64) f64nonconsec = mem_map64[lp.MemAccess( - 'global', + "global", np.float64, - lid_strides={0: Variable('m')}, - gid_strides={0: Variable('m')*lsize0}, - direction='load', variable='g', + lid_strides={0: Variable("m")}, + gid_strides={0: Variable("m")*lsize0}, + direction="load", variable="g", count_granularity=CG.WORKITEM) ].eval_with_dict(params) f64nonconsec += mem_map64[lp.MemAccess( - 'global', + "global", np.float64, - lid_strides={0: Variable('m')}, - gid_strides={0: Variable('m')*lsize0}, - direction='load', variable='h', + lid_strides={0: Variable("m")}, + gid_strides={0: Variable("m")*lsize0}, + direction="load", variable="h", count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32nonconsec = mem_map64[lp.MemAccess( - 'global', + "global", np.dtype(np.float32), - lid_strides={0: Variable('m')*Variable('ell')}, - gid_strides={0: Variable('m')*Variable('ell')*lsize0}, - direction='load', - variable='a', + lid_strides={0: Variable("m")*Variable("ell")}, + gid_strides={0: Variable("m")*Variable("ell")*lsize0}, + direction="load", + variable="a", count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32nonconsec += mem_map64[lp.MemAccess( - 'global', + "global", np.dtype(np.float32), - lid_strides={0: Variable('m')*Variable('ell')}, - gid_strides={0: Variable('m')*Variable('ell')*lsize0}, - direction='load', - variable='b', + lid_strides={0: Variable("m")*Variable("ell")}, + gid_strides={0: Variable("m")*Variable("ell")*lsize0}, + direction="load", + variable="b", count_granularity=CG.WORKITEM) ].eval_with_dict(params) assert f64nonconsec == 2*n*m @@ -825,52 +825,52 @@ def test_mem_access_counter_consec(): knl = lp.tag_inames(knl, {"k": "l.0", "i": "g.0", "j": "g.1"}) mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, - subgroup_size='guess') + subgroup_size="guess") n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} + params = {"n": n, "m": m, "ell": ell} f64consec = mem_map[lp.MemAccess( - 'global', np.float64, - lid_strides={0: 1}, gid_strides={0: Variable('m')}, - direction='load', variable='g', + "global", np.float64, + lid_strides={0: 1}, gid_strides={0: Variable("m")}, + direction="load", variable="g", count_granularity=CG.WORKITEM) ].eval_with_dict(params) f64consec += mem_map[lp.MemAccess( - 'global', np.float64, - lid_strides={0: 1}, gid_strides={0: Variable('m')}, - direction='load', variable='h', + "global", np.float64, + lid_strides={0: 1}, gid_strides={0: Variable("m")}, + direction="load", variable="h", count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32consec = mem_map[lp.MemAccess( - 'global', np.float32, + "global", np.float32, lid_strides={0: 1}, - gid_strides={0: Variable('m')*Variable('ell'), 1: Variable('m')}, - direction='load', variable='a', + gid_strides={0: Variable("m")*Variable("ell"), 1: Variable("m")}, + direction="load", variable="a", count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32consec += mem_map[lp.MemAccess( - 'global', np.dtype(np.float32), + "global", np.dtype(np.float32), lid_strides={0: 1}, - gid_strides={0: Variable('m')*Variable('ell'), 1: Variable('m')}, - direction='load', variable='b', + gid_strides={0: Variable("m")*Variable("ell"), 1: Variable("m")}, + direction="load", variable="b", count_granularity=CG.WORKITEM) ].eval_with_dict(params) assert f64consec == 2*n*m*ell assert f32consec == 3*n*m*ell f64consec = mem_map[lp.MemAccess( - 'global', np.float64, - lid_strides={0: 1}, gid_strides={0: Variable('m')}, - direction='store', variable='e', + "global", np.float64, + lid_strides={0: 1}, gid_strides={0: Variable("m")}, + direction="store", variable="e", count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32consec = mem_map[lp.MemAccess( - 'global', np.float32, + "global", np.float32, lid_strides={0: 1}, - gid_strides={0: Variable('m')*Variable('ell'), 1: Variable('m')}, - direction='store', variable='c', + gid_strides={0: Variable("m")*Variable("ell"), 1: Variable("m")}, + direction="store", variable="c", count_granularity=CG.WORKITEM) ].eval_with_dict(params) assert f64consec == n*m*ell @@ -885,7 +885,7 @@ def test_count_granularity_val_checks(): lp.MemAccess(count_granularity=CG.WORKGROUP) lp.MemAccess(count_granularity=None) assert True - lp.MemAccess(count_granularity='bushel') + lp.MemAccess(count_granularity="bushel") assert False except ValueError: assert True @@ -896,7 +896,7 @@ def test_count_granularity_val_checks(): lp.Op(count_granularity=CG.WORKGROUP) lp.Op(count_granularity=None) assert True - lp.Op(count_granularity='bushel') + lp.Op(count_granularity="bushel") assert False except ValueError: assert True @@ -920,7 +920,7 @@ def test_barrier_counter_nobarriers(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} + params = {"n": n, "m": m, "ell": ell} assert len(sync_map) == 1 assert sync_map["kernel_launch"].eval_with_dict(params) == 1 @@ -947,7 +947,7 @@ def test_barrier_counter_barriers(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} + params = {"n": n, "m": m, "ell": ell} barrier_count = sync_map["barrier_local"].eval_with_dict(params) assert barrier_count == 50*10*2 @@ -970,7 +970,7 @@ def test_all_counters_parallel_matmul(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} + params = {"n": n, "m": m, "ell": ell} group_size = bsize*bsize n_workgroups = div_ceil(n, bsize)*div_ceil(ell, bsize) subgroups_per_group = div_ceil(group_size, SGS) @@ -983,16 +983,16 @@ def test_all_counters_parallel_matmul(): op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True) f32mul = op_map[ - lp.Op(np.float32, 'mul', CG.SUBGROUP) + lp.Op(np.float32, "mul", CG.SUBGROUP) ].eval_with_dict(params) f32add = op_map[ - lp.Op(np.float32, 'add', CG.SUBGROUP) + lp.Op(np.float32, "add", CG.SUBGROUP) ].eval_with_dict(params) i32ops = op_map[ - lp.Op(np.int32, 'add', CG.SUBGROUP) + lp.Op(np.int32, "add", CG.SUBGROUP) ].eval_with_dict(params) i32ops += op_map[ - lp.Op(np.dtype(np.int32), 'mul', CG.SUBGROUP) + lp.Op(np.dtype(np.int32), "mul", CG.SUBGROUP) ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups @@ -1001,26 +1001,26 @@ def test_all_counters_parallel_matmul(): mem_access_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=SGS) - f32s1lb = mem_access_map[lp.MemAccess('global', np.float32, - lid_strides={0: 1, 1: Variable('ell')}, + f32s1lb = mem_access_map[lp.MemAccess("global", np.float32, + lid_strides={0: 1, 1: Variable("ell")}, gid_strides={1: bsize}, - direction='load', variable='b', + direction="load", variable="b", count_granularity=CG.WORKITEM) ].eval_with_dict(params) - f32s1la = mem_access_map[lp.MemAccess('global', np.float32, - lid_strides={0: 1, 1: Variable('m')}, - gid_strides={0: Variable('m')*bsize}, - direction='load', - variable='a', count_granularity=CG.WORKITEM) + f32s1la = mem_access_map[lp.MemAccess("global", np.float32, + lid_strides={0: 1, 1: Variable("m")}, + gid_strides={0: Variable("m")*bsize}, + direction="load", + variable="a", count_granularity=CG.WORKITEM) ].eval_with_dict(params) assert f32s1lb == n*m*ell/bsize assert f32s1la == n*m*ell/bsize - f32coal = mem_access_map[lp.MemAccess('global', np.float32, - lid_strides={0: 1, 1: Variable('ell')}, - gid_strides={0: Variable('ell')*bsize, 1: bsize}, - direction='store', variable='c', + f32coal = mem_access_map[lp.MemAccess("global", np.float32, + lid_strides={0: 1, 1: Variable("ell")}, + gid_strides={0: Variable("ell")*bsize, 1: bsize}, + direction="store", variable="c", count_granularity=CG.WORKITEM) ].eval_with_dict(params) @@ -1028,32 +1028,32 @@ def test_all_counters_parallel_matmul(): local_mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, - subgroup_size=SGS).filter_by(mtype=['local']) + subgroup_size=SGS).filter_by(mtype=["local"]) - local_mem_l = local_mem_map.filter_by(direction=['load'] + local_mem_l = local_mem_map.filter_by(direction=["load"] ).eval_and_sum(params) # (count-per-sub-group)*n_subgroups assert local_mem_l == m*2*n_subgroups - local_mem_l_a = local_mem_map[lp.MemAccess('local', np.dtype(np.float32), - direction='load', + local_mem_l_a = local_mem_map[lp.MemAccess("local", np.dtype(np.float32), + direction="load", lid_strides={1: 16}, gid_strides={}, - variable='a_fetch', + variable="a_fetch", count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - local_mem_l_b = local_mem_map[lp.MemAccess('local', np.dtype(np.float32), - direction='load', + local_mem_l_b = local_mem_map[lp.MemAccess("local", np.dtype(np.float32), + direction="load", lid_strides={0: 1}, gid_strides={}, - variable='b_fetch', + variable="b_fetch", count_granularity=CG.SUBGROUP) ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups assert local_mem_l_a == local_mem_l_b == m*n_subgroups - local_mem_s = local_mem_map.filter_by(direction=['store'] + local_mem_s = local_mem_map.filter_by(direction=["store"] ).eval_and_sum(params) # (count-per-sub-group)*n_subgroups @@ -1122,7 +1122,7 @@ def test_mem_access_tagged_variables(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} + params = {"n": n, "m": m, "ell": ell} group_size = bsize*bsize n_workgroups = div_ceil(n, bsize)*div_ceil(ell, bsize) subgroups_per_group = div_ceil(group_size, SGS) @@ -1131,19 +1131,19 @@ def test_mem_access_tagged_variables(): mem_access_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=SGS) - f32s1lb = mem_access_map[lp.MemAccess('global', np.float32, + f32s1lb = mem_access_map[lp.MemAccess("global", np.float32, lid_strides={0: 1}, gid_strides={1: bsize}, - direction='load', variable='b', - variable_tag='mmbload', + direction="load", variable="b", + variable_tag="mmbload", count_granularity=CG.WORKITEM) ].eval_with_dict(params) - f32s1la = mem_access_map[lp.MemAccess('global', np.float32, - lid_strides={1: Variable('m')}, - gid_strides={0: Variable('m')*bsize}, - direction='load', - variable='a', - variable_tag='mmaload', + f32s1la = mem_access_map[lp.MemAccess("global", np.float32, + lid_strides={1: Variable("m")}, + gid_strides={0: Variable("m")*bsize}, + direction="load", + variable="a", + variable_tag="mmaload", count_granularity=CG.SUBGROUP) ].eval_with_dict(params) @@ -1152,11 +1152,11 @@ def test_mem_access_tagged_variables(): # uniform: (count-per-sub-group)*n_subgroups assert f32s1la == m*n_subgroups - f32coal = mem_access_map[lp.MemAccess('global', np.float32, - lid_strides={0: 1, 1: Variable('ell')}, - gid_strides={0: Variable('ell')*bsize, 1: bsize}, - direction='store', variable='c', - variable_tag='mmresult', + f32coal = mem_access_map[lp.MemAccess("global", np.float32, + lid_strides={0: 1, 1: Variable("ell")}, + gid_strides={0: Variable("ell")*bsize, 1: bsize}, + direction="store", variable="c", + variable_tag="mmresult", count_granularity=CG.WORKITEM) ].eval_with_dict(params) @@ -1213,7 +1213,7 @@ def test_summations_and_filters(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} + params = {"n": n, "m": m, "ell": ell} n_workgroups = 1 group_size = 1 @@ -1223,24 +1223,24 @@ def test_summations_and_filters(): mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=SGS) - loads_a = mem_map.filter_by(direction=['load'], variable=['a'], + loads_a = mem_map.filter_by(direction=["load"], variable=["a"], count_granularity=[CG.SUBGROUP] ).eval_and_sum(params) # uniform: (count-per-sub-group)*n_subgroups assert loads_a == (2*n*m*ell)*n_subgroups - global_stores = mem_map.filter_by(mtype=['global'], direction=['store'], + global_stores = mem_map.filter_by(mtype=["global"], direction=["store"], count_granularity=[CG.SUBGROUP] ).eval_and_sum(params) # uniform: (count-per-sub-group)*n_subgroups assert global_stores == (n*m*ell + n*m)*n_subgroups - ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'], + ld_bytes = mem_map.filter_by(mtype=["global"], direction=["load"], count_granularity=[CG.SUBGROUP] ).to_bytes().eval_and_sum(params) - st_bytes = mem_map.filter_by(mtype=['global'], direction=['store'], + st_bytes = mem_map.filter_by(mtype=["global"], direction=["store"], count_granularity=[CG.SUBGROUP] ).to_bytes().eval_and_sum(params) @@ -1249,10 +1249,10 @@ def test_summations_and_filters(): assert st_bytes == (4*n*m*ell + 8*n*m)*n_subgroups # ignore stride and variable names in this map - reduced_map = mem_map.group_by('mtype', 'dtype', 'direction') - f32lall = reduced_map[lp.MemAccess('global', np.float32, direction='load') + reduced_map = mem_map.group_by("mtype", "dtype", "direction") + f32lall = reduced_map[lp.MemAccess("global", np.float32, direction="load") ].eval_with_dict(params) - f64lall = reduced_map[lp.MemAccess('global', np.float64, direction='load') + f64lall = reduced_map[lp.MemAccess("global", np.float64, direction="load") ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_subgroups @@ -1264,7 +1264,7 @@ def test_summations_and_filters(): #for k, v in op_map.items(): # print(type(k), "\n", k.name, k.dtype, type(k.dtype), " :\n", v) - op_map_dtype = op_map.group_by('dtype') + op_map_dtype = op_map.group_by("dtype") f32 = op_map_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params) f64 = op_map_dtype[lp.Op(dtype=np.float64)].eval_with_dict(params) i32 = op_map_dtype[lp.Op(dtype=np.int32)].eval_with_dict(params) @@ -1272,7 +1272,7 @@ def test_summations_and_filters(): assert f64 == n*m assert i32 == n*m*2 - addsub_all = op_map.filter_by(name=['add', 'sub']).eval_and_sum(params) + addsub_all = op_map.filter_by(name=["add", "sub"]).eval_and_sum(params) f32ops_all = op_map.filter_by(dtype=[np.float32]).eval_and_sum(params) assert addsub_all == n*m*ell + n*m*2 assert f32ops_all == n*m*ell*3 @@ -1280,16 +1280,16 @@ def test_summations_and_filters(): non_field = op_map.filter_by(xxx=[np.float32]).eval_and_sum(params) assert non_field == 0 - ops_nodtype = op_map.group_by('name') - ops_noname = op_map.group_by('dtype') - mul_all = ops_nodtype[lp.Op(name='mul')].eval_with_dict(params) + ops_nodtype = op_map.group_by("name") + ops_noname = op_map.group_by("dtype") + mul_all = ops_nodtype[lp.Op(name="mul")].eval_with_dict(params) f64ops_all = ops_noname[lp.Op(dtype=np.float64)].eval_with_dict(params) assert mul_all == n*m*ell + n*m assert f64ops_all == n*m def func_filter(key): return key.lid_strides == {} and key.dtype == to_loopy_type(np.float64) and \ - key.direction == 'load' + key.direction == "load" f64l = mem_map.filter_by_func(func_filter).eval_and_sum(params) # uniform: (count-per-sub-group)*n_subgroups @@ -1313,7 +1313,7 @@ def test_strided_footprint(): knl = lp.split_iname(knl, "i_inner", bx, outer_tag="unr", inner_tag="l.0") footprints = lp.gather_access_footprints(knl) - x_l_foot = footprints[('x', 'read')] + x_l_foot = footprints[("x", "read")] from loopy.statistics import count num = count(knl, x_l_foot).eval_with_dict(param_dict) diff --git a/test/test_target.py b/test/test_target.py index e27f6a32a3e84ce29ac9b6d0c817c989ee75058e..38f2017cac73efb56addd13191a510b4991941ba 100644 --- a/test/test_target.py +++ b/test/test_target.py @@ -48,7 +48,7 @@ from pyopencl.tools import pytest_generate_tests_for_pyopencl \ __all__ = [ "pytest_generate_tests", - "cl" # 'cl.create_some_context' + "cl" # "cl.create_some_context" ] @@ -279,10 +279,10 @@ def test_numba_cuda_target(): target=lp.NumbaCudaTarget()) knl = lp.assume(knl, "M>0") - knl = lp.split_iname(knl, "i", 16, outer_tag='g.0') - knl = lp.split_iname(knl, "j", 128, inner_tag='l.0', slabs=(0, 1)) + knl = lp.split_iname(knl, "i", 16, outer_tag="g.0") + knl = lp.split_iname(knl, "j", 128, inner_tag="l.0", slabs=(0, 1)) knl = lp.add_prefetch(knl, "X[i,:]", - fetch_outer_inames='i_inner, i_outer, j_inner', + fetch_outer_inames="i_inner, i_outer, j_inner", default_tag="l.auto") knl = lp.fix_parameters(knl, N=3) knl = lp.prioritize_loops(knl, "i_inner,j_outer") @@ -327,7 +327,7 @@ def test_child_invalid_type_cast(): def test_target_invalid_type_cast(): - dtype = np.dtype([('', '