diff --git a/doc/reference.rst b/doc/reference.rst index 1c9dda7b5db4831534809ff59ef76216cd21b5cc..e79f17554119c5efe9351f46b8c501c2c27d2387 100644 --- a/doc/reference.rst +++ b/doc/reference.rst @@ -535,7 +535,7 @@ Obtaining Kernel Statistics .. autofunction:: get_op_poly -.. autofunction:: get_DRAM_access_poly +.. autofunction:: get_gmem_access_poly .. autofunction:: get_barrier_poly diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 40f171356cda9ec8106f39c1c786efbd8757fcea..ca22c2a64beb69d990797537cae272b587b182f6 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1256,14 +1256,14 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`: Counting array accesses ~~~~~~~~~~~~~~~~~~~~~~~ -:func:`loopy.get_DRAM_access_poly` provides information on the number and type of +:func:`loopy.get_gmem_access_poly` provides information on the number and type of array loads and stores being performed in a kernel. To demonstrate this, we'll continue using the kernel from the previous example: .. doctest:: - >>> from loopy.statistics import get_DRAM_access_poly - >>> load_store_map = get_DRAM_access_poly(knl) + >>> from loopy.statistics import get_gmem_access_poly + >>> load_store_map = get_gmem_access_poly(knl) >>> print(load_store_map) (dtype('float32'), 'uniform', 'load') : [n, m, l] -> { 3 * n * m * l : n >= 1 and m >= 1 and l >= 1 } (dtype('float32'), 'uniform', 'store') : [n, m, l] -> { n * m * l : n >= 1 and m >= 1 and l >= 1 } @@ -1271,7 +1271,7 @@ continue using the kernel from the previous example: (dtype('float64'), 'uniform', 'store') : [n, m, l] -> { n * m : n >= 1 and m >= 1 and l >= 1 } <BLANKLINE> -:func:`loopy.get_DRAM_access_poly` returns a mapping of **{(** +:func:`loopy.get_gmem_access_poly` returns a mapping of **{(** :class:`numpy.dtype` **,** :class:`string` **,** :class:`string` **)** **:** :class:`islpy.PwQPolynomial` **}**. @@ -1313,7 +1313,7 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`: ~~~~~~~~~~~ Since we have not tagged any of the inames or parallelized the kernel across threads -(which would have produced iname tags), :func:`loopy.get_DRAM_access_poly` considers +(which would have produced iname tags), :func:`loopy.get_gmem_access_poly` considers the array accesses *uniform*. Now we'll parallelize the kernel and count the array accesses again. The resulting :class:`islpy.PwQPolynomial` will be more complicated this time, so we'll print the mapping manually to make it more legible: @@ -1321,7 +1321,7 @@ this time, so we'll print the mapping manually to make it more legible: .. doctest:: >>> knl_consec = lp.split_iname(knl, "k", 128, outer_tag="l.1", inner_tag="l.0") - >>> load_store_map = get_DRAM_access_poly(knl_consec) + >>> load_store_map = get_gmem_access_poly(knl_consec) >>> for key in sorted(load_store_map.dict.keys(), key=lambda k: str(k)): ... print("%s :\n%s\n" % (key, load_store_map.dict[key])) (dtype('float32'), 'consecutive', 'load') : @@ -1368,7 +1368,7 @@ our parallelization of the kernel: .. doctest:: >>> knl_nonconsec = lp.split_iname(knl, "k", 128, outer_tag="l.0", inner_tag="l.1") - >>> load_store_map = get_DRAM_access_poly(knl_nonconsec) + >>> load_store_map = get_gmem_access_poly(knl_nonconsec) >>> for key in sorted(load_store_map.dict.keys(), key=lambda k: str(k)): ... print("%s :\n%s\n" % (key, load_store_map.dict[key])) (dtype('float32'), 'nonconsecutive', 'load') : diff --git a/loopy/__init__.py b/loopy/__init__.py index d24f507c2fb1e87cca3e9cd7b8dd67778bbf6253..8956856d4735a9554ed8c34741790ef1286d9e54 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -63,7 +63,8 @@ from loopy.padding import (split_arg_axis, find_padding_multiple, from loopy.preprocess import (preprocess_kernel, realize_reduction, infer_unknown_types) from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel -from loopy.statistics import get_op_poly, get_DRAM_access_poly, get_barrier_poly +from loopy.statistics import (get_op_poly, get_gmem_access_poly, + get_DRAM_access_poly, get_barrier_poly) from loopy.codegen import generate_code, generate_body from loopy.compiled import CompiledKernel from loopy.options import Options @@ -103,7 +104,8 @@ __all__ = [ "generate_loop_schedules", "get_one_scheduled_kernel", "generate_code", "generate_body", - "get_op_poly", "get_DRAM_access_poly", "get_barrier_poly", + "get_op_poly", "get_gmem_access_poly", "get_DRAM_access_poly", + "get_barrier_poly", "CompiledKernel", diff --git a/loopy/statistics.py b/loopy/statistics.py index 76345f097b9c3100a9a74ece5b65af94ec276454..d25ea3eaca44187e8ad7020b6ddfd9e5bedc95bc 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -55,7 +55,7 @@ class ToCountMap: "to {} {}. ToCountMap may only be added to " "0 and other ToCountMap objects." .format(type(other), other)) - return + return self def __mul__(self, other): @@ -207,7 +207,7 @@ class ExpressionOpCounter(CombineMapper): "map_slice not implemented.") -class ExpressionSubscriptCounter(CombineMapper): +class GlobalSubscriptCounter(CombineMapper): def __init__(self, knl): self.knl = knl @@ -345,12 +345,12 @@ class ExpressionSubscriptCounter(CombineMapper): map_logical_and = map_logical_or def map_if(self, expr): - warnings.warn("ExpressionSubscriptCounter counting DRAM accesses as " + warnings.warn("GlobalSubscriptCounter counting DRAM accesses as " "sum of if-statement branches.") return self.rec(expr.condition) + self.rec(expr.then) + self.rec(expr.else_) def map_if_positive(self, expr): - warnings.warn("ExpressionSubscriptCounter counting DRAM accesses as " + warnings.warn("GlobalSubscriptCounter counting DRAM accesses as " "sum of if_pos-statement branches.") return self.rec(expr.criterion) + self.rec(expr.then) + self.rec(expr.else_) @@ -358,22 +358,22 @@ class ExpressionSubscriptCounter(CombineMapper): map_max = map_min def map_common_subexpression(self, expr): - raise NotImplementedError("ExpressionSubscriptCounter encountered " + raise NotImplementedError("GlobalSubscriptCounter encountered " "common_subexpression, " "map_common_subexpression not implemented.") def map_substitution(self, expr): - raise NotImplementedError("ExpressionSubscriptCounter encountered " + raise NotImplementedError("GlobalSubscriptCounter encountered " "substitution, " "map_substitution not implemented.") def map_derivative(self, expr): - raise NotImplementedError("ExpressionSubscriptCounter encountered " + raise NotImplementedError("GlobalSubscriptCounter encountered " "derivative, " "map_derivative not implemented.") def map_slice(self, expr): - raise NotImplementedError("ExpressionSubscriptCounter encountered slice, " + raise NotImplementedError("GlobalSubscriptCounter encountered slice, " "map_slice not implemented.") @@ -450,9 +450,8 @@ def get_op_poly(knl): return op_poly -def get_DRAM_access_poly(knl): # for now just counting subscripts - - """Count the number of DRAM accesses in a loopy kernel. +def get_gmem_access_poly(knl): # for now just counting subscripts + """Count the number of global memory accesses in a loopy kernel. :parameter knl: A :class:`loopy.LoopKernel` whose DRAM accesses are to be counted. @@ -477,7 +476,7 @@ def get_DRAM_access_poly(knl): # for now just counting subscripts # (first create loopy kernel and specify array data types) - subscript_map = get_DRAM_access_poly(knl) + subscript_map = get_gmem_access_poly(knl) params = {'n': 512, 'm': 256, 'l': 128} f32_uncoalesced_load = subscript_map.dict[ @@ -499,7 +498,7 @@ def get_DRAM_access_poly(knl): # for now just counting subscripts knl = preprocess_kernel(knl) subs_poly = 0 - subscript_counter = ExpressionSubscriptCounter(knl) + subscript_counter = GlobalSubscriptCounter(knl) for insn in knl.instructions: insn_inames = knl.insn_inames(insn) inames_domain = knl.get_inames_domain(insn_inames) @@ -518,6 +517,13 @@ def get_DRAM_access_poly(knl): # for now just counting subscripts return subs_poly +def get_DRAM_access_poly(knl): + from warnings import warn + warn("get_DRAM_access_poly is deprecated. Use get_gmem_access_poly instead", + DeprecationWarning, stacklevel=2) + return get_gmem_access_poly(knl) + + def get_barrier_poly(knl): """Count the number of barriers each thread encounters in a loopy kernel. diff --git a/test/test_statistics.py b/test/test_statistics.py index 87ed797e74fd709c29ad9d763e195ff46985ed96..a58ce6d582a8d03d622028156adff35c61009bc0 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -27,7 +27,7 @@ from pyopencl.tools import ( # noqa pytest_generate_tests_for_pyopencl as pytest_generate_tests) import loopy as lp -from loopy.statistics import get_op_poly, get_DRAM_access_poly, get_barrier_poly +from loopy.statistics import get_op_poly, get_gmem_access_poly, get_barrier_poly import numpy as np @@ -185,7 +185,7 @@ def test_op_counter_triangular_domain(): assert flops == 78 -def test_DRAM_access_counter_basic(): +def test_gmem_access_counter_basic(): knl = lp.make_kernel( "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", @@ -199,7 +199,7 @@ def test_DRAM_access_counter_basic(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - poly = get_DRAM_access_poly(knl) + poly = get_gmem_access_poly(knl) n = 512 m = 256 l = 128 @@ -222,7 +222,7 @@ def test_DRAM_access_counter_basic(): assert f64 == n*m -def test_DRAM_access_counter_reduction(): +def test_gmem_access_counter_reduction(): knl = lp.make_kernel( "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", @@ -232,7 +232,7 @@ def test_DRAM_access_counter_reduction(): name="matmul", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32)) - poly = get_DRAM_access_poly(knl) + poly = get_gmem_access_poly(knl) n = 512 m = 256 l = 128 @@ -247,7 +247,7 @@ def test_DRAM_access_counter_reduction(): assert f32 == n*l -def test_DRAM_access_counter_logic(): +def test_gmem_access_counter_logic(): knl = lp.make_kernel( "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", @@ -259,7 +259,7 @@ def test_DRAM_access_counter_logic(): name="logic", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64)) - poly = get_DRAM_access_poly(knl) + poly = get_gmem_access_poly(knl) n = 512 m = 256 l = 128 @@ -278,7 +278,7 @@ def test_DRAM_access_counter_logic(): assert f64 == n*m -def test_DRAM_access_counter_specialops(): +def test_gmem_access_counter_specialops(): knl = lp.make_kernel( "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", @@ -292,7 +292,7 @@ def test_DRAM_access_counter_specialops(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - poly = get_DRAM_access_poly(knl) + poly = get_gmem_access_poly(knl) n = 512 m = 256 l = 128 @@ -315,7 +315,7 @@ def test_DRAM_access_counter_specialops(): assert f64 == n*m -def test_DRAM_access_counter_bitwise(): +def test_gmem_access_counter_bitwise(): knl = lp.make_kernel( "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", @@ -332,7 +332,7 @@ def test_DRAM_access_counter_bitwise(): a=np.int32, b=np.int32, g=np.int32, h=np.int32)) - poly = get_DRAM_access_poly(knl) + poly = get_gmem_access_poly(knl) n = 512 m = 256 l = 128 @@ -347,7 +347,7 @@ def test_DRAM_access_counter_bitwise(): assert i32 == n*m+n*m*l -def test_DRAM_access_counter_mixed(): +def test_gmem_access_counter_mixed(): knl = lp.make_kernel( "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", @@ -363,7 +363,7 @@ def test_DRAM_access_counter_mixed(): knl = lp.split_iname(knl, "j", 16) knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"}) - poly = get_DRAM_access_poly(knl) # noqa + poly = get_gmem_access_poly(knl) # noqa n = 512 m = 256 l = 128 @@ -386,7 +386,7 @@ def test_DRAM_access_counter_mixed(): assert f32nonconsec == n*m*l -def test_DRAM_access_counter_nonconsec(): +def test_gmem_access_counter_nonconsec(): knl = lp.make_kernel( "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", @@ -402,7 +402,7 @@ def test_DRAM_access_counter_nonconsec(): knl = lp.split_iname(knl, "i", 16) knl = lp.tag_inames(knl, {"i_inner": "l.0", "i_outer": "g.0"}) - poly = get_DRAM_access_poly(knl) # noqa + poly = get_gmem_access_poly(knl) # noqa n = 512 m = 256 l = 128 @@ -425,7 +425,7 @@ def test_DRAM_access_counter_nonconsec(): assert f32nonconsec == n*m*l -def test_DRAM_access_counter_consec(): +def test_gmem_access_counter_consec(): knl = lp.make_kernel( "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", @@ -440,7 +440,7 @@ def test_DRAM_access_counter_consec(): a=np.float32, b=np.float32, g=np.float64, h=np.float64)) knl = lp.tag_inames(knl, {"k": "l.0", "i": "g.0", "j": "g.1"}) - poly = get_DRAM_access_poly(knl) + poly = get_gmem_access_poly(knl) n = 512 m = 256 l = 128 @@ -541,7 +541,7 @@ def test_all_counters_parallel_matmul(): assert f32ops == n*m*l*2 assert i32ops == n*m*l*4 + l*n*4 - subscript_map = get_DRAM_access_poly(knl) + subscript_map = get_gmem_access_poly(knl) f32uncoal = subscript_map.dict[ (np.dtype(np.float32), 'nonconsecutive', 'load') ].eval_with_dict({'n': n, 'm': m, 'l': l})