From a1b33a1dc5bd6fdc31c6336578207119095e9963 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Tue, 4 Jun 2013 21:39:55 -0400
Subject: [PATCH] More of loopy's tests passing after switch to data tagging.

---
 MEMO                        |   8 +
 loopy/__init__.py           |   6 +-
 loopy/codegen/__init__.py   |  86 ++---
 loopy/codegen/expression.py | 163 +++++----
 loopy/compiled.py           |  72 +---
 loopy/kernel/__init__.py    |  36 +-
 loopy/kernel/array.py       | 711 ++++++++++++++++++++++++++++++++++++
 loopy/kernel/creation.py    |  23 +-
 loopy/kernel/data.py        | 241 ++++--------
 loopy/preprocess.py         |  27 +-
 test/test_loopy.py          |   4 +-
 11 files changed, 986 insertions(+), 391 deletions(-)
 create mode 100644 loopy/kernel/array.py

diff --git a/MEMO b/MEMO
index a92570307..b7a492cfc 100644
--- a/MEMO
+++ b/MEMO
@@ -50,13 +50,17 @@ To-do
 
 - when are link_inames, duplicate_inames safe?
 
+- rename IndexTag -> InameTag
+
 - Data implementation tags
   TODO initial bringup:
   - implemented_arg_info
   - Arg declaration
   - Temp var declaration
   - Adapt padding
+  - Adapt automatic padding of temp variables
   - loopy.compiled
+  - turn base_indices into offset
 
   TODO further:
   - vectorization
@@ -64,6 +68,10 @@ To-do
   - write_image()
   - change_arg_to_image (test!)
 
+  - automatic copies from an array with one set of tags
+    to the same array with another set.
+
+
 - Make tests run on GPUs
 
 Fixes:
diff --git a/loopy/__init__.py b/loopy/__init__.py
index a609ee599..60954f9cb 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -45,7 +45,7 @@ class LoopyAdvisory(UserWarning):
 # {{{ imported user interface
 
 from loopy.kernel.data import (
-        ValueArg, ScalarArg, GlobalArg, ArrayArg, ConstantArg, ImageArg,
+        ValueArg, GlobalArg, ConstantArg, ImageArg,
 
         default_function_mangler, single_arg_function_mangler,
         opencl_function_mangler,
@@ -802,7 +802,7 @@ def _process_footprint_subscripts(kernel, rule_name, sweep_inames,
         if not isinstance(fsub, tuple):
             fsub = (fsub,)
 
-        if len(fsub) != arg.dimensions:
+        if len(fsub) != arg.num_user_axes():
             raise ValueError("sweep index '%s' has the wrong number of dimensions")
 
         for subst_map in kernel.applied_iname_rewrites:
@@ -911,7 +911,7 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None,
     # {{{ make parameter names and unification template
 
     parameters = []
-    for i in range(arg.dimensions):
+    for i in range(arg.num_user_axes()):
         based_on = "%s_dim_%d" % (c_name, i)
         if dim_arg_names is not None and i < len(dim_arg_names):
             based_on = dim_arg_names[i]
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index a369ad4a7..750330db1 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -215,16 +215,25 @@ class POD(PODBase):
 # }}}
 
 
+class CLArgumentInfo(Record):
+    """
+    .. attribute:: name
+    .. attribute:: base_name
+    .. attribute:: dtype
+    .. attribute:: shape
+    .. attribute:: offset_for_name
+    """
+
+
 # {{{ main code generation entrypoint
 
 def generate_code(kernel, with_annotation=False,
         allow_complex=None):
     from cgen import (FunctionBody, FunctionDeclaration,
-            Value, ArrayOf, Module, Block,
+            Value, Module, Block,
             Line, Const, LiteralLines, Initializer)
 
-    from cgen.opencl import (CLKernel, CLGlobal, CLRequiredWorkGroupSize,
-            CLLocal, CLImage, CLConstant)
+    from cgen.opencl import (CLKernel, CLRequiredWorkGroupSize)
 
     allow_complex = False
     for var in kernel.args + list(kernel.temporary_variables.itervalues()):
@@ -246,46 +255,32 @@ def generate_code(kernel, with_annotation=False,
 
     # {{{ examine arg list
 
-    def restrict_ptr_if_not_nvidia(arg):
-        from cgen import Pointer, RestrictPointer
+    from loopy.kernel.data import ImageArg, ValueArg
+    from loopy.kernel.array import ArrayBase
 
-        if "nvidia" in kernel.device.platform.name.lower():
-            return Pointer(arg)
-        else:
-            return RestrictPointer(arg)
-
-    has_image = False
+    arg_decls = []
+    cl_arg_info = []
 
-    from loopy.kernel.data import GlobalArg, ConstantArg, ImageArg, ValueArg
-
-    args = []
     for arg in kernel.args:
-        if isinstance(arg, (ConstantArg, GlobalArg)):
-            arg_decl = restrict_ptr_if_not_nvidia(
-                    POD(arg.dtype, arg.name))
-            if arg_decl.name not in kernel.get_written_variables():
-                arg_decl = Const(arg_decl)
-            if isinstance(arg, ConstantArg):
-                arg_decl = CLConstant(arg_decl)
-            else:
-                arg_decl = CLGlobal(arg_decl)
-        elif isinstance(arg, ImageArg):
-            if arg.name in kernel.get_written_variables():
-                mode = "w"
-            else:
-                mode = "r"
-
-            arg_decl = CLImage(arg.dimensions, mode, arg.name)
+        if isinstance(arg, ArrayBase):
+            for cdecl, clai in arg.decl_info(
+                    is_written=arg.name in kernel.get_written_variables(),
+                    index_dtype=kernel.index_dtype):
+                arg_decls.append(cdecl)
+                cl_arg_info.append(clai)
 
-            has_image = True
         elif isinstance(arg, ValueArg):
-            arg_decl = Const(POD(arg.dtype, arg.name))
+            arg_decls.append(Const(POD(arg.dtype, arg.name)))
+            cl_arg_info.append(CLArgumentInfo(
+                name=arg.name,
+                base_name=arg.name,
+                dtype=arg.dtype,
+                shape=None))
+
         else:
             raise ValueError("argument type not understood: '%s'" % type(arg))
 
-        args.append(arg_decl)
-
-    if has_image:
+    if any(isinstance(arg, ImageArg) for arg in kernel.args):
         body.append(Initializer(Const(Value("sampler_t", "loopy_sampler")),
             "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP "
                 "| CLK_FILTER_NEAREST"))
@@ -303,20 +298,9 @@ def generate_code(kernel, with_annotation=False,
     # {{{ build lmem array declarators for temporary variables
 
     for tv in kernel.temporary_variables.itervalues():
-        temp_var_decl = POD(tv.dtype, tv.name)
-
-        try:
-            storage_shape = tv.storage_shape
-        except AttributeError:
-            storage_shape = tv.shape
-
-        for l in storage_shape:
-            temp_var_decl = ArrayOf(temp_var_decl, l)
-
-        if tv.is_local:
-            temp_var_decl = CLLocal(temp_var_decl)
-
-        body.append(temp_var_decl)
+        for cdecl, clai in tv.decl_info(
+                is_written=True, index_dtype=kernel.index_dtype):
+            body.append(cdecl)
 
     # }}}
 
@@ -339,7 +323,7 @@ def generate_code(kernel, with_annotation=False,
             CLRequiredWorkGroupSize(
                 kernel.get_grid_sizes_as_exprs()[1],
                 CLKernel(FunctionDeclaration(
-                    Value("void", kernel.name), args))),
+                    Value("void", kernel.name), arg_decls))),
             body))
 
     # {{{ handle preambles
@@ -374,7 +358,7 @@ def generate_code(kernel, with_annotation=False,
     assert check_implemented_domains(kernel, gen_code.implemented_domains,
             result)
 
-    return result
+    return result, cl_arg_info
 
 # }}}
 
diff --git a/loopy/codegen/expression.py b/loopy/codegen/expression.py
index c0f333d5b..6baae343e 100644
--- a/loopy/codegen/expression.py
+++ b/loopy/codegen/expression.py
@@ -242,6 +242,19 @@ def dtype_to_type_context(dtype):
     return None
 
 
+VEC_AXES = "xyzw"
+
+
+def get_opencl_vec_member(idx):
+    """Return the OpenCL component name for vector index *idx* (or None)."""
+    if idx is None:
+        return idx
+    if idx < len(VEC_AXES):
+        return VEC_AXES[idx]
+    # numeric component selectors are single hex digits: s0..s9, sa..sf
+    return "s%x" % idx
+
+
 class LoopyCCodeMapper(RecursiveMapper):
     def __init__(self, kernel, seen_dtypes, seen_functions, var_subst_map={},
             with_annotation=False, allow_complex=False):
@@ -340,9 +353,17 @@ class LoopyCCodeMapper(RecursiveMapper):
                     enclosing_prec, type_context))
         elif expr.name in self.kernel.arg_dict:
             arg = self.kernel.arg_dict[expr.name]
-            from loopy.kernel.data import ShapedArg
-            if isinstance(arg, ShapedArg) and arg.shape == ():
-                return "*"+expr.name
+            from loopy.kernel.array import ArrayBase
+            if isinstance(arg, ArrayBase):
+                if arg.shape == ():
+                    if arg.offset:
+                        # FIXME
+                        raise NotImplementedError("in-memory scalar with offset")
+                    else:
+                        return "*"+expr.name
+                else:
+                    raise RuntimeError("unsubscripted reference to array '%s'"
+                            % expr.name)
 
         for mangler in self.kernel.symbol_manglers:
             result = mangler(expr.name)
@@ -374,87 +395,92 @@ class LoopyCCodeMapper(RecursiveMapper):
             return base_impl(expr, enclosing_prec, type_context)
 
         if expr.aggregate.name in self.kernel.arg_dict:
-            arg = self.kernel.arg_dict[expr.aggregate.name]
+            ary = self.kernel.arg_dict[expr.aggregate.name]
+        elif expr.aggregate.name in self.kernel.temporary_variables:
+            ary = self.kernel.temporary_variables[expr.aggregate.name]
+        else:
+            raise RuntimeError("nothing known about subscripted variable '%s'"
+                    % expr.aggregate.name)
 
-            from loopy.kernel.data import ImageArg
-            if isinstance(arg, ImageArg):
-                assert isinstance(expr.index, tuple)
-
-                base_access = ("read_imagef(%s, loopy_sampler, (float%d)(%s))"
-                        % (arg.name, arg.dimensions,
-                            ", ".join(self.rec(idx, PREC_NONE, 'i')
-                                for idx in expr.index[::-1])))
-
-                if arg.dtype == np.float32:
-                    return base_access+".x"
-                if arg.dtype in cl.array.vec.type_to_scalar_and_count:
-                    return base_access
-                elif arg.dtype == np.float64:
-                    return "as_double(%s.xy)" % base_access
-                else:
-                    raise NotImplementedError(
-                            "non-floating-point images not supported for now")
+        from loopy.kernel.array import ArrayBase
+        if not isinstance(ary, ArrayBase):
+            raise RuntimeError("subscripted variable '%s' is not an array"
+                    % expr.aggregate.name)
 
-            else:
-                # GlobalArg
-                index_expr = expr.index
-                if not isinstance(expr.index, tuple):
-                    index_expr = (index_expr,)
+        from loopy.kernel.array import get_access_info
+        from pymbolic import evaluate
 
-                if arg.strides is None:
-                    raise RuntimeError("index access to '%s' requires known "
-                            "strides" % arg.name)
+        access_info = get_access_info(ary, expr.index,
+                lambda expr: evaluate(expr, self.var_subst_map))
 
-                if len(arg.strides) != len(index_expr):
-                    raise RuntimeError("subscript to '%s' in '%s' has the wrong "
-                            "number of indices (got: %d, expected: %d)" % (
-                                expr.aggregate.name, expr,
-                                len(index_expr), len(arg.strides)))
+        vec_member = get_opencl_vec_member(access_info.vector_index)
 
-                from pymbolic.primitives import Subscript, Variable
-                if arg.offset:
-                    offset = Variable(arg.offset)
-                else:
-                    offset = 0
+        from loopy.kernel.data import ImageArg, GlobalArg, TemporaryVariable
 
-                return base_impl(
-                        Subscript(expr.aggregate, offset+sum(
-                            stride*expr_i for stride, expr_i in zip(
-                                arg.strides, index_expr))),
-                        enclosing_prec, type_context)
+        if isinstance(ary, ImageArg):
+            base_access = ("read_imagef(%s, loopy_sampler, (float%d)(%s))"
+                    % (ary.name, ary.dimensions,
+                        ", ".join(self.rec(idx, PREC_NONE, 'i')
+                            for idx in expr.index[::-1])))
+
+            if ary.dtype == np.float32:
+                return base_access+".x"
+            if ary.dtype in cl.array.vec.type_to_scalar_and_count:
+                return base_access
+            elif ary.dtype == np.float64:
+                return "as_double(%s.xy)" % base_access
+            else:
+                raise NotImplementedError(
+                        "non-floating-point images not supported for now")
+
+        elif isinstance(ary, (GlobalArg, TemporaryVariable)):
+            if len(access_info.subscripts) == 0:
+                if isinstance(ary, GlobalArg):
+                    # unsubscripted global args are pointers
+                    if vec_member is not None:
+                        return "%s%s->%s" % (
+                                expr.aggregate.name, access_info.array_suffix,
+                                vec_member)
+                    else:
+                        return "*" + expr.aggregate.name+access_info.array_suffix
+
+                else:
+                    # unsubscripted temp vars are scalars
+                    if vec_member is not None:
+                        return "%s%s.%s" % (
+                                expr.aggregate.name, access_info.array_suffix,
+                                vec_member)
+                    else:
+                        return expr.aggregate.name+access_info.array_suffix
 
-        elif expr.aggregate.name in self.kernel.temporary_variables:
-            temp_var = self.kernel.temporary_variables[expr.aggregate.name]
-            if isinstance(expr.index, tuple):
-                index = expr.index
             else:
-                index = (expr.index,)
+                subscript, = access_info.subscripts
+                result = self.parenthesize_if_needed(
+                        "%s[%s]" % (
+                            expr.aggregate.name+access_info.array_suffix,
+                            self.rec(subscript, PREC_NONE, 'i')),
+                        enclosing_prec, PREC_CALL)
 
-            return (temp_var.name + "".join("[%s]" % self.rec(idx, PREC_NONE, 'i')
-                for idx in index))
+                if vec_member:
+                    result += "."+vec_member
+
+                return result
 
         else:
-            raise RuntimeError(
-                    "nothing known about variable '%s'" % expr.aggregate.name)
+            assert False
 
     def map_linear_subscript(self, expr, enclosing_prec, type_context):
-        def base_impl(expr, enclosing_prec, type_context):
-            return self.parenthesize_if_needed(
-                    "%s[%s]" % (
-                        self.rec(expr.aggregate, PREC_CALL, type_context),
-                        self.rec(expr.index, PREC_NONE, 'i')),
-                    enclosing_prec, PREC_CALL)
-
         from pymbolic.primitives import Variable
         if not isinstance(expr.aggregate, Variable):
-            return base_impl(expr, enclosing_prec, type_context)
+                raise RuntimeError("linear indexing on non-variable: %s"
+                        % expr)
 
         if expr.aggregate.name in self.kernel.arg_dict:
             arg = self.kernel.arg_dict[expr.aggregate.name]
 
             from loopy.kernel.data import ImageArg
             if isinstance(arg, ImageArg):
-                raise RuntimeError("linear indexing doesn't work on images: %s"
+                raise RuntimeError("linear indexing is not supported on images: %s"
                         % expr)
 
             else:
@@ -464,13 +490,14 @@ class LoopyCCodeMapper(RecursiveMapper):
                 else:
                     offset = 0
 
-                from pymbolic.primitives import Subscript
-                return base_impl(
-                        Subscript(expr.aggregate, offset+expr.index),
-                        enclosing_prec, type_context)
+                return self.parenthesize_if_needed(
+                        "%s[%s]" % (
+                            expr.aggregate.name,
+                            self.rec(offset + expr.index, PREC_NONE, 'i')),
+                        enclosing_prec, PREC_CALL)
 
         elif expr.aggregate.name in self.kernel.temporary_variables:
-            raise RuntimeError("linear indexing doesn't work on temporaries: %s"
+            raise RuntimeError("linear indexing is not supported on temporaries: %s"
                     % expr)
 
         else:
diff --git a/loopy/compiled.py b/loopy/compiled.py
index beb6a6453..0089c77b0 100644
--- a/loopy/compiled.py
+++ b/loopy/compiled.py
@@ -122,7 +122,7 @@ class CompiledKernel:
         self.options = options
 
     @memoize_method
-    def get_kernel_info(self, arg_to_dtype_set, arg_to_has_offset_set):
+    def get_kernel_info(self, arg_to_dtype_set):
         kernel = self.kernel
 
         import loopy as lp
@@ -134,28 +134,6 @@ class CompiledKernel:
             from loopy.preprocess import infer_unknown_types
             kernel = infer_unknown_types(kernel, expect_completion=True)
 
-        if arg_to_has_offset_set:
-            arg_to_has_offset = dict(arg_to_has_offset_set)
-
-            vng = kernel.get_var_name_generator()
-
-            new_args = []
-            for arg in kernel.args:
-                if getattr(arg, "offset", None) is lp.auto:
-                    if arg_to_has_offset[arg.name]:
-                        offset_arg_name = vng(arg.name+"_offset")
-                        new_args.append(arg.copy(offset=offset_arg_name))
-                        new_args.append(
-                                lp.ValueArg(
-                                    offset_arg_name, kernel.index_dtype))
-                    else:
-                        new_args.append(arg.copy(offset=0))
-                else:
-                    new_args.append(arg)
-
-            kernel = kernel.copy(args=new_args)
-
-        import loopy as lp
         if kernel.schedule is None:
             kernel = _get_kernel_from_iterable(
                     lp.generate_loop_schedules(kernel))
@@ -179,14 +157,13 @@ class CompiledKernel:
                 )
 
     @memoize_method
-    def get_cl_kernel(self,
-            arg_to_dtype_set, arg_to_has_offset_set, code_op=False):
-        kernel_info = self.get_kernel_info(
-                arg_to_dtype_set, arg_to_has_offset_set)
+    def get_cl_kernel_info(self,
+            arg_to_dtype_set, code_op=False):
+        kernel_info = self.get_kernel_info(arg_to_dtype_set)
         kernel = kernel_info.kernel
 
         from loopy.codegen import generate_code
-        code = generate_code(kernel, **self.codegen_kwargs)
+        code, cl_arg_info = generate_code(kernel, **self.codegen_kwargs)
 
         if code_op == "print":
             print code
@@ -213,35 +190,34 @@ class CompiledKernel:
             print "[Loopy] "+70*"-"
             raise
 
-        from loopy.kernel.data import ValueArg
-
         arg_types = []
-        for arg in kernel.args:
-            if isinstance(arg, ValueArg):
-                arg_types.append(arg.dtype)
+        for arg_info in cl_arg_info:
+            if arg_info.shape is None:
+                arg_types.append(arg_info.dtype)
             else:
                 arg_types.append(None)
 
         cl_kernel.set_scalar_arg_dtypes(arg_types)
 
-        return kernel_info, cl_kernel
+        return kernel_info.copy(
+                cl_kernel=cl_kernel,
+                cl_arg_info=cl_arg_info)
 
     # {{{ debugging aids
 
-    def get_code(self, arg_to_dtype=None, arg_to_has_offset=None):
+    def get_code(self, arg_to_dtype=None):
         if arg_to_dtype is not None:
             arg_to_dtype = frozenset(arg_to_dtype.iteritems())
-        if arg_to_has_offset is not None:
-            arg_to_has_offset = frozenset(arg_to_has_offset.iteritems())
 
-        kernel_info = self.get_kernel_info(arg_to_dtype, arg_to_has_offset)
+        kernel_info = self.get_kernel_info(arg_to_dtype)
 
         from loopy.codegen import generate_code
-        return generate_code(kernel_info.kernel, **self.codegen_kwargs)
+        code, arg_info = generate_code(kernel_info.kernel, **self.codegen_kwargs)
+        return code
 
-    def get_highlighted_code(self, arg_to_dtype=None, arg_to_has_offset=None):
+    def get_highlighted_code(self, arg_to_dtype=None):
         return get_highlighted_code(
-                self.get_code(arg_to_dtype, arg_to_has_offset))
+                self.get_code(arg_to_dtype))
 
     @property
     def code(self):
@@ -274,7 +250,6 @@ class CompiledKernel:
         import loopy as lp
 
         arg_to_dtype = {}
-        arg_to_has_offset = {}
         for arg in self.kernel.args:
             val = kwargs.get(arg.name)
 
@@ -286,24 +261,15 @@ class CompiledKernel:
                 else:
                     arg_to_dtype[arg.name] = dtype
 
-            if getattr(arg, "offset", None) is lp.auto:
-                if val is not None and isinstance(val, cl_array.Array):
-                    has_offset = val.offset != 0
-                else:
-                    has_offset = False
-                arg_to_has_offset[arg.name] = has_offset
-
-        kernel_info, cl_kernel = self.get_cl_kernel(
+        kernel_info = self.get_cl_kernel_info(
                 frozenset(arg_to_dtype.iteritems()),
-                frozenset(arg_to_has_offset.iteritems()),
                 code_op)
         kernel = kernel_info.kernel
+        cl_kernel = kernel_info.cl_kernel
         del arg_to_dtype
 
         # }}}
 
-        import loopy as lp
-
         kwargs.update(
                 kernel.domain_parameter_finder()(kwargs))
 
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 750b6a58b..744805e9c 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -29,6 +29,7 @@ import numpy as np
 from pytools import Record, memoize_method
 import islpy as isl
 from islpy import dim_type
+import re
 
 from pytools import UniqueNameGenerator, generate_unique_possibilities
 
@@ -47,6 +48,39 @@ class CannotBranchDomainTree(RuntimeError):
     pass
 
 
+# {{{ unique var names
+
+def _is_var_name_conflicting_with_longer(name_a, name_b):
+    """Return True if *name_b* starts with *name_a* plus '_s<NUMBER>'.
+
+    Array dimensions implemented as separate arrays generate names by
+    appending '_s<NUMBER>', so such a pair of names must count as a clash.
+    Only the case of *name_b* being the longer name is handled here.
+    """
+    if not name_b.startswith(name_a):
+        return False
+    return re.match("^%s_s[0-9]+" % re.escape(name_a), name_b) is not None
+
+
+def _is_var_name_conflicting(name_a, name_b):
+    """Return True if the two names are equal or clash in either order."""
+    if name_a != name_b:
+        return (_is_var_name_conflicting_with_longer(name_a, name_b)
+                or _is_var_name_conflicting_with_longer(name_b, name_a))
+
+    return True
+
+
+class _UniqueVarNameGenerator(UniqueNameGenerator):
+    def is_name_conflicting(self, name):
+        """Return True if *name* clashes with any already-known name."""
+        from pytools import any
+        return any(_is_var_name_conflicting(name, taken)
+                for taken in self.existing_names)
+
+# }}}
+
+
 # {{{ loop kernel object
 
 class LoopKernel(Record):
@@ -246,7 +280,7 @@ class LoopKernel(Record):
                 | set(self.all_inames()))
 
     def get_var_name_generator(self):
-        return UniqueNameGenerator(self.all_variable_names())
+        return _UniqueVarNameGenerator(self.all_variable_names())
 
     def make_unique_instruction_id(self, insns=None, based_on="insn",
             extra_used_ids=set()):
diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
new file mode 100644
index 000000000..08da01b98
--- /dev/null
+++ b/loopy/kernel/array.py
@@ -0,0 +1,711 @@
+"""Implementation tagging of array axes."""
+
+from __future__ import division
+
+__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import re
+from pytools import Record, memoize_method
+
+import pyopencl as cl  # noqa
+import pyopencl.array  # noqa
+
+import numpy as np
+
+
+# {{{ array dimension tags
+
+class ArrayDimImplementationTag(Record):
+    """Base class for all array axis implementation tags."""
+
+
+class _StrideArrayDimTagBase(ArrayDimImplementationTag):
+    """Common base of the stride-based array axis tags."""
+
+
+class FixedStrideArrayDimTag(_StrideArrayDimTagBase):
+    """An arg dimension implementation tag for a fixed (potentially
+    symbolic) stride.
+
+    The stride is given in units of :attr:`ArrayBase.dtype`.
+
+    .. attribute :: target_axis
+
+        For objects (such as images) with more than one axis, *target_axis*
+        sets which of these indices is being targeted by this dimension.
+        Note that there may be multiple dim_tags with the same *target_axis*;
+        their contributions are combined additively.
+
+        Note that "normal" arrays only have one *target_axis*.
+    """
+
+    def __init__(self, stride, target_axis=0):
+        _StrideArrayDimTagBase.__init__(self, stride=stride, target_axis=target_axis)
+        self.stride = stride
+        self.target_axis = target_axis
+
+    def __str__(self):
+        return "stride:%s->%d" % (self.stride, self.target_axis)
+
+    def map_expr(self, mapper):
+        return self.copy(stride=mapper(self.stride))
+
+
+class ComputedStrideArrayDimTag(_StrideArrayDimTagBase):
+    """
+    :arg order: "C" or "F", indicating whether this argument dimension will be added
+        as faster-moving ("C") or more-slowly-moving ("F") than the previous
+        argument.
+    :arg pad_to: :attr:`ArrayBase.dtype` granularity to which to pad this dimension
+
+    This type of stride arg dim gets converted to :class:`FixedStrideArrayDimTag`
+    on input to :class:`ArrayBase` subclasses.
+    """
+
+    def __init__(self, order, pad_to=None, target_axis=0):
+        order = order.upper()
+        # Tuple membership: a substring test would also accept "" and "CF".
+        if order not in ("C", "F"):
+            raise ValueError("'order' must be either 'C' or 'F'")
+
+        _StrideArrayDimTagBase.__init__(self, order=order, pad_to=pad_to,
+                target_axis=target_axis)
+
+    def __str__(self):
+        if self.pad_to is None:
+            return self.order
+        return "%s(pad=%s)" % (self.order, self.pad_to)
+
+    def map_expr(self, mapper):
+        raise TypeError("ComputedStrideArrayDimTag is a transient type only used "
+                "for construction of arrays. It should never have to map its "
+                "expressions.")
+
+
+class SeparateArrayArrayDimTag(ArrayDimImplementationTag):
+    def __str__(self):
+        return "sep"
+
+    def map_expr(self, mapper):
+        return self
+
+
+class VectorArrayDimTag(ArrayDimImplementationTag):
+    def __str__(self):
+        return "vec"
+
+    def map_expr(self, mapper):
+        return self
+
+
+PADDED_STRIDE_TAG = re.compile(r"^([a-zA-Z]+)\(pad=(.*)\)$")
+TARGET_AXIS_RE = re.compile(r"->([0-9])$")
+
+
+def parse_array_dim_tag(tag):
+    """Turn *tag* (a tag object or a string) into a dim tag object.
+
+    Strings: "stride:EXPR", "sep", "vec", or "c"/"f" (any case), the latter
+    optionally written "ORDER(pad=EXPR)" and/or suffixed with "->DIGIT".
+    """
+    if isinstance(tag, ArrayDimImplementationTag):
+        return tag
+    if not isinstance(tag, str):
+        raise TypeError("arg dimension implementation tag must be "
+                "string or tag object")
+
+    # Imported up front: both the stride and the padded branch need it.
+    from loopy.symbolic import parse
+    if tag.startswith("stride:"):
+        return FixedStrideArrayDimTag(parse(tag[7:]))
+    elif tag == "sep":
+        return SeparateArrayArrayDimTag()
+    elif tag == "vec":
+        return VectorArrayDimTag()
+
+    target_axis_match = TARGET_AXIS_RE.search(tag)
+    if target_axis_match is not None:
+        target_axis = int(target_axis_match.group(1))
+        tag = tag[:target_axis_match.start()]
+    else:
+        target_axis = 0
+
+    if tag in ["c", "C", "f", "F"]:
+        return ComputedStrideArrayDimTag(tag, target_axis=target_axis)
+    padded_stride_match = PADDED_STRIDE_TAG.match(tag)
+    if padded_stride_match is None:
+        raise ValueError("invalid arg dim tag: '%s'" % tag)
+    order = padded_stride_match.group(1)
+    pad = parse(padded_stride_match.group(2))
+    if order not in ["c", "C", "f", "F"]:
+        raise ValueError("invalid arg dim tag: '%s'" % tag)
+    return ComputedStrideArrayDimTag(order, pad, target_axis=target_axis)
+
+
+def parse_array_dim_tags(dim_tags):
+    """Return a list of dim tags with every string entry parsed.
+
+    *dim_tags* may itself be one comma-separated string.
+    """
+    if isinstance(dim_tags, str):
+        dim_tags = dim_tags.split(",")
+
+    return [parse_array_dim_tag(entry) if isinstance(entry, str) else entry
+            for entry in dim_tags]
+
+
+def convert_computed_to_fixed_dim_tags(name, num_user_axes, num_target_axes,
+        shape, dim_tags):
+    """Return *dim_tags* with computed-stride tags replaced by fixed strides.
+
+    User axes are user-facing--what the user actually uses for indexing.
+    Target axes are implementation-facing: normal in-memory arrays have one,
+    3D images have three.
+    Returns *None* if a computed stride is needed while *shape* is *None*.
+    """
+
+    # {{{ pick apart arg dim tags into computed, fixed and vec
+
+    vector_dim = None
+
+    # one list of indices into dim_tags for each target axis
+    computed_stride_dim_tags = [[] for i in range(num_target_axes)]
+    fixed_stride_dim_tags = [[] for i in range(num_target_axes)]
+
+    for i, dt in enumerate(dim_tags):
+        if isinstance(dt, VectorArrayDimTag):
+            if vector_dim is not None:
+                raise ValueError("arg '%s' may only have one vector-tagged "
+                        "argument dimension" % name)
+
+            vector_dim = i
+
+        elif isinstance(dt, FixedStrideArrayDimTag):
+            fixed_stride_dim_tags[dt.target_axis].append(i)
+
+        elif isinstance(dt, ComputedStrideArrayDimTag):
+            if dt.order in "cC":
+                computed_stride_dim_tags[dt.target_axis].insert(0, i)
+            elif dt.order in "fF":
+                computed_stride_dim_tags[dt.target_axis].append(i)
+            else:
+                raise ValueError("invalid value '%s' for "
+                        "ComputedStrideArrayDimTag.order" % dt.order)
+
+        elif isinstance(dt, SeparateArrayArrayDimTag):
+            pass
+
+        else:
+            raise ValueError("invalid array dim tag")
+
+    # }}}
+
+    # {{{ convert computed to fixed stride dim tags
+
+    new_dim_tags = dim_tags[:]
+
+    for target_axis in range(num_target_axes):
+        if (computed_stride_dim_tags[target_axis]
+                and fixed_stride_dim_tags[target_axis]):
+            error_msg = "computed and fixed stride arg dim tags may " \
+                    "not be mixed for argument '%s'" % name
+
+            if num_target_axes > 1:
+                error_msg += " (target axis %d)" % target_axis
+
+            raise ValueError(error_msg)
+
+        stride_so_far = 1
+
+        if fixed_stride_dim_tags[target_axis]:
+            for i in fixed_stride_dim_tags[target_axis]:
+                # already a fixed stride: carried over unchanged
+                new_dim_tags[i] = dim_tags[i]
+        else:
+            for i in computed_stride_dim_tags[target_axis]:
+                dt = dim_tags[i]
+                # keep the tag on its own target axis (default would be 0)
+                new_dim_tags[i] = FixedStrideArrayDimTag(
+                        stride_so_far, target_axis=target_axis)
+                if shape is None:
+                    # unable to normalize without known shape
+                    return None
+
+                stride_so_far *= shape[i]
+
+                if dt.pad_to is not None:
+                    # round the running stride up to a multiple of pad_to
+                    from pytools import div_ceil
+                    stride_so_far = div_ceil(stride_so_far, dt.pad_to) * dt.pad_to
+
+    # }}}
+
+    return new_dim_tags
+
+# }}}
+
+
+# {{{ array base class (for arguments and temporary arrays)
+
+def _pymbolic_parse_if_necessary(x):
+    # Strings get parsed by pymbolic; anything else is passed through as-is.
+    if not isinstance(x, str):
+        return x
+    from pymbolic import parse
+    return parse(x)
+
+
+def _parse_shape_or_strides(x):
+    """Normalize a shape/strides spec into a tuple of parsed entries."""
+    import loopy as lp
+    if x == "auto":
+        from warnings import warn
+        warn("use of 'auto' as a shape or stride won't work "
+                "any more--use loopy.auto instead", stacklevel=3)
+    parsed = _pymbolic_parse_if_necessary(x)
+    if isinstance(parsed, lp.auto):
+        return parsed
+    if not isinstance(parsed, tuple):
+        assert parsed is not lp.auto
+        parsed = (parsed,)
+
+    return tuple(_pymbolic_parse_if_necessary(entry) for entry in parsed)
+
+
+class ArrayBase(Record):
+    """
+    .. attribute :: name
+
+    .. attribute :: dtype
+
+    .. attribute :: shape
+
+    .. attribute:: dim_tags
+
+        a list of :class:`ArrayDimImplementationTag` instances.
+        or a list of strings that :func:`parse_array_dim_tag` understands,
+        or a comma-separated string of such tags.
+
+    .. attribute:: offset
+
+    """
+
+    # Note that order may also wind up in attributes, if the
+    # number of dimensions has not yet been determined.
+
+    def __init__(self, name, dtype=None, shape=None, dim_tags=None, offset=0,
+            strides=None, order=None, **kwargs):
+        """
+        All of the following are optional. Specify either strides or shape.
+
+        :arg name: May contain multiple names separated by
+            commas, in which case multiple arguments,
+            each with identical properties, are created
+            for each name.
+        :arg dtype: the :class:`numpy.dtype` of the array.
+            If this is *None*, :mod:`loopy` will try to continue
+            without knowing the type of this array.
+
+            Note that some operations, such as :func:`loopy.add_padding`
+            will not work without the *dtype*.
+
+            :class:`loopy.CompiledKernel` will automatically compile a kernel
+            with the right dtype when called with a concrete array on a kernel
+            with argument whose *dtype* is *None*.
+        :arg shape: May be one of the following:
+
+            * *None*. In this case, no shape is intended to be specified,
+              only the strides will be used to access the array. Bounds checking
+              will not be performed.
+
+            * :class:`loopy.auto`. The shape will be determined by finding the
+              access footprint.
+
+            * a tuple like :attr:`numpy.ndarray.shape`.
+
+              Each entry of the tuple is also allowed to be a :mod:`pymbolic`
+              expression involving kernel parameters, or a string that can
+              be parsed to such an expression.
+
+            * A (potentially comma-separated) string which can be parsed
+
+        :arg dim_tags: A comma-separated list of tags as understood by
+            :func:`parse_array_dim_tag`.
+
+        :arg strides: May be one of the following:
+
+            * None
+
+            * :class:`loopy.auto`. The strides will be determined by *order*
+              and the access footprint.
+
+            * a tuple like :attr:`numpy.ndarray.strides` (in units of entries).
+
+              Each entry of the tuple is also allowed to be a :mod:`pymbolic`
+              expression involving kernel parameters, or a string that can
+              be parsed to such an expression.
+
+            * A (potentially comma-separated) string which can be parsed
+
+        :arg order: "F" or "C" for C (row major) or Fortran
+            (column major). Defaults to the *default_order* argument
+            passed to :func:`loopy.make_kernel`.
+        :arg offset: Offset from the beginning of the buffer to the point from
+            which the strides are counted. May be one of
+
+            * 0
+            * a string (that is interpreted as an argument name).
+            * :class:`loopy.auto`, in which case an offset argument
+              is added automatically, immediately following this argument.
+              :class:`loopy.CompiledKernel` is even smarter in its treatment of
+              this case and will compile custom versions of the kernel based on
+              whether the passed arrays have offsets or not.
+        """
+
+        import loopy as lp
+
+        if dtype is not None and dtype is not lp.auto:
+            dtype = np.dtype(dtype)
+
+        strides_known = strides is not None and strides is not lp.auto
+        shape_known = shape is not None and shape is not lp.auto
+
+        if strides_known:
+            strides = _parse_shape_or_strides(strides)
+
+        if shape_known:
+            shape = _parse_shape_or_strides(shape)
+
+        # {{{ convert strides to dim_tags (Note: strides override order)
+
+        if dim_tags is not None and strides_known:
+            raise TypeError("may not specify both strides and dim_tags")
+
+        if dim_tags is None and strides_known:
+            dim_tags = [FixedStrideArrayDimTag(s) for s in strides]
+            strides = None
+
+        # }}}
+
+        # {{{ determine number of user axes
+
+        num_user_axes = None
+        if shape_known:
+            num_user_axes = len(shape)
+        if dim_tags is not None:
+            new_num_user_axes = len(dim_tags)
+
+            if num_user_axes is None:
+                num_user_axes = new_num_user_axes
+            else:
+                if new_num_user_axes != num_user_axes:
+                    raise ValueError("contradictory values for number of dimensions "
+                            "from shape, strides, or dim_tags")
+
+            del new_num_user_axes
+
+        # }}}
+
+        # {{{ convert order to dim_tags
+
+        if dim_tags is None and num_user_axes is not None and order is not None:
+            dim_tags = num_user_axes*[order]
+            order = None
+
+        # }}}
+
+        if dim_tags is not None:
+            dim_tags = parse_array_dim_tags(dim_tags)
+
+            # {{{ find number of target axes
+
+            target_axes = set()
+            for dt in dim_tags:
+                if isinstance(dt, _StrideArrayDimTagBase):
+                    target_axes.add(dt.target_axis)
+
+            if target_axes != set(xrange(len(target_axes))):
+                raise ValueError("target axes for variable '%s' are non-"
+                        "contiguous" % name)
+
+            num_target_axes = len(target_axes)
+            del target_axes
+
+            # }}}
+
+            if not (self.min_target_axes <= num_target_axes <= self.max_target_axes):
+                raise ValueError("%s only supports between %d and %d target axes "
+                        "('%s' has %d)" % (type(self).__name__, self.min_target_axes,
+                            self.max_target_axes, name, num_target_axes))
+
+            new_dim_tags = convert_computed_to_fixed_dim_tags(
+                    name, num_user_axes, num_target_axes,
+                    shape, dim_tags)
+
+            if new_dim_tags is not None:
+                # successfully normalized
+                dim_tags = new_dim_tags
+                del new_dim_tags
+
+        if dim_tags is not None:
+            # for hashability
+            dim_tags = tuple(dim_tags)
+            order = None
+
+        Record.__init__(self,
+                name=name,
+                dtype=dtype,
+                shape=shape,
+                dim_tags=dim_tags,
+                offset=offset,
+                order=order,
+                strides=strides,
+                **kwargs)
+
+    def __str__(self):
+        import loopy as lp
+
+        parts = [type(self).__name__, str(self.dtype)]
+
+        if self.shape is lp.auto:
+            parts.append("shape: auto")
+        elif self.shape is not None:
+            shape_str = ",".join(str(axis_len) for axis_len in self.shape)
+            parts.append("shape: (%s)" % shape_str)
+
+        if self.dim_tags is not None:
+            tags_str = ",".join(str(tag) for tag in self.dim_tags)
+            parts.append("dim_tags: (%s)" % tags_str)
+
+        if self.offset:
+            parts.append("offset: %s" % self.offset)
+
+        # rendered as "<name>: <type>, <dtype>[, shape][, dim_tags][, offset]"
+        joined = ", ".join(parts)
+        return "%s: %s" % (self.name, joined)
+
+    def __repr__(self):
+        return "<" + self.__str__() + ">"
+
+    @property
+    @memoize_method
+    def numpy_strides(self):
+        return tuple(self.dtype.itemsize*s for s in self.strides)
+
+    def num_target_axes(self):
+        # Number of distinct target axes named by stride-carrying dim tags.
+        stride_tags = [dt for dt in self.dim_tags
+                if isinstance(dt, _StrideArrayDimTagBase)]
+        distinct_axes = set(dt.target_axis for dt in stride_tags)
+
+        return len(distinct_axes)
+
+    def num_user_axes(self, require_answer=True):
+        # The shape is consulted first, then the dim tags.
+        if self.shape is not None:
+            return len(self.shape)
+        if self.dim_tags is not None:
+            return len(self.dim_tags)
+        if not require_answer:
+            return None
+        raise RuntimeError("number of user axes of array '%s' cannot be found"
+                % self.name)
+
+    def map_exprs(self, mapper):
+        """Build a copy of this array in which *mapper* has been applied to
+        every shape entry and every dim tag.
+        """
+        import loopy as lp
+        changes = {}
+
+        if not (self.shape is None or self.shape is lp.auto):
+            changes["shape"] = tuple(mapper(s) for s in self.shape)
+
+        if self.dim_tags is not None:
+            changes["dim_tags"] = [dt.map_expr(mapper) for dt in self.dim_tags]
+
+        # offset is not an expression, do not map.
+
+        return self.copy(**changes)
+
+    def decl_info(self, is_written, index_dtype):
+        """Generate tuples ``(cgen_decl, arg_info)``, where *cgen_decl*
+        is a :mod:`cgen` argument declaration and *arg_info* is a
+        :class:`CLArgumentInfo` instance (offset arguments included).
+        """
+
+        from loopy.codegen import CLArgumentInfo
+
+        def gen_decls(name_suffix, shape, dtype, user_index):
+            if dtype is None:
+                dtype = self.dtype
+
+            user_axis = len(user_index)
+
+            num_user_axes = self.num_user_axes(require_answer=False)
+
+            if num_user_axes is None or user_axis >= num_user_axes:
+                # implemented by various argument types
+                yield (self.get_arg_decl(name_suffix, shape, dtype, is_written),
+                        CLArgumentInfo(
+                            name=self.name + name_suffix,
+                            base_name=self.name,
+                            dtype=dtype,
+                            shape=shape,
+                            offset_for_name=None))
+
+                if self.offset:
+                    from cgen import Const, POD
+                    # the offset is its own index-typed scalar argument
+                    full_name = self.name + name_suffix
+                    yield (Const(POD(index_dtype, full_name+"_offset")),
+                            CLArgumentInfo(
+                                name=full_name+"_offset",
+                                base_name=self.name,
+                                dtype=index_dtype, shape=None,
+                                offset_for_name=full_name))
+
+                return
+
+            dim_tag = self.dim_tags[user_axis]
+
+            if isinstance(dim_tag, FixedStrideArrayDimTag):
+                if self.shape is None:
+                    new_shape = shape + (None,)
+                else:
+                    new_shape = shape + (self.shape[user_axis],)
+
+                for res in gen_decls(name_suffix, new_shape, dtype,
+                        user_index + (None,)):
+                    yield res
+
+            elif isinstance(dim_tag, SeparateArrayArrayDimTag):
+                shape_i = self.shape[user_axis]
+                if not isinstance(shape_i, int):
+                    raise RuntimeError("shape of '%s' has non-constant "
+                            "integer axis %d (0-based)" % (
+                                self.name, user_axis))
+
+                for i in xrange(shape_i):
+                    for res in gen_decls(name_suffix + "_s%d" % i,
+                            shape + (self.shape[user_axis],), dtype,
+                            user_index + (i,)):
+                        yield res
+
+            elif isinstance(dim_tag, VectorArrayDimTag):
+                shape_i = self.shape[user_axis]
+                if not isinstance(shape_i, int):
+                    raise RuntimeError("shape of '%s' has non-constant "
+                            "integer axis %d (0-based)" % (
+                                self.name, user_axis))
+
+                for res in gen_decls(name_suffix, shape,
+                        cl.array.vec.types[dtype, shape_i],
+                        user_index + (None,)):
+                    yield res
+
+            else:
+                raise RuntimeError("unsupported array dim implementation tag '%s' "
+                        "in array '%s'" % (dim_tag, self.name))
+
+        for res in gen_decls("", (), self.dtype, ()):
+            yield res
+
+# }}}
+
+
+# {{{ access code generation
+
+class AccessInfo(Record):
+    """
+    :ivar array_suffix:
+    :ivar vector_index:
+    :ivar subscripts: List of expressions, one for each target axis
+    """
+
+
+def get_access_info(ary, index, eval_expr):
+    """
+    :arg ary: an object of type :class:`ArrayBase`
+    :arg index: a tuple of indices (or one index) subscripting *ary*
+    :arg eval_expr: callable applied to separate-array and vector indices
+    """
+    if not isinstance(index, tuple):
+        index = (index,)
+
+    if ary.shape is None:
+        return AccessInfo(array_suffix="", subscripts=index, vector_index=None)
+
+    if len(ary.shape) != len(index):
+        raise RuntimeError("subscript to '%s[%s]' has the wrong "
+                "number of indices (got: %d, expected: %d)" % (
+                    ary.name, index, len(index), len(ary.shape)))
+
+    num_target_axes = ary.num_target_axes()
+
+    array_suffix = ""
+    vector_index = None
+    subscripts = [0] * num_target_axes
+
+    for i, (idx, dim_tag) in enumerate(zip(index, ary.dim_tags)):
+        if isinstance(dim_tag, FixedStrideArrayDimTag):
+            subscripts[dim_tag.target_axis] += dim_tag.stride*idx
+        elif isinstance(dim_tag, SeparateArrayArrayDimTag):
+            idx = eval_expr(idx)
+            if not isinstance(idx, int):
+                raise RuntimeError("subscript '%s[%s]' has non-constant "
+                        "index for separate-array axis %d (0-based)" % (
+                            ary.name, index, i))
+            array_suffix += "_s%d" % idx
+        elif isinstance(dim_tag, VectorArrayDimTag):
+            idx = eval_expr(idx)
+            if not isinstance(idx, int):
+                raise RuntimeError("subscript '%s[%s]' has non-constant "
+                        "index for vector axis %d (0-based)" % (
+                            ary.name, index, i))
+            assert vector_index is None
+            vector_index = idx
+        else:
+            raise RuntimeError("unsupported array dim implementation tag '%s' "
+                    "in array '%s'" % (dim_tag, ary.name))
+
+    from pymbolic import var
+    import loopy as lp
+    if ary.offset:
+        if num_target_axes > 1:
+            raise NotImplementedError("offsets for multiple image axes")
+
+        offset_name = ary.offset
+        if offset_name is lp.auto:
+            offset_name = ary.name+array_suffix+"_offset"
+
+        subscripts[0] = var(offset_name) + subscripts[0]
+
+    return AccessInfo(
+            array_suffix=array_suffix,
+            vector_index=vector_index,
+            subscripts=subscripts)
+
+# }}}
+
+# vim: fdm=marker
diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index 18c95fe98..cf136a9f4 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -694,8 +694,7 @@ def check_for_reduction_inames_duplication_requests(kernel):
 # {{{ duplicate arguments and expand defines in shapes
 
 def dup_args_and_expand_defines_in_shapes(kernel, defines):
-    import loopy as lp
-    from loopy.kernel.data import ShapedArg
+    from loopy.kernel.array import ArrayBase
     from loopy.kernel.creation import expand_defines_in_expr
 
     processed_args = []
@@ -705,14 +704,9 @@ def dup_args_and_expand_defines_in_shapes(kernel, defines):
                 continue
 
             new_arg = arg.copy(name=arg_name)
-            if isinstance(arg, ShapedArg):
-                if arg.shape is not None and arg.shape is not lp.auto:
-                    new_arg = new_arg.copy(
-                            shape=expand_defines_in_expr(arg.shape, defines))
-                if arg.strides is not None and arg.strides is not lp.auto:
-                    new_arg = new_arg.copy(
-                            strides=expand_defines_in_expr(
-                                arg.strides, defines))
+            if isinstance(arg, ArrayBase):
+                new_arg = arg.map_exprs(
+                        lambda expr: expand_defines_in_expr(expr, defines))
 
             processed_args.append(new_arg)
 
@@ -727,15 +721,14 @@ def guess_arg_shape_if_requested(kernel, default_order):
     new_args = []
 
     import loopy as lp
-    from loopy.kernel.data import ShapedArg
+    from loopy.kernel.array import ArrayBase
     from loopy.symbolic import SubstitutionRuleExpander, AccessRangeMapper
 
     submap = SubstitutionRuleExpander(kernel.substitutions,
             kernel.get_var_name_generator())
 
     for arg in kernel.args:
-        if isinstance(arg, ShapedArg) and (
-                arg.shape is lp.auto or arg.strides is lp.auto):
+        if isinstance(arg, ArrayBase) and arg.shape is lp.auto:
             armap = AccessRangeMapper(kernel, arg.name)
 
             for insn in kernel.instructions:
@@ -783,11 +776,11 @@ def guess_arg_shape_if_requested(kernel, default_order):
 # {{{ apply default_order to args
 
 def apply_default_order_to_args(kernel, default_order):
-    from loopy.kernel.data import ShapedArg
+    from loopy.kernel.array import ArrayBase
 
     processed_args = []
     for arg in kernel.args:
-        if isinstance(arg, ShapedArg):
+        if isinstance(arg, ArrayBase):
             arg = arg.copy(order=default_order)
         processed_args.append(arg)
 
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index 1daf63747..613996403 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -27,6 +27,7 @@ THE SOFTWARE.
 
 import numpy as np
 from pytools import Record, memoize_method
+from loopy.kernel.array import ArrayBase
 
 
 # {{{ iname tags
@@ -147,187 +148,61 @@ def parse_tag(tag):
 
 # {{{ arguments
 
-def make_strides(shape, order):
-    from pyopencl.compyte.array import (
-            f_contiguous_strides,
-            c_contiguous_strides)
-
-    if order == "F":
-        return f_contiguous_strides(1, shape)
-    elif order == "C":
-        return c_contiguous_strides(1, shape)
-    else:
-        raise ValueError("invalid order: %s" % order)
-
 
 class KernelArgument(Record):
     pass
 
 
-class ShapedArg(KernelArgument):
-    def __init__(self, name, dtype=None, shape=None, strides=None, order=None,
-            offset=0):
-        """
-        All of the following are optional. Specify either strides or shape.
-
-        :arg name: May contain multiple names separated by
-            commas, in which case multiple arguments,
-            each with identical properties are created
-            for each name.
-        :arg dtype: the :class:`numpy.dtype` of the array.
-            If this is *None*, :mod:`loopy` will try to continue
-            without knowing the type of this array.
-
-            Note that some operations, such as :func:`loopy.add_padding`
-            require this information to work.
-
-            :class:`loopy.CompiledKernel` will automatically compile a kernel
-            with the right dtype when called with a concrete array on a kernel
-            with argument whose *dtype* is *None*.
-        :arg shape: like :attr:`numpy.ndarray.shape`.
-            Also allowed to be :class:`loopy.auto`, in
-            which case shape is determined by finding the
-            access footprint.
-
-            This is also allowed to be an expression involving
-            kernel parameters, or a (potentially-comma separated)
-            string that can be parsed to such an expression.
-        :arg strides:  like :attr:`numpy.ndarray.strides`,
-            but in multiples of data type size.
-            Also allowed to be :class:`loopy.auto`, in which
-            case strides are determined from shape and
-            *default_order* of :func:`loopy.make_kernel`.
-
-            This is also allowed to be an expression involving
-            kernel parameters, or a (potentially-comma separated)
-            string that can be parsed to such an expression.
-        :arg order: "F" or "C" for C (row major) or Fortran
-            (column major)
-        :arg offset: Offset from the beginning of the buffer to the point from
-            which the strides are counted. May be one of
-
-            * 0
-            * a string (that is interpreted as an argument name).
-            * :class:`loopy.auto`, in which case an offset argument
-              is added automatically, immediately following this argument.
-              :class:`loopy.CompiledKernel` is even smarter in its treatment of
-              this case and will compile custom versions of the kernel based on
-              whether the passed arrays have offsets or not.
-        """
-        if dtype is not None:
-            dtype = np.dtype(dtype)
+class GlobalArg(ArrayBase, KernelArgument):
+    min_target_axes = 0
+    max_target_axes = 1
 
-        def parse_if_necessary(x):
-            if isinstance(x, str):
-                from pymbolic import parse
-                return parse(x)
-            else:
-                return x
-
-        def process_tuple(x):
-            if x == "auto":
-                from warnings import warn
-                warn("use of 'auto' as a shape or stride won't work "
-                        "any more--use loopy.auto instead",
-                        stacklevel=3)
-            x = parse_if_necessary(x)
-            if isinstance(x, lp.auto):
-                return x
-            if not isinstance(x, tuple):
-                assert x is not lp.auto
-                x = (x,)
-
-            return tuple(parse_if_necessary(xi) for xi in x)
-
-        import loopy as lp
-        strides_known = strides is not None and strides is not lp.auto
-        shape_known = shape is not None and shape is not lp.auto
-
-        if strides_known:
-            strides = process_tuple(strides)
-
-        if shape_known:
-            shape = process_tuple(shape)
-
-        if not strides_known and shape_known:
-            if len(shape) == 1:
-                # don't need order to know that
-                strides = (1,)
-            elif order is not None:
-                strides = make_strides(shape, order)
+    def get_arg_decl(self, name_suffix, shape, dtype, is_written):
+        from cgen import RestrictPointer, POD, Const
+        from cgen.opencl import CLGlobal
 
-        Record.__init__(self,
-                name=name,
-                dtype=dtype,
-                strides=strides,
-                offset=offset,
-                shape=shape)
+        arg_decl = RestrictPointer(
+                POD(dtype, self.name + name_suffix))
 
-    @property
-    @memoize_method
-    def numpy_strides(self):
-        return tuple(self.dtype.itemsize*s for s in self.strides)
+        if not is_written:
+            arg_decl = Const(arg_decl)
 
-    @property
-    def dimensions(self):
-        return len(self.strides)
+        return CLGlobal(arg_decl)
 
-    def __str__(self):
-        import loopy as lp
 
-        if self.shape is None:
-            shape = "unknown"
-        elif self.shape is lp.auto:
-            shape = "auto"
-        else:
-            shape = ",".join(str(i) for i in self.shape)
+class ConstantArg(ArrayBase, KernelArgument):
+    min_target_axes = 0
+    max_target_axes = 1
 
-        if self.strides is None:
-            strides = "unknown"
-        elif self.strides is lp.auto:
-            strides = "auto"
+    def get_arg_decl(self, name_suffix, shape, dtype, is_written):
+        if is_written:
+            mode = "w"
         else:
-            strides = ",".join(str(i) for i in self.strides)
+            mode = "r"
 
-        return "%s: %s, type: %s, shape: (%s), strides: (%s)" % (
-                self.name, type(self).__name__, self.dtype, shape,
-                strides)
+        from cgen.opencl import CLImage
+        return CLImage(self.num_target_axes(), mode, self.name+name_suffix)
 
-    def __repr__(self):
-        return "<%s>" % self.__str__()
 
+class ImageArg(ArrayBase, KernelArgument):
+    min_target_axes = 1
+    max_target_axes = 3
 
-class GlobalArg(ShapedArg):
-    pass
-
-
-class ConstantArg(ShapedArg):
-    pass
+    @property
+    def dimensions(self):
+        return len(self.dim_tags)
 
+    def get_arg_decl(self, name_suffix, shape, dtype, is_written):
+        from cgen import RestrictPointer, POD, Const
+        from cgen.opencl import CLConstant
 
-class ImageArg(KernelArgument):
-    def __init__(self, name, dtype=None, dimensions=None, shape=None):
-        dtype = np.dtype(dtype)
-        if shape is not None:
-            if dimensions is not None and dimensions != len(shape):
-                raise RuntimeError("cannot specify both shape and "
-                        "disagreeing dimensions in ImageArg")
-            dimensions = len(shape)
-        else:
-            if not isinstance(dimensions, int):
-                raise RuntimeError("ImageArg: dimensions must be an integer")
+        arg_decl = RestrictPointer(
+                POD(dtype, self.name + name_suffix))
 
-        Record.__init__(self,
-                dimensions=dimensions,
-                shape=shape,
-                dtype=dtype,
-                name=name)
+        if not is_written:
+            arg_decl = Const(arg_decl)
 
-    def __str__(self):
-        return "%s: ImageArg, type %s" % (self.name, self.dtype)
-
-    def __repr__(self):
-        return "<%s>" % self.__str__()
+        return CLConstant(arg_decl)
 
 
 class ValueArg(KernelArgument):
@@ -349,26 +224,25 @@ class ValueArg(KernelArgument):
 
 # {{{ temporary variable
 
-class TemporaryVariable(Record):
-    """
-    :ivar name:
-    :ivar dtype:
-    :ivar shape:
-    :ivar storage_shape:
-    :ivar base_indices:
-    :ivar is_local:
+class TemporaryVariable(ArrayBase):
+    __doc__ = ArrayBase.__doc__ + """
+    .. attribute:: storage_shape
+    .. attribute:: base_indices
+    .. attribute:: is_local
     """
 
-    def __init__(self, name, dtype, shape, is_local, base_indices=None,
-            storage_shape=None):
+    min_target_axes = 0
+    max_target_axes = 1
+
+    def __init__(self, name, dtype, shape, is_local,
+            dim_tags=None, offset=0, strides=None, order=None,
+            base_indices=None, storage_shape=None):
         if base_indices is None:
             base_indices = (0,) * len(shape)
 
-        if shape is not None and not isinstance(shape, tuple):
-            shape = tuple(shape)
-
-        Record.__init__(self, name=name, dtype=dtype, shape=shape, is_local=is_local,
-                base_indices=base_indices,
+        ArrayBase.__init__(self, name=name, dtype=dtype, shape=shape,
+                dim_tags=dim_tags, order="C",
+                base_indices=base_indices, is_local=is_local,
                 storage_shape=storage_shape)
 
     @property
@@ -376,6 +250,23 @@ class TemporaryVariable(Record):
         from pytools import product
         return product(si for si in self.shape)*self.dtype.itemsize
 
+    def get_arg_decl(self, name_suffix, shape, dtype, is_written):
+        from cgen import ArrayOf, POD
+        from cgen.opencl import CLLocal
+
+        temp_var_decl = POD(self.dtype, self.name)
+
+        # FIXME take into account storage_shape, or something like it
+        storage_shape = self.shape
+
+        for l in storage_shape:
+            temp_var_decl = ArrayOf(temp_var_decl, l)
+
+        if self.is_local:
+            temp_var_decl = CLLocal(temp_var_decl)
+
+        return temp_var_decl
+
 # }}}
 
 
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index 1dfba0cb0..09adb1278 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -480,7 +480,10 @@ def duplicate_private_temporaries_for_ilp(kernel):
         if shape is None:
             shape = ()
 
-        new_temp_vars[tv.name] = tv.copy(shape=shape + extra_shape)
+        new_temp_vars[tv.name] = tv.copy(shape=shape + extra_shape,
+                # Forget what you knew about data layout,
+                # create from scratch.
+                dim_tags=None)
 
     # }}}
 
@@ -957,27 +960,6 @@ def adjust_local_temp_var_storage(kernel):
 # }}}
 
 
-# {{{ add automatic offset arguments
-
-def add_auto_offset_args(kernel):
-    import loopy as lp
-
-    vng = kernel.get_var_name_generator()
-
-    new_args = []
-    for arg in kernel.args:
-        if getattr(arg, "offset", None) is lp.auto:
-            offset_arg_name = vng(arg.name+"_offset")
-            new_args.append(arg.copy(offset=offset_arg_name))
-            new_args.append(lp.ValueArg(offset_arg_name, kernel.index_dtype))
-        else:
-            new_args.append(arg)
-
-    return kernel.copy(args=new_args)
-
-# }}}
-
-
 def preprocess_kernel(kernel):
     from loopy.subst import expand_subst
     kernel = expand_subst(kernel)
@@ -1005,7 +987,6 @@ def preprocess_kernel(kernel):
     kernel = add_boostability_and_automatic_dependencies(kernel)
     kernel = limit_boostability(kernel)
     kernel = adjust_local_temp_var_storage(kernel)
-    kernel = add_auto_offset_args(kernel)
 
     return kernel
 
diff --git a/test/test_loopy.py b/test/test_loopy.py
index afc1f0559..5875912ba 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -681,7 +681,7 @@ def test_dependent_loop_bounds_2(ctx_factory):
             [
                 lp.GlobalArg("a_rowstarts", np.int32, shape=lp.auto),
                 lp.GlobalArg("a_indices", np.int32, shape=lp.auto),
-                lp.GlobalArg("a_values", dtype),
+                lp.GlobalArg("a_values", dtype, strides=(1,)),
                 lp.GlobalArg("ax", dtype, shape=lp.auto),
                 lp.ValueArg("n", np.int32),
                 ],
@@ -1117,7 +1117,7 @@ def test_array_with_offset(ctx_factory):
     a_full = cl.clrandom.rand(queue, (n, n), np.float64)
     a = a_full[3:10]
 
-    print cknl.get_highlighted_code({"a": a.dtype}, {"a": True, "b": False})
+    print cknl.get_highlighted_code({"a": a.dtype})
     evt, (b,) = cknl(queue, a=a)
 
     import numpy.linalg as la
-- 
GitLab