From 5b2e0f6062364f77998b2d5c270a0f73deba5843 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Mon, 19 Feb 2018 11:56:27 -0500 Subject: [PATCH 01/35] Commit only DeferredSourceModule support without changing calling behavior of users of get_elwise_module or get_elwise_range_module, to test backwards compatibility and performance. This squashes multiple commits to put all major changes in one place. --- pycuda/deferred.py | 511 +++++++++++++++++++++++++++++++++++++++++ pycuda/elementwise.py | 523 ++++++++++++++++++++++++++++++++++++------ 2 files changed, 960 insertions(+), 74 deletions(-) create mode 100644 pycuda/deferred.py diff --git a/pycuda/deferred.py b/pycuda/deferred.py new file mode 100644 index 00000000..97f28da7 --- /dev/null +++ b/pycuda/deferred.py @@ -0,0 +1,511 @@ +""" +This exports a "deferred" implementation of SourceModule, where compilation +is delayed until call-time. Several methods, like get_function(), return +"deferred" values that are also only evaluated at call-time. +""" + +from pycuda.tools import context_dependent_memoize +from pycuda.compiler import compile, SourceModule +import pycuda.driver + +import re + +class DeferredSource(object): + ''' + Source generator that supports user-directed indentation, nesting + ``DeferredSource`` objects, indentation-aware string interpolation, + and deferred generation. + Use ``+=`` or ``add()`` to add source fragments as strings or + other ``DeferredSource`` objects, ``indent()`` or ``dedent()`` to + change base indentation, and ``__call__`` or ``generate()`` to + generate source. + + ''' + def __init__(self, subsources=None, base_indent=0, indent_step=2): + self.base_indent = base_indent + self.indent_step = indent_step + if subsources is None: + subsources = [] + self.subsources = subsources + + def __str__(self): + return self.generate() + + def __repr__(self): + return repr(self.__str__()) + + def __call__(self, indent=0, indent_first=True): + return self.generate(indent, indent_first) + + def generate(self, indent=0, indent_first=True, get_list=False): + if get_list: + retval = [] + else: + retval = '' + do_indent = not indent_first + for subindent, strip_space, subsource, format_dict in self.subsources: + if do_indent: + newindent = self.base_indent + indent + subindent + else: + newindent = 0 + do_indent = True + if isinstance(subsource, DeferredSource): + retval = retval + subsource.generate(indent=(indent + subindent), get_list=get_list) + continue + lines = subsource.split("\n") + regex_space = re.compile(r"^(\s*)(.*?)(\s*)$") + regex_format = re.compile(r"%\(([^\)]*)\)([a-zA-Z])") + minstrip = None + newlines = [] + for line in lines: + linelen = len(line) + space_match = regex_space.match(line) + end_leading_space = space_match.end(1) + begin_trailing_space = space_match.start(3) + if strip_space: + if linelen == end_leading_space: + # all space, ignore + continue + if minstrip is None or end_leading_space < minstrip: + minstrip = end_leading_space + if not format_dict: + newlines.append(line) + continue + newlinelist = None + newline = '' + curpos = 0 + matches = list(regex_format.finditer(line, end_leading_space)) + nummatches = len(matches) + for match in matches: + formatchar = match.group(2) + name = match.group(1) + matchstart = match.start() + matchend = match.end() + repl = format_dict.get(name, None) + if repl is None: + continue + if (isinstance(repl, DeferredSource) and + nummatches == 1 and + matchstart == end_leading_space): + # only one replacement, and only spaces preceding + space = 
space_match.group(1) + newlinelist = [ space + x + for x in repl.generate(get_list=True) ] + else: + newline = newline + line[curpos:matchstart] + newline = newline + (('%' + formatchar) % (repl,)) + curpos = matchend + if newlinelist is None: + newline = newline + line[curpos:] + newlines.append(newline) + else: + newlines.extend(newlinelist) + newlines.append(line[curpos:]) + indentstr = ' ' * (indent + subindent) + for i, line in enumerate(newlines): + line = indentstr + line[minstrip:] + newlines[i] = line + if get_list: + retval += newlines + else: + retval = retval + "\n".join(newlines) + "\n" + return retval + + def indent(self, indent_step=None): + if indent_step is None: + indent_step = self.indent_step + self.base_indent += indent_step + return self + + def dedent(self, indent_step=None): + if indent_step is None: + indent_step = self.indent_step + self.base_indent -= indent_step + return self + + def format_dict(self, format_dict): + for subsource in self.subsources: + subsource[3] = format_dict + return self + + def add(self, other, strip_space=True, format_dict=None): + self.subsources.append([self.base_indent, strip_space, other, format_dict]) + return self + + def __iadd__(self, other): + self.add(other) + return self + + def __add__(self, other): + newgen = DeferredSource(subsources=self.subsources, + base_indent=self.base_indent, + indent_step=self.indent_step) + newgen.add(other) + return newgen + +class DeferredVal(object): + ''' + This is an object that serves as a proxy to an as-yet undetermined + object, which is only known at the time when either ``_set_val()`` + or ``_evalbase()`` is called. Any calls to methods listed in the class + attribute ``_deferred_method_dict`` are queued until a call to + ``_eval()``, at which point ``_evalbase()`` is called (unless the value + has been already set by an earlier call to ``_evalbase()`` or + ``set_val()``), and then the queued method calls are executed, in order, + immediately on the new object. + This class must be subclassed, and the class attribute + ``_deferred_method_dict`` must contain a mapping from defer-able method + names to either ``DeferredVal``, None (same as ``DeferredVal``), or a + subclass, which when instantiated, will be assigned (with ``_set_val()``) + the return value of the method. + There are two ways to set the proxied object. One is to set it + explicitly with ``_set_val(val)``. The other is to override the method + ``_evalbase()`` which should return the new object, and will be called + by ``_eval()``. 
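+
+    A minimal illustrative sketch (``DeferredAnswer`` is hypothetical and
+    not part of this module)::
+
+        class DeferredAnswer(DeferredVal):
+            _deferred_method_dict = {"bit_length": None}
+
+            def _evalbase(self, evalcont=None):
+                return 42   # the proxied value only becomes known here
+
+        d = DeferredAnswer()
+        n = d.bit_length()   # queued; returns another DeferredVal
+        d._eval()            # resolves to 42, then replays bit_length() on it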
+ ''' + __unimpl = object() + _deferred_method_dict = None # must be set by subclass + + def __init__(self): + self._val_available = False + self._val = None + self._deferred_method_calls = [] + + def _copy(self, retval=None): + if retval is None: + retval = self.__class__() + retval._val_available = self._val_available + retval._val = self._val + retval._deferred_method_calls = self._deferred_method_calls + return retval + + def __repr__(self): + return self._repr(0) + + def _repr(self, indent): + indentstr = " " * indent + retstrs = [] + retstrs.append("") + retstrs.append("%s [" % (self.__class__,)) + for dmc in self._deferred_method_calls: + (name, args, kwargs, retval) = dmc + retstrs.append(" method %s" % (repr(name),)) + for arg in args: + if isinstance(arg, DeferredVal): + retstrs.append(" deferred arg (id=%s)" % (id(arg),)) + retstrs.append(arg._repr(indent + 6)) + else: + retstrs.append(" arg %s" % (repr(arg),)) + for kwname, arg in kwargs.items(): + if isinstance(arg, DeferredVal): + retstrs.append(" deferred kwarg %s (id=%s)" % (repr(kwname), id(arg))) + retstrs.append(arg._repr(indent + 6)) + else: + retstrs.append(" kwarg %s=%s" % (kwname, repr(arg),)) + retstrs.append(" deferred retval (id=%s)" % (id(retval),)) + retstrs.append("]") + return "\n".join([(indentstr + retstr) for retstr in retstrs]) + + def _set_val(self, val): + self._val = val + self._val_available = True + self._eval_methods() + return val + + def _evalbase(self, evalcont=None): + raise NotImplementedError() + + def _eval_list(self, vals, evalcont=None): + newvals = [] + for val in vals: + if isinstance(val, DeferredVal): + val = val._eval(evalcont=evalcont) + newvals.append(val) + return newvals + + def _eval_dict(self, valsdict, evalcont=None): + newvalsdict = {} + for name, val in valsdict.items(): + if isinstance(val, DeferredVal): + val = val._eval(evalcont=evalcont) + newvalsdict[name] = val + return newvalsdict + + def _eval_methods(self, evalcont=None): + assert(self._val_available) + val = self._val + for op in self._deferred_method_calls: + (methodname, methodargs, methodkwargs, deferredretval) = op + methodargs = self._eval_list(methodargs, evalcont=evalcont) + methodkwargs = self._eval_dict(methodkwargs, evalcont=evalcont) + retval = getattr(val, methodname)(*methodargs, **methodkwargs) + deferredretval._set_val(retval) + self._deferred_method_calls = [] + + def _eval(self, evalcont=None): + if not self._val_available: + self._val = self._evalbase(evalcont=evalcont) + self._val_available = True + self._eval_methods(evalcont=evalcont) + return self._val + + def _get_deferred_func(self, _name, _retval): + def _deferred_func(*args, **kwargs): + if not self._val_available: + self._deferred_method_calls.append((_name, args, kwargs, _retval)) + return _retval + args = self._eval_list(args) + kwargs = self._eval_dict(kwargs) + return getattr(self._val, _name)(*args, **kwargs) + _deferred_func.__name__ = _name + ".deferred" + return _deferred_func + + def __getattr__(self, name): + if self.__class__._deferred_method_dict is None: + raise Exception("DeferredVal must be subclassed and the class attribute _deferred_method_dict must be set to a valid dictionary!") + if self._val_available: + return getattr(self._val, name) + deferredclass = self.__class__._deferred_method_dict.get(name, self.__unimpl) + if deferredclass is not self.__unimpl: + if deferredclass is None: + deferredclass = DeferredVal + retval = deferredclass() + return self._get_deferred_func(name, retval) + raise AttributeError("no such 
attribute (yet): '%s'" % (name,)) + +# we allow all math operators to be deferred +_mathops = ( + '__add__', '__sub__', '__mul__', '__floordiv__', '__mod__', + '__divmod__', '__pow__', '__lshift__', '__rshift__', '__and__', + '__xor__', '__or__', '__div__', '__truediv__', '__radd__', '__rsub__', + '__rmul__', '__rdiv__', '__rtruediv__', '__rfloordiv__', '__rmod__', + '__rdivmod__', '__rpow__', '__rlshift__', '__rrshift__', '__rand__', + '__rxor__', '__ror__', '__iadd__', '__isub__', '__imul__', '__idiv__', + '__itruediv__', '__ifloordiv__', '__imod__', '__ipow__', '__ilshift__', + '__irshift__', '__iand__', '__ixor__', '__ior__', '__pos__', '__abs__', + '__invert__', '__complex__', '__int__', '__long__', '__float__', + '__oct__', '__hex__', '__index__', '__coerce__') +class DeferredNumeric(DeferredVal): + pass +DeferredNumeric._deferred_method_dict = dict((x, DeferredNumeric) + for x in _mathops) +def _get_deferred_attr_func(_name): + def _deferred_func(self, *args, **kwargs): + return self.__getattr__(_name)(*args, **kwargs) + _deferred_func.__name__ = _name + return _deferred_func +for name in _mathops: + setattr(DeferredNumeric, name, _get_deferred_attr_func(name)) + +class DeferredModuleCall(DeferredVal): + _deferred_method_dict = {} + def __init__(self, methodstr, *args, **kwargs): + super(DeferredModuleCall, self).__init__() + self._methodstr = methodstr + self._args = args + self._kwargs = kwargs + + def _copy(self, retval=None): + if retval is None: + retval = self.__class__(self._methodstr, *self._args, **self._kwargs) + return super(DeferredModuleCall, self)._copy(retval=retval) + + def _evalbase(self, evalcont): + # evalcont is the actual Module object + return getattr(evalcont, self._methodstr)(*self._args, **self._kwargs) + +class DeferredTexRef(DeferredModuleCall): + _deferred_method_dict = { + "set_array": None, + "set_address": DeferredNumeric, + "set_address_2d": None, + "set_format": None, + "set_address_mode": None, + "set_flags": None, + "get_address": DeferredNumeric, + "get_flags": DeferredNumeric, + } + +class DeferredFunction(object): + ''' + This class is a pseudo-replacement of ``pycuda.driver.Function``, + but takes a ``DeferredSourceModule`` and a function name as an argument, + and queues any call to ``prepare()`` until call-time, at which time it + calls out to the ``DeferredSourceModule`` object do create the actual + Function before preparing (if necessary) and calling the underlying + kernel. NOTE: you may now send the actual ``GPUArrays`` as arguments, + rather than their ``.gpudata`` members; this can be helpful to + dynamically create kernels. 
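+
+    A hedged sketch of the intended call pattern (``mod`` is assumed to be
+    an instance of a ``DeferredSourceModule`` subclass, ``a`` and ``b``
+    existing ``GPUArray`` objects, and ``grid``/``block`` launch tuples
+    such as ``(16, 1)`` and ``(256, 1, 1)``)::
+
+        func = mod.get_function("axpb")    # returns a DeferredFunction
+        func.prepare("PP" + numpy.dtype(numpy.uintp).char)   # queued only
+        # The kernel source is generated, compiled and prepared here,
+        # once the real arguments (and their shapes/strides) are known.
+        func.prepared_call(grid, block, a, b, numpy.uintp(a.size))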
+ ''' + def __init__(self, deferredmod, funcname): + self._deferredmod = deferredmod + self._funcname = funcname + self._prepare_args = None + + def get_unimplemented(_methodname): + def _unimplemented(self, _methodname=_methodname, *args, **kwargs): + raise NotImplementedError("%s does not implement method '%s'" % (type(self), _methodname,)) + return _unimplemented + + for meth_name in ["set_block_shape", "set_shared_size", + "param_set_size", "param_set", "param_seti", "param_setf", + "param_setv", "param_set_texref", + "launch", "launch_grid", "launch_grid_async"]: + setattr(self, meth_name, get_unimplemented(meth_name)) + + def _fix_texrefs(self, kwargs, mod): + texrefs = kwargs.get('texrefs', None) + if texrefs is not None: + kwargs = kwargs.copy() + newtexrefs = [] + for texref in texrefs: + if isinstance(texref, DeferredVal): + texref = texref._copy() # future calls may use different modules/functions + texref = texref._eval(mod) + newtexrefs.append(texref) + kwargs['texrefs'] = newtexrefs + return kwargs + + def __call__(self, *args, **kwargs): + block = kwargs.get('block', None) + if block is None or not isinstance(block, tuple) or len(block) != 3: + raise ValueError("keyword argument 'block' is required, and must be a 3-tuple of integers") + grid = kwargs.get('grid', (1,1)) + mod, func = self._deferredmod._delayed_get_function(self._funcname, args, grid, block) + kwargs = self._fix_texrefs(kwargs, mod) + return func.__call__(*args, **kwargs) + + def param_set_texref(self, *args, **kwargs): + raise NotImplementedError() + + def prepare(self, *args, **kwargs): + self._prepare_args = (args, kwargs) + return self + + def _do_delayed_prepare(self, mod, func): + if self._prepare_args is None: + raise Exception("prepared_*_call() requires that prepare() be called first") + (prepare_args, prepare_kwargs) = self._prepare_args + prepare_kwargs = self._fix_texrefs(prepare_kwargs, mod) + func.prepare(*prepare_args, **prepare_kwargs) + + def _generic_prepared_call(self, funcmethodstr, funcmethodargs, funcargs, funckwargs): + grid = funcmethodargs[0] + block = funcmethodargs[1] + mod, func = self._deferredmod._delayed_get_function(self._funcname, funcargs, grid, block) + self._do_delayed_prepare(mod, func) + newfuncargs = [ getattr(arg, 'gpudata', arg) for arg in funcargs ] + fullargs = list(funcmethodargs) + fullargs.extend(newfuncargs) + return getattr(func, funcmethodstr)(*fullargs, **funckwargs) + + def prepared_call(self, grid, block, *args, **kwargs): + return self._generic_prepared_call('prepared_call', (grid, block), args, kwargs) + + def prepared_timed_call(self, grid, block, *args, **kwargs): + return self._generic_prepared_call('prepared_timed_call', (grid, block), args, kwargs) + + def prepared_async_call(self, grid, block, stream, *args, **kwargs): + return self._generic_prepared_call('prepared_async_call', (grid, block, stream), args, kwargs) + +@context_dependent_memoize +def _delayed_compile_aux(source, compileargs): + # re-convert any tuples to lists + newcompileargs = [] + for i, arg in enumerate(compileargs): + if isinstance(arg, tuple): + arg = list(arg) + newcompileargs.append(arg) + cubin = compile(source, *newcompileargs) + + from pycuda.driver import module_from_buffer + return module_from_buffer(cubin) + +class DeferredSourceModule(SourceModule): + ''' + This is an abstract specialization of SourceModule which allows the + delay of creating the actual kernel source until call-time, at which + point the actual arguments are available and their characteristics can + be 
used to create specific kernels. + To support this, ``get_function()`` returns a ``DeferredFunction`` + object which queues any calls to ``DeferredFunction.prepare()`` and + saves them until future calls to ``DeferredFunction.__call__()`` or + ``DeferredFunction.prepared_*_call()``. NOTE: you may now send actual + ``GPUArrays`` to these functions rather their ``.gpudata`` members; + this can be helpful when creating dynamic kernels. + Likewise, ``get_global()``, ``get_texref()`` and ``get_surfref()`` + return proxy objects that can be stored by ``DeferredFunction.prepare()`` + and will only be evaluated at call-time. + This class must be subclassed and the function ``create_source(self, + grid, block, *args)`` must be overridden, returning the kernel source + (or ``DeferredSource`` object) that should be compiled. ``grid``, + ``block``, and ``*args`` are the same arguments that were sent to the + ``DeferredFunction`` call functions above. + The function ``create_key(self, grid, block, *args)`` is always + called before ``create_source`` and the key returned is used to cache + any compiled functions. Default return value of ``create_key()`` is + None, which means to use the function name and generated source as the + key. The return value of ``create_key()`` must be usable as a hash + key. + ''' + _cache = {} + + def __init__(self, nvcc="nvcc", options=None, keep=False, + no_extern_c=False, arch=None, code=None, cache_dir=None, + include_dirs=[]): + self._arch = arch + # tuples below are so _compileargs can be used as a hash key + if options is not None: + options = tuple(options) + include_dirs = tuple(include_dirs) + self._compileargs = (nvcc, options, keep, no_extern_c, + arch, code, cache_dir, include_dirs) + + def _delayed_compile(self, source): + self._check_arch(self._arch) + + return _delayed_compile_aux(source, self._compileargs) + + def create_key(self, grid, block, *funcargs): + return None + + def create_source(self, grid, block, *funcargs): + raise NotImplementedError("create_source must be overridden!") + + def _delayed_get_function(self, funcname, funcargs, grid, block): + ''' + If ``create_key()`` returns non-None, then it is used as the key + to cache compiled functions. Otherwise the return value of + ``create_source()`` is used as the key. 
+ ''' + context = pycuda.driver.Context.get_current() + funccache = DeferredSourceModule._cache.get(context, None) + if funccache is None: + funccache = self._cache[context] = {} + key = self.create_key(grid, block, *funcargs) + funckey = (funcname, key) + if key is None or funckey not in funccache: + source = self.create_source(grid, block, *funcargs) + if isinstance(source, DeferredSource): + source = source.generate() + if key is None: + funckey = (funcname, source) + modfunc = funccache.get(funckey, None) + if modfunc is None: + module = self._delayed_compile(source) + func = module.get_function(funcname) + modfunc = funccache[funckey] = (module, func) + return modfunc + + def get_function(self, name): + return DeferredFunction(self, name) + + def get_global(self, name): + raise NotImplementedError("Deferred globals in element-wise kernels not supported yet") + + def get_texref(self, name): + return DeferredTexRef('get_texref', name) + + def get_surfref(self, name): + raise NotImplementedError("Deferred surfaces in element-wise kernels not supported yet") + diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 695817fa..91a2a2c9 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -36,93 +36,468 @@ import numpy as np from pycuda.tools import dtype_to_ctype, VectorArg, ScalarArg from pytools import memoize_method +from pycuda.deferred import DeferredSourceModule, DeferredSource + +class ElementwiseSourceModule(DeferredSourceModule): + ''' + This is a ``DeferredSourceModule`` which is backwards-compatible with the + original ``get_elwise_module`` and ``get_elwise_range_module`` (using + ``do_range=True``). However, this class delays the compilation of + kernels until call-time. If you send actual ``GPUArray`` arguments + (instead of their ``.gpudata`` members) when calling the methods + supported by the return value of ``get_function()``, then you get: + * support for array-specific flat indices (i.e. for input array ``z``, + you can index it as ``z[z_i]`` in addition to the old-style ``z[i]``) + * support for non-contiguous (and arbitrarily-strided) arrays, but + only if you use the array-specific indices above. + Array-specific flat indices only really work if all the arrays using them + are the same shape. This shape is also used to optimize index + calculations. By default, the shape is taken from the first argument + that is specified as a pointer/array, but you can override this by + sending ``shape_arg_index=N`` where ``N`` is the zero-based index of the + kernel argument whose shape should be used. 
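+
+    An illustrative sketch (not part of the module; assumes ``a`` and
+    ``out`` are existing ``float32`` ``GPUArray`` objects of the same
+    shape, a CUDA context is active, and ``VectorArg``/``ScalarArg`` come
+    from ``pycuda.tools``)::
+
+        mod = ElementwiseSourceModule(
+                [VectorArg(numpy.float32, "a"),
+                 VectorArg(numpy.float32, "out"),
+                 ScalarArg(numpy.uintp, "n")],
+                "out[out_i] = 2.0f * a[a_i]",
+                name="twice")
+        func = mod.get_function("twice")
+        # Passing the GPUArrays themselves (rather than their .gpudata)
+        # enables the array-specific indices, even for non-contiguous input.
+        func(a, out, numpy.uintp(a.size), block=(256, 1, 1), grid=(16, 1))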
+ ''' + def __init__(self, arguments, operation, + name="kernel", preamble="", loop_prep="", after_loop="", + do_range=False, shape_arg_index=None, + debug=False, + **compilekwargs): + super(ElementwiseSourceModule, self).__init__(**compilekwargs) + self._do_range = do_range + self._shape_arg_index = shape_arg_index + self._init_args = (tuple(arguments), operation, + name, preamble, loop_prep, after_loop) + self._debug = debug + + def create_key(self, grid, block, *args): + (arguments, operation, + funcname, preamble, loop_prep, after_loop) = self._init_args + shape_arg_index = self._shape_arg_index + + # 'args' is the list of actual parameters being sent to the kernel + # 'arguments' is the list of argument descriptors (VectorArg, ScalarArg) + + arraypairs = [] + contigmatch = True + arrayspecificinds = True + shape = None + size = None + order = None + for i, argpair in enumerate(zip(args, arguments)): + arg, arg_descr = argpair + if isinstance(arg_descr, VectorArg): + # is a GPUArray/DeviceAllocation + arraypairs.append(argpair) + if not arrayspecificinds: + continue + if not hasattr(arg, 'shape'): + # At least one array argument is probably sent as a + # GPUArray.gpudata rather than the GPUArray itself, + # so disable array-specific indices -- caller is on + # their own. + arrayspecificinds = False + continue + curshape = arg.shape + cursize = arg.size + curorder = 'N' + if arg.flags.f_contiguous: + curorder = 'F' + elif arg.flags.c_contiguous: + curorder = 'C' + if shape is None: + shape = curshape + size = cursize + order = curorder + elif curorder == 'N' or order != curorder: + contigmatch = False + elif shape_arg_index is None and shape != curshape: + raise Exception("All input arrays to elementwise kernels must have the same shape, or you must specify the argument that has the canonical shape with shape_arg_index; found shapes %s and %s" % (shape, curshape)) + if shape_arg_index == i: + shape = curshape + + self._contigmatch = contigmatch + self._arraypairs = arraypairs + self._arrayspecificinds = arrayspecificinds + + key = repr(self._init_args) + + if contigmatch: + return key + + # Arrays are not contiguous or different order + + if grid[1] != 1 or block[1] != 1 or block[2] != 1: + raise Exception("Grid (%s) and block (%s) specifications should have all '1' except in the first element" % (grid, block)) + + ndim = len(shape) + numthreads = block[0] + + # Use index of minimum stride in first array as a hint on how to + # order the traversal of dimensions. We could probably do something + # smarter, like tranposing/reshaping arrays if possible to maximize + # performance, but that is probably best done in a pre-processing step. + # Note that this could mess up custom indexing that assumes a + # particular traversal order, but in that case one should probably + # ensure that inputs have the same order, and explicitly send + # shape_arg_index to turn this off. 
+ do_reverse = False + if (shape_arg_index is None and + np.argmin(np.abs(arraypairs[0][0].strides)) > ndim // 2): + print "traversing dimensions in reverse order" + # traverse dimensions in reverse order + do_reverse = True + if do_reverse: + shape = shape[::-1] + shapearr = np.array(shape) + block_step = np.array(shapearr) + tmp = numthreads + for dimnum in range(ndim): + newstep = tmp % block_step[dimnum] + tmp = tmp // block_step[dimnum] + block_step[dimnum] = newstep + arrayarginfos = [] + for arg, arg_descr in arraypairs: + if do_reverse: + elemstrides = np.array(arg.strides[::-1]) // arg.itemsize + else: + elemstrides = np.array(arg.strides) // arg.itemsize + dimelemstrides = elemstrides * shapearr + blockelemstrides = elemstrides * block_step + arrayarginfos.append( + (arg_descr.name, tuple(elemstrides), tuple(dimelemstrides), tuple(blockelemstrides)) + ) + + self._arrayarginfos = arrayarginfos + self._ndim = ndim + self._numthreads = numthreads + self._shape = shape + self._block_step = block_step + + key = (self._init_args, grid, block, shape, tuple(self._arrayarginfos)) + + return key + + def create_source(self, grid, block, *args): + # Precondition: create_key() must have been run with the same arguments + + (arguments, operation, + funcname, preamble, loop_prep, after_loop) = self._init_args + + contigmatch = self._contigmatch + + if contigmatch: + arraypairs = self._arraypairs + arrayspecificinds = self._arrayspecificinds + + indtype = 'unsigned' + if self._do_range: + indtype = 'long' + + # All arrays are contiguous and same order (or we don't know and + # it's up to the caller to make sure it works) + if arrayspecificinds: + for arg, arg_descr in arraypairs: + preamble = preamble + """ + #define %s_i i + """ % (arg_descr.name,) + if self._do_range: + loop_body = """ + if (step < 0) + { + for (i = start + (cta_start + tid)*step; + i > stop; i += total_threads*step) + { + %(operation)s; + } + } + else + { + for (i = start + (cta_start + tid)*step; + i < stop; i += total_threads*step) + { + %(operation)s; + } + } + """ % { + "operation": operation, + } + else: + loop_body = """ + for (i = cta_start + tid; i < n; i += total_threads) + { + %(operation)s; + } + """ % { + "operation": operation, + } + + return """ + #include + + %(preamble)s + + __global__ void %(name)s(%(arguments)s) + { + unsigned tid = threadIdx.x; + unsigned total_threads = gridDim.x*blockDim.x; + unsigned cta_start = blockDim.x*blockIdx.x; + + %(indtype)s i; + + %(loop_prep)s; + + %(loop_body)s; + + %(after_loop)s; + } + """ % { + "arguments": ", ".join(arg.declarator() for arg in arguments), + "name": funcname, + "preamble": preamble, + "loop_prep": loop_prep, + "after_loop": after_loop, + "loop_body": loop_body, + "indtype": indtype, + } + + # Arrays are not contiguous or different order + + arrayarginfos = self._arrayarginfos + ndim = self._ndim + numthreads = self._numthreads + shape = self._shape + block_step = self._block_step + + arraynames = [ x[0] for x in arrayarginfos ] + + defines = DeferredSource() + decls = DeferredSource() + loop_preop = DeferredSource() + loop_inds_calc = DeferredSource() + loop_inds_inc = DeferredSource() + loop_body = DeferredSource() + + for dimnum in range(ndim): + defines += """ + #define SHAPE_%d %d + #define BLOCK_STEP_%d %d + """ % (dimnum, shape[dimnum], + dimnum, block_step[dimnum]) + for name, elemstrides, dimelemstrides, blockelemstrides in arrayarginfos: + basename = "%s_%d" % (name, dimnum) + defines += """ + #define ELEMSTRIDE_%s_%d %d + #define 
DIMELEMSTRIDE_%s_%d %d + #define BLOCKELEMSTRIDE_%s_%d %d + """ % (name, dimnum, elemstrides[dimnum], + name, dimnum, dimelemstrides[dimnum], + name, dimnum, blockelemstrides[dimnum]) + + decls += """ + unsigned GLOBAL_i = cta_start + tid; + """ + for name in arraynames: + decls += """ + long %s_i = 0; + """ % (name,) + for dimnum in range(ndim): + decls += """ + long INDEX_%d; + """ % (dimnum,) + + loop_inds_calc += """ + unsigned int TMP_GLOBAL_i = GLOBAL_i; + """ + for dimnum in range(ndim): + loop_inds_calc += """ + INDEX_%d = TMP_GLOBAL_i %% SHAPE_%d; + TMP_GLOBAL_i = TMP_GLOBAL_i / SHAPE_%d; + """ % (dimnum, dimnum, + dimnum) + + for name in arraynames: + loop_inds_calc += """ + %s_i += INDEX_%d * ELEMSTRIDE_%s_%d; + """ % (name, dimnum, name, dimnum) + + for dimnum in range(ndim): + loop_inds_inc += """ + INDEX_%d += BLOCK_STEP_%d; + """ % (dimnum, dimnum) + for name in arraynames: + loop_inds_inc += """ + %s_i += BLOCKELEMSTRIDE_%s_%d; + """ % (name, name, dimnum) + if dimnum < ndim - 1: + loop_inds_inc += """ + if (INDEX_%d > SHAPE_%d) { + """ % (dimnum, dimnum) + loop_inds_inc.indent() + loop_inds_inc += """ + INDEX_%d -= SHAPE_%d; + INDEX_%d ++; + """ % (dimnum, dimnum, + dimnum + 1) + for name in arraynames: + loop_inds_inc += """ + %s_i += ELEMSTRIDE_%s_%d - DIMELEMSTRIDE_%s_%d; + """ % (name, name, dimnum + 1, name, dimnum) + loop_inds_inc.dedent() + loop_inds_inc += """ + } + """ + + if self._debug: + preamble += """ + #include + """ + loop_inds_calc += """ + if (cta_start == 0 && tid == 0) { + """ + loop_inds_calc.indent() + loop_inds_calc += r""" + printf("=======================\n"); + printf("CALLING FUNC %s\n"); + printf("N=%%u\n", (unsigned int)n); + """ % (funcname,) + for name, elemstrides, dimelemstrides, blockelemstrides in arrayarginfos: + loop_inds_calc += r""" + printf("(%s) %s: ptr=0x%%lx maxoffset(elems)=%s\n", (unsigned long)%s); + """ % (funcname, name, np.sum((np.array(shape) - 1) * np.array(elemstrides)), name) + loop_inds_calc.dedent() + loop_inds_calc += """ + } + """ + indtest = DeferredSource() + for name in arraynames: + indtest += r""" + if (%s_i > %s || %s_i < 0) { + """ % (name, np.sum((np.array(shape) - 1) * np.array(elemstrides)), name) + indtest.indent() + indtest += r""" + printf("cta_start=%%d tid=%%d GLOBAL_i=%%d %s_i=%%d\n", cta_start, tid, GLOBAL_i, %s_i); + break; + """ % (name, name) + indtest.dedent() + indtest += """ + } + """ + loop_preop = indtest + loop_preop + after_loop += r""" + if (cta_start == 0 && tid == 0) { + printf("DONE CALLING FUNC %s\n"); + printf("-----------------------\n"); + } + """ % (funcname,) + + if self._do_range: + loop_body.add(""" + if (step < 0) + { + for (/*void*/; GLOBAL_i > stop; GLOBAL_i += total_threads*step) + { + %(loop_preop)s; + + %(operation)s; + + %(loop_inds_inc)s; + } + } + else + { + for (/*void*/; GLOBAL_i < stop; GLOBAL_i += total_threads*step) + { + %(loop_preop)s; + + %(operation)s; + + %(loop_inds_inc)s; + } + } + """, format_dict={ + "loop_preop": loop_preop, + "operation": operation, + "loop_inds_inc": loop_inds_inc, + }) + else: + loop_body.add(""" + for (/*void*/; GLOBAL_i < n; GLOBAL_i += total_threads) + { + %(loop_preop)s; + %(operation)s; -def get_elwise_module(arguments, operation, - name="kernel", keep=False, options=None, - preamble="", loop_prep="", after_loop=""): - from pycuda.compiler import SourceModule - return SourceModule(""" - #include + %(loop_inds_inc)s; + } + """, format_dict={ + "loop_preop": loop_preop, + "operation": operation, + "loop_inds_inc": loop_inds_inc, + 
}) - %(preamble)s + source = DeferredSource() - __global__ void %(name)s(%(arguments)s) - { + source.add(""" + #include + #include - unsigned tid = threadIdx.x; - unsigned total_threads = gridDim.x*blockDim.x; - unsigned cta_start = blockDim.x*blockIdx.x; - unsigned i; + %(defines)s - %(loop_prep)s; + %(preamble)s - for (i = cta_start + tid; i < n; i += total_threads) - { - %(operation)s; - } + __global__ void %(name)s(%(arguments)s) + { - %(after_loop)s; - } - """ % { - "arguments": ", ".join(arg.declarator() for arg in arguments), - "operation": operation, - "name": name, - "preamble": preamble, - "loop_prep": loop_prep, - "after_loop": after_loop, - }, - options=options, keep=keep) + unsigned tid = threadIdx.x; + unsigned total_threads = gridDim.x*blockDim.x; + unsigned cta_start = blockDim.x*blockIdx.x; + %(decls)s -def get_elwise_range_module(arguments, operation, - name="kernel", keep=False, options=None, - preamble="", loop_prep="", after_loop=""): - from pycuda.compiler import SourceModule - return SourceModule(""" - #include - - %(preamble)s - - __global__ void %(name)s(%(arguments)s) - { - unsigned tid = threadIdx.x; - unsigned total_threads = gridDim.x*blockDim.x; - unsigned cta_start = blockDim.x*blockIdx.x; - long i; - - %(loop_prep)s; - - if (step < 0) - { - for (i = start + (cta_start + tid)*step; - i > stop; i += total_threads*step) - { - %(operation)s; - } - } - else - { - for (i = start + (cta_start + tid)*step; - i < stop; i += total_threads*step) - { - %(operation)s; + %(loop_prep)s; + + %(loop_inds_calc)s; + + %(loop_body)s; + + %(after_loop)s; } - } - - %(after_loop)s; - } - """ % { - "arguments": ", ".join(arg.declarator() for arg in arguments), - "operation": operation, - "name": name, - "preamble": preamble, - "loop_prep": loop_prep, - "after_loop": after_loop, - }, - options=options, keep=keep) + """, format_dict={ + "arguments": ", ".join(arg.declarator() for arg in arguments), + "name": funcname, + "preamble": preamble, + "loop_prep": loop_prep, + "after_loop": after_loop, + "defines": defines, + "decls": decls, + "loop_inds_calc": loop_inds_calc, + "loop_body": loop_body, + }) + + return source +def get_elwise_module(arguments, operation, + name="kernel", keep=False, options=None, + preamble="", loop_prep="", after_loop="", + **kwargs): + return ElementwiseSourceModule(arguments, operation, + name=name, preamble=preamble, + loop_prep=loop_prep, after_loop=after_loop, + keep=keep, options=options, + **kwargs) + +def get_elwise_range_module(arguments, operation, + name="kernel", keep=False, options=None, + preamble="", loop_prep="", after_loop="", + **kwargs): + return ElementwiseSourceModule(arguments, operation, + name=name, preamble=preamble, + loop_prep=loop_prep, after_loop=after_loop, + keep=keep, options=options, + do_range=True, + **kwargs) + def get_elwise_kernel_and_types(arguments, operation, name="kernel", keep=False, options=None, use_range=False, **kwargs): if isinstance(arguments, str): From a1525de0de33f3e9a8c8aba22bc705d5982691b8 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Mon, 19 Feb 2018 12:24:20 -0500 Subject: [PATCH 02/35] Smarter _new_like_me that handles discontiguous input. Have copy() use it too. 
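
A hedged sketch of the intended behaviour (``a_T`` stands for any existing
Fortran-ordered GPUArray, e.g. a transposed view):

    b = a_T.copy()              # default order="K": the copy keeps a_T's layout
    c = a_T._new_like_me(dtype=numpy.float64)  # same layout, wider dtype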
--- pycuda/gpuarray.py | 113 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 97 insertions(+), 16 deletions(-) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index 10c687d0..1d312bee 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -246,13 +246,14 @@ def set_async(self, ary, stream=None): return self.set(ary, async=True, stream=stream) def get(self, ary=None, pagelocked=False, async=False, stream=None): + slicer = None if ary is None: if pagelocked: ary = drv.pagelocked_empty(self.shape, self.dtype) else: ary = np.empty(self.shape, self.dtype) - strides = _compact_strides(self) + self, strides, slicer = _compact_positive_strides(self) ary = _as_strided(ary, strides=strides) else: if self.size != ary.size: @@ -269,13 +270,15 @@ def get(self, ary=None, pagelocked=False, async=False, stream=None): if self.size: _memcpy_discontig(ary, self, async=async, stream=stream) + if slicer: + ary = ary[slicer] return ary def get_async(self, stream=None, ary=None): return self.get(ary=ary, async=True, stream=stream) - def copy(self): - new = GPUArray(self.shape, self.dtype, self.allocator) + def copy(self, order="K"): + new = self._new_like_me(order=order) _memcpy_discontig(new, self) return new @@ -375,15 +378,52 @@ def _div(self, other, out, stream=None): return out - def _new_like_me(self, dtype=None, order="C"): - strides = None + def _new_like_me(self, dtype=None, order="K"): + slicer, selflist = _flip_negative_strides((self,)) + self = selflist[0] + ndim = self.ndim + shape = self.shape + mystrides = self.strides + mydtype = self.dtype + myitemsize = mydtype.itemsize if dtype is None: - dtype = self.dtype - if dtype == self.dtype: - strides = self.strides - - return self.__class__(self.shape, dtype, - allocator=self.allocator, strides=strides, order=order) + dtype = mydtype + else: + dtype = np.dtype(dtype) + itemsize = dtype.itemsize + if order == "K": + if self.flags.c_contiguous: + order = "C" + elif self.flags.f_contiguous: + order = "F" + if order == "C": + newstrides = _c_contiguous_strides(itemsize, shape) + elif order == "F": + newstrides = _f_contiguous_strides(itemsize, shape) + else: + maxstride = mystrides[0] + maxstrideind = 0 + for i in range(1, ndim): + curstride = mystrides[i] + if curstride > maxstride: + maxstrideind = i + maxstride = curstride + mymaxoffset = (maxstride / myitemsize) * shape[maxstrideind] + if mymaxoffset <= self.size: + # it's probably safe to just allocate and pass strides + # XXX (do we need to calculate full offset for [-1,-1,-1...]?) 
+ newstrides = tuple((x // myitemsize) * itemsize for x in mystrides) + else: + # just punt and choose something close + if ndim > 1 and maxstrideind == 0: + newstrides = _c_contiguous_strides(itemsize, shape) + else: + newstrides = _f_contiguous_strides(itemsize, shape) + retval = self.__class__(shape, dtype, + allocator=self.allocator, strides=newstrides) + if slicer: + retval = retval[slicer] + return retval # operators --------------------------------------------------------------- def mul_add(self, selffac, other, otherfac, add_timer=None, stream=None): @@ -1012,15 +1052,21 @@ def conj(self): def to_gpu(ary, allocator=drv.mem_alloc): """converts a numpy array to a GPUArray""" - result = GPUArray(ary.shape, ary.dtype, allocator, strides=_compact_strides(ary)) + ary, newstrides, slicer = _compact_positive_strides(ary) + result = GPUArray(ary.shape, ary.dtype, allocator, strides=newstrides) result.set(ary) + if slicer: + result = result[slicer] return result def to_gpu_async(ary, allocator=drv.mem_alloc, stream=None): """converts a numpy array to a GPUArray""" - result = GPUArray(ary.shape, ary.dtype, allocator, strides=_compact_strides(ary)) + ary, newstrides, slicer = _compact_positive_strides(ary) + result = GPUArray(ary.shape, ary.dtype, allocator, strides=newstrides) result.set_async(ary, stream) + if slicer: + result = result[slicer] return result @@ -1180,8 +1226,41 @@ class Info(Record): # }}} -def _compact_strides(a): - # Compute strides to have same order as self, but packed +def _flip_negative_strides(arrays): + # If arrays have negative strides, flip them. Return a list + # ``(slicer, arrays)`` where ``slicer`` is a tuple of slice objects + # used to flip the arrays (or ``None`` if there was no flipping), + # and ``arrays`` is the list of flipped arrays. + # NOTE: Every input array A must have the same value for the following + # expression: np.sign(A.strides) + # NOTE: ``slicer`` is its own inverse, so ``A[slicer][slicer] == A`` + if isinstance(arrays, GPUArray): + raise TypeError("_flip_negative_strides expects a list of GPUArrays") + slicer = None + ndim = arrays[0].ndim + shape = arrays[0].shape + for t in zip(range(ndim), *[np.sign(x.strides) for x in arrays]): + axis = t[0] + stride_sign = t[1] + if len(arrays) > 1: + if not np.all(t[2:] == stride_sign): + raise ValueError("found differing signs in dimension %d: %s" % (axis, t[1:])) + if stride_sign == -1: + if slicer is None: + slicer = [slice(None)] * ndim + slicer[axis] = slice(None, None, -1) + if slicer is not None: + slicer = tuple(slicer) + arrays = [x[slicer] for x in arrays] + return slicer, arrays + + +def _compact_positive_strides(a): + # Flip ``a``'s axes if there are any negative strides, then compute + # strides to have same order as a, but packed. Return flipped ``a`` + # and packed strides. + slicer, alist = _flip_negative_strides((a,)) + a = alist[0] info = sorted( (a.strides[axis], a.shape[axis], axis) for axis in range(len(a.shape))) @@ -1191,7 +1270,7 @@ def _compact_strides(a): for _, dim, axis in info: strides[axis] = stride stride *= dim - return strides + return a, strides, slicer def _memcpy_discontig(dst, src, async=False, stream=None): @@ -1216,6 +1295,8 @@ def _memcpy_discontig(dst, src, async=False, stream=None): dst[...] 
= src return + src, dst = _flip_negative_strides((src, dst))[1] + if src.flags.forc and dst.flags.forc: shape = [src.size] src_strides = dst_strides = [src.dtype.itemsize] From 8522e7f225a15b3e8c927612a955e51c6c171c94 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Wed, 21 Feb 2018 11:24:01 -0500 Subject: [PATCH 03/35] Allow existing kernel calls to use non-contiguous arrays by sending actual GPUArrays and using ARRAY_i indices (rather than just i). Squashed multiple commits into one to make changes clearer. --- pycuda/cumath.py | 10 +-- pycuda/curandom.py | 2 +- pycuda/elementwise.py | 77 +++++++++++----------- pycuda/gpuarray.py | 146 ++++++++++++------------------------------ 4 files changed, 83 insertions(+), 152 deletions(-) diff --git a/pycuda/cumath.py b/pycuda/cumath.py index dbae5bd6..1a8dcb5b 100644 --- a/pycuda/cumath.py +++ b/pycuda/cumath.py @@ -42,7 +42,7 @@ def f(array, stream_or_out=None, **kwargs): func = elementwise.get_unary_func_kernel(func_name, array.dtype) func.prepared_async_call(array._grid, array._block, stream, - array.gpudata, out.gpudata, array.mem_size) + array, out, array.mem_size) return out return f @@ -77,7 +77,7 @@ def fmod(arg, mod, stream=None): func = elementwise.get_fmod_kernel() func.prepared_async_call(arg._grid, arg._block, stream, - arg.gpudata, mod.gpudata, result.gpudata, arg.mem_size) + arg, mod, result, arg.mem_size) return result @@ -94,7 +94,7 @@ def frexp(arg, stream=None): func = elementwise.get_frexp_kernel() func.prepared_async_call(arg._grid, arg._block, stream, - arg.gpudata, sig.gpudata, expt.gpudata, arg.mem_size) + arg, sig, expt, arg.mem_size) return sig, expt @@ -111,7 +111,7 @@ def ldexp(significand, exponent, stream=None): func = elementwise.get_ldexp_kernel() func.prepared_async_call(significand._grid, significand._block, stream, - significand.gpudata, exponent.gpudata, result.gpudata, + significand, exponent, result, significand.mem_size) return result @@ -129,7 +129,7 @@ def modf(arg, stream=None): func = elementwise.get_modf_kernel() func.prepared_async_call(arg._grid, arg._block, stream, - arg.gpudata, intpart.gpudata, fracpart.gpudata, + arg, intpart, fracpart, arg.mem_size) return fracpart, intpart diff --git a/pycuda/curandom.py b/pycuda/curandom.py index e1c68428..d31ee01a 100644 --- a/pycuda/curandom.py +++ b/pycuda/curandom.py @@ -240,7 +240,7 @@ def rand(shape, dtype=np.float32, stream=None): raise NotImplementedError; func.prepared_async_call(result._grid, result._block, stream, - result.gpudata, np.random.randint(2**31-1), result.size) + result, np.random.randint(2**31-1), result.size) return result diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 91a2a2c9..7487f25c 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -119,9 +119,8 @@ def create_key(self, grid, block, *args): self._arraypairs = arraypairs self._arrayspecificinds = arrayspecificinds - key = repr(self._init_args) - if contigmatch: + key = repr(self._init_args) return key # Arrays are not contiguous or different order @@ -579,12 +578,8 @@ def __call__(self, *args, **kwargs): for arg, arg_descr in zip(args, arguments): if isinstance(arg_descr, VectorArg): - if not arg.flags.forc: - raise RuntimeError("elementwise kernel cannot " - "deal with non-contiguous arrays") - vectors.append(arg) - invocation_args.append(arg.gpudata) + invocation_args.append(arg) else: invocation_args.append(arg) @@ -631,9 +626,9 @@ def get_take_kernel(dtype, idx_dtype, vec_count=1): "texture <%s, 1, cudaReadModeElementType> tex_src%d;" % 
(ctx["tex_tp"], i) for i in range(vec_count)) body = ( - ("%(idx_tp)s src_idx = idx[i];\n" % ctx) + ("%(idx_tp)s src_idx = idx[idx_i];\n" % ctx) + "\n".join( - "dest%d[i] = fp_tex1Dfetch(tex_src%d, src_idx);" % (i, i) + "dest%d[dest%d_i] = fp_tex1Dfetch(tex_src%d, src_idx);" % (i, i, i) for i in range(vec_count))) mod = get_elwise_module(args, body, "take", preamble=preamble) @@ -676,11 +671,12 @@ def get_copy_insn(i): return ("dest%d[dest_idx] = " "fp_tex1Dfetch(tex_src%d, src_idx);" % (i, i)) - body = (("%(idx_tp)s src_idx = gmem_src_idx[i];\n" - "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % ctx) + body = (("%(idx_tp)s src_idx = gmem_src_idx[gmem_src_idx_i];\n" + "%(idx_tp)s dest_idx = gmem_dest_idx[gmem_dest_idx_i];\n" % ctx) + "\n".join(get_copy_insn(i) for i in range(vec_count))) - mod = get_elwise_module(args, body, "take_put", preamble=preamble) + mod = get_elwise_module(args, body, "take_put", + preamble=preamble, shape_arg_index=0) func = mod.get_function("take_put") tex_src = [mod.get_texref("tex_src%d" % i) for i in range(vec_count)] @@ -710,11 +706,12 @@ def get_put_kernel(dtype, idx_dtype, vec_count=1): ] + [ScalarArg(np.intp, "n")] body = ( - "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % ctx - + "\n".join("dest%d[dest_idx] = src%d[i];" % (i, i) + "%(idx_tp)s dest_idx = gmem_dest_idx[gmem_dest_idx_i];\n" % ctx + + "\n".join("dest%d[dest_idx] = src%d[src%d_i];" % (i, i, i) for i in range(vec_count))) - func = get_elwise_module(args, body, "put").get_function("put") + func = get_elwise_module(args, body, "put", + shape_arg_index=0).get_function("put") func.prepare("P"+(2*vec_count*"P")+np.dtype(np.uintp).char) return func @@ -726,7 +723,7 @@ def get_copy_kernel(dtype_dest, dtype_src): "tp_dest": dtype_to_ctype(dtype_dest), "tp_src": dtype_to_ctype(dtype_src), }, - "dest[i] = src[i]", + "dest[dest_i] = src[src_i]", "copy") @@ -758,13 +755,13 @@ def get_linear_combination_kernel(summand_descriptors, args.append(ScalarArg(scalar_dtype, "a%d" % i)) args.append(VectorArg(vector_dtype, "x%d" % i)) - summands.append("a%d*x%d[i]" % (i, i)) + summands.append("a%d*x%d[x%d_i]" % (i, i, i)) args.append(VectorArg(dtype_z, "z")) args.append(ScalarArg(np.uintp, "n")) mod = get_elwise_module(args, - "z[i] = " + " + ".join(summands), + "z[z_i] = " + " + ".join(summands), "linear_combination", preamble="\n".join(preamble), loop_prep=";\n".join(loop_prep)) @@ -785,7 +782,7 @@ def get_axpbyz_kernel(dtype_x, dtype_y, dtype_z): "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), }, - "z[i] = a*x[i] + b*y[i]", + "z[z_i] = a*x[x_i] + b*y[y_i]", "axpbyz") @@ -796,7 +793,7 @@ def get_axpbz_kernel(dtype_x, dtype_z): "tp_x": dtype_to_ctype(dtype_x), "tp_z": dtype_to_ctype(dtype_z) }, - "z[i] = a * x[i] + b", + "z[z_i] = a * x[x_i] + b", "axpb") @@ -808,7 +805,7 @@ def get_binary_op_kernel(dtype_x, dtype_y, dtype_z, operator): "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), }, - "z[i] = x[i] %s y[i]" % operator, + "z[z_i] = x[x_i] %s y[y_i]" % operator, "multiply") @@ -819,7 +816,7 @@ def get_rdivide_elwise_kernel(dtype_x, dtype_z): "tp_x": dtype_to_ctype(dtype_x), "tp_z": dtype_to_ctype(dtype_z), }, - "z[i] = y / x[i]", + "z[z_i] = y / x[x_i]", "divide_r") @@ -831,7 +828,7 @@ def get_binary_func_kernel(func, dtype_x, dtype_y, dtype_z): "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), }, - "z[i] = %s(x[i], y[i])" % func, + "z[z_i] = %s(x[x_i], y[y_i])" % func, func+"_kernel") @@ -843,7 +840,7 @@ def get_binary_func_scalar_kernel(func, dtype_x, dtype_y, 
dtype_z): "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), }, - "z[i] = %s(x[i], y)" % func, + "z[z_i] = %s(x[x_i], y)" % func, func+"_kernel") @@ -867,7 +864,7 @@ def get_fill_kernel(dtype): "%(tp)s a, %(tp)s *z" % { "tp": dtype_to_ctype(dtype), }, - "z[i] = a", + "z[z_i] = a", "fill") @@ -877,7 +874,7 @@ def get_reverse_kernel(dtype): "%(tp)s *y, %(tp)s *z" % { "tp": dtype_to_ctype(dtype), }, - "z[i] = y[n-1-i]", + "z[z_i] = y[n-1-y_i]", "reverse") @@ -888,7 +885,7 @@ def get_real_kernel(dtype, real_dtype): "tp": dtype_to_ctype(dtype), "real_tp": dtype_to_ctype(real_dtype), }, - "z[i] = real(y[i])", + "z[z_i] = real(y[y_i])", "real") @@ -899,7 +896,7 @@ def get_imag_kernel(dtype, real_dtype): "tp": dtype_to_ctype(dtype), "real_tp": dtype_to_ctype(real_dtype), }, - "z[i] = imag(y[i])", + "z[z_i] = imag(y[y_i])", "imag") @@ -909,7 +906,7 @@ def get_conj_kernel(dtype): "%(tp)s *y, %(tp)s *z" % { "tp": dtype_to_ctype(dtype), }, - "z[i] = pycuda::conj(y[i])", + "z[z_i] = pycuda::conj(y[y_i])", "conj") @@ -919,7 +916,7 @@ def get_arange_kernel(dtype): "%(tp)s *z, %(tp)s start, %(tp)s step" % { "tp": dtype_to_ctype(dtype), }, - "z[i] = start + i*step", + "z[z_i] = start + z_i*step", "arange") @@ -934,7 +931,7 @@ def get_pow_kernel(dtype): "%(tp)s value, %(tp)s *y, %(tp)s *z" % { "tp": dtype_to_ctype(dtype), }, - "z[i] = %s(y[i], value)" % func, + "z[z_i] = %s(y[y_i], value)" % func, "pow_method") @@ -951,7 +948,7 @@ def get_pow_array_kernel(dtype_x, dtype_y, dtype_z): "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), }, - "z[i] = %s(x[i], y[i])" % func, + "z[z_i] = %s(x[x_i], y[y_i])" % func, "pow_method") @@ -959,7 +956,7 @@ def get_pow_array_kernel(dtype_x, dtype_y, dtype_z): def get_fmod_kernel(): return get_elwise_kernel( "float *arg, float *mod, float *z", - "z[i] = fmod(arg[i], mod[i])", + "z[z_i] = fmod(arg[arg_i], mod[mod_i])", "fmod_kernel") @@ -967,7 +964,7 @@ def get_fmod_kernel(): def get_modf_kernel(): return get_elwise_kernel( "float *x, float *intpart ,float *fracpart", - "fracpart[i] = modf(x[i], &intpart[i])", + "fracpart[fracpart_i] = modf(x[x_i], &intpart[intpart_i])", "modf_kernel") @@ -977,8 +974,8 @@ def get_frexp_kernel(): "float *x, float *significand, float *exponent", """ int expt = 0; - significand[i] = frexp(x[i], &expt); - exponent[i] = expt; + significand[significand_i] = frexp(x[x_i], &expt); + exponent[exponent_i] = expt; """, "frexp_kernel") @@ -987,7 +984,7 @@ def get_frexp_kernel(): def get_ldexp_kernel(): return get_elwise_kernel( "float *sig, float *expt, float *z", - "z[i] = ldexp(sig[i], int(expt[i]))", + "z[z_i] = ldexp(sig[sig_i], int(expt[expt_i]))", "ldexp_kernel") @@ -1001,7 +998,7 @@ def get_unary_func_kernel(func_name, in_dtype, out_dtype=None): "tp_in": dtype_to_ctype(in_dtype), "tp_out": dtype_to_ctype(out_dtype), }, - "z[i] = %s(y[i])" % func_name, + "z[z_i] = %s(y[y_i])" % func_name, "%s_kernel" % func_name) @@ -1013,7 +1010,7 @@ def get_if_positive_kernel(crit_dtype, dtype): VectorArg(dtype, "else_"), VectorArg(dtype, "result"), ], - "result[i] = crit[i] > 0 ? then_[i] : else_[i]", + "result[result_i] = crit[crit_i] > 0 ? 
then_[then__i] : else_[else__i]", "if_positive") @@ -1025,5 +1022,5 @@ def get_scalar_op_kernel(dtype_x, dtype_y, operator): "tp_y": dtype_to_ctype(dtype_y), "tp_a": dtype_to_ctype(dtype_x), }, - "y[i] = x[i] %s a" % operator, + "y[y_i] = x[x_i] %s a" % operator, "scalarop_kernel") diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index 1d312bee..2fe366d4 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -118,23 +118,15 @@ def splay(n, dev=None): def _make_binary_op(operator): def func(self, other): - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - if isinstance(other, GPUArray): assert self.shape == other.shape - if not other.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - result = self._new_like_me() func = elementwise.get_binary_op_kernel( self.dtype, other.dtype, result.dtype, operator) func.prepared_async_call(self._grid, self._block, None, - self.gpudata, other.gpudata, result.gpudata, + self, other, result, self.mem_size) return result @@ -143,7 +135,7 @@ def func(self, other): func = elementwise.get_scalar_op_kernel( self.dtype, result.dtype, operator) func.prepared_async_call(self._grid, self._block, None, - self.gpudata, other, result.gpudata, + self, other, result, self.mem_size) return result @@ -300,47 +292,36 @@ def _axpbyz(self, selffac, other, otherfac, out, add_timer=None, stream=None): """Compute ``out = selffac * self + otherfac*other``, where `other` is a vector..""" assert self.shape == other.shape - if not self.flags.forc or not other.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") func = elementwise.get_axpbyz_kernel(self.dtype, other.dtype, out.dtype) if add_timer is not None: add_timer(3*self.size, func.prepared_timed_call(self._grid, - selffac, self.gpudata, otherfac, other.gpudata, - out.gpudata, self.mem_size)) + selffac, self, otherfac, other, + out, self.mem_size)) else: func.prepared_async_call(self._grid, self._block, stream, - selffac, self.gpudata, otherfac, other.gpudata, - out.gpudata, self.mem_size) + selffac, self, otherfac, other, + out, self.mem_size) return out def _axpbz(self, selffac, other, out, stream=None): """Compute ``out = selffac * self + other``, where `other` is a scalar.""" - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - func = elementwise.get_axpbz_kernel(self.dtype, out.dtype) func.prepared_async_call(self._grid, self._block, stream, - selffac, self.gpudata, - other, out.gpudata, self.mem_size) + selffac, self, + other, out, self.mem_size) return out def _elwise_multiply(self, other, out, stream=None): - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - func = elementwise.get_binary_op_kernel(self.dtype, other.dtype, out.dtype, "*") func.prepared_async_call(self._grid, self._block, stream, - self.gpudata, other.gpudata, - out.gpudata, self.mem_size) + self, other, + out, self.mem_size) return out @@ -350,31 +331,23 @@ def _rdiv_scalar(self, other, out, stream=None): y = n / self """ - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - func = elementwise.get_rdivide_elwise_kernel(self.dtype, out.dtype) func.prepared_async_call(self._grid, self._block, stream, - self.gpudata, other, - out.gpudata, self.mem_size) + self, 
other, + out, self.mem_size) return out def _div(self, other, out, stream=None): """Divides an array by another array.""" - if not self.flags.forc or not other.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - assert self.shape == other.shape func = elementwise.get_binary_op_kernel(self.dtype, other.dtype, out.dtype, "/") func.prepared_async_call(self._grid, self._block, stream, - self.gpudata, other.gpudata, - out.gpudata, self.mem_size) + self, other, + out, self.mem_size) return out @@ -554,7 +527,7 @@ def fill(self, value, stream=None): """fills the array with the specified value""" func = elementwise.get_fill_kernel(self.dtype) func.prepared_async_call(self._grid, self._block, stream, - value, self.gpudata, self.mem_size) + value, self, self.mem_size) return self @@ -640,7 +613,7 @@ def __abs__(self): out_dtype=out_dtype) func.prepared_async_call(self._grid, self._block, None, - self.gpudata, result.gpudata, self.mem_size) + self, result, self.mem_size) return result @@ -651,10 +624,6 @@ def _pow(self, other, new): """ if isinstance(other, GPUArray): - if not self.flags.forc or not other.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - assert self.shape == other.shape if new: @@ -666,22 +635,18 @@ def _pow(self, other, new): self.dtype, other.dtype, result.dtype) func.prepared_async_call(self._grid, self._block, None, - self.gpudata, other.gpudata, result.gpudata, + self, other, result, self.mem_size) return result else: - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - if new: result = self._new_like_me() else: result = self func = elementwise.get_pow_kernel(self.dtype) func.prepared_async_call(self._grid, self._block, None, - other, self.gpudata, result.gpudata, + other, self, result, self.mem_size) return result @@ -713,24 +678,16 @@ def reverse(self, stream=None): as one-dimensional. 
""" - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - result = self._new_like_me() func = elementwise.get_reverse_kernel(self.dtype) func.prepared_async_call(self._grid, self._block, stream, - self.gpudata, result.gpudata, + self, result, self.mem_size) return result def astype(self, dtype, stream=None): - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - if dtype == self.dtype: return self.copy() @@ -738,7 +695,7 @@ def astype(self, dtype, stream=None): func = elementwise.get_copy_kernel(dtype, self.dtype) func.prepared_async_call(self._grid, self._block, stream, - result.gpudata, self.gpudata, + result, self, self.mem_size) return result @@ -750,9 +707,6 @@ def reshape(self, *shape, **kwargs): order = kwargs.pop("order", "C") # TODO: add more error-checking, perhaps - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") if isinstance(shape[0], tuple) or isinstance(shape[0], list): shape = tuple(shape[0]) @@ -970,15 +924,11 @@ def real(self): if issubclass(dtype.type, np.complexfloating): from pytools import match_precision real_dtype = match_precision(np.dtype(np.float64), dtype) - if self.flags.f_contiguous: - order = "F" - else: - order = "C" - result = self._new_like_me(dtype=real_dtype, order=order) + result = self._new_like_me(dtype=real_dtype) func = elementwise.get_real_kernel(dtype, real_dtype) func.prepared_async_call(self._grid, self._block, None, - self.gpudata, result.gpudata, + self, result, self.mem_size) return result @@ -989,21 +939,13 @@ def real(self): def imag(self): dtype = self.dtype if issubclass(self.dtype.type, np.complexfloating): - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - from pytools import match_precision real_dtype = match_precision(np.dtype(np.float64), dtype) - if self.flags.f_contiguous: - order = "F" - else: - order = "C" - result = self._new_like_me(dtype=real_dtype, order=order) + result = self._new_like_me(dtype=real_dtype) func = elementwise.get_imag_kernel(dtype, real_dtype) func.prepared_async_call(self._grid, self._block, None, - self.gpudata, result.gpudata, + self, result, self.mem_size) return result @@ -1013,19 +955,11 @@ def imag(self): def conj(self): dtype = self.dtype if issubclass(self.dtype.type, np.complexfloating): - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - - if self.flags.f_contiguous: - order = "F" - else: - order = "C" - result = self._new_like_me(order=order) + result = self._new_like_me() func = elementwise.get_conj_kernel(dtype) func.prepared_async_call(self._grid, self._block, None, - self.gpudata, result.gpudata, + self, result, self.mem_size) return result @@ -1219,7 +1153,7 @@ class Info(Record): func = elementwise.get_arange_kernel(dtype) func.prepared_async_call(result._grid, result._block, kwargs.get("stream"), - result.gpudata, start, step, size) + result, start, step, size) return result @@ -1244,7 +1178,7 @@ def _flip_negative_strides(arrays): stride_sign = t[1] if len(arrays) > 1: if not np.all(t[2:] == stride_sign): - raise ValueError("found differing signs in dimension %d: %s" % (axis, t[1:])) + raise ValueError("found differing signs in strides for dimension %d (strides for all arrays: %s)" % (axis, [x.strides for x in arrays])) if stride_sign == -1: if 
slicer is None: slicer = [slice(None)] * ndim @@ -1425,7 +1359,7 @@ def take(a, indices, out=None, stream=None): a.bind_to_texref_ext(tex_src[0], allow_double_hack=True, allow_complex_hack=True) func.prepared_async_call(out._grid, out._block, stream, - indices.gpudata, out.gpudata, indices.size) + indices, out, indices.size) return out @@ -1467,8 +1401,8 @@ def make_func_for_chunk_size(chunk_size): a.bind_to_texref_ext(tex_src[i], allow_double_hack=True) func.prepared_async_call(indices._grid, indices._block, stream, - indices.gpudata, - *([o.gpudata for o in out[chunk_slice]] + indices, + *([o for o in out[chunk_slice]] + [indices.size])) return out @@ -1532,8 +1466,8 @@ def make_func_for_chunk_size(chunk_size): a.bind_to_texref_ext(src_tr, allow_double_hack=True) func.prepared_async_call(src_indices._grid, src_indices._block, stream, - dest_indices.gpudata, src_indices.gpudata, - *([o.gpudata for o in out[chunk_slice]] + dest_indices, src_indices, + *([o for o in out[chunk_slice]] + src_offsets_list[chunk_slice] + [src_indices.size])) @@ -1577,9 +1511,9 @@ def make_func_for_chunk_size(chunk_size): func = make_func_for_chunk_size(vec_count-start_i) func.prepared_async_call(dest_indices._grid, dest_indices._block, stream, - dest_indices.gpudata, - *([o.gpudata for o in out[chunk_slice]] - + [i.gpudata for i in arrays[chunk_slice]] + dest_indices, + *([o for o in out[chunk_slice]] + + [i for i in arrays[chunk_slice]] + [dest_indices.size])) return out @@ -1631,7 +1565,7 @@ def if_positive(criterion, then_, else_, out=None, stream=None): out = empty_like(then_) func.prepared_async_call(criterion._grid, criterion._block, stream, - criterion.gpudata, then_.gpudata, else_.gpudata, out.gpudata, + criterion, then_, else_, out, criterion.size) return out @@ -1646,14 +1580,14 @@ def f(a, b, out=None, stream=None): a.dtype, b.dtype, out.dtype, use_scalar=False) func.prepared_async_call(a._grid, a._block, stream, - a.gpudata, b.gpudata, out.gpudata, a.size) + a, b, out, a.size) elif isinstance(a, GPUArray): if out is None: out = empty_like(a) func = elementwise.get_binary_minmax_kernel(which, a.dtype, a.dtype, out.dtype, use_scalar=True) func.prepared_async_call(a._grid, a._block, stream, - a.gpudata, b, out.gpudata, a.size) + a, b, out, a.size) else: # assuming b is a GPUArray if out is None: out = empty_like(b) @@ -1661,7 +1595,7 @@ def f(a, b, out=None, stream=None): b.dtype, b.dtype, out.dtype, use_scalar=True) # NOTE: we switch the order of a and b here! func.prepared_async_call(b._grid, b._block, stream, - b.gpudata, a, out.gpudata, b.size) + b, a, out, b.size) return out return f From 028c1a63c5da0955bb32454a95d4adf488f83163 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Wed, 21 Feb 2018 15:26:47 -0500 Subject: [PATCH 04/35] Allow setting scalars. --- pycuda/gpuarray.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index 2fe366d4..a714f572 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -912,7 +912,11 @@ def __getitem__(self, index): strides=tuple(new_strides)) def __setitem__(self, index, value): - _memcpy_discontig(self[index], value) + if isinstance(value, GPUArray) or isinstance(value, np.ndarray): + return _memcpy_discontig(self[index], value) + + # Let's assume it's a scalar + self[index].fill(value) # }}} From 97c6b8a706ba37cabca413504f9bcaf32bd8e253 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Wed, 28 Feb 2018 13:46:44 -0500 Subject: [PATCH 05/35] Fix kernel name. 
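get_binary_op_kernel() builds an elementwise kernel for an arbitrary
binary operator, but the generated __global__ function was always named
"multiply", which is misleading for "+", "-" and "/". Rename it to the
operator-neutral "binary_op". A rough usage sketch (not part of this
patch; dtypes and operator are illustrative):

    import numpy as np
    import pycuda.autoinit
    from pycuda.elementwise import get_binary_op_kernel

    # kernel computing z[i] = x[i] - y[i]; the generated function is
    # now named "binary_op" rather than "multiply"
    func = get_binary_op_kernel(np.float32, np.float32, np.float32, "-")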
--- pycuda/elementwise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 7487f25c..ae158d36 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -806,7 +806,7 @@ def get_binary_op_kernel(dtype_x, dtype_y, dtype_z, operator): "tp_z": dtype_to_ctype(dtype_z), }, "z[z_i] = x[x_i] %s y[y_i]" % operator, - "multiply") + "binary_op") @context_dependent_memoize From aa83f431c70b67f27b68ba5bfdc4ba9d3b1075d6 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Wed, 28 Feb 2018 13:52:38 -0500 Subject: [PATCH 06/35] Fix _array_like_helper to work with non-contiguous arrays. --- pycuda/gpuarray.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index a714f572..6fe18ae3 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -1034,10 +1034,10 @@ def _array_like_helper(other_ary, dtype, order): order = "F" else: # array_like routines only return positive strides - strides = [np.abs(s) for s in other_ary.strides] + _, strides, _ = _compact_positive_strides(other_ary) if dtype is not None and dtype != other_ary.dtype: # scale strides by itemsize when dtype is not the same - itemsize = other_ary.nbytes // other_ary.size + itemsize = other_ary.dtype.itemsize itemsize_ratio = np.dtype(dtype).itemsize / itemsize strides = [int(s*itemsize_ratio) for s in strides] elif order not in ["C", "F"]: From d50fa84cabe5c868b4ded64d5dfb91acd44c87b8 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Thu, 15 Mar 2018 10:42:10 -0400 Subject: [PATCH 07/35] Minor, but important fixes to index calculations! --- pycuda/elementwise.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index ae158d36..39bd61e5 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -129,7 +129,7 @@ def create_key(self, grid, block, *args): raise Exception("Grid (%s) and block (%s) specifications should have all '1' except in the first element" % (grid, block)) ndim = len(shape) - numthreads = block[0] + numthreads = block[0] * grid[0] # Use index of minimum stride in first array as a hint on how to # order the traversal of dimensions. We could probably do something @@ -329,7 +329,7 @@ def create_source(self, grid, block, *args): """ % (name, name, dimnum) if dimnum < ndim - 1: loop_inds_inc += """ - if (INDEX_%d > SHAPE_%d) { + if (INDEX_%d >= SHAPE_%d) { """ % (dimnum, dimnum) loop_inds_inc.indent() loop_inds_inc += """ @@ -368,7 +368,7 @@ def create_source(self, grid, block, *args): } """ indtest = DeferredSource() - for name in arraynames: + for name, elemstrides, dimelemstrides, blockelemstrides in arrayarginfos: indtest += r""" if (%s_i > %s || %s_i < 0) { """ % (name, np.sum((np.array(shape) - 1) * np.array(elemstrides)), name) From a0b62fad9146280388607de8c3a9a2ce35c6dafe Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Thu, 15 Mar 2018 10:46:13 -0400 Subject: [PATCH 08/35] Let _memcpy_discontig fall back to generic assignment kernel. --- pycuda/gpuarray.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index 6fe18ae3..304e6283 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -1214,7 +1214,7 @@ def _compact_positive_strides(a): def _memcpy_discontig(dst, src, async=False, stream=None): """Copy the contents of src into dst. 
- The two arrays should have the same dtype, shape, and order, but + The two arrays should have the same dtype, and shape, but not necessarily the same strides. There may be up to _two_ axes along which either `src` or `dst` is not contiguous. """ @@ -1234,8 +1234,15 @@ def _memcpy_discontig(dst, src, async=False, stream=None): return src, dst = _flip_negative_strides((src, dst))[1] + + if any(np.argsort(src.strides) != np.argsort(dst.strides)): + # order is not the same, create a no-op "assignment" kernel + func = elementwise.get_unary_func_kernel("", src.dtype) + func.prepared_async_call(src._grid, src._block, None, + src, dst, src.mem_size) + return - if src.flags.forc and dst.flags.forc: + if src.flags.f_contiguous == dst.flags.f_contiguous or src.flags.c_contiguous == dst.flags.c_contiguous: shape = [src.size] src_strides = dst_strides = [src.dtype.itemsize] else: @@ -1301,7 +1308,11 @@ def _memcpy_discontig(dst, src, async=False, stream=None): elif len(shape) == 3: copy = drv.Memcpy3D() else: - raise ValueError("more than 2 discontiguous axes not supported %s" % (tuple(sorted(axes)),)) + # can't use MemcpyXD, create a no-op "assignment" kernel + func = elementwise.get_unary_func_kernel("", src.dtype) + func.prepared_async_call(src._grid, src._block, None, + src, dst, src.mem_size) + return if isinstance(src, GPUArray): copy.set_src_device(src.gpudata) From 5a9d7d2ef54eaa726dc56fd4e75ab54f81e32c3a Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Thu, 15 Mar 2018 10:47:58 -0400 Subject: [PATCH 09/35] Fix arange for complex arguments. --- pycuda/elementwise.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 39bd61e5..c194a364 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -916,7 +916,9 @@ def get_arange_kernel(dtype): "%(tp)s *z, %(tp)s start, %(tp)s step" % { "tp": dtype_to_ctype(dtype), }, - "z[z_i] = start + z_i*step", + "z[z_i] = start + %(tp)s(z_i)*step" % { + "tp": dtype_to_ctype(dtype), + }, "arange") From cf24d724dfe4c72d54fd4f5e036f23a55634419d Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Thu, 15 Mar 2018 10:48:58 -0400 Subject: [PATCH 10/35] Prettify/rearrange kernel source. 
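The per-dimension macros (SHAPE_n, BLOCK_STEP_n and the per-array
ELEMSTRIDE/DIMELEMSTRIDE/BLOCKELEMSTRIDE defines) are now emitted by a
loop over (name, values) pairs instead of one hand-written block per
macro, and the out-of-bounds debug printf also reports the per-dimension
INDEX_n values. Minimal sketch of the emission pattern (standalone;
shape and block-step values invented):

    shape, block_step = (200, 100), (32, 4)
    defines = []
    for definename, vals in (("SHAPE", shape), ("BLOCK_STEP", block_step)):
        for dimnum, val in enumerate(vals):
            defines.append("#define %s_%d %d" % (definename, dimnum, val))
    # -> #define SHAPE_0 200, #define SHAPE_1 100,
    #    #define BLOCK_STEP_0 32, #define BLOCK_STEP_1 4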
--- pycuda/elementwise.py | 48 ++++++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index c194a364..5b931da9 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -276,21 +276,25 @@ def create_source(self, grid, block, *args): loop_inds_inc = DeferredSource() loop_body = DeferredSource() - for dimnum in range(ndim): - defines += """ - #define SHAPE_%d %d - #define BLOCK_STEP_%d %d - """ % (dimnum, shape[dimnum], - dimnum, block_step[dimnum]) - for name, elemstrides, dimelemstrides, blockelemstrides in arrayarginfos: - basename = "%s_%d" % (name, dimnum) + for definename, vals in ( + ('SHAPE', shape), + ('BLOCK_STEP', block_step), + ): + for dimnum in range(ndim): defines += """ - #define ELEMSTRIDE_%s_%d %d - #define DIMELEMSTRIDE_%s_%d %d - #define BLOCKELEMSTRIDE_%s_%d %d - """ % (name, dimnum, elemstrides[dimnum], - name, dimnum, dimelemstrides[dimnum], - name, dimnum, blockelemstrides[dimnum]) + #define %s_%d %d + """ % (definename, dimnum, vals[dimnum]) + for name, elemstrides, dimelemstrides, blockelemstrides in arrayarginfos: + for definename, vals in ( + ('ELEMSTRIDE', elemstrides), + ('DIMELEMSTRIDE', dimelemstrides), + ('BLOCKELEMSTRIDE', blockelemstrides), + ): + for dimnum in range(ndim): + basename = "%s_%d" % (name, dimnum) + defines += """ + #define %s_%s_%d %d + """ % (definename, name, dimnum, vals[dimnum]) decls += """ unsigned GLOBAL_i = cta_start + tid; @@ -370,18 +374,20 @@ def create_source(self, grid, block, *args): indtest = DeferredSource() for name, elemstrides, dimelemstrides, blockelemstrides in arrayarginfos: indtest += r""" - if (%s_i > %s || %s_i < 0) { + if (%s_i > %s || %s_i < 0) """ % (name, np.sum((np.array(shape) - 1) * np.array(elemstrides)), name) + indtest += r"""{""" indtest.indent() indtest += r""" - printf("cta_start=%%d tid=%%d GLOBAL_i=%%d %s_i=%%d\n", cta_start, tid, GLOBAL_i, %s_i); - break; - """ % (name, name) + printf("cta_start=%%d tid=%%d GLOBAL_i=%%u %s_i=%%ld %s\n", cta_start, tid, GLOBAL_i, %s_i, %s); + """ % (name, ' '.join("INDEX_%d=%%ld" % (dimnum,) for dimnum in range(ndim)), name, ', '.join("INDEX_%d" % (dimnum,) for dimnum in range(ndim))) + indtest += """break;""" indtest.dedent() - indtest += """ - } - """ + indtest += """}""" loop_preop = indtest + loop_preop + tmp_after_loop = after_loop + after_loop = DeferredSource() + after_loop += tmp_after_loop after_loop += r""" if (cta_start == 0 && tid == 0) { printf("DONE CALLING FUNC %s\n"); From 65562c2e13d667ccf9dc80bf45b6747cbb0a7d99 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Thu, 15 Mar 2018 12:43:51 -0400 Subject: [PATCH 11/35] Fix contiguity-match test. --- pycuda/gpuarray.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index 304e6283..8eb613ce 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -1242,7 +1242,8 @@ def _memcpy_discontig(dst, src, async=False, stream=None): src, dst, src.mem_size) return - if src.flags.f_contiguous == dst.flags.f_contiguous or src.flags.c_contiguous == dst.flags.c_contiguous: + if ((src.flags.f_contiguous and dst.flags.f_contiguous) or + (src.flags.c_contiguous and dst.flags.c_contiguous)): shape = [src.size] src_strides = dst_strides = [src.dtype.itemsize] else: From 83a95aac35f24554a5a0e4555950fbfec80bc3c7 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Thu, 22 Mar 2018 13:49:24 -0400 Subject: [PATCH 12/35] Make "slow" memcpy work for host arrays too. 
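The generic fallback in _memcpy_discontig (the no-op "assignment"
kernel) assumed both src and dst live on the device. Factor it out into
_memcpy_discontig_slow(), which stages a host-side ndarray through a
temporary GPUArray with matching shape and strides before or after
running the kernel. Roughly the kind of copy this enables (sketch only;
shapes, dtype and slicing are illustrative):

    import numpy as np
    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray

    dst = gpuarray.zeros((50, 50, 50), np.float64, order="F")
    src = np.random.normal(size=(25, 25, 25))   # C-ordered host array
    # discontiguous device destination with a different axis order:
    # handled by the generic kernel, with src staged on the device first
    dst[0:50:2, 0:50:2, 0:50:2].set(src)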
--- pycuda/gpuarray.py | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index 8eb613ce..fc8f1beb 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -1210,6 +1210,27 @@ def _compact_positive_strides(a): stride *= dim return a, strides, slicer +def _memcpy_discontig_slow(dst, src): + # This is a generic memcpy using a no-op "assignment" kernel. + # NOTE: no need to use this if both src and dst are on host. + func = elementwise.get_unary_func_kernel("", src.dtype) + + # if one array is on host, need to make an intermediate device array + src_gpu = src + dst_gpu = dst + if isinstance(src, np.ndarray): + dtype, order, strides = _array_like_helper(src, None, "K") + src_gpu = GPUArray(src.shape, dtype, order=order, strides=strides) + if isinstance(dst, np.ndarray): + dtype, order, strides = _array_like_helper(dst, None, "K") + dst_gpu = GPUArray(dst.shape, dtype, order=order, strides=strides) + func.prepared_async_call(src_gpu._grid, src_gpu._block, None, + src_gpu, dst_gpu, src.mem_size) + if src is not src_gpu: + src_gpu.get(src) + if dst is not dst_gpu: + dst_gpu.get(dst) + return def _memcpy_discontig(dst, src, async=False, stream=None): """Copy the contents of src into dst. @@ -1236,10 +1257,8 @@ def _memcpy_discontig(dst, src, async=False, stream=None): src, dst = _flip_negative_strides((src, dst))[1] if any(np.argsort(src.strides) != np.argsort(dst.strides)): - # order is not the same, create a no-op "assignment" kernel - func = elementwise.get_unary_func_kernel("", src.dtype) - func.prepared_async_call(src._grid, src._block, None, - src, dst, src.mem_size) + # order is not the same, use generic slow version + _memcpy_discontig_slow(dst, src) return if ((src.flags.f_contiguous and dst.flags.f_contiguous) or @@ -1309,10 +1328,8 @@ def _memcpy_discontig(dst, src, async=False, stream=None): elif len(shape) == 3: copy = drv.Memcpy3D() else: - # can't use MemcpyXD, create a no-op "assignment" kernel - func = elementwise.get_unary_func_kernel("", src.dtype) - func.prepared_async_call(src._grid, src._block, None, - src, dst, src.mem_size) + # can't use MemcpyXD, use generic slow version + _memcpy_discontig_slow(dst, src) return if isinstance(src, GPUArray): From 0300f3141e548813d8a187a1fca8c0c31e1cf076 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Thu, 22 Mar 2018 15:48:41 -0400 Subject: [PATCH 13/35] Just in case there are any strides == 0. --- pycuda/gpuarray.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index fc8f1beb..bdc32351 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -1177,13 +1177,15 @@ def _flip_negative_strides(arrays): slicer = None ndim = arrays[0].ndim shape = arrays[0].shape - for t in zip(range(ndim), *[np.sign(x.strides) for x in arrays]): - axis = t[0] - stride_sign = t[1] + for axis, stride_signs in enumerate(zip(*[np.sign(x.strides) for x in arrays])): + max_sign = np.max(stride_signs) + min_sign = np.min(stride_signs) if len(arrays) > 1: - if not np.all(t[2:] == stride_sign): + # Stride signs are -1 or 1, or 0 if stride is 0. + # Zero strides don't matter, but fail if there are both -1 and 1. 
+ if max_sign == 1 and min_sign == -1: raise ValueError("found differing signs in strides for dimension %d (strides for all arrays: %s)" % (axis, [x.strides for x in arrays])) - if stride_sign == -1: + if min_sign == -1: if slicer is None: slicer = [slice(None)] * ndim slicer[axis] = slice(None, None, -1) @@ -1256,7 +1258,9 @@ def _memcpy_discontig(dst, src, async=False, stream=None): src, dst = _flip_negative_strides((src, dst))[1] - if any(np.argsort(src.strides) != np.argsort(dst.strides)): + if (0 in src.strides or + 0 in dst.strides or + any(np.argsort(src.strides) != np.argsort(dst.strides))): # order is not the same, use generic slow version _memcpy_discontig_slow(dst, src) return From 42804f4ba484e06dd964f8f9d47ad9c766ed1eaa Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Fri, 23 Mar 2018 14:30:28 -0400 Subject: [PATCH 14/35] Fix memcpy_discontig_slow for host-side src. --- pycuda/gpuarray.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index bdc32351..5146ee8e 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -1223,11 +1223,12 @@ def _memcpy_discontig_slow(dst, src): if isinstance(src, np.ndarray): dtype, order, strides = _array_like_helper(src, None, "K") src_gpu = GPUArray(src.shape, dtype, order=order, strides=strides) + src_gpu.set(src) if isinstance(dst, np.ndarray): dtype, order, strides = _array_like_helper(dst, None, "K") dst_gpu = GPUArray(dst.shape, dtype, order=order, strides=strides) func.prepared_async_call(src_gpu._grid, src_gpu._block, None, - src_gpu, dst_gpu, src.mem_size) + src_gpu, dst_gpu, src_gpu.mem_size) if src is not src_gpu: src_gpu.get(src) if dst is not dst_gpu: From 2cb1a9239a9721773e5452e039f565bca1747aee Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Fri, 23 Mar 2018 15:29:26 -0400 Subject: [PATCH 15/35] Fix some instances of incompatible strides. --- pycuda/gpuarray.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index 5146ee8e..e28be096 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -1257,11 +1257,15 @@ def _memcpy_discontig(dst, src, async=False, stream=None): dst[...] = src return - src, dst = _flip_negative_strides((src, dst))[1] + doslow = (0 in src.strides) or (0 in dst.strides) + try: + src, dst = _flip_negative_strides((src, dst))[1] + except ValueError: + # probably different signs in strides -- will use slow version below + doslow = True + pass - if (0 in src.strides or - 0 in dst.strides or - any(np.argsort(src.strides) != np.argsort(dst.strides))): + if doslow or any(np.argsort(src.strides) != np.argsort(dst.strides)): # order is not the same, use generic slow version _memcpy_discontig_slow(dst, src) return @@ -1289,11 +1293,8 @@ def _memcpy_discontig(dst, src, async=False, stream=None): axes[0:0] = [np.newaxis] # collapse contiguous dimensions - # and check that dst is in same order as src i = 1 while i < len(shape): - if dst_strides[i] < dst_strides[i-1]: - raise ValueError("src and dst must have same order") if (src_strides[i-1] * shape[i-1] == src_strides[i] and dst_strides[i-1] * shape[i-1] == dst_strides[i]): shape[i-1:i+1] = [shape[i-1] * shape[i]] From 540516ed76f818303607c9d381ebe431e8a9730a Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Tue, 29 May 2018 11:03:22 -0400 Subject: [PATCH 16/35] Compile regexes once per source object. 
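DeferredSource.generate() re-ran re.compile() for the leading/trailing
whitespace pattern and the "%(name)s" placeholder pattern on every
subsource; compile both once in __init__() and reuse them. Illustrative
check of what the cached placeholder pattern matches (not part of the
patch):

    import re
    regex_format = re.compile(r"%\(([^\)]*)\)([a-zA-Z])")
    m = regex_format.search("  %(operation)s;")
    assert (m.group(1), m.group(2)) == ("operation", "s")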
--- pycuda/deferred.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pycuda/deferred.py b/pycuda/deferred.py index 97f28da7..6b141d48 100644 --- a/pycuda/deferred.py +++ b/pycuda/deferred.py @@ -27,6 +27,8 @@ def __init__(self, subsources=None, base_indent=0, indent_step=2): if subsources is None: subsources = [] self.subsources = subsources + self._regex_space = re.compile(r"^(\s*)(.*?)(\s*)$") + self._regex_format = re.compile(r"%\(([^\)]*)\)([a-zA-Z])") def __str__(self): return self.generate() @@ -53,13 +55,11 @@ def generate(self, indent=0, indent_first=True, get_list=False): retval = retval + subsource.generate(indent=(indent + subindent), get_list=get_list) continue lines = subsource.split("\n") - regex_space = re.compile(r"^(\s*)(.*?)(\s*)$") - regex_format = re.compile(r"%\(([^\)]*)\)([a-zA-Z])") minstrip = None newlines = [] for line in lines: linelen = len(line) - space_match = regex_space.match(line) + space_match = self._regex_space.match(line) end_leading_space = space_match.end(1) begin_trailing_space = space_match.start(3) if strip_space: @@ -74,7 +74,7 @@ def generate(self, indent=0, indent_first=True, get_list=False): newlinelist = None newline = '' curpos = 0 - matches = list(regex_format.finditer(line, end_leading_space)) + matches = list(self._regex_format.finditer(line, end_leading_space)) nummatches = len(matches) for match in matches: formatchar = match.group(2) From ca6434e25b0dd533079f08062750ede0121f5d38 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Thu, 31 May 2018 11:35:24 -0400 Subject: [PATCH 17/35] Faster _flip_negative_strides (and earlier short-circuit), plus a variable name change from "slicer" to "flipper". --- pycuda/gpuarray.py | 80 ++++++++++++++++++++++++---------------------- 1 file changed, 42 insertions(+), 38 deletions(-) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index e28be096..ccf6a1e6 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -238,14 +238,14 @@ def set_async(self, ary, stream=None): return self.set(ary, async=True, stream=stream) def get(self, ary=None, pagelocked=False, async=False, stream=None): - slicer = None + flipper = None if ary is None: if pagelocked: ary = drv.pagelocked_empty(self.shape, self.dtype) else: ary = np.empty(self.shape, self.dtype) - self, strides, slicer = _compact_positive_strides(self) + self, strides, flipper = _compact_positive_strides(self) ary = _as_strided(ary, strides=strides) else: if self.size != ary.size: @@ -262,8 +262,8 @@ def get(self, ary=None, pagelocked=False, async=False, stream=None): if self.size: _memcpy_discontig(ary, self, async=async, stream=stream) - if slicer: - ary = ary[slicer] + if flipper is not None: + ary = ary[flipper] return ary def get_async(self, stream=None, ary=None): @@ -352,7 +352,7 @@ def _div(self, other, out, stream=None): return out def _new_like_me(self, dtype=None, order="K"): - slicer, selflist = _flip_negative_strides((self,)) + flipper, selflist = _flip_negative_strides((self,)) self = selflist[0] ndim = self.ndim shape = self.shape @@ -394,8 +394,8 @@ def _new_like_me(self, dtype=None, order="K"): newstrides = _f_contiguous_strides(itemsize, shape) retval = self.__class__(shape, dtype, allocator=self.allocator, strides=newstrides) - if slicer: - retval = retval[slicer] + if flipper is not None: + retval = retval[flipper] return retval # operators --------------------------------------------------------------- @@ -990,21 +990,21 @@ def conj(self): def to_gpu(ary, allocator=drv.mem_alloc): """converts a numpy array 
to a GPUArray""" - ary, newstrides, slicer = _compact_positive_strides(ary) + ary, newstrides, flipper = _compact_positive_strides(ary) result = GPUArray(ary.shape, ary.dtype, allocator, strides=newstrides) result.set(ary) - if slicer: - result = result[slicer] + if flipper is not None: + result = result[flipper] return result def to_gpu_async(ary, allocator=drv.mem_alloc, stream=None): """converts a numpy array to a GPUArray""" - ary, newstrides, slicer = _compact_positive_strides(ary) + ary, newstrides, flipper = _compact_positive_strides(ary) result = GPUArray(ary.shape, ary.dtype, allocator, strides=newstrides) result.set_async(ary, stream) - if slicer: - result = result[slicer] + if flipper: + result = result[flipper] return result @@ -1166,40 +1166,44 @@ class Info(Record): def _flip_negative_strides(arrays): # If arrays have negative strides, flip them. Return a list - # ``(slicer, arrays)`` where ``slicer`` is a tuple of slice objects - # used to flip the arrays (or ``None`` if there was no flipping), - # and ``arrays`` is the list of flipped arrays. + # ``(flipper, arrays)`` where ``flipper`` is a tuple of slice + # objects that can used to unflip the returned (flipped) arrays + # (or ``None`` if there was no flipping). # NOTE: Every input array A must have the same value for the following # expression: np.sign(A.strides) # NOTE: ``slicer`` is its own inverse, so ``A[slicer][slicer] == A`` if isinstance(arrays, GPUArray): raise TypeError("_flip_negative_strides expects a list of GPUArrays") - slicer = None - ndim = arrays[0].ndim - shape = arrays[0].shape - for axis, stride_signs in enumerate(zip(*[np.sign(x.strides) for x in arrays])): - max_sign = np.max(stride_signs) - min_sign = np.min(stride_signs) - if len(arrays) > 1: - # Stride signs are -1 or 1, or 0 if stride is 0. - # Zero strides don't matter, but fail if there are both -1 and 1. - if max_sign == 1 and min_sign == -1: - raise ValueError("found differing signs in strides for dimension %d (strides for all arrays: %s)" % (axis, [x.strides for x in arrays])) - if min_sign == -1: - if slicer is None: - slicer = [slice(None)] * ndim - slicer[axis] = slice(None, None, -1) - if slicer is not None: - slicer = tuple(slicer) - arrays = [x[slicer] for x in arrays] - return slicer, arrays - + if not any(any(s < 0 for s in x.strides) for x in arrays): + return None, arrays + strides = np.vstack(x.strides for x in arrays) + shapes = np.vstack(x.shape for x in arrays) + signs = np.sign(strides) + max_signs = np.max(signs, axis=0) + min_signs = np.min(signs, axis=0) + # Stride signs are -1 or 1, or 0 if stride is 0. + # Zero strides don't matter, but fail if there are both -1 and 1. + if _builtin_min(min_signs) != -1: + # no flips needed + return None, arrays + flips = (min_signs == -1)[None, :] + if any(max_signs * min_signs == -1): + raise ValueError("found differing signs in strides (strides for all arrays: %s)" % ([x.strides for x in arrays],)) + offsets = np.sum(strides * shapes * flips, axis=1) + strides = strides * (flips * -1) + sliceboth = (slice(None), slice(None, None, -1)) + flipper = tuple(sliceboth[flip] for flip in flips[0]) + arrays = [ + arr[flipper] + for arr in arrays + ] + return flipper, arrays def _compact_positive_strides(a): # Flip ``a``'s axes if there are any negative strides, then compute # strides to have same order as a, but packed. Return flipped ``a`` # and packed strides. 
- slicer, alist = _flip_negative_strides((a,)) + flipper, alist = _flip_negative_strides((a,)) a = alist[0] info = sorted( (a.strides[axis], a.shape[axis], axis) @@ -1210,7 +1214,7 @@ def _compact_positive_strides(a): for _, dim, axis in info: strides[axis] = stride stride *= dim - return a, strides, slicer + return a, strides, flipper def _memcpy_discontig_slow(dst, src): # This is a generic memcpy using a no-op "assignment" kernel. From 0ead8e28fda344ee38f9a4df0858ce3f9dc34f8a Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Thu, 31 May 2018 11:48:44 -0400 Subject: [PATCH 18/35] Be smarter/faster calculating keys for deferred source. --- pycuda/deferred.py | 25 ++++---- pycuda/elementwise.py | 91 +++++++++++++-------------- src/wrapper/wrap_cudadrv.cpp | 119 +++++++++++++++++++++++++++++++++++ 3 files changed, 175 insertions(+), 60 deletions(-) diff --git a/pycuda/deferred.py b/pycuda/deferred.py index 6b141d48..0dfd45a8 100644 --- a/pycuda/deferred.py +++ b/pycuda/deferred.py @@ -441,12 +441,13 @@ class DeferredSourceModule(SourceModule): (or ``DeferredSource`` object) that should be compiled. ``grid``, ``block``, and ``*args`` are the same arguments that were sent to the ``DeferredFunction`` call functions above. - The function ``create_key(self, grid, block, *args)`` is always - called before ``create_source`` and the key returned is used to cache - any compiled functions. Default return value of ``create_key()`` is - None, which means to use the function name and generated source as the - key. The return value of ``create_key()`` must be usable as a hash - key. + The function ``create_key(self, grid, block, *args)`` returns a tuple + ``(key, precalc)``. ``create_key`` is always called before + ``create_source`` and any pre-calculated info in ``precalc`` is sent back + to ``create_source``; ``key`` is used to cache any compiled functions. + Default return value of ``create_key()`` is (None, None), which means to + use the function name and generated source as the key. The key returned + by ``create_key()`` must be usable as a hash key. ''' _cache = {} @@ -467,25 +468,25 @@ def _delayed_compile(self, source): return _delayed_compile_aux(source, self._compileargs) def create_key(self, grid, block, *funcargs): - return None + return (None, None) def create_source(self, grid, block, *funcargs): raise NotImplementedError("create_source must be overridden!") def _delayed_get_function(self, funcname, funcargs, grid, block): ''' - If ``create_key()`` returns non-None, then it is used as the key - to cache compiled functions. Otherwise the return value of - ``create_source()`` is used as the key. + If the first element of the tuple returned by ``create_key()`` is + not None, then it is used as the key to cache compiled functions. + Otherwise the return value of ``create_source()`` is used as the key. 
''' context = pycuda.driver.Context.get_current() funccache = DeferredSourceModule._cache.get(context, None) if funccache is None: funccache = self._cache[context] = {} - key = self.create_key(grid, block, *funcargs) + (key, precalc) = self.create_key(grid, block, *funcargs) funckey = (funcname, key) if key is None or funckey not in funccache: - source = self.create_source(grid, block, *funcargs) + source = self.create_source(precalc, grid, block, *funcargs) if isinstance(source, DeferredSource): source = source.generate() if key is None: diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 5b931da9..3d2901a5 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -37,6 +37,7 @@ from pycuda.tools import dtype_to_ctype, VectorArg, ScalarArg from pytools import memoize_method from pycuda.deferred import DeferredSourceModule, DeferredSource +import pycuda._driver as _drv class ElementwiseSourceModule(DeferredSourceModule): ''' @@ -67,27 +68,25 @@ def __init__(self, arguments, operation, self._shape_arg_index = shape_arg_index self._init_args = (tuple(arguments), operation, name, preamble, loop_prep, after_loop) + self._init_args_repr = repr(self._init_args) self._debug = debug - def create_key(self, grid, block, *args): - (arguments, operation, - funcname, preamble, loop_prep, after_loop) = self._init_args - shape_arg_index = self._shape_arg_index - + @profile + def _precalc_array_info(self, args, arguments, shape_arg_index): # 'args' is the list of actual parameters being sent to the kernel # 'arguments' is the list of argument descriptors (VectorArg, ScalarArg) - arraypairs = [] + arrayarginds = [] contigmatch = True arrayspecificinds = True shape = None - size = None order = None - for i, argpair in enumerate(zip(args, arguments)): - arg, arg_descr = argpair + for i, arg_descr in enumerate(arguments): + arg_descr = arguments[i] if isinstance(arg_descr, VectorArg): # is a GPUArray/DeviceAllocation - arraypairs.append(argpair) + arg = args[i] + arrayarginds.append(i) if not arrayspecificinds: continue if not hasattr(arg, 'shape'): @@ -98,30 +97,35 @@ def create_key(self, grid, block, *args): arrayspecificinds = False continue curshape = arg.shape - cursize = arg.size curorder = 'N' - if arg.flags.f_contiguous: - curorder = 'F' - elif arg.flags.c_contiguous: - curorder = 'C' - if shape is None: - shape = curshape - size = cursize - order = curorder - elif curorder == 'N' or order != curorder: - contigmatch = False - elif shape_arg_index is None and shape != curshape: + if contigmatch: + if arg.flags.f_contiguous: + curorder = 'F' + elif arg.flags.c_contiguous: + curorder = 'C' + if shape is None: + shape = curshape + order = curorder + elif curorder == 'N' or order != curorder: + contigmatch = False + if shape_arg_index is None and shape != curshape: raise Exception("All input arrays to elementwise kernels must have the same shape, or you must specify the argument that has the canonical shape with shape_arg_index; found shapes %s and %s" % (shape, curshape)) if shape_arg_index == i: shape = curshape - self._contigmatch = contigmatch - self._arraypairs = arraypairs - self._arrayspecificinds = arrayspecificinds + return (contigmatch, arrayarginds, arrayspecificinds, shape) + + + @profile + def create_key(self, grid, block, *args): + #print "Calling _precalc_array_info(args=%s, self._init_args[0]=%s, self._shape_arg_index=%s)\n" % (args, self._init_args[0], self._shape_arg_index) + #precalc = self._precalc_array_info(args, self._init_args[0], self._shape_arg_index) + 
precalc = _drv._precalc_array_info(args, self._init_args[0], self._shape_arg_index) + (contigmatch, arrayarginds, arrayspecificinds, shape) = precalc if contigmatch: - key = repr(self._init_args) - return key + key = self._init_args_repr + return key, (contigmatch, arrayarginds, arrayspecificinds, shape, None, None, None, None, None) # Arrays are not contiguous or different order @@ -130,6 +134,7 @@ def create_key(self, grid, block, *args): ndim = len(shape) numthreads = block[0] * grid[0] + arguments = self._init_args[0] # Use index of minimum stride in first array as a hint on how to # order the traversal of dimensions. We could probably do something @@ -140,9 +145,8 @@ def create_key(self, grid, block, *args): # ensure that inputs have the same order, and explicitly send # shape_arg_index to turn this off. do_reverse = False - if (shape_arg_index is None and - np.argmin(np.abs(arraypairs[0][0].strides)) > ndim // 2): - print "traversing dimensions in reverse order" + if (self._shape_arg_index is None and + np.argmin(np.abs(args[arrayarginds[0]].strides)) > ndim // 2): # traverse dimensions in reverse order do_reverse = True if do_reverse: @@ -155,7 +159,9 @@ def create_key(self, grid, block, *args): tmp = tmp // block_step[dimnum] block_step[dimnum] = newstep arrayarginfos = [] - for arg, arg_descr in arraypairs: + for i in arrayarginds: + arg = args[i] + arg_descr = arguments[i] if do_reverse: elemstrides = np.array(arg.strides[::-1]) // arg.itemsize else: @@ -172,22 +178,17 @@ def create_key(self, grid, block, *args): self._shape = shape self._block_step = block_step - key = (self._init_args, grid, block, shape, tuple(self._arrayarginfos)) + key = (self._init_args_repr, grid, block, shape, tuple(self._arrayarginfos)) - return key + return key, (contigmatch, arrayarginds, arrayspecificinds, shape, arrayarginfos, ndim, numthreads, shape, block_step) - def create_source(self, grid, block, *args): - # Precondition: create_key() must have been run with the same arguments + def create_source(self, precalc, grid, block, *args): + (contigmatch, arrayarginds, arrayspecificinds, shape, arrayarginfos, ndim, numthreads, shape, block_step) = precalc (arguments, operation, funcname, preamble, loop_prep, after_loop) = self._init_args - contigmatch = self._contigmatch - if contigmatch: - arraypairs = self._arraypairs - arrayspecificinds = self._arrayspecificinds - indtype = 'unsigned' if self._do_range: indtype = 'long' @@ -195,10 +196,10 @@ def create_source(self, grid, block, *args): # All arrays are contiguous and same order (or we don't know and # it's up to the caller to make sure it works) if arrayspecificinds: - for arg, arg_descr in arraypairs: + for i in arrayarginds: preamble = preamble + """ #define %s_i i - """ % (arg_descr.name,) + """ % (arguments[i].name,) if self._do_range: loop_body = """ if (step < 0) @@ -261,12 +262,6 @@ def create_source(self, grid, block, *args): # Arrays are not contiguous or different order - arrayarginfos = self._arrayarginfos - ndim = self._ndim - numthreads = self._numthreads - shape = self._shape - block_step = self._block_step - arraynames = [ x[0] for x in arrayarginfos ] defines = DeferredSource() diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index 8a2488de..1f0af564 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -17,6 +17,120 @@ +namespace precalc { + namespace py = boost::python; + py::tuple _precalc_array_info(const py::object & args, const py::object & arg_descrs, const py::object & 
shape_arg_index) { + bool contigmatch = true; + bool arrayspecificinds = true; + std::vector arrayarginds; + std::vector shape; + py::object shape_obj; + char order = 'N'; + int numargs = py::len(args); + + for (int i = 0; i < numargs; i++) { + py::object arg_descr = py::object(arg_descrs[i]); + // below is our version of isinstance(arg_descr, VectorArg) + py::object struct_char; + try { + struct_char = arg_descr.attr("struct_char"); + } catch (...) { + continue; + } + if (py::extract(struct_char) != 'P') { + continue; + } + // is a GPUArray/DeviceAllocation + //py::object arg = py::object(args[i]); + py::object arg = args[i]; + arrayarginds.push_back(i); + if (!arrayspecificinds) { + continue; + } + py::object curshape_obj; + py::object curstrides_obj; + int itemsize; + int ndim; + try { + curshape_obj = arg.attr("shape"); + curstrides_obj = arg.attr("strides"); + itemsize = py::extract(arg.attr("itemsize")); + ndim = py::extract(arg.attr("ndim")); + } catch (...) { + // At least one array argument is probably sent as a + // GPUArray.gpudata rather than the GPUArray itself, + // so disable array-specific indices -- caller is on + // their own. + arrayspecificinds = false; + continue; + } + std::vector curshape(ndim); + std::vector curstrides(ndim); + for (int i = 0; i < ndim; i++) { + curshape[i] = py::extract(curshape_obj[i]); + curstrides[i] = py::extract(curstrides_obj[i]); + } + if (contigmatch) { + char curorder = 'N'; + int tmpaccum = itemsize; + bool is_f = (curstrides[0] == tmpaccum); + tmpaccum *= curshape[0]; + for (int i = 1; i < ndim; i++) { + if (curshape[i] == 1) { + continue; + } + if (curstrides[i] != tmpaccum) { + is_f = false; + break; + } + tmpaccum *= curshape[i]; + } + tmpaccum = itemsize; + bool is_c = (curstrides[ndim - 1] == tmpaccum); + tmpaccum *= curshape[ndim - 1]; + for (int i = ndim - 2; i >= 0; i--) { + if (curshape[i] == 1) { + continue; + } + if (curstrides[i] != tmpaccum) { + is_c = false; + break; + } + tmpaccum *= curshape[i]; + } + if (is_f) { + curorder = 'F'; + } + if (is_c) { + curorder = 'C'; + } + if (shape.size() == 0) { + shape = curshape; + shape_obj = curshape_obj; + order = curorder; + } else if (curorder == 'N' || order != curorder) { + contigmatch = false; + } + } + if (shape_arg_index.is_none() && shape != curshape) { + PyErr_SetString(PyExc_RuntimeError, "All input arrays to elementwise kernels must have the same shape, or you must specify the argument that has the canonical shape with shape_arg_index"); + py::throw_error_already_set(); + } + if (shape_arg_index == i) { + shape = curshape; + shape_obj = curshape_obj; + } + } + py::list arrayarginds_obj; + { + typename std::vector::iterator iter = arrayarginds.begin(); + for (/*null*/; iter != arrayarginds.end(); iter++) { + arrayarginds_obj.append(*iter); + } + } + return py::make_tuple(contigmatch, arrayarginds_obj, arrayspecificinds, shape_obj); + } +} using namespace pycuda; using boost::shared_ptr; @@ -1657,6 +1771,11 @@ BOOST_PYTHON_MODULE(_driver) #ifdef HAVE_CURAND pycuda_expose_curand(); #endif + + { + using namespace precalc; + DEF_SIMPLE_FUNCTION_WITH_ARGS(_precalc_array_info, ("args", "arg_descrs", "shape_arg_index")); + } } // vim: foldmethod=marker From eed20f8459b851bc8d04c13a00063c051da02fc1 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Thu, 31 May 2018 11:50:06 -0400 Subject: [PATCH 19/35] Fix prototype. 
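The previous commit changed the protocol so that create_key() returns a
(key, precalc) tuple and precalc is handed back to create_source();
bring the base-class create_source() stub in line with that calling
convention. The override contract now looks roughly like this (class
name and source text are illustrative only):

    from pycuda.deferred import DeferredSourceModule

    class MyModule(DeferredSourceModule):
        def create_key(self, grid, block, *args):
            precalc = (grid, block)        # anything computed once
            return (None, precalc)         # None key: cache on name+source

        def create_source(self, precalc, grid, block, *args):
            return "__global__ void noop() { }"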
--- pycuda/deferred.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pycuda/deferred.py b/pycuda/deferred.py index 0dfd45a8..861c087e 100644 --- a/pycuda/deferred.py +++ b/pycuda/deferred.py @@ -470,7 +470,7 @@ def _delayed_compile(self, source): def create_key(self, grid, block, *funcargs): return (None, None) - def create_source(self, grid, block, *funcargs): + def create_source(self, precalc, grid, block, *funcargs): raise NotImplementedError("create_source must be overridden!") def _delayed_get_function(self, funcname, funcargs, grid, block): From 6f092662e00ad9724a156de094994cbe6bded1b4 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Thu, 31 May 2018 11:51:15 -0400 Subject: [PATCH 20/35] Add tests for non-contiguous arrays. --- test/test_gpuarray.py | 121 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 121 insertions(+) diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py index e38e2fda..f00a4e66 100644 --- a/test/test_gpuarray.py +++ b/test/test_gpuarray.py @@ -1082,6 +1082,12 @@ def test_copy(self): for start, stop, step in [(0,3,1), (1,2,1), (0,3,3)]: assert np.allclose(a_gpu[start:stop:step,:,start:stop:step].get(), a_gpu.get()[start:stop:step,:,start:stop:step]) + # try 3 axes discontiguous + a_gpu = curand((10,10,10,10)) + a = a_gpu.get() + for start, stop, step in [(0,3,1), (1,2,1), (0,3,3)]: + assert np.allclose(a_gpu[start:stop:step,start:stop:step,start:stop:step].get(), a_gpu.get()[start:stop:step,start:stop:step,start:stop:step]) + @mark_cuda_test def test_get_set(self): import pycuda.gpuarray as gpuarray @@ -1145,6 +1151,121 @@ def test_zeros_like_etc(self): assert new_z.dtype == np.complex64 assert new_z.shape == arr.shape + @mark_cuda_test + def test_noncontig_get(self): + from pycuda.curandom import rand as curand + a_gpu = curand((200,200)) + a = a_gpu.get() + + ret_a = a[2::3, 1::4] + + ret_a_gpu = (a_gpu[2::3, 1::4]).get() + + assert np.allclose(ret_a, ret_a_gpu) + + @mark_cuda_test + def test_noncontig_get2(self): + from pycuda.curandom import rand as curand + a_gpu = curand((200,200)) + a = a_gpu.get() + + ret_a = a[2::3,::-1] + + ret_a_gpu = (a_gpu[2::3,::-1]).get() + + assert np.allclose(ret_a, ret_a_gpu) + + @mark_cuda_test + def test_noncontig_set(self): + import pycuda.gpuarray as gpuarray + + # test different orders, transpositions, offsets, and sizes of both + # src (host) and dst (device) + for host_order in ('C', 'F'): + for dev_order in ('C', 'F'): + for transpose in ((0,1,2), (0,2,1), (1,2,0), (1,0,2), (2,0,1), (2,1,0)): + for start, stop, step in [(0, 50, 2), (1, 50, 2), (49, None, -2), (48, None, -2)]: + print("host_order=%s dev_order=%s transpose=%s start=%s stop=%s step=%s" % (host_order, dev_order, transpose, start, stop, step)) + a = np.array(np.random.normal(0., 1., (25,25,25)), order=host_order) + a_gpu = gpuarray.zeros((50, 50, 50), np.float64, order=dev_order) + a_gpu_sub = a_gpu[start:stop:step,start:stop:step,start:stop:step] + a_gpu_sub.set(a) + assert np.allclose(a_gpu_sub.get(), a) + assert np.allclose(a_gpu.get()[start:stop:step,start:stop:step,start:stop:step], a) + + + @mark_cuda_test + def test_noncontig_unary(self): + from pycuda.curandom import rand as curand + a_gpu = curand((200,200))[1::4, ::-2] + a = a_gpu.get() + + ret_a = a ** 2 + + ret_a_gpu = (a_gpu ** 2).get() + + assert np.allclose(ret_a, ret_a_gpu) + + @mark_cuda_test + def test_noncontig_different_strides(self): + x = 200; y = 200 + + a_gpu = gpuarray.arange(0, x*y, 1, dtype=np.float32).reshape((x, y), order='C') + a = a_gpu.get() + 
assert a_gpu.strides[0] > a_gpu.strides[1], "a_gpu not C-order" + + b_gpu = gpuarray.zeros_like(a_gpu, order='F') + b_gpu += a_gpu + b = b_gpu.get() + assert b_gpu.strides[1] > b_gpu.strides[0], "b_gpu not F-order" + + assert np.allclose(a, b) + + @mark_cuda_test + def test_noncontig_stride_tricky(self): + from pycuda.curandom import rand as curand + a_gpu = curand((200,200)) + a = a_gpu.get() + + a_gpu = a_gpu[1::4, ::-2] + a = a[1::4, ::-2] + + a_gpu = gpuarray.GPUArray(a_gpu.shape, a_gpu.dtype, + base=a_gpu, + gpudata=a_gpu.gpudata, + strides=(0,0)) + a.strides = (0, 0) + + ret_a = a + + ret_a_gpu = a_gpu.get() + + assert np.allclose(ret_a, ret_a_gpu) + + @mark_cuda_test + def test_noncontig_stride_tricky2(self): + # sliding window using tricks + from numpy.lib.stride_tricks import as_strided + from pycuda.curandom import rand as curand + a_gpu = curand((200000)) + a = a_gpu.get() + + a_gpu = gpuarray.GPUArray((199998, 3), a_gpu.dtype, + base=a_gpu, + gpudata=a_gpu.gpudata, + strides=(a_gpu.itemsize, a_gpu.itemsize)) + a = as_strided(a, shape=(199998, 3), strides=(a.itemsize, a.itemsize)) + + ret_a = a + ret_a_gpu = a_gpu.get() + + assert np.allclose(ret_a, ret_a_gpu) + + ret_a = a + 2 + ret_a_gpu = (a_gpu + 2).get() + + assert np.allclose(ret_a, ret_a_gpu) + if __name__ == "__main__": # make sure that import failures get reported, instead of skipping the tests. From a4cfd3696ac382b4036cee3b3c898b437c7ee6aa Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Thu, 31 May 2018 16:26:32 -0400 Subject: [PATCH 21/35] Don't need to transfer src to host. --- pycuda/gpuarray.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index ccf6a1e6..a30f986d 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -1233,8 +1233,6 @@ def _memcpy_discontig_slow(dst, src): dst_gpu = GPUArray(dst.shape, dtype, order=order, strides=strides) func.prepared_async_call(src_gpu._grid, src_gpu._block, None, src_gpu, dst_gpu, src_gpu.mem_size) - if src is not src_gpu: - src_gpu.get(src) if dst is not dst_gpu: dst_gpu.get(dst) return From ba984f49233c5e53801cf93049d1cfb399acb8b0 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Thu, 31 May 2018 16:28:05 -0400 Subject: [PATCH 22/35] Ignore zero-strides in singleton dimensions (which will be culled later anyway). --- pycuda/gpuarray.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index a30f986d..fb5da57f 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -1259,7 +1259,13 @@ def _memcpy_discontig(dst, src, async=False, stream=None): dst[...] = src return - doslow = (0 in src.strides) or (0 in dst.strides) + # check for stride tricks (like stride == 0 with shape > 1) + shape = src.shape + doslow = (any(sh > 1 for sh, st in zip(shape, src.strides) if st == 0) + or + any(sh > 1 for sh, st in zip(shape, dst.strides) if st == 0)) + del shape + try: src, dst = _flip_negative_strides((src, dst))[1] except ValueError: From 084a15d14af7a1f628dfd571b7f6fddd6951bf9d Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Mon, 25 Jun 2018 15:46:06 -0400 Subject: [PATCH 23/35] "precalc" performance improvements. 
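create_key() runs on every kernel call, so stop rediscovering which
arguments are arrays each time: the indices of the VectorArg parameters
are computed once in __init__() and passed straight to the C++
_precalc_array_info() helper, which no longer has to probe every
argument descriptor's struct_char, and which now skips zero-dimensional
(scalar-like) arguments. The precomputation amounts to (sketch; the
argument list is illustrative):

    import numpy as np
    from pycuda.tools import VectorArg, ScalarArg

    arguments = [VectorArg(np.float32, "z"), VectorArg(np.float32, "x"),
                 ScalarArg(np.float32, "a")]
    # done once per module instance, not once per call
    arrayarginds = [i for i, arg in enumerate(arguments)
                    if isinstance(arg, VectorArg)]
    assert arrayarginds == [0, 1]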
--- pycuda/elementwise.py | 68 +++++++++++++++++------------------- src/wrapper/wrap_cudadrv.cpp | 41 +++++++--------------- 2 files changed, 45 insertions(+), 64 deletions(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 3d2901a5..eec90b66 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -69,58 +69,54 @@ def __init__(self, arguments, operation, self._init_args = (tuple(arguments), operation, name, preamble, loop_prep, after_loop) self._init_args_repr = repr(self._init_args) + self._arrayarginds = [i for i in range(len(arguments)) if isinstance(arguments[i], VectorArg)] self._debug = debug @profile - def _precalc_array_info(self, args, arguments, shape_arg_index): + def _precalc_array_info(self, args, shape_arg_index): # 'args' is the list of actual parameters being sent to the kernel - # 'arguments' is the list of argument descriptors (VectorArg, ScalarArg) - arrayarginds = [] contigmatch = True arrayspecificinds = True shape = None order = None - for i, arg_descr in enumerate(arguments): - arg_descr = arguments[i] - if isinstance(arg_descr, VectorArg): - # is a GPUArray/DeviceAllocation - arg = args[i] - arrayarginds.append(i) - if not arrayspecificinds: - continue - if not hasattr(arg, 'shape'): - # At least one array argument is probably sent as a - # GPUArray.gpudata rather than the GPUArray itself, - # so disable array-specific indices -- caller is on - # their own. - arrayspecificinds = False - continue - curshape = arg.shape - curorder = 'N' - if contigmatch: - if arg.flags.f_contiguous: - curorder = 'F' - elif arg.flags.c_contiguous: - curorder = 'C' - if shape is None: - shape = curshape - order = curorder - elif curorder == 'N' or order != curorder: - contigmatch = False - if shape_arg_index is None and shape != curshape: - raise Exception("All input arrays to elementwise kernels must have the same shape, or you must specify the argument that has the canonical shape with shape_arg_index; found shapes %s and %s" % (shape, curshape)) - if shape_arg_index == i: + for i in self._arrayarginds: + # is a GPUArray/DeviceAllocation + arg = args[i] + if not arrayspecificinds: + continue + if not hasattr(arg, 'shape'): + # At least one array argument is probably sent as a + # GPUArray.gpudata rather than the GPUArray itself, + # so disable array-specific indices -- caller is on + # their own. 
+ arrayspecificinds = False + continue + curshape = arg.shape + curorder = 'N' + if contigmatch: + if arg.flags.f_contiguous: + curorder = 'F' + elif arg.flags.c_contiguous: + curorder = 'C' + if shape is None: shape = curshape + order = curorder + elif curorder == 'N' or order != curorder: + contigmatch = False + if shape_arg_index is None and shape != curshape: + raise Exception("All input arrays to elementwise kernels must have the same shape, or you must specify the argument that has the canonical shape with shape_arg_index; found shapes %s and %s" % (shape, curshape)) + if shape_arg_index == i: + shape = curshape - return (contigmatch, arrayarginds, arrayspecificinds, shape) + return (contigmatch, self._arrayarginds, arrayspecificinds, shape) @profile def create_key(self, grid, block, *args): #print "Calling _precalc_array_info(args=%s, self._init_args[0]=%s, self._shape_arg_index=%s)\n" % (args, self._init_args[0], self._shape_arg_index) - #precalc = self._precalc_array_info(args, self._init_args[0], self._shape_arg_index) - precalc = _drv._precalc_array_info(args, self._init_args[0], self._shape_arg_index) + #precalc = self._precalc_array_info(args, self._shape_arg_index) + precalc = _drv._precalc_array_info(args, self._arrayarginds, self._shape_arg_index) (contigmatch, arrayarginds, arrayspecificinds, shape) = precalc if contigmatch: diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index 1f0af564..d5c8cd24 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -19,38 +19,26 @@ namespace precalc { namespace py = boost::python; - py::tuple _precalc_array_info(const py::object & args, const py::object & arg_descrs, const py::object & shape_arg_index) { + py::tuple _precalc_array_info(const py::object & args, const py::object & arrayarginds, const py::object & shape_arg_index) { bool contigmatch = true; bool arrayspecificinds = true; - std::vector arrayarginds; std::vector shape; py::object shape_obj; char order = 'N'; - int numargs = py::len(args); + int numarrays = py::len(arrayarginds); - for (int i = 0; i < numargs; i++) { - py::object arg_descr = py::object(arg_descrs[i]); - // below is our version of isinstance(arg_descr, VectorArg) - py::object struct_char; - try { - struct_char = arg_descr.attr("struct_char"); - } catch (...) 
{ - continue; - } - if (py::extract(struct_char) != 'P') { - continue; - } + for (int aind = 0; aind < numarrays; aind++) { + int arrayargind = py::extract(arrayarginds[aind]); // is a GPUArray/DeviceAllocation //py::object arg = py::object(args[i]); - py::object arg = args[i]; - arrayarginds.push_back(i); + py::object arg(args[arrayargind]); if (!arrayspecificinds) { continue; } py::object curshape_obj; py::object curstrides_obj; int itemsize; - int ndim; + int ndim = 0; try { curshape_obj = arg.attr("shape"); curstrides_obj = arg.attr("strides"); @@ -64,6 +52,10 @@ namespace precalc { arrayspecificinds = false; continue; } + if (ndim == 0) { + // probably a scalar + continue; + } std::vector curshape(ndim); std::vector curstrides(ndim); for (int i = 0; i < ndim; i++) { @@ -116,19 +108,12 @@ namespace precalc { PyErr_SetString(PyExc_RuntimeError, "All input arrays to elementwise kernels must have the same shape, or you must specify the argument that has the canonical shape with shape_arg_index"); py::throw_error_already_set(); } - if (shape_arg_index == i) { + if (shape_arg_index == arrayargind) { shape = curshape; shape_obj = curshape_obj; } } - py::list arrayarginds_obj; - { - typename std::vector::iterator iter = arrayarginds.begin(); - for (/*null*/; iter != arrayarginds.end(); iter++) { - arrayarginds_obj.append(*iter); - } - } - return py::make_tuple(contigmatch, arrayarginds_obj, arrayspecificinds, shape_obj); + return py::make_tuple(contigmatch, arrayarginds, arrayspecificinds, shape_obj); } } @@ -1774,7 +1759,7 @@ BOOST_PYTHON_MODULE(_driver) { using namespace precalc; - DEF_SIMPLE_FUNCTION_WITH_ARGS(_precalc_array_info, ("args", "arg_descrs", "shape_arg_index")); + DEF_SIMPLE_FUNCTION_WITH_ARGS(_precalc_array_info, ("args", "arrayarginds", "shape_arg_index")); } } From 6effa4d7a7928bbffa7dc952271beb3faa74b55e Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Mon, 25 Jun 2018 17:42:27 -0400 Subject: [PATCH 24/35] Don't delete a variable that is used later. --- pycuda/gpuarray.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index fb5da57f..2c270487 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -1264,7 +1264,6 @@ def _memcpy_discontig(dst, src, async=False, stream=None): doslow = (any(sh > 1 for sh, st in zip(shape, src.strides) if st == 0) or any(sh > 1 for sh, st in zip(shape, dst.strides) if st == 0)) - del shape try: src, dst = _flip_negative_strides((src, dst))[1] From f4bdd52fb00bdadeb7d19fcf5b4861af02c99f44 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Thu, 5 Jul 2018 13:49:37 -0400 Subject: [PATCH 25/35] Allow DeferredSource to take a string in constructor. --- pycuda/deferred.py | 4 +++- pycuda/elementwise.py | 4 ++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/pycuda/deferred.py b/pycuda/deferred.py index 861c087e..0db9e757 100644 --- a/pycuda/deferred.py +++ b/pycuda/deferred.py @@ -21,12 +21,14 @@ class DeferredSource(object): generate source. 
''' - def __init__(self, subsources=None, base_indent=0, indent_step=2): + def __init__(self, str=None, subsources=None, base_indent=0, indent_step=2): self.base_indent = base_indent self.indent_step = indent_step if subsources is None: subsources = [] self.subsources = subsources + if str is not None: + self.add(str) self._regex_space = re.compile(r"^(\s*)(.*?)(\s*)$") self._regex_format = re.compile(r"%\(([^\)]*)\)([a-zA-Z])") diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index eec90b66..181e7335 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -260,6 +260,10 @@ def create_source(self, precalc, grid, block, *args): arraynames = [ x[0] for x in arrayarginfos ] + preamble = DeferredSource(preamble) + operation = DeferredSource(operation) + loop_prep = DeferredSource(loop_prep) + after_loop = DeferredSource(after_loop) defines = DeferredSource() decls = DeferredSource() loop_preop = DeferredSource() From 49aa8e0d74f634574bd4484ad245d9b8071c2505 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Thu, 5 Jul 2018 14:10:02 -0400 Subject: [PATCH 26/35] Many updates including: - move as much computation as possible out of create_key(), either to __init__() if it can be pre-computed without knowing shape/strides of args, or to create_source() otherwise. - add support for kernel to see the N-D indices via the do_indices keyword argument and the INDEX kernel variable - change all internal kernel variable names to all-caps and precede by underscore, aside from those variables likely to be used by custom kernels, such as indexing variables (i, A_i, etc.), NDIM, INDEX, total_threads (used by curandom) - allow user to explicitly specify the arguments (by index into arguments) that should be traversed elementwise as arrays - allow user to specify order (fortran or C) to traverse elementwise arrays - remove some accidentally committed @profile decorators --- pycuda/elementwise.py | 262 ++++++++++++++++++++++------------- src/wrapper/wrap_cudadrv.cpp | 12 +- 2 files changed, 174 insertions(+), 100 deletions(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 181e7335..439aba02 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -51,28 +51,50 @@ class ElementwiseSourceModule(DeferredSourceModule): you can index it as ``z[z_i]`` in addition to the old-style ``z[i]``) * support for non-contiguous (and arbitrarily-strided) arrays, but only if you use the array-specific indices above. + * if do_indices=True, the N-D index of the current data item in all + participating arrays is available in INDEX[dim] where dim goes from + 0 to NDIM - 1. Array-specific flat indices only really work if all the arrays using them are the same shape. This shape is also used to optimize index calculations. By default, the shape is taken from the first argument that is specified as a pointer/array, but you can override this by sending ``shape_arg_index=N`` where ``N`` is the zero-based index of the kernel argument whose shape should be used. + By default, any VectorArg objects in ```arguments``` are assumed to be + arrays that need index calculations. If only some of the VectorArgs + will be traversed element-wise, you can specify their indices (within + ```arguments```) with ```array_arg_inds```, and the resulting kernel will + not waste time computing indices for those. 
''' def __init__(self, arguments, operation, name="kernel", preamble="", loop_prep="", after_loop="", - do_range=False, shape_arg_index=None, + do_range=False, + shape_arg_index=None, + array_arg_inds=None, + do_indices=False, + order='K', debug=False, **compilekwargs): super(ElementwiseSourceModule, self).__init__(**compilekwargs) self._do_range = do_range - self._shape_arg_index = shape_arg_index - self._init_args = (tuple(arguments), operation, + self._do_indices = do_indices + assert(order in 'CFK', "order must be 'F', 'C', or 'K'") + self._order = order + self._arg_decls = ", ".join(arg.declarator() for arg in arguments) + if array_arg_inds is None: + array_arg_inds = [i for i in range(len(arguments)) if isinstance(arguments[i], VectorArg)] + self._array_arg_inds = array_arg_inds + if shape_arg_index is None: + self._shape_arg_index = array_arg_inds[0] + else: + self._shape_arg_index = shape_arg_index + self._arg_names = tuple(argument.name for argument in arguments) + self._array_arg_names = tuple(self._arg_names[i] for i in self._array_arg_inds) + self._init_args = (self._arg_names, self._arg_decls, operation, name, preamble, loop_prep, after_loop) self._init_args_repr = repr(self._init_args) - self._arrayarginds = [i for i in range(len(arguments)) if isinstance(arguments[i], VectorArg)] self._debug = debug - @profile def _precalc_array_info(self, args, shape_arg_index): # 'args' is the list of actual parameters being sent to the kernel @@ -80,7 +102,8 @@ def _precalc_array_info(self, args, shape_arg_index): arrayspecificinds = True shape = None order = None - for i in self._arrayarginds: + array_arg_inds = self._array_arg_inds + for i in array_arg_inds: # is a GPUArray/DeviceAllocation arg = args[i] if not arrayspecificinds: @@ -109,82 +132,64 @@ def _precalc_array_info(self, args, shape_arg_index): if shape_arg_index == i: shape = curshape - return (contigmatch, self._arrayarginds, arrayspecificinds, shape) + arrayitemstrides = tuple((arg.itemsize, arg.strides) for arg in (args[i] for i in array_arg_inds)) + + return (contigmatch, arrayspecificinds, shape, arrayitemstrides) - @profile def create_key(self, grid, block, *args): #print "Calling _precalc_array_info(args=%s, self._init_args[0]=%s, self._shape_arg_index=%s)\n" % (args, self._init_args[0], self._shape_arg_index) - #precalc = self._precalc_array_info(args, self._shape_arg_index) - precalc = _drv._precalc_array_info(args, self._arrayarginds, self._shape_arg_index) - (contigmatch, arrayarginds, arrayspecificinds, shape) = precalc + #precalc_init = self._precalc_array_info(args, self._shape_arg_index) + array_arg_inds = self._array_arg_inds + precalc_init = _drv._precalc_array_info(args, array_arg_inds, self._shape_arg_index) + (contigmatch, arrayspecificinds, shape, arrayitemstrides) = precalc_init - if contigmatch: + if (contigmatch or not arrayspecificinds) and not self._do_indices: key = self._init_args_repr - return key, (contigmatch, arrayarginds, arrayspecificinds, shape, None, None, None, None, None) + return key, (self._init_args_repr, shape, contigmatch, arrayspecificinds, None, None, None, None, None) - # Arrays are not contiguous or different order + # Arrays are not contiguous or different order or need calculated indices if grid[1] != 1 or block[1] != 1 or block[2] != 1: raise Exception("Grid (%s) and block (%s) specifications should have all '1' except in the first element" % (grid, block)) ndim = len(shape) - numthreads = block[0] * grid[0] - arguments = self._init_args[0] + arg_names = self._arg_names - # 
Use index of minimum stride in first array as a hint on how to + # Kernel traverses in Fortran-order by default, but if order == 'C', + # we can just reverse the shape and strides, since the kernel is + # elementwise. If order == 'K', use index of minimum stride in first + # array (or that specified by shape_arg_index) as a hint on how to # order the traversal of dimensions. We could probably do something # smarter, like tranposing/reshaping arrays if possible to maximize # performance, but that is probably best done in a pre-processing step. # Note that this could mess up custom indexing that assumes a - # particular traversal order, but in that case one should probably - # ensure that inputs have the same order, and explicitly send - # shape_arg_index to turn this off. + # particular traversal order, but in that case one should explicitly + # set order to 'C' or 'F'. do_reverse = False - if (self._shape_arg_index is None and - np.argmin(np.abs(args[arrayarginds[0]].strides)) > ndim // 2): - # traverse dimensions in reverse order + if self._order == 'C': do_reverse = True - if do_reverse: - shape = shape[::-1] - shapearr = np.array(shape) - block_step = np.array(shapearr) - tmp = numthreads - for dimnum in range(ndim): - newstep = tmp % block_step[dimnum] - tmp = tmp // block_step[dimnum] - block_step[dimnum] = newstep - arrayarginfos = [] - for i in arrayarginds: - arg = args[i] - arg_descr = arguments[i] - if do_reverse: - elemstrides = np.array(arg.strides[::-1]) // arg.itemsize - else: - elemstrides = np.array(arg.strides) // arg.itemsize - dimelemstrides = elemstrides * shapearr - blockelemstrides = elemstrides * block_step - arrayarginfos.append( - (arg_descr.name, tuple(elemstrides), tuple(dimelemstrides), tuple(blockelemstrides)) - ) + elif self._order == 'K': + if np.argmin(np.abs(args[self._shape_arg_index].strides)) > ndim // 2: + # traverse dimensions in reverse order + do_reverse = True - self._arrayarginfos = arrayarginfos - self._ndim = ndim - self._numthreads = numthreads - self._shape = shape - self._block_step = block_step + # need to include grid and block in key because kernel will change + # depending on number of threads + key = (self._init_args_repr, shape, contigmatch, arrayspecificinds, arrayitemstrides, self._arg_decls, do_reverse, grid, block) - key = (self._init_args_repr, grid, block, shape, tuple(self._arrayarginfos)) - - return key, (contigmatch, arrayarginds, arrayspecificinds, shape, arrayarginfos, ndim, numthreads, shape, block_step) + return key, key def create_source(self, precalc, grid, block, *args): - (contigmatch, arrayarginds, arrayspecificinds, shape, arrayarginfos, ndim, numthreads, shape, block_step) = precalc + (_, shape, contigmatch, arrayspecificinds, arrayitemstrides, _, do_reverse, _, _) = precalc - (arguments, operation, + (arg_names, arg_decls, operation, funcname, preamble, loop_prep, after_loop) = self._init_args - if contigmatch: + array_arg_inds = self._array_arg_inds + array_arg_names = tuple(arg_names[i] for i in array_arg_inds) + + if contigmatch and not self._do_indices: indtype = 'unsigned' if self._do_range: indtype = 'long' @@ -192,15 +197,15 @@ def create_source(self, precalc, grid, block, *args): # All arrays are contiguous and same order (or we don't know and # it's up to the caller to make sure it works) if arrayspecificinds: - for i in arrayarginds: + for arg_name in array_arg_names: preamble = preamble + """ #define %s_i i - """ % (arguments[i].name,) + """ % (arg_name,) if self._do_range: loop_body = """ if (step < 0) { 
- for (i = start + (cta_start + tid)*step; + for (i = start + (_CTA_START + _TID)*step; i > stop; i += total_threads*step) { %(operation)s; @@ -208,7 +213,7 @@ def create_source(self, precalc, grid, block, *args): } else { - for (i = start + (cta_start + tid)*step; + for (i = start + (_CTA_START + _TID)*step; i < stop; i += total_threads*step) { %(operation)s; @@ -219,7 +224,7 @@ def create_source(self, precalc, grid, block, *args): } else: loop_body = """ - for (i = cta_start + tid; i < n; i += total_threads) + for (i = _CTA_START + _TID; i < n; i += total_threads) { %(operation)s; } @@ -232,11 +237,11 @@ def create_source(self, precalc, grid, block, *args): %(preamble)s - __global__ void %(name)s(%(arguments)s) + __global__ void %(name)s(%(arg_decls)s) { - unsigned tid = threadIdx.x; + unsigned _TID = threadIdx.x; unsigned total_threads = gridDim.x*blockDim.x; - unsigned cta_start = blockDim.x*blockIdx.x; + unsigned _CTA_START = blockDim.x*blockIdx.x; %(indtype)s i; @@ -247,7 +252,7 @@ def create_source(self, precalc, grid, block, *args): %(after_loop)s; } """ % { - "arguments": ", ".join(arg.declarator() for arg in arguments), + "arg_decls": arg_decls, "name": funcname, "preamble": preamble, "loop_prep": loop_prep, @@ -256,9 +261,50 @@ def create_source(self, precalc, grid, block, *args): "indtype": indtype, } - # Arrays are not contiguous or different order + # Arrays are not contiguous or different order or we need N-D indices + + # these are the arrays that need calculated indices + arrayindnames = [] - arraynames = [ x[0] for x in arrayarginfos ] + # these are the arrays that match another array and can use the + # same indices + arrayrefnames = [] + + argnamestrides = ((arg_name, itemstride[0], itemstride[1]) for (arg_name, itemstride) in zip(array_arg_names, arrayitemstrides)) + + ndim = len(shape) + numthreads = block[0] * grid[0] + if do_reverse: + shape = shape[::-1] + shapearr = np.array(shape) + block_step = np.array(shapearr) + tmpnumthreads = numthreads + for dimnum in range(ndim): + newstep = tmpnumthreads % block_step[dimnum] + tmpnumthreads = tmpnumthreads // block_step[dimnum] + block_step[dimnum] = newstep + arrayarginfos = [] + for (arg_name, arg_itemsize, arg_strides) in argnamestrides: + strides = [0] * ndim + if do_reverse: + strides[0:len(arg_strides)] = arg_strides[::-1] + else: + strides[0:len(arg_strides)] = arg_strides + elemstrides = np.array(strides) // arg_itemsize + dimelemstrides = tuple(elemstrides * shapearr) + blockelemstrides = tuple(elemstrides * block_step) + elemstrides = tuple(elemstrides) + inforef = None + for i, arrayarginfo in enumerate(arrayarginfos): + if elemstrides == arrayarginfo[1]: + inforef = i + if inforef is None: + arrayindnames.append(arg_name) + else: + arrayrefnames.append((arg_name, arrayarginfos[inforef][0])) + arrayarginfos.append( + (arg_name, elemstrides, dimelemstrides, blockelemstrides, inforef) + ) preamble = DeferredSource(preamble) operation = DeferredSource(operation) @@ -271,6 +317,9 @@ def create_source(self, precalc, grid, block, *args): loop_inds_inc = DeferredSource() loop_body = DeferredSource() + if self._do_indices: + defines += """#define NDIM %d""" % (ndim,) + for definename, vals in ( ('SHAPE', shape), ('BLOCK_STEP', block_step), @@ -279,53 +328,71 @@ def create_source(self, precalc, grid, block, *args): defines += """ #define %s_%d %d """ % (definename, dimnum, vals[dimnum]) - for name, elemstrides, dimelemstrides, blockelemstrides in arrayarginfos: + for name, elemstrides, dimelemstrides, blockelemstrides, 
inforef in arrayarginfos: for definename, vals in ( ('ELEMSTRIDE', elemstrides), ('DIMELEMSTRIDE', dimelemstrides), ('BLOCKELEMSTRIDE', blockelemstrides), ): for dimnum in range(ndim): - basename = "%s_%d" % (name, dimnum) defines += """ #define %s_%s_%d %d """ % (definename, name, dimnum, vals[dimnum]) decls += """ - unsigned GLOBAL_i = cta_start + tid; + unsigned _GLOBAL_i = _CTA_START + _TID; """ - for name in arraynames: + for name in arrayindnames: decls += """ long %s_i = 0; """ % (name,) - for dimnum in range(ndim): + + for (name, refname) in arrayrefnames: + defines += """ + #define %s_i %s_i + """ % (name, refname) + + if self._do_indices: decls += """ + long INDEX[%d]; + """ % (ndim,) + + for dimnum in range(ndim): + defines += """ + #define INDEX_%d (INDEX[%d]) + """ % (dimnum, dimnum) + else: + for dimnum in range(ndim): + decls += """ long INDEX_%d; - """ % (dimnum,) + """ % (dimnum,) loop_inds_calc += """ - unsigned int TMP_GLOBAL_i = GLOBAL_i; + unsigned int _TMP_GLOBAL_i = _GLOBAL_i; """ for dimnum in range(ndim): loop_inds_calc += """ - INDEX_%d = TMP_GLOBAL_i %% SHAPE_%d; - TMP_GLOBAL_i = TMP_GLOBAL_i / SHAPE_%d; + INDEX_%d = _TMP_GLOBAL_i %% SHAPE_%d; + _TMP_GLOBAL_i = _TMP_GLOBAL_i / SHAPE_%d; """ % (dimnum, dimnum, dimnum) - for name in arraynames: + for name in arrayindnames: loop_inds_calc += """ %s_i += INDEX_%d * ELEMSTRIDE_%s_%d; """ % (name, dimnum, name, dimnum) + for name in arrayindnames: + loop_inds_inc += """ + %s_i += %s; + """ % (name, + " + ".join( + "BLOCKELEMSTRIDE_%s_%d" % (name, dimnum) + for dimnum in range(ndim))) for dimnum in range(ndim): loop_inds_inc += """ INDEX_%d += BLOCK_STEP_%d; """ % (dimnum, dimnum) - for name in arraynames: - loop_inds_inc += """ - %s_i += BLOCKELEMSTRIDE_%s_%d; - """ % (name, name, dimnum) if dimnum < ndim - 1: loop_inds_inc += """ if (INDEX_%d >= SHAPE_%d) { @@ -336,7 +403,7 @@ def create_source(self, precalc, grid, block, *args): INDEX_%d ++; """ % (dimnum, dimnum, dimnum + 1) - for name in arraynames: + for name in arrayindnames: loop_inds_inc += """ %s_i += ELEMSTRIDE_%s_%d - DIMELEMSTRIDE_%s_%d; """ % (name, name, dimnum + 1, name, dimnum) @@ -350,7 +417,7 @@ def create_source(self, precalc, grid, block, *args): #include """ loop_inds_calc += """ - if (cta_start == 0 && tid == 0) { + if (_CTA_START == 0 && _TID == 0) { """ loop_inds_calc.indent() loop_inds_calc += r""" @@ -358,7 +425,7 @@ def create_source(self, precalc, grid, block, *args): printf("CALLING FUNC %s\n"); printf("N=%%u\n", (unsigned int)n); """ % (funcname,) - for name, elemstrides, dimelemstrides, blockelemstrides in arrayarginfos: + for name, elemstrides, dimelemstrides, blockelemstrides, inforef in arrayarginfos: loop_inds_calc += r""" printf("(%s) %s: ptr=0x%%lx maxoffset(elems)=%s\n", (unsigned long)%s); """ % (funcname, name, np.sum((np.array(shape) - 1) * np.array(elemstrides)), name) @@ -367,24 +434,23 @@ def create_source(self, precalc, grid, block, *args): } """ indtest = DeferredSource() - for name, elemstrides, dimelemstrides, blockelemstrides in arrayarginfos: + for name, elemstrides, dimelemstrides, blockelemstrides, inforef in arrayarginfos: + if inforef is not None: + continue indtest += r""" if (%s_i > %s || %s_i < 0) """ % (name, np.sum((np.array(shape) - 1) * np.array(elemstrides)), name) indtest += r"""{""" indtest.indent() indtest += r""" - printf("cta_start=%%d tid=%%d GLOBAL_i=%%u %s_i=%%ld %s\n", cta_start, tid, GLOBAL_i, %s_i, %s); + printf("_CTA_START=%%d _TID=%%d _GLOBAL_i=%%u %s_i=%%ld %s\n", _CTA_START, _TID, _GLOBAL_i, 
%s_i, %s); """ % (name, ' '.join("INDEX_%d=%%ld" % (dimnum,) for dimnum in range(ndim)), name, ', '.join("INDEX_%d" % (dimnum,) for dimnum in range(ndim))) indtest += """break;""" indtest.dedent() indtest += """}""" loop_preop = indtest + loop_preop - tmp_after_loop = after_loop - after_loop = DeferredSource() - after_loop += tmp_after_loop after_loop += r""" - if (cta_start == 0 && tid == 0) { + if (_CTA_START == 0 && _TID == 0) { printf("DONE CALLING FUNC %s\n"); printf("-----------------------\n"); } @@ -394,7 +460,7 @@ def create_source(self, precalc, grid, block, *args): loop_body.add(""" if (step < 0) { - for (/*void*/; GLOBAL_i > stop; GLOBAL_i += total_threads*step) + for (/*void*/; _GLOBAL_i > stop; _GLOBAL_i += total_threads*step) { %(loop_preop)s; @@ -405,7 +471,7 @@ def create_source(self, precalc, grid, block, *args): } else { - for (/*void*/; GLOBAL_i < stop; GLOBAL_i += total_threads*step) + for (/*void*/; _GLOBAL_i < stop; _GLOBAL_i += total_threads*step) { %(loop_preop)s; @@ -421,7 +487,7 @@ def create_source(self, precalc, grid, block, *args): }) else: loop_body.add(""" - for (/*void*/; GLOBAL_i < n; GLOBAL_i += total_threads) + for (/*void*/; _GLOBAL_i < n; _GLOBAL_i += total_threads) { %(loop_preop)s; @@ -445,12 +511,12 @@ def create_source(self, precalc, grid, block, *args): %(preamble)s - __global__ void %(name)s(%(arguments)s) + __global__ void %(name)s(%(arg_decls)s) { - unsigned tid = threadIdx.x; - unsigned total_threads = gridDim.x*blockDim.x; - unsigned cta_start = blockDim.x*blockIdx.x; + const unsigned int _TID = threadIdx.x; + const unsigned int total_threads = gridDim.x*blockDim.x; + const unsigned int _CTA_START = blockDim.x*blockIdx.x; %(decls)s @@ -463,7 +529,7 @@ def create_source(self, precalc, grid, block, *args): %(after_loop)s; } """, format_dict={ - "arguments": ", ".join(arg.declarator() for arg in arguments), + "arg_decls": arg_decls, "name": funcname, "preamble": preamble, "loop_prep": loop_prep, diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index d5c8cd24..196359c4 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -26,6 +26,7 @@ namespace precalc { py::object shape_obj; char order = 'N'; int numarrays = py::len(arrayarginds); + PyObject * arrayitemstrides = PyTuple_New(numarrays); for (int aind = 0; aind < numarrays; aind++) { int arrayargind = py::extract(arrayarginds[aind]); @@ -37,12 +38,14 @@ namespace precalc { } py::object curshape_obj; py::object curstrides_obj; + py::object itemsize_obj; int itemsize; int ndim = 0; try { curshape_obj = arg.attr("shape"); curstrides_obj = arg.attr("strides"); - itemsize = py::extract(arg.attr("itemsize")); + itemsize_obj = arg.attr("itemsize"); + itemsize = py::extract(itemsize_obj); ndim = py::extract(arg.attr("ndim")); } catch (...) 
{ // At least one array argument is probably sent as a @@ -56,6 +59,10 @@ namespace precalc { // probably a scalar continue; } + PyObject * sizestrides = PyTuple_New(2); + PyTuple_SetItem(sizestrides, 0, py::incref(itemsize_obj.ptr())); + PyTuple_SetItem(sizestrides, 1, py::incref(curstrides_obj.ptr())); + PyTuple_SetItem(arrayitemstrides, aind, sizestrides); std::vector curshape(ndim); std::vector curstrides(ndim); for (int i = 0; i < ndim; i++) { @@ -113,7 +120,8 @@ namespace precalc { shape_obj = curshape_obj; } } - return py::make_tuple(contigmatch, arrayarginds, arrayspecificinds, shape_obj); + + return py::make_tuple(contigmatch, arrayspecificinds, shape_obj, py::object(py::handle<>(arrayitemstrides))); } } From c437bcf244f4bb55fcc75331f91a1792ad3f2634 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Thu, 5 Jul 2018 14:11:28 -0400 Subject: [PATCH 27/35] Fix string interpolation bug. --- pycuda/deferred.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pycuda/deferred.py b/pycuda/deferred.py index 0db9e757..bd961c26 100644 --- a/pycuda/deferred.py +++ b/pycuda/deferred.py @@ -97,12 +97,13 @@ def generate(self, indent=0, indent_first=True, get_list=False): newline = newline + line[curpos:matchstart] newline = newline + (('%' + formatchar) % (repl,)) curpos = matchend - if newlinelist is None: - newline = newline + line[curpos:] - newlines.append(newline) - else: + if newlinelist: newlines.extend(newlinelist) - newlines.append(line[curpos:]) + else: + newlines.append(newline) + # add remaining unprocessed part of line to end of last + # replacement + newlines[-1] = newlines[-1] + line[curpos:] indentstr = ' ' * (indent + subindent) for i, line in enumerate(newlines): line = indentstr + line[minstrip:] From 637939f5c310d8beccb3f860aa530e92975176da Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Thu, 5 Jul 2018 14:13:24 -0400 Subject: [PATCH 28/35] Small performance improvement. --- pycuda/deferred.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pycuda/deferred.py b/pycuda/deferred.py index bd961c26..46ab0067 100644 --- a/pycuda/deferred.py +++ b/pycuda/deferred.py @@ -486,15 +486,15 @@ def _delayed_get_function(self, funcname, funcargs, grid, block): funccache = DeferredSourceModule._cache.get(context, None) if funccache is None: funccache = self._cache[context] = {} - (key, precalc) = self.create_key(grid, block, *funcargs) - funckey = (funcname, key) - if key is None or funckey not in funccache: + (funckey, precalc) = self.create_key(grid, block, *funcargs) + modfunc = funccache.get(funckey, None) + if modfunc is None: source = self.create_source(precalc, grid, block, *funcargs) if isinstance(source, DeferredSource): source = source.generate() - if key is None: + if funckey is None: funckey = (funcname, source) - modfunc = funccache.get(funckey, None) + modfunc = funccache.get(funckey, None) if modfunc is None: module = self._delayed_compile(source) func = module.get_function(funcname) From 6442bb3c5d92e6d2fd516702f5e01458989acbf3 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Mon, 9 Jul 2018 17:11:11 -0400 Subject: [PATCH 29/35] Fix comment. --- pycuda/gpuarray.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index 2c270487..2a9e4ba9 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -1171,7 +1171,7 @@ def _flip_negative_strides(arrays): # (or ``None`` if there was no flipping). 
# NOTE: Every input array A must have the same value for the following # expression: np.sign(A.strides) - # NOTE: ``slicer`` is its own inverse, so ``A[slicer][slicer] == A`` + # NOTE: ``flipper`` is its own inverse, so ``A[flipper][flipper] == A`` if isinstance(arrays, GPUArray): raise TypeError("_flip_negative_strides expects a list of GPUArrays") if not any(any(s < 0 for s in x.strides) for x in arrays): From 88f197c2ff5eaa82ba407cd7411b31c26f423dcc Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Tue, 31 Jul 2018 11:09:33 -0400 Subject: [PATCH 30/35] Interpolation fixes. --- pycuda/deferred.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pycuda/deferred.py b/pycuda/deferred.py index 46ab0067..ccde3f63 100644 --- a/pycuda/deferred.py +++ b/pycuda/deferred.py @@ -62,6 +62,7 @@ def generate(self, indent=0, indent_first=True, get_list=False): for line in lines: linelen = len(line) space_match = self._regex_space.match(line) + space = space_match.group(1) end_leading_space = space_match.end(1) begin_trailing_space = space_match.start(3) if strip_space: @@ -90,7 +91,6 @@ def generate(self, indent=0, indent_first=True, get_list=False): nummatches == 1 and matchstart == end_leading_space): # only one replacement, and only spaces preceding - space = space_match.group(1) newlinelist = [ space + x for x in repl.generate(get_list=True) ] else: @@ -103,7 +103,10 @@ def generate(self, indent=0, indent_first=True, get_list=False): newlines.append(newline) # add remaining unprocessed part of line to end of last # replacement - newlines[-1] = newlines[-1] + line[curpos:] + if newlinelist is not None and len(newlinelist) > 1: + newlines.append(space + line[curpos:]) + else: + newlines[-1] = newlines[-1] + line[curpos:] indentstr = ' ' * (indent + subindent) for i, line in enumerate(newlines): line = indentstr + line[minstrip:] From 39d950ad7070d67eef0887525fb408d5640b68e4 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Tue, 31 Jul 2018 11:11:22 -0400 Subject: [PATCH 31/35] Make sure casting works for both built-in types and classes with constructors. --- pycuda/elementwise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 439aba02..944e7c3c 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -983,7 +983,7 @@ def get_arange_kernel(dtype): "%(tp)s *z, %(tp)s start, %(tp)s step" % { "tp": dtype_to_ctype(dtype), }, - "z[z_i] = start + %(tp)s(z_i)*step" % { + "z[z_i] = start + ((%(tp)s)(z_i))*step" % { "tp": dtype_to_ctype(dtype), }, "arange") From 07022cd631249ab3088faeb5565ece7241b49799 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Fri, 3 Aug 2018 16:42:04 -0400 Subject: [PATCH 32/35] Clean up DeferredVal implementation (_eval becomes _get, _evalbase becomes _eval) and documentation. Get rid of evalcont parameter in favor of explicit _set_mod() call. --- pycuda/deferred.py | 107 ++++++++++++++++++++++++++++++--------------- 1 file changed, 71 insertions(+), 36 deletions(-) diff --git a/pycuda/deferred.py b/pycuda/deferred.py index ccde3f63..a843322e 100644 --- a/pycuda/deferred.py +++ b/pycuda/deferred.py @@ -151,23 +151,40 @@ def __add__(self, other): class DeferredVal(object): ''' - This is an object that serves as a proxy to an as-yet undetermined - object, which is only known at the time when either ``_set_val()`` - or ``_evalbase()`` is called. 
Any calls to methods listed in the class - attribute ``_deferred_method_dict`` are queued until a call to - ``_eval()``, at which point ``_evalbase()`` is called (unless the value - has been already set by an earlier call to ``_evalbase()`` or - ``set_val()``), and then the queued method calls are executed, in order, - immediately on the new object. - This class must be subclassed, and the class attribute - ``_deferred_method_dict`` must contain a mapping from defer-able method - names to either ``DeferredVal``, None (same as ``DeferredVal``), or a - subclass, which when instantiated, will be assigned (with ``_set_val()``) - the return value of the method. - There are two ways to set the proxied object. One is to set it - explicitly with ``_set_val(val)``. The other is to override the method - ``_evalbase()`` which should return the new object, and will be called - by ``_eval()``. + This is an object that serves as a wrapper to an object that may not + even exist yet (perhaps this should have been called FutureVal). All + methods provided by ``DeferredVal`` start with underscores so as not to + interfere with common attribute names. + + The life of a ``DeferredVal`` can be divided into two phases: before + and after the wrapped object is "completed", i.e. once the wrapped + object is specified (see below for how/when this happens). + + After it is completed, any attempt to access attributes on this + ``DeferredVal`` will be simply redirected to the wrapped object. + + Before the wrapped object is completed, the only attributes allowed are + methods named in the class variable ``_deferred_method_dict`` (which + must be set in a subclass). ``_deferred_method_dict`` specifies the + names of methods that can be deferred to a later time when the current + ``DeferredVal`` is completed. It maps method names to a result class + that will be instantiated and used to represent the return value of the + deferred method call. The result class can be DeferredVal, a subclass + of ``DeferredVal``, or None (same as DeferredVal). Any attempts to + call one of these defer-able methods will return an instance of the + result class, which will be tied to a future call of the method once + the current ``DeferredVal`` object is completed. + + A ``DeferredVal`` can be completed by specifying the wrapped object in + one of two ways. One way is to set it explicitly with + ``_set_val(val)``. The other is to call ``_get()``, which requires + that ``_eval()`` be defined in a subclass to return the actual object. + Both ways will trigger the deferred method calls (described above) that + depend on this ``DeferredVal`` and the completion of their own + ``DeferredVal`` return values. + + Once completed, the wrapped object can be retrieved by calling + ``_get()``. 
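+    A minimal sketch of the intended flow (``DeferredAnswer`` is a
+    hypothetical subclass used only for illustration):
+
+        class DeferredAnswer(DeferredVal):
+            _deferred_method_dict = {'bit_length': None}
+            def _eval(self):
+                return 42
+
+        d = DeferredAnswer()
+        nbits = d.bit_length()   # queued; returns another DeferredVal
+        d._get()                 # completes d and runs the queued call
+        nbits._get()             # -> 6, since (42).bit_length() == 6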
''' __unimpl = object() _deferred_method_dict = None # must be set by subclass @@ -218,45 +235,54 @@ def _set_val(self, val): self._eval_methods() return val - def _evalbase(self, evalcont=None): + def _eval(self): raise NotImplementedError() - def _eval_list(self, vals, evalcont=None): + def _eval_list(self, vals): newvals = [] for val in vals: if isinstance(val, DeferredVal): - val = val._eval(evalcont=evalcont) + val = val._get() newvals.append(val) return newvals - def _eval_dict(self, valsdict, evalcont=None): + def _eval_dict(self, valsdict): newvalsdict = {} for name, val in valsdict.items(): if isinstance(val, DeferredVal): - val = val._eval(evalcont=evalcont) + val = val._get() newvalsdict[name] = val return newvalsdict - def _eval_methods(self, evalcont=None): + def _eval_methods(self): assert(self._val_available) val = self._val for op in self._deferred_method_calls: (methodname, methodargs, methodkwargs, deferredretval) = op - methodargs = self._eval_list(methodargs, evalcont=evalcont) - methodkwargs = self._eval_dict(methodkwargs, evalcont=evalcont) + methodargs = self._eval_list(methodargs) + methodkwargs = self._eval_dict(methodkwargs) retval = getattr(val, methodname)(*methodargs, **methodkwargs) deferredretval._set_val(retval) self._deferred_method_calls = [] - def _eval(self, evalcont=None): + def _get(self): if not self._val_available: - self._val = self._evalbase(evalcont=evalcont) + self._val = self._eval() self._val_available = True - self._eval_methods(evalcont=evalcont) + self._eval_methods() return self._val def _get_deferred_func(self, _name, _retval): def _deferred_func(*args, **kwargs): + """ + When this function is called from an "uncompleted" DeferredVal, + the name of the deferred method, arguments, and DeferredVal/wrapper + object for the eventual return value are stored for future + evaluation when the current wrapped object is completed, and + the DeferredVal return object is returned. + Otherwise, if the DeferredVal is already completed, then return + the result of calling the method directly. + """ if not self._val_available: self._deferred_method_calls.append((_name, args, kwargs, _retval)) return _retval @@ -279,7 +305,11 @@ def __getattr__(self, name): return self._get_deferred_func(name, retval) raise AttributeError("no such attribute (yet): '%s'" % (name,)) -# we allow all math operators to be deferred +class DeferredNumeric(DeferredVal): + """ + This is a DeferredVal that allows the deferral of all math operations. 
+ """ + pass _mathops = ( '__add__', '__sub__', '__mul__', '__floordiv__', '__mod__', '__divmod__', '__pow__', '__lshift__', '__rshift__', '__and__', @@ -291,8 +321,6 @@ def __getattr__(self, name): '__irshift__', '__iand__', '__ixor__', '__ior__', '__pos__', '__abs__', '__invert__', '__complex__', '__int__', '__long__', '__float__', '__oct__', '__hex__', '__index__', '__coerce__') -class DeferredNumeric(DeferredVal): - pass DeferredNumeric._deferred_method_dict = dict((x, DeferredNumeric) for x in _mathops) def _get_deferred_attr_func(_name): @@ -310,15 +338,18 @@ def __init__(self, methodstr, *args, **kwargs): self._methodstr = methodstr self._args = args self._kwargs = kwargs + self._mod = None + + def _set_mod(self, mod): + self._mod = mod def _copy(self, retval=None): if retval is None: retval = self.__class__(self._methodstr, *self._args, **self._kwargs) return super(DeferredModuleCall, self)._copy(retval=retval) - def _evalbase(self, evalcont): - # evalcont is the actual Module object - return getattr(evalcont, self._methodstr)(*self._args, **self._kwargs) + def _eval(self): + return getattr(self._mod, self._methodstr)(*self._args, **self._kwargs) class DeferredTexRef(DeferredModuleCall): _deferred_method_dict = { @@ -366,8 +397,12 @@ def _fix_texrefs(self, kwargs, mod): newtexrefs = [] for texref in texrefs: if isinstance(texref, DeferredVal): - texref = texref._copy() # future calls may use different modules/functions - texref = texref._eval(mod) + if isinstance(texref, DeferredModuleCall): + texref = texref._copy() # future calls may use different modules/functions + texref._set_mod(mod) + texref = texref._get() + elif isinstance(texref, DeferredVal): + texref = texref._get() newtexrefs.append(texref) kwargs['texrefs'] = newtexrefs return kwargs From 0411e5d5233db581b6d5d05236c87332f15c8132 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Mon, 27 Aug 2018 13:26:41 -0400 Subject: [PATCH 33/35] Fix contig test when only one non-contiguous argument. 
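The old test only marked the arguments as non-contiguous when a later
argument's order differed from an earlier one, so a single argument that
is neither C- nor Fortran-contiguous (order 'N') slipped through as
"contiguous".  A simplified sketch of the corrected per-argument logic in
_precalc_array_info:

    # curorder is 'F', 'C', or 'N' (neither) for the current array
    if curorder == 'N':
        contigmatch = False      # applies even to the first/only argument
    if shape is None:
        shape, order = curshape, curorder
    elif order != curorder:
        contigmatch = False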
--- pycuda/elementwise.py | 4 +++- src/wrapper/wrap_cudadrv.cpp | 5 ++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 944e7c3c..b5798322 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -122,10 +122,12 @@ def _precalc_array_info(self, args, shape_arg_index): curorder = 'F' elif arg.flags.c_contiguous: curorder = 'C' + if curorder == 'N': + contigmatch = False if shape is None: shape = curshape order = curorder - elif curorder == 'N' or order != curorder: + elif order != curorder: contigmatch = False if shape_arg_index is None and shape != curshape: raise Exception("All input arrays to elementwise kernels must have the same shape, or you must specify the argument that has the canonical shape with shape_arg_index; found shapes %s and %s" % (shape, curshape)) diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index 196359c4..a2c45c47 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -103,11 +103,14 @@ namespace precalc { if (is_c) { curorder = 'C'; } + if (curorder == 'N') { + contigmatch = false; + } if (shape.size() == 0) { shape = curshape; shape_obj = curshape_obj; order = curorder; - } else if (curorder == 'N' || order != curorder) { + } else if (order != curorder) { contigmatch = false; } } From 886de8a702d3e384f8e98cb38e4bdda7e30ddcbf Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Mon, 27 Aug 2018 17:08:27 -0400 Subject: [PATCH 34/35] For elementwise arrays, always treat singleton dimensions as stride = 0. This seems to allow some forms of broadcasting, though technically callers should be sending all elementwise arrays with the same shape (and valid strides) anyway. --- pycuda/elementwise.py | 2 ++ src/wrapper/wrap_cudadrv.cpp | 23 +++++++++++++++++------ 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index b5798322..ee6b3126 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -116,6 +116,8 @@ def _precalc_array_info(self, args, shape_arg_index): arrayspecificinds = False continue curshape = arg.shape + # make strides == 0 when shape == 1 + arg.strides = tuple((np.array(curshape) > 1) * arg.strides) curorder = 'N' if contigmatch: if arg.flags.f_contiguous: diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index a2c45c47..215212e1 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -59,16 +59,27 @@ namespace precalc { // probably a scalar continue; } - PyObject * sizestrides = PyTuple_New(2); - PyTuple_SetItem(sizestrides, 0, py::incref(itemsize_obj.ptr())); - PyTuple_SetItem(sizestrides, 1, py::incref(curstrides_obj.ptr())); - PyTuple_SetItem(arrayitemstrides, aind, sizestrides); std::vector curshape(ndim); std::vector curstrides(ndim); for (int i = 0; i < ndim; i++) { - curshape[i] = py::extract(curshape_obj[i]); - curstrides[i] = py::extract(curstrides_obj[i]); + int tmp = py::extract(curshape_obj[i]); + curshape[i] = tmp; + if (tmp > 1) { + curstrides[i] = py::extract(curstrides_obj[i]); + } else { + curstrides[i] = 0; + } } + PyObject * newstrides = PyTuple_New(ndim); + for (int i = 0; i < ndim; i++) { + // PyTuple_SetItem steals the reference of the new PyInt + PyTuple_SetItem(newstrides, i, PyInt_FromLong(curstrides[i])); + } + PyObject * sizestrides = PyTuple_New(2); + PyTuple_SetItem(sizestrides, 0, py::incref(itemsize_obj.ptr())); + // PyTuple_SetItem steals the reference of newstrides + 
PyTuple_SetItem(sizestrides, 1, newstrides); + PyTuple_SetItem(arrayitemstrides, aind, sizestrides); if (contigmatch) { char curorder = 'N'; int tmpaccum = itemsize; From c2413efc9eb7ae67ccf7e3d710a1256cce99c396 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Mon, 27 Aug 2018 17:12:25 -0400 Subject: [PATCH 35/35] Rename array_arg_inds -> elwise_arg_inds, and require that all array args specified by elwise_arg_inds be the same shape, eliminating the need for shape_arg_index. --- pycuda/elementwise.py | 79 +++++++++++++++++------------------- src/wrapper/wrap_cudadrv.cpp | 18 ++++---- 2 files changed, 44 insertions(+), 53 deletions(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index ee6b3126..c804bd7c 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -54,23 +54,20 @@ class ElementwiseSourceModule(DeferredSourceModule): * if do_indices=True, the N-D index of the current data item in all participating arrays is available in INDEX[dim] where dim goes from 0 to NDIM - 1. - Array-specific flat indices only really work if all the arrays using them - are the same shape. This shape is also used to optimize index - calculations. By default, the shape is taken from the first argument - that is specified as a pointer/array, but you can override this by - sending ``shape_arg_index=N`` where ``N`` is the zero-based index of the - kernel argument whose shape should be used. - By default, any VectorArg objects in ```arguments``` are assumed to be - arrays that need index calculations. If only some of the VectorArgs - will be traversed element-wise, you can specify their indices (within - ```arguments```) with ```array_arg_inds```, and the resulting kernel will - not waste time computing indices for those. + ``elwise_arg_inds`` specifies the kernel arguments (as indices into + ``arguments``) that will be treated element-wise and need array-specific + index calculations. If ``elwise_arg_inds`` is not specified, it is + constructed by finding all arguments that are VectorArg objects. If only + some of the VectorArgs will be traversed element-wise, specifying + ``elwise_arg_inds`` ensures that the kernel doesn't wastes time + calculating unneeded indices. When the kernel is eventually called, + all elementwise array arguments specified in ``elwise_arg_inds`` must be + the same shape, which is used to optimize index calculations. 
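+    For example, a minimal sketch (VectorArg/ScalarArg are assumed to be the
+    argument descriptors from pycuda.tools; ``dest`` and ``src`` must be the
+    same shape, while ``table`` is only looked up by value):
+
+        mod = ElementwiseSourceModule(
+            [VectorArg(numpy.float32, "dest"),
+             VectorArg(numpy.uint32, "src"),
+             VectorArg(numpy.float32, "table"),
+             ScalarArg(numpy.uintp, "n")],
+            "dest[dest_i] = table[src[src_i]]",
+            name="lookup",
+            elwise_arg_inds=(0, 1))  # no index calculations for ``table``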
''' def __init__(self, arguments, operation, name="kernel", preamble="", loop_prep="", after_loop="", do_range=False, - shape_arg_index=None, - array_arg_inds=None, + elwise_arg_inds=None, do_indices=False, order='K', debug=False, @@ -81,29 +78,25 @@ def __init__(self, arguments, operation, assert(order in 'CFK', "order must be 'F', 'C', or 'K'") self._order = order self._arg_decls = ", ".join(arg.declarator() for arg in arguments) - if array_arg_inds is None: - array_arg_inds = [i for i in range(len(arguments)) if isinstance(arguments[i], VectorArg)] - self._array_arg_inds = array_arg_inds - if shape_arg_index is None: - self._shape_arg_index = array_arg_inds[0] - else: - self._shape_arg_index = shape_arg_index + if elwise_arg_inds is None: + elwise_arg_inds = [i for i in range(len(arguments)) if isinstance(arguments[i], VectorArg)] + self._elwise_arg_inds = elwise_arg_inds self._arg_names = tuple(argument.name for argument in arguments) - self._array_arg_names = tuple(self._arg_names[i] for i in self._array_arg_inds) + self._elwise_arg_names = tuple(self._arg_names[i] for i in self._elwise_arg_inds) self._init_args = (self._arg_names, self._arg_decls, operation, name, preamble, loop_prep, after_loop) self._init_args_repr = repr(self._init_args) self._debug = debug - def _precalc_array_info(self, args, shape_arg_index): + def _precalc_array_info(self, args): # 'args' is the list of actual parameters being sent to the kernel contigmatch = True arrayspecificinds = True shape = None order = None - array_arg_inds = self._array_arg_inds - for i in array_arg_inds: + elwise_arg_inds = self._elwise_arg_inds + for i in elwise_arg_inds: # is a GPUArray/DeviceAllocation arg = args[i] if not arrayspecificinds: @@ -131,21 +124,19 @@ def _precalc_array_info(self, args, shape_arg_index): order = curorder elif order != curorder: contigmatch = False - if shape_arg_index is None and shape != curshape: - raise Exception("All input arrays to elementwise kernels must have the same shape, or you must specify the argument that has the canonical shape with shape_arg_index; found shapes %s and %s" % (shape, curshape)) - if shape_arg_index == i: - shape = curshape + if shape != curshape: + raise Exception("All input arrays to elementwise kernels must have the same shape, or you must specify the arrays to be traversed element-wise explicitly with elwise_arg_inds; found shapes %s and %s" % (shape, curshape)) - arrayitemstrides = tuple((arg.itemsize, arg.strides) for arg in (args[i] for i in array_arg_inds)) + arrayitemstrides = tuple((arg.itemsize, arg.strides) for arg in (args[i] for i in elwise_arg_inds)) return (contigmatch, arrayspecificinds, shape, arrayitemstrides) def create_key(self, grid, block, *args): - #print "Calling _precalc_array_info(args=%s, self._init_args[0]=%s, self._shape_arg_index=%s)\n" % (args, self._init_args[0], self._shape_arg_index) - #precalc_init = self._precalc_array_info(args, self._shape_arg_index) - array_arg_inds = self._array_arg_inds - precalc_init = _drv._precalc_array_info(args, array_arg_inds, self._shape_arg_index) + #print "Calling _precalc_array_info(args=%s, elwise_arg_inds=%s)\n" % (args, self._elwise_arg_inds) + #precalc_init = self._precalc_array_info(args) + elwise_arg_inds = self._elwise_arg_inds + precalc_init = _drv._precalc_array_info(args, elwise_arg_inds) (contigmatch, arrayspecificinds, shape, arrayitemstrides) = precalc_init if (contigmatch or not arrayspecificinds) and not self._do_indices: @@ -163,7 +154,7 @@ def create_key(self, grid, block, *args): # Kernel 
traverses in Fortran-order by default, but if order == 'C', # we can just reverse the shape and strides, since the kernel is # elementwise. If order == 'K', use index of minimum stride in first - # array (or that specified by shape_arg_index) as a hint on how to + # elementwise array as a hint on how to # order the traversal of dimensions. We could probably do something # smarter, like tranposing/reshaping arrays if possible to maximize # performance, but that is probably best done in a pre-processing step. @@ -174,7 +165,7 @@ def create_key(self, grid, block, *args): if self._order == 'C': do_reverse = True elif self._order == 'K': - if np.argmin(np.abs(args[self._shape_arg_index].strides)) > ndim // 2: + if np.argmin(np.abs(args[self._elwise_arg_inds[0]].strides)) > ndim // 2: # traverse dimensions in reverse order do_reverse = True @@ -190,8 +181,8 @@ def create_source(self, precalc, grid, block, *args): (arg_names, arg_decls, operation, funcname, preamble, loop_prep, after_loop) = self._init_args - array_arg_inds = self._array_arg_inds - array_arg_names = tuple(arg_names[i] for i in array_arg_inds) + elwise_arg_inds = self._elwise_arg_inds + elwise_arg_names = self._elwise_arg_names if contigmatch and not self._do_indices: indtype = 'unsigned' @@ -201,7 +192,7 @@ def create_source(self, precalc, grid, block, *args): # All arrays are contiguous and same order (or we don't know and # it's up to the caller to make sure it works) if arrayspecificinds: - for arg_name in array_arg_names: + for arg_name in elwise_arg_names: preamble = preamble + """ #define %s_i i """ % (arg_name,) @@ -274,7 +265,7 @@ def create_source(self, precalc, grid, block, *args): # same indices arrayrefnames = [] - argnamestrides = ((arg_name, itemstride[0], itemstride[1]) for (arg_name, itemstride) in zip(array_arg_names, arrayitemstrides)) + argnamestrides = ((arg_name, itemstride[0], itemstride[1]) for (arg_name, itemstride) in zip(elwise_arg_names, arrayitemstrides)) ndim = len(shape) numthreads = block[0] * grid[0] @@ -702,7 +693,8 @@ def get_take_kernel(dtype, idx_dtype, vec_count=1): "dest%d[dest%d_i] = fp_tex1Dfetch(tex_src%d, src_idx);" % (i, i, i) for i in range(vec_count))) - mod = get_elwise_module(args, body, "take", preamble=preamble) + mod = get_elwise_module(args, body, "take", preamble=preamble, + elwise_arg_inds=(0, 1)) func = mod.get_function("take") tex_src = [mod.get_texref("tex_src%d" % i) for i in range(vec_count)] func.prepare("P"+(vec_count*"P")+np.dtype(np.uintp).char, texrefs=tex_src) @@ -747,7 +739,7 @@ def get_copy_insn(i): + "\n".join(get_copy_insn(i) for i in range(vec_count))) mod = get_elwise_module(args, body, "take_put", - preamble=preamble, shape_arg_index=0) + preamble=preamble, elwise_arg_inds=(0,1)) func = mod.get_function("take_put") tex_src = [mod.get_texref("tex_src%d" % i) for i in range(vec_count)] @@ -782,7 +774,10 @@ def get_put_kernel(dtype, idx_dtype, vec_count=1): for i in range(vec_count))) func = get_elwise_module(args, body, "put", - shape_arg_index=0).get_function("put") + elwise_arg_inds=( + [0] + range(1+vec_count, 1+(2*vec_count)) + ) + ).get_function("put") func.prepare("P"+(2*vec_count*"P")+np.dtype(np.uintp).char) return func diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index 215212e1..fa08f5a1 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -19,20 +19,20 @@ namespace precalc { namespace py = boost::python; - py::tuple _precalc_array_info(const py::object & args, const py::object & arrayarginds, const 
py::object & shape_arg_index) { + py::tuple _precalc_array_info(const py::object & args, const py::object & elwisearginds) { bool contigmatch = true; bool arrayspecificinds = true; std::vector shape; py::object shape_obj; char order = 'N'; - int numarrays = py::len(arrayarginds); + int numarrays = py::len(elwisearginds); PyObject * arrayitemstrides = PyTuple_New(numarrays); for (int aind = 0; aind < numarrays; aind++) { - int arrayargind = py::extract(arrayarginds[aind]); + int elwiseargind = py::extract(elwisearginds[aind]); // is a GPUArray/DeviceAllocation //py::object arg = py::object(args[i]); - py::object arg(args[arrayargind]); + py::object arg(args[elwiseargind]); if (!arrayspecificinds) { continue; } @@ -125,14 +125,10 @@ namespace precalc { contigmatch = false; } } - if (shape_arg_index.is_none() && shape != curshape) { - PyErr_SetString(PyExc_RuntimeError, "All input arrays to elementwise kernels must have the same shape, or you must specify the argument that has the canonical shape with shape_arg_index"); + if (shape != curshape) { + PyErr_SetString(PyExc_RuntimeError, "All input arrays to elementwise kernels must have the same shape, or you must specify the arrays to be traversed element-wise explicitly with elwise_arg_inds"); py::throw_error_already_set(); } - if (shape_arg_index == arrayargind) { - shape = curshape; - shape_obj = curshape_obj; - } } return py::make_tuple(contigmatch, arrayspecificinds, shape_obj, py::object(py::handle<>(arrayitemstrides))); @@ -1781,7 +1777,7 @@ BOOST_PYTHON_MODULE(_driver) { using namespace precalc; - DEF_SIMPLE_FUNCTION_WITH_ARGS(_precalc_array_info, ("args", "arrayarginds", "shape_arg_index")); + DEF_SIMPLE_FUNCTION_WITH_ARGS(_precalc_array_info, ("args", "elwisearginds")); } }
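A note on the singleton-dimension handling above: zeroing the stride of any
length-1 axis is what gives the limited broadcasting behaviour, because every
index along such an axis then resolves to the same element.  A small NumPy
illustration of the expression used in _precalc_array_info (the array shape
here is hypothetical):

    import numpy as np

    a = np.empty((4, 1, 3), dtype=np.float32)
    strides = tuple((np.array(a.shape) > 1) * a.strides)
    # the length-1 middle axis now has stride 0: stepping along it always
    # lands on the same element, i.e. it broadcasts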