# Source code for dimarray.core.align

""" Functions and dimarray methods associated to array alignment
"""
from __future__ import print_function
from future.utils import string_types
from collections import OrderedDict as odict
import itertools
import warnings
import numpy as np

from dimarray.config import get_option
from dimarray.tools import is_DimArray
from dimarray.core.axes import Axes, Axis
from dimarray.core.indexing import locate_many

# Names exported by a star-import of this module.
__all__ = ["broadcast_arrays", "align", "stack", "concatenate"]

def get_dims(*arrays):
    """ Collect the dimension names found across a variable list of arrays.

    Each argument only needs an `axes` attribute that iterates over objects
    with a `name`. Names are returned as a list, in order of first appearance
    and without duplicates.

    Note: not in public API, but used by other modules
    """
    names = []
    every_axis = itertools.chain.from_iterable(obj.axes for obj in arrays)
    for axis in every_axis:
        if axis.name in names:
            continue
        names.append(axis.name)
    return names

def _get_axes(*arrays):
    """ find list of axes from a list of axis-aligned DimArray objects

    For every dimension present in at least one array, a single reference
    axis is selected: the first axis met, replaced by a later one only when
    the reference is a singleton (size 1) and the later axis has more than
    one element.

    Returns
    -------
    Axes : one axis per dimension, in the order given by `get_dims`

    Raises
    ------
    ValueError("axes are not aligned")
        if a non-singleton axis differs from the reference axis, either in
        length or in values
    """
    dims = get_dims(*arrays) # all dimensions present in objects
    axes = Axes()

    for dim in dims:

        common_axis = None

        for o in arrays:

            # skip missing dimensions
            if dim not in o.dims:
                continue

            axis = o.axes[dim]

            # update values
            if common_axis is None or (common_axis.size == 1 and axis.size > 1):
                common_axis = axis

            # Test alignment for non-singleton axes.
            # `np.array_equal` is used instead of `np.all(a == b)` because the
            # element-wise `==` fails (rather than returning False) when the
            # two axes have different, non-broadcastable lengths; that would
            # surface as a numpy error instead of the message raised below.
            # A singleton reference axis is accepted as before (at this point
            # it can only face an empty axis).
            aligned = (axis.size == 1
                       or common_axis.size == 1
                       or np.array_equal(axis.values, common_axis.values))
            if not aligned:
                raise ValueError("axes are not aligned")

        # append new axis
        axes.append(common_axis)

    return axes

def align_dims(*arrays):
    """ Align dimensions of a list of arrays so that they are ready for broadcast.
    
    Method: inserting singleton axes at the right place and transpose where needed.
    Note : not part of public API, but used in other dimarray modules

    Parameters
    ----------
    arrays : variable list of objects with a `dims` attribute and a
        `reshape(newdims)` method

    Returns
    -------
    list of arrays, in the same order as the input. When all inputs already
    share the same dimensions they are returned unchanged (same objects).

    Examples
    --------
    >>> import dimarray as da
    >>> import numpy as np
    >>> x = da.DimArray(np.arange(2), dims=('x0',))
    >>> y = da.DimArray(np.arange(3), dims=('x1',))
    >>> align_dims(x, y)
    [dimarray: 2 non-null elements (0 null)
    0 / x0 (2): 0 to 1
    1 / x1 (1): None to None
    array([[0],
           [1]]), dimarray: 3 non-null elements (0 null)
    0 / x0 (1): None to None
    1 / x1 (3): 0 to 2
    array([[0, 1, 2]])]
    """
    # If dimensions are already equal, do nothing (but still hand back a list,
    # so that the return type does not depend on the input)
    if len({o.dims for o in arrays}) == 1:
        return list(arrays)

    # Determine the dimensions of the result
    newdims = get_dims(*arrays)

    # Reshape all DimArrays onto the full set of dimensions
    return [o.reshape(newdims) for o in arrays]

def broadcast_arrays(*arrays):
    """ Analogous to numpy.broadcast_arrays
    
    but with looser requirements on input shape
    and returns copy instead of views

    Parameters
    ----------
    arrays : variable list of DimArrays

    Returns
    -------
    list of DimArrays

    Raises
    ------
    ValueError
        if the non-singleton axes of the input arrays are not aligned

    Examples
    --------
    Just as numpy's broadcast_arrays

    >>> import dimarray as da
    >>> x = da.DimArray([[1,2,3]])
    >>> y = da.DimArray([[1],[2],[3]])
    >>> da.broadcast_arrays(x, y)
    [dimarray: 9 non-null elements (0 null)
    0 / x0 (3): 0 to 2
    1 / x1 (3): 0 to 2
    array([[1, 2, 3],
           [1, 2, 3],
           [1, 2, 3]]), dimarray: 9 non-null elements (0 null)
    0 / x0 (3): 0 to 2
    1 / x1 (3): 0 to 2
    array([[1, 1, 1],
           [2, 2, 2],
           [3, 3, 3]])]
    """
    # give all objects the same dimensions (without changing the size)
    arrays = align_dims(*arrays)

    # get axes object made of all non-singleton common axes
    try:
        axes = _get_axes(*arrays)
    except AssertionError as error:
        # `_get_axes` reports misaligned axes with a ValueError, which simply
        # propagates. An assertion failure raised underneath is converted so
        # that callers only ever have to deal with ValueError.
        raise ValueError(error)

    # now broadcast each DimArray along common axes
    return [o.broadcast(axes) for o in arrays]

def _common_axis(axes, join):
    """ Reduce a non-empty list of axes to one common axis.

    The axes are combined pairwise, starting from the end of the list:
    with join == 'outer' via `union`, otherwise via `intersection`.
    An axis whose first value is None is only kept when nothing else is
    available (i.e. None is not included unless we have a singleton).
    """
    assert len(axes) > 0

    # TODO: a lighter set of helper functions working on plain values, without
    # the whole Axis machinery, would make this simpler.

    # Fold from the right: begin with the last axis and merge the preceding
    # ones into it, last-but-one first.
    merged = axes[-1]
    for candidate in reversed(axes[:-1]):
        if candidate[0] is None:
            # nothing to contribute, keep what has been merged so far
            continue
        if len(merged) == 1 and merged[0] is None:
            # merged result is a bare [None] singleton: the candidate wins
            merged = candidate
        elif join == 'outer':
            merged = candidate.union(merged)
        else:
            merged = candidate.intersection(merged)
    return merged

def _get_aligned_axes(arrays, join='outer', axis=None , sort=False, strict=False):
    """ Build the list of common axes for a list of arrays (or of any objects
    with `axes` and `dims` attributes).

    Parameters
    ----------
    arrays : list of array-like objects
    join : passed to `_common_axis` ('outer' for union, otherwise intersection)
    axis : str or None
        single dimension to consider; None means every dimension found
    sort : bool
        if True, `sort()` is called on each common axis
    strict : bool
        if True, every array must have every considered dimension

    Returns
    -------
    Axes : one common axis per considered dimension

    Raises
    ------
    ValueError
        if `axis` is neither None nor a string, or (strict=True) if some
        array lacks one of the dimensions
    """
    # which dimensions are to be aligned?
    if axis is None:
        dims = get_dims(*arrays)
    elif isinstance(axis, string_types):
        dims = [axis]
    else:
        raise ValueError("align: axis must be provided as a string")

    common_axes = Axes()

    for d in dims:

        # arrays which have that dimension
        holders = [arr for arr in arrays if d in arr.dims]

        if strict and len(holders) != len(arrays):
            raise ValueError("align (strict=True): some arrays lack dimension {}".format(d))

        # common axis to reindex on
        common = _common_axis([arr.axes[d] for arr in holders], join)

        if sort:
            # NOTE(review): `_common_axis` may hand back one of the input
            # axes itself; if `sort()` works in place this would modify that
            # array's axis -- confirm against the Axis implementation.
            common.sort()

        common_axes.append(common)

    return common_axes

def align(arrays, join='outer', axis=None , sort=False, strict=False):
    """Align axes of a list of DimArray arrays by reindexing

    Parameters
    ----------
    arrays : list or tuple of DimArrays or Datasets
        scalars are accepted too and converted to DimArray
    join : {"outer", "inner"}, optional
        method to find the common axis
        "outer" : union of all axes, missing values filled with NaNs
        "inner" : intersection of all axes
        Default to "outer"
    sort : bool, optional
        Sort the axis prior to aligning.
        default to False
    axis : str, optional
        default to None : align all axes
        (must be a string since the axes do not necessarily match)
    strict : bool, optional
        if True, check that all arrays have the same dimensions

    Returns
    -------
    list of aligned DimArrays (or Datasets), in the same order as the input

    Raises
    ------
    ValueError
        if `arrays` is not a list or a tuple
    TypeError
        if an element is neither a DimArray, a Dataset nor a scalar

    See Also
    --------
    `DimArray.reindex_axis`, `DimArray.reindex_like`

    Examples
    --------
    >>> from dimarray import DimArray, align
    >>> a = DimArray([0,1,2],axes=[[0,1,2]])
    >>> b = DimArray([1,2,3],axes=[[1,2,3]])
    >>> align([a, b])
    [dimarray: 3 non-null elements (1 null)
    0 / x0 (4): 0 to 3
    array([ 0.,  1.,  2., nan]), dimarray: 3 non-null elements (1 null)
    0 / x0 (4): 0 to 3
    array([nan,  1.,  2.,  3.])]
    >>> align([a, b], join='inner')
    [dimarray: 2 non-null elements (0 null)
    0 / x0 (2): 1 to 2
    array([1, 2]), dimarray: 2 non-null elements (0 null)
    0 / x0 (2): 1 to 2
    array([1, 2])]

    Also work on multi-dimensional arrays
     
    >>> a = DimArray([0,1], axes=[[0,1]]) # on 'x0' only
    >>> b = DimArray([[0,1],[2,3.],[4.,5.]], axes=[[0,1,2],[1,2]]) # one more element along the 1st dimension, 2nd dimension ignored
    >>> align([a, b])
    [dimarray: 2 non-null elements (1 null)
    0 / x0 (3): 0 to 2
    array([ 0.,  1., nan]), dimarray: 6 non-null elements (0 null)
    0 / x0 (3): 0 to 2
    1 / x1 (2): 1 to 2
    array([[0., 1.],
           [2., 3.],
           [4., 5.]])]
    """
    if not isinstance(arrays, (list, tuple)):
        raise ValueError("align: only accepts list or tuple arguments. Got: {}".format(type(arrays)))

    # imported here rather than at module level
    from dimarray import DimArray, Dataset

    # work on a private list; scalars are wrapped into DimArray
    arrays = list(arrays)
    for pos, item in enumerate(arrays):
        if isinstance(item, (DimArray, Dataset)):
            continue
        if not np.isscalar(item):
            raise TypeError("can only align DimArray and Dataset instances, got: {}".format(type(item)))
        arrays[pos] = DimArray(item)

    # find the common axes
    common_axes = _get_aligned_axes(arrays, axis=axis, join=join, sort=sort, strict=strict)

    # reindex every array that has the dimension but not yet the common axis
    for common in common_axes:
        name = common.name
        for pos, current in enumerate(arrays):
            if name in current.dims and not np.all(current.axes[name] == common):
                arrays[pos] = current.reindex_axis(common)

    return arrays

align_ = align  # for internal use, so that it does not conflict with "align" parameter

def _check_stack_args(arrays, keys=None):
    """ generic function to deal with arguments for stacking

    Accepts arrays as a sequence (list or tuple) or as a dict, and returns
    the arrays together with the keys to use along the stacking axis.

    Parameters
    ----------
    arrays : dict, list or tuple
    keys : sequence, optional
        if omitted, the dictionary keys are used for a dict, and
        0, 1, 2, ... for a sequence

    Returns
    -------
    arrays : list or tuple (a dict is converted to the list of its values)
    keys : the keys as given, a list of dict keys, or a numpy integer array

    Raises
    ------
    TypeError
        if `arrays` is not a dict, list or tuple
    """
    # convert dictionary to sequence + keys
    # (materialised as lists: on Python 3, `keys()` and `values()` are views
    # which would not pass the sequence check below)
    if isinstance(arrays, dict):
        if keys is None:
            keys = list(arrays.keys())
        arrays = list(arrays.values())

    # make sure the result is a sequence
    if not isinstance(arrays, (list, tuple)):
        raise TypeError("argument must be a dictionary, list or tuple")

    # make sure keys exist
    if keys is None:
        keys = np.arange(len(arrays))

    return arrays, keys

def _check_stack_axis(axis, dims, default='unnamed'):
    """ check or get new axis name when stacking array or datasets
    (just to have that in one place)

    Parameters
    ----------
    axis : str or None
        requested name for the new axis; if None a name is generated from
        `default` ("unnamed", "unnamed_1", "unnamed_2", ...) so that it does
        not clash with `dims`
    dims : existing dimension names
    default : str, optional
        base name used when `axis` is None

    Returns
    -------
    axis : name of the new axis

    Raises
    ------
    TypeError
        if `axis` is an integer
    ValueError
        if `axis` already exists in `dims`
    """
    if axis is None:
        # first free name among default, default_1, default_2, ...
        axis = default
        i = 1
        while axis in dims:
            axis = "{}_{}".format(default, i)
            i += 1

    if isinstance(axis, int):
        raise TypeError("axis must be a str (new axis name)")

    if axis in dims:
        # adjacent literals instead of a backslash continuation inside the
        # string, which would embed the source indentation in the message
        raise ValueError("please provide an axis name which does not "
                         "already exist, or use `concatenate`")
    return axis

def stack(arrays, axis=None, keys=None, align=False, **kwargs):
    """ stack arrays along a new dimension (raise error if already existing)

    Parameters
    ----------
    arrays : sequence or dict of arrays
    axis : str, optional
        new dimension along which to stack the array
    keys : array-like, optional
        stack axis values, useful if array is a sequence, or a non-ordered dictionary
    align : bool, optional
        if True, align axes prior to stacking (Default to False)
    **kwargs : optional key-word arguments passed to align, if align is True

    Returns
    -------
    DimArray : joint array

    Raises
    ------
    TypeError
        if one of the arrays is not a DimArray
    ValueError
        if `axis` already exists, or if the axes are not aligned

    See Also
    --------
    concatenate : join arrays along an existing dimension
    swapaxes : to modify the position of the newly inserted axis

    Examples
    --------
    >>> from dimarray import DimArray
    >>> a = DimArray([1,2,3])
    >>> b = DimArray([11,22,33])
    >>> stack([a, b], axis='stackdim', keys=['a','b'])
    dimarray: 6 non-null elements (0 null)
    0 / stackdim (2): 'a' to 'b'
    1 / x0 (3): 0 to 2
    array([[ 1,  2,  3],
           [11, 22, 33]])
    """
    # kept as an assertion so that the exception type seen by existing
    # callers does not change
    assert not isinstance(axis, int), "axis must be a str (you are creating a new axis)"

    # make a sequence of arrays
    arrays, keys = _check_stack_args(arrays, keys)

    for a in arrays:
        if not is_DimArray(a):
            raise TypeError('can only stack DimArray instances')

    # make sure the stacking dimension is OK (new)
    dims = get_dims(*arrays)
    axis = _check_stack_axis(axis, dims)

    # re-index axes if needed (all arrays must then share the same dimensions)
    if align:
        kwargs['strict'] = True
        arrays = align_(arrays, **kwargs)

    # make it a numpy array, the new dimension comes first
    data = np.array([a.values for a in arrays])

    # new axis
    newaxis = Axis(keys, axis)

    # find common axes
    try:
        axes = _get_axes(*arrays)
    except ValueError as msg:
        # point the user to the `align` option when this is the problem
        if 'axes are not aligned' in repr(msg):
            msg = 'axes are not aligned\n ==> Try passing `align=True`'
        raise ValueError(msg)

    # new axes: stacking axis first, then the common axes
    newaxes = [newaxis] + axes

    # create dimarray, using the constructor of the first array
    _constructor = arrays[0]._constructor # DimArray
    return _constructor(data, axes=newaxes)

def _concatenate_axes(axes):
    """ concatenate Axis objects

    axes: list of Axis objects, which must all have the same name

    Returns a new Axis with that name and the concatenated values.

    Raises ValueError if the names are not all identical.

    >>> a = Axis([1,2,3],'x0')
    >>> b = Axis([5,6,7],'x0')
    >>> ax = _concatenate_axes((a, b))
    >>> ax.name
    'x0'
    >>> ax.values
    array([1, 2, 3, 5, 6, 7])
    """
    names = [ax.name for ax in axes]
    if len(set(names)) != 1:
        # report the offending names in the exception itself rather than
        # printing to stdout
        raise ValueError("axis names differ! Got: {}".format(names))
    values = np.concatenate([ax.values for ax in axes])
    return Axis(values, names[0])

def concatenate(arrays, axis=0, _no_check=False, align=False, **kwargs):
    """ concatenate several DimArrays

    Parameters
    -----------
    arrays : list of DimArrays
        arrays to concatenate
    axis : int or str 
        axis along which to concatenate (must exist). A negative integer
        counts from the last dimension of the first array.
    align : bool, optional
        align secondary axes before joining on the primary
        axis `axis`. Default to False.
    **kwargs : optional key-word arguments passed to align, if align is True

    Returns
    -------
    concatenated DimArray 

    Raises
    ------
    ValueError
        if `arrays` is not a list or tuple, if one element is not a
        DimArray, or if the secondary axes do not match (unless `align`
        is True or the check is disabled)

    See Also
    --------
    stack: join arrays along a new dimension
    align: align arrays

    Examples
    --------

    1-D

    >>> from dimarray import DimArray
    >>> a = DimArray([1,2,3], axes=[['a','b','c']])
    >>> b = DimArray([4,5,6], axes=[['d','e','f']])
    >>> concatenate((a, b))
    dimarray: 6 non-null elements (0 null)
    0 / x0 (6): 'a' to 'f'
    array([1, 2, 3, 4, 5, 6])

    2-D

    >>> a = DimArray([[1,2,3],[11,22,33]])
    >>> b = DimArray([[4,5,6],[44,55,66]])
    >>> concatenate((a, b), axis=0)
    dimarray: 12 non-null elements (0 null)
    0 / x0 (4): 0 to 1
    1 / x1 (3): 0 to 2
    array([[ 1,  2,  3],
           [11, 22, 33],
           [ 4,  5,  6],
           [44, 55, 66]])
    >>> concatenate((a, b), axis='x1')
    dimarray: 12 non-null elements (0 null)
    0 / x0 (2): 0 to 1
    1 / x1 (6): 0 to 2
    array([[ 1,  2,  3,  4,  5,  6],
           [11, 22, 33, 44, 55, 66]])
    """
    # input argument check
    if not isinstance(arrays, (list, tuple)):
        raise ValueError("arrays must be list or tuple, got {}:{}".format(type(arrays), arrays))
    arrays = list(arrays)

    from dimarray import DimArray, Dataset

    # Only DimArray instances can be concatenated; in particular scalars are
    # rejected, since they have no axis to join on.
    for a in arrays:
        if isinstance(a, Dataset):
            msg = "\n==>Note: you may use `concatenate_ds` for Datasets"
            raise ValueError("concatenate: expected DimArray. Got {}".format(type(a))+msg)
        if not isinstance(a, DimArray):
            raise ValueError("concatenate: expected DimArray. Got {}".format(type(a)))

    # resolve `axis` to a non-negative position and `dim` to its name
    first = arrays[0]
    if isinstance(axis, (int, np.integer)):
        axis = int(axis)
        if axis < 0:
            # the position is compared with enumerate() indices and used to
            # slice the list of secondary axes below, so it must be >= 0
            axis += len(first.dims)
    else:
        axis = first.dims.index(axis)
    dim = first.dims[axis]

    # align secondary axes prior to concatenate
    # TODO: just encourage user to use align outside this function
    # and remove argument passing
    if align:
        kwargs['strict'] = True
        for ax in first.axes:
            if ax.name != dim:
                arrays = align_(arrays, axis=ax.name, **kwargs)

    values = np.concatenate([a.values for a in arrays], axis=axis)

    # secondary axes, taken from the first array
    subaxes = [ax for i, ax in enumerate(arrays[0].axes) if i != axis]

    # concatenate axis values
    newaxis = _concatenate_axes([a.axes[axis] for a in arrays])

    if not align and not _no_check:
        # check that other axes match
        for ax in subaxes:
            for a in arrays:
                if not np.all(a.axes[ax.name].values == ax.values):
                    raise ValueError("contatenate: secondary axes do not match. Align first? (`align=True`)")

    # put the concatenated axis back at its position
    newaxes = subaxes[:axis] + [newaxis] + subaxes[axis:]

    return arrays[0]._constructor(values, newaxes)

##############################################################
# The functions below are meant to be used as DimArray methods
##############################################################

#
# Reindex axis
#
def reindex_axis(self, values, axis=0, fill_value=np.nan, raise_error=False, method=None):
    """ reindex an array along an axis

    Parameters
    ----------
    values : array-like or Axis
        new axis values. If an Axis is provided, its name also determines
        the axis to reindex (`axis` is then ignored).
    axis : int or str, optional
        axis number or name
    fill_value: scalar, optional
        Fill data to use for missing axis value, 
        if `raise_error` is False. Default is NaN.
    raise_error : bool, optional
        if True, raise error when an axis value is not present 
        otherwise just replace with `fill_value`. Default is False
    method : {None, 'left', 'right'}
        method to fill the gaps (default None)
        If 'left' or 'right', it is passed as `side` to the lookup and
        no `fill_value` is written.

    Returns
    -------
    dimarray: DimArray instance

    Raises
    ------
    TypeError
        if `values` is a scalar or a slice
    IndexError
        if `raise_error` is True and some values are not found

    Examples
    --------
    Basic reindexing: fill missing values with NaN

    >>> import dimarray as da
    >>> a = da.DimArray([1,2,3],axes=[('x0', [1,2,3])])
    >>> b = da.DimArray([3,4],axes=[('x0',[1,3])])
    >>> b.reindex_axis([1,2,3])
    dimarray: 2 non-null elements (1 null)
    0 / x0 (3): 1 to 3
    array([ 3., nan,  4.])

    Or replace with anything else, like -9999

    >>> b.reindex_axis([1,2,3], fill_value=-9999)
    dimarray: 3 non-null elements (0 null)
    0 / x0 (3): 1 to 3
    array([    3, -9999,     4])
    """
    if isinstance(values, Axis):
        # an Axis brings both the target labels and the name of the axis
        axis = values.name
        values = values.values
    elif np.isscalar(values) or type(values) is slice:
        raise TypeError("Please provide list, array-like or Axis object to perform re-indexing")
    else:
        values = np.asarray(values)

    # Positions of the requested labels along the current axis
    current = self.axes[axis]
    side = method or 'left'
    positions = locate_many(current.values, values, side=side)
    result = self.take_axis(positions, axis, indexing='position')

    # Labels for which the located position does not hold the requested value
    missing = current.values.take(positions) != values
    if not np.any(missing):
        return result

    if raise_error:
        raise IndexError("Some values where not found in the axis: {}".format(values[missing]))

    # without a gap-filling method, the mismatching slots get `fill_value`
    if method is None:
        result.put(missing, fill_value, axis=axis, inplace=True, indexing="position", cast=True)

    # Make sure the axis values match the requested new axis
    result.axes[axis][missing] = values[missing]

    return result

def reindex_axis_with_pandas(obj, values, axis=0, fill_value=np.nan):
    """ Convert to and from pandas to use a faster (?) indexing method

    Parameters
    ----------
    obj : DimArray
    values : array-like
        new axis values
    axis : int or str, optional
        axis number or name
    fill_value : scalar, optional
        value used for labels absent from the original axis

    Returns
    -------
    new object built with `obj.from_pandas`, with the attributes of `obj`
    and the original axis name restored
    """
    import pandas  # raises ImportError right away if pandas is missing

    pandasobj = obj.to_pandas()

    axis_id, axis_nm = obj._get_axis_info(axis)

    if hasattr(pandasobj, 'reindex_axis'):
        # legacy pandas API
        try:
            newpandas = pandasobj.reindex_axis(values, axis=axis_id, fill_value=fill_value)
        except TypeError:
            # older versions of pandas do not have the fill_value parameter
            newpandas = pandasobj.reindex_axis(values, axis=axis_id)
    else:
        # `reindex_axis` no longer exists in recent pandas versions;
        # `reindex` with an explicit `axis` is the replacement
        newpandas = pandasobj.reindex(values, axis=axis_id, fill_value=fill_value)

    newobj = obj.from_pandas(newpandas) # use class method from_pandas
    newobj.attrs.update(obj.attrs)    # add metadata back
    newobj.axes[axis_id].name = axis_nm  # give back original name

    return newobj


def reindex_like(self, other, **kwargs):
    """ reindex_like : re-index like another dimarray / axes instance

    Applies reindex_axis on each axis to match another DimArray

    Parameters
    ----------
    other : DimArray or Axes instance
        (any object with an `axes` attribute is accepted)
    **kwargs : passed on to `reindex_axis`

    Returns
    -------
    DimArray

    Raises
    ------
    TypeError
        if `other` has no `axes` attribute and is not an Axes instance

    Notes
    -----
    only reindex axes which are present in other

    Examples
    --------
    >>> import dimarray as da
    >>> b = da.DimArray([3,4],('x0',[1,3]))
    >>> c = da.DimArray([[1,2,3], [1,2,3]],[('x1',["a","b"]),('x0',[1, 2, 3])])
    >>> b.reindex_like(c)
    dimarray: 2 non-null elements (1 null)
    0 / x0 (3): 1 to 3
    array([ 3., nan,  4.])
    """
    # axes to conform to
    if hasattr(other, 'axes'):
        target = other.axes
    elif isinstance(other, Axes):
        target = other
    else:
        raise TypeError('expected DimArray or Axes, got {}: {}'.format(type(other), other))

    target_names = [t.name for t in target]

    # go through our own axes in order, reindexing those shared with `other`
    result = self
    for name in (own.name for own in self.axes):
        if name not in target_names:
            continue
        result = result.reindex_axis(target[name].values, axis=name, **kwargs)

    return result


def sort_axis(a, axis=0, key=None, kind='quicksort'):
    """ sort an axis 

    Parameters
    ----------
    a : DimArray (this argument is pre-assigned when using as bound method)
    axis : int or str, optional
        axis by position (int) or name (str) (default: 0)
    key : callable or dict-like, optional
        function that is called on each axis label and 
        whose return value is used for sorting instead of axis label.
        Any other object with __getitem__ attribute may also be used as key,
        such as a dictionary.
        If None (the default), axis label is used for sorting.
    kind : str, optional
        sort algorithm (see numpy.sort for more info); only used when
        `key` is None

    Returns
    --------
    sorted : new DimArray with sorted axis

    Examples
    --------
    Basic

    >>> from dimarray import DimArray
    >>> a = DimArray([10,20,30], labels=[2, 0, 1])
    >>> a
    dimarray: 3 non-null elements (0 null)
    0 / x0 (3): 2 to 1
    array([10, 20, 30])

    >>> a.sort_axis()
    dimarray: 3 non-null elements (0 null)
    0 / x0 (3): 0 to 2
    array([20, 30, 10])

    >>> a.sort_axis(key=lambda x: -x)
    dimarray: 3 non-null elements (0 null)
    0 / x0 (3): 2 to 0
    array([10, 30, 20])

    Multi-dimensional
     
    >>> a = DimArray([[10,20,30],[40,50,60]], labels=[[0, 1], ['a','c','b']])
    >>> a.sort_axis(axis=1)
    dimarray: 6 non-null elements (0 null)
    0 / x0 (2): 0 to 1
    1 / x1 (3): 'a' to 'c'
    array([[10, 30, 20],
           [40, 60, 50]])
    """
    labels = a.axes[axis].values

    if key is None:
        # the default: let numpy order the labels themselves
        order = labels.argsort(kind=kind)
    else:
        # a dict-like key (subscriptable but not callable) is looked up
        # through its __getitem__
        lookup_only = hasattr(key, '__getitem__') and not hasattr(key, '__call__')
        if lookup_only:
            key = key.__getitem__
        order = argsort(labels, key)

    return a.take_axis(order, axis=axis, indexing='position')


def argsort(seq, key=None):
    """ Return the positions that would sort `seq`, as a plain python list.

    If `key` is given, it is applied to each element and the elements are
    ordered by the returned values.

    Modified after http://stackoverflow.com/questions/3382352/equivalent-of-numpy-argsort-in-basic-python

    >>> a = ['a', 'd', 'c']
    >>> argsort(a)
    [0, 2, 1]
    >>> argsort(a, key=lambda x: {'a':2,'c':1,'d':0}[x])
    [1, 2, 0]
    """
    positions = range(len(seq))
    if key is None:
        return sorted(positions, key=lambda pos: seq[pos])
    return sorted(positions, key=lambda pos: key(seq[pos]))