Source code for larray.core.array

# -*- coding: utf8 -*-
from __future__ import absolute_import, division, print_function

__all__ = [
    'LArray', 'zeros', 'zeros_like', 'ones', 'ones_like', 'empty', 'empty_like', 'full', 'full_like', 'sequence',
    'create_sequential', 'ndrange', 'labels_array', 'ndtest', 'aslarray', 'identity', 'diag', 'eye',
    'larray_equal', 'larray_nan_equal', 'all', 'any', 'sum', 'prod', 'cumsum', 'cumprod', 'min', 'max', 'mean', 'ptp',
    'var', 'std', 'median', 'percentile', 'stack', 'nan', 'nan_equal'
]

"""
Matrix class
"""

# ? implement multi group in one axis getitem: lipro['P01,P02;P05'] <=> (lipro['P01,P02'], lipro['P05'])

# * we need an API to get to the "next" label. Sometimes, we want to use label+1, but that is problematic when labels
#   are not numeric, or have not a step of 1. X.agegroup[X.agegroup.after(25):]
#                                             X.agegroup[X.agegroup[25].next():]

# * implement keepaxes=True for _group_aggregate instead of/in addition to group tuples

# ? implement newaxis

# * Axis.sequence? geo.seq('A31', 'A38') (equivalent to geo['A31..A38'])

# * re-implement row_totals/col_totals? or what do we do with them?

# * time specific API so that we know if we go for a subclass or not

# * data alignment in arithmetic methods

# * test structured arrays

# ? move "utils" to its own project (so that it is not duplicated between larray and liam2)
#   OR
#   include utils only in larray project and make larray a dependency of liam2
#   (and potentially rename it to reflect the broader scope)

from collections import Iterable, Sequence
from itertools import product, chain, groupby, islice
import os
import sys
import functools

try:
    import builtins
except ImportError:
    import __builtin__ as builtins

import numpy as np
import pandas as pd

try:
    import xlwings as xw
except ImportError:
    xw = None

try:
    from numpy import nanprod as np_nanprod
except ImportError:
    np_nanprod = None

from larray.core.abstractbases import ABCLArray
from larray.core.expr import ExprNode
from larray.core.group import (Group, IGroup, LGroup, remove_nested_groups, _to_key, _to_keys,
                               _range_to_slice, _translate_sheet_name, _translate_key_hdf)
from larray.core.axis import Axis, AxisReference, AxisCollection, X, _make_axis
from larray.util.misc import (table2str, size2str, basestring, izip, rproduct, ReprString, duplicates,
                              float_error_handler_factory, _isnoneslice, light_product, unique_list, common_type,
                              renamed_to, deprecate_kwarg)


nan = np.nan


def all(values, axis=None):
    """
    Test whether all array elements along a given axis evaluate to True.

    See Also
    --------
    LArray.all
    """
    if isinstance(values, LArray):
        return values.all(axis)
    else:
        return builtins.all(values)


def any(values, axis=None):
    """
    Test whether any array elements along a given axis evaluate to True.

    See Also
    --------
    LArray.any
    """
    if isinstance(values, LArray):
        return values.any(axis)
    else:
        return builtins.any(values)


# commutative modulo float precision errors
def sum(array, *args, **kwargs):
    """
    Sum of array elements.

    See Also
    --------
    LArray.sum
    """
    # XXX: we might want to be more aggressive here (more types to convert), however, generators should still be
    #      computed via the builtin.
    if isinstance(array, (np.ndarray, list)):
        array = LArray(array)
    if isinstance(array, LArray):
        return array.sum(*args, **kwargs)
    else:
        return builtins.sum(array, *args, **kwargs)


def prod(array, *args, **kwargs):
    """
    Product of array elements.

    See Also
    --------
    LArray.prod
    """
    return array.prod(*args, **kwargs)


def cumsum(array, *args, **kwargs):
    """
    Returns the cumulative sum of array elements.

    See Also
    --------
    LArray.cumsum
    """
    return array.cumsum(*args, **kwargs)


def cumprod(array, *args, **kwargs):
    """
    Returns the cumulative product of array elements.

    See Also
    --------
    LArray.cumprod
    """
    return array.cumprod(*args, **kwargs)


def min(array, *args, **kwargs):
    """
    Minimum of array elements.

    See Also
    --------
    LArray.min
    """
    if isinstance(array, LArray):
        return array.min(*args, **kwargs)
    else:
        return builtins.min(array, *args, **kwargs)


def max(array, *args, **kwargs):
    """
    Maximum of array elements.

    See Also
    --------
    LArray.max
    """
    if isinstance(array, LArray):
        return array.max(*args, **kwargs)
    else:
        return builtins.max(array, *args, **kwargs)


def mean(array, *args, **kwargs):
    """
    Computes the arithmetic mean.

    See Also
    --------
    LArray.mean
    """
    return array.mean(*args, **kwargs)


def median(array, *args, **kwargs):
    """
    Computes the median.

    See Also
    --------
    LArray.median
    """
    return array.median(*args, **kwargs)


def percentile(array, *args, **kwargs):
    """
    Computes the qth percentile of the data along the specified axis.

    See Also
    --------
    LArray.percentile
    """
    return array.percentile(*args, **kwargs)


# not commutative
def ptp(array, *args, **kwargs):
    """
    Returns the range of values (maximum - minimum).

    See Also
    --------
    LArray.ptp
    """
    return array.ptp(*args, **kwargs)


def var(array, *args, **kwargs):
    """
    Computes the variance.

    See Also
    --------
    LArray.var
    """
    return array.var(*args, **kwargs)


def std(array, *args, **kwargs):
    """
    Computes the standard deviation.

    See Also
    --------
    LArray.std
    """
    return array.std(*args, **kwargs)


def concat(arrays, axis=0, dtype=None):
    """Concatenate arrays along axis

    Parameters
    ----------
    arrays : tuple of LArray
        Arrays to concatenate.
    axis : axis reference (int, str or Axis), optional
        Axis along which to concatenate. Defaults to the first axis.
    dtype : dtype, optional
        Result data type. Defaults to the "closest" type which can hold all arrays types without loss of information.

    Returns
    -------
    LArray

    Examples
    --------
    >>> arr1 = ndtest((2, 3))
    >>> arr1
    a\\b  b0  b1  b2
     a0   0   1   2
     a1   3   4   5
    >>> arr2 = ndrange('a=a0,a1;b=b3')
    >>> arr2
    a\\b  b3
     a0   0
     a1   1
    >>> arr3 = ndrange('b=b4,b5')
    >>> arr3
    b  b4  b5
        0   1
    >>> concat((arr1, arr2, arr3), 'b')
    a\\b  b0  b1  b2  b3  b4  b5
     a0   0   1   2   0   0   1
     a1   3   4   5   1   0   1
    """
    # Get axis by name, so that we do *NOT* check they are "compatible", because it makes sense to append axes of
    # different length
    name = arrays[0].axes[axis].name
    arrays_labels = [array.axes[axis].labels for array in arrays]

    # switch to object dtype if labels are of incompatible types, so that we do not implicitly convert numeric types to
    # strings (numpy should not do this in the first place but that is another story). This can happen for example when
    # we want to add a "total" tick to a numeric axis (eg age).
    labels_type = common_type(arrays_labels)
    if labels_type is object:
        # astype always copies, while asarray only copies if necessary
        arrays_labels = [np.asarray(labels, dtype=object) for labels in arrays_labels]

    combined_axis = Axis(np.concatenate(arrays_labels), name)

    # combine all axes (using labels from any side if any)
    result_axes = arrays[0].axes.replace(axis, combined_axis).union(*[array.axes - axis for array in arrays[1:]])

    if dtype is None:
        dtype = common_type(arrays)

    result = empty(result_axes, dtype=dtype)
    start = 0
    for labels, array in zip(arrays_labels, arrays):
        stop = start + len(labels)
        result[combined_axis.i[start:stop]] = array
        start = stop
    return result


class LArrayIterator(object):
    def __init__(self, array):
        self.array = array
        self.index = 0

    def __iter__(self):
        return self

    def __next__(self):
        array = self.array
        if self.index == len(self.array):
            raise StopIteration
        # result = array.i[array.axes[0].i[self.index]]
        result = array.i[self.index]
        self.index += 1
        return result
    # Python 2
    next = __next__


class LArrayPositionalIndexer(object):
    def __init__(self, array):
        self.array = array

    def _translate_key(self, key):
        """
        Translates key into tuple of IGroup, i.e.
        tuple of collections of labels.
        """
        if not isinstance(key, tuple):
            key = (key,)
        if len(key) > self.array.ndim:
            raise IndexError("key has too many indices (%d) for array with %d dimensions" % (len(key), self.array.ndim))
        # no need to create a full nd key as that will be done later anyway
        return tuple(axis.i[axis_key]
                     for axis_key, axis in zip(key, self.array.axes))

    def __getitem__(self, key):
        return self.array[self._translate_key(key)]

    def __setitem__(self, key, value):
        self.array[self._translate_key(key)] = value

    def __len__(self):
        return len(self.array)


class LArrayPointsIndexer(object):
    def __init__(self, array):
        self.array = array

    def __getitem__(self, key):
        # TODO: this should generate an "intersection"/points NDGroup and simply do return self.array[nd_group]
        data = np.asarray(self.array)
        translated_key = self.array._translated_key(key, bool_stuff=True)

        axes = self.array._bool_key_new_axes(translated_key)
        data = data[translated_key]
        # drop length 1 dimensions created by scalar keys
        # data = data.reshape(tuple(len(axis) for axis in axes))
        if not axes:
            # scalars do not need to be wrapped in LArray
            return data
        else:
            return LArray(data, axes)

    def __setitem__(self, key, value):
        data = np.asarray(self.array)
        translated_key = self.array._translated_key(key, bool_stuff=True)
        if isinstance(value, LArray):
            axes = self.array._bool_key_new_axes(translated_key, wildcard_allowed=True)
            value = value.broadcast_with(axes)
        data[translated_key] = value


class LArrayPositionalPointsIndexer(object):
    def __init__(self, array):
        self.array = array

    def __getitem__(self, key):
        data = np.asarray(self.array)

        axes = self.array._bool_key_new_axes(key, wildcard_allowed=False)
        data = data[key]
        # drop length 1 dimensions created by scalar keys
        # data = data.reshape(tuple(len(axis) for axis in axes))
        if not axes:
            # scalars do not need to be wrapped in LArray
            return data
        else:
            return LArray(data, axes)

    def __setitem__(self, key, value):
        data = np.asarray(self.array)
        data[key] = value


def get_axis(obj, i):
    """
    Returns an axis according to its position.

    Parameters
    ----------
    obj : LArray or other array
        Input LArray or any array object which has a shape attribute (NumPy or Pandas array).
    i : int
        index of the axis.

    Returns
    -------
    Axis
        Axis corresponding to the given index if input `obj` is a LArray. A new anonymous Axis with the length of
        the ith dimension of the input `obj` otherwise.

    Examples
    --------
    >>> arr = ndtest((2, 2, 2))
    >>> arr
     a  b\c  c0  c1
    a0   b0   0   1
    a0   b1   2   3
    a1   b0   4   5
    a1   b1   6   7
    >>> get_axis(arr, 1)
    Axis(['b0', 'b1'], 'b')
    >>> np_arr = np.zeros((2, 2, 2))
    >>> get_axis(np_arr, 1)
    Axis(2, None)
    """
    return obj.axes[i] if isinstance(obj, LArray) else Axis(obj.shape[i])


_arg_agg = {
    'q': """
        q : int in range of [0,100] (or sequence of floats)
            Percentile to compute, which must be between 0 and 100 inclusive."""
}

_kwarg_agg = {
    'dtype': {'value': None, 'doc': """
        dtype : dtype, optional
            The data type of the returned array. Defaults to None (the dtype of the input array)."""},
    'out': {'value': None, 'doc': """
        out : LArray, optional
            Alternate output array in which to place the result. It must have the same shape as the expected output and
            its type is preserved (e.g., if dtype(out) is float, the result will consist of 0.0’s and 1.0’s).
            Axes and labels can be different, only the shape matters. Defaults to None (create a new array)."""},
    'ddof': {'value': 1, 'doc': """
        ddof : int, optional
            "Delta Degrees of Freedom": the divisor used in the calculation is ``N - ddof``, where ``N`` represents
            the number of elements. Defaults to 1."""},
    'skipna': {'value': None, 'doc': """
        skipna : bool, optional
            Whether or not to skip NaN (null) values. If False, resulting cells will be NaN if any of the aggregated
            cells is NaN. Defaults to True."""},
    'keepaxes': {'value': False, 'doc': """
        keepaxes : bool, optional
            Whether or not reduced axes are left in the result as dimensions with size one. Defaults to False."""
    },
    'interpolation': {'value': 'linear', 'doc': """
        interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}, optional
            Interpolation method to use when the desired quantile lies between two data points ``i < j``:

              * linear: ``i + (j - i) * fraction``, where ``fraction`` is the fractional part of the index surrounded
                by ``i`` and ``j``.
              * lower: ``i``.
              * higher: ``j``.
              * nearest: ``i`` or ``j``, whichever is nearest.
              * midpoint: ``(i + j) / 2``.

            Defaults to 'linear'."""
    }
}


def _doc_agg_method(func, by=False, long_name='', action_verb='perform', extra_args=[], kwargs=[]):
    if not long_name:
        long_name = func.__name__

    _args = ','.join(extra_args) + ', ' if len(extra_args) > 0 else ''
    _kwargs = ', '.join(["{}={!r}".format(k, _kwarg_agg[k]['value']) for k in kwargs]) + ', ' if len(kwargs) > 0 else ''
    signature = '{name}({args}*axes_and_groups, {kwargs}**explicit_axes)'.format(name=func.__name__,
                                                                                 args=_args, kwargs=_kwargs)

    if by:
        specific_template = """The {long_name} is {action_verb}ed along all axes except the given one(s).
            For groups, {long_name} is {action_verb}ed along groups and non associated axes."""
    else:
        specific_template = "Axis(es) or group(s) along which the {long_name} is {action_verb}ed."
    doc_specific = specific_template.format(long_name=long_name, action_verb=action_verb)

    doc_args = "".join(_arg_agg[arg] for arg in extra_args)
    doc_kwargs = "".join(_kwarg_agg[kw]['doc'] for kw in kwargs)
    doc_varargs = """
        \*axes_and_groups : None or int or str or Axis or Group or any combination of those
            {specific}
            The default (no axis or group) is to {action_verb} the {long_name} over all the dimensions of the input
            array.

            An axis can be referred by:

            * its index (integer). Index can be a negative integer, in which case it counts from the last to the
              first axis.
            * its name (str or AxisReference). You can use either a simple string ('axis_name') or the special
              variable x (x.axis_name).
            * a variable (Axis). If the axis has been defined previously and assigned to a variable, you can pass it as
              argument.

            You may not want to {action_verb} the {long_name} over a whole axis but over a selection of specific
            labels. To do so, you have several possibilities:

            * (['a1', 'a3', 'a5'], 'b1, b3, b5') : labels separated by commas in a list or a string
            * ('a1:a5:2') : select labels using a slice (general syntax is 'start:end:step' where is 'step' is
              optional and 1 by default).
            * (a='a1, a2, a3', x.b['b1, b2, b3']) : in case of possible ambiguity, i.e. if labels can belong to more
              than one axis, you must precise the axis.
            * ('a1:a3; a5:a7', b='b0,b2; b1,b3') : create several groups with semicolons.
              Names are simply given by the concatenation of labels (here: 'a1,a2,a3', 'a5,a6,a7', 'b0,b2' and 'b1,b3')
            * ('a1:a3 >> a123', 'b[b0,b2] >> b12') : operator ' >> ' allows to rename groups."""\
        .format(specific=doc_specific, action_verb=action_verb, long_name=long_name)
    parameters = """Parameters
        ----------{args}{varargs}{kwargs}""".format(args=doc_args, varargs=doc_varargs, kwargs=doc_kwargs)

    func.__doc__ = func.__doc__.format(signature=signature, parameters=parameters)


_always_return_float = {np.mean, np.nanmean, np.median, np.nanmedian, np.percentile, np.nanpercentile,
                        np.std, np.nanstd, np.var, np.nanvar}


[docs]def larray_equal(a1, a2): """ Compares two arrays and returns True if they have the same axes and elements (and do not contain nan values, see note below), False otherwise. Parameters ---------- a1, a2 : LArray-like Input arrays. aslarray() is used on non-LArray inputs. Returns ------- bool Returns True if the arrays are equal (and do not contain nan values). Notes ----- An array containing nan values is never equal to another array, even if that other array also contains nan values at the same positions. The reason is that a nan value is different from *anything*, including itself. One might want to use larray_nan_equal to avoid this behavior. See Also -------- larray_nan_equal Examples -------- >>> arr1 = ndtest((2, 3)) >>> arr1 a\\b b0 b1 b2 a0 0 1 2 a1 3 4 5 >>> arr2 = arr1.copy() >>> larray_equal(arr1, arr2) True >>> arr2['b1'] += 1 >>> larray_equal(arr1, arr2) False >>> arr3 = arr1.set_labels(X.a, ['x0', 'x1']) >>> larray_equal(arr1, arr3) False """ try: a1, a2 = aslarray(a1), aslarray(a2) except Exception: return False return (a1.axes == a2.axes and np.array_equal(np.asarray(a1), np.asarray(a2)))
obj_isnan = np.vectorize(lambda x: x != x, otypes=[bool])
[docs]def nan_equal(a1, a2): """ Compares two arrays element-wise and returns array of booleans. True for each cell where corresponding elements are equal or are both NaN, False otherwise. Parameters ---------- a1, a2 : LArray-like Input arrays. aslarray() is used on non-LArray inputs. Returns ------- LArray Returns True if the arrays are equal (even in the presence of NaN). Examples -------- >>> arr1 = ndtest(3, dtype=float) >>> arr1['a1'] = nan >>> arr1 a a0 a1 a2 0.0 nan 2.0 >>> arr2 = arr1.copy() >>> arr1 == arr2 a a0 a1 a2 True False True >>> nan_equal(arr1, arr2) a a0 a1 a2 True True True """ from larray.core.ufuncs import isnan def general_isnan(a): if np.issubclass_(a.dtype.type, np.inexact): return isnan(a) elif a.dtype.type is np.object_: return LArray(obj_isnan(a), a.axes) else: return False a1, a2 = aslarray(a1), aslarray(a2) return (a1 == a2) | (general_isnan(a1) & general_isnan(a2))
[docs]def larray_nan_equal(a1, a2): """ Compares two arrays and returns True if they have the same axes and elements, False otherwise. Parameters ---------- a1, a2 : LArray-like Input arrays. aslarray() is used on non-LArray inputs. Returns ------- bool Returns True if the arrays are equal, even in the presence of nan values (if they are at the same positions). See Also -------- larray_equal Examples -------- >>> arr1 = ndtest((2, 3), dtype=float) >>> arr1['a1', 'b1'] = nan >>> arr1 a\\b b0 b1 b2 a0 0.0 1.0 2.0 a1 3.0 nan 5.0 >>> arr2 = arr1.copy() >>> larray_equal(arr1, arr2) False >>> larray_nan_equal(arr1, arr2) True >>> arr2['b1'] = 0.0 >>> larray_nan_equal(arr1, arr2) False >>> arr3 = arr1.set_labels(X.a, ['x0', 'x1']) >>> larray_nan_equal(arr1, arr3) False >>> larray_nan_equal([0], [0]) True """ try: a1, a2 = aslarray(a1), aslarray(a2) except Exception: return False return a1.axes == a2.axes and all(nan_equal(a1, a2))
[docs]class LArray(ABCLArray): """ A LArray object represents a multidimensional, homogeneous array of fixed-size items with labeled axes. The function :func:`aslarray` can be used to convert a NumPy array or Pandas DataFrame into a LArray. Parameters ---------- data : scalar, tuple, list or NumPy ndarray Input data. axes : collection (tuple, list or AxisCollection) of axes (int, str or Axis), optional Axes. title : str, optional Title of array. Attributes ---------- data : NumPy ndarray Data. axes : AxisCollection Axes. title : str Title. See Also -------- sequence : Create a LArray by sequentially applying modifications to the array along axis. ndrange : Create a LArray with increasing elements. zeros : Create a LArray, each element of which is zero. ones : Create a LArray, each element of which is 1. full : Create a LArray filled with a given value. empty : Create a LArray, but leave its allocated memory unchanged (i.e., it contains “garbage”). Examples -------- >>> age = Axis([10, 11, 12], 'age') >>> sex = Axis('sex=M,F') >>> time = Axis([2007, 2008, 2009], 'time') >>> axes = [age, sex, time] >>> data = np.zeros((len(axes), len(sex), len(time))) >>> LArray(data, axes) age sex\\time 2007 2008 2009 10 M 0.0 0.0 0.0 10 F 0.0 0.0 0.0 11 M 0.0 0.0 0.0 11 F 0.0 0.0 0.0 12 M 0.0 0.0 0.0 12 F 0.0 0.0 0.0 >>> full(axes, 10.0) age sex\\time 2007 2008 2009 10 M 10.0 10.0 10.0 10 F 10.0 10.0 10.0 11 M 10.0 10.0 10.0 11 F 10.0 10.0 10.0 12 M 10.0 10.0 10.0 12 F 10.0 10.0 10.0 >>> arr = empty(axes) >>> arr['F'] = 1.0 >>> arr['M'] = -1.0 >>> arr age sex\\time 2007 2008 2009 10 M -1.0 -1.0 -1.0 10 F 1.0 1.0 1.0 11 M -1.0 -1.0 -1.0 11 F 1.0 1.0 1.0 12 M -1.0 -1.0 -1.0 12 F 1.0 1.0 1.0 >>> bysex = sequence(sex, initial=-1, inc=2) >>> bysex sex M F -1 1 >>> sequence(age, initial=10, inc=bysex) sex\\age 10 11 12 M 10 9 8 F 10 11 12 """
[docs] def __init__(self, data, axes=None, title=''): data = np.asarray(data) ndim = data.ndim if axes is None: axes = AxisCollection(data.shape) else: if not isinstance(axes, AxisCollection): axes = AxisCollection(axes) if axes.ndim != ndim: raise ValueError("number of axes (%d) does not match " "number of dimensions of data (%d)" % (axes.ndim, ndim)) if axes.shape != data.shape: raise ValueError("length of axes %s does not match " "data shape %s" % (axes.shape, data.shape)) self.data = data self.axes = axes self.title = title
# XXX: rename to posnonzero and implement a label version of nonzero
[docs] def nonzero(self): """ Returns the indices of the elements that are non-zero. Specifically, it returns a tuple of arrays (one for each dimension) containing the indices of the non-zero elements in that dimension. Returns ------- tuple of arrays : tuple Indices of elements that are non-zero. Examples -------- >>> arr = ndtest((2, 3)) % 2 >>> arr a\\b b0 b1 b2 a0 0 1 0 a1 1 0 1 >>> arr.nonzero() # doctest: +SKIP [array([0, 1, 1]), array([1, 0, 2])] """ # FIXME: return tuple of IGroup instead (or even NDGroup) so that you # can do a[a.nonzero()] return self.data.nonzero()
[docs] def set_axes(self, axes_to_replace=None, new_axis=None, inplace=False, **kwargs): """ Replace one, several or all axes of the array. Parameters ---------- axes_to_replace : axis ref or dict {axis ref: axis} or list of tuple (axis ref, axis) \ or list of Axis or AxisCollection Axes to replace. If a single axis reference is given, the `new_axis` argument must be provided. If a list of Axis or an AxisCollection is given, all axes will be replaced by the new ones. In that case, the number of new axes must match the number of the old ones. new_axis : Axis, optional New axis if `axes_to_replace` contains a single axis reference. inplace : bool, optional Whether or not to modify the original object or return a new array and leave the original intact. Defaults to False. **kwargs : Axis New axis for each axis to replace given as a keyword argument. Returns ------- LArray Array with axes replaced. See Also -------- rename : rename one of several axes Examples -------- >>> arr = ndtest((2, 3)) >>> arr a\\b b0 b1 b2 a0 0 1 2 a1 3 4 5 >>> row = Axis(['r0', 'r1'], 'row') >>> column = Axis(['c0', 'c1', 'c2'], 'column') Replace one axis (second argument `new_axis` must be provided) >>> arr.set_axes(X.a, row) row\\b b0 b1 b2 r0 0 1 2 r1 3 4 5 Replace several axes (keywords, list of tuple or dictionary) >>> arr.set_axes(a=row, b=column) # doctest: +SKIP >>> # or >>> arr.set_axes([(X.a, row), (X.b, column)]) # doctest: +SKIP >>> # or >>> arr.set_axes({X.a: row, X.b: column}) row\\column c0 c1 c2 r0 0 1 2 r1 3 4 5 Replace all axes (list of axes or AxisCollection) >>> arr.set_axes([row, column]) row\\column c0 c1 c2 r0 0 1 2 r1 3 4 5 >>> arr2 = ndrange([row, column]) >>> arr.set_axes(arr2.axes) row\\column c0 c1 c2 r0 0 1 2 r1 3 4 5 """ new_axes = self.axes.replace(axes_to_replace, new_axis, **kwargs) if inplace: if new_axes.ndim != self.ndim: raise ValueError("number of axes (%d) does not match number of dimensions of data (%d)" % (new_axes.ndim, self.ndim)) if new_axes.shape != self.data.shape: raise ValueError("length of axes %s does not match data shape %s" % (new_axes.shape, self.data.shape)) self.axes = new_axes return self else: return LArray(self.data, new_axes, title=self.title)
with_axes = renamed_to(set_axes, 'with_axes') def __getattr__(self, key): if key in self.axes: return self.axes[key] else: raise AttributeError("'{}' object has no attribute '{}'".format(self.__class__.__name__, key)) # needed to make *un*pickling work (because otherwise, __getattr__ is called before .axes exists, which leads to # an infinite recursion) def __getstate__(self): return self.__dict__ def __setstate__(self, d): self.__dict__ = d def __dir__(self): names = set(axis.name for axis in self.axes if axis.name is not None) return list(set(dir(self.__class__)) | set(self.__dict__.keys()) | names) def _ipython_key_completions_(self): return list(chain(*[list(labels) for labels in self.axes.labels])) @property def i(self): """ Allows selection of a subset using indices of labels. Examples -------- >>> arr = ndtest((2, 3, 4)) >>> arr a b\\c c0 c1 c2 c3 a0 b0 0 1 2 3 a0 b1 4 5 6 7 a0 b2 8 9 10 11 a1 b0 12 13 14 15 a1 b1 16 17 18 19 a1 b2 20 21 22 23 >>> arr.i[:, 0:2, [0,2]] a b\\c c0 c2 a0 b0 0 2 a0 b1 4 6 a1 b0 12 14 a1 b1 16 18 """ return LArrayPositionalIndexer(self) @property def points(self): """ Allows selection of arbitrary items in the array based on their N-dimensional label index. Examples -------- >>> arr = ndtest((2, 3, 4)) >>> arr a b\\c c0 c1 c2 c3 a0 b0 0 1 2 3 a0 b1 4 5 6 7 a0 b2 8 9 10 11 a1 b0 12 13 14 15 a1 b1 16 17 18 19 a1 b2 20 21 22 23 To select the two points with label coordinates [a0, b0, c0] and [a1, b2, c2], you must do: >>> arr.points['a0,a1', 'b0,b2', 'c0,c2'] a_b_c a0_b0_c0 a1_b2_c2 0 22 The number of label(s) on each dimension must be equal: >>> arr.points['a0,a1', 'b0,b2', 'c0,c1,c2'] # doctest: +NORMALIZE_WHITESPACE Traceback (most recent call last): ... IndexError: shape mismatch: indexing arrays could not be broadcast together with shapes (2,) (2,) (3,) """ return LArrayPointsIndexer(self) @property def ipoints(self): """ Allows selection of arbitrary items in the array based on their N-dimensional index. Examples -------- >>> arr = ndtest((2, 3, 4)) >>> arr a b\\c c0 c1 c2 c3 a0 b0 0 1 2 3 a0 b1 4 5 6 7 a0 b2 8 9 10 11 a1 b0 12 13 14 15 a1 b1 16 17 18 19 a1 b2 20 21 22 23 To select the two points with index coordinates [0, 0, 0] and [1, 2, 2], you must do: >>> arr.ipoints[[0,1], [0,2], [0,2]] a_b_c a0_b0_c0 a1_b2_c2 0 22 The number of index(es) on each dimension must be equal: >>> arr.ipoints[[0,1], [0,2], [0,1,2]] # doctest: +NORMALIZE_WHITESPACE Traceback (most recent call last): ... IndexError: shape mismatch: indexing arrays could not be broadcast together with shapes (2,) (2,) (3,) """ return LArrayPositionalPointsIndexer(self)
[docs] def to_frame(self, fold_last_axis_name=False, dropna=None): """ Converts LArray into Pandas DataFrame. Parameters ---------- fold_last_axis_name : bool, optional Defaults to False. dropna : {'any', 'all', None}, optional * any : if any NA values are present, drop that label * all : if all values are NA, drop that label * None by default. Returns ------- Pandas DataFrame Examples -------- >>> arr = ndtest((2, 2, 2)) >>> arr a b\\c c0 c1 a0 b0 0 1 a0 b1 2 3 a1 b0 4 5 a1 b1 6 7 >>> arr.to_frame() # doctest: +NORMALIZE_WHITESPACE c c0 c1 a b a0 b0 0 1 b1 2 3 a1 b0 4 5 b1 6 7 >>> arr.to_frame(fold_last_axis_name=True) # doctest: +NORMALIZE_WHITESPACE c0 c1 a b\\c a0 b0 0 1 b1 2 3 a1 b0 4 5 b1 6 7 """ columns = pd.Index(self.axes[-1].labels) if not fold_last_axis_name: columns.name = self.axes[-1].name if self.ndim > 1: axes_names = self.axes.names[:-1] if fold_last_axis_name: tmp = axes_names[-1] if axes_names[-1] is not None else '' if self.axes[-1].name: axes_names[-1] = "{}\\{}".format(tmp, self.axes[-1].name) index = pd.MultiIndex.from_product(self.axes.labels[:-1], names=axes_names) else: index = pd.Index(['']) if fold_last_axis_name: index.name = self.axes.names[-1] data = np.asarray(self).reshape(len(index), len(columns)) df = pd.DataFrame(data, index, columns) if dropna is not None: dropna = dropna if dropna is not True else 'all' df.dropna(inplace=True, how=dropna) return df
df = property(to_frame)
[docs] def to_series(self, dropna=False): """ Converts LArray into Pandas Series. Parameters ---------- dropna : bool, optional. False by default. Returns ------- Pandas Series Examples -------- >>> arr = ndtest((2, 3), dtype=float) >>> arr.to_series() # doctest: +NORMALIZE_WHITESPACE a b a0 b0 0.0 b1 1.0 b2 2.0 a1 b0 3.0 b1 4.0 b2 5.0 dtype: float64 """ index = pd.MultiIndex.from_product([axis.labels for axis in self.axes], names=self.axes.names) series = pd.Series(np.asarray(self).reshape(self.size), index) if dropna: series.dropna(inplace=True) return series
series = property(to_series)
[docs] def describe(self, *args, **kwargs): """ Descriptive summary statistics, excluding NaN values. By default, it includes the number of non-NaN values, the mean, standard deviation, minimum, maximum and the 25, 50 and 75 percentiles. Parameters ---------- *args : int or str or Axis or Group or any combination of those, optional Axes or groups along which to compute the aggregates. Defaults to aggregate over the whole array. percentiles : array-like, optional. List of integer percentiles to include. Defaults to [25, 50, 75]. Returns ------- LArray See Also -------- LArray.describe_by Examples -------- >>> arr = LArray([0, 6, 2, 5, 4, 3, 1, 3], 'year=2013..2020') >>> arr year 2013 2014 2015 2016 2017 2018 2019 2020 0 6 2 5 4 3 1 3 >>> arr.describe() statistic count mean std min 25% 50% 75% max 8.0 3.0 2.0 0.0 1.75 3.0 4.25 6.0 >>> arr.describe(percentiles=[50, 90]) statistic count mean std min 50% 90% max 8.0 3.0 2.0 0.0 3.0 5.3 6.0 """ # retrieve kw-only arguments percentiles = kwargs.pop('percentiles', None) if kwargs: raise TypeError("describe() got an unexpected keyword argument '{}'".format(list(kwargs.keys())[0])) if percentiles is None: percentiles = [25, 50, 75] plabels = ['{}%'.format(p) for p in percentiles] labels = ['count', 'mean', 'std', 'min'] + plabels + ['max'] percentiles = [0] + list(percentiles) + [100] # TODO: we should use the commented code using *self.percentile(percentiles, *args) but this does not work # when *args is not empty (see https://github.com/liam2/larray/issues/192) # return stack([(~np.isnan(self)).sum(*args), self.mean(*args), self.std(*args), # *self.percentile(percentiles, *args)], Axis(labels, 'stats')) return stack([(~np.isnan(self)).sum(*args), self.mean(*args), self.std(*args)] + [self.percentile(p, *args) for p in percentiles], Axis(labels, 'statistic'))
[docs] def describe_by(self, *args, **kwargs): """ Descriptive summary statistics, excluding NaN values, along axes or for groups. By default, it includes the number of non-NaN values, the mean, standard deviation, minimum, maximum and the 25, 50 and 75 percentiles. Parameters ---------- *args : int or str or Axis or Group or any combination of those, optional Axes or groups to include in the result after aggregating. Defaults to aggregate over the whole array. percentiles : array-like, optional. list of integer percentiles to include. Defaults to [25, 50, 75]. Returns ------- LArray See Also -------- LArray.describe Examples -------- >>> data = [[0, 6, 3, 5, 4, 2, 1, 3], [7, 5, 3, 2, 8, 5, 6, 4]] >>> arr = LArray(data, 'gender=Male,Female;year=2013..2020').astype(float) >>> arr gender\year 2013 2014 2015 2016 2017 2018 2019 2020 Male 0.0 6.0 3.0 5.0 4.0 2.0 1.0 3.0 Female 7.0 5.0 3.0 2.0 8.0 5.0 6.0 4.0 >>> arr.describe_by('gender') gender\statistic count mean std min 25% 50% 75% max Male 8.0 3.0 2.0 0.0 1.75 3.0 4.25 6.0 Female 8.0 5.0 2.0 2.0 3.75 5.0 6.25 8.0 >>> arr.describe_by('gender', (X.year[:2015], X.year[2018:])) gender year\statistic count mean std min 25% 50% 75% max Male :2015 3.0 3.0 3.0 0.0 1.5 3.0 4.5 6.0 Male 2018: 3.0 2.0 1.0 1.0 1.5 2.0 2.5 3.0 Female :2015 3.0 5.0 2.0 3.0 4.0 5.0 6.0 7.0 Female 2018: 3.0 5.0 1.0 4.0 4.5 5.0 5.5 6.0 >>> arr.describe_by('gender', percentiles=[50, 90]) gender\statistic count mean std min 50% 90% max Male 8.0 3.0 2.0 0.0 3.0 5.3 6.0 Female 8.0 5.0 2.0 2.0 5.0 7.3 8.0 """ # retrieve kw-only arguments percentiles = kwargs.pop('percentiles', None) if kwargs: raise TypeError("describe() got an unexpected keyword argument '{}'".format(list(kwargs.keys())[0])) args = self._prepare_aggregate(None, args) args = self._by_args_to_normal_agg_args(args) return self.describe(*args, percentiles=percentiles)
# noinspection PyAttributeOutsideInit # def __array_finalize__(self, obj): # """ # used when arrays are allocated from subclasses of ndarrays # """ # return np.ndarray.__array_finalize__(self.data, obj) # def __array_prepare__(self, arr, context=None): # """ # called before ufuncs (must return an ndarray) # """ # return np.ndarray.__array_prepare__(self.data, arr, context) def __array_wrap__(self, out_arr, context=None): """ Called after numpy ufuncs. This is never called during our wrapped ufuncs, but if somebody uses raw numpy function, this works in some cases. """ data = np.ndarray.__array_wrap__(self.data, out_arr, context) return LArray(data, self.axes) def __bool__(self): return bool(self.data) # Python 2 __nonzero__= __bool__
[docs] def rename(self, renames=None, to=None, inplace=False, **kwargs): """Renames axes of the array. Parameters ---------- renames : axis ref or dict {axis ref: str} or list of tuple (axis ref, str) Renames to apply. If a single axis reference is given, the `to` argument must be used. to : str or Axis New name if `renames` contains a single axis reference. **kwargs : str or Axis New name for each axis given as a keyword argument. Returns ------- LArray Array with axes renamed. See Also -------- set_axes : replace one or several axes Examples -------- >>> nat = Axis('nat=BE,FO') >>> sex = Axis('sex=M,F') >>> arr = ndrange([nat, sex]) >>> arr nat\\sex M F BE 0 1 FO 2 3 >>> arr.rename(X.nat, 'nat2') nat2\\sex M F BE 0 1 FO 2 3 >>> arr.rename(nat='nat2', sex='sex2') nat2\\sex2 M F BE 0 1 FO 2 3 >>> arr.rename([('nat', 'nat2'), ('sex', 'sex2')]) nat2\\sex2 M F BE 0 1 FO 2 3 >>> arr.rename({'nat': 'nat2', 'sex': 'sex2'}) nat2\\sex2 M F BE 0 1 FO 2 3 """ if isinstance(renames, dict): items = list(renames.items()) elif isinstance(renames, list): items = renames[:] elif isinstance(renames, (str, Axis, int)): items = [(renames, to)] else: items = [] items += kwargs.items() renames = {self.axes[k]: v for k, v in items} axes = [a.rename(renames[a]) if a in renames else a for a in self.axes] if inplace: self.axes = AxisCollection(axes) return self else: return LArray(self.data, axes)
[docs] def reindex(self, axes_to_reindex=None, new_axis=None, fill_value=np.nan, inplace=False, **kwargs): """Reorder and/or add new labels in axes. Place NaN or given `fill_value` in locations having no value previously. Parameters ---------- axes_to_reindex : axis ref or dict {axis ref: axis} or list of tuple (axis ref, axis) \ or list of Axis or AxisCollection Axes to reindex. If a single axis reference is given, the `new_axis` argument must be provided. If a list of Axis or an AxisCollection is given, all axes will be reindexed by the new ones. In that case, the number of new axes must match the number of the old ones. new_axis : int, str, list/tuple/array of str or Axis, optional List of new labels or new axis if `axes_to_replace` contains a single axis reference. fill_value : scalar or LArray, optional Value used to fill cells corresponding to label combinations which were not present before reindexing. Defaults to NaN. inplace : bool, optional Whether or not to modify the original object or return a new array and leave the original intact. Defaults to False. **kwargs : Axis New axis for each axis to reindex given as a keyword argument. Returns ------- LArray Array with reindexed axes. Notes ----- When introducing NAs into an array containing integers via reindex, all data will be promoted to float in order to store the NAs. Examples -------- >>> arr = ndtest((2, 2)) >>> arr a\\b b0 b1 a0 0 1 a1 2 3 Reindex one axis >>> arr.reindex(X.b, ['b1', 'b2', 'b0'], fill_value=-1) a\\b b1 b2 b0 a0 1 -1 0 a1 3 -1 2 >>> arr.reindex(X.b, 'b0..b2', fill_value=-1) a\\b b0 b1 b2 a0 0 1 -1 a1 2 3 -1 Reindex several axes >>> a = Axis(['a1', 'a2', 'a0'], 'a') >>> b = Axis(['b2', 'b1', 'b0'], 'b') >>> arr.reindex({'a': a, 'b': b}, fill_value=-1) a\\b b2 b1 b0 a1 -1 3 2 a2 -1 -1 -1 a0 -1 1 0 >>> arr.reindex({X.a: a, X.b: b}) a\\b b2 b1 b0 a1 nan 3.0 2.0 a2 nan nan nan a0 nan 1.0 0.0 Reindex using axes from another array >>> arr2 = ndrange('a=a0,a1;c=c0..c0;b=b0..b2') >>> arr2 a c\\b b0 b1 b2 a0 c0 0 1 2 a1 c0 3 4 5 >>> arr.reindex(arr2.axes) a b\\c c0 a0 b0 0.0 a0 b1 1.0 a0 b2 nan a1 b0 2.0 a1 b1 3.0 a1 b2 nan >>> arr2.reindex(arr.axes) a c\\b b0 b1 a0 c0 0.0 1.0 a1 c0 3.0 4.0 """ # XXX: can't we move this to AxisCollection.replace? if isinstance(new_axis, (int, basestring, list, tuple, np.ndarray)): new_axis = Axis(new_axis, self.axes[axes_to_reindex].name) if isinstance(new_axis, Axis): new_axis = new_axis.rename(self.axes[axes_to_reindex].name) if isinstance(axes_to_reindex, AxisCollection): assert new_axis is None # add extra axes if needed res_axes = AxisCollection([axes_to_reindex.get(axis, axis) for axis in self.axes]) | axes_to_reindex else: res_axes = self.axes.replace(axes_to_reindex, new_axis, **kwargs) res = full(res_axes, fill_value, dtype=common_type((self.data, fill_value))) def get_labels(self_axis): res_axis = res_axes[self_axis] if res_axis.equals(self_axis): return self_axis[:] else: return self_axis[self_axis.intersection(res_axis).labels] self_labels = tuple(get_labels(axis) for axis in self.axes) res_labels = tuple(res_axes[group.axis][group] for group in self_labels) res[res_labels] = self[self_labels] if inplace: self.axes = res.axes self.data = res.data return self else: return res
[docs] def align(self, other, join='outer', fill_value=nan, axes=None): """Align two arrays on their axes with the specified join method. In other words, it ensure all common axes are compatible. Those arrays can then be used in binary operations. Parameters ---------- other : LArray-like join : {'outer', 'inner', 'left', 'right'}, optional Join method. For each axis common to both arrays: - outer: will use a label if it is in either arrays axis (ordered like the first array). This is the default as it results in no information loss. - inner: will use a label if it is in both arrays axis (ordered like the first array) - left: will use the first array axis labels - right: will use the other array axis labels. fill_value : scalar or LArray, optional Value used to fill cells corresponding to label combinations which are not common to both arrays. Defaults to NaN. axes : AxisReference or sequence of them, optional Axes to align. Need to be valid in both arrays. Defaults to None (all common axes). This must be specified when mixing anonymous and non-anonymous axes. Returns ------- (left, right) : (LArray, LArray) Aligned objects Notes ----- Arrays with anonymous axes are currently not supported. Examples -------- >>> arr1 = ndtest((2, 3)) >>> arr1 a\\b b0 b1 b2 a0 0 1 2 a1 3 4 5 >>> arr2 = -ndtest((3, 2)) >>> # reorder array to make the test more interesting >>> arr2 = arr2[['b1', 'b0']] >>> arr2 a\\b b1 b0 a0 -1 0 a1 -3 -2 a2 -5 -4 Align arr1 and arr2 >>> aligned1, aligned2 = arr1.align(arr2) >>> aligned1 a\\b b0 b1 b2 a0 0.0 1.0 2.0 a1 3.0 4.0 5.0 a2 nan nan nan >>> aligned2 a\\b b0 b1 b2 a0 0.0 -1.0 nan a1 -2.0 -3.0 nan a2 -4.0 -5.0 nan After aligning all common axes, one can then do operations between the two arrays >>> aligned1 + aligned2 a\\b b0 b1 b2 a0 0.0 0.0 nan a1 1.0 1.0 nan a2 nan nan nan Other kinds of joins are supported >>> aligned1, aligned2 = arr1.align(arr2, join='inner') >>> aligned1 a\\b b0 b1 a0 0.0 1.0 a1 3.0 4.0 >>> aligned2 a\\b b0 b1 a0 0.0 -1.0 a1 -2.0 -3.0 >>> aligned1, aligned2 = arr1.align(arr2, join='left') >>> aligned1 a\\b b0 b1 b2 a0 0.0 1.0 2.0 a1 3.0 4.0 5.0 >>> aligned2 a\\b b0 b1 b2 a0 0.0 -1.0 nan a1 -2.0 -3.0 nan >>> aligned1, aligned2 = arr1.align(arr2, join='right') >>> aligned1 a\\b b1 b0 a0 1.0 0.0 a1 4.0 3.0 a2 nan nan >>> aligned2 a\\b b1 b0 a0 -1.0 0.0 a1 -3.0 -2.0 a2 -5.0 -4.0 The fill value for missing labels defaults to nan but can be changed to any compatible value. >>> aligned1, aligned2 = arr1.align(arr2, fill_value=0) >>> aligned1 a\\b b0 b1 b2 a0 0 1 2 a1 3 4 5 a2 0 0 0 >>> aligned2 a\\b b0 b1 b2 a0 0 -1 0 a1 -2 -3 0 a2 -4 -5 0 >>> aligned1 + aligned2 a\\b b0 b1 b2 a0 0 0 2 a1 1 1 5 a2 -4 -5 0 It also works when either arrays (or both) have extra axes >>> arr3 = ndtest((3, 2, 2)) >>> arr1 a\\b b0 b1 b2 a0 0 1 2 a1 3 4 5 >>> arr3 a b\\c c0 c1 a0 b0 0 1 a0 b1 2 3 a1 b0 4 5 a1 b1 6 7 a2 b0 8 9 a2 b1 10 11 >>> aligned1, aligned2 = arr1.align(arr3, join='inner') >>> aligned1 a\\b b0 b1 a0 0.0 1.0 a1 3.0 4.0 >>> aligned2 a b\c c0 c1 a0 b0 0.0 1.0 a0 b1 2.0 3.0 a1 b0 4.0 5.0 a1 b1 6.0 7.0 >>> aligned1 + aligned2 a b\\c c0 c1 a0 b0 0.0 1.0 a0 b1 3.0 4.0 a1 b0 7.0 8.0 a1 b1 10.0 11.0 One can also align only some specific axes (but in that case arrays might not be compatible) >>> aligned1, aligned2 = arr1.align(arr2, axes='b') >>> aligned1 a\\b b0 b1 b2 a0 0.0 1.0 2.0 a1 3.0 4.0 5.0 >>> aligned2 a\\b b0 b1 b2 a0 0.0 -1.0 nan a1 -2.0 -3.0 nan a2 -4.0 -5.0 nan """ other = aslarray(other) # reindex does not currently support anonymous axes if any(name is None for name in self.axes.names) or any(name is None for name in other.axes.names): raise ValueError("arrays with anonymous axes are currently not supported by LArray.align") left_axes, right_axes = self.axes.align(other.axes, join=join, axes=axes) return self.reindex(left_axes, fill_value=fill_value), other.reindex(right_axes, fill_value=fill_value)
[docs] @deprecate_kwarg('reverse', 'ascending', {True: False, False: True}) def sort_values(self, key=None, axis=None, ascending=True): """Sorts values of the array. Parameters ---------- key : scalar or tuple or Group Key along which to sort. Must have exactly one dimension less than ndim. Cannot be used in combination with `axis` argument. If both `key` and `axis` are None, sort array with all axes combined. Defaults to None. axis : int or str or Axis Axis along which to sort. Cannot be used in combination with `key` argument. Defaults to None. ascending : bool, optional Sort values in ascending order. Defaults to True. Returns ------- LArray Array with sorted values. Examples -------- sort the whole array (no key or axis given) >>> arr_1D = LArray([10, 2, 4], 'a=a0..a2') >>> arr_1D a a0 a1 a2 10 2 4 >>> arr_1D.sort_values() a a1 a2 a0 2 4 10 >>> arr_2D = LArray([[10, 2, 4], [3, 7, 1]], 'a=a0,a1; b=b0..b2') >>> arr_2D a\\b b0 b1 b2 a0 10 2 4 a1 3 7 1 >>> # if the array has more than one dimension, sort array with all axes combined >>> arr_2D.sort_values() a_b a1_b2 a0_b1 a1_b0 a0_b2 a1_b1 a0_b0 1 2 3 4 7 10 Sort along a given key >>> # sort columns according to the values of the row associated with the label 'a1' >>> arr_2D.sort_values('a1') a\\b b2 b0 b1 a0 4 10 2 a1 1 3 7 >>> arr_2D.sort_values('a1', ascending=False) a\\b b1 b0 b2 a0 2 10 4 a1 7 3 1 >>> arr_3D = LArray([[[10, 2, 4], [3, 7, 1]], [[5, 1, 6], [2, 8, 9]]], ... 'a=a0,a1; b=b0,b1; c=c0..c2') >>> arr_3D a b\\c c0 c1 c2 a0 b0 10 2 4 a0 b1 3 7 1 a1 b0 5 1 6 a1 b1 2 8 9 >>> # sort columns according to the values of the row associated with the labels 'a0' and 'b1' >>> arr_3D.sort_values(('a0', 'b1')) a b\\c c2 c0 c1 a0 b0 4 10 2 a0 b1 1 3 7 a1 b0 6 5 1 a1 b1 9 2 8 Sort along an axis >>> arr_2D a\\b b0 b1 b2 a0 10 2 4 a1 3 7 1 >>> # sort values along axis 'a' >>> # equivalent to sorting the values of each column of the array >>> arr_2D.sort_values(axis='a') a*\\b b0 b1 b2 0 3 2 1 1 10 7 4 >>> # sort values along axis 'b' >>> # equivalent to sorting the values of each row of the array >>> arr_2D.sort_values(axis='b') a\\b* 0 1 2 a0 2 4 10 a1 1 3 7 """ if key is not None and axis is not None: raise ValueError("Arguments key and axis are exclusive and cannot be used in combination") if axis is not None: axis = self.axes[axis] axis_idx = self.axes.index(axis) data = np.sort(self.data, axis_idx) new_axes = self.axes.replace(axis_idx, Axis(len(axis), axis.name)) res = LArray(data, new_axes) elif key is not None: subset = self[key] if subset.ndim > 1: raise NotImplementedError("sort_values key must have one dimension less than array.ndim") assert subset.ndim == 1 axis = subset.axes[0] indicesofsorted = subset.indicesofsorted() # FIXME: .data shouldn't be necessary, but currently, if we do not do it, we get # IGroup(nat EU FO BE # 1 2 0, axis='nat') # which sorts the *data* correctly, but the labels on the nat axis are not sorted (because the __getitem__ in # that case reuse the key axis as-is -- like it should). # Both use cases have value, but I think reordering the ticks should be the default. Now, I am unsure where to # change this. Probably in IGroupMaker.__getitem__, but then how do I get the "not reordering labels" behavior # that I have now? # FWIW, using .data, I get IGroup([1, 2, 0], axis='nat'), which works. sorter = axis.i[indicesofsorted.data] res = self[sorter] else: res = self.combine_axes() indicesofsorted = np.argsort(res.data) res = res.i[indicesofsorted] axis = res.axes[0] return res[axis[::-1]] if not ascending else res
[docs] @deprecate_kwarg('reverse', 'ascending', {True: False, False: True}) def sort_axes(self, axes=None, ascending=True): """Sorts axes of the array. Parameters ---------- axes : axis reference (Axis, str, int) or list of them, optional Axes to sort. Defaults to all axes. ascending : bool, optional Sort axes in ascending order. Defaults to True. Returns ------- LArray Array with sorted axes. Examples -------- >>> a = ndrange("nat=EU,FO,BE; sex=M,F") >>> a nat\\sex M F EU 0 1 FO 2 3 BE 4 5 >>> a.sort_axes('sex') nat\\sex F M EU 1 0 FO 3 2 BE 5 4 >>> a.sort_axes() nat\\sex F M BE 5 4 EU 1 0 FO 3 2 >>> a.sort_axes(('sex', 'nat')) nat\\sex F M BE 5 4 EU 1 0 FO 3 2 >>> a.sort_axes(ascending=False) nat\\sex M F FO 2 3 EU 0 1 BE 4 5 """ if axes is None: axes = self.axes elif not isinstance(axes, (tuple, list, AxisCollection)): axes = [axes] if not isinstance(axes, AxisCollection): axes = self.axes[axes] def sort_key(axis): key = np.argsort(axis.labels) if not ascending: key = key[::-1] return axis.i[key] return self[tuple(sort_key(axis) for axis in axes)]
sort_axis = renamed_to(sort_axes, 'sort_axis') def _translate_axis_key_chunk(self, axis_key, bool_passthrough=True): """ Translates axis(es) key into axis(es) position(s). Parameters ---------- axis_key : any kind of key Key to select axis(es). bool_passthrough : bool, optional True by default. Returns ------- IGroup Positional group with valid axes (from self.axes) """ if isinstance(axis_key, Group) and axis_key.axis is not None: # retarget to real axis, if needed # only retarget IGroup and not LGroup to give the opportunity for axis.translate to try the "ticks" # version of the group ONLY if key.axis is not real_axis (for performance reasons) if isinstance(axis_key, IGroup): axis_key = axis_key.retarget_to(self.axes[axis_key.axis]) axis_key = remove_nested_groups(axis_key) # already positional if isinstance(axis_key, IGroup): if axis_key.axis is None: raise ValueError("positional groups without axis are not supported") return axis_key # labels but known axis if isinstance(axis_key, LGroup) and axis_key.axis is not None: real_axis = self.axes[axis_key.axis] try: axis_pos_key = real_axis.index(axis_key, bool_passthrough) except KeyError: raise ValueError("%r is not a valid label for any axis" % axis_key) return real_axis.i[axis_pos_key] # otherwise we need to guess the axis # TODO: instead of checking all axes, we should have a big mapping # (in AxisCollection or LArray): # label -> (axis, index) # but for Pandas, this wouldn't work, we'd need label -> axis valid_axes = [] # TODO: use axis_key dtype to only check compatible axes for axis in self.axes: try: axis_pos_key = axis.index(axis_key, bool_passthrough) valid_axes.append(axis) except KeyError: continue if not valid_axes: raise ValueError("%s is not a valid label for any axis" % axis_key) elif len(valid_axes) > 1: # TODO: make an AxisCollection.display_name(axis) method out of this # valid_axes = ', '.join(self.axes.display_name(axis) for a in valid_axes) valid_axes = ', '.join(a.name if a.name is not None else '{{{}}}'.format(self.axes.index(a)) for a in valid_axes) raise ValueError('%s is ambiguous (valid in %s)' % (axis_key, valid_axes)) return valid_axes[0].i[axis_pos_key] def _translate_axis_key(self, axis_key, bool_passthrough=True): """Same as chunk. Returns ------- IGroup Positional group with valid axes (from self.axes) """ if isinstance(axis_key, ExprNode): axis_key = axis_key.evaluate(self.axes) if isinstance(axis_key, LArray) and np.issubdtype(axis_key.dtype, np.bool_) and bool_passthrough: if len(axis_key.axes) > 1: raise ValueError("mixing ND boolean filters with other filters in getitem is not currently supported") else: return IGroup(axis_key.nonzero()[0], axis=axis_key.axes[0]) # translate Axis keys to LGroup keys # FIXME: this should be simply: # if isinstance(axis_key, Axis): # axis_key = axis_key[:] # but it does not work for some reason (the retarget does not seem to happen) if isinstance(axis_key, Axis): real_axis = self.axes[axis_key] if isinstance(axis_key, AxisReference) or axis_key.equals(real_axis): axis_key = real_axis[:] else: axis_key = axis_key.labels # TODO: do it for Group without axis too if isinstance(axis_key, (tuple, list, np.ndarray, LArray)): axis = None # TODO: I should actually do some benchmarks to see if this is useful, and estimate which numbers to use for size in (1, 10, 100, 1000): # TODO: do not recheck already checked elements key_chunk = axis_key.i[:size] if isinstance(axis_key, LArray) else axis_key[:size] try: tkey = self._translate_axis_key_chunk(key_chunk, bool_passthrough) axis = tkey.axis break except ValueError: continue # the (start of the) key match a single axis if axis is not None: # make sure we have an Axis object # TODO: we should make sure the tkey returned from _translate_axis_key_chunk always contains a # real Axis (and thus kill this line) axis = self.axes[axis] # wrap key in LGroup axis_key = axis[axis_key] # XXX: reuse tkey chunks and only translate the rest? return self._translate_axis_key_chunk(axis_key, bool_passthrough) else: return self._translate_axis_key_chunk(axis_key, bool_passthrough) def _guess_axis(self, axis_key): if isinstance(axis_key, Group): group_axis = axis_key.axis if group_axis is not None: # we have axis information but not necessarily an Axis object from self.axes real_axis = self.axes[group_axis] if group_axis is not real_axis: axis_key = axis_key.with_axis(real_axis) return axis_key # TODO: instead of checking all axes, we should have a big mapping # (in AxisCollection or LArray): # label -> (axis, index) # or possibly (for ambiguous labels) # label -> {axis: index} # but for Pandas, this wouldn't work, we'd need label -> axis valid_axes = [] for axis in self.axes: try: axis.index(axis_key) valid_axes.append(axis) except KeyError: continue if not valid_axes: raise ValueError("%s is not a valid label for any axis" % axis_key) elif len(valid_axes) > 1: valid_axes = ', '.join(a.name if a.name is not None else '{{{}}}'.format(self.axes.index(a)) for a in valid_axes) raise ValueError('%s is ambiguous (valid in %s)' % (axis_key, valid_axes)) return valid_axes[0][axis_key] # TODO: move this to AxisCollection def _translated_key(self, key, bool_stuff=False): """Completes and translates key Parameters ---------- key : single axis key or tuple of keys or dict {axis_name: axis_key} Each axis key can be either a scalar, a list of scalars or an LGroup. Returns ------- Returns a full N dimensional positional key. """ if isinstance(key, np.ndarray) and np.issubdtype(key.dtype, np.bool_) and not bool_stuff: return key.nonzero() if isinstance(key, LArray) and np.issubdtype(key.dtype, np.bool_) and not bool_stuff: # if only the axes order is wrong, transpose # FIXME: if the key has both missing and extra axes, it could be the correct size (or even shape, see below) if key.size == self.size and key.shape != self.shape: return np.asarray(key.transpose(self.axes)).nonzero() # otherwise we need to transform the key to integer elif key.size != self.size: extra_key_axes = key.axes - self.axes if extra_key_axes: raise ValueError("subset key %s contains more axes than array %s" % (key.axes, self.axes)) # do I want to allow key_axis.name to match against axis.num? does not seem like a good idea. # but this should work # >>> a = ndrange((3, 4)) # >>> x1, x2 = a.axes # >>> a[x2 > 2] # the current solution with hash = (labels, name) works but is slow for large axes and broken if axis # labels are modified in-place, which I am unsure I want to support anyway self.axes.check_compatible(key.axes) local_axes = [self.axes[axis] for axis in key.axes] map_key = dict(zip(local_axes, np.asarray(key).nonzero())) return tuple(map_key.get(axis, slice(None)) for axis in self.axes) else: # correct shape # FIXME: if the key has both missing and extra axes (at the index of the missing axes), the shape # could be the same while the result should not return np.asarray(key).nonzero() # convert scalar keys to 1D keys if not isinstance(key, (tuple, dict)): key = (key,) if isinstance(key, tuple): # drop slice(None) and Ellipsis since they are meaningless because of guess_axis. # XXX: we might want to raise an exception when we find Ellipses or (most) slice(None) because except for # a single slice(None) a[:], I don't think there is any point. key = [axis_key for axis_key in key if not _isnoneslice(axis_key) and axis_key is not Ellipsis] # translate all keys to IGroup key = [self._translate_axis_key(axis_key, bool_passthrough=not bool_stuff) for axis_key in key] assert all(isinstance(axis_key, IGroup) for axis_key in key) # extract axis from Group keys key_items = [(k.axis, k) for k in key] else: # key axes could be strings or axis references and we want real axes key_items = [(self.axes[k], v) for k, v in key.items()] # TODO: use _translate_axis_key (to translate to IGroup here too) # key_items = [axis.translate(axis_key, bool_passthrough=not bool_stuff) # for axis, axis_key in key_items] # even keys given as dict can contain duplicates (if the same axis was # given under different forms, e.g. name and AxisReference). dupe_axes = list(duplicates(axis for axis, axis_key in key_items)) if dupe_axes: dupe_axes = ', '.join(str(axis) for axis in dupe_axes) raise ValueError("key has several values for axis: %s" % dupe_axes) key = dict(key_items) # dict -> tuple (complete and order key) assert all(isinstance(k, Axis) for k in key) key = [key[axis] if axis in key else slice(None) for axis in self.axes] # IGroup -> raw positional return tuple(axis.index(axis_key, bool_passthrough=not bool_stuff) for axis, axis_key in zip(self.axes, key)) # TODO: we only need axes length => move this to AxisCollection # (but this backend/numpy-specific so we'll probably need to create a subclass of it) def _cross_key(self, key): """ Returns a key indexing the cross product. Parameters ---------- key : complete (contains all dimensions) index-based key. Returns ------- key A key for indexing the cross product. """ # handle advanced indexing with more than one indexing array: basic indexing (only integer and slices) and # advanced indexing with only one indexing array are handled fine by numpy if self._needs_advanced_indexing(key): # np.ix_ wants only lists so: # 1) transform scalar-key to lists of 1 element. In that case, ndarray.__getitem__ leaves length 1 # dimensions instead of dropping them like we would like, so we will need to drop them later ourselves # (via reshape) noscalar_key = [[axis_key] if np.isscalar(axis_key) else axis_key for axis_key in key] # 2) expand slices to lists (ranges) # XXX: cache the range in the axis? # TODO: fork np.ix_ to allow for slices directly # it will be tricky to get right though because in that case the result of a[key] can have its dimensions in # the wrong order (if the ix_arrays are not next to each other, the corresponding dimensions are moved to # the front). It is probably worth the trouble though because it is much faster than the current solution # (~5x in my simple test) but this case (num_ix_arrays > 1) is rare in the first place (at least in demo) # so it is not a priority. listkey = tuple(np.arange(*axis_key.indices(len(axis))) if isinstance(axis_key, slice) else axis_key for axis_key, axis in zip(noscalar_key, self.axes)) # np.ix_ computes the cross product of all lists return np.ix_(*listkey) else: return tuple(key) def _needs_advanced_indexing(self, key): sequence = (tuple, list, np.ndarray) # count number of indexing arrays (ie non scalar/slices) in tuple num_ix_arrays = sum(isinstance(axis_key, sequence) for axis_key in key) num_scalars = sum(np.isscalar(axis_key) for axis_key in key) num_slices = sum(isinstance(axis_key, slice) for axis_key in key) assert len(key) == num_ix_arrays + num_scalars + num_slices return num_ix_arrays > 1 or (num_ix_arrays > 0 and num_scalars) def _collapse_slices(self, key): # isinstance(ndarray, collections.Sequence) is False but it # behaves like one sequence = (tuple, list, np.ndarray) return [_range_to_slice(axis_key, len(axis)) if isinstance(axis_key, sequence) else axis_key for axis_key, axis in zip(key, self.axes)] def _get_axes_from_translated_key(self, translated_key, include_scalar_axis_key=False): if include_scalar_axis_key: return [axis.subaxis(axis_key) if not np.isscalar(axis_key) else Axis(1, axis.name) for axis, axis_key in zip(self.axes, translated_key)] else: return [axis.subaxis(axis_key) for axis, axis_key in zip(self.axes, translated_key) if not np.isscalar(axis_key)] def __getitem__(self, key, collapse_slices=False): if isinstance(key, ExprNode): key = key.evaluate(self.axes) data = np.asarray(self.data) # XXX: I think I should split this into complete_key and translate_key because for LArray keys I need a # complete key with axes for subaxis translated_key = self._translated_key(key) # FIXME: I have a huge problem with boolean labels + non points if isinstance(key, (LArray, np.ndarray)) and np.issubdtype(key.dtype, np.bool_): return LArray(data[translated_key], self._bool_key_new_axes(translated_key)) if any(isinstance(axis_key, LArray) for axis_key in translated_key): k2 = [k.data if isinstance(k, LArray) else k for k in translated_key] res_data = data[k2] axes = self._get_axes_from_translated_key(translated_key) first_col = AxisCollection(axes[0]) res_axes = first_col.union(*axes[1:]) return LArray(res_data, res_axes) # TODO: if the original key was a list of labels, subaxis(translated_key).labels == orig_key, so we should use # orig_axis_key.copy() axes = self._get_axes_from_translated_key(translated_key) if collapse_slices: translated_key = self._collapse_slices(translated_key) cross_key = self._cross_key(translated_key) data = data[cross_key] if not axes: # scalars do not need to be wrapped in LArray return data else: # drop length 1 dimensions created by scalar keys res_data = data.reshape(tuple(len(axis) for axis in axes)) assert _equal_modulo_len1(data.shape, res_data.shape) return LArray(res_data, axes) def __setitem__(self, key, value, collapse_slices=True): # TODO: if key or value has more axes than self, we should use # total_axes = self.axes + key.axes + value.axes # expanded = self.expand(total_axes) # data = np.asarray(expanded.data) # concerning keys this can make sense in several cases: # single bool LArray key with extra axes. # tuple of bool LArray keys (eg one for each axis). each could have extra axes. Common axes between keys are # not a problem, we can simply "and" them. Though we should avoid explicitly "and"ing them if there is no # common axis because that is less efficient than the implicit "and" that is done by numpy __getitem__ (and # the fact we need to combine dimensions when any key has more than 1 dim). # the bool value represents whether the axis label is taken or not if any bool key (part) has more than one # axis, we get combined dimensions out of it. # int LArray keys # the int value represent an index along ONE particular axis, even if the key has more than one axis. if isinstance(key, ExprNode): key = key.evaluate(self.axes) data = np.asarray(self.data) translated_key = self._translated_key(key) if isinstance(key, (LArray, np.ndarray)) and np.issubdtype(key.dtype, np.bool_): if isinstance(value, LArray): new_axes = self._bool_key_new_axes(translated_key, wildcard_allowed=True) value = value.broadcast_with(new_axes) data[translated_key] = value return if collapse_slices: translated_key = self._collapse_slices(translated_key) cross_key = self._cross_key(translated_key) if isinstance(value, LArray): # XXX: we might want to create fakes (or wildcard?) axes in this case, as we only use axes names and axes # length, not the ticks, and those could theoretically take a significant time to compute if self._needs_advanced_indexing(translated_key): # when adv indexing is needed, cross_key converts scalars to lists of 1 element, which does not remove # the dimension like scalars normally do axes = self._get_axes_from_translated_key(translated_key, True) else: axes = self._get_axes_from_translated_key(translated_key) value = value.broadcast_with(axes) value.axes.check_compatible(axes) # replace incomprehensible error message "could not broadcast input array from shape XX into shape YY" # for users by "incompatible axes" extra_axes = [axis for axis in value.axes - axes if len(axis) > 1] if extra_axes: extra_axes = AxisCollection(extra_axes) axes = AxisCollection(axes) text = 'axes are' if len(extra_axes) > 1 else 'axis is' raise ValueError("Value {!s} {} not present in target subset {!s}. A value can only have the same axes " "or fewer axes than the subset being targeted".format(extra_axes, text, axes)) else: # if value is a "raw" ndarray we rely on numpy broadcasting pass data[cross_key] = value def _bool_key_new_axes(self, key, wildcard_allowed=False, sep='_'): """ Returns an AxisCollection containing combined axes. Axes corresponding to scalar key are dropped. This method is used in case of boolean key. Parameters ---------- key : tuple Position-based key wildcard_allowed : bool Returns ------- AxisCollection Notes ----- See examples of properties `points` and `ipoints`. """ # TODO: use AxisCollection.combine_axes. The problem is that it uses product(*axes_labels) # while here we need zip(*axes_labels) combined_axes = [axis for axis_key, axis in zip(key, self.axes) if not _isnoneslice(axis_key) and not np.isscalar(axis_key)] # scalar axes are not taken, since we want to kill them other_axes = [axis for axis_key, axis in zip(key, self.axes) if _isnoneslice(axis_key)] assert len(key) > 0 axes_indices = [self.axes.index(axis) for axis in combined_axes] diff = np.diff(axes_indices) # this can happen if key has only None slices and scalars if not len(combined_axes): combined_axis_pos = None elif np.any(diff > 1): # combined axes in front combined_axis_pos = 0 else: combined_axis_pos = axes_indices[0] # all anonymous axes => anonymous combined axis if all(axis.name is None for axis in combined_axes): combined_name = None else: combined_name = sep.join(str(self.axes.axis_id(axis)) for axis in combined_axes) new_axes = other_axes if combined_axis_pos is not None: if wildcard_allowed: lengths = [len(axis_key) for axis_key in key if not _isnoneslice(axis_key) and not np.isscalar(axis_key)] combined_axis_len = lengths[0] assert all(l == combined_axis_len for l in lengths) combined_axis = Axis(combined_axis_len, combined_name) else: # TODO: the combined keys should be objects which display as: # (axis1_label, axis2_label, ...) but which should also store # the axis (names?) # Q: Should it be the same object as the NDLGroup?/NDKey? # A: yes, probably. On the Pandas backend, we could/should have # separate axes. On the numpy backend we cannot. axes_labels = [axis.labels[axis_key] for axis_key, axis in zip(key, self.axes) if not _isnoneslice(axis_key) and not np.isscalar(axis_key)] if len(combined_axes) == 1: # Q: if axis is a wildcard axis, should the result be a # wildcard axis (and axes_labels discarded?) combined_labels = axes_labels[0] else: combined_labels = [sep.join(str(l) for l in comb) for comb in zip(*axes_labels)] # CRAP, this can lead to duplicate labels (especially using .points) combined_axis = Axis(combined_labels, combined_name) new_axes.insert(combined_axis_pos, combined_axis) return AxisCollection(new_axes)
[docs] def set(self, value, **kwargs): """ Sets a subset of array to value. * all common axes must be either of length 1 or the same length * extra axes in value must be of length 1 * extra axes in current array can have any length Parameters ---------- value : scalar or LArray Examples -------- >>> arr = ndtest((3, 3)) >>> arr a\\b b0 b1 b2 a0 0 1 2 a1 3 4 5 a2 6 7 8 >>> arr['a1:', 'b1:'].set(10) >>> arr a\\b b0 b1 b2 a0 0 1 2 a1 3 10 10 a2 6 10 10 >>> arr['a1:', 'b1:'].set(ndrange("a=a1,a2;b=b1,b2")) >>> arr a\\b b0 b1 b2 a0 0 1 2 a1 3 0 1 a2 6 2 3 """ self.__setitem__(kwargs, value)
[docs] def reshape(self, target_axes): """ Given a list of new axes, changes the shape of the array. The size of the array (= number of elements) must be equal to the product of length of target axes. Parameters ---------- target_axes : iterable of Axis New axes. The size of the array (= number of stored data) must be equal to the product of length of target axes. Returns ------- LArray New array with new axes but same data. Examples -------- >>> arr = ndtest((2, 2, 2)) >>> arr a b\\c c0 c1 a0 b0 0 1 a0 b1 2 3 a1 b0 4 5 a1 b1 6 7 >>> new_arr = arr.reshape([Axis('a=a0,a1'), ... Axis(['b0c0', 'b0c1', 'b1c0', 'b1c1'], 'bc')]) >>> new_arr a\\bc b0c0 b0c1 b1c0 b1c1 a0 0 1 2 3 a1 4 5 6 7 """ # this is a dangerous operation, because except for adding length 1 axes (which is safe), it potentially # modifies data # TODO: add a check/flag? for "unsafe" reshapes (but allow merging # several axes & "splitting" axes) etc. # eg 4, 3, 2 -> 2, 3, 4 is wrong (even if size is respected) # 4, 3, 2 -> 12, 2 is potentially ok (merging adjacent dimensions) # -> 4, 6 is potentially ok (merging adjacent dimensions) # -> 24 is potentially ok (merging adjacent dimensions) # -> 3, 8 WRONG (non adjacent dimensions) # -> 8, 3 WRONG # 4, 3, 2 -> 2, 2, 3, 2 is potentially ok (splitting dim) data = np.asarray(self).reshape([len(axis) for axis in target_axes]) return LArray(data, target_axes)
[docs] def reshape_like(self, target): """ Same as reshape but with an array as input. Total size (= number of stored data) of the two arrays must be equal. See Also -------- reshape : returns a LArray with a new shape given a list of axes. Examples -------- >>> arr = zeros((2, 2, 2), dtype=int) >>> arr {0}* {1}*\\{2}* 0 1 0 0 0 0 0 1 0 0 1 0 0 0 1 1 0 0 >>> new_arr = arr.reshape_like(ndtest((2, 4))) >>> new_arr a\\b b0 b1 b2 b3 a0 0 0 0 0 a1 0 0 0 0 """ return self.reshape(target.axes)
[docs] def broadcast_with(self, target): """ Returns an array that is (NumPy) broadcastable with target. * all common axes must be either of length 1 or the same length * extra axes in source can have any length and will be moved to the front * extra axes in target can have any length and the result will have axes of length 1 for those axes This is different from reshape which ensures the result has exactly the shape of the target. Parameters ---------- target : LArray or collection of Axis Returns ------- LArray """ if isinstance(target, LArray): target_axes = target.axes else: target_axes = target if not isinstance(target, AxisCollection): target_axes = AxisCollection(target_axes) if self.axes == target_axes: return self target_axes = (self.axes - target_axes) | target_axes # XXX: this breaks la['1,5,9'] = la['2,7,3'] # but that use case should use drop_labels # self.axes.check_compatible(target_axes) # 1) reorder axes to target order array = self.transpose(target_axes & self.axes) # 2) add length one axes return array.reshape(array.axes.get_all(target_axes))
# XXX: I wonder if effectively dropping the labels is necessary or not # we could perfectly only mark the axis as being a wildcard axis and keep # the labels intact. These wildcard axes with labels # could be useful in a few situations. For example, Excel sheets could # have such behavior: you can slice columns using letters, but that # wouldn't prevent doing computation between arrays using different # columns. On the other hand, it makes wild axes less obvious and I # wonder if there would be a risk of wildcard axes inadvertently leaking. # plus it might be confusing if incompatible labels "work".
[docs] def drop_labels(self, axes=None): """Drops the labels from axes (replace those axes by "wildcard" axes). Useful when you want to apply operations between two arrays or subarrays with same shape but incompatible axes (different labels). Parameters ---------- axes : Axis or list/tuple/AxisCollection of Axis, optional Axis(es) on which you want to drop the labels. Returns ------- LArray Notes ----- Use it at your own risk. Examples -------- >>> a = Axis('a=a1,a2') >>> b = Axis('b=b1,b2') >>> b2 = Axis('b=b2,b3') >>> arr1 = ndrange([a, b]) >>> arr1 a\\b b1 b2 a1 0 1 a2 2 3 >>> arr1.drop_labels(b) a\\b* 0 1 a1 0 1 a2 2 3 >>> arr1.drop_labels([a, b]) a*\\b* 0 1 0 0 1 1 2 3 >>> arr2 = ndrange([a, b2]) >>> arr2 a\\b b2 b3 a1 0 1 a2 2 3 >>> arr1 * arr2 Traceback (most recent call last): ... ValueError: incompatible axes: Axis(['b2', 'b3'], 'b') vs Axis(['b1', 'b2'], 'b') >>> arr1 * arr2.drop_labels() a\\b b1 b2 a1 0 1 a2 4 9 >>> arr1.drop_labels() * arr2 a\\b b2 b3 a1 0 1 a2 4 9 >>> arr1.drop_labels(X.a) * arr2.drop_labels(X.b) a\\b b1 b2 a1 0 1 a2 4 9 """ if axes is None: axes = self.axes if not isinstance(axes, (tuple, list, AxisCollection)): axes = [axes] old_axes = self.axes[axes] new_axes = [Axis(len(axis), axis.name) for axis in old_axes] res_axes = self.axes[:] res_axes[axes] = new_axes return LArray(self.data, res_axes)
def __str__(self): if not self.ndim: return str(np.asscalar(self)) elif not len(self): return 'LArray([])' else: table = list(self.as_table(maxlines=200, edgeitems=5)) return table2str(table, 'nan', fullinfo=True, maxwidth=200, keepcols=self.ndim - 1) __repr__ = __str__ def __iter__(self): return LArrayIterator(self) def __contains__(self, key): return any(key in axis for axis in self.axes) def as_table(self, maxlines=None, edgeitems=5, light=False): """ Generator. Returns next line of the table representing an array. Parameters ---------- maxlines : int, optional Maximum number of lines to show. edgeitems : int, optional If number of lines to display is greater than `maxlines`, only the first and last `edgeitems` lines are displayed. Only active if `maxlines` is not None. Equals to 5 by default. Returns ------- list Next line of the table as a list. Examples -------- >>> arr = ndtest((2, 2, 3)) >>> list(arr.as_table()) # doctest: +NORMALIZE_WHITESPACE [['a', 'b\\\\c', 'c0', 'c1', 'c2'], ['a0', 'b0', 0, 1, 2], ['a0', 'b1', 3, 4, 5], ['a1', 'b0', 6, 7, 8], ['a1', 'b1', 9, 10, 11]] >>> list(arr.as_table(light=True)) # doctest: +NORMALIZE_WHITESPACE [['a', 'b\\\\c', 'c0', 'c1', 'c2'], ['a0', 'b0', 0, 1, 2], ['', 'b1', 3, 4, 5], ['a1', 'b0', 6, 7, 8], ['', 'b1', 9, 10, 11]] """ if not self.ndim: return # ert unit geo\time 2012 2011 2010 # NEER27 I05 AT 101.41 101.63 101.63 # NEER27 I05 AU 134.86 125.29 117.08 width = self.shape[-1] height = int(np.prod(self.shape[:-1])) data = np.asarray(self).reshape(height, width) # get list of names of axes axes_names = self.axes.display_names[:] # transforms ['a', 'b', 'c', 'd'] into ['a', 'b', 'c\\d'] if len(axes_names) > 1: axes_names[-2] = '\\'.join(axes_names[-2:]) axes_names.pop() # get list of labels for each axis except the last one. labels = [axis.labels.tolist() for axis in self.axes[:-1]] # creates vertical lines (ticks is a list of list) if self.ndim == 1: # There is no vertical axis, so the axis name should not have # any "tick" below it and we add an empty "tick". ticks = [['']] elif light: ticks = light_product(*labels) else: ticks = product(*labels) # returns the first line (axes names + labels of last axis) yield axes_names + self.axes[-1].labels.tolist() # summary if needed if maxlines is not None and height > maxlines: # replace middle lines of the table by '...'. # We show only the first and last edgeitems lines. startticks = islice(ticks, edgeitems) midticks = [["..."] * (self.ndim - 1)] endticks = list(islice(rproduct(*labels), edgeitems))[::-1] ticks = chain(startticks, midticks, endticks) data = chain(data[:edgeitems].tolist(), [["..."] * width], data[-edgeitems:].tolist()) for tick, dataline in izip(ticks, data): # returns next line (labels of N-1 first axes + data) yield list(tick) + dataline else: for tick, dataline in izip(ticks, data): # returns next line (labels of N-1 first axes + data) yield list(tick) + dataline.tolist() def dump(self, header=True): """Dump array as a 2D nested list Parameters ---------- header : bool Whether or not to output axes names and labels. Returns ------- 2D nested list """ if not header: # flatten all dimensions except the last one return self.data.reshape(-1, self.shape[-1]).tolist() else: return list(self.as_table()) # XXX: should filter(geo=['W']) return a view by default? (collapse=True) # I think it would be dangerous to make it the default # behavior, because that would introduce a subtle difference between # filter(dim=[a, b]) and filter(dim=[a]) even though it would be faster # and uses less memory. Maybe I should have a "view" argument which # defaults to 'auto' (ie collapse by default), can be set to False to # force a copy and to True to raise an exception if a view is not possible.
[docs] def filter(self, collapse=False, **kwargs): """Filters the array along the axes given as keyword arguments. The *collapse* argument determines whether consecutive ranges should be collapsed to slices, which is more efficient and returns a view (and not a copy) if possible (if all ranges are consecutive). Only use this argument if you do not intent to modify the resulting array, or if you know what you are doing. It is similar to np.take but works with several axes at once. """ return self.__getitem__(kwargs, collapse)
def _axis_aggregate(self, op, axes=(), keepaxes=False, out=None, **kwargs): """ Parameters ---------- op : function An aggregate function with this signature: func(a, axis=None, dtype=None, out=None, keepdims=False) axes : tuple of axes, optional Each axis can be an Axis object, str or int. out : LArray, optional Alternative output array in which to place the result. It must have the same shape as the expected output. keepaxes : bool or scalar, optional If this is set to True, the axes which are reduced are left in the result as dimensions with size one. Returns ------- LArray or scalar """ src_data = np.asarray(self) axes = self.axes[list(axes)] if axes else self.axes axes_indices = tuple(self.axes.index(a) for a in axes) if axes != self.axes else None if op.__name__ == 'ptp': if axes_indices is not None and len(axes) > 1: raise ValueError('ptp can only be applied along a single axis or all axes, not multiple arbitrary axes') elif axes_indices is not None: axes_indices = axes_indices[0] else: kwargs['keepdims'] = bool(keepaxes) if out is not None: assert isinstance(out, LArray) kwargs['out'] = out.data res_data = op(src_data, axis=axes_indices, **kwargs) if keepaxes: label = op.__name__.replace('nan', '') if keepaxes is True else keepaxes new_axes = [Axis([label], axis.name) for axis in axes] res_axes = self.axes[:] res_axes[axes] = new_axes else: res_axes = self.axes - axes if not res_axes: # scalars don't need to be wrapped in LArray return res_data else: return LArray(res_data, res_axes) def _cum_aggregate(self, op, axis): """ op is a numpy cumulative aggregate function: func(arr, axis=0). axis is an Axis object, a str or an int. Contrary to other aggregate functions this only supports one axis at a time. """ # TODO: accept a single group in axis, to filter & aggregate in one shot return LArray(op(np.asarray(self), axis=self.axes.index(axis)), self.axes) # TODO: now that items is never a (k, v), it should be renamed to # something else: args? (groups would be misleading because each "item" can contain several groups) # TODO: experiment implementing this using ufunc.reduceat # http://docs.scipy.org/doc/numpy-1.10.0/reference/generated/numpy.ufunc.reduceat.html # XXX: rename keepaxes to label=value? For group_aggregates we might want to keep the LGroup label if any def _group_aggregate(self, op, items, keepaxes=False, out=None, **kwargs): assert out is None res = self # TODO: when working with several "axes" at the same times, we should not produce the intermediary result at # all. It should be faster and consume a bit less memory. for item in items: res_axes = res.axes[:] res_shape = list(res.shape) if isinstance(item, tuple): assert all(isinstance(g, Group) for g in item) groups = item axis = groups[0].axis # they should all have the same axis (this is already checked # in _prepare_aggregate though) assert all(g.axis.equals(axis) for g in groups[1:]) killaxis = False else: # item is in fact a single group assert isinstance(item, Group), type(item) groups = (item,) axis = item.axis # it is easier to kill the axis after the fact killaxis = True axis, axis_idx = res.axes[axis], res.axes.index(axis) # potentially translate axis reference to real axes groups = tuple(g.with_axis(axis) for g in groups) res_shape[axis_idx] = len(groups) # XXX: this code is fragile. I wonder if there isn't a way to ask the function what kind of dtype/shape it # will return given the input we are going to give it. My first search for this found nothing. One # way to do this would be to create one big mapping: {(op, input dtype): res dtype} res_dtype = float if op in _always_return_float else res.dtype if op in (np.sum, np.nansum) and res.dtype in (np.bool, np.bool_): res_dtype = int res_data = np.empty(res_shape, dtype=res_dtype) group_idx = [slice(None) for _ in res_shape] for i, group in enumerate(groups): group_idx[axis_idx] = i # this is only useful for ndim == 1 because a[(0,)] (equivalent to a[0] which kills the axis) # is different from a[[0]] (which does not kill the axis) idx = tuple(group_idx) # we need only lists of ticks, not single ticks, otherwise the dimension is discarded too early # (in __getitem__ instead of in the aggregate func) if isinstance(group, IGroup) and np.isscalar(group.key): group = IGroup([group.key], axis=group.axis) elif isinstance(group, LGroup): key = _to_key(group.key) assert not isinstance(key, Group) if np.isscalar(key): key = [key] # we do not care about the name at this point group = LGroup(key, axis=group.axis) arr = res.__getitem__(group, collapse_slices=True) if res_data.ndim == 1: assert len(idx) == 1 and idx[0] == i # res_data[idx] but instead of returning a scalar (eg np.int32), it returns a 0d array which is a # view on res_data, which can thus be used as out out = res_data[i:i + 1].reshape(()) else: out = res_data[idx] arr = np.asarray(arr) op(arr, axis=axis_idx, out=out, **kwargs) del arr if killaxis: assert group_idx[axis_idx] == 0 res_data = res_data[idx] del res_axes[axis_idx] else: # We do NOT modify the axis name (eg append "_agg" or "*") even though this creates a new axis that is # independent from the original one because the original name is what users will want to use to access # that axis (eg in .filter kwargs) res_axes[axis_idx] = Axis(groups, axis.name) if isinstance(res_data, np.ndarray): res = LArray(res_data, res_axes) else: res = res_data return res def _prepare_aggregate(self, op, args, kwargs=None, commutative=False, stack_depth=1): """converts args to keys & LGroup and kwargs to LGroup""" if kwargs is None: kwargs_items = [] else: explicit_axis = kwargs.pop('axis', None) if explicit_axis is not None: explicit_axis = self.axes[explicit_axis] if isinstance(explicit_axis, Axis): args += (explicit_axis,) else: assert isinstance(explicit_axis, AxisCollection) args += tuple(explicit_axis) kwargs_items = kwargs.items() if not commutative and len(kwargs_items) > 1: # TODO: lift this restriction for python3.6+ raise ValueError("grouping aggregates on multiple axes at the same time using keyword arguments is not " "supported for '%s' (because it is not a commutative operation and keyword arguments are " "*not* ordered in Python)" % op.__name__) # Sort kwargs by axis name so that we have consistent results between runs because otherwise rounding errors # could lead to slightly different results even for commutative operations. sorted_kwargs = sorted(kwargs_items) # convert kwargs to LGroup so that we can only use args afterwards but still keep the axis information def standardise_kw_arg(axis_name, key, stack_depth=1): if isinstance(key, str): key = _to_keys(key, stack_depth + 1) if isinstance(key, tuple): # XXX +2? return tuple(standardise_kw_arg(axis_name, k, stack_depth + 1) for k in key) if isinstance(key, LGroup): return key return self.axes[axis_name][key] def to_labelgroup(key, stack_depth=1): if isinstance(key, str): key = _to_keys(key, stack_depth + 1) if isinstance(key, tuple): # a tuple is supposed to be several groups on the same axis # TODO: it would be better to use self._translate_axis_key directly (so that we do not need to do the # label -> position translation twice) but this fails because the groups are also used as ticks on the # new axis, and igroups are not the same that LGroups in this regard (I wonder if ideally it shouldn't # be the same???) # groups = tuple(self._translate_axis_key(k) for k in key) groups = tuple(self._guess_axis(_to_key(k, stack_depth + 1)) for k in key) axis = groups[0].axis if not all(g.axis.equals(axis) for g in groups[1:]): raise ValueError("group with different axes: %s" % str(key)) return groups if isinstance(key, (Group, int, basestring, list, slice)): return self._guess_axis(key) else: raise NotImplementedError("%s has invalid type (%s) for a group aggregate key" % (key, type(key).__name__)) def standardise_arg(arg, stack_depth=1): if self.axes.isaxis(arg): return self.axes[arg] else: return to_labelgroup(arg, stack_depth + 1) operations = [standardise_arg(a, stack_depth=stack_depth + 2) for a in args if a is not None] + \ [standardise_kw_arg(k, v, stack_depth=stack_depth + 2) for k, v in sorted_kwargs] if not operations: # op() without args is equal to op(all_axes) operations = self.axes return operations def _by_args_to_normal_agg_args(self, operations): # get axes to aggregate flat_op = chain.from_iterable([(o,) if isinstance(o, (Group, Axis)) else o for o in operations]) axes = [o.axis if isinstance(o, Group) else o for o in flat_op] to_agg = self.axes - axes # add groups to axes to aggregate def is_or_contains_group(o): return isinstance(o, Group) or (isinstance(o, tuple) and isinstance(o[0], Group)) return list(to_agg) + [o for o in operations if is_or_contains_group(o)] def _aggregate(self, op, args, kwargs=None, keepaxes=False, by_agg=False, commutative=False, out=None, extra_kwargs={}): operations = self._prepare_aggregate(op, args, kwargs, commutative, stack_depth=3) if by_agg and operations != self.axes: operations = self._by_args_to_normal_agg_args(operations) res = self # group *consecutive* same-type (group vs axis aggregates) operations # we do not change the order of operations since we only group consecutive operations. for are_axes, axes in groupby(operations, self.axes.isaxis): func = res._axis_aggregate if are_axes else res._group_aggregate res = func(op, axes, keepaxes=keepaxes, out=out, **extra_kwargs) return res # op=sum does not parse correctly
[docs] def with_total(self, *args, **kwargs): """with_total(*args, op='sum', label='total', **kwargs) Add aggregated values (sum by default) along each axis. A user defined label can be given to specified the computed values. Parameters ---------- *args : int or str or Axis or Group or any combination of those, optional Axes or groups along which to compute the aggregates. Passed groups should be named. Defaults to aggregate over the whole array. op : aggregate function, optional Defaults to `sum`. label : scalar value, optional Label to use for the total. Applies only to aggregated axes, not groups. Defaults to "total". **kwargs : int or str or Group or any combination of those, optional Axes or groups along which to compute the aggregates. Returns ------- LArray Examples -------- >>> arr = ndtest((3, 3)) >>> arr a\\b b0 b1 b2 a0 0 1 2 a1 3 4 5 a2 6 7 8 >>> arr.with_total() a\\b b0 b1 b2 total a0 0 1 2 3 a1 3 4 5 12 a2 6 7 8 21 total 9 12 15 36 >>> arr.with_total('a', 'b0,b1 >> total_01') a\\b b0 b1 b2 total_01 a0 0 1 2 1 a1 3 4 5 7 a2 6 7 8 13 total 9 12 15 21 >>> arr.with_total(op=prod, label='product') a\\b b0 b1 b2 product a0 0 1 2 0 a1 3 4 5 60 a2 6 7 8 336 product 0 28 80 0 """ # TODO: default to op.__name__ label = kwargs.pop('label', 'total') op = kwargs.pop('op', sum) npop = { sum: np.sum, prod: np.prod, min: np.min, max: np.max, mean: np.mean, ptp: np.ptp, var: np.var, std: np.std, median: np.median, percentile: np.percentile, } # TODO: commutative should be known for usual ops operations = self._prepare_aggregate(op, args, kwargs, False, stack_depth=2) res = self # TODO: we should allocate the final result directly and fill it progressively, so that the original array is # only copied once for axis in operations: # TODO: append/extend first with an empty array then _aggregate with out= if self.axes.isaxis(axis): value = res._axis_aggregate(npop[op], (axis,), keepaxes=label) else: # groups if not isinstance(axis, tuple): # assume a single group axis = (axis,) lgkey = axis axis = lgkey[0].axis value = res._aggregate(npop[op], (lgkey,)) res = res.extend(axis, value) return res
# TODO: make sure we can do # arr[x.sex.i[arr.indexofmin(x.sex)]] <- fails # and # arr[arr.labelofmin(x.sex)] <- fails # should both be equal to arr.min(x.sex) # the versions where axis is None already work as expected in the simple # case (no ambiguous labels): # arr.i[arr.indexofmin()] # arr[arr.labelofmin()] # for the case where axis is None, we should return an NDGroup # so that arr[arr.labelofmin()] works even if the minimum is on ambiguous labels
[docs] def labelofmin(self, axis=None): """Returns labels of the minimum values along a given axis. Parameters ---------- axis : int or str or Axis, optional Axis along which to work. If not specified, works on the full array. Returns ------- LArray Notes ----- In case of multiple occurrences of the minimum values, the indices corresponding to the first occurrence are returned. Examples -------- >>> nat = Axis('nat=BE,FR,IT') >>> sex = Axis('sex=M,F') >>> arr = LArray([[0, 1], [3, 2], [2, 5]], [nat, sex]) >>> arr nat\\sex M F BE 0 1 FR 3 2 IT 2 5 >>> arr.labelofmin(X.sex) nat BE FR IT M F M >>> arr.labelofmin() ('BE', 'M') """ if axis is not None: axis, axis_idx = self.axes[axis], self.axes.index(axis) data = axis.labels[self.data.argmin(axis_idx)] return LArray(data, self.axes - axis) else: indices = np.unravel_index(self.data.argmin(), self.shape) return tuple(axis.labels[i] for i, axis in zip(indices, self.axes))
argmin = renamed_to(labelofmin, 'argmin')
[docs] def indexofmin(self, axis=None): """Returns indices of the minimum values along a given axis. Parameters ---------- axis : int or str or Axis, optional Axis along which to work. If not specified, works on the full array. Returns ------- LArray Notes ----- In case of multiple occurrences of the minimum values, the indices corresponding to the first occurrence are returned. Examples -------- >>> nat = Axis('nat=BE,FR,IT') >>> sex = Axis('sex=M,F') >>> arr = LArray([[0, 1], [3, 2], [2, 5]], [nat, sex]) >>> arr nat\\sex M F BE 0 1 FR 3 2 IT 2 5 >>> arr.indexofmin(X.sex) nat BE FR IT 0 1 0 >>> arr.indexofmin() (0, 0) """ if axis is not None: axis, axis_idx = self.axes[axis], self.axes.index(axis) return LArray(self.data.argmin(axis_idx), self.axes - axis) else: return np.unravel_index(self.data.argmin(), self.shape)
posargmin = renamed_to(indexofmin, 'posargmin')
[docs] def labelofmax(self, axis=None): """Returns labels of the maximum values along a given axis. Parameters ---------- axis : int or str or Axis, optional Axis along which to work. If not specified, works on the full array. Returns ------- LArray Notes ----- In case of multiple occurrences of the maximum values, the labels corresponding to the first occurrence are returned. Examples -------- >>> nat = Axis('nat=BE,FR,IT') >>> sex = Axis('sex=M,F') >>> arr = LArray([[0, 1], [3, 2], [2, 5]], [nat, sex]) >>> arr nat\\sex M F BE 0 1 FR 3 2 IT 2 5 >>> arr.labelofmax(X.sex) nat BE FR IT F M F >>> arr.labelofmax() ('IT', 'F') """ if axis is not None: axis, axis_idx = self.axes[axis], self.axes.index(axis) data = axis.labels[self.data.argmax(axis_idx)] return LArray(data, self.axes - axis) else: indices = np.unravel_index(self.data.argmax(), self.shape) return tuple(axis.labels[i] for i, axis in zip(indices, self.axes))
argmax = renamed_to(labelofmax, 'argmax')
[docs] def indexofmax(self, axis=None): """Returns indices of the maximum values along a given axis. Parameters ---------- axis : int or str or Axis, optional Axis along which to work. If not specified, works on the full array. Returns ------- LArray Notes ----- In case of multiple occurrences of the maximum values, the labels corresponding to the first occurrence are returned. Examples -------- >>> nat = Axis('nat=BE,FR,IT') >>> sex = Axis('sex=M,F') >>> arr = LArray([[0, 1], [3, 2], [2, 5]], [nat, sex]) >>> arr nat\\sex M F BE 0 1 FR 3 2 IT 2 5 >>> arr.indexofmax(X.sex) nat BE FR IT 1 0 1 >>> arr.indexofmax() (2, 1) """ if axis is not None: axis, axis_idx = self.axes[axis], self.axes.index(axis) return LArray(self.data.argmax(axis_idx), self.axes - axis) else: return np.unravel_index(self.data.argmax(), self.shape)
posargmax = renamed_to(indexofmax, 'posargmax')
[docs] def labelsofsorted(self, axis=None, ascending=True, kind='quicksort'): """Returns the labels that would sort this array. Performs an indirect sort along the given axis using the algorithm specified by the `kind` keyword. It returns an array of labels of the same shape as `a` that index data along the given axis in sorted order. Parameters ---------- axis : int or str or Axis, optional Axis along which to sort. This can be omitted if array has only one axis. ascending : bool, optional Sort values in ascending order. Defaults to True. kind : {'quicksort', 'mergesort', 'heapsort'}, optional Sorting algorithm. Defaults to 'quicksort'. Returns ------- LArray Examples -------- >>> arr = LArray([[0, 1], [3, 2], [2, 5]], "nat=BE,FR,IT; sex=M,F") >>> arr nat\\sex M F BE 0 1 FR 3 2 IT 2 5 >>> arr.labelsofsorted('sex') nat\\sex 0 1 BE M F FR F M IT M F >>> arr.labelsofsorted('sex', ascending=False) nat\\sex 0 1 BE F M FR M F IT F M """ if axis is None: if self.ndim > 1: raise ValueError("array has ndim > 1 and no axis specified for labelsofsorted") axis = self.axes[0] axis = self.axes[axis] pos = self.indicesofsorted(axis, ascending=ascending, kind=kind) return LArray(axis.labels[pos.data], pos.axes)
argsort = renamed_to(labelsofsorted, 'argsort')
[docs] def indicesofsorted(self, axis=None, ascending=True, kind='quicksort'): """Returns the indices that would sort this array. Performs an indirect sort along the given axis using the algorithm specified by the `kind` keyword. It returns an array of indices with the same axes as `a` that index data along the given axis in sorted order. Parameters ---------- axis : int or str or Axis, optional Axis along which to sort. This can be omitted if array has only one axis. ascending : bool, optional Sort values in ascending order. Defaults to True. kind : {'quicksort', 'mergesort', 'heapsort'}, optional Sorting algorithm. Defaults to 'quicksort'. Returns ------- LArray Examples -------- >>> arr = LArray([[1, 5], [3, 2], [0, 4]], "nat=BE,FR,IT; sex=M,F") >>> arr nat\\sex M F BE 1 5 FR 3 2 IT 0 4 >>> arr.indicesofsorted('nat') nat\\sex M F 0 2 1 1 0 2 2 1 0 >>> arr.indicesofsorted('nat', ascending=False) nat\\sex M F 0 1 0 1 0 2 2 2 1 """ if axis is None: if self.ndim > 1: raise ValueError("array has ndim > 1 and no axis specified for indicesofsorted") axis = self.axes[0] axis, axis_idx = self.axes[axis], self.axes.index(axis) data = self.data.argsort(axis_idx, kind=kind) if not ascending: reverser = tuple(slice(None, None, -1) if i == axis_idx else slice(None) for i in range(self.ndim)) data = data[reverser] new_axis = Axis(np.arange(len(axis)), axis.name) return LArray(data, self.axes.replace(axis, new_axis))
posargsort = renamed_to(indicesofsorted, 'posargsort')
[docs] def copy(self): """Returns a copy of the array. """ return LArray(self.data.copy(), axes=self.axes[:], title=self.title)
@property def info(self): """Describes a LArray (title + shape and labels for each axis). Returns ------- str Description of the array (title + shape and labels for each axis). Examples -------- >>> nat = Axis('nat=BE,FO') >>> sex = Axis('sex=M,F') >>> mat0 = ones([nat, sex]) >>> mat0.info 2 x 2 nat [2]: 'BE' 'FO' sex [2]: 'M' 'F' dtype: float64 >>> mat1 = LArray(np.ones((2, 2)), [nat, sex], 'test matrix') >>> mat1.info test matrix 2 x 2 nat [2]: 'BE' 'FO' sex [2]: 'M' 'F' dtype: float64 """ if self.title: return ReprString(self.title + '\n' + self.axes.info + '\ndtype: ' + self.dtype.name) else: return ReprString(self.axes.info + '\ndtype: ' + self.dtype.name)
[docs] def ratio(self, *axes): """Returns an array with all values divided by the sum of values along given axes. Parameters ---------- *axes Returns ------- LArray array / array.sum(axes) Examples -------- >>> nat = Axis('nat=BE,FO') >>> sex = Axis('sex=M,F') >>> a = LArray([[4, 6], [2, 8]], [nat, sex]) >>> a nat\\sex M F BE 4 6 FO 2 8 >>> a.sum() 20 >>> a.ratio() nat\\sex M F BE 0.2 0.3 FO 0.1 0.4 >>> a.ratio(X.sex) nat\\sex M F BE 0.4 0.6 FO 0.2 0.8 >>> a.ratio('M') nat\\sex M F BE 1.0 1.5 FO 1.0 4.0 """ # # this does not work, but I am unsure it should # # >>> a.sum(age[[0, 1]], age[2]) / a.sum(age) # >>> a.sum(([0, 1], 2)) / a.sum(age) # # >>> a / a.sum(([0, 1], 2)) # >>> a.sum(x.sex) # >>> a.sum(x.age) # >>> a.sum(x.sex) / a.sum(x.age) # >>> a.ratio('F') # could mean # >>> a.sum('F') / a.sum(a.get_axis('F')) # >>> a.sum('F') / a.sum(x.sex) # age 0 1 2 # 1.0 0.6 0.555555555556 # OR (current meaning) # >>> a / a.sum('F') # age\\sex M F # 0 0.0 1.0 # 1 0.666666666667 1.0 # 2 0.8 1.0 # One solution is to add an argument # >>> a.ratio(what='F', by=x.sex) # age 0 1 2 # 1.0 0.6 0.555555555556 # >>> a.sum('F') / a.sum(x.sex) # >>> a.sum((age[[0, 1]], age[[1, 2]])) / a.sum(age) # >>> a.ratio((age[[0, 1]], age[[1, 2]]), by=age) # >>> a.sum((x.age[[0, 1]], x.age[[1, 2]])) / a.sum(x.age) # >>> a.ratio((x.age[[0, 1]], x.age[[1, 2]], by=x.age) # >>> lalala.sum(([0, 1], [1, 2])) / lalala.sum(x.age) # >>> lalala.ratio(([0, 1], [1, 2]), by=x.age) # >>> b = a.sum((age[[0, 1]], age[[1, 2]])) # >>> b # age\sex M F # [0 1] 2 4 # [1 2] 6 8 # >>> b / b.sum(x.age) # age\\sex M F # [0 1] 0.25 0.333333333333 # [1 2] 0.75 0.666666666667 # >>> b / a.sum(x.age) # age\\sex M F # [0 1] 0.333333333333 0.444444444444 # [1 2] 1.0 0.888888888889 # # >>> a.ratio([0, 1], [2]) # # >>> a.ratio(x.age[[0, 1]], x.age[2]) # >>> a.ratio((x.age[[0, 1]], x.age[2])) # nat\\sex M F # BE 0.0 1.0 # FO 0.6666666666 1.0 return self / self.sum(*axes)
[docs] def rationot0(self, *axes): """Returns a LArray with values array / array.sum(axes) where the sum is not 0, 0 otherwise. Parameters ---------- *axes Returns ------- LArray array / array.sum(axes) Examples -------- >>> a = Axis('a=a0,a1') >>> b = Axis('b=b0,b1,b2') >>> arr = LArray([[6, 0, 2], ... [4, 0, 8]], [a, b]) >>> arr a\\b b0 b1 b2 a0 6 0 2 a1 4 0 8 >>> arr.sum() 20 >>> arr.rationot0() a\\b b0 b1 b2 a0 0.3 0.0 0.1 a1 0.2 0.0 0.4 >>> arr.rationot0(X.a) a\\b b0 b1 b2 a0 0.6 0.0 0.2 a1 0.4 0.0 0.8 for reference, the normal ratio method would return: >>> arr.ratio(X.a) a\\b b0 b1 b2 a0 0.6 nan 0.2 a1 0.4 nan 0.8 """ return self.divnot0(self.sum(*axes))
[docs] def percent(self, *axes): """Returns an array with values given as percent of the total of all values along given axes. Parameters ---------- *axes Returns ------- LArray array / array.sum(axes) * 100 Examples -------- >>> nat = Axis('nat=BE,FO') >>> sex = Axis('sex=M,F') >>> a = LArray([[4, 6], [2, 8]], [nat, sex]) >>> a nat\\sex M F BE 4 6 FO 2 8 >>> a.percent() nat\\sex M F BE 20.0 30.0 FO 10.0 40.0 >>> a.percent(X.sex) nat\\sex M F BE 40.0 60.0 FO 20.0 80.0 """ # dividing by self.sum(*axes) * 0.01 would be faster in many cases but I suspect it loose more precision. return self * 100 / self.sum(*axes)
# aggregate method decorator def _decorate_agg_method(npfunc, nanfunc=None, commutative=False, by_agg=False, extra_kwargs=[], long_name='', action_verb='perform'): def decorated(func): _doc_agg_method(func, by_agg, long_name, action_verb, kwargs=extra_kwargs + ['out', 'skipna', 'keepaxes']) @functools.wraps(func) def wrapper(self, *args, **kwargs): keepaxes = kwargs.pop('keepaxes', _kwarg_agg['keepaxes']['value']) skipna = kwargs.pop('skipna', _kwarg_agg['skipna']['value']) out = kwargs.pop('out', _kwarg_agg['out']['value']) if skipna is None: skipna = nanfunc is not None if skipna and nanfunc is None: raise ValueError("skipna is not available for {}".format(func.__name__)) _npfunc = nanfunc if skipna else npfunc _extra_kwargs = {} for k in extra_kwargs: _extra_kwargs[k] = kwargs.pop(k, _kwarg_agg[k]['value']) return self._aggregate(_npfunc, args, kwargs, by_agg=by_agg, keepaxes=keepaxes, commutative=commutative, out=out, extra_kwargs=_extra_kwargs) return wrapper return decorated
[docs] @_decorate_agg_method(np.all, commutative=True, long_name="AND reduction") def all(self, *args, **kwargs): """{signature} Test whether all selected elements evaluate to True. {parameters} Returns ------- LArray of bool or bool See Also -------- LArray.all_by, LArray.any, LArray.any_by Examples -------- >>> arr = ndtest((4, 4)) >>> arr a\\b b0 b1 b2 b3 a0 0 1 2 3 a1 4 5 6 7 a2 8 9 10 11 a3 12 13 14 15 >>> barr = arr < 6 >>> barr a\\b b0 b1 b2 b3 a0 True True True True a1 True True False False a2 False False False False a3 False False False False >>> barr.all() False >>> # along axis 'a' >>> barr.all(X.a) b b0 b1 b2 b3 False False False False >>> # along axis 'b' >>> barr.all(X.b) a a0 a1 a2 a3 True False False False Select some rows only >>> barr.all(['a0', 'a1']) b b0 b1 b2 b3 True True False False >>> # or equivalently >>> # barr.all('a0,a1') Split an axis in several parts >>> barr.all((['a0', 'a1'], ['a2', 'a3'])) a\\b b0 b1 b2 b3 a0,a1 True True False False a2,a3 False False False False >>> # or equivalently >>> # barr.all('a0,a1;a2,a3') Same with renaming >>> barr.all((X.a['a0', 'a1'] >> 'a01', X.a['a2', 'a3'] >> 'a23')) a\\b b0 b1 b2 b3 a01 True True False False a23 False False False False >>> # or equivalently >>> # barr.all('a0,a1>>a01;a2,a3>>a23') """ pass
[docs] @_decorate_agg_method(np.all, commutative=True, by_agg=True, long_name="AND reduction") def all_by(self, *args, **kwargs): """{signature} Test whether all selected elements evaluate to True. {parameters} Returns ------- LArray of bool or bool See Also -------- LArray.all, LArray.any, LArray.any_by Examples -------- >>> arr = ndtest((4, 4)) >>> arr a\\b b0 b1 b2 b3 a0 0 1 2 3 a1 4 5 6 7 a2 8 9 10 11 a3 12 13 14 15 >>> barr = arr < 6 >>> barr a\\b b0 b1 b2 b3 a0 True True True True a1 True True False False a2 False False False False a3 False False False False >>> barr.all_by() False >>> # by axis 'a' >>> barr.all_by(X.a) a a0 a1 a2 a3 True False False False >>> # by axis 'b' >>> barr.all_by(X.b) b b0 b1 b2 b3 False False False False Select some rows only >>> barr.all_by(['a0', 'a1']) False >>> # or equivalently >>> # barr.all_by('a0,a1') Split an axis in several parts >>> barr.all_by((['a0', 'a1'], ['a2', 'a3'])) a a0,a1 a2,a3 False False >>> # or equivalently >>> # barr.all_by('a0,a1;a2,a3') Same with renaming >>> barr.all_by((X.a['a0', 'a1'] >> 'a01', X.a['a2', 'a3'] >> 'a23')) a a01 a23 False False >>> # or equivalently >>> # barr.all_by('a0,a1>>a01;a2,a3>>a23') """ pass
[docs] @_decorate_agg_method(np.any, commutative=True, long_name="OR reduction") def any(self, *args, **kwargs): """{signature} Test whether any selected elements evaluate to True. {parameters} Returns ------- LArray of bool or bool See Also -------- LArray.any_by, LArray.all, LArray.all_by Examples -------- >>> arr = ndtest((4, 4)) >>> arr a\\b b0 b1 b2 b3 a0 0 1 2 3 a1 4 5 6 7 a2 8 9 10 11 a3 12 13 14 15 >>> barr = arr < 6 >>> barr a\\b b0 b1 b2 b3 a0 True True True True a1 True True False False a2 False False False False a3 False False False False >>> barr.any() True >>> # along axis 'a' >>> barr.any(X.a) b b0 b1 b2 b3 True True True True >>> # along axis 'b' >>> barr.any(X.b) a a0 a1 a2 a3 True True False False Select some rows only >>> barr.any(['a0', 'a1']) b b0 b1 b2 b3 True True True True >>> # or equivalently >>> # barr.any('a0,a1') Split an axis in several parts >>> barr.any((['a0', 'a1'], ['a2', 'a3'])) a\\b b0 b1 b2 b3 a0,a1 True True True True a2,a3 False False False False >>> # or equivalently >>> # barr.any('a0,a1;a2,a3') Same with renaming >>> barr.any((X.a['a0', 'a1'] >> 'a01', X.a['a2', 'a3'] >> 'a23')) a\\b b0 b1 b2 b3 a01 True True True True a23 False False False False >>> # or equivalently >>> # barr.any('a0,a1>>a01;a2,a3>>a23') """ pass
[docs] @_decorate_agg_method(np.any, commutative=True, by_agg=True, long_name="OR reduction") def any_by(self, *args, **kwargs): """{signature} Test whether any selected elements evaluate to True. {parameters} Returns ------- LArray of bool or bool See Also -------- LArray.any, LArray.all, LArray.all_by Examples -------- >>> arr = ndtest((4, 4)) >>> arr a\\b b0 b1 b2 b3 a0 0 1 2 3 a1 4 5 6 7 a2 8 9 10 11 a3 12 13 14 15 >>> barr = arr < 6 >>> barr a\\b b0 b1 b2 b3 a0 True True True True a1 True True False False a2 False False False False a3 False False False False >>> barr.any_by() True >>> # by axis 'a' >>> barr.any_by(X.a) a a0 a1 a2 a3 True True False False >>> # by axis 'b' >>> barr.any_by(X.b) b b0 b1 b2 b3 True True True True Select some rows only >>> barr.any_by(['a0', 'a1']) True >>> # or equivalently >>> # barr.any_by('a0,a1') Split an axis in several parts >>> barr.any_by((['a0', 'a1'], ['a2', 'a3'])) a a0,a1 a2,a3 True False >>> # or equivalently >>> # barr.any_by('a0,a1;a2,a3') Same with renaming >>> barr.any_by((X.a['a0', 'a1'] >> 'a01', X.a['a2', 'a3'] >> 'a23')) a a01 a23 True False >>> # or equivalently >>> # barr.any_by('a0,a1>>a01;a2,a3>>a23') """ pass
# commutative modulo float precision errors
[docs] @_decorate_agg_method(np.sum, np.nansum, commutative=True, extra_kwargs=['dtype']) def sum(self, *args, **kwargs): """{signature} Computes the sum of array elements along given axes/groups. {parameters} Returns ------- LArray or scalar See Also -------- LArray.sum_by, LArray.prod, LArray.prod_by, LArray.cumsum, LArray.cumprod Examples -------- >>> arr = ndtest((4, 4)) >>> arr a\\b b0 b1 b2 b3 a0 0 1 2 3 a1 4 5 6 7 a2 8 9 10 11 a3 12 13 14 15 >>> arr.sum() 120 >>> # along axis 'a' >>> arr.sum(X.a) b b0 b1 b2 b3 24 28 32 36 >>> # along axis 'b' >>> arr.sum(X.b) a a0 a1 a2 a3 6 22 38 54 Select some rows only >>> arr.sum(['a0', 'a1']) b b0 b1 b2 b3 4 6 8 10 >>> # or equivalently >>> # arr.sum('a0,a1') Split an axis in several parts >>> arr.sum((['a0', 'a1'], ['a2', 'a3'])) a\\b b0 b1 b2 b3 a0,a1 4 6 8 10 a2,a3 20 22 24 26 >>> # or equivalently >>> # arr.sum('a0,a1;a2,a3') Same with renaming >>> arr.sum((X.a['a0', 'a1'] >> 'a01', X.a['a2', 'a3'] >> 'a23')) a\\b b0 b1 b2 b3 a01 4 6 8 10 a23 20 22 24 26 >>> # or equivalently >>> # arr.sum('a0,a1>>a01;a2,a3>>a23') """ pass
[docs] @_decorate_agg_method(np.sum, np.nansum, commutative=True, by_agg=True, extra_kwargs=['dtype'], long_name="sum") def sum_by(self, *args, **kwargs): """{signature} Computes the sum of array elements for the given axes/groups. {parameters} Returns ------- LArray or scalar See Also -------- LArray.sum, LArray.prod, LArray.prod_by, LArray.cumsum, LArray.cumprod Examples -------- >>> arr = ndtest((4, 4)) >>> arr a\\b b0 b1 b2 b3 a0 0 1 2 3 a1 4 5 6 7 a2 8 9 10 11 a3 12 13 14 15 >>> arr.sum_by() 120 >>> # along axis 'a' >>> arr.sum_by(X.a) a a0 a1 a2 a3 6 22 38 54 >>> # along axis 'b' >>> arr.sum_by(X.b) b b0 b1 b2 b3 24 28 32 36 Select some rows only >>> arr.sum_by(['a0', 'a1']) 28 >>> # or equivalently >>> # arr.sum_by('a0,a1') Split an axis in several parts >>> arr.sum_by((['a0', 'a1'], ['a2', 'a3'])) a a0,a1 a2,a3 28 92 >>> # or equivalently >>> # arr.sum_by('a0,a1;a2,a3') Same with renaming >>> arr.sum_by((X.a['a0', 'a1'] >> 'a01', X.a['a2', 'a3'] >> 'a23')) a a01 a23 28 92 >>> # or equivalently >>> # arr.sum_by('a0,a1>>a01;a2,a3>>a23') """ pass
# nanprod needs numpy 1.10
[docs] @_decorate_agg_method(np.prod, np_nanprod, commutative=True, extra_kwargs=['dtype'], long_name="product") def prod(self, *args, **kwargs): """{signature} Computes the product of array elements along given axes/groups. {parameters} Returns ------- LArray or scalar See Also -------- LArray.prod_by, LArray.sum, LArray.sum_by, LArray.cumsum, LArray.cumprod Examples -------- >>> arr = ndtest((4, 4)) >>> arr a\\b b0 b1 b2 b3 a0 0 1 2 3 a1 4 5 6 7 a2 8 9 10 11 a3 12 13 14 15 >>> arr.prod() 0 >>> # along axis 'a' >>> arr.prod(X.a) b b0 b1 b2 b3 0 585 1680 3465 >>> # along axis 'b' >>> arr.prod(X.b) a a0 a1 a2 a3 0 840 7920 32760 Select some rows only >>> arr.prod(['a0', 'a1']) b b0 b1 b2 b3 0 5 12 21 >>> # or equivalently >>> # arr.prod('a0,a1') Split an axis in several parts >>> arr.prod((['a0', 'a1'], ['a2', 'a3'])) a\\b b0 b1 b2 b3 a0,a1 0 5 12 21 a2,a3 96 117 140 165 >>> # or equivalently >>> # arr.prod('a0,a1;a2,a3') Same with renaming >>> arr.prod((X.a['a0', 'a1'] >> 'a01', X.a['a2', 'a3'] >> 'a23')) a\\b b0 b1 b2 b3 a01 0 5 12 21 a23 96 117 140 165 >>> # or equivalently >>> # arr.prod('a0,a1>>a01;a2,a3>>a23') """ pass
[docs] @_decorate_agg_method(np.prod, np_nanprod, commutative=True, by_agg=True, extra_kwargs=['dtype'], long_name="product") def prod_by(self, *args, **kwargs): """{signature} Computes the product of array elements for the given axes/groups. {parameters} Returns ------- LArray or scalar See Also -------- LArray.prod, LArray.sum, LArray.sum_by, LArray.cumsum, LArray.cumprod Examples -------- >>> arr = ndtest((4, 4)) >>> arr a\\b b0 b1 b2 b3 a0 0 1 2 3 a1 4 5 6 7 a2 8 9 10 11 a3 12 13 14 15 >>> arr.prod_by() 0 >>> # along axis 'a' >>> arr.prod_by(X.a) a a0 a1 a2 a3 0 840 7920 32760 >>> # along axis 'b' >>> arr.prod_by(X.b) b b0 b1 b2 b3 0 585 1680 3465 Select some rows only >>> arr.prod_by(['a0', 'a1']) 0 >>> # or equivalently >>> # arr.prod_by('a0,a1') Split an axis in several parts >>> arr.prod_by((['a0', 'a1'], ['a2', 'a3'])) a a0,a1 a2,a3 0 259459200 >>> # or equivalently >>> # arr.prod_by('a0,a1;a2,a3') Same with renaming >>> arr.prod_by((X.a['a0', 'a1'] >> 'a01', X.a['a2', 'a3'] >> 'a23')) a a01 a23 0 259459200 >>> # or equivalently >>> # arr.prod_by('a0,a1>>a01;a2,a3>>a23') """ pass
[docs] @_decorate_agg_method(np.min, np.nanmin, commutative=True, long_name="minimum", action_verb="search") def min(self, *args, **kwargs): """{signature} Get minimum of array elements along given axes/groups. {parameters} Returns ------- LArray or scalar See Also -------- LArray.min_by, LArray.max, LArray.max_by Examples -------- >>> arr = ndtest((4, 4)) >>> arr a\\b b0 b1 b2 b3 a0 0 1 2 3 a1 4 5 6 7 a2 8 9 10 11 a3 12 13 14 15 >>> arr.min() 0 >>> # along axis 'a' >>> arr.min(X.a) b b0 b1 b2 b3 0 1 2 3 >>> # along axis 'b' >>> arr.min(X.b) a a0 a1 a2 a3 0 4 8 12 Select some rows only >>> arr.min(['a0', 'a1']) b b0 b1 b2 b3 0 1 2 3 >>> # or equivalently >>> # arr.min('a0,a1') Split an axis in several parts >>> arr.min((['a0', 'a1'], ['a2', 'a3'])) a\\b b0 b1 b2 b3 a0,a1 0 1 2 3 a2,a3 8 9 10 11 >>> # or equivalently >>> # arr.min('a0,a1;a2,a3') Same with renaming >>> arr.min((X.a['a0', 'a1'] >> 'a01', X.a['a2', 'a3'] >> 'a23')) a\\b b0 b1 b2 b3 a01 0 1 2 3 a23 8 9 10 11 >>> # or equivalently >>> # arr.min('a0,a1>>a01;a2,a3>>a23') """ pass
[docs] @_decorate_agg_method(np.min, np.nanmin, commutative=True, by_agg=True, long_name="minimum", action_verb="search") def min_by(self, *args, **kwargs): """{signature} Get minimum of array elements for the given axes/groups. {parameters} Returns ------- LArray or scalar See Also -------- LArray.min, LArray.max, LArray.max_by Examples -------- >>> arr = ndtest((4, 4)) >>> arr a\\b b0 b1 b2 b3 a0 0 1 2 3 a1 4 5 6 7 a2 8 9 10 11 a3 12 13 14 15 >>> arr.min_by() 0 >>> # along axis 'a' >>> arr.min_by(X.a) a a0 a1 a2 a3 0 4 8 12 >>> # along axis 'b' >>> arr.min_by(X.b) b b0 b1 b2 b3 0 1 2 3 Select some rows only >>> arr.min_by(['a0', 'a1']) 0 >>> # or equivalently >>> # arr.min_by('a0,a1') Split an axis in several parts >>> arr.min_by((['a0', 'a1'], ['a2', 'a3'])) a a0,a1 a2,a3 0 8 >>> # or equivalently >>> # arr.min_by('a0,a1;a2,a3') Same with renaming >>> arr.min_by((X.a['a0', 'a1'] >> 'a01', X.a['a2', 'a3'] >> 'a23')) a a01 a23 0 8 >>> # or equivalently >>> # arr.min_by('a0,a1>>a01;a2,a3>>a23') """ pass
[docs] @_decorate_agg_method(np.max, np.nanmax, commutative=True, long_name="maximum", action_verb="search") def max(self, *args, **kwargs): """{signature} Get maximum of array elements along given axes/groups. {parameters} Returns ------- LArray or scalar See Also -------- LArray.max_by, LArray.min, LArray.min_by Examples -------- >>> arr = ndtest((4, 4)) >>> arr a\\b b0 b1 b2 b3 a0 0 1 2 3 a1 4 5 6 7 a2 8 9 10 11 a3 12 13 14 15 >>> arr.max() 15 >>> # along axis 'a' >>> arr.max(X.a) b b0 b1 b2 b3 12 13 14 15 >>> # along axis 'b' >>> arr.max(X.b) a a0 a1 a2 a3 3 7 11 15 Select some rows only >>> arr.max(['a0', 'a1']) b b0 b1 b2 b3 4 5 6 7 >>> # or equivalently >>> # arr.max('a0,a1') Split an axis in several parts >>> arr.max((['a0', 'a1'], ['a2', 'a3'])) a\\b b0 b1 b2 b3 a0,a1 4 5 6 7 a2,a3 12 13 14 15 >>> # or equivalently >>> # arr.max('a0,a1;a2,a3') Same with renaming >>> arr.max((X.a['a0', 'a1'] >> 'a01', X.a['a2', 'a3'] >> 'a23')) a\\b b0 b1 b2 b3 a01 4 5 6 7 a23 12 13 14 15 >>> # or equivalently >>> # arr.max('a0,a1>>a01;a2,a3>>a23') """ pass
[docs] @_decorate_agg_method(np.max, np.nanmax, commutative=True, by_agg=True, long_name="maximum", action_verb="search") def max_by(self, *args, **kwargs): """{signature} Get maximum of array elements for the given axes/groups. {parameters} Returns ------- LArray or scalar See Also -------- LArray.max, LArray.min, LArray.min_by Examples -------- >>> arr = ndtest((4, 4)) >>> arr a\\b b0 b1 b2 b3 a0 0 1 2 3 a1 4 5 6 7 a2 8 9 10 11 a3 12 13 14 15 >>> arr.max_by() 15 >>> # along axis 'a' >>> arr.max_by(X.a) a a0 a1 a2 a3 3 7 11 15 >>> # along axis 'b' >>> arr.max_by(X.b) b b0 b1 b2 b3 12 13 14 15 Select some rows only >>> arr.max_by(['a0', 'a1']) 7 >>> # or equivalently >>> # arr.max_by('a0,a1') Split an axis in several parts >>> arr.max_by((['a0', 'a1'], ['a2', 'a3'])) a a0,a1 a2,a3 7 15 >>> # or equivalently >>> # arr.max_by('a0,a1;a2,a3') Same with renaming >>> arr.max_by((X.a['a0', 'a1'] >> 'a01', X.a['a2', 'a3'] >> 'a23')) a a01 a23 7 15 >>> # or equivalently >>> # arr.max_by('a0,a1>>a01;a2,a3>>a23') """ pass
[docs] @_decorate_agg_method(np.mean, np.nanmean, commutative=True, extra_kwargs=['dtype']) def mean(self, *args, **kwargs): """{signature} Computes the arithmetic mean. {parameters} Returns ------- LArray or scalar See Also -------- LArray.mean_by, LArray.median, LArray.median_by, LArray.var, LArray.var_by, LArray.std, LArray.std_by, LArray.percentile, LArray.percentile_by Examples -------- >>> arr = ndtest((4, 4)) >>> arr a\\b b0 b1 b2 b3 a0 0 1 2 3 a1 4 5 6 7 a2 8 9 10 11 a3 12 13 14 15 >>> arr.mean() 7.5 >>> # along axis 'a' >>> arr.mean(X.a) b b0 b1 b2 b3 6.0 7.0 8.0 9.0 >>> # along axis 'b' >>> arr.mean(X.b) a a0 a1 a2 a3 1.5 5.5 9.5 13.5 Select some rows only >>> arr.mean(['a0', 'a1']) b b0 b1 b2 b3 2.0 3.0 4.0 5.0 >>> # or equivalently >>> # arr.mean('a0,a1') Split an axis in several parts >>> arr.mean((['a0', 'a1'], ['a2', 'a3'])) a\\b b0 b1 b2 b3 a0,a1 2.0 3.0 4.0 5.0 a2,a3 10.0 11.0 12.0 13.0 >>> # or equivalently >>> # arr.mean('a0,a1;a2,a3') Same with renaming >>> arr.mean((X.a['a0', 'a1'] >> 'a01', X.a['a2', 'a3'] >> 'a23')) a\\b b0 b1 b2 b3 a01 2.0 3.0 4.0 5.0 a23 10.0 11.0 12.0 13.0 >>> # or equivalently >>> # arr.mean('a0,a1>>a01;a2,a3>>a23') """ pass
[docs] @_decorate_agg_method(np.mean, np.nanmean, commutative=True, by_agg=True, extra_kwargs=['dtype'], long_name="mean") def mean_by(self, *args, **kwargs): """{signature} Computes the arithmetic mean. {parameters} Returns ------- LArray or scalar See Also -------- LArray.mean, LArray.median, LArray.median_by, LArray.var, LArray.var_by, LArray.std, LArray.std_by, LArray.percentile, LArray.percentile_by Examples -------- >>> arr = ndtest((4, 4)) >>> arr a\\b b0 b1 b2 b3 a0 0 1 2 3 a1 4 5 6 7 a2 8 9 10 11 a3 12 13 14 15 >>> arr.mean() 7.5 >>> # along axis 'a' >>> arr.mean_by(X.a) a a0 a1 a2 a3 1.5 5.5 9.5 13.5 >>> # along axis 'b' >>> arr.mean_by(X.b) b b0 b1 b2 b3 6.0 7.0 8.0 9.0 Select some rows only >>> arr.mean_by(['a0', 'a1']) 3.5 >>> # or equivalently >>> # arr.mean_by('a0,a1') Split an axis in several parts >>> arr.mean_by((['a0', 'a1'], ['a2', 'a3'])) a a0,a1 a2,a3 3.5 11.5 >>> # or equivalently >>> # arr.mean_by('a0,a1;a2,a3') Same with renaming >>> arr.mean_by((X.a['a0', 'a1'] >> 'a01', X.a['a2', 'a3'] >> 'a23')) a a01 a23 3.5 11.5 >>> # or equivalently >>> # arr.mean_by('a0,a1>>a01;a2,a3>>a23') """ pass
[docs] @_decorate_agg_method(np.median, np.nanmedian, commutative=True) def median(self, *args, **kwargs): """{signature} Computes the arithmetic median. {parameters} Returns ------- LArray or scalar See Also -------- LArray.median_by, LArray.mean, LArray.mean_by, LArray.var, LArray.var_by, LArray.std, LArray.std_by, LArray.percentile, LArray.percentile_by Examples -------- >>> arr = ndtest((4, 4)) >>> arr[:,:] = [[10, 7, 5, 9], \ [5, 8, 3, 7], \ [6, 2, 0, 9], \ [9, 10, 5, 6]] >>> arr a\\b b0 b1 b2 b3 a0 10 7 5 9 a1 5 8 3 7 a2 6 2 0 9 a3 9 10 5 6 >>> arr.median() 6.5 >>> # along axis 'a' >>> arr.median(X.a) b b0 b1 b2 b3 7.5 7.5 4.0 8.0 >>> # along axis 'b' >>> arr.median(X.b) a a0 a1 a2 a3 8.0 6.0 4.0 7.5 Select some rows only >>> arr.median(['a0', 'a1']) b b0 b1 b2 b3 7.5 7.5 4.0 8.0 >>> # or equivalently >>> # arr.median('a0,a1') Split an axis in several parts >>> arr.median((['a0', 'a1'], ['a2', 'a3'])) a\\b b0 b1 b2 b3 a0,a1 7.5 7.5 4.0 8.0 a2,a3 7.5 6.0 2.5 7.5 >>> # or equivalently >>> # arr.median('a0,a1;a2,a3') Same with renaming >>> arr.median((X.a['a0', 'a1'] >> 'a01', X.a['a2', 'a3'] >> 'a23')) a\\b b0 b1 b2 b3 a01 7.5 7.5 4.0 8.0 a23 7.5 6.0 2.5 7.5 >>> # or equivalently >>> # arr.median('a0,a1>>a01;a2,a3>>a23') """ pass
[docs] @_decorate_agg_method(np.median, np.nanmedian, commutative=True, by_agg=True, long_name="mediane") def median_by(self, *args, **kwargs): """{signature} Computes the arithmetic median. {parameters} Returns ------- LArray or scalar See Also -------- LArray.median, LArray.mean, LArray.mean_by, LArray.var, LArray.var_by, LArray.std, LArray.std_by, LArray.percentile, LArray.percentile_by Examples -------- >>> arr = ndtest((4, 4)) >>> arr[:,:] = [[10, 7, 5, 9], \ [5, 8, 3, 7], \ [6, 2, 0, 9], \ [9, 10, 5, 6]] >>> arr a\\b b0 b1 b2 b3 a0 10 7 5 9 a1 5 8 3 7 a2 6 2 0 9 a3 9 10 5 6 >>> arr.median_by() 6.5 >>> # along axis 'a' >>> arr.median_by(X.a) a a0 a1 a2 a3 8.0 6.0 4.0 7.5 >>> # along axis 'b' >>> arr.median_by(X.b) b b0 b1 b2 b3 7.5 7.5 4.0 8.0 Select some rows only >>> arr.median_by(['a0', 'a1']) 7.0 >>> # or equivalently >>> # arr.median_by('a0,a1') Split an axis in several parts >>> arr.median_by((['a0', 'a1'], ['a2', 'a3'])) a a0,a1 a2,a3 7.0 5.75 >>> # or equivalently >>> # arr.median_by('a0,a1;a2,a3') Same with renaming >>> arr.median_by((X.a['a0', 'a1'] >> 'a01', X.a['a2', 'a3'] >> 'a23')) a a01 a23 7.0 5.75 >>> # or equivalently >>> # arr.median_by('a0,a1>>a01;a2,a3>>a23') """ pass
# XXX: for performance reasons, we should use the fact that the underlying numpy function handles multiple # percentiles in one call. This is easy to implement in _axis_aggregate() but not in _group_aggregate() # since in this case np.percentile() may be called several times. # percentile needs an explicit method because it has not the same # signature as other aggregate functions (extra argument)
[docs] def percentile(self, q, *args, **kwargs): """{signature} Computes the qth percentile of the data along the specified axis. {parameters} Returns ------- LArray or scalar See Also -------- LArray.percentile_by, LArray.mean, LArray.mean_by, LArray.median, LArray.median_by, LArray.var, LArray.var_by, LArray.std, LArray.std_by Examples -------- >>> arr = ndtest((4, 4)) >>> arr a\\b b0 b1 b2 b3 a0 0 1 2 3 a1 4 5 6 7 a2 8 9 10 11 a3 12 13 14 15 >>> arr.percentile(25) 3.75 >>> # along axis 'a' >>> arr.percentile(25, X.a) b b0 b1 b2 b3 3.0 4.0 5.0 6.0 >>> # along axis 'b' >>> arr.percentile(25, X.b) a a0 a1 a2 a3 0.75 4.75 8.75 12.75 >>> # several percentile values >>> arr.percentile([25, 50, 75], X.b) percentile\\a a0 a1 a2 a3 25 0.75 4.75 8.75 12.75 50 1.5 5.5 9.5 13.5 75 2.25 6.25 10.25 14.25 Select some rows only >>> arr.percentile(25, ['a0', 'a1']) b b0 b1 b2 b3 1.0 2.0 3.0 4.0 >>> # or equivalently >>> # arr.percentile(25, 'a0,a1') Split an axis in several parts >>> arr.percentile(25, (['a0', 'a1'], ['a2', 'a3'])) a\\b b0 b1 b2 b3 a0,a1 1.0 2.0 3.0 4.0 a2,a3 9.0 10.0 11.0 12.0 >>> # or equivalently >>> # arr.percentile(25, 'a0,a1;a2,a3') Same with renaming >>> arr.percentile(25, (X.a['a0', 'a1'] >> 'a01', X.a['a2', 'a3'] >> 'a23')) a\\b b0 b1 b2 b3 a01 1.0 2.0 3.0 4.0 a23 9.0 10.0 11.0 12.0 >>> # or equivalently >>> # arr.percentile(25, 'a0,a1>>a01;a2,a3>>a23') """ keepaxes = kwargs.pop('keepaxes', _kwarg_agg['keepaxes']['value']) skipna = kwargs.pop('skipna', _kwarg_agg['skipna']['value']) out = kwargs.pop('out', _kwarg_agg['out']['value']) if skipna is None: skipna = True _npfunc = np.nanpercentile if skipna else np.percentile interpolation = kwargs.pop('interpolation', _kwarg_agg['interpolation']['value']) if isinstance(q, (list, tuple)): res = stack([(v, self._aggregate(_npfunc, args, kwargs, keepaxes=keepaxes, commutative=True, extra_kwargs={'q': v, 'interpolation': interpolation})) for v in q], 'percentile') return res.transpose() else : _extra_kwargs = {'q': q, 'interpolation': interpolation} return self._aggregate(_npfunc, args, kwargs, by_agg=False, keepaxes=keepaxes, commutative=True, out=out, extra_kwargs=_extra_kwargs)
_doc_agg_method(percentile, False, "qth percentile", extra_args=['q'], kwargs=['out', 'interpolation', 'skipna', 'keepaxes'])
[docs] def percentile_by(self, q, *args, **kwargs): """{signature} Computes the qth percentile of the data for the specified axis. {parameters} Returns ------- LArray or scalar See Also -------- LArray.percentile, LArray.mean, LArray.mean_by, LArray.median, LArray.median_by, LArray.var, LArray.var_by, LArray.std, LArray.std_by Examples -------- >>> arr = ndtest((4, 4)) >>> arr a\\b b0 b1 b2 b3 a0 0 1 2 3 a1 4 5 6 7 a2 8 9 10 11 a3 12 13 14 15 >>> arr.percentile_by(25) 3.75 >>> # along axis 'a' >>> arr.percentile_by(25, X.a) a a0 a1 a2 a3 0.75 4.75 8.75 12.75 >>> # along axis 'b' >>> arr.percentile_by(25, X.b) b b0 b1 b2 b3 3.0 4.0 5.0 6.0 >>> # several percentile values >>> arr.percentile_by([25, 50, 75], X.b) percentile\\b b0 b1 b2 b3 25 3.0 4.0 5.0 6.0 50 6.0 7.0 8.0 9.0 75 9.0 10.0 11.0 12.0 Select some rows only >>> arr.percentile_by(25, ['a0', 'a1']) 1.75 >>> # or equivalently >>> # arr.percentile_by('a0,a1') Split an axis in several parts >>> arr.percentile_by(25, (['a0', 'a1'], ['a2', 'a3'])) a a0,a1 a2,a3 1.75 9.75 >>> # or equivalently >>> # arr.percentile_by('a0,a1;a2,a3') Same with renaming >>> arr.percentile_by(25, (X.a['a0', 'a1'] >> 'a01', X.a['a2', 'a3'] >> 'a23')) a a01 a23 1.75 9.75 >>> # or equivalently >>> # arr.percentile_by('a0,a1>>a01;a2,a3>>a23') """ keepaxes = kwargs.pop('keepaxes', _kwarg_agg['keepaxes']['value']) skipna = kwargs.pop('skipna', _kwarg_agg['skipna']['value']) out = kwargs.pop('out', _kwarg_agg['out']['value']) if skipna is None: skipna = True _npfunc = np.nanpercentile if skipna else np.percentile interpolation = kwargs.pop('interpolation', _kwarg_agg['interpolation']['value']) if isinstance(q, (list, tuple)): res = stack([(v, self._aggregate(_npfunc, args, kwargs, by_agg=True, keepaxes=keepaxes, commutative=True, extra_kwargs={'q': v, 'interpolation': interpolation})) for v in q], 'percentile') return res.transpose() else: return self._aggregate(_npfunc, args, kwargs, by_agg=True, keepaxes=keepaxes, commutative=True, out=out, extra_kwargs={'q': q, 'interpolation': interpolation})
_doc_agg_method(percentile_by, True, "qth percentile", extra_args=['q'], kwargs=['out', 'interpolation', 'skipna', 'keepaxes']) # not commutative
[docs] def ptp(self, *args, **kwargs): """{signature} Returns the range of values (maximum - minimum). The name of the function comes from the acronym for ‘peak to peak’. {parameters} Returns ------- LArray or scalar Examples -------- >>> arr = ndtest((4, 4)) >>> arr a\\b b0 b1 b2 b3 a0 0 1 2 3 a1 4 5 6 7 a2 8 9 10 11 a3 12 13 14 15 >>> arr.ptp() 15 >>> # along axis 'a' >>> arr.ptp(X.a) b b0 b1 b2 b3 12 12 12 12 >>> # along axis 'b' >>> arr.ptp(X.b) a a0 a1 a2 a3 3 3 3 3 Select some rows only >>> arr.ptp(['a0', 'a1']) b b0 b1 b2 b3 4 4 4 4 >>> # or equivalently >>> # arr.ptp('a0,a1') Split an axis in several parts >>> arr.ptp((['a0', 'a1'], ['a2', 'a3'])) a\\b b0 b1 b2 b3 a0,a1 4 4 4 4 a2,a3 4 4 4 4 >>> # or equivalently >>> # arr.ptp('a0,a1;a2,a3') Same with renaming >>> arr.ptp((X.a['a0', 'a1'] >> 'a01', X.a['a2', 'a3'] >> 'a23')) a\\b b0 b1 b2 b3 a01 4 4 4 4 a23 4 4 4 4 >>> # or equivalently >>> # arr.ptp('a0,a1>>a01;a2,a3>>a23') """ out = kwargs.pop('out', _kwarg_agg['out']['value']) return self._aggregate(np.ptp, args, kwargs, out=out)
_doc_agg_method(ptp, False, kwargs=['out'])
[docs] @_decorate_agg_method(np.var, np.nanvar, extra_kwargs=['dtype', 'ddof'], long_name="variance") def var(self, *args, **kwargs): """{signature} Computes the unbiased variance. Normalized by N-1 by default. This can be changed using the ddof argument. {parameters} Returns ------- LArray or scalar See Also -------- LArray.var_by, LArray.std, LArray.std_by, LArray.mean, LArray.mean_by, LArray.median, LArray.median_by, LArray.percentile, LArray.percentile_by Examples -------- >>> arr = ndtest((2, 8), dtype=float) >>> arr[:,:] = [[0, 3, 5, 6, 4, 2, 1, 3], \ [7, 3, 2, 5, 8, 5, 6, 4]] >>> arr a\\b b0 b1 b2 b3 b4 b5 b6 b7 a0 0.0 3.0 5.0 6.0 4.0 2.0 1.0 3.0 a1 7.0 3.0 2.0 5.0 8.0 5.0 6.0 4.0 >>> arr.var() 4.7999999999999998 >>> # along axis 'b' >>> arr.var(X.b) a a0 a1 4.0 4.0 Select some columns only >>> arr.var(['b0', 'b1', 'b3']) a a0 a1 9.0 4.0 >>> # or equivalently >>> # arr.var('b0,b1,b3') Split an axis in several parts >>> arr.var((['b0', 'b1', 'b3'], 'b5:')) a\\b b0,b1,b3 b5: a0 9.0 1.0 a1 4.0 1.0 >>> # or equivalently >>> # arr.var('b0,b1,b3;b5:') Same with renaming >>> arr.var((X.b['b0', 'b1', 'b3'] >> 'b013', X.b['b5:'] >> 'b567')) a\\b b013 b567 a0 9.0 1.0 a1 4.0 1.0 >>> # or equivalently >>> # arr.var('b0,b1,b3>>b013;b5:>>b567') """ pass
[docs] @_decorate_agg_method(np.var, np.nanvar, by_agg=True, extra_kwargs=['dtype', 'ddof'], long_name="variance") def var_by(self, *args, **kwargs): """{signature} Computes the unbiased variance. Normalized by N-1 by default. This can be changed using the ddof argument. {parameters} Returns ------- LArray or scalar See Also -------- LArray.var, LArray.std, LArray.std_by, LArray.mean, LArray.mean_by, LArray.median, LArray.median_by, LArray.percentile, LArray.percentile_by Examples -------- >>> arr = ndtest((2, 8), dtype=float) >>> arr[:,:] = [[0, 3, 5, 6, 4, 2, 1, 3], \ [7, 3, 2, 5, 8, 5, 6, 4]] >>> arr a\\b b0 b1 b2 b3 b4 b5 b6 b7 a0 0.0 3.0 5.0 6.0 4.0 2.0 1.0 3.0 a1 7.0 3.0 2.0 5.0 8.0 5.0 6.0 4.0 >>> arr.var_by() 4.7999999999999998 >>> # along axis 'a' >>> arr.var_by(X.a) a a0 a1 4.0 4.0 Select some columns only >>> arr.var_by(X.a, ['b0','b1','b3']) a a0 a1 9.0 4.0 >>> # or equivalently >>> # arr.var_by('a','b0,b1,b3') Split an axis in several parts >>> arr.var_by(X.a, (['b0', 'b1', 'b3'], 'b5:')) a\\b b0,b1,b3 b5: a0 9.0 1.0 a1 4.0 1.0 >>> # or equivalently >>> # arr.var_by('a','b0,b1,b3;b5:') Same with renaming >>> arr.var_by(X.a, (X.b['b0', 'b1', 'b3'] >> 'b013', X.b['b5:'] >> 'b567')) a\\b b013 b567 a0 9.0 1.0 a1 4.0 1.0 >>> # or equivalently >>> # arr.var_by('a','b0,b1,b3>>b013;b5:>>b567') """ pass
[docs] @_decorate_agg_method(np.std, np.nanstd, extra_kwargs=['dtype', 'ddof'], long_name="standard deviation") def std(self, *args, **kwargs): """{signature} Computes the sample standard deviation. Normalized by N-1 by default. This can be changed using the ddof argument. {parameters} Returns ------- LArray or scalar See Also -------- LArray.std_by, LArray.var, LArray.var_by, LArray.mean, LArray.mean_by, LArray.median, LArray.median_by, LArray.percentile, LArray.percentile_by Examples -------- >>> arr = ndtest((2, 8), dtype=float) >>> arr[:,:] = [[0, 3, 5, 6, 4, 2, 1, 3], ... [7, 3, 2, 5, 8, 5, 6, 4]] >>> arr a\\b b0 b1 b2 b3 b4 b5 b6 b7 a0 0.0 3.0 5.0 6.0 4.0 2.0 1.0 3.0 a1 7.0 3.0 2.0 5.0 8.0 5.0 6.0 4.0 >>> arr.std() 2.1908902300206643 >>> # along axis 'b' >>> arr.std(X.b) a a0 a1 2.0 2.0 Select some columns only >>> arr.std(['b0', 'b1', 'b3']) a a0 a1 3.0 2.0 >>> # or equivalently >>> # arr.std('b0,b1,b3') Split an axis in several parts >>> arr.std((['b0', 'b1', 'b3'], 'b5:')) a\\b b0,b1,b3 b5: a0 3.0 1.0 a1 2.0 1.0 >>> # or equivalently >>> # arr.std('b0,b1,b3;b5:') Same with renaming >>> arr.std((X.b['b0', 'b1', 'b3'] >> 'b013', X.b['b5:'] >> 'b567')) a\\b b013 b567 a0 3.0 1.0 a1 2.0 1.0 >>> # or equivalently >>> # arr.std('b0,b1,b3>>b013;b5:>>b567') """ pass
[docs] @_decorate_agg_method(np.std, np.nanstd, by_agg=True, extra_kwargs=['dtype', 'ddof'], long_name="standard deviation") def std_by(self, *args, **kwargs): """{signature} Computes the sample standard deviation. Normalized by N-1 by default. This can be changed using the ddof argument. {parameters} Returns ------- LArray or scalar See Also -------- LArray.std_by, LArray.var, LArray.var_by, LArray.mean, LArray.mean_by, LArray.median, LArray.median_by, LArray.percentile, LArray.percentile_by Examples -------- >>> arr = ndtest((2, 8), dtype=float) >>> arr[:,:] = [[0, 3, 5, 6, 4, 2, 1, 3], ... [7, 3, 2, 5, 8, 5, 6, 4]] >>> arr a\\b b0 b1 b2 b3 b4 b5 b6 b7 a0 0.0 3.0 5.0 6.0 4.0 2.0 1.0 3.0 a1 7.0 3.0 2.0 5.0 8.0 5.0 6.0 4.0 >>> arr.std_by() 2.1908902300206643 >>> # along axis 'a' >>> arr.std_by(X.a) a a0 a1 2.0 2.0 Select some columns only >>> arr.std_by(X.a, ['b0','b1','b3']) a a0 a1 3.0 2.0 >>> # or equivalently >>> # arr.std_by('a','b0,b1,b3') Split an axis in several parts >>> arr.std_by(X.a, (['b0', 'b1', 'b3'], 'b5:')) a\\b b0,b1,b3 b5: a0 3.0 1.0 a1 2.0 1.0 >>> # or equivalently >>> # arr.std_by('a','b0,b1,b3;b5:') Same with renaming >>> arr.std_by(X.a, (X.b['b0', 'b1', 'b3'] >> 'b013', X.b['b5:'] >> 'b567')) a\\b b013 b567 a0 3.0 1.0 a1 2.0 1.0 >>> # or equivalently >>> # arr.std_by('a','b0,b1,b3>>b013;b5:>>b567') """ pass
# cumulative aggregates
[docs] def cumsum(self, axis=-1): """ Returns the cumulative sum of array elements along an axis. Parameters ---------- axis : int or str or Axis, optional Axis along which to perform the cumulative sum. If given as position, it can be a negative integer, in which case it counts from the last to the first axis. By default, the cumulative sum is performed along the last axis. Returns ------- LArray or scalar See Also -------- LArray.cumprod, LArray.sum, LArray.sum_by, LArray.prod, LArray.prod_by Notes ----- Cumulative aggregation functions accept only one axis Examples -------- >>> arr = ndtest((4, 4)) >>> arr a\\b b0 b1 b2 b3 a0 0 1 2 3 a1 4 5 6 7 a2 8 9 10 11 a3 12 13 14 15 >>> arr.cumsum() a\\b b0 b1 b2 b3 a0 0 1 3 6 a1 4 9 15 22 a2 8 17 27 38 a3 12 25 39 54 >>> arr.cumsum(X.a) a\\b b0 b1 b2 b3 a0 0 1 2 3 a1 4 6 8 10 a2 12 15 18 21 a3 24 28 32 36 """ return self._cum_aggregate(np.cumsum, axis)
[docs] def cumprod(self, axis=-1): """ Returns the cumulative product of array elements. Parameters ---------- axis : int or str or Axis, optional Axis along which to perform the cumulative product. If given as position, it can be a negative integer, in which case it counts from the last to the first axis. By default, the cumulative product is performed along the last axis. Returns ------- LArray or scalar See Also -------- LArray.cumsum, LArray.sum, LArray.sum_by, LArray.prod, LArray.prod_by Notes ----- Cumulative aggregation functions accept only one axis. Examples -------- >>> arr = ndtest((4, 4)) >>> arr a\\b b0 b1 b2 b3 a0 0 1 2 3 a1 4 5 6 7 a2 8 9 10 11 a3 12 13 14 15 >>> arr.cumprod() a\\b b0 b1 b2 b3 a0 0 0 0 0 a1 4 20 120 840 a2 8 72 720 7920 a3 12 156 2184 32760 >>> arr.cumprod(X.a) a\\b b0 b1 b2 b3 a0 0 1 2 3 a1 0 5 12 21 a2 0 45 120 231 a3 0 585 1680 3465 """ return self._cum_aggregate(np.cumprod, axis)
# element-wise method factory def _binop(opname): fullname = '__%s__' % opname super_method = getattr(np.ndarray, fullname) def opmethod(self, other): res_axes = self.axes if isinstance(other, ExprNode): other = other.evaluate(self.axes) # we could pass scalars through aslarray too but it is too costly performance-wise for only suppressing one # isscalar test and an if statement. # TODO: ndarray should probably be converted to larrays because that would harmonize broadcasting rules, but # it makes some tests fail for some reason. if not isinstance(other, (LArray, np.ndarray)) and not np.isscalar(other): other = aslarray(other) if isinstance(other, LArray): # TODO: first test if it is not already broadcastable (self, other), res_axes = make_numpy_broadcastable([self, other]) other = other.data return LArray(super_method(self.data, other), res_axes) opmethod.__name__ = fullname return opmethod __lt__ = _binop('lt') __le__ = _binop('le') __eq__ = _binop('eq') __ne__ = _binop('ne') __gt__ = _binop('gt') __ge__ = _binop('ge') __add__ = _binop('add') __radd__ = _binop('radd') __sub__ = _binop('sub') __rsub__ = _binop('rsub') __mul__ = _binop('mul') __rmul__ = _binop('rmul') if sys.version < '3': __div__ = _binop('div') __rdiv__ = _binop('rdiv') __truediv__ = _binop('truediv') __rtruediv__ = _binop('rtruediv') __floordiv__ = _binop('floordiv') __rfloordiv__ = _binop('rfloordiv') __mod__ = _binop('mod') __rmod__ = _binop('rmod') __divmod__ = _binop('divmod') __rdivmod__ = _binop('rdivmod') __pow__ = _binop('pow') __rpow__ = _binop('rpow') __lshift__ = _binop('lshift') __rlshift__ = _binop('rlshift') __rshift__ = _binop('rshift') __rrshift__ = _binop('rrshift') __and__ = _binop('and') __rand__ = _binop('rand') __xor__ = _binop('xor') __rxor__ = _binop('rxor') __or__ = _binop('or') __ror__ = _binop('ror') def __matmul__(self, other): """ Overrides operator @ for matrix multiplication. Notes ----- Only available with Python >= 3.5 Examples -------- >>> arr1d = ndtest(3) >>> arr1d a a0 a1 a2 0 1 2 >>> arr2d = ndtest((3, 3)) >>> arr2d a\\b b0 b1 b2 a0 0 1 2 a1 3 4 5 a2 6 7 8 >>> arr1d @ arr1d # doctest: +SKIP 5 >>> arr1d @ arr2d # doctest: +SKIP b b0 b1 b2 15 18 21 >>> arr2d @ arr1d # doctest: +SKIP a a0 a1 a2 5 14 23 >>> arr3d = ndrange('c=c0..c2;d=d0..d2;e=e0..e2') >>> arr1d @ arr3d # doctest: +SKIP c\\e e0 e1 e2 c0 15 18 21 c1 42 45 48 c2 69 72 75 >>> arr3d @ arr1d # doctest: +SKIP c\\d d0 d1 d2 c0 5 14 23 c1 32 41 50 c2 59 68 77 >>> arr3d @ arr3d # doctest: +SKIP c d\\e e0 e1 e2 c0 d0 15 18 21 c0 d1 42 54 66 c0 d2 69 90 111 c1 d0 366 396 426 c1 d1 474 513 552 c1 d2 582 630 678 c2 d0 1203 1260 1317 c2 d1 1392 1458 1524 c2 d2 1581 1656 1731 """ current = self[:] axes = self.axes if not isinstance(other, (LArray, np.ndarray)): raise NotImplementedError("matrix multiplication not implemented for %s" % type(other)) if isinstance(other, np.ndarray): other = LArray(other) other_axes = other.axes combined_axes = axes[:-2] + other_axes[:-2] if self.ndim > 2 and other.ndim > 2: current = current.expand(combined_axes).transpose(combined_axes) other = other.expand(combined_axes).transpose(combined_axes) # XXX : What doc of Numpy matmul says: # The behavior depends on the arguments in the following way: # * If both arguments are 2-D they are multiplied like conventional matrices. # * If either argument is N-D, N > 2, it is treated as a stack of matrices residing in the last two indexes # and broadcast accordingly. # * If the first argument is 1-D, it is promoted to a matrix by prepending a 1 to its dimensions. After matrix # multiplication the prepended 1 is removed. # * If the second argument is 1-D, it is promoted to a matrix by appending a 1 to its dimensions. After matrix # multiplication the appended 1 is removed. res_data = current.data.__matmul__(other.data) res_axes = list(combined_axes) if self.ndim > 1: res_axes += [axes[-2]] if other.ndim > 1: res_axes += [other_axes[-1].copy()] return LArray(res_data, res_axes) def __rmatmul__(self, other): if isinstance(other, np.ndarray): other = LArray(other) if not isinstance(other, LArray): raise NotImplementedError("matrix multiplication not implemented for %s" % type(other)) return other.__matmul__(self) # element-wise method factory def _unaryop(opname): fullname = '__%s__' % opname super_method = getattr(np.ndarray, fullname) def opmethod(self): return LArray(super_method(self.data), self.axes) opmethod.__name__ = fullname return opmethod # unary ops do not need broadcasting so do not need to be overridden __neg__ = _unaryop('neg') __pos__ = _unaryop('pos') __abs__ = _unaryop('abs') __invert__ = _unaryop('invert') def __round__(self, n=0): # XXX: use the ufuncs.round instead? return np.round(self, decimals=n) def __index__(self): return self.data.__index__() def __int__(self): return self.data.__int__() def __float__(self): return self.data.__float__()
[docs] def divnot0(self, other): """Divides array by other, but returns 0.0 where other is 0. Parameters ---------- other : scalar or LArray What to divide by. Returns ------- LArray Array divided by other, 0.0 where other is 0 Examples -------- >>> nat = Axis('nat=BE,FO') >>> sex = Axis('sex=M,F') >>> a = ndrange((nat, sex)) >>> a nat\\sex M F BE 0 1 FO 2 3 >>> b = ndrange(sex) >>> b sex M F 0 1 >>> a / b nat\\sex M F BE nan 1.0 FO inf 3.0 >>> a.divnot0(b) nat\\sex M F BE 0.0 1.0 FO 0.0 3.0 """ if np.isscalar(other): if other == 0: return zeros_like(self, dtype=float) else: return self / other else: with np.errstate(divide='ignore', invalid='ignore'): res = self / other res[other == 0] = 0 return res
# XXX: rename/change to "add_axes" ? # TODO: add a flag copy=True to force a new array.
[docs] def expand(self, target_axes=None, out=None, readonly=False): """Expands array to target_axes. Target axes will be added to array if not present. In most cases this function is not needed because LArray can do operations with arrays having different (compatible) axes. Parameters ---------- target_axes : list of Axis or AxisCollection, optional Self can contain axes not present in `target_axes`. The result axes will be: [self.axes not in target_axes] + target_axes out : LArray, optional Output array, must have the correct shape readonly : bool, optional Whether returning a readonly view is acceptable or not (this is much faster) Returns ------- LArray Original array if possible (and out is None). Examples -------- >>> a = Axis('a=a1,a2') >>> b = Axis('b=b1,b2') >>> arr = ndrange([a, b]) >>> arr a\\b b1 b2 a1 0 1 a2 2 3 >>> c = Axis('c=c1,c2') >>> arr.expand([a, c, b]) a c\\b b1 b2 a1 c1 0 1 a1 c2 0 1 a2 c1 2 3 a2 c2 2 3 >>> arr.expand([b, c]) a b\\c c1 c2 a1 b1 0 0 a1 b2 1 1 a2 b1 2 2 a2 b2 3 3 """ if target_axes is None and out is None or target_axes is not None and out is not None: raise ValueError("either target_axes or out must be defined (not both)") if out is not None: target_axes = out.axes else: if not isinstance(target_axes, AxisCollection): target_axes = AxisCollection(target_axes) target_axes = (self.axes - target_axes) | target_axes if out is None: # this is not strictly necessary but avoids doing this test twice if it is True if self.axes == target_axes: return self broadcasted = self.broadcast_with(target_axes) # this can only happen if only the order of axes differed and/or all extra axes have length 1 if broadcasted.axes == target_axes: return broadcasted if readonly: # requires numpy 1.10 return LArray(np.broadcast_to(broadcasted, target_axes.shape), target_axes) else: out = empty(target_axes, dtype=self.dtype) out[:] = broadcasted return out
[docs] def append(self, axis, value, label=None): """Adds an array to self along an axis. The two arrays must have compatible axes. Parameters ---------- axis : axis reference Axis along which to append input array (`value`). value : scalar or LArray Array with compatible axes. label : str, optional Label for the new item in axis Returns ------- LArray Array expanded with `value` along `axis`. Examples -------- >>> a = ones('nat=BE,FO;sex=M,F') >>> a nat\\sex M F BE 1.0 1.0 FO 1.0 1.0 >>> a.append(X.sex, a.sum(X.sex), 'M+F') nat\\sex M F M+F BE 1.0 1.0 2.0 FO 1.0 1.0 2.0 >>> a.append(X.nat, 2, 'Other') nat\\sex M F BE 1.0 1.0 FO 1.0 1.0 Other 2.0 2.0 >>> b = zeros('type=type1,type2') >>> b type type1 type2 0.0 0.0 >>> a.append(X.nat, b, 'Other') nat sex\\type type1 type2 BE M 1.0 1.0 BE F 1.0 1.0 FO M 1.0 1.0 FO F 1.0 1.0 Other M 0.0 0.0 Other F 0.0 0.0 """ axis = self.axes[axis] return self.insert(value, pos=len(axis), axis=axis, label=label)
[docs] def prepend(self, axis, value, label=None): """Adds an array before self along an axis. The two arrays must have compatible axes. Parameters ---------- axis : axis reference Axis along which to prepend input array (`value`) value : LArray Array with compatible axes. label : str, optional Label for the new item in axis Returns ------- LArray Array expanded with 'value' at the start of 'axis'. Examples -------- >>> a = ones('nat=BE,FO;sex=M,F') >>> a nat\sex M F BE 1.0 1.0 FO 1.0 1.0 >>> a.prepend(X.sex, a.sum(X.sex), 'M+F') nat\\sex M+F M F BE 2.0 1.0 1.0 FO 2.0 1.0 1.0 >>> a.prepend(X.nat, 2, 'Other') nat\\sex M F Other 2.0 2.0 BE 1.0 1.0 FO 1.0 1.0 >>> b = zeros('type=type1,type2') >>> b type type1 type2 0.0 0.0 >>> a.prepend(X.sex, b, 'Other') nat sex\\type type1 type2 BE Other 0.0 0.0 BE M 1.0 1.0 BE F 1.0 1.0 FO Other 0.0 0.0 FO M 1.0 1.0 FO F 1.0 1.0 """ return self.insert(value, pos=0, axis=axis, label=label)
[docs] def extend(self, axis, other): """Adds an array to self along an axis. The two arrays must have compatible axes. Parameters ---------- axis : axis Axis along which to extend with input array (`other`) other : LArray Array with compatible axes Returns ------- LArray Array expanded with 'other' along 'axis'. Examples -------- >>> nat = Axis('nat=BE,FO') >>> sex = Axis('sex=M,F') >>> sex2 = Axis('sex=U') >>> xtype = Axis('type=type1,type2') >>> arr1 = ones([sex, xtype]) >>> arr1 sex\\type type1 type2 M 1.0 1.0 F 1.0 1.0 >>> arr2 = zeros([sex2, xtype]) >>> arr2 sex\\type type1 type2 U 0.0 0.0 >>> arr1.extend(X.sex, arr2) sex\\type type1 type2 M 1.0 1.0 F 1.0 1.0 U 0.0 0.0 >>> arr3 = zeros([sex2, nat]) >>> arr3 sex\\nat BE FO U 0.0 0.0 >>> arr1.extend(X.sex, arr3) sex type\\nat BE FO M type1 1.0 1.0 M type2 1.0 1.0 F type1 1.0 1.0 F type2 1.0 1.0 U type1 0.0 0.0 U type2 0.0 0.0 """ return concat((self, other), axis)
[docs] def insert(self, value, before=None, after=None, pos=None, axis=None, label=None): """Inserts value in array along an axis. Parameters ---------- value : scalar or LArray Value to insert. If an LArray, it must have compatible axes. If value already has the axis along which it is inserted, `label` should not be used. before : scalar or Group Label or group before which to insert `value`. after : scalar or Group Label or group after which to insert `value`. pos : int Index before which to insert `value`. axis : axis reference (int, str or Axis), optional Axis in which to insert `value`. This is only required when using `pos` or when before or after are ambiguous labels. label : str, optional Label for the new item in axis. Returns ------- LArray Array with `value` inserted along `axis`. The dtype of the returned array will be the "closest" type which can hold both the array values and the inserted values without loss of information. For example, when mixing numeric and string types, the dtype will be object. Examples -------- >>> arr1 = ndtest((2, 3)) >>> arr1 a\\b b0 b1 b2 a0 0 1 2 a1 3 4 5 >>> arr1.insert(42, before='b1', label='b0.5') a\\b b0 b0.5 b1 b2 a0 0 42 1 2 a1 3 42 4 5 >>> arr2 = ndtest(2) >>> arr2 a a0 a1 0 1 >>> arr1.insert(arr2, after='b0', label='b0.5') a\\b b0 b0.5 b1 b2 a0 0 0 1 2 a1 3 1 4 5 >>> arr1.insert(42, axis='b', pos=1, label='b0.5') a\\b b0 b0.5 b1 b2 a0 0 42 1 2 a1 3 42 4 5 >>> arr1.insert(42, before=X.b.i[1], label='b0.5') a\\b b0 b0.5 b1 b2 a0 0 42 1 2 a1 3 42 4 5 insert an array which already has the axis >>> arr3 = ndrange('a=a0,a1;b=b0.1,b0.2') + 42 >>> arr3 a\\b b0.1 b0.2 a0 42 43 a1 44 45 >>> arr1.insert(arr3, before='b1') a\\b b0 b0.1 b0.2 b1 b2 a0 0 42 43 1 2 a1 3 44 45 4 5 """ # XXX: unsure we should have arr1.insert(arr3, before='b1,b2') result in (see unit tests): # a\\b b0 b0.1 b1 b0.2 b2 # a0 0 42 1 43 2 # a1 3 44 4 45 5 # we might to implement the following instead: # a\\b b0 b0.1 b0.2 b1 b0.1 b0.2 b2 # a0 0 42 43 1 42 43 2 # a1 3 44 45 4 44 45 5 # The later looks less useful and could be emulated easily via: # arr1.insert([arr3, arr3], before='b1,b2') # while the above is a bit harder to achieve manually: # arr1.insert([arr3[[b]] for b in arr3.b], before=['b1', 'b2']) # but the later is *probably* more intuitive (and wouldn't suffer from the inefficiency we currently have). # XXX: when we have several lists, we implicitly match them by position, which we should avoid for the usual # reason, but I am unsure what the best syntax for that would be. # the goal is to get this result # a\b b0 b0.5 b1 b1.5 b2 # a0 0 8 1 9 2 # a1 3 8 4 9 5 # When the inserted arrays already contain a label, this seems reasonably readable: # >>> arr1 = ndtest((2, 3)) # >>> arr1 # a\\b b0 b1 b2 # a0 0 1 2 # a1 3 4 5 # >>> arr2 = full('b=b0.5', 8) # >>> arr2 # b b0.5 # 8 # >>> arr3 = full('b=b1.5', 9) # >>> arr3 # b b1.5 # 9 # >>> arr1.insert(before={'b1': arr2, 'b2': arr3}) # a\\b b0 b0.5 b1 b1.5 b2 # a0 0 8 1 9 2 # a1 3 8 4 9 5 # When the inserted arrays/values have no label, this does not really convince me and it prevents using after # or pos. # >>> arr1.insert(value={'b0.5': ('b1', 8), 'b1.5': ('b2', 9)}) # a\b b0 b0.5 b1 b1.5 b2 # a0 0 8 1 9 2 # a1 3 8 4 9 5 # This works with both after and pos and we could support it along with the above syntax when no label is # needed. Problem: label, value is arbitrary and as such potentially hard to remember. # >>> arr1.insert(before={'b1': ('b0.5', 8), 'b2': ('b1.5', 9)}) # a\b b0 b0.5 b1 b1.5 b2 # a0 0 8 1 9 2 # a1 3 8 4 9 5 # This is shorter but not readable enough/even more arbitrary than the previous option. # >>> arr1.insert([(8, 'b1', 'b0.5'), (9, 'b2', 'b1.5')]) # a\b b0 b0.5 b1 b1.5 b2 # a0 0 8 1 9 2 # a1 3 8 4 9 5 # This is readable but odd and not much gained (except efficiency) compared with multiple insert calls # >>> arr1.insert([(8, 'before', 'b1', 'label', 'b0.5'), # (9, 'before', 'b2', 'label', 'b1.5')]) # >>> arr1.insert(8, before='b1', label='b0.5') \ # .insert(9, before='b2', label='b1.5') if sum([before is not None, after is not None, pos is not None]) != 1: raise ValueError("must specify exactly one of before, after or pos") axis = self.axes[axis] if axis is not None else None if before is not None: before = self._translate_axis_key(before) if axis is None else axis[before] axis = before.axis before_pos = axis.index(before) elif after is not None: after = self._translate_axis_key(after) if axis is None else axis[after] axis = after.axis before_pos = axis.index(after) + 1 else: assert pos is not None if axis is None: raise ValueError("axis argument must be provided when using insert(pos=)") before_pos = pos def length(v): if isinstance(v, LArray) and axis in v.axes: return len(v.axes[axis]) else: return len(v) if isinstance(v, (tuple, list, np.ndarray)) else 1 def expand(v, length): return v if isinstance(v, (tuple, list, np.ndarray)) else [v] * length num_inserts = max(length(before_pos), length(label), length(value)) stops = expand(before_pos, num_inserts) if isinstance(value, LArray) and axis in value.axes: # FIXME: when length(before_pos) == 1 and length(label) == 1, this is inefficient values = [value[[k]] for k in value.axes[axis]] else: values = expand(value, num_inserts) values = [aslarray(v) if not isinstance(v, LArray) else v for v in values] if label is not None: labels = expand(label, num_inserts) values = [v.expand(Axis([l], axis.name), readonly=True) for v, l in zip(values, labels)] start = 0 chunks = [] for stop, value in zip(stops, values): chunks.append(self[axis.i[start:stop]]) chunks.append(value) start = stop chunks.append(self[axis.i[start:]]) return concat(chunks, axis)
[docs] def transpose(self, *args): """Reorder axes. Parameters ---------- *args Accepts either a tuple of axes specs or axes specs as `*args`. Omitted axes keep their order. Use ... to avoid specifying intermediate axes. Returns ------- LArray LArray with reordered axes. Examples -------- >>> arr = ndtest((2, 2, 2)) >>> arr a b\\c c0 c1 a0 b0 0 1 a0 b1 2 3 a1 b0 4 5 a1 b1 6 7 >>> arr.transpose('b', 'c', 'a') b c\\a a0 a1 b0 c0 0 4 b0 c1 1 5 b1 c0 2 6 b1 c1 3 7 >>> arr.transpose('b') b a\\c c0 c1 b0 a0 0 1 b0 a1 4 5 b1 a0 2 3 b1 a1 6 7 >>> arr.transpose(..., 'a') # doctest: +SKIP b c\\a a0 a1 b0 c0 0 4 b0 c1 1 5 b1 c0 2 6 b1 c1 3 7 >>> arr.transpose('c', ..., 'a') # doctest: +SKIP c b\\a a0 a1 c0 b0 0 4 c0 b1 2 6 c1 b0 1 5 c1 b1 3 7 """ if len(args) == 1 and isinstance(args[0], (tuple, list, AxisCollection)): axes = args[0] elif len(args) == 0: axes = self.axes[::-1] else: axes = args axes = self.axes[axes] axes_indices = [self.axes.index(axis) for axis in axes] # this whole mumbo jumbo is required (for now) for anonymous axes indices_present = set(axes_indices) missing_indices = [i for i in range(len(self.axes)) if i not in indices_present] axes_indices = axes_indices + missing_indices return LArray(self.data.transpose(axes_indices), self.axes[axes_indices])
T = property(transpose)
[docs] def clip(self, a_min, a_max, out=None): """Clip (limit) the values in an array. Given an interval, values outside the interval are clipped to the interval edges. For example, if an interval of [0, 1] is specified, values smaller than 0 become 0, and values larger than 1 become 1. Parameters ---------- a_min : scalar or array-like Minimum value. a_max : scalar or array-like Maximum value. out : LArray, optional The results will be placed in this array. Returns ------- LArray An array with the elements of the current array, but where values < `a_min` are replaced with `a_min`, and those > `a_max` with `a_max`. Notes ----- If `a_min` and/or `a_max` are array_like, broadcast will occur between self, `a_min` and `a_max`. """ from larray.core.ufuncs import clip return clip(self, a_min, a_max, out)
[docs] def to_csv(self, filepath, sep=',', na_rep='', transpose=True, dropna=None, dialect='default', **kwargs): """ Writes array to a csv file. Parameters ---------- filepath : str path where the csv file has to be written. sep : str seperator for the csv file. na_rep : str replace NA values with na_rep. transpose : boolean transpose = True => transpose over last axis. transpose = False => no transpose. dialect : 'default' | 'classic' Whether or not to write the last axis name (using '\' ) dropna : None, 'all', 'any' or True, optional Drop lines if 'all' its values are NA, if 'any' value is NA or do not drop any line (default). True is equivalent to 'all'. Examples -------- >>> from larray.tests.common import abspath >>> fpath = abspath('test.csv') >>> a = ndrange('nat=BE,FO;sex=M,F') >>> a nat\\sex M F BE 0 1 FO 2 3 >>> a.to_csv(fpath) >>> with open(fpath) as f: ... print(f.read().strip()) nat\\sex,M,F BE,0,1 FO,2,3 >>> a.to_csv(fpath, sep=';', transpose=False) >>> with open(fpath) as f: ... print(f.read().strip()) nat;sex;0 BE;M;0 BE;F;1 FO;M;2 FO;F;3 >>> a.to_csv(fpath, dialect='classic') >>> with open(fpath) as f: ... print(f.read().strip()) nat,M,F BE,0,1 FO,2,3 """ fold = dialect == 'default' if transpose: frame = self.to_frame(fold, dropna) frame.to_csv(filepath, sep=sep, na_rep=na_rep, **kwargs) else: series = self.to_series(dropna is not None) series.to_csv(filepath, sep=sep, na_rep=na_rep, header=True, **kwargs)
[docs] def to_hdf(self, filepath, key, *args, **kwargs): """ Writes array to a HDF file. A HDF file can contain multiple arrays. The 'key' parameter is a unique identifier for the array. Parameters ---------- filepath : str Path where the hdf file has to be written. key : str or Group Name of the array within the HDF file. *args **kargs Examples -------- >>> a = ndtest((2, 3)) >>> a.to_hdf('test.h5', 'a') # doctest: +SKIP """ key = _translate_key_hdf(key) self.to_frame().to_hdf(filepath, key, *args, **kwargs)
[docs] def to_excel(self, filepath=None, sheet_name=None, position='A1', overwrite_file=False, clear_sheet=False, header=True, transpose=False, engine=None, *args, **kwargs): """ Writes array in the specified sheet of specified excel workbook. Parameters ---------- filepath : str or int or None, optional Path where the excel file has to be written. If None (default), creates a new Excel Workbook in a live Excel instance (Windows only). Use -1 to use the currently active Excel Workbook. Use a name without extension (.xlsx) to use any unsaved* workbook. sheet_name : str or Group or int or None, optional Sheet where the data has to be written. Defaults to None, Excel standard name if adding a sheet to an existing file, "Sheet1" otherwise. sheet_name can also refer to the position of the sheet (e.g. 0 for the first sheet, -1 for the last one). position : str or tuple of integers, optional Integer position (row, column) must be 1-based. Defaults to 'A1'. overwrite_file : bool, optional Whether or not to overwrite the existing file (or just modify the specified sheet). Defaults to False. clear_sheet : bool, optional Whether or not to clear the existing sheet (if any) before writing. Defaults to False. header : bool, optional Whether or not to write a header (axes names and labels). Defaults to True. transpose : bool, optional Whether or not to transpose the resulting array. This can be used, for example, for writing one dimensional arrays vertically. Defaults to False. engine : 'xlwings' | 'openpyxl' | 'xlsxwriter' | 'xlwt' | None, optional Engine to use to make the output. If None (default), it will use 'xlwings' by default if the module is installed and relies on Pandas default writer otherwise. *args **kwargs Examples -------- >>> a = ndrange('nat=BE,FO;sex=M,F') >>> # write to a new (unnamed) sheet >>> a.to_excel('test.xlsx') # doctest: +SKIP >>> # write to top-left corner of an existing sheet >>> a.to_excel('test.xlsx', 'Sheet1') # doctest: +SKIP >>> # add to existing sheet starting at position A15 >>> a.to_excel('test.xlsx', 'Sheet1', 'A15') # doctest: +SKIP """ sheet_name = _translate_sheet_name(sheet_name) df = self.to_frame(fold_last_axis_name=True) if engine is None: engine = 'xlwings' if xw is not None else None if engine == 'xlwings': from larray.inout.excel import open_excel close = False new_workbook = False if filepath is None: new_workbook = True elif isinstance(filepath, str): basename, ext = os.path.splitext(filepath) if ext: if not os.path.isfile(filepath): new_workbook = True close = True if new_workbook or overwrite_file: new_workbook = overwrite_file = True wb = open_excel(filepath, overwrite_file=overwrite_file) if new_workbook: sheet = wb.sheets[0] if sheet_name is not None: sheet.name = sheet_name elif sheet_name is not None and sheet_name in wb: sheet = wb.sheets[sheet_name] if clear_sheet: sheet.clear() else: sheet = wb.sheets.add(sheet_name, after=wb.sheets[-1]) options = dict(header=header, index=header, transpose=transpose) sheet[position].options(**options).value = df # TODO: implement transpose via/in dump # sheet[position] = self.dump(header=header, transpose=transpose) if close: wb.save() wb.close() else: if sheet_name is None: sheet_name = 'Sheet1' # TODO: implement position in this case # startrow, startcol df.to_excel(filepath, sheet_name, *args, engine=engine, **kwargs)
[docs] def to_clipboard(self, *args, **kwargs): """Sends the content of the array to clipboard. Using to_clipboard() makes it possible to paste the content of the array into a file (Excel, ascii file,...). Examples -------- >>> a = ndrange('nat=BE,FO;sex=M,F') >>> a.to_clipboard() # doctest: +SKIP """ self.to_frame().to_clipboard(*args, **kwargs)
# XXX: sep argument does not seem very useful # def to_excel(self, filename, sep=None): # # Why xlsxwriter? Because it is faster than openpyxl and xlwt # # currently does not .xlsx (only .xls). # # PyExcelerate seem like a decent alternative too # import xlsxwriter as xl # # if sep is None: # sep = '_' # #sep = self.sep # workbook = xl.Workbook(filename) # if self.ndim > 2: # for key in product(*[axis.labels for axis in self.axes[:-2]]): # sheetname = sep.join(str(k) for k in key) # # sheet names must not: # # * contain any of the following characters: : \ / ? * [ ] # # XXX: this will NOT work for unicode strings ! # table = string.maketrans('[:]', '(-)') # todelete = r'\/?*' # sheetname = sheetname.translate(table, todelete) # # * exceed 31 characters # # sheetname = sheetname[:31] # # * be blank # assert sheetname, "sheet name cannot be blank" # worksheet = workbook.add_worksheet(sheetname) # worksheet.write_row(0, 1, self.axes[-1].labels) # worksheet.write_column(1, 0, self.axes[-2].labels) # for row, data in enumerate(np.asarray(self[key])): # worksheet.write_row(1+row, 1, data) # # else: # worksheet = workbook.add_worksheet('Sheet1') # worksheet.write_row(0, 1, self.axes[-1].labels) # if self.ndim == 2: # worksheet.write_column(1, 0, self.axes[-2].labels) # for row, data in enumerate(np.asarray(self)): # worksheet.write_row(1+row, 1, data) @property def plot(self): """Plots the data of the array into a graph (window pop-up). The graph can be tweaked to achieve the desired formatting and can be saved to a .png file. Parameters ---------- kind : str - 'line' : line plot (default) - 'bar' : vertical bar plot - 'barh' : horizontal bar plot - 'hist' : histogram - 'box' : boxplot - 'kde' : Kernel Density Estimation plot - 'density' : same as 'kde' - 'area' : area plot - 'pie' : pie plot - 'scatter' : scatter plot (if array's dimensions >= 2) - 'hexbin' : hexbin plot (if array's dimensions >= 2) ax : matplotlib axes object, default None subplots : boolean, default False Make separate subplots for each column sharex : boolean, default True if ax is None else False In case subplots=True, share x axis and set some x axis labels to invisible; defaults to True if ax is None otherwise False if an ax is passed in; Be aware, that passing in both an ax and sharex=True will alter all x axis labels for all axis in a figure! sharey : boolean, default False In case subplots=True, share y axis and set some y axis labels to invisible layout : tuple (optional) (rows, columns) for the layout of subplots figsize : a tuple (width, height) in inches use_index : boolean, default True Use index as ticks for x axis title : string Title to use for the plot grid : boolean, default None (matlab style default) Axis grid lines legend : False/True/'reverse' Place legend on axis subplots style : list or dict matplotlib line style per column logx : boolean, default False Use log scaling on x axis logy : boolean, default False Use log scaling on y axis loglog : boolean, default False Use log scaling on both x and y axes xticks : sequence Values to use for the xticks yticks : sequence Values to use for the yticks xlim : 2-tuple/list ylim : 2-tuple/list rot : int, default None Rotation for ticks (xticks for vertical, yticks for horizontal plots) fontsize : int, default None Font size for xticks and yticks colormap : str or matplotlib colormap object, default None Colormap to select colors from. If string, load colormap with that name from matplotlib. colorbar : boolean, optional If True, plot colorbar (only relevant for 'scatter' and 'hexbin' plots) position : float Specify relative alignments for bar plot layout. From 0 (left/bottom-end) to 1 (right/top-end). Default is 0.5 (center) layout : tuple (optional) (rows, columns) for the layout of the plot yerr : array-like Error bars on y axis xerr : array-like Error bars on x axis stacked : boolean, default False in line and bar plots, and True in area plot. If True, create stacked plot. \**kwargs : keywords Options to pass to matplotlib plotting method Returns ------- axes : matplotlib.AxesSubplot or np.array of them Notes ----- See Pandas documentation of `plot` function for more details on this subject Examples -------- >>> import matplotlib.pyplot as plt # doctest: +SKIP >>> a = ndrange('sex=M,F;age=0..20') Simple line plot >>> a.plot() # doctest: +SKIP >>> # shows figure (reset the current figure after showing it! Do not call it before savefig) >>> plt.show() # doctest: +SKIP Line plot with grid, title and both axes in logscale >>> a.plot(grid=True, loglog=True, title='line plot') # doctest: +SKIP >>> # saves figure in a file (see matplotlib.pyplot.savefig documentation for more details) >>> plt.savefig('my_file.png') # doctest: +SKIP 2 bar plots sharing the same x axis (one for males and one for females) >>> a.plot.bar(subplots=True, sharex=True) # doctest: +SKIP >>> plt.show() # doctest: +SKIP Create a figure containing 2 x 2 graphs >>> # see matplotlib.pyplot.subplots documentation for more details >>> fig, ax = plt.subplots(2, 2, figsize=(15, 15)) # doctest: +SKIP >>> # 2 curves : Males and Females >>> a.plot(ax=ax[0, 0], title='line plot') # doctest: +SKIP >>> # bar plot with stacked values >>> a.plot.bar(ax=ax[0, 1], stacked=True, title='stacked bar plot') # doctest: +SKIP >>> # same as previously but with colored areas instead of bars >>> a.plot.area(ax=ax[1, 0], title='area plot') # doctest: +SKIP >>> # scatter plot >>> a.plot.scatter(ax=ax[1, 1], x='M', y='F', title='scatter plot') # doctest: +SKIP >>> plt.show() # doctest: +SKIP """ combined = self.combine_axes(self.axes[:-1], sep=' ') if self.ndim > 2 else self if combined.ndim == 1: return combined.to_series().plot else: return combined.transpose().to_frame().plot @property def shape(self): """Returns the shape of the array as a tuple. Returns ------- tuple Tuple representing the current shape. Examples -------- >>> a = ndrange('nat=BE,FO;sex=M,F;type=type1,type2,type3') >>> a.shape # doctest: +SKIP (2, 2, 3) """ return self.data.shape @property def ndim(self): """Returns the number of dimensions of the array. Returns ------- int Number of dimensions of a LArray. Examples -------- >>> a = ndrange('nat=BE,FO;sex=M,F') >>> a.ndim 2 """ return self.data.ndim @property def size(self): """Returns the number of elements in array. Returns ------- int Number of elements in array. Examples -------- >>> a = ndrange('sex=M,F;type=type1,type2,type3') >>> a.size 6 """ return self.data.size @property def nbytes(self): """Returns the number of bytes used to store the array in memory. Returns ------- int Number of bytes in array. Examples -------- >>> a = ndrange('sex=M,F;type=type1,type2,type3', dtype=float) >>> a.nbytes 48 """ return self.data.nbytes @property def memory_used(self): """Returns the memory consumed by the array in human readable form. Returns ------- str Memory used by the array. Examples -------- >>> a = ndrange('sex=M,F;type=type1,type2,type3', dtype=float) >>> a.memory_used '48 bytes' """ return size2str(self.data.nbytes) @property def dtype(self): """Returns the type of the data of the array. Returns ------- dtype Type of the data of the array. Examples -------- >>> a = zeros('sex=M,F;type=type1,type2,type3') >>> a.dtype dtype('float64') """ return self.data.dtype @property def item(self): return self.data.item def __len__(self): return len(self.data) def __array__(self, dtype=None): return np.asarray(self.data, dtype=dtype) __array_priority__ = 100 # XXX: implement guess axis? """ # guessing each axis >>> a.set_labels({'M': 'Men', 'BE': 'Belgian'}) nat\\sex Men Women BE 0 1 FO 2 3 # we have to choose which one to support because it is probably not a good idea to simultaneously support the # following syntax (even though we *could* support both if we split values on , before we determine if the key is # an axis or a label by looking if the value is a list or a single string. >>> a.set_labels({'sex': 'Men,Women', 'BE': 'Belgian'}) nat\\sex Men Women BE 0 1 FO 2 3 # this is shorter but I do not like it because string are both quoted and not quoted and you cannot have int # labels >>> a.set_labels(M='Men', BE='Belgian') nat\\sex Men Women BE 0 1 FO 2 3 """
[docs] def set_labels(self, axis=None, labels=None, inplace=False, **kwargs): """Replaces the labels of an axis of array. Parameters ---------- axis : string or Axis or dict Axis for which we want to replace labels, or mapping {axis: changes} where changes can either be the complete list of labels or a mapping {old_label: new_label}. labels : int, str, iterable or mapping, optional Integer or list of values usable as the collection of labels for an Axis. If this is mapping, it must be {old_label: new_label}. This argument must not be used if axis is a mapping. inplace : bool, optional Whether or not to modify the original object or return a new array and leave the original intact. Defaults to False. **kwargs : `axis`=`labels` for each axis you want to set labels. Returns ------- LArray Array with modified labels. Examples -------- >>> a = ndrange('nat=BE,FO;sex=M,F') >>> a nat\\sex M F BE 0 1 FO 2 3 >>> a.set_labels(X.sex, ['Men', 'Women']) nat\\sex Men Women BE 0 1 FO 2 3 when passing a single string as labels, it will be interpreted to create the list of labels, so that one can use the same syntax than during axis creation. >>> a.set_labels(X.sex, 'Men,Women') nat\\sex Men Women BE 0 1 FO 2 3 to replace only some labels, one must give a mapping giving the new label for each label to replace >>> a.set_labels(X.sex, {'M': 'Men'}) nat\\sex Men F BE 0 1 FO 2 3 to replace labels for several axes at the same time, one should give a mapping giving the new labels for each changed axis >>> a.set_labels({'sex': 'Men,Women', 'nat': 'Belgian,Foreigner'}) nat\\sex Men Women Belgian 0 1 Foreigner 2 3 or use keyword arguments >>> a.set_labels(sex='Men,Women', nat='Belgian,Foreigner') nat\\sex Men Women Belgian 0 1 Foreigner 2 3 one can also replace some labels in several axes by giving a mapping of mappings >>> a.set_labels({'sex': {'M': 'Men'}, 'nat': {'BE': 'Belgian'}}) nat\\sex Men F Belgian 0 1 FO 2 3 """ if axis is None: changes = {} elif isinstance(axis, dict): changes = axis elif isinstance(axis, (basestring, Axis, int)): changes = {axis: labels} else: raise ValueError("Expected None or a string/int/Axis/dict instance for axis argument") changes.update(kwargs) # TODO: we should implement the non-dict behavior in Axis.replace, so that we can simplify this code to: # new_axes = [self.axes[old_axis].replace(axis_changes) for old_axis, axis_changes in changes.items()] new_axes = [] for old_axis, axis_changes in changes.items(): real_axis = self.axes[old_axis] if isinstance(axis_changes, dict): new_axis = real_axis.replace(axis_changes) else: new_axis = Axis(axis_changes, real_axis.name) new_axes.append((old_axis, new_axis)) axes = self.axes.replace(new_axes) if inplace: self.axes = axes return self else: return LArray(self.data, axes)
[docs] def astype(self, dtype, order='K', casting='unsafe', subok=True, copy=True): return LArray(self.data.astype(dtype, order, casting, subok, copy), self.axes)
astype.__doc__ = np.ndarray.astype.__doc__
[docs] def shift(self, axis, n=1): """Shifts the cells of the array n-times to the left along axis. Parameters ---------- axis : int, str or Axis Axis for which we want to perform the shift. n : int Number of cells to shift. Returns ------- LArray Examples -------- >>> a = ndrange('sex=M,F;type=type1,type2,type3') >>> a sex\\type type1 type2 type3 M 0 1 2 F 3 4 5 >>> a.shift(X.type) sex\\type type2 type3 M 0 1 F 3 4 >>> a.shift(X.type, n=-1) sex\\type type1 type2 M 1 2 F 4 5 """ axis = self.axes[axis] if n > 0: return self[axis.i[:-n]].set_labels(axis, axis.labels[n:]) elif n < 0: return self[axis.i[-n:]].set_labels(axis, axis.labels[:n]) else: return self[:]
# TODO: add support for groups as axis (like aggregates) # eg a.diff(x.year[2018:]) instead of a[2018:].diff(x.year)
[docs] def diff(self, axis=-1, d=1, n=1, label='upper'): """Calculates the n-th order discrete difference along a given axis. The first order difference is given by out[n] = a[n + 1] - a[n] along the given axis, higher order differences are calculated by using diff recursively. Parameters ---------- axis : int, str or Axis, optional Axis along which the difference is taken. Defaults to the last axis. d : int, optional Periods to shift for forming difference. Defaults to 1. n : int, optional The number of times values are differenced. Defaults to 1. label : {'lower', 'upper'}, optional The new labels in `axis` will have the labels of either the array being subtracted ('lower') or the array it is subtracted from ('upper'). Defaults to 'upper'. Returns ------- LArray : The n-th order differences. The shape of the output is the same as `a` except for `axis` which is smaller by `n` * `d`. Examples -------- >>> a = ndrange('sex=M,F;type=type1,type2,type3').cumsum(X.type) >>> a sex\\type type1 type2 type3 M 0 1 3 F 3 7 12 >>> a.diff() sex\\type type2 type3 M 1 2 F 4 5 >>> a.diff(n=2) sex\\type type3 M 1 F 1 >>> a.diff(X.sex) sex\\type type1 type2 type3 F 3 6 9 """ array = self for _ in range(n): axis_obj = array.axes[axis] left = array[axis_obj.i[d:]] right = array[axis_obj.i[:-d]] if label == 'upper': right = right.drop_labels(axis) else: left = left.drop_labels(axis) array = left - right return array
# XXX: this is called pct_change in Pandas (but returns the same results, not results * 100, which I find silly). # Maybe change_rate would be better (because growth is not always positive)? # TODO: add support for groups as axis (like aggregates) # eg a.growth_rate(x.year[2018:]) instead of a[2018:].growth_rate(x.year)
[docs] def growth_rate(self, axis=-1, d=1, label='upper'): """Calculates the growth along a given axis. Roughly equivalent to a.diff(axis, d, label) / a[axis.i[:-d]] Parameters ---------- axis : int, str or Axis, optional Axis along which the difference is taken. Defaults to the last axis. d : int, optional Periods to shift for forming difference. Defaults to 1. label : {'lower', 'upper'}, optional The new labels in `axis` will have the labels of either the array being subtracted ('lower') or the array it is subtracted from ('upper'). Defaults to 'upper'. Returns ------- LArray Examples -------- >>> sex = Axis('sex=M,F') >>> year = Axis(range(2016, 2020), 'year') >>> a = LArray([[1.0, 2.0, 3.0, 3.0], [2.0, 3.0, 1.5, 3.0]], ... [sex, year]) >>> a sex\\year 2016 2017 2018 2019 M 1.0 2.0 3.0 3.0 F 2.0 3.0 1.5 3.0 >>> a.growth_rate() sex\\year 2017 2018 2019 M 1.0 0.5 0.0 F 0.5 -0.5 1.0 >>> a.growth_rate(d=2) sex\\year 2018 2019 M 2.0 0.5 F -0.25 0.0 """ diff = self.diff(axis=axis, d=d, label=label) axis_obj = self.axes[axis] return diff / self[axis_obj.i[:-d]].drop_labels(axis)
[docs] def compact(self): """Detects and removes "useless" axes (ie axes for which values are constant over the whole axis) Returns ------- LArray or scalar Array with constant axes removed. Examples -------- >>> a = LArray([[1, 2], ... [1, 2]], [Axis('sex=M,F'), Axis('nat=BE,FO')]) >>> a sex\\nat BE FO M 1 2 F 1 2 >>> a.compact() nat BE FO 1 2 """ res = self for axis in res.axes: if (res == res[axis.i[0]]).all(): res = res[axis.i[0]] return res
[docs] def combine_axes(self, axes=None, sep='_', wildcard=False): """Combine several axes into one. Parameters ---------- axes : tuple, list, AxisCollection of axes or list of combination of those or dict, optional axes to combine. Tuple, list or AxisCollection will combine several axes into one. To chain several axes combinations, pass a list of tuple/list/AxisCollection of axes. To set the name(s) of resulting axis(es), use a {(axes, to, combine): 'new_axis_name'} dictionary. Defaults to all axes. sep : str, optional delimiter to use for combining. Defaults to '_'. wildcard : bool, optional whether or not to produce a wildcard axis even if the axes to combine are not. This is much faster, but loose axes labels. Returns ------- LArray Array with combined axes. Examples -------- >>> arr = ndtest((2, 3)) >>> arr a\\b b0 b1 b2 a0 0 1 2 a1 3 4 5 >>> arr.combine_axes() a_b a0_b0 a0_b1 a0_b2 a1_b0 a1_b1 a1_b2 0 1 2 3 4 5 >>> arr.combine_axes(sep='/') a/b a0/b0 a0/b1 a0/b2 a1/b0 a1/b1 a1/b2 0 1 2 3 4 5 >>> arr = ndtest((2, 2, 2, 2)) >>> arr a b c\\d d0 d1 a0 b0 c0 0 1 a0 b0 c1 2 3 a0 b1 c0 4 5 a0 b1 c1 6 7 a1 b0 c0 8 9 a1 b0 c1 10 11 a1 b1 c0 12 13 a1 b1 c1 14 15 >>> arr.combine_axes(('a', 'c')) a_c b\\d d0 d1 a0_c0 b0 0 1 a0_c0 b1 4 5 a0_c1 b0 2 3 a0_c1 b1 6 7 a1_c0 b0 8 9 a1_c0 b1 12 13 a1_c1 b0 10 11 a1_c1 b1 14 15 >>> arr.combine_axes({('a', 'c'): 'ac'}) ac b\\d d0 d1 a0_c0 b0 0 1 a0_c0 b1 4 5 a0_c1 b0 2 3 a0_c1 b1 6 7 a1_c0 b0 8 9 a1_c0 b1 12 13 a1_c1 b0 10 11 a1_c1 b1 14 15 # make several combinations at once >>> arr.combine_axes([('a', 'c'), ('b', 'd')]) a_c\\b_d b0_d0 b0_d1 b1_d0 b1_d1 a0_c0 0 1 4 5 a0_c1 2 3 6 7 a1_c0 8 9 12 13 a1_c1 10 11 14 15 >>> arr.combine_axes({('a', 'c'): 'ac', ('b', 'd'): 'bd'}) ac\\bd b0_d0 b0_d1 b1_d0 b1_d1 a0_c0 0 1 4 5 a0_c1 2 3 6 7 a1_c0 8 9 12 13 a1_c1 10 11 14 15 """ if axes is None: axes = {tuple(self.axes): None} elif isinstance(axes, AxisCollection): axes = {tuple(axes): None} elif isinstance(axes, (list, tuple)): # checks for nested tuple/list if all(isinstance(axis, (list, tuple, AxisCollection)) for axis in axes): axes = {tuple(axes_to_combine): None for axes_to_combine in axes} else: axes = {tuple(axes): None} # axes should be a dict at this time assert isinstance(axes, dict) transposed_axes = self.axes[:] for axes_to_combine, name in axes.items(): # transpose all axes next to each other, using index of first axis axes_to_combine = self.axes[axes_to_combine] axes_indices = [transposed_axes.index(axis) for axis in axes_to_combine] min_axis_index = min(axes_indices) transposed_axes = transposed_axes - axes_to_combine transposed_axes = transposed_axes[:min_axis_index] + axes_to_combine + transposed_axes[min_axis_index:] transposed = self.transpose(transposed_axes) new_axes = transposed.axes.combine_axes(axes, sep=sep, wildcard=wildcard) return transposed.reshape(new_axes)
[docs] def split_axes(self, axes=None, sep='_', names=None, regex=None, sort=False, fill_value=nan): """Split axes and returns a new array Parameters ---------- axes : int, str, Axis or any combination of those axes to split. All labels *must* contain the given delimiter string. To split several axes at once, pass a list or tuple of axes to split. To set the names of resulting axes, use a {'axis_to_split': (new, axes)} dictionary. Defaults to all axes whose name contains the `sep` delimiter. sep : str, optional delimiter to use for splitting. Defaults to '_'. When `regex` is provided, the delimiter is only used on `names` if given as one string or on axis name if `names` is None. names : str or list of str, optional names of resulting axes. Defaults to None. regex : str, optional use regex instead of delimiter to split labels. Defaults to None. sort : bool, optional Whether or not to sort the combined axis before splitting it. When all combinations of labels are present in the combined axis, sorting is faster than not sorting. Defaults to False. fill_value : scalar or LArray, optional Value to use for missing values when the combined axis does not contain all combination of labels. Defaults to NaN. Returns ------- LArray Examples -------- >>> arr = ndtest((2, 3)) >>> arr a\\b b0 b1 b2 a0 0 1 2 a1 3 4 5 >>> combined = arr.combine_axes() >>> combined a_b a0_b0 a0_b1 a0_b2 a1_b0 a1_b1 a1_b2 0 1 2 3 4 5 >>> combined.split_axes() a\\b b0 b1 b2 a0 0 1 2 a1 3 4 5 Split labels using regex >>> combined = ndrange('a_b=a0b0..a1b2') >>> combined a_b a0b0 a0b1 a0b2 a1b0 a1b1 a1b2 0 1 2 3 4 5 >>> combined.split_axes('a_b', regex='(\\\\w{2})(\\\\w{2})') a\\b b0 b1 b2 a0 0 1 2 a1 3 4 5 Split several axes at once >>> combined = ndrange('a_b=a0_b0..a1_b1; c_d=c0_d0..c1_d1') >>> combined a_b\\c_d c0_d0 c0_d1 c1_d0 c1_d1 a0_b0 0 1 2 3 a0_b1 4 5 6 7 a1_b0 8 9 10 11 a1_b1 12 13 14 15 >>> # equivalent to combined.split_axes() which split all axes whose name contains the `sep` delimiter. >>> combined.split_axes(['a_b', 'c_d']) a b c\\d d0 d1 a0 b0 c0 0 1 a0 b0 c1 2 3 a0 b1 c0 4 5 a0 b1 c1 6 7 a1 b0 c0 8 9 a1 b0 c1 10 11 a1 b1 c0 12 13 a1 b1 c1 14 15 >>> combined.split_axes({'a_b': ('A', 'B'), 'c_d': ('C', 'D')}) A B C\\D d0 d1 a0 b0 c0 0 1 a0 b0 c1 2 3 a0 b1 c0 4 5 a0 b1 c1 6 7 a1 b0 c0 8 9 a1 b0 c1 10 11 a1 b1 c0 12 13 a1 b1 c1 14 15 """ array = self.sort_axes(axes) if sort else self # TODO: # * do multiple axes split in one go # * somehow factorize this code with AxisCollection.split_axes if axes is None: axes = {axis: None for axis in array.axes if sep in axis.name} elif isinstance(axes, (int, basestring, Axis)): axes = {axes: None} elif isinstance(axes, (list, tuple)): if all(isinstance(axis, (int, basestring, Axis)) for axis in axes): axes = {axis: None for axis in axes} else: raise ValueError("Expected tuple or list of int, string or Axis instances") # axes should be a dict at this time assert isinstance(axes, dict) for axis, names in axes.items(): axis = array.axes[axis] split_axes, split_labels = axis.split(sep, names, regex, return_labels=True) axis_index = array.axes.index(axis) new_axes = array.axes[:axis_index] + split_axes + array.axes[axis_index + 1:] # fast path when all combinations of labels are present in the combined axis all_combinations_present = AxisCollection(split_axes).size == len(np.unique(axis.labels)) if all_combinations_present and sort: array = array.reshape(new_axes) else: if all_combinations_present: res = empty(new_axes, dtype=array.dtype) else: res = full(new_axes, fill_value=fill_value, dtype=common_type((array, fill_value))) if names is None: names = axis.name.split(sep) # Rename axis to make sure we broadcast correctly. We should NOT use sep here, but rather '_' must be # kept in sync with the default sep of _bool_key_new_axes new_axis_name = '_'.join(names) if new_axis_name != axis.name: array = array.rename(axis, new_axis_name) res.points[split_labels] = array array = res return array
split_axis = renamed_to(split_axes, 'split_axis')
[docs]def aslarray(a): """ Converts input as LArray if possible. Parameters ---------- a : array-like Input array to convert into a LArray. Returns ------- LArray Examples -------- >>> # NumPy array >>> np_arr = np.arange(6).reshape((2,3)) >>> aslarray(np_arr) {0}*\{1}* 0 1 2 0 0 1 2 1 3 4 5 >>> # Pandas dataframe >>> data = {'normal' : pd.Series([1., 2., 3.], index=['a', 'b', 'c']), ... 'reverse' : pd.Series([3., 2., 1.], index=['a', 'b', 'c'])} >>> df = pd.DataFrame(data) >>> aslarray(df) {0}\{1} normal reverse a 1.0 3.0 b 2.0 2.0 c 3.0 1.0 """ if isinstance(a, LArray): return a elif hasattr(a, '__larray__'): return a.__larray__() elif isinstance(a, pd.DataFrame): from larray.inout.array import from_frame return from_frame(a) else: return LArray(a)
def _check_axes_argument(func): @functools.wraps(func) def wrapper(*args, **kwargs): if len(args) > 1 and isinstance(args[1], (int, Axis)): raise ValueError("If you want to pass several axes or dimension lengths to {}, you must pass them as a " "list (using []) or tuple (using()).".format(func.__name__)) return func(*args, **kwargs) return wrapper
[docs]@_check_axes_argument def zeros(axes, title='', dtype=float, order='C'): """Returns an array with the specified axes and filled with zeros. Parameters ---------- axes : int, tuple of int, Axis or tuple/list/AxisCollection of Axis Collection of axes or a shape. title : str, optional Title. dtype : data-type, optional Desired data-type for the array, e.g., `numpy.int8`. Default is `numpy.float64`. order : {'C', 'F'}, optional Whether to store multidimensional data in C- (default) or Fortran-contiguous (row- or column-wise) order in memory. Returns ------- LArray Examples -------- >>> zeros('nat=BE,FO;sex=M,F') nat\sex M F BE 0.0 0.0 FO 0.0 0.0 >>> zeros([(['BE', 'FO'], 'nat'), ... (['M', 'F'], 'sex')]) nat\sex M F BE 0.0 0.0 FO 0.0 0.0 >>> nat = Axis('nat=BE,FO') >>> sex = Axis('sex=M,F') >>> zeros([nat, sex]) nat\sex M F BE 0.0 0.0 FO 0.0 0.0 """ axes = AxisCollection(axes) return LArray(np.zeros(axes.shape, dtype, order), axes, title)
[docs]def zeros_like(array, title='', dtype=None, order='K'): """Returns an array with the same axes as array and filled with zeros. Parameters ---------- array : LArray Input array. title : str, optional Title. dtype : data-type, optional Overrides the data type of the result. order : {'C', 'F', 'A', or 'K'}, optional Overrides the memory layout of the result. 'C' means C-order, 'F' means F-order, 'A' means 'F' if `a` is Fortran contiguous, 'C' otherwise. 'K' (default) means match the layout of `a` as closely as possible. Returns ------- LArray Examples -------- >>> a = ndrange((2, 3)) >>> zeros_like(a) {0}*\\{1}* 0 1 2 0 0 0 0 1 0 0 0 """ if not title: title = array.title return LArray(np.zeros_like(array, dtype, order), array.axes, title)
[docs]@_check_axes_argument def ones(axes, title='', dtype=float, order='C'): """Returns an array with the specified axes and filled with ones. Parameters ---------- axes : int, tuple of int, Axis or tuple/list/AxisCollection of Axis Collection of axes or a shape. title : str, optional Title. dtype : data-type, optional Desired data-type for the array, e.g., `numpy.int8`. Default is `numpy.float64`. order : {'C', 'F'}, optional Whether to store multidimensional data in C- (default) or Fortran-contiguous (row- or column-wise) order in memory. Returns ------- LArray Examples -------- >>> nat = Axis('nat=BE,FO') >>> sex = Axis('sex=M,F') >>> ones([nat, sex]) nat\\sex M F BE 1.0 1.0 FO 1.0 1.0 """ axes = AxisCollection(axes) return LArray(np.ones(axes.shape, dtype, order), axes, title)
[docs]def ones_like(array, title='', dtype=None, order='K'): """Returns an array with the same axes as array and filled with ones. Parameters ---------- array : LArray Input array. title : str, optional Title. dtype : data-type, optional Overrides the data type of the result. order : {'C', 'F', 'A', or 'K'}, optional Overrides the memory layout of the result. 'C' means C-order, 'F' means F-order, 'A' means 'F' if `a` is Fortran contiguous, 'C' otherwise. 'K' (default) means match the layout of `a` as closely as possible. Returns ------- LArray Examples -------- >>> a = ndrange((2, 3)) >>> ones_like(a) {0}*\\{1}* 0 1 2 0 1 1 1 1 1 1 1 """ axes = array.axes if not title: title = array.title return LArray(np.ones_like(array, dtype, order), axes, title)
[docs]@_check_axes_argument def empty(axes, title='', dtype=float, order='C'): """Returns an array with the specified axes and uninitialized (arbitrary) data. Parameters ---------- axes : int, tuple of int, Axis or tuple/list/AxisCollection of Axis Collection of axes or a shape. title : str, optional Title. dtype : data-type, optional Desired data-type for the array, e.g., `numpy.int8`. Default is `numpy.float64`. order : {'C', 'F'}, optional Whether to store multidimensional data in C- (default) or Fortran-contiguous (row- or column-wise) order in memory. Returns ------- LArray Examples -------- >>> nat = Axis('nat=BE,FO') >>> sex = Axis('sex=M,F') >>> empty([nat, sex]) # doctest: +SKIP nat\\sex M F BE 2.47311483356e-315 2.47498446195e-315 FO 0.0 6.07684618082e-31 """ axes = AxisCollection(axes) return LArray(np.empty(axes.shape, dtype, order), axes, title)
[docs]def empty_like(array, title='', dtype=None, order='K'): """Returns an array with the same axes as array and uninitialized (arbitrary) data. Parameters ---------- array : LArray Input array. title : str, optional Title. dtype : data-type, optional Overrides the data type of the result. Defaults to the data type of array. order : {'C', 'F', 'A', or 'K'}, optional Overrides the memory layout of the result. 'C' means C-order, 'F' means F-order, 'A' means 'F' if `a` is Fortran contiguous, 'C' otherwise. 'K' (default) means match the layout of `a` as closely as possible. Returns ------- LArray Examples -------- >>> a = ndrange((3, 2)) >>> empty_like(a) # doctest: +SKIP -\- 0 1 0 2.12199579097e-314 6.36598737388e-314 1 1.06099789568e-313 1.48539705397e-313 2 1.90979621226e-313 2.33419537056e-313 """ if not title: title = array.title # cannot use empty() because order == 'K' is not understood return LArray(np.empty_like(array.data, dtype, order), array.axes, title)
# We cannot use @_check_axes_argument here because an integer fill_value would be considered as an error
[docs]def full(axes, fill_value, title='', dtype=None, order='C'): """Returns an array with the specified axes and filled with fill_value. Parameters ---------- axes : int, tuple of int, Axis or tuple/list/AxisCollection of Axis Collection of axes or a shape. fill_value : scalar or LArray Value to fill the array title : str, optional Title. dtype : data-type, optional Desired data-type for the array. Default is the data type of fill_value. order : {'C', 'F'}, optional Whether to store multidimensional data in C- (default) or Fortran-contiguous (row- or column-wise) order in memory. Returns ------- LArray Examples -------- >>> nat = Axis('nat=BE,FO') >>> sex = Axis('sex=M,F') >>> full([nat, sex], 42.0) nat\\sex M F BE 42.0 42.0 FO 42.0 42.0 >>> initial_value = ndrange([sex]) >>> initial_value sex M F 0 1 >>> full([nat, sex], initial_value) nat\\sex M F BE 0 1 FO 0 1 """ if isinstance(fill_value, Axis): raise ValueError("If you want to pass several axes or dimension lengths to full, you must pass them as a " "list (using []) or tuple (using()).") if dtype is None: dtype = np.asarray(fill_value).dtype res = empty(axes, title, dtype, order) res[:] = fill_value return res
[docs]def full_like(array, fill_value, title='', dtype=None, order='K'): """Returns an array with the same axes and type as input array and filled with fill_value. Parameters ---------- array : LArray Input array. fill_value : scalar or LArray Value to fill the array title : str, optional Title. dtype : data-type, optional Overrides the data type of the result. Defaults to the data type of array. order : {'C', 'F', 'A', or 'K'}, optional Overrides the memory layout of the result. 'C' means C-order, 'F' means F-order, 'A' means 'F' if `a` is Fortran contiguous, 'C' otherwise. 'K' (default) means match the layout of `a` as closely as possible. Returns ------- LArray Examples -------- >>> a = ndrange((2, 3)) >>> full_like(a, 5) {0}*\\{1}* 0 1 2 0 5 5 5 1 5 5 5 """ if not title: title = array.title # cannot use full() because order == 'K' is not understood # cannot use np.full_like() because it would not handle LArray fill_value res = empty_like(array, title, dtype, order) res[:] = fill_value return res
# XXX: would it be possible to generalize to multiple axes and deprecate ndrange? # ndrange is only ever used to create test data (except for 1d). See https://github.com/pydata/pandas/issues/4567
[docs]def sequence(axis, initial=0, inc=None, mult=1, func=None, axes=None, title=''): """ Creates an array by sequentially applying modifications to the array along axis. The value for each label in axis will be given by sequentially transforming the value for the previous label. This transformation on the previous label value consists of applying the function "func" on that value if provided, or to multiply it by mult and increment it by inc otherwise. Parameters ---------- axis : axis definition (Axis, str, int) Axis along which to apply mod. An axis definition can be passed as a string. An int will be interpreted as the length for a new anonymous axis. initial : scalar or LArray, optional Value for the first label of axis. Defaults to 0. inc : scalar, LArray, optional Value to increment the previous value by. Defaults to 0 if mult is provided, 1 otherwise. mult : scalar, LArray, optional Value to multiply the previous value by. Defaults to 1. func : function/callable, optional Function to apply to the previous value. Defaults to None. Note that this is much slower than using inc and/or mult. axes : int, tuple of int or tuple/list/AxisCollection of Axis, optional Axes of the result. Defaults to the union of axes present in other arguments. title : str, optional Title. Examples -------- >>> year = Axis('year=2016..2019') >>> sex = Axis('sex=M,F') >>> sequence(year) year 2016 2017 2018 2019 0 1 2 3 >>> sequence('year=2016..2019') year 2016 2017 2018 2019 0 1 2 3 >>> sequence(year, 1.0, 0.5) year 2016 2017 2018 2019 1.0 1.5 2.0 2.5 >>> sequence(year, 1.0, mult=1.5) year 2016 2017 2018 2019 1.0 1.5 2.25 3.375 >>> inc = LArray([1, 2], [sex]) >>> inc sex M F 1 2 >>> sequence(year, 1.0, inc) sex\\year 2016 2017 2018 2019 M 1.0 2.0 3.0 4.0 F 1.0 3.0 5.0 7.0 >>> mult = LArray([2, 3], [sex]) >>> mult sex M F 2 3 >>> sequence(year, 1.0, mult=mult) sex\\year 2016 2017 2018 2019 M 1.0 2.0 4.0 8.0 F 1.0 3.0 9.0 27.0 >>> initial = LArray([3, 4], [sex]) >>> initial sex M F 3 4 >>> sequence(year, initial, 1) sex\\year 2016 2017 2018 2019 M 3 4 5 6 F 4 5 6 7 >>> sequence(year, initial, mult=2) sex\\year 2016 2017 2018 2019 M 3 6 12 24 F 4 8 16 32 >>> sequence(year, initial, inc, mult) sex\\year 2016 2017 2018 2019 M 3 7 15 31 F 4 14 44 134 >>> def modify(prev_value): ... return prev_value / 2 >>> sequence(year, 8, func=modify) year 2016 2017 2018 2019 8 4 2 1 >>> sequence(3) {0}* 0 1 2 0 1 2 >>> sequence(X.year, axes=(sex, year)) sex\\year 2016 2017 2018 2019 M 0 1 2 3 F 0 1 2 3 sequence can be used as the inverse of growth_rate: >>> a = LArray([1.0, 2.0, 3.0, 3.0], year) >>> a year 2016 2017 2018 2019 1.0 2.0 3.0 3.0 >>> g = a.growth_rate() + 1 >>> g year 2017 2018 2019 2.0 1.5 1.0 >>> sequence(year, a[2016], mult=g) year 2016 2017 2018 2019 1.0 2.0 3.0 3.0 """ if inc is None: inc = 1 if mult is 1 else 0 if axes is None: if not isinstance(axis, Axis): axis = _make_axis(axis) def strip_axes(col): return get_axes(col) - axis # we need to remove axis if present, because it might be incompatible axes = strip_axes(initial) | strip_axes(inc) | strip_axes(mult) | axis else: axes = AxisCollection(axes) axis = axes[axis] res_dtype = np.dtype(common_type((initial, inc, mult))) res = empty(axes, title=title, dtype=res_dtype) res[axis.i[0]] = initial def has_axis(a, axis): return isinstance(a, LArray) and axis in a.axes if func is not None: for i in range(1, len(axis)): res[axis.i[i]] = func(res[axis.i[i - 1]]) elif has_axis(inc, axis) and has_axis(mult, axis): # This case is more complicated to vectorize. It seems # doable (probably by adding a fictive axis), but let us wait until # someone requests it. The trick is to be able to write this: # a[i] = initial * prod(mult[j]) + inc[1] * prod(mult[j]) + ... # j=1..i j=2..i # + inc[i-2] * prod(mult[j]) + inc[i-1] * mult[i] + inc[i] # j=i-1..i # a[0] = initial # a[1] = initial * mult[1] # + inc[1] # a[2] = initial * mult[1] * mult[2] # + inc[1] * mult[2] # + inc[2] # a[3] = initial * mult[1] * mult[2] * mult[3] # + inc[1] * mult[2] * mult[3] # + inc[2] * mult[3] # + inc[3] # a[4] = initial * mult[1] * mult[2] * mult[3] * mult[4] # + inc[1] * mult[2] * mult[3] * mult[4] # + inc[2] * mult[3] * mult[4] # + inc[3] * mult[4] # + inc[4] # a[1:] = initial * cumprod(mult[1:]) + ... def index_if_exists(a, axis, i): if isinstance(a, LArray) and axis in a.axes: a_axis = a.axes[axis] return a[a_axis[axis.labels[i]]] else: return a for i in range(1, len(axis)): i_mult = index_if_exists(mult, axis, i) i_inc = index_if_exists(inc, axis, i) res[axis.i[i]] = res[axis.i[i - 1]] * i_mult + i_inc else: # TODO: use cumprod and cumsum to avoid the explicit loop # it is easy for constant inc OR constant mult. # it is easy for array inc OR array mult. # it is a bit more complicated for constant inc AND constant mult # # it gets hairy for array inc AND array mult. It seems doable but let us wait until someone requests it. def array_or_full(a, axis, initial): dt = common_type((a, initial)) r = empty((get_axes(a) - axis) | axis, title=title, dtype=dt) r[axis.i[0]] = initial if isinstance(a, LArray) and axis in a.axes: # not using axis.i[1:] because a could have less ticks # on axis than axis r[axis.i[1:]] = a[axis[axis.labels[1]:]] else: r[axis.i[1:]] = a return r if isinstance(initial, LArray) and np.isscalar(inc): inc = full_like(initial, inc) # inc only (integer scalar) if np.isscalar(mult) and mult == 1 and np.isscalar(inc) and res_dtype.kind == 'i': # stop is not included stop = initial + inc * len(axis) data = np.arange(initial, stop, inc) res[:] = LArray(data, axis) # inc only (other scalar) elif np.isscalar(mult) and mult == 1 and np.isscalar(inc): # stop is included stop = initial + inc * (len(axis) - 1) data = np.linspace(initial, stop=stop, num=len(axis)) res[:] = LArray(data, axis) # inc only (array) elif np.isscalar(mult) and mult == 1: inc_array = array_or_full(inc, axis, initial) res[axis.i[1:]] = inc_array.cumsum(axis)[axis.i[1:]] # mult only (scalar or array) elif np.isscalar(inc) and inc == 0: mult_array = array_or_full(mult, axis, initial) res[axis.i[1:]] = mult_array.cumprod(axis)[axis.i[1:]] # both inc and mult defined but scalars or axis not present else: mult_array = array_or_full(mult, axis, 1.0) cum_mult = mult_array.cumprod(axis)[axis.i[1:]] res[axis.i[1:]] = ((1 - cum_mult) / (1 - mult)) * inc + initial * cum_mult return res
create_sequential = renamed_to(sequence, 'create_sequential')
[docs]@_check_axes_argument def ndrange(axes, start=0, title='', dtype=int): """Returns an array with the specified axes and filled with increasing int. Parameters ---------- axes : single axis or tuple/list/AxisCollection of axes Axes of the array to create. Each axis can be given as either: * Axis object: actual axis object to use. * single int: length of axis. will create a wildcard axis of that length. * str: coma separated list of labels, with optional leading '=' to set the name of the axis. eg. "a,b,c" or "sex=F,M" * (labels, name) pair: name and labels of axis start : number, optional title : str, optional Title. dtype : dtype, optional The type of the output array. Defaults to int. Returns ------- LArray Examples -------- >>> nat = Axis('nat=BE,FO') >>> sex = Axis('sex=M,F') >>> ndrange([nat, sex]) nat\\sex M F BE 0 1 FO 2 3 >>> ndrange(['nat=BE,FO', 'sex=M,F']) nat\\sex M F BE 0 1 FO 2 3 >>> ndrange([(['BE', 'FO'], 'nat'), ... (['M', 'F'], 'sex')]) nat\\sex M F BE 0 1 FO 2 3 >>> ndrange([('BE,FO', 'nat'), ... ('M,F', 'sex')]) nat\\sex M F BE 0 1 FO 2 3 >>> ndrange('nat=BE,FO;sex=M,F') nat\\sex M F BE 0 1 FO 2 3 >>> ndrange([2, 3], dtype=float) {0}*\\{1}* 0 1 2 0 0.0 1.0 2.0 1 3.0 4.0 5.0 >>> ndrange(3, start=2) {0}* 0 1 2 2 3 4 >>> ndrange('a,b,c') {0} a b c 0 1 2 """ # XXX: implement something like: # >>> mat = ndrange([['BE', 'FO'], ['M', 'F']], axes=['nat', 'sex']) # >>> mat = ndrange(['BE,FO', 'M,F'], axes=['nat', 'sex']) # XXX: try to come up with a syntax where start is before "end". For ndim # > 1, I cannot think of anything nice. axes = AxisCollection(axes) data = np.arange(start, start + axes.size, dtype=dtype) return LArray(data.reshape(axes.shape), axes, title)
[docs]@_check_axes_argument def ndtest(shape, start=0, label_start=0, title='', dtype=int): """Returns test array with given shape. Axes are named by single letters starting from 'a'. Axes labels are constructed using a '{axis_name}{label_pos}' pattern (e.g. 'a0'). Values start from `start` increase by steps of 1. Parameters ---------- shape : int, tuple/list of int Shape of the array to create. An int can be used directly for one dimensional arrays. start : int or float, optional Start value label_start : int, optional Label index for each axis is `label_start + position`. `label_start` defaults to 0. title : str, optional Title. dtype : type or np.dtype, optional Type of resulting array. Returns ------- LArray Examples -------- >>> ndtest(6) a a0 a1 a2 a3 a4 a5 0 1 2 3 4 5 >>> ndtest((2, 3)) a\\b b0 b1 b2 a0 0 1 2 a1 3 4 5 >>> ndtest((2, 3), label_start=1) a\\b b1 b2 b3 a1 0 1 2 a2 3 4 5 """ a = ndrange(shape, start=start, dtype=dtype, title=title) # TODO: move this to a class method on AxisCollection assert a.ndim <= 26 axes_names = [chr(ord('a') + i) for i in range(a.ndim)] label_ranges = [range(label_start, label_start + length) for length in a.shape] new_axes = [Axis([name + str(i) for i in label_range], name) for name, label_range in zip(axes_names, label_ranges)] return a.set_axes(new_axes)
def kth_diag_indices(shape, k): indices = np.diag_indices(min(shape), ndim=len(shape)) if len(shape) == 2 and k != 0: rows, cols = indices if k < 0: return rows[-k:], cols[:k] elif k > 0: return rows[:-k], cols[k:] elif k != 0: raise NotImplementedError("k != 0 and len(axes) != 2") else: return indices
[docs]def diag(a, k=0, axes=(0, 1), ndim=2, split=True): """ Extracts a diagonal or construct a diagonal array. Parameters ---------- a : LArray If `a` has 2 dimensions or more, return a copy of its `k`-th diagonal. If `a` has 1 dimension, return an array with `ndim` dimensions on the `k`-th diagonal. k : int, optional Offset of the diagonal from the main diagonal. Can be positive or negative. Defaults to main diagonal (0). axes : tuple or list or AxisCollection of axes references, optional Axes along which the diagonals should be taken. Use None for all axes. Defaults to the first two axes (0, 1). ndim : int, optional Target number of dimensions when constructing a diagonal array from an array without axes names/labels. Defaults to 2. split : bool, optional Whether or not to try to split the axis name and labels Returns ------- LArray The extracted diagonal or constructed diagonal array. Examples -------- >>> nat = Axis('nat=BE,FO') >>> sex = Axis('sex=M,F') >>> a = ndrange([nat, sex], start=1) >>> a nat\\sex M F BE 1 2 FO 3 4 >>> d = diag(a) >>> d nat_sex BE_M FO_F 1 4 >>> diag(d) nat\\sex M F BE 1 0 FO 0 4 >>> a = ndrange(sex, start=1) >>> a sex M F 1 2 >>> diag(a) sex\\sex M F M 1 0 F 0 2 """ if a.ndim == 1: axis = a.axes[0] axis_name = axis.name if k != 0: raise NotImplementedError("k != 0 not supported for 1D arrays") if split and isinstance(axis_name, str) and '_' in axis_name: axes_names = axis_name.split('_') axes_labels = list(zip(*np.char.split(axis.labels, '_'))) axes = [Axis(labels, name) for labels, name in zip(axes_labels, axes_names)] else: axes = [axis] + [axis.copy() for _ in range(ndim - 1)] res = zeros(axes, dtype=a.dtype) diag_indices = kth_diag_indices(res.shape, k) res.ipoints[diag_indices] = a return res else: if k != 0 and len(axes) > 2: raise NotImplementedError("k != 0 and len(axes) > 2") if axes is None: axes = a.axes else: axes = a.axes[axes] axes_indices = kth_diag_indices(axes.shape, k) indexer = tuple(axis.i[indices] for axis, indices in zip(axes, axes_indices)) return a.points[indexer]
[docs]@_check_axes_argument def labels_array(axes, title=''): """Returns an array with specified axes and the combination of corresponding labels as values. Parameters ---------- axes : Axis or collection of Axis title : str, optional Title. Returns ------- LArray Examples -------- >>> nat = Axis('nat=BE,FO') >>> sex = Axis('sex=M,F') >>> labels_array(sex) sex M F M F >>> labels_array((nat, sex)) nat sex\\axis nat sex BE M BE M BE F BE F FO M FO M FO F FO F """ # >>> labels_array((nat, sex)) # nat\\sex M F # BE BE,M BE,F # FO FO,M FO,F axes = AxisCollection(axes) if len(axes) > 1: res_axes = axes + Axis(axes.names, 'axis') res_data = np.empty(res_axes.shape, dtype=object) res_data.flat[:] = list(product(*axes.labels)) # XXX: I wonder if it wouldn't be better to return LGroups or a similar object which would display as "a,b" but # where each label is stored separately. # flat_data = np.array([p for p in product(*axes.labels)]) # res_data = flat_data.reshape(axes.shape) else: res_axes = axes res_data = axes[0].labels return LArray(res_data, res_axes, title)
[docs]def identity(axis): raise NotImplementedError("identity(axis) is deprecated. In most cases, you can now use the axis directly. " "For example, 'identity(age) < 10' can be replaced by 'age < 10'. " "In other cases, you should use labels_array(axis) instead.")
[docs]def eye(rows, columns=None, k=0, title='', dtype=None): """Returns a 2-D array with ones on the diagonal and zeros elsewhere. Parameters ---------- rows : int or Axis Rows of the output. columns : int or Axis, optional Columns of the output. If None, defaults to rows. k : int, optional Index of the diagonal: 0 (the default) refers to the main diagonal, a positive value refers to an upper diagonal, and a negative value to a lower diagonal. title : str, optional Title. dtype : data-type, optional Data-type of the returned array. Defaults to float. Returns ------- LArray of shape (rows, columns) An array where all elements are equal to zero, except for the k-th diagonal, whose values are equal to one. Examples -------- >>> eye(2, dtype=int) {0}*\\{1}* 0 1 0 1 0 1 0 1 >>> sex = Axis('sex=M,F') >>> eye(sex) sex\\sex M F M 1.0 0.0 F 0.0 1.0 >>> age = Axis('age=0..2') >>> eye(age, sex) age\\sex M F 0 1.0 0.0 1 0.0 1.0 2 0.0 0.0 >>> eye(3, k=1) {0}*\\{1}* 0 1 2 0 0.0 1.0 0.0 1 0.0 0.0 1.0 2 0.0 0.0 0.0 """ if columns is None: columns = rows.copy() if isinstance(rows, Axis) else rows axes = AxisCollection([rows, columns]) shape = axes.shape data = np.eye(shape[0], shape[1], k, dtype) return LArray(data, axes, title)
# XXX: we could change the syntax to use *args # => less punctuation but forces kwarg # => potentially longer # => unsure for now. The most important point is that it should be consistent with other functions. # stack(a1, a2, axis=Axis('M,F', 'sex')) # stack(('M', a1), ('F', a2), axis='sex') # stack(a1, a2, axis='sex') # on Python 3.6, we could do something like (it would make from_lists obsolete for 1D arrays): # stack('sex', M=1, F=2) # which is almost equivalent to: # stack(M=1, F=2, axis='sex') # but we cannot support the current syntax unmodified AND the first version, but second version we could. # we would only have to explain that they cannot do: # stack(0=1, 1=2, axis='age') # stack(0A=1, 1B=2, axis='code') # but should use this instead: # stack({0: 1, 1: 2}, 'age=0,1') # stack({'0A': 1, '1B': 2}, 'code=0A,1B') # stack({0: 1, 1: 2}, age) # stack({'0A': 1, '1B': 2}, code) # or this, if we decide to support *args instead: # stack((0, 1), (1, 2), axis='age') # stack(('0A', 1), ('1B', 2), axis='code') # stack(M=1, F=2, axis='sex') # is much nicer than: # from_lists(['sex', 'M', 'F'], # [ '', 1, 2]) # for 2D arrays, from_lists and stack would be mostly as ugly and for 3D+ from_lists stays nicer even though I still do # not like it much. # stack('nationality', # BE=stack('sex', M=0, F=1), # FR=stack('sex', M=2, F=3), # DE=stack('sex', M=4, F=5)) # # from_lists([['nationality\\sex', 'M', 'F'], # [ 'BE', 0, 1], # [ 'FR', 2, 3], # [ 'DE', 4, 5]]) # SUPER SLOPPY (I hate this, but I bet users would like it): # stack(BE_M=0, BE_F=1, # FR_M=2, FR_F=3, # DE_M=4, DE_F=5, axis='nationality_sex') # stack(('nationality', 'sex'), { # ('BE', 'M'): 0, ('BE', 'F'): 1, # ('FR', 'M'): 2, ('FR', 'F'): 3, # ('DE', 'M'): 4, ('DE', 'F'): 5})
[docs]def stack(elements=None, axis=None, title='', **kwargs): """ Combines several arrays or sessions along an axis. Parameters ---------- elements : tuple, list or dict. Elements to stack. Elements can be scalars, arrays, sessions, (label, value) pairs or a {label: value} mapping. In the later case, axis must be defined and cannot be a name only, because we need to have labels order, which the mapping does not provide. Stacking sessions will return a new session containing the arrays of all sessions stacked together. An array missing in a session will be replaced by NaN. axis : str or Axis, optional Axis to create. If None, defaults to a range() axis. title : str, optional Title. Returns ------- LArray A single array combining arrays. Examples -------- >>> nat = Axis('nat=BE,FO') >>> sex = Axis('sex=M,F') >>> arr1 = ones(nat) >>> arr1 nat BE FO 1.0 1.0 >>> arr2 = zeros(nat) >>> arr2 nat BE FO 0.0 0.0 In the case the axis to create has already been defined in a variable >>> stack({'M': arr1, 'F': arr2}, sex) nat\\sex M F BE 1.0 0.0 FO 1.0 0.0 Otherwise (when one wants to create an axis from scratch), any of these syntaxes works: >>> stack([arr1, arr2], 'sex=M,F') nat\\sex M F BE 1.0 0.0 FO 1.0 0.0 >>> stack({'M': arr1, 'F': arr2}, 'sex=M,F') nat\\sex M F BE 1.0 0.0 FO 1.0 0.0 >>> stack([('M', arr1), ('F', arr2)], 'sex') nat\\sex M F BE 1.0 0.0 FO 1.0 0.0 When stacking arrays with different axes, the result has the union of all axes present: >>> stack({'M': arr1, 'F': 0}, sex) nat\\sex M F BE 1.0 0.0 FO 1.0 0.0 Creating an axis without name nor labels can be done using: >>> stack((arr1, arr2)) nat\\{1}* 0 1 BE 1.0 0.0 FO 1.0 0.0 When labels are "simple" strings (ie no integers, no string starting with integers, etc.), using keyword arguments can be an attractive alternative. >>> stack(F=arr2, M=arr1, axis=sex) nat\\sex M F BE 1.0 0.0 FO 1.0 0.0 Without passing an explicit order for labels (or an axis object like above), it should only be used on Python 3.6 or later because keyword arguments are NOT ordered on earlier Python versions. >>> # use this only on Python 3.6 and later >>> stack(M=arr1, F=arr2, axis='sex') # doctest: +SKIP nat\\sex M F BE 1.0 0.0 FO 1.0 0.0 To stack sessions, let us first create two test sessions. For example suppose we have a session storing the results of a baseline simulation: >>> from larray import Session >>> baseline = Session([('arr1', arr1), ('arr2', arr2)]) and another session with a variant (here we simply added 0.5 to each array) >>> variant = Session([('arr1', arr1 + 0.5), ('arr2', arr2 + 0.5)]) then we stack them together >>> stacked = stack([('baseline', baseline), ('variant', variant)], 'sessions') >>> stacked Session(arr1, arr2) >>> stacked.arr1 nat\sessions baseline variant BE 1.0 1.5 FO 1.0 1.5 >>> stacked.arr2 nat\sessions baseline variant BE 0.0 0.5 FO 0.0 0.5 """ from larray import Session if isinstance(axis, str) and '=' in axis: axis = Axis(axis) if elements is None: if not isinstance(axis, Axis) and sys.version_info[:2] < (3, 6): raise TypeError("axis argument should provide label order when using keyword arguments on Python < 3.6") elements = kwargs.items() elif kwargs: raise TypeError("stack() accept either keyword arguments OR a collection of elements, not both") if isinstance(axis, Axis) and all(isinstance(e, tuple) for e in elements): assert all(len(e) == 2 for e in elements) elements = {k: v for k, v in elements} if isinstance(elements, LArray): if axis is None: axis = -1 axis = elements.axes[axis] values = [elements[k] for k in axis] elif isinstance(elements, dict): assert isinstance(axis, Axis) values = [elements[v] for v in axis.labels] elif isinstance(elements, Iterable): if not isinstance(elements, Sequence): elements = list(elements) if all(isinstance(e, tuple) for e in elements): assert all(len(e) == 2 for e in elements) keys = [k for k, v in elements] values = [v for k, v in elements] assert all(np.isscalar(k) for k in keys) # this case should already be handled assert not isinstance(axis, Axis) # axis should be None or str axis = Axis(keys, axis) else: values = elements if axis is None or isinstance(axis, basestring): axis = Axis(len(elements), axis) else: assert len(axis) == len(elements) else: raise TypeError('unsupported type for arrays: %s' % type(elements).__name__) if any(isinstance(v, Session) for v in values): sessions = values if not all(isinstance(s, Session) for s in sessions): raise TypeError("stack() only supports stacking Session with other Session objects") seen = set() all_keys = [] for s in sessions: unique_list(s.keys(), all_keys, seen) res = [] for name in all_keys: try: stacked = stack([s.get(name, np.nan) for s in sessions], axis=axis) # TypeError for str arrays, ValueError for incompatible axes, ... except Exception: stacked = np.nan res.append((name, stacked)) return Session(res) else: # XXX : use concat? result_axes = AxisCollection.union(*[get_axes(v) for v in values]) result_axes.append(axis) result = empty(result_axes, title=title, dtype=common_type(values)) for k, v in zip(axis, values): result[k] = v return result
def get_axes(value): return value.axes if isinstance(value, LArray) else AxisCollection([]) def _strip_shape(shape): return tuple(s for s in shape if s != 1) def _equal_modulo_len1(shape1, shape2): return _strip_shape(shape1) == _strip_shape(shape2) # assigning a temporary name to anonymous axes before broadcasting and removing it afterwards is not a good idea after # all because it copies the axes/change the object, and thus "flatten" wouldn't work with index axes: # a[ones(a.axes[axes], dtype=bool)] # but if we had assigned axes names from the start (without dropping them) this wouldn't be a problem. def make_numpy_broadcastable(values): """ Returns values where LArrays are (NumPy) broadcastable between them. For that to be possible, all common axes must be compatible (see Axis class documentation). Extra axes (in any array) can have any length. * the resulting arrays will have the combination of all axes found in the input arrays, the earlier arrays defining the order of axes. Axes with labels take priority over wildcard axes. * length 1 wildcard axes will be added for axes not present in input Parameters ---------- values : iterable of arrays Arrays that requires to be (NumPy) broadcastable between them. Returns ------- list of arrays List of arrays broadcastable between them. Arrays will have the combination of all axes found in the input arrays, the earlier arrays defining the order of axes. AxisCollection Collection of axes of all input arrays. See Also -------- Axis.iscompatible : tests if axes are compatible between them. """ all_axes = AxisCollection.union(*[get_axes(v) for v in values]) return [v.broadcast_with(all_axes) if isinstance(v, LArray) else v for v in values], all_axes _default_float_error_handler = float_error_handler_factory(3) original_float_error_settings = np.seterr(divide='call', invalid='call') original_float_error_handler = np.seterrcall(_default_float_error_handler) # excel IO tools in Python # - openpyxl: the slowest but most-complete package but still lags behind PHPExcel from which it was ported. despite # the drawbacks the API is very complete. # biggest drawbacks: # * you can get either the "cached" value of cells OR their formulas but NOT BOTH and this is a file-wide setting # (data_only=True). if you have an excel file and want to add a sheet to it, you either loose all cached values # (which is problematic in many cases since you do not necessarily have linked files) or loose all formulas. # * it loose "charts" on read. => cannot append/update a sheet to a file with charts, which is precisely what many # users asked. => users need to create their charts using code. # - xlsxwriter: faster and slightly more feature-complete than openpyxl regarding writing but does not read anything # => cannot update an existing file. API seems extremely complete. # - pyexcelerate: yet faster but also write only. Didn't check whether API is more featured than xlsxwriter or not. # - xlwings: wraps win32com & equivalent on mac, so can potentially do everything (I guess) but this is SLOW and needs # a running excel instance, etc.