# Source code for larray.core.group

# -*- coding: utf8 -*-
from __future__ import absolute_import, division, print_function

import re
import sys
import warnings
from itertools import product, chain

import numpy as np
import pandas as pd

from larray.core.abstractbases import ABCAxis, ABCAxisReference, ABCLArray
from larray.util.oset import *
from larray.util.misc import basestring, PY2, unique, find_closing_chr, _parse_bound, _seq_summary, renamed_to

__all__ = ['Group', 'LGroup', 'LSet', 'IGroup', 'union']


def _slice_to_str(key, repr_func=str):
    """
    Builds the textual form ``start:stop[:step]`` of a slice.

    Bounds which are None are rendered as empty strings. The step (preceded by a colon) is only included when it
    is not None.

    Parameters
    ----------
    key : slice
        slice to convert.
    repr_func : callable, optional
        function used to convert each bound which is not None to a string. Defaults to str.

    Returns
    -------
    str

    Examples
    --------
    >>> _slice_to_str(slice(None))
    ':'
    >>> _slice_to_str(slice(24))
    ':24'
    >>> _slice_to_str(slice(25, None))
    '25:'
    >>> _slice_to_str(slice(5, 10))
    '5:10'
    >>> _slice_to_str(slice(None, 5, 2))
    ':5:2'
    """
    def bound_to_str(bound):
        # missing bounds are simply left out
        return '' if bound is None else repr_func(bound)

    start_str = bound_to_str(key.start)
    stop_str = bound_to_str(key.stop)
    if key.step is None:
        step_str = ''
    else:
        step_str = ":" + repr_func(key.step)
    return '%s:%s%s' % (start_str, stop_str, step_str)


def irange(start, stop, step=None):
    """Create a range going from start to stop (both included), counting down when stop is before start.

    Parameters
    ----------
    start : int
        Start bound
    stop : int
        Inclusive stop bound
    step : int, optional
        Distance between two generated numbers. If provided this *must* be a positive integer (the sign is
        chosen automatically from the order of the bounds). Defaults to 1.

    Returns
    -------
    range

    Raises
    ------
    ValueError
        If step is provided and is not strictly positive.

    Examples
    --------
    >>> list(irange(1, 3))
    [1, 2, 3]
    >>> list(irange(2, 0))
    [2, 1, 0]
    >>> list(irange(1, 6, 2))
    [1, 3, 5]
    >>> list(irange(6, 1, 2))
    [6, 4, 2]
    >>> list(irange(-1, 1))
    [-1, 0, 1]
    """
    if step is None:
        step = 1
    elif step <= 0:
        raise ValueError("step must be a positive integer or None")

    if start <= stop:
        # ascending: push the exclusive bound of range one past the inclusive stop
        return range(start, stop + 1, step)
    # descending: same idea in the other direction, with a negated step
    return range(start, stop - 1, -step)


# splits a bound into its runs of digits and runs of letters (kept in the result because of the capturing
# group); whatever is between those runs (special characters, possibly empty) is kept as separate parts
_range_bound_pattern = re.compile('([0-9]+|[a-zA-Z]+)')


def generalized_range(start, stop, step=1):
    """Create a range, with inclusive stop bound and automatic sign for step. Bounds can be strings.

    Parameters
    ----------
    start : int or str
        Start bound
    stop : int or str
        Inclusive stop bound
    step : int, optional
        Distance between two generated numbers. If provided this *must* be a positive integer.

    Returns
    -------
    range
        range for integer bounds, list of str for string bounds.

    Raises
    ------
    TypeError
        If start is a string but stop is not.
    ValueError
        If string bounds do not have the same structure (different number of parts, digits vs letters, letter
        parts of different lengths, different special characters or inconsistent zero padding).

    Examples
    --------
    works with both number and letter bounds

    >>> list(generalized_range(-1, 2))
    [-1, 0, 1, 2]
    >>> generalized_range('a', 'c')
    ['a', 'b', 'c']

    can generate in reverse

    >>> list(generalized_range(2, 0))
    [2, 1, 0]
    >>> generalized_range('c', 'a')
    ['c', 'b', 'a']

    can combine letters and numbers

    >>> generalized_range('a0', 'c1')
    ['a0', 'a1', 'b0', 'b1', 'c0', 'c1']

    any special character is left intact

    >>> generalized_range('a_0', 'c_1')
    ['a_0', 'a_1', 'b_0', 'b_1', 'c_0', 'c_1']

    consecutive digits are treated like numbers

    >>> generalized_range('A8', 'A10')
    ['A8', 'A9', 'A10']

    one may use zero padding on numbers

    >>> generalized_range('A08', 'A10')
    ['A08', 'A09', 'A10']

    consecutive letters create all combinations

    >>> generalized_range('AA', 'CC')
    ['AA', 'AB', 'AC', 'BA', 'BB', 'BC', 'CA', 'CB', 'CC']

    one cannot go from a integer to a letter and vice versa

    >>> generalized_range('1', 'F')
    Traceback (most recent call last):
    ...
    ValueError: expected an integer for the stop bound (because the start bound is an integer) but got 'F' instead

    when using special characters, they must be the same on both sides

    >>> generalized_range('a|1', 'a/2')
    Traceback (most recent call last):
    ...
    ValueError: Special characters must be the same for start and stop
    """
    if not isinstance(start, str):
        return irange(start, stop, step)

    # explicit exceptions (instead of assert statements) so that invalid bounds are also rejected when Python
    # runs with assertions disabled (-O)
    if not isinstance(stop, str):
        raise TypeError("expected a string for the stop bound (because the start bound is a string) "
                        "but got %r instead" % (stop,))
    start_parts = _range_bound_pattern.split(start)
    stop_parts = _range_bound_pattern.split(stop)
    if len(start_parts) != len(stop_parts):
        raise ValueError("start and stop bounds must be made of the same number of letter, digit and special "
                         "character parts (got %r and %r)" % (start, stop))
    ranges = []
    for start_part, stop_part in zip(start_parts, stop_parts):
        # we only handle non-negative int-like strings on purpose. Int-only bounds should already be converted to
        # real integers by now, and mixing negative int-like strings and letters yields some strange results.
        if start_part.isdigit():
            if not stop_part.isdigit():
                raise ValueError("expected an integer for the stop bound (because the start bound is an integer) "
                                 "but got '%s' instead" % stop_part)
            rng = irange(int(start_part), int(stop_part))
            start_pad = len(start_part) if start_part.startswith('0') else None
            stop_pad = len(stop_part) if stop_part.startswith('0') else None
            if start_pad is not None and stop_pad is not None and start_pad != stop_pad:
                raise ValueError("Inconsistent zero padding for start and stop ({} vs {}) of the numerical part. "
                                 "Must be either the same on both sides or no padding on either side"
                                 .format(start_pad, stop_pad))
            elif start_pad is None and stop_pad is None:
                r = [str(num) for num in rng]
            else:
                pad = start_pad if stop_pad is None else stop_pad
                r = ['%0*d' % (pad, num) for num in rng]
        elif start_part.isalpha():
            if not stop_part.isalpha():
                raise ValueError("expected letters for the stop bound (because the start bound is made of "
                                 "letters) but got '%s' instead" % stop_part)
            # letters are ranged position by position, so both parts need the same number of letters
            # (zip would otherwise silently drop the extra letters of the longest part)
            if len(start_part) != len(stop_part):
                raise ValueError("letter parts of the start and stop bounds must have the same length "
                                 "(got '%s' and '%s')" % (start_part, stop_part))
            int_start = [ord(c) for c in start_part]
            int_stop = [ord(c) for c in stop_part]
            # isalnum() skips the non alphanumeric characters located between 'Z' and 'a'
            sranges = [[chr(c) for c in irange(r_start, r_stop) if chr(c).isalnum()]
                       for r_start, r_stop in zip(int_start, int_stop)]
            r = [''.join(p) for p in product(*sranges)]
        else:
            # special characters
            if start_part != stop_part:
                raise ValueError("Special characters must be the same for start and stop")
            r = [start_part]
        ranges.append(r)
    # all combinations of the values generated for each part
    res = [''.join(p) for p in product(*ranges)]
    return res if step == 1 else res[::step]


# "[start] .. [stop] [step N]". A raw string is used so that the regular expression escapes are not
# interpreted as (invalid) string escape sequences.
_range_str_pattern = re.compile(r'(?P<start>[^\s.]+)?\s*\.\.\s*(?P<stop>[^\s.]+)?(\s+step\s+(?P<step>\d+))?')


def _range_str_to_range(s, stack_depth=1):
    """
    Converts a range string to a range (of values).
    The end point is included.

    Parameters
    ----------
    s : str
        String representing a range of values
    stack_depth : int, optional
        Value forwarded (incremented by one) to _parse_bound when parsing the bounds. Defaults to 1.

    Returns
    -------
    range
        range of int or list of str.

    Raises
    ------
    ValueError
        If s is not a valid range string or has no stop bound.

    Examples
    --------
    >>> list(_range_str_to_range('-1..2'))
    [-1, 0, 1, 2]
    >>> _range_str_to_range('a..c')
    ['a', 'b', 'c']
    >>> list(_range_str_to_range('2..6 step 2'))
    [2, 4, 6]

    any special character except . and spaces should work
    >>> _range_str_to_range('a|+*@-b .. a|+*@-d')
    ['a|+*@-b', 'a|+*@-c', 'a|+*@-d']
    """
    s = s.strip()
    m = _range_str_pattern.match(s)
    if m is None:
        # fail with a clear message instead of an AttributeError on None
        raise ValueError("invalid range string: %r" % s)

    groups = m.groupdict()
    start, stop, step = groups['start'], groups['stop'], groups['step']
    if stop is None:
        raise ValueError("no stop bound provided in range: %r" % s)
    # a missing start bound means "start from 0"
    start = _parse_bound(start, stack_depth + 1) if start is not None else 0
    stop = _parse_bound(stop, stack_depth + 1)
    # TODO: use parse_bound
    step = int(step) if step is not None else 1
    return generalized_range(start, stop, step)


def _range_to_slice(seq, length=None):
    """
    Returns a slice if possible (including for sequences of 1 element) otherwise returns the input sequence itself

    Parameters
    ----------
    seq : sequence-like of int
        List, tuple or ndarray of integers representing the range.
        It should be something like [start, start+step, start+2*step, ...]
    length : int, optional
        length of sequence of positions. This is only useful when you must be able to transform decreasing sequences
        which can stop at 0.

    Returns
    -------
    slice or sequence-like
        return the input sequence if a slice cannot be defined. This is the case when the positions are not
        evenly spaced, are repeated (a slice cannot have a step of 0), mix negative and non-negative positions
        (a slice cannot wrap around) or are decreasing down to (or close to) position 0 while length is not given.

    Examples
    --------
    >>> _range_to_slice([3, 4, 5])
    slice(3, 6, None)
    >>> _range_to_slice([3, 5, 7])
    slice(3, 9, 2)
    >>> _range_to_slice([-3, -2])
    slice(-3, -1, None)
    >>> _range_to_slice([-1, -2])
    slice(-1, -3, -1)
    >>> _range_to_slice([2, 1])
    slice(2, 0, -1)
    >>> _range_to_slice([1, 0], 4)
    slice(-3, -5, -1)
    >>> _range_to_slice([1, 0])
    [1, 0]
    >>> _range_to_slice([3, 1])
    [3, 1]
    >>> _range_to_slice([3, 3])
    [3, 3]
    >>> _range_to_slice([-2, -1])
    slice(-2, None, None)
    >>> _range_to_slice([1])
    slice(1, 2, None)
    >>> _range_to_slice([])
    []
    """
    if len(seq) < 1:
        return seq
    start = seq[0]
    if len(seq) == 1:
        stop = start + 1
        # a stop of 0 would select nothing for position -1, so the slice is left open-ended in that case
        return slice(start, stop if stop != 0 else None)
    second = seq[1]
    step = second - start
    if step == 0:
        # repeated positions cannot be expressed as a slice (a slice step cannot be 0)
        return seq
    prev_value = second
    for value in seq[2:]:
        if value != prev_value + step:
            return seq
        prev_value = value
    if (start < 0) != (prev_value < 0):
        # negative positions count from the end, so a sequence crossing 0 would need a slice which wraps around
        return seq
    stop = prev_value + step
    if step < 0 and prev_value >= 0 and stop < 0:
        # decreasing non-negative positions: the exclusive stop falls below 0, where it would be interpreted as
        # a position relative to the end. This can only be expressed by shifting both bounds by the length.
        if length is None:
            return seq
        else:
            stop -= length
            start -= length
    elif step > 0 and prev_value < 0 and stop >= 0:
        # increasing negative positions up to the end: a non-negative stop would be interpreted as a position
        # relative to the start (selecting nothing), so the slice is left open-ended instead
        stop = None
    if step == 1:
        step = None
    return slice(start, stop, step)


def _is_object_array(array):
    """Returns True if array is a numpy ndarray with an object dtype, False otherwise."""
    if not isinstance(array, np.ndarray):
        return False
    return array.dtype.type == np.object_


def _can_have_groups(seq):
    """Returns whether seq is a kind of container which may hold Group objects (object array, tuple or list)."""
    if _is_object_array(seq):
        return True
    return isinstance(seq, (tuple, list))


def _contain_group_ticks(ticks):
    """Returns whether ticks is a container which may hold groups and at least one of its elements is a Group."""
    if not _can_have_groups(ticks):
        return False
    for tick in ticks:
        if isinstance(tick, Group):
            return True
    return False


def _seq_group_to_name(seq):
    """
    Replaces each Group of a sequence by its name.

    Sequences which cannot hold groups (anything but object arrays, tuples and lists) are returned as is.
    Otherwise a new list is returned, where elements which are not groups are left untouched.
    """
    if not _can_have_groups(seq):
        return seq
    names = []
    for value in seq:
        names.append(value.name if isinstance(value, Group) else value)
    return names


def _to_tick(v):
    """
    Converts any value to a tick (ie makes it hashable, and acceptable as an ndarray element)

    scalar -> not modified
    Group with name -> v.name
    Group without name -> _to_tick(v.to_label())
    slice -> 'start:stop'
    list|tuple of one element -> str(v) followed by a comma
    other list|tuple -> 'v1,v2,v3'
    other -> str(v)

    Parameters
    ----------
    v : any
        value to be converted.

    Returns
    -------
    any scalar
        scalar representing the tick
    """
    # the fact that an "aggregated tick" is passed as a LGroup or as a string should be as irrelevant as possible.
    # The thing is that we cannot (currently) use the more elegant _to_tick(e.key) that means the LGroup is not
    # available in Axis.__init__ after to_ticks, and we need it to update the mapping if it was named. Effectively,
    # this creates two entries in the mapping for a single tick. Besides, I like having the LGroup as the tick, as it
    # provides extra info as to where it comes from.
    if np.isscalar(v):
        return v
    if isinstance(v, Group):
        if v.name is not None:
            return v.name
        return _to_tick(v.to_label())
    if isinstance(v, slice):
        return _slice_to_str(v)
    if isinstance(v, (tuple, list)):
        if len(v) == 1:
            return str(v) + ','
        # TODO: it would be nicer/saner to use n=1, sep='' but this currently breaks at lot of tests
        return _seq_summary(v, n=1000, repr_func=str, sep=',')
    return str(v)


def _to_ticks(s, parse_single_int=False):
    """
    Makes a (list of) value(s) usable as the collection of labels for an Axis (ie hashable).

    Strip strings, split them on ',' and translate "range strings" to list of values **including the end point** !

    Parameters
    ----------
    s : iterable
        List of values usable as the collection of labels for an Axis.
    parse_single_int : bool, optional
        Only used when s is a string, in which case it is forwarded to _seq_str_to_seq. Defaults to False.

    Returns
    -------
    collection of labels

    Raises
    ------
    ValueError
        If s is a string defining a slice (using ':').
    TypeError
        If s is not iterable.

    Notes
    -----
    This function is only used in Axis.__init__ and union().

    Examples
    --------
    >>> _to_ticks('M , F')
    ['M', 'F']
    >>> _to_ticks('A,C..E,F..G,Z')
    ['A', 'C', 'D', 'E', 'F', 'G', 'Z']
    >>> _to_ticks('U')
    ['U']
    >>> list(_to_ticks('..3'))
    [0, 1, 2, 3]
    """
    if isinstance(s, ABCAxis):
        return s.labels
    if isinstance(s, Group):
        # a single LGroup used for all ticks of an Axis
        return _to_ticks(s.eval())
    if isinstance(s, pd.Index):
        return s.values
    if isinstance(s, np.ndarray):
        # we assume it has already been translated
        # XXX: Is it a safe assumption?
        return s
    if isinstance(s, (list, tuple)):
        return [_to_tick(e) for e in s]
    if sys.version >= '3' and isinstance(s, range):
        return list(s)
    if isinstance(s, basestring):
        parsed = _seq_str_to_seq(s, parse_single_int=parse_single_int)
        if isinstance(parsed, slice):
            raise ValueError("using : to define axes is deprecated, please use .. instead")
        # a single label is wrapped in a list so that the result is always a collection
        if isinstance(parsed, (basestring, int)):
            return [parsed]
        return parsed
    if hasattr(s, '__array__'):
        return s.__array__()
    try:
        return list(s)
    except TypeError:
        raise TypeError("ticks must be iterable (%s is not)" % type(s))


# optional "axis[" or "axis.i[" prefix (group 1, with the axis name in group 2 and the ".i" marker in group 3)
# followed by the rest of the string (group 4). A raw string is used so that the regular expression escapes are
# not interpreted as (invalid) string escape sequences.
_axis_name_pattern = re.compile(r'\s*(([A-Za-z]\w*)(\.i)?\s*\[)?(.*)')


def _seq_str_to_seq(s, stack_depth=1, parse_single_int=False):
    """
    Converts a sequence string to its sequence (or scalar)

    The kind of result depends on the separators present in the string, which are checked in this order:
    ':' (slice), both ',' and '..' (list mixing single values and ranges), ',' only (list), '..' only (range).
    A string without any of those separators is parsed as a single bound.

    Parameters
    ----------
    s : basestring
        string to parse
    stack_depth : int, optional
        value forwarded (after being incremented to account for the intermediate calls) to _parse_bound and
        _range_str_to_range. Defaults to 1.
    parse_single_int : bool, optional
        forwarded to _parse_bound (as its `parse_int` argument), only when the string contains no separator at all.
        Defaults to False.

    Returns
    -------
    scalar, slice, range or list
    """
    numcolons = s.count(':')
    if numcolons:
        assert numcolons <= 2
        # bounds can be of len 2 or 3 (if step is provided)
        # stack_depth + 2 because the list comp has its own stack
        bounds = [_parse_bound(b, stack_depth + 2) for b in s.split(':')]
        return slice(*bounds)
    elif ',' in s and '..' in s:
        # strip extremity commas to avoid empty string sequence elements
        s = s.strip(',')

        # converts each comma-separated element to a sequence (a range or a tuple of one value) so that they can
        # all be chained together
        def to_seq(b, stack_depth=1):
            if '..' in b:
                return _range_str_to_range(b, stack_depth + 1)
            else:
                parsed = _parse_bound(b, stack_depth + 1)
                return (parsed,)

        # stack_depth + 2 because the list comp has its own stack
        return list(chain(*[to_seq(b, stack_depth + 2) for b in s.split(',')]))
    elif ',' in s:
        # strip extremity commas to avoid empty string sequence elements
        s = s.strip(',')
        return [_parse_bound(b, stack_depth + 2) for b in s.split(',')]
    elif '..' in s:
        return _range_str_to_range(s, stack_depth + 1)
    else:
        return _parse_bound(s, stack_depth + 1, parse_int=parse_single_int)


def _to_key(v, stack_depth=1, parse_single_int=False):
    """
    Converts a value to a key usable for indexing (slice object, list of values,...).
    Strings are split on ',' and stripped. Colons (:) are interpreted as slices.

    Strings can also start with an axis name followed by the key between brackets ("axis[key]" or "axis.i[key]"
    for a positional key) and/or end with a group name (">> name"). In those cases an LGroup (or an IGroup for
    positional keys) is returned.

    Parameters
    ----------
    v : int or basestring or tuple or list or slice or LArray or Group
        value to convert into a key usable for indexing
    stack_depth : int, optional
        value forwarded (after being incremented) to the functions parsing the string. Defaults to 1.
    parse_single_int : bool, optional
        forwarded to _seq_str_to_seq for strings without axis nor group name. Defaults to False.

    Returns
    -------
    key
        a key represents any object that can be used for indexing

    Raises
    ------
    TypeError
        If the type of v is not supported.

    Examples
    --------
    >>> _to_key('a:c')
    slice('a', 'c', None)
    >>> _to_key('a, b,c ,')
    ['a', 'b', 'c']
    >>> _to_key('a..c')
    ['a', 'b', 'c']
    >>> _to_key('a,c..e,g..h,z')
    ['a', 'c', 'd', 'e', 'g', 'h', 'z']
    >>> _to_key('a,')
    ['a']
    >>> _to_key(' a ')
    'a'
    >>> _to_key(10)
    10
    >>> _to_key('10')
    '10'
    >>> _to_key('10:20')
    slice(10, 20, None)
    >>> _to_key(slice('10', '20'))
    slice('10', '20', None)
    >>> _to_key('year.i[-1]')
    year.i[-1]
    >>> _to_key('age[10:19]>>teens')
    age[10:19] >> 'teens'
    >>> _to_key('a,b,c >> abc')
    LGroup(['a', 'b', 'c']) >> 'abc'
    >>> _to_key('a:c >> abc')
    LGroup(slice('a', 'c', None)) >> 'abc'

    # evaluated variables do not work on Python 2, probably because the stackdepth is different
    # >>> ext = [1, 2, 3]
    # >>> _to_key('{ext} >> ext')
    # LGroup([1, 2, 3]) >> 'ext'
    # >>> answer = 42
    # >>> _to_key('{answer}')
    # 42
    # >>> _to_key('{answer} >> answer')
    # LGroup(42) >> 'answer'
    # >>> _to_key('10:{answer} >> answer')
    # LGroup(slice(10, 42, None)) >> 'answer'
    # >>> _to_key('4,{answer},2 >> answer')
    # LGroup([4, 42, 2]) >> 'answer'
    # >>> list(_to_key('40..{answer}'))
    # [40, 41, 42]
    # >>> _to_key('4,40..{answer},2')
    # [4, 40, 41, 42, 2]
    # >>> _to_key('4,40..{answer},2 >> answer')
    # LGroup([4, 40, 41, 42, 2]) >> 'answer'
    """
    if isinstance(v, tuple):
        return list(v)
    elif isinstance(v, basestring):
        # axis name
        # the pattern always matches: the axis prefix is optional and the last group accepts anything
        m = _axis_name_pattern.match(v)
        _, axis, positional, key = m.groups()
        # group name. using rfind in the unlikely case there is another >>
        name_pos = key.rfind('>>')
        name = None
        if name_pos != -1:
            key, name = key[:name_pos].strip(), key[name_pos + 2:].strip()
        if axis is not None:
            axis = axis.strip()
            # position (in v) of the bracket opening the key
            axis_bracket_open = m.end(1) - 1
            # check that the string parentheses are correctly balanced
            _ = find_closing_chr(v, axis_bracket_open)
            # strip closing bracket (it should be at the end because we took care of the name earlier)
            assert key[-1] == ']'
            key = key[:-1]
        if name is not None or axis is not None:
            # positional is the matched '.i' string (or None), so it is only used for its truthiness
            cls = IGroup if positional else LGroup
            key = _to_key(key, stack_depth + 1, parse_single_int=positional)
            return cls(key, name=name, axis=axis)
        else:
            return _seq_str_to_seq(v, stack_depth + 1, parse_single_int=parse_single_int)
    elif v is Ellipsis or np.isscalar(v) or isinstance(v, (Group, slice, list, np.ndarray, ABCLArray, OrderedSet)):
        # already usable as a key
        return v
    else:
        raise TypeError("%s has an invalid type (%s) for a key" % (v, type(v).__name__))


def _to_keys(value, stack_depth=1):
    """
    Converts a (collection of) group(s) to a structure usable for indexing.

    'label' or ['l1', 'l2'] or [['l1', 'l2'], ['l3']]

    Strings containing ';' are split on that character and handled like a tuple of strings. Each element of
    a tuple is converted to a key (using _to_key) and a tuple of keys is returned. Any other value is converted
    to a single key.

    Parameters
    ----------
    value : int or basestring or tuple or list or slice or LArray or Group
        (collection of) value(s) to convert into key(s) usable for indexing
    stack_depth : int, optional
        value forwarded (after being incremented) to _to_key. Defaults to 1.

    Returns
    -------
    list of keys

    Examples
    --------
    It is only used for .sum(axis=xxx)
    >>> _to_keys('P01,P02')  # <-- one group => collapse dimension
    ['P01', 'P02']
    >>> _to_keys(('P01,P02',))  # <-- do not collapse dimension
    (['P01', 'P02'],)
    >>> _to_keys('P01;P02,P03;:')
    ('P01', ['P02', 'P03'], slice(None, None, None))

    # evaluated variables do not work on Python 2, probably because the stack depth is different
    # >>> ext = 'P03'
    # >>> to_keys('P01,P02,{ext}')
    # ['P01', 'P02', 'P03']
    # >>> to_keys('P01;P02;{ext}')
    # ('P01', 'P02', 'P03')

    >>> _to_keys('age[10:19] >> teens ; year.i[-1]')
    (age[10:19] >> 'teens', year.i[-1])

    # >>> to_keys('P01,P02,:') # <-- INVALID !
    # it should have an explicit failure

    # we allow this, even though it is a dubious syntax
    >>> _to_keys(('P01', 'P02', ':'))
    ('P01', 'P02', slice(None, None, None))

    # it is better to use explicit groups
    >>> _to_keys(('P01,', 'P02,', ':'))
    (['P01'], ['P02'], slice(None, None, None))

    # or even the ugly duck...
    >>> _to_keys((('P01',), ('P02',), ':'))
    (['P01'], ['P02'], slice(None, None, None))
    """
    # ';' separates several groups within a single string
    if isinstance(value, basestring) and ';' in value:
        value = tuple(value.split(';'))

    if isinstance(value, tuple):
        # stack_depth + 2 because the list comp has its own stack
        return tuple([_to_key(group, stack_depth + 2) for group in value])
    else:
        return _to_key(value, stack_depth + 1)


# forbidden characters in sheet names
# (raw string so that the regular expression escapes are not interpreted as invalid string escape sequences)
_sheet_name_pattern = re.compile(r'[\\/?*\[\]:]')


def _translate_sheet_name(sheet_name):
    """
    Converts a Group to a string usable as sheet name and checks the length of sheet names.

    Parameters
    ----------
    sheet_name : any
        Groups are converted to the string form of their tick, in which characters forbidden in sheet names are
        replaced by underscores. Other values are not modified.

    Returns
    -------
    any
        str if sheet_name is a Group, sheet_name itself otherwise.

    Raises
    ------
    ValueError
        If the (translated) sheet name is a string longer than 31 characters.
    """
    if isinstance(sheet_name, Group):
        sheet_name = _sheet_name_pattern.sub('_', str(_to_tick(sheet_name)))
    # names of exactly 31 characters are valid
    if isinstance(sheet_name, basestring) and len(sheet_name) > 31:
        raise ValueError("Sheet names cannot exceed 31 characters")
    return sheet_name


# forbidden characters for dataset names in HDF files
# (raw string so that the regular expression escapes are not interpreted as invalid string escape sequences)
_key_hdf_pattern = re.compile(r'[\\/]')


def _translate_key_hdf(key):
    """
    Converts a Group to a string usable as dataset name in an HDF file.

    Parameters
    ----------
    key : any
        Groups are converted to the string form of their tick, in which slashes and backslashes are replaced by
        underscores. Other values are not modified.

    Returns
    -------
    any
        str if key is a Group, key itself otherwise.
    """
    if isinstance(key, Group):
        key = _key_hdf_pattern.sub('_', str(_to_tick(key)))
    return key


def union(*args):
    """
    Returns the union of several "value strings" as a list.

    Parameters
    ----------
    *args
        (collection of) value(s) to be converted into label(s). Repeated values are taken only once.

    Returns
    -------
    list of labels
        empty list if no argument is given.

    Examples
    --------
    >>> union('a', 'a, b, c, d', ['d', 'e', 'f'], '..2')
    ['a', 'b', 'c', 'd', 'e', 'f', 0, 1, 2]
    """
    # TODO: add support for LGroup and lists
    if args:
        # each argument is converted to labels, then all labels are chained and duplicates are dropped
        return list(unique(chain(*(_to_ticks(arg) for arg in args))))
    else:
        return []


class IGroupMaker(object):
    """
    Factory of IGroup objects bound to a given axis: indexing an IGroupMaker with a key returns
    an (unnamed) IGroup for that key on the axis.

    Attributes
    ----------
    axis : Axis
        axis the generated groups are bound to.

    Notes
    -----
    This class is used by the method `Axis.i`
    """
    def __init__(self, axis):
        assert isinstance(axis, ABCAxis)
        self.axis = axis

    def __getitem__(self, key):
        return IGroup(key, name=None, axis=self.axis)


# We need a separate class for LGroup and cannot simply create a new Axis with a subset of values/ticks/labels:
# the subset of ticks/labels of the LGroup need to correspond to its *Axis* indices
class Group(object):
    """Abstract Group.
    """
    format_string = None

    def __init__(self, key, name=None, axis=None):
        """
        Parameters
        ----------
        key : any
            key of the group. Tuples are converted to lists and groups are replaced by their label-based key.
        name : any, optional
            name of the group. It is converted to a string unless it is None. Defaults to None.
        axis : str, int or Axis, optional
            axis the group is defined on. Defaults to None.
        """
        if isinstance(key, tuple):
            key = list(key)
        elif isinstance(key, Group):
            key = key.to_label()
        self.key = remove_nested_groups(key)

        # we do NOT assign a name automatically when missing because that makes it impossible to know whether a name
        # was explicitly given or not
        if name is not None:
            name = str(_to_tick(name))
        self.name = name
        assert axis is None or isinstance(axis, (basestring, int, ABCAxis)), \
            "invalid axis '%s' (%s)" % (axis, type(axis).__name__)

        # we could check the key is valid but this can be slow and could be useless
        # TODO: for performance reasons, we should cache the result. This will need to be invalidated correctly
        # axis.translate(key)

        # we store the Axis object and not its name like we did previously so that groups on anonymous axes are more
        # meaningful and that we can iterate on a slice of an axis (an LGroup). The reason to store the name instead of
        # the object was to make sure that a Group from an axis (or without axis) could be used on another axis with
        # the same name. See test_array.py:test_...
        self.axis = axis

    def __repr__(self):
        """
        Returns either `format_string` filled with the axis name and the key (when the axis has a name) or a
        constructor-like form "ClassName(key[, axis=...])" otherwise, followed by " >> 'name'" for named groups.
        """
        key = self.key
        axis = self.axis

        # eval only returns a slice for groups without an Axis object
        if isinstance(key, slice):
            key_repr = _slice_to_str(key, repr_func=repr)
        elif isinstance(key, (tuple, list, np.ndarray, OrderedSet)):
            key_repr = _seq_summary(key, n=1000, repr_func=repr, sep=', ')
        else:
            key_repr = repr(key)

        axis_name = axis.name if isinstance(axis, ABCAxis) else axis
        if axis_name is None:
            # anonymous axis or no axis at all
            axis_ref = '' if axis is None else ', axis={}'.format(repr(axis))
            if isinstance(key, slice):
                key_repr = repr(key)
            elif isinstance(key, list):
                key_repr = '[{}]'.format(key_repr)
            text = '{}({}{})'.format(self.__class__.__name__, key_repr, axis_ref)
        else:
            if isinstance(axis, ABCAxisReference):
                axis_name = 'X.{}'.format(axis_name)
            text = self.format_string.format(axis=axis_name, key=key_repr)

        if self.name is None:
            return text
        return "{} >> {}".format(text, repr(self.name))

    def __str__(self):
        return str(self.eval())

    # TODO: rename to "to_positional"
    def translate(self, bound=None, stop=False):
        """
        Translate key to a position if it is not already

        This base implementation always raises NotImplementedError.

        Parameters
        ----------
        bound : any, optional
            NOTE(review): presumably a bound to translate instead of the whole key -- confirm in the subclasses.
        stop : bool, optional
            NOTE(review): presumably whether bound is a stop bound -- confirm in the subclasses.

        Returns
        -------
        int-based key (single int, slice of int or tuple/list/array of them)
        """
        raise NotImplementedError()

    def eval(self):
        """
        Translate key to labels, if it is not already, expanding slices in the process.

        This base implementation always raises NotImplementedError.

        Returns
        -------
        label-based key (single scalar or tuple/list/array of them)
        """
        raise NotImplementedError()

    def to_label(self):
        """
        Translate key to labels, if it is not already

        This base implementation always raises NotImplementedError.

        Returns
        -------
        label-based key (single scalar, slice of scalars or tuple/list/array of them)
        """
        raise NotImplementedError()

    def retarget_to(self, target_axis):
        """Retarget group to another axis.

        It will be translated to an LGroup using its former axis, if necessary.

        Parameters
        ----------
        target_axis : Axis
            axis to conform to

        Returns
        -------
        Group with axis, raise ValueError if retargeting is not possible
        """
        if self.axis is target_axis:
            return self
        elif isinstance(self.axis, basestring) or isinstance(self.axis, ABCAxisReference):
            axis_name = self.axis.name if isinstance(self.axis, ABCAxisReference) else self.axis
            if axis_name != target_axis.name:
                raise ValueError('cannot retarget a Group defined without a real axis object (e.g. using '
                                 'an AxisReference (x.)) to an axis with a different name')
            return self.__class__(self.key, self.name, target_axis)
        elif self.axis.equals(target_axis) or isinstance(self.axis, int):
            # in the case of isinstance(self.axis, int), we can only hope the axis corresponds. This is the
            # case if we come from _translate_axis_key_chunk, but if the users calls this manually, we cannot know.
            # XXX: maybe changing this to retarget_to_axes would be a good idea after all?

            # just change the axis object
            return self.__class__(self.key, self.name, target_axis)
        else:
            # to retarget to another (non-equal) Axis, we need to translate to labels and expand slices
            return LGroup(self.eval(), self.name, target_axis)

    def __len__(self):
        # XXX: we probably want to_label instead of .eval (so that we do not expand slices)
        value = self.eval()
        # for some reason this breaks having LGroup ticks/labels on an axis
        # if isinstance(value, (tuple, list, LArray, np.ndarray, str)):
        if hasattr(value, '__len__'):
            return len(value)
        elif isinstance(value, slice):
            start, stop, key_step = value.start, value.stop, value.step
            # not using stop - start because that does not work for string bounds
            # (and it is different for LGroup & IGroup)
            start_pos = self.translate(start)
            stop_pos = self.translate(stop)
            return stop_pos - start_pos
        else:
            raise TypeError('len() of unsized object ({})'.format(value))

    def __iter__(self):
        # XXX: use translate/IGroup instead, so that it works even in the presence of duplicate labels
        #      possibly, only if axis is set?
        return iter([LGroup(v, axis=self.axis) for v in self.eval()])

    def named(self, name):
        """Returns a group of the same class, with the same key and axis, but a different name.

        Also available as the `>>` operator.

        Parameters
        ----------
        name : str
            new name for group

        Returns
        -------
        Group
        """
        cls = self.__class__
        return cls(self.key, name, self.axis)
    __rshift__ = named

    def with_axis(self, axis):
        """Returns a group of the same class, with the same key and name, but a different axis.

        Parameters
        ----------
        axis : int, str, Axis
            new axis for group

        Returns
        -------
        Group
        """
        cls = self.__class__
        return cls(self.key, self.name, axis)

    def by(self, length, step=None):
        """Split group into several groups of specified length.

        Parameters
        ----------
        length : int
            length of new groups
        step : int, optional
            step between groups. Defaults to length.

        Notes
        -----
        step can be smaller than length, in which case, this will produce overlapping groups.

        Returns
        -------
        list of Group

        Examples
        --------
        >>> from larray import Axis, X
        >>> age = Axis(range(10), 'age')
        >>> age[[1, 2, 3, 4, 5]].by(2)
        (age[1, 2], age[3, 4], age[5])
        >>> age[1:5].by(2)
        (age.i[1:3], age.i[3:5], age.i[5:6])
        >>> age[1:5].by(2, 4)
        (age.i[1:3], age.i[5:6])
        >>> age[1:5].by(3, 2)
        (age.i[1:4], age.i[3:6], age.i[5:6])
        >>> X.age[[0, 1, 2, 3, 4]].by(2)
        (X.age[0, 1], X.age[2, 3], X.age[4])
        """
        if step is None:
            step = length
        return tuple(self[start:start + length]
                     for start in range(0, len(self), step))

    # TODO: __getitem__ should work by label and .i[] should work by position. I guess it would be more consistent this
    # way even if the usefulness of subsetting a group with labels is dubious (but it is sometimes practical to treat
    # the group as if it was an axis).
    # >>> vla = geo['...']
    # >>> # first 10 regions of flanders (this could have some use)
    # >>> vla.i[:10]  # => IGroup on geo
    # >>> vla["antwerp", "gent"]  # => LGroup on geo

    # LGroup[] => LGroup
    # IGroup[] => LGroup
    # IGroup.i[] => IGroup
    # LGroup.i[] => IGroup
    def __getitem__(self, key):
        """
        Returns a subset of this group, using positions relative to the group itself.

        Parameters
        ----------
        key : int, slice of int or list of int
            position-based key (even for LGroup). When this group is defined by a slice, a forward slice key
            (positive step) may omit its start and/or stop, meaning "from the first position of the group" and/or
            "up to the end of the group".

        Returns
        -------
        Group
            A group of the same class as this one when the key of this group is a list, a tuple or an array, or an
            IGroup when the key of this group is a slice. When the key of this group is a single int, the result is
            not a group but the evaluated value subsetted by `key`.

        Raises
        ------
        TypeError
            If the key of this group does not support taking a subset or if `key` has an unsupported type.
        """
        cls = self.__class__
        orig_key = self.key
        # XXX: unsure we should support tuple
        if isinstance(orig_key, (tuple, list)):
            return cls(orig_key[key], None, self.axis)
        elif isinstance(orig_key, slice):
            orig_start, orig_stop, orig_step = orig_key.start, orig_key.stop, orig_key.step
            if orig_step is None:
                orig_step = 1

            orig_start_pos = self.translate(orig_start) if orig_start is not None else 0
            if isinstance(key, slice):
                key_start, key_stop, key_step = key.start, key.stop, key.step
                if key_step is None:
                    key_step = 1

                # NOTE(review): when this group has no stop bound, len(self) is used as a position on the axis;
                #               this looks wrong for groups which do not start at position 0 -- confirm
                orig_stop_pos = self.translate(orig_stop, stop=True) if orig_stop is not None else len(self)
                # a missing bound in a forward slice key means "from the start"/"up to the end" of this group.
                # Backward slice keys (negative step) with a missing bound are not supported and still raise a
                # TypeError in the arithmetic below.
                forward = key_step > 0
                if key_start is None and forward:
                    key_start = 0
                new_start = orig_start_pos + key_start * orig_step
                if key_stop is None and forward:
                    new_stop = orig_stop_pos
                else:
                    # never go past the stop bound of this group
                    new_stop = min(orig_start_pos + key_stop * orig_step, orig_stop_pos)
                new_step = orig_step * key_step
                if new_step == 1:
                    new_step = None
                return IGroup(slice(new_start, new_stop, new_step), None, self.axis)
            elif isinstance(key, (int, np.integer)):
                return IGroup(orig_start_pos + key * orig_step, None, self.axis)
            elif isinstance(key, (tuple, list)):
                return IGroup([orig_start_pos + k * orig_step for k in key], None, self.axis)
            else:
                # fail explicitly instead of silently returning None
                raise TypeError("cannot take a subset of {} using a key of type '{}'"
                                .format(self.key, type(key).__name__))
        elif isinstance(orig_key, ABCLArray):
            # XXX: why .i ?
            return cls(orig_key.i[key], None, self.axis)
        elif isinstance(orig_key, int):
            # give the opportunity to subset the label/key itself (for example for string keys)
            value = self.eval()
            return value[key]
        else:
            raise TypeError("cannot take a subset of {} because it has a '{}' key".format(self.key, type(self.key)))

    def _ipython_key_completions_(self):
        return list(self.eval())

    # method factory
    def _binop(opname):
        """Builds the special method ``__<opname>__`` which delegates the operation to the evaluated group.

        Parameters
        ----------
        opname : str
            Name of the operation, without the surrounding double underscores (e.g. 'add').

        Returns
        -------
        function
            Method taking (self, other). If other is a Group, it is evaluated before the operation is applied.
        """
        dunder = '__{}__'.format(opname)

        # TODO: implement this in a delayed fashion for axes references
        if not PY2:
            def opmethod(self, other):
                if isinstance(other, Group):
                    other = other.eval()
                return getattr(self.eval(), dunder)(other)
        else:
            # On Python2, slice objects do not define any __binop__ method (even though the actual operations work
            # on them), so slices are compared via their (start, stop, step) tuple instead.
            def opmethod(self, other):
                left = self.eval()
                right = other.eval() if isinstance(other, Group) else other
                # evaluating a group only yields a slice when self.axis is not an Axis instance
                if isinstance(left, slice):
                    if not isinstance(right, slice):
                        # FIXME: we should raise a TypeError instead for all ops except == and !=
                        # FIXME: we should return True for !=
                        return False
                    # FIXME: we should raise a TypeError instead of doing this for all ops except comparison ops
                    left = (left.start, left.stop, left.step)
                    right = (right.start, right.stop, right.step)
                return getattr(left, dunder)(right)

        opmethod.__name__ = dunder
        return opmethod

    # All the special methods below are generated by _binop: they apply the corresponding operation to the
    # evaluated group (the other operand is evaluated first when it is a Group too).
    __matmul__ = _binop('matmul')
    __ror__ = _binop('ror')
    __or__ = _binop('or')
    __rxor__ = _binop('rxor')
    __xor__ = _binop('xor')
    __rand__ = _binop('rand')
    __and__ = _binop('and')
    __rpow__ = _binop('rpow')
    __pow__ = _binop('pow')
    __rdivmod__ = _binop('rdivmod')
    __divmod__ = _binop('divmod')
    __rmod__ = _binop('rmod')
    __mod__ = _binop('mod')
    __rfloordiv__ = _binop('rfloordiv')
    __floordiv__ = _binop('floordiv')
    __rtruediv__ = _binop('rtruediv')
    __truediv__ = _binop('truediv')
    # __div__ and __rdiv__ only exist on Python 2. This uses the PY2 flag (like _binop does) instead of comparing
    # the sys.version string, because strings are compared lexicographically (e.g. '10.0' < '3' is True).
    if PY2:
        __div__ = _binop('div')
        __rdiv__ = _binop('rdiv')
    __rmul__ = _binop('rmul')
    __mul__ = _binop('mul')
    __rsub__ = _binop('rsub')
    __sub__ = _binop('sub')
    __radd__ = _binop('radd')
    __add__ = _binop('add')

    __ge__ = _binop('ge')
    __gt__ = _binop('gt')
    __le__ = _binop('le')
    __lt__ = _binop('lt')

    # having ne and eq use .eval on a slice group creates an ndarray, for which __eq__ does not return a single value,
    # which means it cannot be in a mapping/Axis, but this is no longer a problem, since we do not create axes with
    # LGroup labels anymore anyway
    __ne__ = _binop('ne')
    __eq__ = _binop('eq')

    def set(self):
        """Converts this group to a label set (LSet).

        Returns
        -------
        LSet
            Set built from the evaluated labels of this group, with the same name and axis.
        """
        labels = self.eval()
        return LSet(labels, name=self.name, axis=self.axis)

    def union(self, other):
        """Returns the (set) union of this label group and other.

        The group is first converted to a set, so only unique labels are returned. The relative order of labels is
        kept, with labels from this group coming before labels from other.

        Parameters
        ----------
        other : Group or any sequence of labels
            other labels

        Returns
        -------
        LSet

        Examples
        --------
        >>> from larray import Axis
        >>> letters = Axis('letters=a..d')
        >>> letters['a', 'b'].union(letters['b', 'c'])
        letters['a', 'b', 'c'].set()
        >>> letters['a', 'b'].union(['b', 'c'])
        letters['a', 'b', 'c'].set()
        """
        own_labels = self.set()
        return own_labels.union(other)

    def intersection(self, other):
        """Returns the (set) intersection of this label group and other.

        The result contains the labels of this group which are also present in other. The group is first converted
        to a set, so only unique labels are returned, in the same relative order.

        Parameters
        ----------
        other : Group or any sequence of labels
            other labels

        Returns
        -------
        LSet

        Examples
        --------
        >>> from larray import Axis
        >>> letters = Axis('letters=a..d')
        >>> letters['a', 'b'].intersection(letters['b', 'c'])
        letters['b'].set()
        >>> letters['a', 'b'].intersection(['b', 'c'])
        letters['b'].set()
        """
        own_labels = self.set()
        return own_labels.intersection(other)

    def difference(self, other):
        """Returns the (set) difference of this label group and other.

        The result contains the labels of this group which are not present in other. The group is first converted to
        a set, so only unique labels are returned, in the same relative order.

        Parameters
        ----------
        other : Group or any sequence of labels
            other labels

        Returns
        -------
        LSet

        Examples
        --------
        >>> from larray import Axis
        >>> letters = Axis('letters=a..d')
        >>> letters['a', 'b'].difference(letters['b', 'c'])
        letters['a'].set()
        >>> letters['a', 'b'].difference(['b', 'c'])
        letters['a'].set()
        """
        own_labels = self.set()
        return own_labels.difference(other)

    def __contains__(self, item):
        """Tests whether item is in the evaluated group (a Group item is evaluated first)."""
        needle = item.eval() if isinstance(item, Group) else item
        return needle in self.eval()

    def startingwith(self, prefix):
        """
        Returns a group with the labels of this group which start with the specified string.

        Parameters
        ----------
        prefix : str or Group
            The prefix to search for. A Group is evaluated first.

        Returns
        -------
        LGroup
            Group (on the same axis) containing all the labels starting with the given string.

        Examples
        --------
        >>> from larray import Axis
        >>> people = Axis(['Bruce Wayne', 'Arthur Dent', 'Harvey Dent'], 'people')
        >>> group = people.endingwith('Dent')
        >>> group
        people['Arthur Dent', 'Harvey Dent']
        >>> group.startingwith('Art')
        people['Arthur Dent']
        """
        start = prefix.eval() if isinstance(prefix, Group) else prefix
        selected = [label for label in self.eval() if label.startswith(start)]
        return LGroup(selected, axis=self.axis)

    def endingwith(self, suffix):
        """
        Returns a group with the labels of this group which end with the specified string.

        Parameters
        ----------
        suffix : str or Group
            The suffix to search for. A Group is evaluated first.

        Returns
        -------
        LGroup
            Group (on the same axis) containing all the labels ending with the given string.

        Examples
        --------
        >>> from larray import Axis
        >>> people = Axis(['Bruce Wayne', 'Bruce Willis', 'Arthur Dent'], 'people')
        >>> group = people.startingwith('Bru')
        >>> group
        people['Bruce Wayne', 'Bruce Willis']
        >>> group.endingwith('yne')
        people['Bruce Wayne']
        """
        end = suffix.eval() if isinstance(suffix, Group) else suffix
        selected = [label for label in self.eval() if label.endswith(end)]
        return LGroup(selected, axis=self.axis)

    def matching(self, pattern):
        """
        Returns a group with the labels of this group which match the specified pattern (regular expression).

        Parameters
        ----------
        pattern : str or Group
            Regular expression (regex). A Group is evaluated first.

        Returns
        -------
        LGroup
            Group (on the same axis) containing all the labels matching the pattern.

        Notes
        -----
        The pattern is applied with ``re.match``, i.e. it must match from the beginning of each label.

        See `Regular Expression <https://docs.python.org/3/library/re.html>`_
        for more details about how to build a pattern.

        Examples
        --------
        >>> from larray import Axis
        >>> people = Axis(['Bruce Wayne', 'Bruce Willis', 'Arthur Dent'], 'people')

        All labels containing "B" and "e" with exactly 3 characters in between are given by

        >>> group = people.matching('B...e')
        >>> group
        people['Bruce Wayne', 'Bruce Willis']

        Within that group, all labels containing any characters then W then any characters then s are given by

        >>> group.matching('.*W.*s')
        people['Bruce Willis']
        """
        regex = pattern.eval() if isinstance(pattern, Group) else pattern
        is_match = re.compile(regex).match
        selected = [label for label in self.eval() if is_match(label)]
        return LGroup(selected, axis=self.axis)

    def containing(self, substring):
        """
        Returns a group with the labels of this group which contain the specified substring.

        Parameters
        ----------
        substring : str or Group
            The substring to search for. A Group is evaluated first.

        Returns
        -------
        LGroup
            Group (on the same axis) containing all the labels containing the substring.

        Examples
        --------
        >>> from larray import Axis
        >>> people = Axis(['Bruce Wayne', 'Bruce Willis', 'Arthur Dent'], 'people')
        >>> group = people.startingwith('Bru')
        >>> group
        people['Bruce Wayne', 'Bruce Willis']
        >>> group.containing('Will')
        people['Bruce Willis']
        """
        fragment = substring.eval() if isinstance(substring, Group) else substring
        selected = [label for label in self.eval() if fragment in label]
        return LGroup(selected, axis=self.axis)

    # this makes range(LGroup(int)) possible
    def __index__(self):
        return self.eval().__index__()

    def __int__(self):
        # 'str' objects have no '__int__' attribute, so this is better than calling __int__ explicitly
        return int(self.eval())

    def __float__(self):
        # 'str' objects have no '__float__' attribute, so this is better than calling __float__ explicitly
        return float(self.eval())

    def __array__(self, dtype=None):
        return np.asarray(self.eval(), dtype=dtype)

    def __dir__(self):
        # called by dir() and tab-completion at the interactive prompt, must return a list of any valid getattr key.
        # dir() takes care of sorting but not uniqueness, so we must ensure that.
        return list(set(dir(self.eval())) | set(self.__dict__.keys()) | set(dir(self.__class__)))

    def __getattr__(self, key):
        if key == '__array_struct__':
            raise AttributeError("'Group' object has no attribute '__array_struct__'")
        else:
            return getattr(self.eval(), key)

    def __hash__(self):
        # The hash is computed on the "tick" form of the key. to_tick & to_key are partially opposite operations but
        # going through _to_tick standardizes on a single notation so that equivalent keys can all target each other
        # (e.g. this removes spaces in "list strings", instead of hashing them directly).
        # XXX: but we might want to include that normalization feature in to_tick directly, instead of using to_key
        #      explicitly here
        # XXX: we might want to make hash use the position along the axis instead of the labels so that if an axis has
        #      ambiguous labels, they do not hash to the same thing.
        # XXX: for performance reasons, I think hash should not evaluate slices. It should only translate pos bounds to
        #      labels or vice versa. We would lose equality between list Groups and equivalent slice groups but that
        #      is a small price to pay if the performance impact is large.
        # Hashing _to_tick(self.translate()) instead is not an option because groups without axis could not be
        # compared anymore.
        tick = _to_tick(self.key)
        return hash(tick)


def remove_nested_groups(key):
    """Replaces the Group objects found directly inside a key by their labels.

    Parameters
    ----------
    key : slice, tuple, list or any other object
        "struct" key, possibly with Group elements (or Group slice bounds).

    Returns
    -------
    slice, tuple, list or other object
        For a slice, a new slice with Group bounds replaced by their labels (the step is kept as is). For a tuple or
        list, a new tuple or list with Group elements replaced by their labels. Any other key is returned unchanged.
    """
    # TODO: ideally if all key elements are groups on the same Axis, we should make a group on that axis
    #       for slice bounds, watch out for None
    def unwrap(item):
        return item.to_label() if isinstance(item, Group) else item

    if isinstance(key, slice):
        return slice(unwrap(key.start), unwrap(key.stop), key.step)
    if isinstance(key, (tuple, list)):
        labels = [unwrap(item) for item in key]
        return labels if not isinstance(key, tuple) else tuple(labels)
    return key


class LGroup(Group):
    """Label group.

    Represents a subset of labels of an axis.

    Parameters
    ----------
    key : key
        Anything usable for indexing. A key should be either sequence of labels, a slice with label bounds or a string.
    name : str, optional
        Name of the group.
    axis : int, str, Axis, optional
        Axis for group.

    Examples
    --------
    >>> from larray import Axis, X
    >>> age = Axis('0..100', 'age')
    >>> teens = X.age[10:19].named('teens')
    >>> teens
    X.age[10:19] >> 'teens'
    >>> teens = X.age[10:19] >> 'teens'
    >>> teens
    X.age[10:19] >> 'teens'
    """
    format_string = "{axis}[{key}]"

    def __init__(self, key, name=None, axis=None):
        # normalize the key (via _to_key) before storing it
        key = _to_key(key)
        Group.__init__(self, key, name, axis)

    # XXX: return IGroup instead?
    def translate(self, bound=None, stop=False):
        """
        compute position(s) of group

        Parameters
        ----------
        bound : label key, optional
            Key to translate. Defaults to the key of this group.
        stop : bool, optional
            Whether to add one to the resulting position when it is a scalar. Defaults to False.

        Returns
        -------
        position(s) of `bound` on the axis of this group, as returned by the index method of the axis

        Raises
        ------
        ValueError
            If the axis of this group is not an actual Axis object.
        """
        if bound is None:
            bound = self.key
        if isinstance(self.axis, ABCAxis):
            pos = self.axis.index(bound)
            return pos + int(stop) if np.isscalar(pos) else pos
        else:
            raise ValueError("Cannot translate an LGroup without axis")

    def to_label(self):
        """Returns the label(s) of this group, i.e. its key, as is."""
        return self.key

    def eval(self):
        """Returns the label(s) of this group.

        A slice key is expanded to the corresponding labels when the axis of this group is an actual Axis object.
        In all other cases, the key is returned as is.
        """
        if isinstance(self.key, slice):
            if isinstance(self.axis, ABCAxis):
                # expand slices
                return self.axis.labels[self.translate()]
            else:
                # a slice group without an actual axis cannot be expanded, so the slice itself is returned
                # (raising ValueError("Cannot evaluate a slice group without axis") was considered instead)
                return self.key
        else:
            # we do not check the group labels are actually valid on Axis
            return self.key


class LSet(LGroup):
    """Label set.

    Represents a set of (unique) labels of an axis.

    Parameters
    ----------
    key : key
        Anything usable for indexing. A key should be either sequence of labels, a slice with label bounds or a string.
    name : str, optional
        Name of the set.
    axis : int, str, Axis, optional
        Axis for set.

    Examples
    --------
    >>> from larray import Axis
    >>> letters = Axis('letters=a..z')
    >>> abc = letters[':c'].set() >> 'abc'
    >>> abc
    letters['a', 'b', 'c'].set() >> 'abc'
    >>> abc & letters['b:d']
    letters['b', 'c'].set()
    """
    format_string = "{axis}[{key}].set()"

    def __init__(self, key, name=None, axis=None):
        key = _to_key(key)
        if isinstance(key, LGroup):
            # inherit the name and axis of the group unless they are given explicitly
            if name is None:
                name = key.name
            if axis is None:
                axis = key.axis
            if not isinstance(key, LSet):
                key = key.eval()
        # a single label is a set of one element
        if np.isscalar(key):
            key = [key]
        # OrderedSet is what makes the labels unique
        key = OrderedSet(key)
        LGroup.__init__(self, key, name, axis)

    # method factory
    def _binop(opname, c):
        """Builds a set operation method delegating to the ``__<opname>__`` method of the keys.

        Parameters
        ----------
        opname : str
            Name of the operation, without the surrounding double underscores (e.g. 'or').
        c : str
            Symbol of the operation (e.g. '|'), used to build the name of the result.

        Returns
        -------
        function
            Method taking (self, other) and returning an LSet.
        """
        op_fullname = '__%s__' % opname

        # TODO: implement this in a delayed fashion for reference axes
        def opmethod(self, other):
            if not isinstance(other, LSet):
                other = LSet(other)
            # use the axis of other only when this set has none
            axis = self.axis if self.axis is not None else other.axis

            # setting a meaningful name is hard when either one has no name
            if self.name is not None and other.name is not None:
                name = '%s %s %s' % (self.name, c, other.name)
            else:
                name = None
            # TODO: implement this in a more efficient way for ndarray keys which can be large
            result_set = getattr(self.key, op_fullname)(other.key)
            return LSet(result_set, name=name, axis=axis)
        opmethod.__name__ = op_fullname
        return opmethod

    union = _binop('or', '|')
    __or__ = union

    intersection = _binop('and', '&')
    __and__ = intersection

    difference = _binop('sub', '-')
    __sub__ = difference


class IGroup(Group):
    """Index Group.

    Represents a subset of indices of an axis.

    Parameters
    ----------
    key : key
        Anything usable for indexing. A key should be either a single position, a sequence of positions, or a slice
        with integer bounds.
    name : str, optional
        Name of the group.
    axis : int, str, Axis, optional
        Axis for group.
    """
    format_string = "{axis}.i[{key}]"

    def translate(self, bound=None, stop=False):
        """
        compute position(s) of group

        Parameters
        ----------
        bound : position key, optional
            Returned as is when given. Defaults to the key of this group.
        stop : bool, optional
            Not used: the key of an IGroup is already positional.

        Returns
        -------
        `bound` if it is not None, the key of this group otherwise.
        """
        if bound is not None:
            return bound
        else:
            return self.key

    def to_label(self):
        """Returns the label key equivalent to this positional group.

        Returns
        -------
        slice with label bounds if the key of this group is a slice, the label(s) at the position(s) given by the key
        otherwise.

        Raises
        ------
        ValueError
            If the axis of this group is not an actual Axis object.
        """
        if isinstance(self.axis, ABCAxis):
            labels = self.axis.labels
            key = self.key
            if isinstance(key, slice):
                start = labels[key.start] if key.start is not None else None
                # FIXME: this probably breaks for reverse slices
                # - 1 because IGroup slice stop is excluded while LGroup slice stop is included
                stop = labels[key.stop - 1] if key.stop is not None else None
                return slice(start, stop, key.step)
            else:
                # key is a single int or tuple/list/array of them
                return labels[key]
        else:
            raise ValueError("Cannot evaluate a positional group without axis")

    def eval(self):
        """Returns the label(s) of the axis at the position(s) given by the key of this group.

        Raises
        ------
        ValueError
            If the axis of this group is not an actual Axis object.
        """
        if isinstance(self.axis, ABCAxis):
            return self.axis.labels[self.key]
        else:
            raise ValueError("Cannot evaluate a positional group without axis")

    def __hash__(self):
        # the 'IGroup' marker is part of the hashed value, in addition to the tick form of the (positional) key
        return hash(('IGroup', _to_tick(self.key)))

# NOTE(review): PGroup looks like the former name of IGroup, kept available through renamed_to (imported from
#               larray.util.misc, not visible here) -- confirm what happens when the old name is used
PGroup = renamed_to(IGroup, 'PGroup')