#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# =============================================================================
# ___ ___ _____ _______
# | \/ | |_ _| | | ___ \
# | . . |_ _ ___ ___ ___ | | ___ ___ | | |_/ / _____ __
# | |\/| | | | / __|/ _ \/ _ \ | |/ _ \ / _ \| | ___ \/ _ \ \/ /
# | | | | |_| \__ \ __/ (_) | | | (_) | (_) | | |_/ / (_) > <
# \_| |_/\__,_|___/\___|\___/ \_/\___/ \___/|_\____/ \___/_/\_\
#
# @author: Nicolas Karasiak
# @site: www.karasiak.net
# @git: www.github.com/nkarasiak/MuseoToolBox
# =============================================================================
"""
The :mod:`museotoolbox.cross_validation` module gathers cross-validation classes.
"""
import numpy as np
from . import _sample_selection
def train_test_split(cv, X, y, random_state=False, **kwargs):
    """
    Split arrays into random train and test subsets according to your chosen cross-validation method.

    Quick utility that wraps input validation and the first fold of
    ``cv.split(X, y)`` and applies it to the input data in a one-liner.

    Parameters
    -----------
    cv : cross-validation function.
        Allowed function from museotoolbox or scikit-learn.
    X : array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, of length n_samples
        The target variable for supervised learning problems.
    random_state : int, RandomState instance or None, optional (default=False)
        Kept for backward compatibility; randomness is controlled by the
        ``cv`` object itself, this argument is currently unused.

    Returns
    -------
    X_train, X_test, y_train, y_test : numpy.ndarray
        The split arrays. If ``groups`` is passed via ``**kwargs`` (and is
        not None), two extra arrays ``g_train, g_test`` are appended.

    Examples
    ---------
    >>> import numpy as np
    >>> import museotoolbox as mtb
    >>> X, y = np.arange(10).reshape((5, 2)), np.arange(5)
    >>> cv = mtb.cross_validation.LeaveOneOut(random_state=42)
    >>> X_train, X_test, y_train, y_test = mtb.cross_validation.train_test_split(cv, X, y)
    """
    if y.ndim == 2:
        y = y.flatten()
    has_groups = False
    # Only the first fold produced by the cross-validator is used.
    for tr, vl in cv.split(X, y, **kwargs):
        X_train = X[tr, ...]
        y_train = y[tr]
        X_test = X[vl, ...]
        y_test = y[vl]
        if 'groups' in kwargs:
            g = kwargs['groups']
            if g is not None:
                has_groups = True
                if g.ndim == 2:
                    # Bug fix: flatten the groups array itself
                    # (previously flattened ``y`` by mistake).
                    g = g.flatten()
                g_train = g[tr, ...]
                g_test = g[vl, ...]
        break  # only the first fold is needed
    if has_groups:
        return X_train, X_test, y_train, y_test, g_train, g_test
    return X_train, X_test, y_train, y_test
class LeaveOneOut(_sample_selection._cv_manager):
    """
    Stratified Leave-One-Out cross-validation generator.

    Note : :class:`~LeaveOneOut` is equivalent to
    :class:`~museotoolbox.cross_validation.RandomStratifiedKFold` with
    ``valid_size=1`` and ``n_splits=False``.

    Parameters
    ----------
    n_repeats : int or bool, optional (default=False).
        If False : will iterate as many times as the smallest class.
        If int : will iterate the given number of times.
    random_state : integer or None, optional (default=False).
        If int, random_state is the seed used by the random number generator;
        If None, the random number generator is created with ``time.time()``.
    verbose : integer or False, optional (default=False).
        Controls the verbosity: the higher the value is, the more the messages are detailed.
    """

    def __init__(self, n_repeats=False, random_state=False, verbose=False):
        # valid_size=1 means exactly one validation sample per class per fold.
        super().__init__(
            _sample_selection.randomPerClass,
            valid_size=1,
            n_repeats=n_repeats,
            random_state=random_state,
            verbose=verbose)
class LeavePSubGroupOut(_sample_selection._cv_manager):
    """
    Generate a Cross-Validation using subgroups (each group belongs to a unique label).

    Parameters
    ----------
    valid_size : float, optional (default=0.5).
        Fraction of subgroups kept for validation; must satisfy
        ``0 < valid_size <= 1``.
    n_repeats : int or bool, optional (default=False).
        If False, n_repeats is ``int(1 / valid_size)`` (default : 1/0.5 = 2).
        If int : will iterate the given number of times.
    random_state : integer or None, optional (default=False).
        If int, random_state is the seed used by the random number generator;
        If None, the random number generator is created with ``time.time()``.
    verbose : integer or False, optional (default=False).
        Controls the verbosity: the higher the value is, the more the messages are detailed.

    Raises
    ------
    ValueError
        If ``valid_size`` is not a float, or not in the interval ``(0, 1]``.
    """

    def __init__(self, valid_size=0.5, n_repeats=False,
                 random_state=False, verbose=False):
        if not isinstance(valid_size, float):
            raise ValueError(
                'Percent must be between 0 and 1 and must be a float')
        # Bug fix: 0.0 previously slipped through the range check and crashed
        # with ZeroDivisionError in int(1 / valid_size) below; it can never
        # yield a validation set, so reject it explicitly.
        if valid_size > 1 or valid_size <= 0:
            raise ValueError('Percent must be between 0 and 1')
        if not n_repeats:
            # Default repeat count covers the whole dataset once,
            # e.g. valid_size=0.5 -> 2 repeats.
            n_repeats = int(1 / valid_size)
        super().__init__(
            _sample_selection.groupCV,
            valid_size=valid_size,
            n_repeats=n_repeats,
            random_state=random_state,
            verbose=verbose)
class LeaveOneSubGroupOut(_sample_selection._cv_manager):
    """
    Cross-validation generator that holds out one subgroup per fold.

    Parameters
    ----------
    n_repeats : int or bool, optional (default=False).
        If False : will iterate as many times as the smallest number of groups.
        If int : will iterate the given number of times.
    random_state : integer or None, optional (default=False).
        If int, random_state is the seed used by the random number generator;
        If None, the random number generator is created with ``time.time()``.
    verbose : integer or False, optional (default=False).
        Controls the verbosity: the higher the value is, the more the messages are detailed.
    """

    def __init__(self, n_repeats=False, random_state=False, verbose=False):
        # valid_size=1 selects exactly one subgroup for validation per fold.
        super().__init__(
            _sample_selection.groupCV,
            valid_size=1,
            n_repeats=n_repeats,
            random_state=random_state,
            verbose=verbose)
class SpatialLeaveAsideOut(_sample_selection._cv_manager):
    """
    Cross-validation generator keeping the farthest distance between the
    training and validation samples.

    Parameters
    ----------
    distance_matrix : numpy.ndarray, shape [n_samples, n_samples].
        Array got from function samplingMethods.getdistance_matrixForDistanceCV(inRaster,inVector)
    valid_size : float, default 0.5.
        The percentage of validation to keep : from 0 to 1.
    n_repeats : int or bool, optional (default=False).
        If False, n_repeats is 1/valid_size (default : 1/0.5 = 2)
        If int : will iterate the number of times given in n_repeats.
    random_state : integer or None, optional (default=False).
        If int, random_state is the seed used by the random number generator;
        If None, the random number generator is created with ``time.time()``.
    verbose : integer or False, optional (default=False).
        Controls the verbosity: the higher the value is, the more the messages are detailed.

    References
    ----------
    See "Combining ensemble modeling and remote sensing for mapping
    individual tree species at high spatial resolution" : https://doi.org/10.1016/j.foreco.2013.07.059.
    """

    def __init__(self, distance_matrix, valid_size=0.5,
                 n_repeats=False, random_state=False, verbose=False):
        # Delegates fold generation to the distance-based sampler.
        super().__init__(
            _sample_selection.distanceCV,
            distance_matrix=distance_matrix,
            valid_size=valid_size,
            n_repeats=n_repeats,
            random_state=random_state,
            verbose=verbose)
class SpatialLeaveOneSubGroupOut(_sample_selection._cv_manager):
    """
    Cross-validation generator using the Spatial Leave-One-SubGroup-Out method.

    Parameters
    ----------
    distance_thresold : int.
        In pixels.
    distance_matrix : numpy.ndarray, shape [n_samples, n_samples].
        Array got from function :func:`museotoolbox.vector_tools.get_distance_matrix`
    distance_label : None or array.
        If array, got from function :func:`museotoolbox.vector_tools.get_distance_matrix`
    random_state : integer or None, optional (default=False).
        If int, random_state is the seed used by the random number generator;
        If None, the random number generator is created with ``time.time()``.
    verbose : integer or False, optional (default=False).
        Controls the verbosity: the higher the value is, the more the messages are detailed.

    See also
    --------
    museotoolbox.processing.get_distance_matrix : to get distance matrix and label.
    """

    def __init__(self, distance_thresold, distance_matrix, distance_label,
                 random_state=False, verbose=False):
        # Distance threshold and per-sample labels drive the spatial exclusion.
        super().__init__(
            _sample_selection.distanceCV,
            distance_matrix=distance_matrix,
            distance_thresold=distance_thresold,
            distance_label=distance_label,
            random_state=random_state,
            verbose=verbose)
class SpatialLeaveOneOut(_sample_selection._cv_manager):
    """
    Cross-validation generator using a stratified spatial Leave-One-Out method.

    Parameters
    ----------
    distance_thresold : int.
        In pixels.
    distance_matrix : numpy.ndarray, shape [n_samples, n_samples].
        Array got from function museotoolbox.vector_tools.get_distance_matrix(inRaster,inVector)
    n_repeats : int or False, optional (default=False).
        If False : will iterate as many times as the smallest number of groups.
        If int : will iterate the number of times specified.
    n_splits : int or False, optional (default=False).
        Forwarded to the underlying sampler.
    random_state : int or False, optional (default=False).
        If int, random_state is the seed used by the random number generator;
        If None, the random number generator is created with ``time.time()``.
    verbose : integer or False, optional (default=False).
        Controls the verbosity: the higher the value is, the more the messages are detailed.

    See also
    ---------
    museotoolbox.vector_tools.get_distance_matrix : to get distance matrix and label.

    References
    ----------
    See "Spatial leave-one-out cross-validation for variable selection in the
    presence of spatial autocorrelation" : https://doi.org/10.1111/geb.12161.
    """

    def __init__(self, distance_thresold=None, distance_matrix=None,
                 n_repeats=False, n_splits=False, random_state=False,
                 verbose=False, **kwargs):
        # distance_label is disabled and valid_size=1: one validation sample
        # per class, spatially excluded by the distance threshold.
        super().__init__(
            _sample_selection.distanceCV,
            distance_matrix=distance_matrix,
            distance_thresold=distance_thresold,
            distance_label=False,
            valid_size=1,
            n_repeats=n_repeats,
            n_splits=n_splits,
            random_state=random_state,
            verbose=verbose,
            **kwargs)
class RandomStratifiedKFold(_sample_selection._cv_manager):
    """
    Cross-validation generator with full random selection and Stratified
    K-Fold (same percentage per class).

    Parameters
    ----------
    n_splits : int, optional (default=2).
        Number of splits. 2 means 50% for each class at training and validation.
    n_repeats : integer or False, optional (default=False)
        If False, will repeat n_splits once.
    valid_size : int or False, optional (default=False).
        If False, valid size is ``1 / n_splits``.
    random_state : integer or None, optional (default=False).
        If int, random_state is the seed used by the random number generator;
        If None, the random number generator is created with ``time.time()``.
    verbose : integer or False, optional (default=False).
        Controls the verbosity: the higher the value is, the more the messages are detailed.

    Example
    -------
    >>> from museotoolbox.cross_validation import RandomStratifiedKFold
    >>> from museotoolbox import datasets
    >>> X,y = datasets.load_historical_data(return_X_y=True)
    >>> RSK = RandomStratifiedKFold(n_splits=2,random_state=12,verbose=False)
    >>> for tr,vl in RSK.split(X=X,y=y):
    ...     print(tr,vl)
    [ 1600  1601  1605 ...,  9509  9561 10322] [ 3632  1988 11480 ..., 10321  9457  9508]
    [ 1599  1602  1603 ...,  9508  9560 10321] [ 3948 10928  3490 ..., 10322  9458  9561]
    """

    def __init__(self, n_splits=2, n_repeats=False, valid_size=False,
                 random_state=False, verbose=False):
        if valid_size is False:
            # Equal share per fold when no explicit validation size is given.
            valid_size = 1 / n_splits
        # Total iterations: n_splits folds, repeated n_repeats times when set.
        # (False == 0, so a single equality test covers both sentinel values.)
        total_repeats = n_splits if n_repeats == 0 else n_splits * n_repeats
        super().__init__(
            _sample_selection.randomPerClass,
            valid_size=valid_size,
            random_state=random_state,
            n_repeats=total_repeats,
            verbose=verbose)