#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# =============================================================================
# ___ ___ _____ _______
# | \/ | |_ _| | | ___ \
# | . . |_ _ ___ ___ ___ | | ___ ___ | | |_/ / _____ __
# | |\/| | | | / __|/ _ \/ _ \ | |/ _ \ / _ \| | ___ \/ _ \ \/ /
# | | | | |_| \__ \ __/ (_) | | | (_) | (_) | | |_/ / (_) > <
# \_| |_/\__,_|___/\___|\___/ \_/\___/ \___/|_\____/ \___/_/\_\
#
# @author: Nicolas Karasiak
# @site: www.karasiak.net
# @git: www.github.com/nkarasiak/MuseoToolBox
# =============================================================================
"""
The :mod:`museotoolbox.cross_validation` module gathers cross-validation classes.
"""
import numpy as np
from . import _sample_selection
def train_test_split(cv, X, y, random_state=False, **kwargs):
    """
    Split arrays into random train and test subsets according to your chosen cross-validation method.

    Quick utility that wraps input validation and the first fold of
    ``cv.split(X, y)`` and applies it to the input data in a one-liner.

    Parameters
    -----------
    cv : cross-validation function.
        Allowed function from museotoolbox or scikit-learn.
    X : array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, of length n_samples
        The target variable for supervised learning problems.
    random_state : int, RandomState instance or None, optional (default=False)
        Kept for backward compatibility; randomness is controlled by the
        ``cv`` object itself, this argument is currently unused.

    Returns
    -------
    X_train, X_test, y_train, y_test : numpy.ndarray
        The split arrays. If ``groups`` is passed via ``**kwargs`` (and is
        not None), two extra arrays ``g_train, g_test`` are appended.

    Examples
    ---------
    >>> import numpy as np
    >>> import museotoolbox as mtb
    >>> X, y = np.arange(10).reshape((5, 2)), np.arange(5)
    >>> cv = mtb.cross_validation.LeaveOneOut(random_state=42)
    >>> X_train, X_test, y_train, y_test = mtb.cross_validation.train_test_split(cv, X, y)
    """
    if y.ndim == 2:
        y = y.flatten()
    has_groups = False
    # Only the first fold produced by the cross-validator is used.
    for tr, vl in cv.split(X, y, **kwargs):
        X_train = X[tr, ...]
        y_train = y[tr]
        X_test = X[vl, ...]
        y_test = y[vl]
        if 'groups' in kwargs:
            g = kwargs['groups']
            if g is not None:
                has_groups = True
                if g.ndim == 2:
                    # Bug fix: flatten the groups array itself
                    # (previously flattened ``y`` by mistake).
                    g = g.flatten()
                g_train = g[tr, ...]
                g_test = g[vl, ...]
        break  # only the first fold is needed
    if has_groups:
        return X_train, X_test, y_train, y_test, g_train, g_test
    return X_train, X_test, y_train, y_test
class LeaveOneOut(_sample_selection._cv_manager):
    """
    Stratified Leave-One-Out cross-validation generator.

    Note : :class:`~LeaveOneOut` is equivalent to
    :class:`~museotoolbox.cross_validation.RandomStratifiedKFold` with
    ``valid_size=1`` and ``n_splits=False``.

    Parameters
    ----------
    n_repeats : int or bool, optional (default=False).
        If False : will iterate as many times as the smallest class.
        If int : will iterate the given number of times.
    random_state : integer or None, optional (default=False).
        If int, random_state is the seed used by the random number generator;
        If None, the random number generator is created with ``time.time()``.
    verbose : integer or False, optional (default=False).
        Controls the verbosity: the higher the value is, the more the messages are detailed.
    """

    def __init__(self, n_repeats=False, random_state=False, verbose=False):
        # valid_size=1 means exactly one validation sample per class per fold.
        super().__init__(
            _sample_selection.randomPerClass,
            valid_size=1,
            n_repeats=n_repeats,
            random_state=random_state,
            verbose=verbose)
class LeavePSubGroupOut(_sample_selection._cv_manager):
    """
    Generate a Cross-Validation using subgroups (each group belongs to a unique label).

    Parameters
    ----------
    valid_size : float, optional (default=0.5).
        Fraction of subgroups kept for validation; must satisfy
        ``0 < valid_size <= 1``.
    n_repeats : int or bool, optional (default=False).
        If False, n_repeats is ``int(1 / valid_size)`` (default : 1/0.5 = 2).
        If int : will iterate the given number of times.
    random_state : integer or None, optional (default=False).
        If int, random_state is the seed used by the random number generator;
        If None, the random number generator is created with ``time.time()``.
    verbose : integer or False, optional (default=False).
        Controls the verbosity: the higher the value is, the more the messages are detailed.

    Raises
    ------
    ValueError
        If ``valid_size`` is not a float, or not in the interval ``(0, 1]``.
    """

    def __init__(self, valid_size=0.5, n_repeats=False,
                 random_state=False, verbose=False):
        if not isinstance(valid_size, float):
            raise ValueError(
                'Percent must be between 0 and 1 and must be a float')
        # Bug fix: 0.0 previously slipped through the range check and crashed
        # with ZeroDivisionError in int(1 / valid_size) below; it can never
        # yield a validation set, so reject it explicitly.
        if valid_size > 1 or valid_size <= 0:
            raise ValueError('Percent must be between 0 and 1')
        if not n_repeats:
            # Default repeat count covers the whole dataset once,
            # e.g. valid_size=0.5 -> 2 repeats.
            n_repeats = int(1 / valid_size)
        super().__init__(
            _sample_selection.groupCV,
            valid_size=valid_size,
            n_repeats=n_repeats,
            random_state=random_state,
            verbose=verbose)
class LeaveOneSubGroupOut(_sample_selection._cv_manager):
    """
    Cross-validation generator that holds out one subgroup per fold.

    Parameters
    ----------
    n_repeats : int or bool, optional (default=False).
        If False : will iterate as many times as the smallest number of groups.
        If int : will iterate the given number of times.
    random_state : integer or None, optional (default=False).
        If int, random_state is the seed used by the random number generator;
        If None, the random number generator is created with ``time.time()``.
    verbose : integer or False, optional (default=False).
        Controls the verbosity: the higher the value is, the more the messages are detailed.
    """

    def __init__(self, n_repeats=False, random_state=False, verbose=False):
        # valid_size=1 selects exactly one subgroup for validation per fold.
        super().__init__(
            _sample_selection.groupCV,
            valid_size=1,
            n_repeats=n_repeats,
            random_state=random_state,
            verbose=verbose)
class SpatialLeaveAsideOut(_sample_selection._cv_manager):
    """
    Cross-validation generator keeping the farthest distance between the
    training and validation samples.

    Parameters
    ----------
    distance_matrix : numpy.ndarray, shape [n_samples, n_samples].
        Array got from function samplingMethods.getdistance_matrixForDistanceCV(inRaster,inVector)
    valid_size : float, default 0.5.
        The percentage of validation to keep : from 0 to 1.
    n_repeats : int or bool, optional (default=False).
        If False, n_repeats is 1/valid_size (default : 1/0.5 = 2)
        If int : will iterate the number of times given in n_repeats.
    random_state : integer or None, optional (default=False).
        If int, random_state is the seed used by the random number generator;
        If None, the random number generator is created with ``time.time()``.
    verbose : integer or False, optional (default=False).
        Controls the verbosity: the higher the value is, the more the messages are detailed.

    References
    ----------
    See "Combining ensemble modeling and remote sensing for mapping
    individual tree species at high spatial resolution" : https://doi.org/10.1016/j.foreco.2013.07.059.
    """

    def __init__(self, distance_matrix, valid_size=0.5,
                 n_repeats=False, random_state=False, verbose=False):
        # Delegates fold generation to the distance-based sampler.
        super().__init__(
            _sample_selection.distanceCV,
            distance_matrix=distance_matrix,
            valid_size=valid_size,
            n_repeats=n_repeats,
            random_state=random_state,
            verbose=verbose)
class SpatialLeaveOneSubGroupOut(_sample_selection._cv_manager):
    """
    Cross-validation generator using the Spatial Leave-One-SubGroup-Out method.

    Parameters
    ----------
    distance_thresold : int.
        In pixels.
    distance_matrix : numpy.ndarray, shape [n_samples, n_samples].
        Array got from function :func:`museotoolbox.vector_tools.get_distance_matrix`
    distance_label : None or array.
        If array, got from function :func:`museotoolbox.vector_tools.get_distance_matrix`
    random_state : integer or None, optional (default=False).
        If int, random_state is the seed used by the random number generator;
        If None, the random number generator is created with ``time.time()``.
    verbose : integer or False, optional (default=False).
        Controls the verbosity: the higher the value is, the more the messages are detailed.

    See also
    --------
    museotoolbox.processing.get_distance_matrix : to get distance matrix and label.
    """

    def __init__(self, distance_thresold, distance_matrix, distance_label,
                 random_state=False, verbose=False):
        # Distance threshold and per-sample labels drive the spatial exclusion.
        super().__init__(
            _sample_selection.distanceCV,
            distance_matrix=distance_matrix,
            distance_thresold=distance_thresold,
            distance_label=distance_label,
            random_state=random_state,
            verbose=verbose)
class SpatialLeaveOneOut(_sample_selection._cv_manager):
    """
    Cross-validation generator using a stratified spatial Leave-One-Out method.

    Parameters
    ----------
    distance_thresold : int.
        In pixels.
    distance_matrix : numpy.ndarray, shape [n_samples, n_samples].
        Array got from function museotoolbox.vector_tools.get_distance_matrix(inRaster,inVector)
    n_repeats : int or False, optional (default=False).
        If False : will iterate as many times as the smallest number of groups.
        If int : will iterate the number of times specified.
    n_splits : int or False, optional (default=False).
        Forwarded to the underlying sampler.
    random_state : int or False, optional (default=False).
        If int, random_state is the seed used by the random number generator;
        If None, the random number generator is created with ``time.time()``.
    verbose : integer or False, optional (default=False).
        Controls the verbosity: the higher the value is, the more the messages are detailed.

    See also
    ---------
    museotoolbox.vector_tools.get_distance_matrix : to get distance matrix and label.

    References
    ----------
    See "Spatial leave-one-out cross-validation for variable selection in the
    presence of spatial autocorrelation" : https://doi.org/10.1111/geb.12161.
    """

    def __init__(self, distance_thresold=None, distance_matrix=None,
                 n_repeats=False, n_splits=False, random_state=False,
                 verbose=False, **kwargs):
        # distance_label is disabled and valid_size=1: one validation sample
        # per class, spatially excluded by the distance threshold.
        super().__init__(
            _sample_selection.distanceCV,
            distance_matrix=distance_matrix,
            distance_thresold=distance_thresold,
            distance_label=False,
            valid_size=1,
            n_repeats=n_repeats,
            n_splits=n_splits,
            random_state=random_state,
            verbose=verbose,
            **kwargs)
class RandomStratifiedKFold(_sample_selection._cv_manager):
    """
    Cross-validation generator with full random selection and Stratified
    K-Fold (same percentage per class).

    Parameters
    ----------
    n_splits : int, optional (default=2).
        Number of splits. 2 means 50% for each class at training and validation.
    n_repeats : integer or False, optional (default=False)
        If False, will repeat n_splits once.
    valid_size : int or False, optional (default=False).
        If False, valid size is ``1 / n_splits``.
    random_state : integer or None, optional (default=False).
        If int, random_state is the seed used by the random number generator;
        If None, the random number generator is created with ``time.time()``.
    verbose : integer or False, optional (default=False).
        Controls the verbosity: the higher the value is, the more the messages are detailed.

    Example
    -------
    >>> from museotoolbox.cross_validation import RandomStratifiedKFold
    >>> from museotoolbox import datasets
    >>> X,y = datasets.load_historical_data(return_X_y=True)
    >>> RSK = RandomStratifiedKFold(n_splits=2,random_state=12,verbose=False)
    >>> for tr,vl in RSK.split(X=X,y=y):
    ...     print(tr,vl)
    [ 1600  1601  1605 ...,  9509  9561 10322] [ 3632  1988 11480 ..., 10321  9457  9508]
    [ 1599  1602  1603 ...,  9508  9560 10321] [ 3948 10928  3490 ..., 10322  9458  9561]
    """

    def __init__(self, n_splits=2, n_repeats=False, valid_size=False,
                 random_state=False, verbose=False):
        if valid_size is False:
            # Equal share per fold when no explicit validation size is given.
            valid_size = 1 / n_splits
        # Total iterations: n_splits folds, repeated n_repeats times when set.
        # (False == 0, so a single equality test covers both sentinel values.)
        total_repeats = n_splits if n_repeats == 0 else n_splits * n_repeats
        super().__init__(
            _sample_selection.randomPerClass,
            valid_size=valid_size,
            random_state=random_state,
            n_repeats=total_repeats,
            verbose=verbose)