#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# =============================================================================
# ___  ___                       _____           _______
# |  \/  |                      |_   _|         | | ___ \
# | .  . |_   _ ___  ___  ___     | | ___   ___ | | |_/ / _____  __
# | |\/| | | | / __|/ _ \/ _ \    | |/ _ \ / _ \| | ___ \/ _ \ \/ /
# | |  | | |_| \__ \  __/ (_) |   | | (_) | (_) | | |_/ / (_) >  <
# \_|  |_/\__,_|___/\___|\___/    \_/\___/ \___/|_\____/ \___/_/\_\
#
# @author:  Nicolas Karasiak
# @site:    www.karasiak.net
# @git:     www.github.com/nkarasiak/MuseoToolBox
# =============================================================================
"""
The :mod:`museotoolbox.ai` module gathers artificial intelligence tools.
"""
from joblib import Parallel, delayed
import os
import numpy as np
from sklearn import metrics
from sklearn.base import clone
from ..processing import RasterMath, get_gdt_from_minmax_values, convert_dt, _reshape_ndim
from ..internal_tools import ProgressBar, push_feedback


class SuperLearner:
    def __init__(self, classifier, param_grid=None, n_jobs=1, verbose=False):
        """
        SuperLearner, short name for Supervised Learning, eases the way to
        learn a model via an array or a raster using a scikit-learn algorithm.

        After learning a model via :func:`fit`, you can predict via
        :func:`predict_image` or :func:`predict_array`.

        Parameters
        ----------
        classifier : algorithm compatible with scikit-learn.
            For example ``RandomForestClassifier(n_estimators=100)`` from
            ``from sklearn.ensemble import RandomForestClassifier``.
        param_grid : None or dict, optional (default=None).
            param_grid for the grid search. E.g. for RandomForestClassifier :
            ``param_grid=dict(n_estimators=[10,100],max_features=[1,3])``.
        n_jobs : int, optional (default=1).
            Number of cores to be used by ``sklearn`` in the grid search.
        verbose : bool or int, optional (default=False).
            The higher the value, the more information is displayed
            during fitting.

        Examples
        --------
        >>> import museotoolbox as mtb
        >>> from sklearn.ensemble import RandomForestClassifier
        >>> X, y = mtb.datasets.load_historical_data(return_X_y=True)
        >>> RS50 = mtb.cross_validation.RandomStratifiedKFold(n_splits=2,
        ...     n_repeats=5, random_state=12, verbose=False)
        >>> classifier = RandomForestClassifier()
        >>> SL = mtb.ai.SuperLearner(verbose=True, classifier=classifier,
        ...     param_grid=dict(n_estimators=[100, 200]))
        >>> SL.fit(X, y, cv=RS50)
        Fitting 10 folds for each of 2 candidates, totalling 20 fits
        best score : 0.966244859222
        best n_estimators : 200
        >>> for kappa in SL.get_stats_from_cv(confusion_matrix=False, kappa=True):
        ...     print(kappa)
        [Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
        {'kappa': 0.94145803865870303}
        {'kappa': 0.94275572196698443}
        {'kappa': 0.94566553229314054}
        {'kappa': 0.94210064101370472}
        {'kappa': 0.94566137634353153}
        {'kappa': 0.94085890364956737}
        {'kappa': 0.94136385707385184}
        {'kappa': 0.9383201352573155}
        {'kappa': 0.93887726891376944}
        {'kappa': 0.94450020549861891}
        [Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    8.7s finished
        >>> SL.predict_image(raster, '/tmp/classification.tif')
        Total number of blocks : 15
        Prediction... [########################################]100%
        Saved /tmp/classification.tif using function predictArray
        """
        self.n_jobs = n_jobs
        self.verbose = verbose
        if self.verbose <= 1 or self.verbose is False:
            self.verbose_gridsearch = 0
        else:
            self.verbose_gridsearch = self.verbose - 1

        self.classifier = classifier
        self.param_grid = param_grid

        self.xFunction = False
        self.standardize = False
        self._is_standardized = False
        self._array_is_customized = False
        self.xKwargs = {}
        self.CV = False
        self.cloneModel = False
    def standardize_array(self, X=None):
        """
        Scale X data using StandardScaler from ``sklearn``.
        If X is None, only initialize the StandardScaler.

        Parameters
        ----------
        X : np.ndarray or None, optional (default=None).
            The array to scale.
        """
        from sklearn.preprocessing import StandardScaler
        try:
            self.StandardScaler
        except BaseException:
            self.StandardScaler = StandardScaler()

        if X is not None:
            if self._is_standardized is False:
                self.StandardScaler.fit(X)
                self._is_standardized = True

            Xt = self.StandardScaler.transform(X)
            return Xt
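    # A minimal usage sketch (hypothetical arrays): the scaler is fitted on
    # the first array it receives, then only transforms on later calls.
    # >>> SL = SuperLearner(classifier=RandomForestClassifier())
    # >>> Xt = SL.standardize_array(X)       # fits the StandardScaler, then transforms
    # >>> Xt_new = SL.standardize_array(X2)  # reuses the already fitted scaler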
    def fit(self, X, y, group=None, standardize=True, cv=None,
            scoring='accuracy', refit=True, **gridSearchCVParams):
        """
        Fit model from array.

        Parameters
        ----------
        X : np.ndarray.
            Array with values of each label variable.
        y : np.ndarray.
            Array with labels only.
        group : np.ndarray or None, optional (default=None).
            Groups to use with a cross-validation which needs group-splitting.
        standardize : bool, optional (default=True).
            If True, will standardize features by removing the mean and
            scaling to unit variance.
        cv : Cross-Validation, int or None, optional (default=None).
            If a cross-validation object, choose one from
            :mod:`museotoolbox.cross_validation`.
            If int, uses
            :class:`museotoolbox.cross_validation.RandomStratifiedKFold`
            with K equal to the given value.
        scoring : str or class, optional (default='accuracy').
            Scorer used by the grid search.
        refit : bool, optional (default=True).
            Passed to ``GridSearchCV``.
        """
        self.y = y
        self.group = group

        if self._array_is_customized:
            X = self.xFunction(X, **self.xKwargs)
            X = _reshape_ndim(X)
        self.X = X
        self.standardize = standardize

        if standardize:
            self.standardize_array()
            self.X = self.standardize_array(X)

        self._fit(self.X, self.y, self.group, self.classifier,
                  self.param_grid, cv, scoring, refit, **gridSearchCVParams)
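    # A minimal sketch, assuming X and y are numpy arrays: with cv=False and
    # param_grid=False the classifier is fitted directly, without grid search.
    # >>> SL = SuperLearner(classifier=RandomForestClassifier(), param_grid=False)
    # >>> SL.fit(X, y, cv=False)
    # With a dict param_grid and an int cv, a grid-searched
    # RandomStratifiedKFold is used instead (see the class docstring example).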
    def _fit(self, X, y, groups, classifier, param_grid, cv,
             scoring='accuracy', refit=True, **gridSearchCVParams):
        if isinstance(cv, int) and cv:
            from ..cross_validation import RandomStratifiedKFold
            cv = RandomStratifiedKFold(n_splits=cv)

        if cv is not None and cv is not False:
            # store each fold as a (train, valid) tuple
            if isinstance(cv, list):
                self.CV = cv
            else:
                self.CV = []
                if groups is None:
                    for tr, vl in (fold for fold in cv.split(X, y)
                                   if fold is not None):
                        self.CV.append((tr, vl))
                else:
                    for tr, vl in (fold for fold in cv.split(X, y, groups)
                                   if fold is not None):
                        self.CV.append((tr, vl))

        from sklearn.model_selection import GridSearchCV
        if isinstance(param_grid, dict) and cv is not False:
            if gridSearchCVParams == {}:
                self.model = GridSearchCV(
                    self.classifier,
                    param_grid=param_grid,
                    cv=cv,
                    scoring=scoring,
                    refit=refit,
                    n_jobs=self.n_jobs,
                    verbose=self.verbose)
            else:
                self.model = GridSearchCV(
                    self.classifier,
                    param_grid=param_grid,
                    cv=cv,
                    scoring=scoring,
                    refit=refit,
                    n_jobs=self.n_jobs,
                    verbose=self.verbose,
                    **gridSearchCVParams)
            if groups is None:
                self.model.fit(X, y)
            else:
                self.model.fit(X, y, groups=groups)
            self.cloneModel = clone(self.model.best_estimator_)

            if self.verbose:
                push_feedback('best score : ' + str(self.model.best_score_))
                for key in self.param_grid.keys():
                    message = 'best ' + key + ' : ' + \
                        str(self.model.best_params_[key])
                    push_feedback(message)
        else:
            if cv is not False or param_grid is not False:
                raise ValueError(
                    'Cannot fit the model because cv or param_grid is given '
                    'while the other one is missing. If you want to fit your '
                    'model without a param_grid, please set cv=False and '
                    'param_grid=False.')
            if groups is None:
                self.model = self.classifier.fit(X, y)
            else:
                self.model = self.classifier.fit(X, y, groups)
    def save_model(self, path):
        """
        Save the model (e.g. 'myModel.npz') to be loaded later via
        :func:`SuperLearner.load_model`.

        Parameters
        ----------
        path : str.
            If path does not end with 'npz', the '.npz' extension will be
            appended to the filename.

        Returns
        -------
        path : str.
            Path and filename with the npz extension.
        """
        if not path.endswith('npz'):
            path += '.npz'
        np.savez_compressed(path, SL=self.__dict__)

        return path
    def load_model(self, path):
        """
        Load a model previously saved with :func:`SuperLearner.save_model`.

        Parameters
        ----------
        path : str.
            If path does not end with 'npz', the '.npz' extension will be
            appended to the filename.
        """
        if not path.endswith('npz'):
            path += '.npz'
        model = np.load(path, allow_pickle=True)
        self.__dict__.update(model['SL'].tolist())
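    # A save/load round-trip sketch (hypothetical path): the whole estimator
    # state is stored in a compressed numpy archive and restored on load.
    # >>> SL.save_model('/tmp/myModel')        # returns '/tmp/myModel.npz'
    # >>> SL2 = SuperLearner(classifier=None)  # classifier is restored on load
    # >>> SL2.load_model('/tmp/myModel.npz')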
    def _convert_array(self, X):
        if self._array_is_customized is True:
            X = self.xFunction(X, **self.xKwargs)

        if self.standardize:
            if np.ma.is_masked(X):
                if X.mask.ndim == 1:
                    X = X.reshape(-1, 1)
                tmpMask = X.mask[:, 0]
            X = _reshape_ndim(X)
            X = self.StandardScaler.transform(X)

            if np.ma.is_masked(X):
                tmpMask = np.repeat(
                    tmpMask.reshape(-1, 1), X.shape[-1], axis=1)
                X = np.ma.masked_array(X, tmpMask)

        X = _reshape_ndim(X)

        return X
    def predict_array(self, X):
        """
        Predict labels from an array.

        Parameters
        ----------
        X : np.ndarray (n_size,).
            The array to predict. Must have the same number of bands as the
            initial array/raster.

        Returns
        -------
        Xpredict : np.ndarray (n_size,).
            The predicted labels.
        """
        X = self._convert_array(X)

        self.Xpredict = self.model.predict(X)
        return self.Xpredict
    def predict_confidence_per_class(self, X):
        """
        Predict confidence for each class.

        Parameters
        ----------
        X : np.ndarray.
            The array to predict proba. Must have the same number of bands as
            the initial array/raster.

        Returns
        -------
        Xpredict_proba : np.ndarray (n_size, n_class).
            The probability from 0 to 100.
        """
        X = self._convert_array(X)
        Xpredict_proba = self.model.predict_proba(X) * 100

        self.Xpredict_proba = Xpredict_proba
        return Xpredict_proba
    def predict_higher_confidence(self, X):
        """
        Get the confidence of the predicted label.

        Parameters
        ----------
        X : np.ndarray.
            The array to predict proba. Must have the same number of bands as
            the initial array/raster.

        Returns
        -------
        Xpredict_proba : np.ndarray (n_size,).
            The probability from 0 to 100.
        """
        Xpredict_proba = np.amax(
            self.model.predict_proba(self._convert_array(X)) * 100, axis=1)
        return Xpredict_proba
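    # Relationship sketch: predict_higher_confidence is the per-sample maximum
    # of predict_confidence_per_class, i.e. the confidence of the winning label.
    # >>> proba = SL.predict_confidence_per_class(X)  # shape (n_size, n_class)
    # >>> top = SL.predict_higher_confidence(X)       # equals np.amax(proba, axis=1)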
    def predict_image(self, in_image, out_image, confidence_per_class=False,
                      higher_confidence=False, in_image_mask=False,
                      out_nodata=0, compress=True, n_jobs=1):
        """
        Predict labels from a raster using the previously learned model.
        This function will call :func:`predict_array` under the hood.

        Parameters
        ----------
        in_image : str.
            A filename or path of a raster file.
            It can be any file that GDAL can open.
        out_image : str.
            A geotiff extension filename corresponding to the raster image to
            create.
        confidence_per_class : str or False, optional (default=False).
            A path to a geotiff extension filename to store each confidence
            per class (one band = one label).
        higher_confidence : str or False, optional (default=False).
            A path to a geotiff extension filename to store the max confidence
            from all classes.
        in_image_mask : str or False, optional (default=False).
            Path of the raster where 0 is masked and values above are unmasked.
        out_nodata : int, optional (default=0).
            Value of no data, only for the out_image.
        compress : bool, optional (default=True).
            If True, the output rasters are compressed.
        """
        rM = RasterMath(in_image, in_image_mask, message='Prediction...')

        # the output datatype is chosen according to the maximum class value
        numpyDT = convert_dt(
            get_gdt_from_minmax_values(np.amax(self.model.classes_)))

        rM.add_function(
            self.predict_array,
            out_image,
            out_n_bands=1,
            out_np_dt=numpyDT,
            out_nodata=out_nodata,
            compress=compress)

        if confidence_per_class:
            rM.add_function(
                self.predict_confidence_per_class,
                confidence_per_class,
                out_n_bands=False,
                out_np_dt=np.int16,
                out_nodata=np.iinfo(np.int16).min,
                compress=compress)

        if higher_confidence:
            rM.add_function(
                self.predict_higher_confidence,
                higher_confidence,
                out_n_bands=1,
                out_np_dt=np.int16,
                out_nodata=np.iinfo(np.int16).min,
                compress=compress)
        rM.run()
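    # A hedged usage sketch (hypothetical file paths): write the classification
    # and both optional confidence rasters in a single pass over the image.
    # >>> SL.predict_image('/tmp/stack.tif', '/tmp/classification.tif',
    # ...                  confidence_per_class='/tmp/confidence_per_class.tif',
    # ...                  higher_confidence='/tmp/higher_confidence.tif')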
    def _get_stats_from_each_cv(self, statsidx, trvl, confusion_matrix=True,
                                kappa=False, OA=False, F1=False, nTrain=False):
        """
        Compute stats for each fold.
        """
        X_train, X_test = self.X[trvl[0]], self.X[trvl[1]]
        Y_train, Y_test = self.y[trvl[0]], self.y[trvl[1]]

        self.cloneModel.fit(X_train, Y_train)

        X_pred = self.cloneModel.predict(X_test)

        accuracies = {}
        if confusion_matrix:
            accuracies['confusion_matrix'] = metrics.confusion_matrix(
                Y_test, X_pred)
        if kappa:
            accuracies['kappa'] = metrics.cohen_kappa_score(Y_test, X_pred)
        if OA:
            accuracies['OA'] = metrics.accuracy_score(Y_test, X_pred)
        if F1:
            accuracies['F1'] = metrics.f1_score(Y_test, X_pred, average=None)
        if nTrain:
            accuracies['nTrain'] = np.unique(Y_train, return_counts=True)[1]

        return accuracies
    def save_cm_from_cv(self, savePath, prefix='', header=True, n_jobs=1):
        """
        Save each confusion matrix (csv format) from the cross-validation.

        For each matrix, the following information can be saved as header :

        - The number of training samples per class,
        - The F1-score per class,
        - The Overall Accuracy,
        - The Kappa.

        Example of a confusion matrix saved as csv::

            # Training samples : 90,80
            # F1 : 91.89,90.32
            # OA : 91.18
            # Kappa : 82.23
            85,5
            10,70

        - **In X (columns)** : prediction (95 predicted labels for class 1).
        - **In Y (lines)** : reference (90 labels from class 1).

        Parameters
        ----------
        savePath : str.
            The path where to save the different csv files.
            If it does not exist, it will be created.
        prefix : str, default ''.
            If given, this prefix is prepended to each csv name (i.e. '0.csv').
        header : bool, default True.
            If True, will save F1, OA, Kappa and the number of training
            samples. If False, will only save the confusion matrix.
        n_jobs : int, default 1.
            Number of jobs used to compute each fold in parallel.

        Returns
        -------
        None

        Examples
        --------
        After having learned with :mod:`museotoolbox.ai.SuperLearner` :

        >>> SL.save_cm_from_cv('/tmp/testMTB/', prefix='RS50_')
        [Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
        [Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    3.4s finished
        >>> np.loadtxt('/tmp/testMTB/RS50_0.csv')
        array([[85,  5],
               [10, 70]])
        """
        def _compute_stats_per_cv(statsidx, trvl, savePath, prefix, header):
            outFile = savePath + '/' + prefix + str(statsidx) + '.csv'
            dictStats = self._get_stats_from_each_cv(
                statsidx, trvl, True, header, header, header, header)

            if header:
                np_header = 'Training samples : ' + ','.join(
                    str(tr) for tr in dictStats['nTrain']) + \
                    '\nF1 : ' + ','.join(
                        str(np.round(f * 100, 2)) for f in dictStats['F1']) + \
                    '\nOA : {}'.format(np.round(dictStats['OA'] * 100, 2)) + \
                    '\nKappa : {}'.format(np.round(dictStats['kappa'] * 100, 2))
            else:
                np_header = ''

            np.savetxt(
                outFile,
                dictStats['confusion_matrix'],
                header=np_header,
                fmt='%0.d')

        if not os.path.exists(savePath):
            os.makedirs(savePath)

        Parallel(n_jobs=n_jobs, verbose=self.verbose + 1)(
            delayed(_compute_stats_per_cv)(statsidx, trvl, savePath, prefix,
                                           header)
            for statsidx, trvl in enumerate(self.CV))
    def get_stats_from_cv(self, confusion_matrix=True, kappa=False, OA=False,
                          F1=False, nTrain=False):
        """
        Extract statistics from the cross-validation.
        If the cross-validation is 5-fold, get_stats_from_cv will return
        5 confusion matrices, 5 kappas...

        Parameters
        ----------
        confusion_matrix : bool, default True.
            If True, each dict contains the confusion matrix.
        kappa : bool, default False.
            If True, each dict contains the kappa.
        OA : bool, default False.
            If True, each dict contains the Overall Accuracy.
        F1 : bool, default False.
            If True, each dict contains the F1-score per class.
        nTrain : bool, default False.
            If True, each dict contains the number of training samples,
            ordered ascendingly per label.

        Returns
        -------
        statsCV : list of dict
            One dictionary per fold with each requested statistic.

        Examples
        --------
        After having learned with :mod:`museotoolbox.ai.SuperLearner` :

        >>> for stats in SL.get_stats_from_cv(confusion_matrix=False, kappa=True):
        ...     stats['kappa']
        0.942560083148
        0.94227598585
        0.942560083148
        ...
        """
        def _computeStatsPerCV(statsidx, trvl, **kwargs):
            dictStats = self._get_stats_from_each_cv(statsidx, trvl, **kwargs)
            return dictStats

        statsCV = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
            delayed(_computeStatsPerCV)(
                statsidx,
                trvl,
                confusion_matrix=confusion_matrix,
                kappa=kappa,
                OA=OA,
                F1=F1,
                nTrain=nTrain)
            for statsidx, trvl in enumerate(self.CV))

        return statsCV
    def customize_array(self, xFunction, **kwargs):
        """
        Register a custom function applied to the array before
        standardization, fitting and prediction.

        Parameters
        ----------
        xFunction : function.
            Function that takes the array as first argument.
        **kwargs :
            Keyword arguments passed to xFunction.
        """
        self._array_is_customized = True
        self.xFunction = xFunction
        self.xKwargs = kwargs
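    # A minimal sketch (hypothetical function): any function taking the array
    # as first argument can be registered; it is applied before
    # standardization, fitting and prediction.
    # >>> SL = SuperLearner(classifier=RandomForestClassifier(),
    # ...                   param_grid=dict(n_estimators=[100]))
    # >>> def keep_first_bands(X, n=3):
    # ...     return X[:, :n]
    # >>> SL.customize_array(keep_first_bands, n=3)
    # >>> SL.fit(X, y, cv=2)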
class SequentialFeatureSelection:
    """
    Sequential Feature Selection.

    Parameters
    ----------
    classifier : class.
        Classifier from scikit-learn.
    param_grid : np.ndarray.
        param_grid for the hyperparameters of the classifier.
    path_to_save_models : False or str, optional (default=False).
        If False, will store the best model per combination in memory.
        If str, must be a path where each model and accuracy per feature
        will be saved.
    n_comp : int, optional (default=1).
        The number of components per feature. If 4, each feature has
        4 columns.
    verbose : bool or int, optional (default=False).
        The higher the value, the more information the selection
        will display.
    """
    def __init__(self, classifier, param_grid, path_to_save_models=False,
                 n_comp=1, verbose=False):
        # share args
        self.n_comp = n_comp
        self.classifier = classifier
        self.param_grid = param_grid
        self.verbose = verbose
        if self.verbose < 1:
            self.verbose_gridsearch = 0
        else:
            self.verbose_gridsearch = self.verbose - 1

        self.xFunction = False
        self.xKwargs = False

        self.path_to_save_models = path_to_save_models
    def fit(self, X, y, group=None, cv=5, scoring='accuracy',
            standardize=True, max_features=False, n_jobs=1, **kwargs):
        """
        Fit the sequential forward feature selection.

        Parameters
        ----------
        X : np.ndarray.
            Shape of np.ndarray is (n_size, n_bands).
        y : np.ndarray.
            Size of X.shape[0].
        group : np.ndarray or None, optional (default=None).
            Group for the cross-validation.
        cv : int, or cross_validation method, optional (default=5).
            If int, uses
            :class:`museotoolbox.cross_validation.RandomStratifiedKFold`
            with K equal to the given value.
        scoring : str or class, optional (default='accuracy').
            See sklearn.metrics.make_scorer from scikit-learn.
        standardize : bool, optional (default=True).
            If True, X is standardized before fitting.
        max_features : int or False, optional (default=False).
            If int, the selection stops after this number of features.
            If False, every feature is tested.
        n_jobs : int, optional (default=1).
            Number of jobs to compute the cross-validation.
        """
        self.X = X
        self.X_ = np.copy(X)
        self.y = y
        self.group = group

        self.cv = cv
        self.scoring = scoring

        self.models_path_ = []

        if self.xFunction:
            self.X = self.xFunction(X, **self.xKwargs)
            self.X = _reshape_ndim(self.X)

        xSize = self.X.shape[1]
        self.n_features = int(xSize / self.n_comp)

        self.max_features = self.n_features
        if max_features is not False:
            if max_features < self.n_features:
                self.max_features = max_features

        totalIter = np.sum(
            [self.n_features - i for i in range(self.max_features)])
        if self.verbose:
            pB = ProgressBar(totalIter, message='SFFS:')

        self.mask = np.ones(xSize, dtype=bool)

        self.models_, self.best_scores_, self.best_features_ = [[], [], []]
        self.subsets_ = dict()

        for j in range(self.max_features):
            resPerFeatures = list()
            need_fit = True

            n_features_to_test = int(
                self.X[:, self.mask].shape[1] / self.n_comp)
            if self.path_to_save_models:
                all_scores_file = os.path.join(
                    self.path_to_save_models, 'all_scores_{}.csv'.format(j))
                if os.path.exists(all_scores_file):
                    need_fit = False
                    push_feedback('Feature {} already computed'.format(j))
                    scores = np.loadtxt(all_scores_file, delimiter=',')

                    if scores.ndim == 1:
                        all_scores = [scores[1]]
                        best_candidate = 0
                    else:
                        all_scores = scores[:, 1]
                        best_candidate = np.argmax(scores[:, 1])
                    SL = SuperLearner(
                        classifier=self.classifier,
                        param_grid=self.param_grid,
                        n_jobs=n_jobs,
                        verbose=self.verbose_gridsearch)
                    SL.load_model(
                        os.path.join(
                            self.path_to_save_models,
                            'model_{}.npz'.format(j)))
                    self.models_path_.append(
                        os.path.join(
                            self.path_to_save_models,
                            'model_{}.npz'.format(j)))

            if need_fit is True:
                # at each loop, remove the best candidate
                for idx in range(n_features_to_test):
                    if self.verbose:
                        pB.add_position()
                    SL = SuperLearner(
                        classifier=self.classifier,
                        param_grid=self.param_grid,
                        n_jobs=n_jobs,
                        verbose=self.verbose_gridsearch)
                    curX = self._transform_in_fit(self.X, idx)
                    if standardize is False:
                        scale = False
                    else:
                        scale = True

                    SL.fit(
                        curX,
                        y,
                        group=group,
                        standardize=scale,
                        scoring=self.scoring,
                        cv=self.cv)

                    resPerFeatures.append(SL)

                all_scores = [np.amax(SL.model.best_score_)
                              for SL in resPerFeatures]
                best_candidate = np.argmax(all_scores)
                SL = resPerFeatures[best_candidate]

                if self.path_to_save_models:
                    if not os.path.exists(os.path.join(
                            self.path_to_save_models, str(j))):
                        os.makedirs(os.path.join(
                            self.path_to_save_models, str(j)))
                    SL.save_model(
                        os.path.join(
                            self.path_to_save_models,
                            'model_{}.npz'.format(j)))
                    SL.save_cm_from_cv(
                        os.path.join(
                            self.path_to_save_models, str(j)),
                        n_jobs=n_jobs)

                if self.n_comp == 1:
                    bandidx = np.where(self.mask == 1)[0].reshape(-1, 1)
                else:
                    bandidx = np.arange(
                        0, self.mask.shape[0], self.n_comp).reshape(-1, 1)
                    bandidx = np.int32(
                        bandidx[np.in1d(self.mask[bandidx], 1)] / self.n_comp)

                scoreWithIdx = np.hstack((bandidx, np.asarray(
                    all_scores, dtype=np.float32).reshape(-1, 1)))
                if self.path_to_save_models:
                    np.savetxt(all_scores_file, scoreWithIdx, fmt='%0.d,%.4f')
                    self.models_path_.append(
                        os.path.join(
                            self.path_to_save_models,
                            'model_{}.npz'.format(j)))
                else:
                    self.models_.append(resPerFeatures[best_candidate])

            # store results
            best_feature_id = int(
                self._get_feature_id(best_candidate) / self.n_comp)
            self.best_scores_.append(all_scores[best_candidate])
            self.best_features_.append(best_feature_id)
            self.best_idx_ = np.argmax(self.best_scores_)

            if self.verbose:
                push_feedback(
                    '\nBest feature with %s feature(s) : %s' %
                    (j + 1, best_feature_id))
                push_feedback('Best mean score : %s' % np.amax(all_scores))

            self.subsets_[str(j)] = dict(
                avg_score=np.amax(all_scores),
                feature_idx=self.best_features_.copy(),
                cv_score=SL.model.cv_results_,
                best_score_=np.amax(all_scores),
                best_feature_=best_feature_id)

            self._maskIdx(best_candidate)
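    # An end-to-end sketch (assuming X, y and a scikit-learn classifier):
    # >>> sffs = SequentialFeatureSelection(RandomForestClassifier(),
    # ...                                   param_grid=dict(n_estimators=[100]))
    # >>> sffs.fit(X, y, cv=2, max_features=5)
    # >>> sffs.best_features_  # feature indices, in order of selection
    # >>> sffs.best_scores_    # cross-validated score at each step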
    def predict(self, X, idx):
        """
        Predict from an array using the features of a given combination.

        Parameters
        ----------
        X : np.ndarray.
            The array to predict. Must have the same number of bands as the
            initial array/raster.
        idx : int or 'best'.
            The combination to use (starting from 0).
        """
        self._reset_mask()
        if idx == 'best':
            idx = self.best_idx_
        if self.path_to_save_models is False:
            SL = self.models_[idx]
        else:
            SL = SuperLearner(
                classifier=self.classifier,
                param_grid=self.param_grid,
                n_jobs=1,
                verbose=self.verbose_gridsearch)
            SL.load_model(self.models_path_[idx])
        SL.customize_array(self.transform, idx=idx, customizeX=True)
        return SL.predict_array(X)
    def predict_best_combination(self, in_image, out_image,
                                 in_image_mask=False, higher_confidence=False):
        """
        Predict a raster using the best feature combination.

        Parameters
        ----------
        in_image : str.
            A filename or path of a raster file.
            It can be any file that GDAL can open.
        out_image : str.
            A geotiff extension filename corresponding to the raster image to
            create.
        in_image_mask : str or False, optional (default=False).
            Path to the raster mask where 0 values are masked data.
        higher_confidence : str or False, optional (default=False).
            A path to a geotiff extension filename to store the max confidence
            from all classes.
        """
        self._reset_mask()

        push_feedback('Predict with combination ' + str(self.best_idx_))

        if self.path_to_save_models is False:
            SL = self.models_[self.best_idx_]
        else:
            SL = SuperLearner(classifier=False, n_jobs=1,
                              verbose=self.verbose)
            SL.load_model(self.models_path_[self.best_idx_])

        SL.customize_array(self.transform, idx=self.best_idx_,
                           customizeX=True)
        SL.predict_image(in_image, out_image, in_image_mask=in_image_mask,
                         higher_confidence=higher_confidence)
    def predict_images(self, in_image, out_image_prefix, in_image_mask=False,
                       higher_confidence=False):
        """
        Predict a raster for each best feature subset found with
        SFFS.fit(X, y).

        Parameters
        ----------
        in_image : str.
            Path of the raster to predict.
        out_image_prefix : str.
            Prefix of each raster to save. The iteration number and '.tif'
            are appended as suffix. E.g. with out_image_prefix
            'classification_', the first prediction is saved as
            'classification_0.tif'.
        in_image_mask : str or False, optional (default=False).
            Path to the image mask where 0 values are masked data.
        higher_confidence : str or False, optional (default=False).
            If str, same behavior as out_image_prefix.
        """
        self._reset_mask()

        for idx, model in enumerate(self.models_):
            if self.path_to_save_models is False:
                SL = self.models_[idx]
            else:
                # classifier=False, as the model state is restored on load
                SL = SuperLearner(classifier=False, n_jobs=1,
                                  verbose=self.verbose)
                SL.load_model(model)
            SL.customize_array(self.transform, idx=idx, customizeX=True)
            out_image = out_image_prefix + str(idx) + '.tif'
            SL.predict_image(
                in_image,
                out_image,
                higher_confidence=higher_confidence,
                in_image_mask=in_image_mask)
    def customize_array(self, xFunction, **kwargs):
        """
        Register a custom function applied to the array before fitting
        and prediction.

        Parameters
        ----------
        xFunction : function.
            Function that takes the array as first argument.
        **kwargs :
            Keyword arguments passed to xFunction.
        """
        self.xFunction = xFunction
        self.xKwargs = kwargs
    def _transform_in_fit(self, X, idx=0, customizeX=False):
        mask = np.copy(self.mask)
        # if self.xFunction:
        #     X = self.xFunction(X, **self.xKwargs)

        # if customizeX is False:
        fieldsToKeep = self._convertIdxToNComp(idx)
        mask[fieldsToKeep] = 0
        X = X[:, ~mask]
        if customizeX is True:
            self.mask[self.best_features_[idx]] = 0
            X = X[:, ~self.mask]

        X = _reshape_ndim(X)

        return X
    def transform(self, X, idx=0, customizeX=False):
        """
        Transform X to keep only the features selected at a given step.

        Parameters
        ----------
        X : np.ndarray.
            The array to transform.
        idx : int, or 'best'.
            The step whose selected features are kept in the returned array.
        """
        self._reset_mask()
        self.best_idx_ = np.argmax(self.best_scores_)
        if idx == 'best':
            idx = self.best_idx_

        if self.n_comp > 1:
            for candidate in range(idx + 1):
                if candidate <= idx:
                    idxToMask = [
                        self.best_features_[candidate] * self.n_comp + i
                        for i in range(self.n_comp)]
                    self.mask[idxToMask] = 0
        else:
            self.mask[self.best_features_[:idx + 1]] = 0

        if self.xFunction:
            X = self.xFunction(X, **self.xKwargs)

        # both customizeX branches keep the unmasked columns only
        X = X[:, ~self.mask]

        X = _reshape_ndim(X)

        return X
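    # A minimal sketch: restrict X to the best subset found so far, or to the
    # subset selected at a given step.
    # >>> X_best = sffs.transform(X, idx='best')
    # >>> X_step0 = sffs.transform(X, idx=0)  # only the first selected feature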
    def _get_feature_id(self, candidate):
        """
        Return the original column index of a candidate feature.
        """
        return np.where(self.mask == 1)[0][candidate * self.n_comp]

    def _convertIdxToNComp(self, idx):
        """
        Convert a candidate index to the list of its n_comp column indices.
        """
        idxUnmask = self._get_feature_id(idx)
        n_features_to_get = [idxUnmask + j for j in range(self.n_comp)]

        return n_features_to_get

    def _maskIdx(self, idx):
        """
        Add the idx to the mask.
        """
        self.mask[self._convertIdxToNComp(idx)] = 0

    def _reset_mask(self):
        """
        Reset the mask (all features unmasked).
        """
        self.mask[:] = 1
    def get_best_model(self, clone=False):
        """
        Return the SuperLearner of the best feature combination.
        """
        self.best_idx_ = np.argmax(self.best_scores_)
        if self.path_to_save_models:
            SL = SuperLearner(classifier=None, n_jobs=1,
                              verbose=self.verbose)
            SL.load_model(self.models_path_[self.best_idx_])
        else:
            SL = self.models_[self.best_idx_]
        return SL
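    # A closing usage sketch (hypothetical X_new): sffs.predict() handles the
    # feature masking for you; get_best_model() returns the underlying
    # SuperLearner of the best combination.
    # >>> labels = sffs.predict(X_new, idx='best')
    # >>> best_SL = sffs.get_best_model()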