Source code for protopipe.mva.train_model

import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV

from .utils import split_train_test

__all__ = ["TrainModel"]


class TrainModel(object):
    """
    Train classification or regressor model.

    Parameters
    ----------
    case: str
        Kind of model to train, either ``"regressor"`` or ``"classifier"``
    feature_name_list: list
        List of features
    target_name: str, optional
        Regression target

    Attributes
    ----------
    data_train, data_test:
        Train/test samples, filled by `split_data` (``None`` before)
    data_scikit: dict or None
        Arrays handed to scikit-learn, filled by `split_data` with the keys
        ``X_train``, ``X_test``, ``y_train``, ``y_test`` and ``w_train``
    """

    # Values of ``case`` that `split_data` knows how to handle.
    _SUPPORTED_CASES = ("regressor", "classifier")

    def __init__(self, case, feature_name_list, target_name=None):
        self.case = case
        self.target_name = target_name
        self.feature_name_list = feature_name_list
        self.data_test = None
        self.data_train = None
        self.data_scikit = None

    @staticmethod
    def _stack_head(sig, bkg, n_max):
        """Concatenate the first ``n_max`` rows of ``sig`` and of ``bkg``.

        ``n_max=None`` keeps every row of both inputs. Signal rows come
        first, background rows second, and the original index labels are
        preserved.
        """
        return pd.concat([sig.iloc[:n_max], bkg.iloc[:n_max]])

    def _split(self, data, train_fraction):
        """Run `split_train_test` on ``data`` with this model's features/target."""
        return split_train_test(
            ds=data,
            train_fraction=train_fraction,
            feature_name_list=self.feature_name_list,
            target_name=self.target_name,
        )

    def split_data(
        self, data_sig, train_fraction, data_bkg=None, force_same_nsig_nbkg=False
    ):
        """
        Load and split data to build train/test samples.

        The result is stored in ``self.data_train``, ``self.data_test`` and
        ``self.data_scikit``. Training weights are uniform and normalised so
        that they sum to one.

        Parameters
        ----------
        data_sig: `~pandas.DataFrame`
            Data frame (the only sample used for a regressor, the signal
            sample for a classifier)
        train_fraction: float
            Fraction of events to build the training sample
        data_bkg: `~pandas.DataFrame`
            Data frame of background events, required for a classifier
        force_same_nsig_nbkg: bool
            If true, the same number of signal and bkg events will be used
            to build a classifier

        Raises
        ------
        ValueError
            If ``self.case`` is not a supported case, or if a classifier is
            requested without ``data_bkg``.
        """
        # Validate up front so that a bad configuration fails with a clear
        # message instead of an unbound-variable error further down.
        if self.case not in self._SUPPORTED_CASES:
            raise ValueError(
                "Unknown case {!r}: expected one of {}".format(
                    self.case, self._SUPPORTED_CASES
                )
            )
        if self.case == "classifier" and data_bkg is None:
            raise ValueError("data_bkg is required when case is 'classifier'")

        if self.case == "regressor":
            (
                X_train,
                X_test,
                y_train,
                y_test,
                self.data_train,
                self.data_test,
            ) = self._split(data_sig, train_fraction)
        else:
            # Signal is split before background, as in the original code, in
            # case the helper's result depends on call order.
            (
                X_train_sig,
                X_test_sig,
                y_train_sig,
                y_test_sig,
                data_train_sig,
                data_test_sig,
            ) = self._split(data_sig, train_fraction)
            (
                X_train_bkg,
                X_test_bkg,
                y_train_bkg,
                y_test_bkg,
                data_train_bkg,
                data_test_bkg,
            ) = self._split(data_bkg, train_fraction)

            # ``None`` means "no cap". A -1 sentinel must not be used here:
            # as a slice end it would silently drop the last event.
            if force_same_nsig_nbkg:
                n_train = min(len(X_train_sig), len(X_train_bkg))
                n_test = min(len(X_test_sig), len(X_test_bkg))
            else:
                n_train = None
                n_test = None

            X_train = self._stack_head(X_train_sig, X_train_bkg, n_train)
            y_train = self._stack_head(y_train_sig, y_train_bkg, n_train)
            self.data_train = self._stack_head(
                data_train_sig, data_train_bkg, n_train
            )

            X_test = self._stack_head(X_test_sig, X_test_bkg, n_test)
            y_test = self._stack_head(y_test_sig, y_test_bkg, n_test)
            self.data_test = self._stack_head(data_test_sig, data_test_bkg, n_test)

        # One weight per training row, normalised to unit sum.
        weight = np.ones(len(X_train))
        weight_train = weight / weight.sum()

        self.data_scikit = {
            "X_train": X_train.values,
            "X_test": X_test.values,
            "y_train": y_train.values,
            "y_test": y_test.values,
            "w_train": weight_train,
        }

    def get_optimal_model(self, init_model, tuned_parameters, scoring, cv):
        """
        Get optimal hyperparameters and returns best model.

        A grid search is fitted on the training sample prepared by
        `split_data`, using the stored training weights; the best parameters
        and the score of every grid point are printed.

        Parameters
        ----------
        init_model: `~sklearn.base.BaseEstimator`
            Model to optimise
        tuned_parameters: dict
            Contains parameter names and ranges to optimise on
        scoring: str
            Scoring metric, forwarded to `~sklearn.model_selection.GridSearchCV`
        cv: int
            Number of splits for cross-validation

        Returns
        -------
        best_estimator: `~sklearn.base.BaseEstimator`
            Best model

        Raises
        ------
        RuntimeError
            If `split_data` has not been called yet.
        """
        if self.data_scikit is None:
            raise RuntimeError(
                "No training data available: call split_data() before "
                "get_optimal_model()."
            )

        model = GridSearchCV(init_model, tuned_parameters, scoring=scoring, cv=cv)
        model.fit(
            self.data_scikit["X_train"],
            self.data_scikit["y_train"],
            sample_weight=self.data_scikit["w_train"],
        )

        print("Best parameters set found on development set:")
        for key, value in model.best_params_.items():
            print(" - {}: {}".format(key, value))

        print("Grid scores on development set:")
        results = model.cv_results_
        for mean, std, params in zip(
            results["mean_test_score"], results["std_test_score"], results["params"]
        ):
            # The spread is reported as two standard deviations.
            print(" - {:.3f}+/-{:.3f} for {}".format(mean, std * 2, params))

        return model.best_estimator_