from sklearn.pipeline import Pipeline

pipe = Pipeline(
    [
        ("preprocessor", preprocessing_pipeline),
        ("model", None),  # set None as a placeholder
    ],
    memory="cache_name",  # cache results, especially useful for grid search
)
from sklearn.model_selection import RandomizedSearchCV

hyperparameter_tuning_search = RandomizedSearchCV(
    pipe,
    param_distributions=hyperparameter_tuning_params,
    n_iter=10,  # only for random search
    cv=inner_cv,  # e.g. RepeatedKFold
    refit=False,  # True refits the best model on the entire dataset at the end; pointless if you only want CV results
    n_jobs=-1,
    # memory="hyperparameter_tuning"  # caching; do not use for random search
)
hyperparameter_tuning_search.fit(X_train_inner_val, y_train_inner_val)
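With refit=False the search only produces cross-validation results. A minimal sketch of inspecting them afterwards; cv_results_ and its mean_test_score / std_test_score / rank_test_score columns are scikit-learn's standard names:

import pandas as pd

# one row per sampled candidate, with per-fold and aggregated scores
results = pd.DataFrame(hyperparameter_tuning_search.cv_results_)
(
    results[["params", "mean_test_score", "std_test_score", "rank_test_score"]]
    .sort_values("rank_test_score")
    .head()
)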
Note: scikit-learn's built-in cross-validation estimators are faster than wrapping the plain model in a cross-validated search, mainly:

- RidgeCV
- LassoCV
- LogisticRegressionCV
import numpy as np
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import GridSearchCV

alphas = np.logspace(-2, 2, num=10, base=10)

# as fast as Ridge for a single alpha
model = RidgeCV(alphas=alphas)

# you could also nest RidgeCV inside GridSearchCV, passing the list of alphas as a single grid item
model = GridSearchCV(
    RidgeCV(),
    param_grid=dict(alphas=[alphas]),
)
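To make the speed claim concrete, a rough timing sketch on synthetic data (the dataset shape and fold count are illustrative assumptions; exact timings will vary by machine):

import time

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.model_selection import GridSearchCV

X, y = make_regression(n_samples=2000, n_features=50, random_state=0)
alphas = np.logspace(-2, 2, num=10)

start = time.perf_counter()
RidgeCV(alphas=alphas).fit(X, y)  # efficient leave-one-out CV in a single pass
print(f"RidgeCV: {time.perf_counter() - start:.3f}s")

start = time.perf_counter()
GridSearchCV(Ridge(), param_grid={"alpha": alphas}, cv=5).fit(X, y)  # one refit per fold per alpha
print(f"GridSearchCV: {time.perf_counter() - start:.3f}s")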
You could also repeat the grid/random search itself with different random seeds. This is not strictly required: the seed only matters for random search (grid search is deterministic), and each repetition ends up selecting a different estimator, which is not the goal of model evaluation.
from sklearn.base import clone


class RepeatedSearchCV:
    """Repeat a grid/random search with several random seeds, so the result
    is not tied to a single seed of the randomized estimator(s)."""

    def __init__(self, search, randomized_steps=None, n_repeats=3, random_state=0):
        self.search = search
        self.randomized_steps = randomized_steps
        self.random_state = random_state
        self.n_repeats = n_repeats

        search_params = self.search.get_params()
        # the wrapped search holds either `param_grid` or `param_distributions`
        self.param_key_ = next(
            key for key in search_params if key in ("param_grid", "param_distributions")
        )
        previous_params = search_params[self.param_key_]

        if randomized_steps is None:
            # plain estimator: expose its random_state directly in the candidate space,
            # but only when the estimator actually has that parameter
            updated_params = dict(previous_params)
            if "random_state" in search_params["estimator"].get_params():
                updated_params["random_state"] = [
                    self.random_state + i for i in range(n_repeats)
                ]
        else:
            # pipeline: duplicate every candidate grid once per seed and seed
            # each randomized step through its step-prefixed parameter
            if isinstance(previous_params, dict):
                previous_params = [previous_params]
            randomized = [
                step_id
                for step_id, step_estimator in search_params["estimator"].steps
                if step_id in self.randomized_steps
                and step_estimator is not None
                and "random_state" in step_estimator.get_params()
            ]
            updated_params = []
            for previous_param in previous_params:
                for i in range(n_repeats):
                    temp = dict(previous_param)  # copy; do not mutate the original
                    for step_id in randomized:
                        temp[f"{step_id}__random_state"] = [self.random_state + i]
                    updated_params.append(temp)
        self.updated_params = updated_params

    def fit(self, X, y):
        self.search_ = clone(self.search)
        updated_kwargs = {self.param_key_: self.updated_params}
        search_params = self.search_.get_params()
        if "n_iter" in search_params:  # only RandomizedSearchCV has a sampling budget
            n_groups = len(self.randomized_steps) if self.randomized_steps else 1
            updated_kwargs["n_iter"] = search_params["n_iter"] * self.n_repeats * n_groups
        self.search_.set_params(**updated_kwargs)
        self.search_.fit(X, y)
        # re-expose the fitted attributes (best_estimator_, cv_results_, refit_time_, ...)
        for key, value in self.search_.__dict__.items():
            if key.endswith("_"):
                setattr(self, key, value)
        return self

    def predict(self, X):
        return self.search_.predict(X)

    def score(self, X, y):
        # delegate to the fitted inner search; NestedCV below relies on this
        return self.search_.score(X, y)
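A usage sketch for the class above with a plain (non-pipeline) estimator; X_train and y_train are assumed to come from an earlier train/test split:

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

search = RepeatedSearchCV(
    RandomizedSearchCV(
        RandomForestClassifier(),
        param_distributions=dict(n_estimators=np.arange(2, 10)),
        n_iter=5,
        cv=5,
        n_jobs=-1,
    ),
    n_repeats=3,  # the candidate space gains a random_state axis with 3 seeds
    random_state=0,
)
search.fit(X_train, y_train)
search.best_estimator_  # copied over from the fitted inner search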
import time

import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils.validation import check_is_fitted, check_array, check_X_y


class NestedCV:
    def __init__(self, pipeline, params, inner_cv, outer_cv, n_repeats_fit, scoring,
                 hyperparameter_tuning_niter, min_inner_size=None, random_state=0, refit=True):
        self.pipeline = pipeline
        self.params = params
        self.inner_cv = inner_cv
        self.outer_cv = outer_cv
        self.n_repeats_fit = n_repeats_fit
        self.scoring = scoring
        self.hyperparameter_tuning_niter = hyperparameter_tuning_niter
        self.refit = refit
        self.random_state = random_state
        self.min_inner_size = min_inner_size

    def fit(self, X, y):
        check_X_y(X, y)
        best_hyperparameters = []
        validation_metrics = []
        train_metrics = []
        outer_fold = []
        model_classes = []
        tested_params = []
        train_durations = []
        inference_durations = []
        for i, (outer_train_idx, outer_valid_idx) in enumerate(self.outer_cv.split(X, y)):
            # sanity-check the inner folds before spending time on the search
            for inner_train_idx, inner_valid_idx in self.inner_cv.split(X[outer_train_idx], y[outer_train_idx]):
                if self.min_inner_size is not None and inner_valid_idx.shape[0] < self.min_inner_size:
                    raise ValueError("Not enough samples in the inner validation fold")
            for param in self.params:
                # inner loop: tune hyperparameters on the outer training fold
                hyperparameter_tuning_search = RepeatedSearchCV(
                    RandomizedSearchCV(
                        self.pipeline,
                        param_distributions=param,
                        n_iter=self.hyperparameter_tuning_niter,  # only for random search
                        cv=self.inner_cv,  # splits into (inner_train_idx, inner_valid_idx)
                        refit=True,  # refit the best model on the whole outer training fold; required for nested CV
                        n_jobs=-1,
                        scoring=self.scoring,
                        random_state=self.random_state + i,
                    ),
                    n_repeats=self.n_repeats_fit,
                    random_state=self.random_state,
                )
                hyperparameter_tuning_search.fit(X[outer_train_idx], y[outer_train_idx])
                # outer loop: score the refitted best model on the held-out outer fold
                outer_fold.append(i + 1)
                best_hyperparameter = hyperparameter_tuning_search.best_estimator_
                best_hyperparameters.append(best_hyperparameter)
                tested_params.append(param)
                model_class = best_hyperparameter.steps[-1][1].__class__.__name__
                model_classes.append(model_class)
                train_duration = hyperparameter_tuning_search.refit_time_ / outer_train_idx.shape[0]
                train_durations.append(train_duration)
                train_metric = hyperparameter_tuning_search.score(X[outer_train_idx], y[outer_train_idx])
                train_metrics.append(train_metric)
                inference_start_time = time.time()
                validation_metric = hyperparameter_tuning_search.score(X[outer_valid_idx], y[outer_valid_idx])
                inference_end_time = time.time()
                validation_metrics.append(validation_metric)
                inference_duration = (inference_end_time - inference_start_time) / outer_valid_idx.shape[0]
                inference_durations.append(inference_duration)

        df = pd.DataFrame().assign(
            outer_fold=outer_fold,
            model_class=model_classes,
            model=best_hyperparameters,
            tested_params=tested_params,
            train_metrics=train_metrics,
            validation_metrics=validation_metrics,
            train_duration_per_row=train_durations,
            inference_duration_per_row=inference_durations,
        )

        def my_func(x, statistics=["mean", "std"]):
            temp = x.agg(statistics)
            return temp.iloc[0].round(4).astype(str) + " ± " + temp.iloc[1].round(4).astype(str)

        summary = (
            df
            .groupby("model_class")[
                ["train_metrics", "validation_metrics",
                 "train_duration_per_row", "inference_duration_per_row"]
            ]
            .agg(my_func)
        )
        self.cv_results_ = summary.to_dict()

        if self.refit:
            # pick the model class with the best mean validation score,
            # then rerun the hyperparameter search on the full dataset
            best_model_class = df.groupby("model_class")["validation_metrics"].mean().idxmax()
            best_row = df[df["model_class"] == best_model_class].iloc[0]
            best_model = best_row["model"]
            best_params_search = best_row["tested_params"]
            best_model_hyperparameter_search_ = RandomizedSearchCV(
                best_model,
                param_distributions=best_params_search,
                n_iter=self.hyperparameter_tuning_niter,  # only for random search
                cv=self.inner_cv,
                refit=True,  # refit the best model on the entire dataset at the end
                n_jobs=-1,
                scoring=self.scoring,
                random_state=self.random_state,
                # memory="hyperparameter_tuning"  # caching; do not use for random search
            )
            best_model_hyperparameter_search_.fit(X, y)
            # re-expose the fitted attributes (best_params_, best_estimator_, ...)
            for key, value in best_model_hyperparameter_search_.__dict__.items():
                if key.endswith("_"):
                    setattr(self, key, value)
        return self
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier

pipeline = Pipeline(
    [
        ("model", None),  # set None as a placeholder; filled in by the search
    ],
    # memory="cache_name",  # cache results, especially useful for grid search
)

params = [
    dict(
        model=[RandomForestClassifier()],
        model__n_estimators=np.arange(2, 10),
    ),
    dict(
        model=[HistGradientBoostingClassifier()],
        model__max_iter=np.arange(2, 10),
    ),
]
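Putting it together, a usage sketch of NestedCV with the placeholder pipeline and params above; the synthetic dataset and CV settings are illustrative assumptions:

from sklearn.datasets import make_classification
from sklearn.model_selection import RepeatedStratifiedKFold

X, y = make_classification(n_samples=300, random_state=0)

nested_cv = NestedCV(
    pipeline=pipeline,
    params=params,
    inner_cv=RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=0),
    outer_cv=RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=1),
    n_repeats_fit=3,
    scoring="accuracy",
    hyperparameter_tuning_niter=5,
)
nested_cv.fit(X, y)
nested_cv.cv_results_      # mean ± std of metrics and per-row timings, per model class
nested_cv.best_estimator_  # refit on the full dataset (refit=True)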