mlr-org/mlrMBO

Possible lack of consistency in xgboost hyperparameter optimization?

deanforce opened this issue · 0 comments

I am trying to optimize the hyperparameters of an xgboost model using Bayesian optimization (the mlrMBO R package). The code below seems to produce reasonable results, but the problem I keep facing is that the results are not reproducible, despite setting a consistent seed.

I am including some simplified code below, along with some fake data. As you can see in the output (bayes_1 and bayes_2), the algorithm gives consistent results (i.e. the same hyperparameter values) for the design phase and for the first few steps of the optimization process, but then the values from the two identical runs diverge (a small comparison sketch is included after the code).

I'd appreciate any feedback you may have!


db <- read.delim("https://drive.google.com/uc?export=download&id=1JXMUbwAWi8jjRPGQxzC15tUzSIWrPfNH")
db <- as.data.frame(db)

# Bayesian optimization setup
library("xgboost")
library("data.table")
library("tidymodels")
library("mlrMBO")
library("skimr")
library("DiceKriging")
library("rgenoud")
library("Matrix")

# One-hot encode the predictors and build the xgboost data structure
sparse_matrix <- sparse.model.matrix(outcome ~ . - 1, data = db)
labels <- db$outcome
db_xgb <- xgb.DMatrix(data = sparse_matrix, label = labels)
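
# Quick check (my addition): confirm the one-hot encoding came out as expected.
# This step is deterministic, so it should not contribute to the divergence.
dim(sparse_matrix)   # rows x encoded predictor columns
table(labels)        # outcome distribution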
        
        
obj.fun <- smoof::makeSingleObjectiveFunction(
  name = "xgbcv",
  fn = function(x) {
    set.seed(1234, "L'Ecuyer-CMRG")
    xgbcv <- xgb.cv(params = list(
      nthread = NULL,                  # NULL = all available threads; pinning nthread = 1 may matter for reproducibility (untested)
      max_depth = x["max_depth"],
      min_child_weight = x["min_child_weight"],
      eta = x["eta"],
      gamma = x["gamma"],
      max_delta_step = x["max_delta_step"],
      alpha = x["alpha"],
      lambda = x["lambda"],
      booster = "gbtree",
      objective = "binary:logistic",   # binary classification
      eval_metric = "logloss",
      tree_method = "hist"),
      nrounds = 1000,
      nfold = 5,
      early_stopping_rounds = 50,
      verbose = 0,
      maximize = FALSE,
      data = db_xgb)

    # Return the best (minimum) cross-validated logloss
    min(xgbcv$evaluation_log$test_logloss_mean)
  },
  par.set = makeParamSet(
    makeNumericParam("eta", lower = 0.0001, upper = 0.5),
    makeNumericParam("gamma", lower = 0, upper = 20),
    makeIntegerParam("max_depth", lower = 1, upper = 50),
    makeIntegerParam("min_child_weight", lower = 1, upper = 10),
    makeNumericParam("max_delta_step", lower = 0, upper = 10),
    makeNumericParam("alpha", lower = 0, upper = 10),
    makeNumericParam("lambda", lower = 0, upper = 10)),
  minimize = TRUE)
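
# Hedged sanity check (my addition; the x_test values are arbitrary): evaluate
# the objective twice at the same point. If the two values differ, the
# non-determinism lives inside xgb.cv itself rather than in mlrMBO.
x_test <- c(eta = 0.1, gamma = 1, max_depth = 6, min_child_weight = 2,
            max_delta_step = 1, alpha = 1, lambda = 1)
identical(obj.fun(x_test), obj.fun(x_test))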
          
set.seed(1234, "L'Ecuyer-CMRG")
bayes <- function(n_design = 10 * getNumberOfParameters(obj.fun),
                  opt_steps = 50) {
  set.seed(1234, "L'Ecuyer-CMRG")
  des <- generateDesign(n = n_design, par.set = getParamSet(obj.fun),
                        fun = lhs::optimumLHS)  # alternative: lhs::randomLHS
  set.seed(1234, "L'Ecuyer-CMRG")
  des$y <- apply(des, 1, obj.fun)

  control <- makeMBOControl()
  control <- setMBOControlTermination(control, iters = opt_steps)
  control <- setMBOControlInfill(control, crit = makeMBOInfillCritEI())
  set.seed(1234, "L'Ecuyer-CMRG")
  run <- mbo(fun = obj.fun, design = des,
             learner = makeLearner("regr.km", predict.type = "se",
                                   covtype = "matern3_2",
                                   optim.method = "gen",  # alternative: "BFGS"
                                   nugget.estim = FALSE, jitter = FALSE,
                                   nugget = 1e-22,
                                   control = list(trace = FALSE)),
             control = control, show.info = TRUE)

  # Extract the optimization path and label each row as design vs. optimization
  optimization_process <- as.data.frame(run$opt.path$env$path %>%
    dplyr::mutate(round = row_number()) %>%
    dplyr::mutate(type = case_when(round <= n_design ~ "design",
                                   TRUE ~ "bayesian optimization")))

  return(list(optimization_process))
}
          
          
set.seed(1234)
runs <- bayes()
bayes_1 <- as.data.frame(runs[[1]])

set.seed(1234)
runs <- bayes()
bayes_2 <- as.data.frame(runs[[1]])
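
To make the divergence easy to see, here is a small sketch (my addition; it assumes both runs completed, so bayes_1 and bayes_2 have the same shape) that locates the first rows where the two supposedly identical runs stop agreeing:

# Per-row element-wise comparison of the two optimization paths
diff_rows <- which(apply(bayes_1 != bayes_2, 1, any))
head(diff_rows)              # first divergent evaluation indices (empty if identical)
head(bayes_1[diff_rows, ])   # divergent proposals from run 1
head(bayes_2[diff_rows, ])   # corresponding rows from run 2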