ARM-software/mango

'Parallel' object is not iterable

ggous opened this issue · 0 comments

ggous commented

Hello and thanks for this project! It seems very promising!

I am trying to train a xgboost classifier.

My code is:


import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold
from scipy.stats import uniform
from mango.domain.distribution import loguniform
from mango import Tuner
from joblib import Parallel, delayed

xgb_params = {
    'n_estimators': range(10, 200, 50), # 10 to 200 in steps of 50
    'max_depth': range(1, 15), # 1 to 14
    'reg_alpha': loguniform(-3, 6),  # 10^-3 to 10^3
    'booster': ['gbtree', 'gblinear'],
    'colsample_bylevel': uniform(0.05, 0.95), # 0.05 to 1.0
    'colsample_bytree': uniform(0.05, 0.95), # 0.05 to 1.0
    'learning_rate': loguniform(-3, 3),  # 0.001 to 1
    'reg_lambda': loguniform(-3, 6),  # 10^-3 to 10^3
    'min_child_weight': loguniform(0, 2), # 1 to 100
    'subsample': uniform(0.1, 0.89), # 0.1 to 0.99
}
 
class MangoParallelOptimization:
    def __init__(self,
                 njobs, 
                 configuration_params,
                 features_train,
                 target_train,
                 features_val,
                 target_val):
        self.njobs = njobs
        self.conf_dict = configuration_params
        self.x_train = features_train
        self.y_train = target_train
        self.x_val = features_val
        self.y_val = target_val
        self.space = xgb_params
        
        
    def _objective(self, **model_params):
        kfold = StratifiedKFold(n_splits=2, shuffle=True, random_state=123)
        results = []
        for hyper_param in model_params:
            model = xgb.XGBClassifier(**hyper_param)
                                      
            result = cross_val_score(model, 
                                     self.x_train,
                                     self.y_train,
                                     scoring='accuracy',
                                     cv=kfold).mean()
            results.append(result)
        return results
        
           
    def _objective2(self, params_batch):
        global parameters
        results_batch = Parallel(self.njobs,
                                 backend='multiprocessing')
        (delayed(self._objective)(**params) for params in params_batch)
        acc = [result for result in results_batch]
        return acc
    
    def mango_optimization(self):
        tuner = Tuner(self.space, self._objective2, self.conf_dict)
        optimization_results = tuner.maximize()
        return optimization_results['best_params'], optimization_results['best_objective']
    
    
    
if __name__=="__main__":
    df = pd.read_csv('/home/ggous/example.csv')
    df.dropna(axis=1, inplace=True)
    features = df.drop(['id', 'CLASS'], axis=1)
    labels = df['CLASS'].values
    
    # Split the data into training and testing sets
    x_train, x_test, y_train, y_test = train_test_split(
        features,
        labels,
        stratify=labels,
        test_size = 0.2,
        random_state = 123)
    
    # encode string class values as integers
    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(y_train)
    y_test = label_encoder.transform(y_test)

    scaler = StandardScaler()
    x_train_sc = pd.DataFrame(scaler.fit_transform(x_train),
                              index=x_train.index, 
                              columns=x_train.columns)
    x_test_sc = scaler.transform(x_test)

    # Parallel optimization with Mango
    config_params = {'num_iteration': 40, 'initial_random': 10}
    optim = MangoParallelOptimization(njobs=4,
                                                            configuration_params=config_params,
                                                            features_train=x_train, 
                                                            target_train=y_train,
                                                            features_val=x_test,
                                                            target_val=y_test)
     
    best_parameters, best_objective = optim.mango_optimization()

    # Results
    print('best parameters:', best_parameters)
    print('best accuracy:', best_objective)
    # Train the model with the best hyper-parameters 
    best_model = xgb.XGBClassifier(n_jobs=-1, **best_parameters)
    best_model.fit(x_train, y_train)

The file I am using is here.

I have some questions:

  1. First of all, running the code , gives : Parallel object is not iterable

  2. If I want to use in the xgb classifier, the following arguments:

'use_label_encoder': False,
'eval_metric': 'mlogloss',
'seed': 123,
'enable_categorical': False

Can I do ?? :

for hyper_param in model_params:
            model = xgb.XGBClassifier(**hyper_param, 
                                                       use_label_encoder'=False,
                                                       eval_metric= 'mlogloss',
                                                       seed= 123,
                                                       enable_categorical= False )
  1. If I want to manually do the k fold , like this:
def _objective(self, **model_params):
  kfold = StratifiedKFold(n_splits=2, shuffle=True, random_state=123)
  for i, (train_idx, val_idx) in enumerate(kfold.split(x_train, y_train)):
      x_train_, y_train_ = x_train[train_idx, :], y_train[train_idx]
      x_val_, y_val_ = x_train[val_idx, :], y_train[val_idx]
          
      model = xgb.XGBClassifier(**hyper_param) 
  
      history = model.fit(x_train_,
                          y_train_,
                          early_stopping_rounds=10,
                          eval_set=[(x_train_, y_train_), (x_val_, y_val_)])
      
      ....

How can I do that? And use the history object inside every fold iteration in order to plot things.
And finally return the result that mango wants.?
What kind of result should be?