SB3_contrib Maskable PPO, Learning Rate schedule error
CodingWookie opened this issue · 1 comment
CodingWookie commented
Expected behavior
Optuna should optimize the initial learning rate for PPO, with the learning rate decreasing linearly over the course of each trial.
Environment
- Optuna version: 3.0.5
- Python version: 3.8.10
- OS: WSL2 - Ubuntu 20.04
- (Optional) Other libraries and their versions:
  - sb3-contrib 1.6.2
  - stable-baselines3 1.6.2
Error messages, stack traces, or logs
[W 2023-01-04 17:51:46,602] Trial 0 failed because of the following error: ValueError('value should be one of int, float, str, bool, or torch.Tensor')
Traceback (most recent call last):
  File "/home/ftuser/.local/lib/python3.10/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/user_data/models/ReforceXOptuna.py", line 353, in <lambda>
    lambda trial: self._hyperopt_objective(
  File "/user_data/models/ReforceXOptuna.py", line 289, in _hyperopt_objective
    model.learn(
  File "/home/ftuser/.local/lib/python3.10/site-packages/sb3_contrib/ppo_mask/ppo_mask.py", line 613, in learn
    self.logger.dump(step=self.num_timesteps)
  File "/home/ftuser/.local/lib/python3.10/site-packages/stable_baselines3/common/logger.py", line 528, in dump
    _format.write(self.name_to_value, self.name_to_excluded, step)
  File "/home/ftuser/.local/lib/python3.10/site-packages/stable_baselines3/common/logger.py", line 429, in write
    experiment, session_start_info, session_end_info = hparams(value.hparam_dict, metric_dict=value.metric_dict)
  File "/home/ftuser/.local/lib/python3.10/site-packages/torch/utils/tensorboard/summary.py", line 231, in hparams
    raise ValueError(
ValueError: value should be one of int, float, str, bool, or torch.Tensor
Steps to reproduce
- I am using the hyperparameter sampling example from rl_zoo3: https://github.com/DLR-RM/rl-baselines3-zoo/blob/master/rl_zoo3/hyperparams_opt.py
Reproducible examples (optional)
This is just a snippet of a larger algorithm.
def linear_schedule(initial_value: Union[float, str]) -> Callable[[float], float]:
    """
    Linear learning rate schedule.

    :param initial_value: (float or str)
    :return: (function)
    """
    # Force conversion to float
    initial_value_ = float(initial_value)

    def func(progress_remaining: float) -> float:
        """
        Progress will decrease from 1 (beginning) to 0.

        :param progress_remaining: (float)
        :return: (float)
        """
        return progress_remaining * initial_value_

    return func
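As a quick sanity check of the schedule (assuming SB3's convention that progress_remaining runs from 1.0 at the start of training down to 0.0 at the end), the decay looks like this:

schedule = linear_schedule(3e-4)
assert schedule(1.0) == 3e-4    # start of training: full learning rate
assert schedule(0.5) == 1.5e-4  # halfway through: half the learning rate
assert schedule(0.0) == 0.0     # end of training: decayed to zero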
def sample_ppo_params(trial: optuna.Trial) -> Dict[str, Any]:
    """
    Sampler for PPO hyperparams.

    :param trial:
    :return:
    """
    batch_size = trial.suggest_categorical("batch_size", [512, 1024, 2048, 4096])
    n_steps = trial.suggest_categorical("n_steps", [8, 16, 32, 64, 128, 256, 512, 1024, 2048])
    gamma = trial.suggest_categorical("gamma", [0.6, 0.7, 0.8, 0.85, 0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    lr_schedule = trial.suggest_float("learning_rate", 0.00000003, 0.009)
    learning_rate = linear_schedule(lr_schedule)  # initial LR sampled from [3e-8, 9e-3], decayed linearly to 0
    ent_coef = trial.suggest_float("ent_coef", 0.00000001, 0.01)
    clip_range = trial.suggest_categorical("clip_range", [0.1, 0.2, 0.3, 0.4, 0.5])
    n_epochs = trial.suggest_categorical("n_epochs", [5, 10, 15, 20, 25, 30, 50, 100, 150])
    gae_lambda = trial.suggest_categorical("gae_lambda", [0.8, 0.85, 0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    max_grad_norm = trial.suggest_categorical("max_grad_norm", [0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5])
    vf_coef = trial.suggest_float("vf_coef", 0, 1)
    # Keep the minibatch size no larger than the rollout length
    if batch_size > n_steps:
        batch_size = n_steps
    return {
        ######## PPO PARAMS ########
        "n_steps": n_steps,
        "batch_size": batch_size,
        "gamma": gamma,
        "learning_rate": learning_rate,
        "ent_coef": ent_coef,
        "clip_range": clip_range,
        "n_epochs": n_epochs,
        "gae_lambda": gae_lambda,
        "max_grad_norm": max_grad_norm,
        "vf_coef": vf_coef,
        # "sde_sample_freq": sde_sample_freq,
        # "policy_kwargs": dict(
        #     log_std_init=log_std_init,
        #     net_arch=net_arch,
        #     activation_fn=activation_fn,
        #     ortho_init=ortho_init,
        # ),
    }
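Note that the returned learning_rate is a callable, not a float. Torch's TensorBoard hparams writer only accepts int, float, str, bool, or torch.Tensor, so if a custom callback records these sampled params as hparams, the callable may be what trips the ValueError above. A minimal sketch of a sanitizing helper (hypothetical, not part of SB3 or Optuna):

def to_loggable(value):
    # TensorBoard hparams accept only int, float, str, bool, or torch.Tensor;
    # fall back to the string form for anything else (e.g. a schedule callable).
    return value if isinstance(value, (int, float, str, bool)) else str(value)

# Example use inside the objective, where a trial is available:
loggable_params = {key: to_loggable(value) for key, value in sample_ppo_params(trial).items()}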
class ReforceXOptuna(ReforceX):
    def _hyperopt_objective(self, trial: optuna.Trial, train_df, total_timesteps: int) -> float:
        params = sample_ppo_params(trial)
        policy_kwargs = dict(activation_fn=th.nn.SiLU, net_arch=self.net_arch)
        # print(params)
        # Create the RL model
        model = self.MODELCLASS(
            self.policy_type,
            self.train_env,
            policy_kwargs=policy_kwargs,
            tensorboard_log=Path(self.custom_tensorboard_path),
            **params,
        )
        nan_encountered = False
        optuna_callback = TrialEvalCallback(
            self.eval_env, trial, eval_freq=len(train_df), deterministic=True,
        )
        try:
            model.learn(
                total_timesteps=int(total_timesteps),
                callback=[optuna_callback, self.tensorboard_callback],
            )
            """
            results = optuna.study.trials_dataframe()
            sortedresults = results.sort_values(by='value', ascending=True)
            print(sortedresults.tail(30))
            """
        except AssertionError as e:
            # Sometimes, random hyperparams can generate NaN
            logger.warning(f"Optuna encountered NaN\n{e}")
            nan_encountered = True
        finally:
            # Free memory
            model.env.close()
            self.eval_env.close()
        # Tell the optimizer that the trial failed
        if nan_encountered:
            return float("nan")
        if optuna_callback.is_pruned:
            raise optuna.exceptions.TrialPruned()
        return optuna_callback.last_mean_reward
    def fit(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen, **kwargs):
        """
        User customizable fit method

        :params:
        data_dictionary: dict = common data dictionary containing all train/test
            features/labels/weights.
        dk: FreqaiDataKitchen = data kitchen for the current pair.
        :returns:
        model: Any = trained model to be used for inference in dry/live/backtesting
        """
        train_df = data_dictionary["train_features"]
        total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df)
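The snippet ends here; presumably fit then builds the study and calls optimize with the lambda visible in the traceback. A hedged sketch of that wiring (direction and n_trials are assumptions, not from the issue):

        # Hedged sketch: launch the Optuna study from within fit().
        study = optuna.create_study(direction="maximize")
        study.optimize(
            lambda trial: self._hyperopt_objective(trial, train_df, total_timesteps),
            n_trials=100,  # assumed trial budget
        )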
nzw0301 commented
This is not a bug in Optuna or optuna-examples. I am not familiar with SB3, but optuna_callback.last_mean_reward in the code should be a float value, as the error message says. So possibly use optuna_callback.last_mean_reward.item() instead of optuna_callback.last_mean_reward.
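For illustration, a minimal sketch of that suggestion applied to the objective (assuming last_mean_reward can come back as a NumPy scalar rather than a plain float):

        # Hypothetical replacement for the last line of _hyperopt_objective:
        # coerce the reward to a plain Python float before handing it to Optuna.
        reward = optuna_callback.last_mean_reward
        return reward.item() if hasattr(reward, "item") else float(reward)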