SB3_contrib Maskable PPO, Learning Rate schedule error
CodingWookie opened this issue · 1 comment
CodingWookie commented
Expected behavior
Optuna should optimize the initial learning rate for PPO, with the learning rate decreasing linearly over the course of each trial.
Environment
- Optuna version: 3.0.5
- Python version: 3.8.10
- OS: WSL2 - Ubuntu 20.04
- (Optional) Other libraries and their versions:
  - sb3-contrib 1.6.2
  - stable-baselines3 1.6.2
Error messages, stack traces, or logs
[W 2023-01-04 17:51:46,602] Trial 0 failed because of the following error: ValueError('value should be one of int, float, str, bool, or torch.Tensor')
Traceback (most recent call last):
  File "/home/ftuser/.local/lib/python3.10/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/user_data/models/ReforceXOptuna.py", line 353, in <lambda>
    lambda trial: self._hyperopt_objective(
  File "/user_data/models/ReforceXOptuna.py", line 289, in _hyperopt_objective
    model.learn(
  File "/home/ftuser/.local/lib/python3.10/site-packages/sb3_contrib/ppo_mask/ppo_mask.py", line 613, in learn
    self.logger.dump(step=self.num_timesteps)
  File "/home/ftuser/.local/lib/python3.10/site-packages/stable_baselines3/common/logger.py", line 528, in dump
    _format.write(self.name_to_value, self.name_to_excluded, step)
  File "/home/ftuser/.local/lib/python3.10/site-packages/stable_baselines3/common/logger.py", line 429, in write
    experiment, session_start_info, session_end_info = hparams(value.hparam_dict, metric_dict=value.metric_dict)
  File "/home/ftuser/.local/lib/python3.10/site-packages/torch/utils/tensorboard/summary.py", line 231, in hparams
    raise ValueError(
ValueError: value should be one of int, float, str, bool, or torch.Tensor
Steps to reproduce
- I am using the hyperparameter sampling example from rl_zoo3: https://github.com/DLR-RM/rl-baselines3-zoo/blob/master/rl_zoo3/hyperparams_opt.py
Reproducible examples (optional)
This is just a snippet of a larger algorithm.
def linear_schedule(initial_value: Union[float, str]) -> Callable[[float], float]:
    """
    Linear learning rate schedule.

    :param initial_value: (float or str)
    :return: (function)
    """
    # Force conversion to float
    initial_value_ = float(initial_value)

    def func(progress_remaining: float) -> float:
        """
        Progress will decrease from 1 (beginning) to 0.

        :param progress_remaining: (float)
        :return: (float)
        """
        return progress_remaining * initial_value_

    return func
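As a quick sanity check of the schedule (assuming SB3's convention that progress_remaining runs from 1.0 at the start of training down to 0.0 at the end), the decay looks like this:

schedule = linear_schedule(3e-4)
assert schedule(1.0) == 3e-4    # start of training: full learning rate
assert schedule(0.5) == 1.5e-4  # halfway through: half the learning rate
assert schedule(0.0) == 0.0     # end of training: decayed to zero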
def sample_ppo_params(trial: optuna.Trial) -> Dict[str, Any]:
    """
    Sampler for PPO hyperparams.

    :param trial:
    :return:
    """
    batch_size = trial.suggest_categorical("batch_size", [512, 1024, 2048, 4096])
    n_steps = trial.suggest_categorical("n_steps", [8, 16, 32, 64, 128, 256, 512, 1024, 2048])
    gamma = trial.suggest_categorical("gamma", [0.6, 0.7, 0.8, 0.85, 0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    lr_schedule = trial.suggest_float("learning_rate", 0.00000003, 0.009)
    learning_rate = linear_schedule(lr_schedule)  # initial LR sampled from [3e-8, 9e-3], decayed linearly to 0
    ent_coef = trial.suggest_float("ent_coef", 0.00000001, 0.01)
    clip_range = trial.suggest_categorical("clip_range", [0.1, 0.2, 0.3, 0.4, 0.5])
    n_epochs = trial.suggest_categorical("n_epochs", [5, 10, 15, 20, 25, 30, 50, 100, 150])
    gae_lambda = trial.suggest_categorical("gae_lambda", [0.8, 0.85, 0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    max_grad_norm = trial.suggest_categorical("max_grad_norm", [0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5])
    vf_coef = trial.suggest_float("vf_coef", 0, 1)
    # Keep the minibatch size no larger than the rollout length
    if batch_size > n_steps:
        batch_size = n_steps
    return {
        ######## PPO PARAMS ########
        "n_steps": n_steps,
        "batch_size": batch_size,
        "gamma": gamma,
        "learning_rate": learning_rate,
        "ent_coef": ent_coef,
        "clip_range": clip_range,
        "n_epochs": n_epochs,
        "gae_lambda": gae_lambda,
        "max_grad_norm": max_grad_norm,
        "vf_coef": vf_coef,
        # "sde_sample_freq": sde_sample_freq,
        # "policy_kwargs": dict(
        #     log_std_init=log_std_init,
        #     net_arch=net_arch,
        #     activation_fn=activation_fn,
        #     ortho_init=ortho_init,
        # ),
    }
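Note that the returned learning_rate is a callable, not a float. Torch's TensorBoard hparams writer only accepts int, float, str, bool, or torch.Tensor, so if a custom callback records these sampled params as hparams, the callable may be what trips the ValueError above. A minimal sketch of a sanitizing helper (hypothetical, not part of SB3 or Optuna):

def to_loggable(value):
    # TensorBoard hparams accept only int, float, str, bool, or torch.Tensor;
    # fall back to the string form for anything else (e.g. a schedule callable).
    return value if isinstance(value, (int, float, str, bool)) else str(value)

# Example use inside the objective, where a trial is available:
loggable_params = {key: to_loggable(value) for key, value in sample_ppo_params(trial).items()}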
class ReforceXOptuna(ReforceX):
    def _hyperopt_objective(self, trial: optuna.Trial, train_df, total_timesteps: int) -> float:
        params = sample_ppo_params(trial)
        policy_kwargs = dict(activation_fn=th.nn.SiLU, net_arch=self.net_arch)
        # print(params)
        # Create the RL model
        model = self.MODELCLASS(
            self.policy_type,
            self.train_env,
            policy_kwargs=policy_kwargs,
            tensorboard_log=Path(self.custom_tensorboard_path),
            **params,
        )
        nan_encountered = False
        optuna_callback = TrialEvalCallback(
            self.eval_env, trial, eval_freq=len(train_df), deterministic=True,
        )
        try:
            model.learn(
                total_timesteps=int(total_timesteps),
                callback=[optuna_callback, self.tensorboard_callback],
            )
            """
            results = optuna.study.trials_dataframe()
            sortedresults = results.sort_values(by='value', ascending=True)
            print(sortedresults.tail(30))
            """
        except AssertionError as e:
            # Sometimes, random hyperparams can generate NaN
            logger.warning(f"Optuna encountered NaN\n{e}")
            nan_encountered = True
        finally:
            # Free memory
            model.env.close()
            self.eval_env.close()
        # Tell the optimizer that the trial failed
        if nan_encountered:
            return float("nan")
        if optuna_callback.is_pruned:
            raise optuna.exceptions.TrialPruned()
        return optuna_callback.last_mean_reward
    def fit(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen, **kwargs):
        """
        User customizable fit method

        :params:
        data_dictionary: dict = common data dictionary containing all train/test
            features/labels/weights.
        dk: FreqaiDataKitchen = data kitchen for the current pair.
        :returns:
        model: Any = trained model to be used for inference in dry/live/backtesting
        """
        train_df = data_dictionary["train_features"]
        total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df)
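The snippet ends here; presumably fit then builds the study and calls optimize with the lambda visible in the traceback. A hedged sketch of that wiring (direction and n_trials are assumptions, not from the issue):

        # Hedged sketch: launch the Optuna study from within fit().
        study = optuna.create_study(direction="maximize")
        study.optimize(
            lambda trial: self._hyperopt_objective(trial, train_df, total_timesteps),
            n_trials=100,  # assumed trial budget
        )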
nzw0301 commented
This is not a bug in Optuna or optuna-examples. I am not familiar with SB3, but optuna_callback.last_mean_reward in the code should be a float value, as the error message says. So possibly use optuna_callback.last_mean_reward.item() instead of optuna_callback.last_mean_reward.
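For illustration, a minimal sketch of that suggestion applied to the objective (assuming last_mean_reward can come back as a NumPy scalar rather than a plain float):

        # Hypothetical replacement for the last line of _hyperopt_objective:
        # coerce the reward to a plain Python float before handing it to Optuna.
        reward = optuna_callback.last_mean_reward
        return reward.item() if hasattr(reward, "item") else float(reward)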