2020-01-27 13:32:31 +00:00
|
|
|
|
import os
|
2020-07-16 14:12:16 +00:00
|
|
|
|
import warnings
|
|
|
|
|
|
from abc import ABC, abstractmethod
|
2020-10-07 08:51:49 +00:00
|
|
|
|
from typing import Any, Callable, Dict, List, Optional, Union
|
2020-01-27 13:32:31 +00:00
|
|
|
|
|
|
|
|
|
|
import gym
|
|
|
|
|
|
import numpy as np
|
|
|
|
|
|
|
2020-09-29 17:41:14 +00:00
|
|
|
|
from stable_baselines3.common import base_class, logger # pytype: disable=pyi-error
|
2020-07-16 14:12:16 +00:00
|
|
|
|
from stable_baselines3.common.evaluation import evaluate_policy
|
|
|
|
|
|
from stable_baselines3.common.vec_env import DummyVecEnv, VecEnv, sync_envs_normalization
|
2020-01-27 13:32:31 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class BaseCallback(ABC):
    """
    Base class for callback.

    The public ``on_*`` methods do the shared bookkeeping and then delegate to
    the matching ``_on_*`` hook, which subclasses override. Only ``_on_step``
    is abstract; the other hooks default to no-ops.

    :param verbose: Verbosity level, stored as ``self.verbose``.
    """

    def __init__(self, verbose: int = 0):
        super().__init__()
        # The RL model
        self.model = None  # type: Optional[base_class.BaseAlgorithm]
        # An alias for self.model.get_env(), the environment used for training
        self.training_env = None  # type: Union[gym.Env, VecEnv, None]
        # Number of time the callback was called
        self.n_calls = 0  # type: int
        # n_envs * n times env.step() was called
        self.num_timesteps = 0  # type: int
        self.verbose = verbose
        self.locals: Dict[str, Any] = {}
        self.globals: Dict[str, Any] = {}
        self.logger = None
        # Sometimes, for event callback, it is useful
        # to have access to the parent object
        self.parent = None  # type: Optional[BaseCallback]

    # Type hint as string to avoid circular import
    def init_callback(self, model: "base_class.BaseAlgorithm") -> None:
        """
        Initialize the callback by saving references to the
        RL model and the training environment for convenience.

        :param model: The model this callback is attached to.
        """
        self.model = model
        self.training_env = model.get_env()
        self.logger = logger
        self._init_callback()

    def _init_callback(self) -> None:
        """Hook run at the end of ``init_callback``; no-op by default."""
        pass

    def on_training_start(self, locals_: Dict[str, Any], globals_: Dict[str, Any]) -> None:
        """
        Store the caller's local and global variables, then run
        ``_on_training_start``.

        :param locals_: the local variables of the caller
        :param globals_: the global variables of the caller
        """
        # Those are reference and will be updated automatically
        self.locals = locals_
        self.globals = globals_
        self._on_training_start()

    def _on_training_start(self) -> None:
        """Hook run at the end of ``on_training_start``; no-op by default."""
        pass

    def on_rollout_start(self) -> None:
        """Run the ``_on_rollout_start`` hook."""
        self._on_rollout_start()

    def _on_rollout_start(self) -> None:
        """Hook run by ``on_rollout_start``; no-op by default."""
        pass

    @abstractmethod
    def _on_step(self) -> bool:
        """
        :return: If the callback returns False, training is aborted early.
        """
        return True

    def on_step(self) -> bool:
        """
        This method will be called by the model after each call to ``env.step()``.

        For child callback (of an ``EventCallback``), this will be called
        when the event is triggered.

        :return: If the callback returns False, training is aborted early.
        """
        self.n_calls += 1
        # timesteps start at zero
        self.num_timesteps = self.model.num_timesteps

        return self._on_step()

    def on_training_end(self) -> None:
        """Run the ``_on_training_end`` hook."""
        self._on_training_end()

    def _on_training_end(self) -> None:
        """Hook run by ``on_training_end``; no-op by default."""
        pass

    def on_rollout_end(self) -> None:
        """Run the ``_on_rollout_end`` hook."""
        self._on_rollout_end()

    def _on_rollout_end(self) -> None:
        """Hook run by ``on_rollout_end``; no-op by default."""
        pass

    def update_locals(self, locals_: Dict[str, Any]) -> None:
        """
        Update the references to the local variables.

        The stored ``self.locals`` dict is updated in place, then the same
        mapping is forwarded to ``update_child_locals``.

        :param locals_: the local variables during rollout collection
        """
        self.locals.update(locals_)
        self.update_child_locals(locals_)

    def update_child_locals(self, locals_: Dict[str, Any]) -> None:
        """
        Update the references to the local variables on sub callbacks.

        No-op here; callbacks that own other callbacks override it.

        :param locals_: the local variables during rollout collection
        """
        pass
|
|
|
|
|
|
|
2020-01-27 13:32:31 +00:00
|
|
|
|
|
|
|
|
|
|
class EventCallback(BaseCallback):
    """
    Base class for triggering callback on event.

    Subclasses decide when the event happens and call ``_on_event``, which
    runs the child callback's ``on_step``.

    :param callback: Callback that will be called
        when an event is triggered.
    :param verbose: Verbosity level, forwarded to ``BaseCallback``.
    """

    def __init__(self, callback: Optional[BaseCallback] = None, verbose: int = 0):
        super().__init__(verbose=verbose)
        self.callback = callback
        # Give access to the parent
        if callback is not None:
            self.callback.parent = self

    def init_callback(self, model: "base_class.BaseAlgorithm") -> None:
        """
        Initialize this callback, then the child callback (if any) with the
        same model.

        :param model: The model this callback is attached to.
        """
        super().init_callback(model)
        if self.callback is not None:
            self.callback.init_callback(self.model)

    def _on_training_start(self) -> None:
        """Forward the training-start notification to the child callback."""
        if self.callback is not None:
            self.callback.on_training_start(self.locals, self.globals)

    def _on_event(self) -> bool:
        """
        Trigger the child callback.

        :return: The child's ``on_step()`` result, or True when there is no child.
        """
        if self.callback is not None:
            return self.callback.on_step()
        return True

    def _on_step(self) -> bool:
        """
        :return: Always True; subclasses override this to detect their event.
        """
        return True

    def update_child_locals(self, locals_: Dict[str, Any]) -> None:
        """
        Update the references to the local variables.

        :param locals_: the local variables during rollout collection
        """
        if self.callback is not None:
            self.callback.update_locals(locals_)
|
|
|
|
|
|
|
2020-01-27 13:32:31 +00:00
|
|
|
|
|
|
|
|
|
|
class CallbackList(BaseCallback):
    """
    Class for chaining callbacks.

    Every notification received by the list is forwarded, in order, to each
    callback it contains.

    :param callbacks: A list of callbacks that will be called
        sequentially.
    """

    def __init__(self, callbacks: List[BaseCallback]):
        super().__init__()
        assert isinstance(callbacks, list)
        self.callbacks = callbacks

    def _init_callback(self) -> None:
        """Initialize every contained callback with this list's model."""
        for callback in self.callbacks:
            callback.init_callback(self.model)

    def _on_training_start(self) -> None:
        """Forward the training-start notification to every callback."""
        for callback in self.callbacks:
            callback.on_training_start(self.locals, self.globals)

    def _on_rollout_start(self) -> None:
        """Forward the rollout-start notification to every callback."""
        for callback in self.callbacks:
            callback.on_rollout_start()

    def _on_step(self) -> bool:
        """
        Call ``on_step`` on every callback.

        :return: False if at least one callback returned a falsy value.
        """
        continue_training = True
        for callback in self.callbacks:
            # Return False (stop training) if at least one callback returns False.
            # ``on_step()`` is the left operand, so it is always called even
            # after an earlier callback has already asked to stop.
            continue_training = callback.on_step() and continue_training
        return continue_training

    def _on_rollout_end(self) -> None:
        """Forward the rollout-end notification to every callback."""
        for callback in self.callbacks:
            callback.on_rollout_end()

    def _on_training_end(self) -> None:
        """Forward the training-end notification to every callback."""
        for callback in self.callbacks:
            callback.on_training_end()

    def update_child_locals(self, locals_: Dict[str, Any]) -> None:
        """
        Update the references to the local variables.

        :param locals_: the local variables during rollout collection
        """
        for callback in self.callbacks:
            callback.update_locals(locals_)
|
|
|
|
|
|
|
2020-01-27 13:32:31 +00:00
|
|
|
|
|
|
|
|
|
|
class CheckpointCallback(BaseCallback):
    """
    Callback for saving a model every ``save_freq`` steps

    :param save_freq: A checkpoint is written each time the number of calls
        to this callback is a multiple of ``save_freq``.
    :param save_path: Path to the folder where the model will be saved.
    :param name_prefix: Common prefix to the saved models
    :param verbose: A message is printed for each checkpoint when this is
        greater than 1.
    """

    def __init__(self, save_freq: int, save_path: str, name_prefix: str = "rl_model", verbose: int = 0):
        super().__init__(verbose)
        self.save_freq = save_freq
        self.save_path = save_path
        self.name_prefix = name_prefix

    def _init_callback(self) -> None:
        """Create the checkpoint folder if it does not exist yet."""
        # Create folder if needed
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self) -> bool:
        """
        Save the model when ``n_calls`` is a multiple of ``save_freq``.

        :return: Always True; this callback never stops training.
        """
        if self.n_calls % self.save_freq == 0:
            # The file name carries the timestep count so checkpoints do not overwrite each other
            path = os.path.join(self.save_path, f"{self.name_prefix}_{self.num_timesteps}_steps")
            self.model.save(path)
            if self.verbose > 1:
                print(f"Saving model checkpoint to {path}")
        return True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ConvertCallback(BaseCallback):
    """
    Convert functional callback (old-style) to object.

    :param callback: Function called on every step with the stored
        ``locals`` and ``globals`` dicts; its return value is returned by
        ``_on_step``.
    :param verbose: Verbosity level, forwarded to ``BaseCallback``.
    """

    def __init__(self, callback: Callable[[Dict[str, Any], Dict[str, Any]], bool], verbose: int = 0):
        super().__init__(verbose)
        self.callback = callback

    def _on_step(self) -> bool:
        """
        :return: The wrapped function's result, or True when no function is set.
        """
        if self.callback is not None:
            return self.callback(self.locals, self.globals)
        return True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class EvalCallback(EventCallback):
    """
    Callback for evaluating an agent.

    :param eval_env: The environment the agent is evaluated on. If it is not
        a ``VecEnv`` it is wrapped in a ``DummyVecEnv``; it must contain
        exactly one environment.
    :param callback_on_new_best: Callback to trigger
        when there is a new best model according to the ``mean_reward``
    :param n_eval_episodes: The number of episodes to test the agent
    :param eval_freq: Evaluate the agent every eval_freq call of the callback.
        No evaluation is run when this is not strictly positive.
    :param log_path: Path to a folder where the evaluations (``evaluations.npz``)
        will be saved. It will be updated at each evaluation.
    :param best_model_save_path: Path to a folder where the best model
        according to performance on the eval env will be saved.
    :param deterministic: Whether the evaluation should
        use a stochastic or deterministic actions.
    :param render: Whether to render or not the environment during evaluation
    :param verbose: Evaluation results are printed when this is greater than 0.
    :param warn: Passed to ``evaluate_policy`` (warns if ``eval_env`` has not been
        wrapped with a Monitor wrapper)
    """

    def __init__(
        self,
        eval_env: Union[gym.Env, VecEnv],
        callback_on_new_best: Optional[BaseCallback] = None,
        n_eval_episodes: int = 5,
        eval_freq: int = 10000,
        log_path: Optional[str] = None,
        best_model_save_path: Optional[str] = None,
        deterministic: bool = True,
        render: bool = False,
        verbose: int = 1,
        warn: bool = True,
    ):
        super().__init__(callback_on_new_best, verbose=verbose)
        self.n_eval_episodes = n_eval_episodes
        self.eval_freq = eval_freq
        self.best_mean_reward = -np.inf
        self.last_mean_reward = -np.inf
        self.deterministic = deterministic
        self.render = render
        self.warn = warn

        # Convert to VecEnv for consistency
        if not isinstance(eval_env, VecEnv):
            eval_env = DummyVecEnv([lambda: eval_env])

        if isinstance(eval_env, VecEnv):
            assert eval_env.num_envs == 1, "You must pass only one environment for evaluation"

        self.eval_env = eval_env
        self.best_model_save_path = best_model_save_path
        # Logs will be written in ``evaluations.npz``
        if log_path is not None:
            log_path = os.path.join(log_path, "evaluations")
        self.log_path = log_path
        self.evaluations_results = []
        self.evaluations_timesteps = []
        self.evaluations_length = []
        # For computing success rate
        self._is_success_buffer = []
        self.evaluations_successes = []

    def _init_callback(self) -> None:
        """Warn on mismatched env types and create the output folders."""
        # Does not work in some corner cases, where the wrapper is not the same
        if not isinstance(self.training_env, type(self.eval_env)):
            warnings.warn(f"Training and eval env are not of the same type {self.training_env} != {self.eval_env}")

        # Create folders if needed
        if self.best_model_save_path is not None:
            os.makedirs(self.best_model_save_path, exist_ok=True)
        if self.log_path is not None:
            # ``log_path`` already ends with the "evaluations" file stem, so create its parent
            os.makedirs(os.path.dirname(self.log_path), exist_ok=True)

    def _log_success_callback(self, locals_: Dict[str, Any], globals_: Dict[str, Any]) -> None:
        """
        Callback passed to the ``evaluate_policy`` function
        in order to log the success rate (when applicable),
        for instance when using HER.

        :param locals_: Mapping read for the ``"info"`` and ``"done"`` entries.
        :param globals_: Unused.
        """
        info = locals_["info"]
        # VecEnv: unpack
        if not isinstance(info, dict):
            info = info[0]

        if locals_["done"]:
            maybe_is_success = info.get("is_success")
            # Only record episodes that report a success flag
            if maybe_is_success is not None:
                self._is_success_buffer.append(maybe_is_success)

    def _on_step(self) -> bool:
        """
        Run an evaluation every ``eval_freq`` calls, log the results and keep
        track of the best mean reward.

        :return: The result of the ``callback_on_new_best`` callback when a new
            best mean reward was reached and such a callback is set, True otherwise.
        """
        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
            # Sync training and eval env if there is VecNormalize
            sync_envs_normalization(self.training_env, self.eval_env)

            # Reset success rate buffer
            self._is_success_buffer = []

            episode_rewards, episode_lengths = evaluate_policy(
                self.model,
                self.eval_env,
                n_eval_episodes=self.n_eval_episodes,
                render=self.render,
                deterministic=self.deterministic,
                return_episode_rewards=True,
                warn=self.warn,
                callback=self._log_success_callback,
            )

            if self.log_path is not None:
                self.evaluations_timesteps.append(self.num_timesteps)
                self.evaluations_results.append(episode_rewards)
                self.evaluations_length.append(episode_lengths)

                kwargs = {}
                # Save success log if present
                if len(self._is_success_buffer) > 0:
                    self.evaluations_successes.append(self._is_success_buffer)
                    kwargs = dict(successes=self.evaluations_successes)

                # The whole history is rewritten at each evaluation
                np.savez(
                    self.log_path,
                    timesteps=self.evaluations_timesteps,
                    results=self.evaluations_results,
                    ep_lengths=self.evaluations_length,
                    **kwargs,
                )

            mean_reward, std_reward = np.mean(episode_rewards), np.std(episode_rewards)
            mean_ep_length, std_ep_length = np.mean(episode_lengths), np.std(episode_lengths)
            self.last_mean_reward = mean_reward

            if self.verbose > 0:
                print(f"Eval num_timesteps={self.num_timesteps}, " f"episode_reward={mean_reward:.2f} +/- {std_reward:.2f}")
                print(f"Episode length: {mean_ep_length:.2f} +/- {std_ep_length:.2f}")
            # Add to current Logger
            self.logger.record("eval/mean_reward", float(mean_reward))
            self.logger.record("eval/mean_ep_length", mean_ep_length)

            if len(self._is_success_buffer) > 0:
                success_rate = np.mean(self._is_success_buffer)
                if self.verbose > 0:
                    print(f"Success rate: {100 * success_rate:.2f}%")
                self.logger.record("eval/success_rate", success_rate)

            if mean_reward > self.best_mean_reward:
                if self.verbose > 0:
                    print("New best mean reward!")
                if self.best_model_save_path is not None:
                    self.model.save(os.path.join(self.best_model_save_path, "best_model"))
                self.best_mean_reward = mean_reward
                # Trigger callback if needed
                if self.callback is not None:
                    return self._on_event()

        return True

    def update_child_locals(self, locals_: Dict[str, Any]) -> None:
        """
        Update the references to the local variables.

        :param locals_: the local variables during rollout collection
        """
        if self.callback is not None:
            self.callback.update_locals(locals_)
|
|
|
|
|
|
|
2020-01-27 13:32:31 +00:00
|
|
|
|
|
|
|
|
|
|
class StopTrainingOnRewardThreshold(BaseCallback):
    """
    Stop the training once a threshold in episodic reward
    has been reached (i.e. when the model is good enough).

    It must be used with the ``EvalCallback``.

    :param reward_threshold: Minimum expected reward per episode
        to stop training.
    :param verbose: A message is printed when training is stopped and this
        is greater than 0.
    """

    def __init__(self, reward_threshold: float, verbose: int = 0):
        super().__init__(verbose=verbose)
        self.reward_threshold = reward_threshold

    def _on_step(self) -> bool:
        """
        Compare the parent's ``best_mean_reward`` with the threshold.

        :return: False (stop training) once the parent's best mean reward is
            no longer below ``reward_threshold``, True otherwise.
        """
        assert self.parent is not None, "``StopTrainingOnRewardThreshold`` callback must be used with an ``EvalCallback``"
        # Convert np.bool_ to bool, otherwise callback() is False won't work
        continue_training = bool(self.parent.best_mean_reward < self.reward_threshold)
        if self.verbose > 0 and not continue_training:
            print(
                f"Stopping training because the mean reward {self.parent.best_mean_reward:.2f} "
                f"is above the threshold {self.reward_threshold}"
            )
        return continue_training
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class EveryNTimesteps(EventCallback):
    """
    Trigger a callback every ``n_steps`` timesteps

    :param n_steps: Number of timesteps between two trigger.
    :param callback: Callback that will be called
        when the event is triggered.
    """

    def __init__(self, n_steps: int, callback: BaseCallback):
        super().__init__(callback)
        self.n_steps = n_steps
        # Value of ``num_timesteps`` when the event was last triggered
        self.last_time_trigger = 0

    def _on_step(self) -> bool:
        """
        Trigger the child callback once at least ``n_steps`` timesteps have
        elapsed since the previous trigger.

        :return: The child callback's result when triggered, True otherwise.
        """
        if (self.num_timesteps - self.last_time_trigger) >= self.n_steps:
            self.last_time_trigger = self.num_timesteps
            return self._on_event()
        return True
|
2020-08-28 09:36:33 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class StopTrainingOnMaxEpisodes(BaseCallback):
    """
    Stop the training once a maximum number of episodes are played.

    For multiple environments presumes that, the desired behavior is that the agent trains on each env for ``max_episodes``
    and in total for ``max_episodes * n_envs`` episodes.

    :param max_episodes: Maximum number of episodes to stop training.
    :param verbose: Select whether to print information about when training ended by reaching ``max_episodes``
    """

    def __init__(self, max_episodes: int, verbose: int = 0):
        super().__init__(verbose=verbose)
        self.max_episodes = max_episodes
        # Rescaled by the number of environments in ``_init_callback``
        self._total_max_episodes = max_episodes
        self.n_episodes = 0

    def _init_callback(self) -> None:
        """Scale the episode budget by the number of training environments."""
        # At start set total max according to number of environments
        self._total_max_episodes = self.max_episodes * self.training_env.num_envs

    def _on_step(self) -> bool:
        """
        Count the episodes that finished on this step and check the budget.

        :return: False (stop training) once the number of finished episodes
            reaches the total budget, True otherwise.
        """
        # Checking for both 'done' and 'dones' keywords because:
        # Some models use keyword 'done' (e.g.,: SAC, TD3, DQN, DDPG)
        # While some models use keyword 'dones' (e.g.,: A2C, PPO)
        done = self.locals.get("done")
        if done is None:
            done = self.locals.get("dones")
        done_array = np.array(done)
        self.n_episodes += np.sum(done_array).item()

        continue_training = self.n_episodes < self._total_max_episodes

        if self.verbose > 0 and not continue_training:
            mean_episodes_per_env = self.n_episodes / self.training_env.num_envs
            # The per-env average is only reported when there are several envs
            mean_ep_str = (
                f"with an average of {mean_episodes_per_env:.2f} episodes per env" if self.training_env.num_envs > 1 else ""
            )

            print(
                f"Stopping training with a total of {self.num_timesteps} steps because the "
                f"{self.locals.get('tb_log_name')} model reached max_episodes={self.max_episodes}, "
                f"by playing for {self.n_episodes} episodes "
                f"{mean_ep_str}"
            )
        return continue_training
|