mirror of
https://github.com/saymrwulf/stable-baselines3.git
synced 2026-05-22 22:10:16 +00:00
* Split torch module code into torch_layers file * Updated reference to CNN * Change 'CxWxH' to 'CxHxW', as per common notion * Fix missing import in policies.py * Move PPOPolicy to OnlineActorCriticPolicy * Create OnPolicyRLModel from PPO, and make A2C and PPO inherit * Update A2C optimizer comment * Clean weight init scales for clarity * Fix A2C log_interval default parameter * Rename 'progress' to 'progress_remaining' * Rename 'Models' to 'Algorithms' * Rename 'OnlineActorCriticPolicy' to 'ActorCriticPolicy' * Move static functions out from BaseAlgorithm * Move on/off_policy base algorithms to their own files * Add files for A2C/PPO * Fix docs * Fix pytype * Update documentation on OnPolicyAlgorithm * Add proper docstring for on_policy rollout gathering * Add a bit of clarification on the mlppolicy/cnnpolicy naming * Move static function is_vectorized_policies to utils.py * Checking docstrings, pep8 fixes * Update changelog * Clean changelog * Remove policy warnings for sac/td3 * Add monitor_wrapper for OnPolicyAlgorithm. Clean tb logging variables. Add parameter keywords to OffPolicyAlgorithm super init Co-authored-by: Antonin RAFFIN <antonin.raffin@ensta.org>
56 lines
2.5 KiB
Python
56 lines
2.5 KiB
Python
# Copied from stable_baselines
|
|
import numpy as np
|
|
|
|
from stable_baselines3.common.vec_env import VecEnv
|
|
|
|
|
|
def evaluate_policy(model, env, n_eval_episodes=10, deterministic=True,
                    render=False, callback=None, reward_threshold=None,
                    return_episode_rewards=False):
    """
    Runs policy for ``n_eval_episodes`` episodes and returns average reward.
    This is made to work only with one env.

    :param model: (BaseAlgorithm) The RL agent you want to evaluate.
        Only its ``predict(obs, state=..., deterministic=...)`` method is used.
    :param env: (gym.Env or VecEnv) The gym environment. In the case of a ``VecEnv``
        this must contain only one environment.
    :param n_eval_episodes: (int) Number of episodes to evaluate the agent
    :param deterministic: (bool) Whether to use deterministic or stochastic actions
    :param render: (bool) Whether to render the environment or not
    :param callback: (callable) callback function to do additional checks,
        called after each step as ``callback(locals(), globals())``.
        It runs before the episode length counter is incremented for that step.
    :param reward_threshold: (float) Minimum expected reward per episode,
        this will raise an error if the performance is not met
    :param return_episode_rewards: (bool) If True, a list of reward per episode
        will be returned instead of the mean.
    :return: (float, float) Mean reward per episode, std of reward per episode
        returns ([float], [int]) when ``return_episode_rewards`` is True
    :raises AssertionError: if ``env`` is a ``VecEnv`` holding more than one
        environment, or if ``reward_threshold`` is given and the mean reward
        is not strictly greater than it.
    """
    # Explicit raise instead of ``assert`` so the check survives ``python -O``;
    # AssertionError is kept so existing callers catch the same exception type.
    if isinstance(env, VecEnv) and env.num_envs != 1:
        raise AssertionError("You must pass only one environment when using this function")

    # NOTE: local variable names below are part of the de-facto callback API,
    # since ``locals()`` is handed to ``callback`` -- do not rename them.
    episode_rewards, episode_lengths = [], []
    for _ in range(n_eval_episodes):
        obs = env.reset()
        done, state = False, None
        episode_reward = 0.0
        episode_length = 0
        while not done:
            action, state = model.predict(obs, state=state, deterministic=deterministic)
            obs, reward, done, _info = env.step(action)
            episode_reward += reward
            if callback is not None:
                callback(locals(), globals())
            episode_length += 1
            if render:
                env.render()
        episode_rewards.append(episode_reward)
        episode_lengths.append(episode_length)

    mean_reward = np.mean(episode_rewards)
    std_reward = np.std(episode_rewards)

    if reward_threshold is not None:
        # ``not (a > b)`` rather than ``a <= b``: a NaN mean must still fail,
        # exactly as it did with the original ``assert mean > threshold``.
        if not mean_reward > reward_threshold:
            raise AssertionError('Mean reward below threshold: '
                                 f'{mean_reward:.2f} < {reward_threshold:.2f}')

    if return_episode_rewards:
        return episode_rewards, episode_lengths
    return mean_reward, std_reward
|