mirror of
https://github.com/saymrwulf/stable-baselines3.git
synced 2026-05-14 20:58:03 +00:00
* Fix failing set_env test * Fix test failiing due to deprectation of env.seed * Adjust mean reward threshold in failing test * Fix her test failing due to rng * Change seed and revert reward threshold to 90 * Pin gym version * Make VecEnv compatible with gym seeding change * Revert change to VecEnv reset signature * Change subprocenv seed cmd to call reset instead * Fix type check * Add backward compat * Add `compat_gym_seed` helper * Add goal env checks in env_checker * Add docs on HER requirements for envs * Capture user warning in test with inverted box space * Update ale-py version * Fix randint * Allow noop_max to be zero * Update changelog * Update docker image * Update doc conda env and dockerfile * Custom envs should not have any warnings * Fix test for numpy >= 1.21 * Add check for vectorized compute reward * Bump to gym 0.24 * Fix gym default step docstring * Test downgrading gym * Revert "Test downgrading gym" This reverts commit 0072b77156c006ada8a1d6e26ce347ed85a83eeb. * Fix protobuf error * Fix in dependencies * Fix protobuf dep * Use newest version of cartpole * Update gym * Fix warning * Loosen required scipy version * Scipy no longer needed * Try gym 0.25 * Silence warnings from gym * Filter warnings during tests * Update doc * Update requirements * Add gym 26 compat in vec env * Fixes in envs and tests for gym 0.26+ * Enforce gym 0.26 api * format * Fix formatting * Fix dependencies * Fix syntax * Cleanup doc and warnings * Faster tests * Higher budget for HER perf test (revert prev change) * Fixes and update doc * Fix doc build * Fix breaking change * Fixes for rendering * Rename variables in monitor * update render method for gym 0.26 API backwards compatible (mode argument is allowed) while using the gym 0.26 API (render mode is determined at environment creation) * update tests and docs to new gym render API * undo removal of render modes metatadata check * set rgb_array as default render mode for gym.make * undo changes & raise warning if 
not 'rgb_array' * Fix type check * Remove recursion and fix type checking * Remove hacks for protobuf and gym 0.24 * Fix type annotations * reuse existing render_mode attribute * return tiled images for 'human' render mode * Allow to use opencv for human render, fix typos * Add warning when using non-zero start with Discrete (fixes #1197) * Fix type checking * Bug fixes and handle more cases * Throw proper warnings * Update test * Fix new metadata name * Ignore numpy warnings * Fixes in vec recorder * Global ignore * Filter local warning too * Monkey patch not needed for gym 26 * Add doc of VecEnv vs Gym API * Add render test * Fix return type * Update VecEnv vs Gym API doc * Fix for custom render mode * Fix return type * Fix type checking * check test env test_buffer * skip render check * check env test_dict_env * test_env test_gae * check envs in remaining tests * Update tests * Add warning for Discrete action space with non-zero (#1295) * Fix atari annotation * ignore get_action_meanings [attr-defined] * Fix mypy issues * Add patch for gym/gymnasium transition * Switch to gymnasium * Rely on signature instead of version * More patches * Type ignore because of https://github.com/Farama-Foundation/Gymnasium/pull/39 * Fix doc build * Fix pytype errors * Fix atari requirement * Update env checker due to change in dtype for Discrete * Fix type hint * Convert spaces for saved models * Ignore pytype * Remove gitlab CI * Disable pytype for convert space * Fix undefined info * Fix undefined info * Upgrade shimmy * Fix wrappers type annotation (need PR from Gymnasium) * Fix gymnasium dependency * Fix dependency declaration * Cap pygame version for python 3.7 * Point to master branch (v0.28.0) * Fix: use main not master branch * Rename done to terminated * Fix pygame dependency for python 3.7 * Rename gym to gymnasium * Update Gymnasium * Fix test * Fix tests * Forks don't have access to private variables * Fix linter warnings * Update read the doc env * Fix env checker 
for GoalEnv * Fix import * Update env checker (more info) and fix dtype * Use micromamab for Docker * Update dependencies * Clarify VecEnv doc * Fix Gymnasium version * Copy file only after mamba install * [ci skip] Update docker doc * Polish code * Reformat * Remove deprecated features * Ignore warning * Update doc * Update examples and changelog * Fix type annotation bundle (SAC, TD3, A2C, PPO, base class) (#1436) * Fix SAC type hints, improve DQN ones * Fix A2C and TD3 type hints * Fix PPO type hints * Fix on-policy type hints * Fix base class type annotation, do not use defaults * Update version * Disable mypy for python 3.7 * Rename Gym26StepReturn * Update continuous critic type annotation * Fix pytype complain --------- Co-authored-by: Carlos Luis <carlos.luisgonc@gmail.com> Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com> Co-authored-by: Thomas Lips <37955681+tlpss@users.noreply.github.com> Co-authored-by: tlips <thomas.lips@ugent.be> Co-authored-by: tlpss <thomas17.lips@gmail.com> Co-authored-by: Quentin GALLOUÉDEC <gallouedec.quentin@gmail.com>
232 lines
9.7 KiB
Python
232 lines
9.7 KiB
Python
from copy import deepcopy
|
|
from typing import Tuple
|
|
|
|
import gymnasium as gym
|
|
import numpy as np
|
|
import pytest
|
|
import torch as th
|
|
|
|
from stable_baselines3 import A2C, PPO
|
|
from stable_baselines3.common.distributions import (
|
|
BernoulliDistribution,
|
|
CategoricalDistribution,
|
|
DiagGaussianDistribution,
|
|
MultiCategoricalDistribution,
|
|
SquashedDiagGaussianDistribution,
|
|
StateDependentNoiseDistribution,
|
|
TanhBijector,
|
|
kl_divergence,
|
|
)
|
|
from stable_baselines3.common.utils import set_random_seed
|
|
|
|
# Dimension of the action space used throughout these tests
N_ACTIONS = 2
# Dimension of the latent features fed to the distribution networks
N_FEATURES = 3
# Number of Monte-Carlo samples; large so empirical moments/entropies converge
N_SAMPLES = int(5e6)
|
|
|
|
|
|
def test_bijector():
    """
    Check that TanhBijector squashes actions into [-1, 1]
    and that its inverse recovers the original values.
    """
    raw_actions = th.full((5,), 2.0)
    bijector = TanhBijector()

    squashed = bijector.forward(raw_actions)
    # Squashed actions must stay within the tanh bounds
    assert th.abs(squashed).max() <= 1.0
    # Inverting the squashing should give back the raw actions
    assert th.isclose(TanhBijector.inverse(squashed), raw_actions).all()
|
|
|
|
|
|
@pytest.mark.parametrize("model_class", [A2C, PPO])
def test_squashed_gaussian(model_class):
    """
    Run a short training with a squashed Gaussian policy
    (notably exercises the entropy computation with gSDE).
    """
    model = model_class(
        "MlpPolicy",
        "Pendulum-v1",
        use_sde=True,
        n_steps=64,
        policy_kwargs=dict(squash_output=True),
    )
    model.learn(500)

    # Sample directly from the squashed distribution
    # and verify the bound on the actions
    mean_actions = th.rand(N_SAMPLES, N_ACTIONS)
    dist = SquashedDiagGaussianDistribution(N_ACTIONS)
    _, log_std = dist.proba_distribution_net(N_FEATURES)
    sampled_actions = dist.proba_distribution(mean_actions, log_std).get_actions()
    assert th.abs(sampled_actions).max() <= 1.0
|
|
|
|
|
|
@pytest.fixture()
def dummy_model_distribution_obs_and_actions() -> Tuple[A2C, np.ndarray, np.ndarray]:
    """
    Create a Pendulum-v1 gym env and an A2C model,
    then sample 10 random observations and actions from the env.

    :return: A2C model, random observations, random actions
    """
    env = gym.make("Pendulum-v1")
    model = A2C("MlpPolicy", env, seed=23)
    obs_samples = np.array([env.observation_space.sample() for _ in range(10)])
    action_samples = np.array([env.action_space.sample() for _ in range(10)])
    return model, obs_samples, action_samples
|
|
|
|
|
|
def test_get_distribution(dummy_model_distribution_obs_and_actions):
    """
    evaluate_actions() and get_distribution() must agree
    on both log-probability and entropy.
    """
    model, random_obs, random_actions = dummy_model_distribution_obs_and_actions
    with th.no_grad():
        obs_tensor, _ = model.policy.obs_to_tensor(random_obs)
        action_tensor = th.tensor(random_actions, device=obs_tensor.device).float()
        # Path 1: joint evaluation
        _, log_prob_eval, entropy_eval = model.policy.evaluate_actions(obs_tensor, action_tensor)
        # Path 2: query the distribution directly
        dist = model.policy.get_distribution(obs_tensor)
        log_prob_dist = dist.log_prob(action_tensor)
        entropy_dist = dist.entropy()
    assert entropy_eval is not None
    assert entropy_dist is not None
    assert th.allclose(log_prob_eval, log_prob_dist)
    assert th.allclose(entropy_eval, entropy_dist)
|
|
|
|
|
|
def test_predict_values(dummy_model_distribution_obs_and_actions):
    """
    evaluate_actions() and predict_values() must return
    the same value estimates.
    """
    model, random_obs, random_actions = dummy_model_distribution_obs_and_actions
    with th.no_grad():
        obs_tensor, _ = model.policy.obs_to_tensor(random_obs)
        action_tensor = th.tensor(random_actions, device=obs_tensor.device).float()
        values_from_eval, _, _ = model.policy.evaluate_actions(obs_tensor, action_tensor)
        values_from_predict = model.policy.predict_values(obs_tensor)
    assert th.allclose(values_from_eval, values_from_predict)
|
|
|
|
|
|
def test_sde_distribution():
    """
    Empirical moments of gSDE samples should match
    the analytic mean/scale of the underlying distribution.
    """
    n_actions = 1
    mean_actions = th.full((N_SAMPLES, n_actions), 0.1)
    latent_state = th.full((N_SAMPLES, N_FEATURES), 0.3)
    dist = StateDependentNoiseDistribution(n_actions, full_std=True, squash_output=False)

    set_random_seed(1)
    _, log_std = dist.proba_distribution_net(N_FEATURES)
    # gSDE requires pre-sampled exploration weights
    dist.sample_weights(log_std, batch_size=N_SAMPLES)

    dist = dist.proba_distribution(mean_actions, log_std, latent_state)
    sampled_actions = dist.get_actions()

    # Compare empirical statistics to the analytic ones (Monte-Carlo tolerance)
    assert th.allclose(sampled_actions.mean(), dist.distribution.mean.mean(), rtol=2e-3)
    assert th.allclose(sampled_actions.std(), dist.distribution.scale.mean(), rtol=2e-3)
|
|
|
|
|
|
# TODO: analytical form for squashed Gaussian?
@pytest.mark.parametrize(
    "dist",
    [
        DiagGaussianDistribution(N_ACTIONS),
        StateDependentNoiseDistribution(N_ACTIONS, squash_output=False),
    ],
)
def test_entropy(dist):
    """
    The differential entropy can be approximated by the mean
    negative log likelihood of the distribution's own samples.
    """
    set_random_seed(1)
    mean_actions = th.rand(1, N_ACTIONS).repeat(N_SAMPLES, 1)
    _, log_std = dist.proba_distribution_net(N_FEATURES, log_std_init=th.log(th.tensor(0.2)))

    if isinstance(dist, DiagGaussianDistribution):
        dist = dist.proba_distribution(mean_actions, log_std)
    else:
        # gSDE needs a latent state and pre-sampled exploration weights
        latent_state = th.rand(1, N_FEATURES).repeat(N_SAMPLES, 1)
        dist.sample_weights(log_std, batch_size=N_SAMPLES)
        dist = dist.proba_distribution(mean_actions, log_std, latent_state)

    sampled_actions = dist.get_actions()
    entropy = dist.entropy()
    neg_log_likelihood = -dist.log_prob(sampled_actions)
    # mean negative log likelihood == differential entropy (up to MC error)
    assert th.allclose(entropy.mean(), neg_log_likelihood.mean(), rtol=5e-3)
|
|
|
|
|
|
# (distribution instance, number of logits it expects) pairs
# for the categorical-style entropy test below
categorical_params = [
    (CategoricalDistribution(N_ACTIONS), N_ACTIONS),
    # MultiCategorical needs one logit per sub-action: 2 + 3 = 5
    (MultiCategoricalDistribution([2, 3]), sum([2, 3])),
    (BernoulliDistribution(N_ACTIONS), N_ACTIONS),
]
|
|
|
|
|
|
@pytest.mark.parametrize("dist, CAT_ACTIONS", categorical_params)
def test_categorical(dist, CAT_ACTIONS):
    """
    For categorical-style distributions, the entropy can be approximated
    by the mean negative log likelihood of the distribution's own samples.
    """
    set_random_seed(1)
    action_logits = th.rand(N_SAMPLES, CAT_ACTIONS)
    dist = dist.proba_distribution(action_logits)
    sampled_actions = dist.get_actions()
    entropy = dist.entropy()
    neg_log_likelihood = -dist.log_prob(sampled_actions)
    # mean negative log likelihood == entropy (up to MC error)
    assert th.allclose(entropy.mean(), neg_log_likelihood.mean(), rtol=5e-3)
|
|
|
|
|
|
@pytest.mark.parametrize(
    "dist_type",
    [
        # One already-initialized instance per distribution class; each is used
        # both as-is (Test 1) and as a template to build fresh instances (Test 2)
        BernoulliDistribution(N_ACTIONS).proba_distribution(th.rand(N_ACTIONS)),
        CategoricalDistribution(N_ACTIONS).proba_distribution(th.rand(N_ACTIONS)),
        DiagGaussianDistribution(N_ACTIONS).proba_distribution(th.rand(N_ACTIONS), th.rand(N_ACTIONS)),
        MultiCategoricalDistribution([N_ACTIONS, N_ACTIONS]).proba_distribution(th.rand(1, sum([N_ACTIONS, N_ACTIONS]))),
        SquashedDiagGaussianDistribution(N_ACTIONS).proba_distribution(th.rand(N_ACTIONS), th.rand(N_ACTIONS)),
        StateDependentNoiseDistribution(N_ACTIONS).proba_distribution(
            th.rand(N_ACTIONS), th.rand([N_ACTIONS, N_ACTIONS]), th.rand([N_ACTIONS, N_ACTIONS])
        ),
    ],
)
def test_kl_divergence(dist_type):
    """
    Check kl_divergence() for every distribution class against three references:
    1. KL(p || p) == 0
    2. the unbiased Monte-Carlo approximation E[log p - log q]
    3. (Bernoulli only) a hand-computed closed-form value
    """
    set_random_seed(8)
    # Test 1: same distribution should have KL Div = 0
    dist1 = dist_type
    dist2 = dist_type
    # PyTorch implementation of kl_divergence doesn't sum across dimensions
    assert th.allclose(kl_divergence(dist1, dist2).sum(), th.tensor(0.0))

    # Test 2: KL Div = E(Unbiased approx KL Div)
    # Build two *distinct* distributions of the same class with N_SAMPLES
    # batched parameters, so the MC estimate of the KL converges.
    # NOTE(review): branch order matters — SquashedDiagGaussian appears to be
    # handled by the DiagGaussian branch via the isinstance `or`; confirm the
    # class hierarchy before reordering these checks.
    if isinstance(dist_type, CategoricalDistribution):
        dist1 = dist_type.proba_distribution(th.rand(N_ACTIONS).repeat(N_SAMPLES, 1))
        # deepcopy needed to assign new memory to new distribution instance
        dist2 = deepcopy(dist_type).proba_distribution(th.rand(N_ACTIONS).repeat(N_SAMPLES, 1))
    elif isinstance(dist_type, DiagGaussianDistribution) or isinstance(dist_type, SquashedDiagGaussianDistribution):
        mean_actions1 = th.rand(1).repeat(N_SAMPLES, 1)
        log_std1 = th.rand(1).repeat(N_SAMPLES, 1)
        mean_actions2 = th.rand(1).repeat(N_SAMPLES, 1)
        log_std2 = th.rand(1).repeat(N_SAMPLES, 1)
        dist1 = dist_type.proba_distribution(mean_actions1, log_std1)
        dist2 = deepcopy(dist_type).proba_distribution(mean_actions2, log_std2)
    elif isinstance(dist_type, BernoulliDistribution):
        dist1 = dist_type.proba_distribution(th.rand(1).repeat(N_SAMPLES, 1))
        dist2 = deepcopy(dist_type).proba_distribution(th.rand(1).repeat(N_SAMPLES, 1))
    elif isinstance(dist_type, MultiCategoricalDistribution):
        dist1 = dist_type.proba_distribution(th.rand(1, sum([N_ACTIONS, N_ACTIONS])).repeat(N_SAMPLES, 1))
        dist2 = deepcopy(dist_type).proba_distribution(th.rand(1, sum([N_ACTIONS, N_ACTIONS])).repeat(N_SAMPLES, 1))
    elif isinstance(dist_type, StateDependentNoiseDistribution):
        # gSDE: rebuild 1-dim distributions sharing the same log_std and
        # exploration weights, differing only in their mean actions
        dist1 = StateDependentNoiseDistribution(1)
        dist2 = deepcopy(dist1)
        state = th.rand(1, N_FEATURES).repeat(N_SAMPLES, 1)
        mean_actions1 = th.rand(1).repeat(N_SAMPLES, 1)
        mean_actions2 = th.rand(1).repeat(N_SAMPLES, 1)
        _, log_std = dist1.proba_distribution_net(N_FEATURES, log_std_init=th.log(th.tensor(0.2)))
        dist1.sample_weights(log_std, batch_size=N_SAMPLES)
        dist2.sample_weights(log_std, batch_size=N_SAMPLES)
        dist1 = dist1.proba_distribution(mean_actions1, log_std, state)
        dist2 = dist2.proba_distribution(mean_actions2, log_std, state)

    # Analytic KL vs Monte-Carlo estimate E[log p(a) - log q(a)], a ~ p
    full_kl_div = kl_divergence(dist1, dist2).mean(dim=0)
    actions = dist1.get_actions()
    approx_kl_div = (dist1.log_prob(actions) - dist2.log_prob(actions)).mean(dim=0)

    assert th.allclose(full_kl_div, approx_kl_div, rtol=5e-2)

    # Test 3 Sanity test with easy Bernoulli distribution
    if isinstance(dist_type, BernoulliDistribution):
        dist1 = BernoulliDistribution(1).proba_distribution(th.tensor([0.3]))
        dist2 = BernoulliDistribution(1).proba_distribution(th.tensor([0.65]))

        full_kl_div = kl_divergence(dist1, dist2)

        # Closed form: sum over both outcomes of p(a) * (log p(a) - log q(a))
        actions = th.tensor([0.0, 1.0])
        ad_hoc_kl = th.sum(
            th.exp(dist1.distribution.log_prob(actions))
            * (dist1.distribution.log_prob(actions) - dist2.distribution.log_prob(actions))
        )

        assert th.allclose(full_kl_div, ad_hoc_kl)
|