mirror of
https://github.com/saymrwulf/stable-baselines3.git
synced 2026-05-14 20:58:03 +00:00
* Fix failing set_env test * Fix test failiing due to deprectation of env.seed * Adjust mean reward threshold in failing test * Fix her test failing due to rng * Change seed and revert reward threshold to 90 * Pin gym version * Make VecEnv compatible with gym seeding change * Revert change to VecEnv reset signature * Change subprocenv seed cmd to call reset instead * Fix type check * Add backward compat * Add `compat_gym_seed` helper * Add goal env checks in env_checker * Add docs on HER requirements for envs * Capture user warning in test with inverted box space * Update ale-py version * Fix randint * Allow noop_max to be zero * Update changelog * Update docker image * Update doc conda env and dockerfile * Custom envs should not have any warnings * Fix test for numpy >= 1.21 * Add check for vectorized compute reward * Bump to gym 0.24 * Fix gym default step docstring * Test downgrading gym * Revert "Test downgrading gym" This reverts commit 0072b77156c006ada8a1d6e26ce347ed85a83eeb. * Fix protobuf error * Fix in dependencies * Fix protobuf dep * Use newest version of cartpole * Update gym * Fix warning * Loosen required scipy version * Scipy no longer needed * Try gym 0.25 * Silence warnings from gym * Filter warnings during tests * Update doc * Update requirements * Add gym 26 compat in vec env * Fixes in envs and tests for gym 0.26+ * Enforce gym 0.26 api * format * Fix formatting * Fix dependencies * Fix syntax * Cleanup doc and warnings * Faster tests * Higher budget for HER perf test (revert prev change) * Fixes and update doc * Fix doc build * Fix breaking change * Fixes for rendering * Rename variables in monitor * update render method for gym 0.26 API backwards compatible (mode argument is allowed) while using the gym 0.26 API (render mode is determined at environment creation) * update tests and docs to new gym render API * undo removal of render modes metatadata check * set rgb_array as default render mode for gym.make * undo changes & raise warning if 
not 'rgb_array' * Fix type check * Remove recursion and fix type checking * Remove hacks for protobuf and gym 0.24 * Fix type annotations * reuse existing render_mode attribute * return tiled images for 'human' render mode * Allow to use opencv for human render, fix typos * Add warning when using non-zero start with Discrete (fixes #1197) * Fix type checking * Bug fixes and handle more cases * Throw proper warnings * Update test * Fix new metadata name * Ignore numpy warnings * Fixes in vec recorder * Global ignore * Filter local warning too * Monkey patch not needed for gym 26 * Add doc of VecEnv vs Gym API * Add render test * Fix return type * Update VecEnv vs Gym API doc * Fix for custom render mode * Fix return type * Fix type checking * check test env test_buffer * skip render check * check env test_dict_env * test_env test_gae * check envs in remaining tests * Update tests * Add warning for Discrete action space with non-zero (#1295) * Fix atari annotation * ignore get_action_meanings [attr-defined] * Fix mypy issues * Add patch for gym/gymnasium transition * Switch to gymnasium * Rely on signature instead of version * More patches * Type ignore because of https://github.com/Farama-Foundation/Gymnasium/pull/39 * Fix doc build * Fix pytype errors * Fix atari requirement * Update env checker due to change in dtype for Discrete * Fix type hint * Convert spaces for saved models * Ignore pytype * Remove gitlab CI * Disable pytype for convert space * Fix undefined info * Fix undefined info * Upgrade shimmy * Fix wrappers type annotation (need PR from Gymnasium) * Fix gymnasium dependency * Fix dependency declaration * Cap pygame version for python 3.7 * Point to master branch (v0.28.0) * Fix: use main not master branch * Rename done to terminated * Fix pygame dependency for python 3.7 * Rename gym to gymnasium * Update Gymnasium * Fix test * Fix tests * Forks don't have access to private variables * Fix linter warnings * Update read the doc env * Fix env checker 
for GoalEnv * Fix import * Update env checker (more info) and fix dtype * Use micromamab for Docker * Update dependencies * Clarify VecEnv doc * Fix Gymnasium version * Copy file only after mamba install * [ci skip] Update docker doc * Polish code * Reformat * Remove deprecated features * Ignore warning * Update doc * Update examples and changelog * Fix type annotation bundle (SAC, TD3, A2C, PPO, base class) (#1436) * Fix SAC type hints, improve DQN ones * Fix A2C and TD3 type hints * Fix PPO type hints * Fix on-policy type hints * Fix base class type annotation, do not use defaults * Update version * Disable mypy for python 3.7 * Rename Gym26StepReturn * Update continuous critic type annotation * Fix pytype complain --------- Co-authored-by: Carlos Luis <carlos.luisgonc@gmail.com> Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com> Co-authored-by: Thomas Lips <37955681+tlpss@users.noreply.github.com> Co-authored-by: tlips <thomas.lips@ugent.be> Co-authored-by: tlpss <thomas17.lips@gmail.com> Co-authored-by: Quentin GALLOUÉDEC <gallouedec.quentin@gmail.com>
236 lines
7.2 KiB
Python
236 lines
7.2 KiB
Python
import gymnasium as gym
|
|
import numpy as np
|
|
import pytest
|
|
|
|
from stable_baselines3 import A2C, DDPG, DQN, PPO, SAC, TD3
|
|
from stable_baselines3.common.env_util import make_vec_env
|
|
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
|
|
|
|
# Shared Gaussian action noise (mean 0, std 0.1, 1-D action) reused by the
# ``action_noise`` parametrization of ``test_deterministic_pg`` below.
normal_action_noise = NormalActionNoise(np.zeros(1), 0.1 * np.ones(1))
|
|
|
|
|
|
@pytest.mark.parametrize("model_class", [TD3, DDPG])
@pytest.mark.parametrize(
    "action_noise",
    [normal_action_noise, OrnsteinUhlenbeckActionNoise(np.zeros(1), 0.1 * np.ones(1))],
)
def test_deterministic_pg(model_class, action_noise):
    """Smoke-test DDPG and its TD3 variant with both supported noise types.

    Only checks that construction and a short ``learn()`` run complete;
    no training-quality assertion is made.
    """
    # Small network and replay buffer keep the test fast.
    settings = dict(
        policy_kwargs=dict(net_arch=[64, 64]),
        learning_starts=100,
        verbose=1,
        buffer_size=250,
        action_noise=action_noise,
    )
    agent = model_class("MlpPolicy", "Pendulum-v1", **settings)
    agent.learn(total_timesteps=200)
|
|
|
|
|
|
@pytest.mark.parametrize("env_id", ["CartPole-v1", "Pendulum-v1"])
def test_a2c(env_id):
    """Smoke-test A2C on one discrete and one continuous control task."""
    agent = A2C(
        "MlpPolicy",
        env_id,
        seed=0,
        policy_kwargs=dict(net_arch=[16]),
        verbose=1,
    )
    agent.learn(total_timesteps=64)
|
|
|
|
|
|
@pytest.mark.parametrize("model_class", [A2C, PPO])
@pytest.mark.parametrize("normalize_advantage", [False, True])
def test_advantage_normalization(model_class, normalize_advantage):
    """Check that on-policy algorithms run with advantage normalization on and off."""
    agent = model_class(
        "MlpPolicy",
        "CartPole-v1",
        n_steps=64,
        normalize_advantage=normalize_advantage,
    )
    agent.learn(64)
|
|
|
|
|
|
@pytest.mark.parametrize("env_id", ["CartPole-v1", "Pendulum-v1"])
@pytest.mark.parametrize("clip_range_vf", [None, 0.2, -0.2])
def test_ppo(env_id, clip_range_vf):
    """Smoke-test PPO; a negative value-function clip range must be rejected."""
    if clip_range_vf is not None and clip_range_vf < 0:
        # Invalid (negative) clip range: construction must fail.
        with pytest.raises(AssertionError):
            PPO(
                "MlpPolicy",
                env_id,
                seed=0,
                policy_kwargs=dict(net_arch=[16]),
                verbose=1,
                clip_range_vf=clip_range_vf,
            )
        return

    agent = PPO(
        "MlpPolicy",
        env_id,
        n_steps=512,
        seed=0,
        policy_kwargs=dict(net_arch=[16]),
        verbose=1,
        clip_range_vf=clip_range_vf,
        n_epochs=2,
    )
    agent.learn(total_timesteps=1000)
|
|
|
|
|
|
@pytest.mark.parametrize("ent_coef", ["auto", 0.01, "auto_0.01"])
def test_sac(ent_coef):
    """Smoke-test SAC with fixed, auto, and auto-with-initial-value entropy coefficients."""
    # Zero-std noise: exercises the action-noise code path without perturbing actions.
    settings = dict(
        policy_kwargs=dict(net_arch=[64, 64]),
        learning_starts=100,
        verbose=1,
        buffer_size=250,
        ent_coef=ent_coef,
        action_noise=NormalActionNoise(np.zeros(1), np.zeros(1)),
    )
    agent = SAC("MlpPolicy", "Pendulum-v1", **settings)
    agent.learn(total_timesteps=200)
|
|
|
|
|
|
@pytest.mark.parametrize("n_critics", [1, 3])
def test_n_critics(n_critics):
    """Smoke-test SAC with a non-default number of critics.

    (For TD3, n_critics=1 corresponds to DDPG.)
    """
    agent = SAC(
        "MlpPolicy",
        "Pendulum-v1",
        policy_kwargs=dict(net_arch=[64, 64], n_critics=n_critics),
        learning_starts=100,
        buffer_size=10000,
        verbose=1,
    )
    agent.learn(total_timesteps=200)
|
|
|
|
|
|
def test_dqn():
    """Smoke-test DQN on CartPole with a small network and buffer."""
    settings = dict(
        policy_kwargs=dict(net_arch=[64, 64]),
        learning_starts=100,
        buffer_size=500,
        learning_rate=3e-4,
        verbose=1,
    )
    agent = DQN("MlpPolicy", "CartPole-v1", **settings)
    agent.learn(total_timesteps=200)
|
|
|
|
|
|
@pytest.mark.parametrize("train_freq", [4, (4, "step"), (1, "episode")])
def test_train_freq(tmp_path, train_freq):
    """Check that valid ``train_freq`` values survive training, save, and reload."""
    save_path = tmp_path / "test_save.zip"
    config = dict(
        policy_kwargs=dict(net_arch=[64, 64], n_critics=1),
        learning_starts=100,
        buffer_size=10000,
        verbose=1,
        train_freq=train_freq,
    )
    agent = SAC("MlpPolicy", "Pendulum-v1", **config)
    agent.learn(total_timesteps=150)
    agent.save(save_path)

    env = agent.get_env()
    # Reload with the stored train_freq ...
    agent = SAC.load(save_path, env=env)
    agent.learn(total_timesteps=150)
    # ... and with an explicit override passed at load time.
    agent = SAC.load(save_path, train_freq=train_freq, env=env)
    agent.learn(total_timesteps=150)
|
|
|
|
|
|
@pytest.mark.parametrize("train_freq", ["4", ("1", "episode"), "non_sense", (1, "close")])
def test_train_freq_fail(train_freq):
    """Invalid ``train_freq`` values must raise a ``ValueError``.

    The error may surface at construction or at the start of training,
    so ``learn`` is kept inside the ``raises`` context.
    """
    with pytest.raises(ValueError):
        agent = SAC(
            "MlpPolicy",
            "Pendulum-v1",
            policy_kwargs=dict(net_arch=[64, 64], n_critics=1),
            learning_starts=100,
            buffer_size=10000,
            verbose=1,
            train_freq=train_freq,
        )
        agent.learn(total_timesteps=250)
|
|
|
|
|
|
@pytest.mark.parametrize("model_class", [SAC, TD3, DDPG, DQN])
def test_offpolicy_multi_env(model_class):
    """Off-policy algorithms must train on a vectorized (n_envs=2) environment.

    Also checks that ``gradient_steps=-1`` performs exactly one update per
    collected transition.
    """
    extra_kwargs = {}
    if model_class is DQN:
        env_id = "CartPole-v1"
        policy_kwargs = dict(net_arch=[64])
    else:
        env_id = "Pendulum-v1"
        policy_kwargs = dict(net_arch=[64], n_critics=1)
        # Check auto-conversion to VectorizedActionNoise
        extra_kwargs = dict(action_noise=NormalActionNoise(np.zeros(1), 0.1 * np.ones(1)))
        if model_class == SAC:
            extra_kwargs["use_sde"] = True
            extra_kwargs["sde_sample_freq"] = 4

    def make_env():
        # Short time limit so the timeout-handling code path is exercised.
        return gym.wrappers.TimeLimit(gym.make(env_id), 50)

    vec_env = make_vec_env(make_env, n_envs=2)
    agent = model_class(
        "MlpPolicy",
        vec_env,
        policy_kwargs=policy_kwargs,
        learning_starts=100,
        buffer_size=10000,
        verbose=0,
        train_freq=5,
        **extra_kwargs,
    )
    agent.learn(total_timesteps=150)

    # gradient_steps=-1: as many gradient steps as transitions collected.
    train_freq = 3
    agent = model_class(
        "MlpPolicy",
        vec_env,
        policy_kwargs=policy_kwargs,
        learning_starts=0,
        buffer_size=10000,
        verbose=0,
        train_freq=train_freq,
        gradient_steps=-1,
        **extra_kwargs,
    )
    agent.learn(total_timesteps=train_freq)
    assert agent.logger.name_to_value["train/n_updates"] == train_freq * vec_env.num_envs
|
|
|
|
|
|
def test_warn_dqn_multi_env():
    """DQN with multiple envs and a small target-update interval must warn the user."""
    vec_env = make_vec_env("CartPole-v1", n_envs=2)
    with pytest.warns(UserWarning, match="The number of environments used is greater"):
        DQN(
            "MlpPolicy",
            vec_env,
            buffer_size=100,
            target_update_interval=1,
        )
|
|
|
|
|
|
def test_ppo_warnings():
    """PPO must error or warn on problematic rollout-buffer sizes."""
    # A single step makes advantage normalization produce NaN,
    # so construction must fail outright.
    with pytest.raises(AssertionError):
        PPO("MlpPolicy", "Pendulum-v1", n_steps=1)

    # With normalize_advantage=False, a batch size of 1 is legal.
    agent = PPO("MlpPolicy", "Pendulum-v1", n_steps=1, batch_size=1, normalize_advantage=False)
    agent.learn(4)

    # 64 steps with batch_size=63 leaves a truncated mini-batch of size 1.
    # torch.std of a length-1 tensor is NaN, so PPO warns and automatically
    # deactivates advantage normalization for that batch.
    with pytest.warns(UserWarning, match="there will be a truncated mini-batch of size 1"):
        agent = PPO("MlpPolicy", "Pendulum-v1", n_steps=64, batch_size=63, verbose=1)
        agent.learn(64)

    loss = agent.logger.name_to_value["train/loss"]
    assert loss > 0
    # NaN != NaN, so a plain comparison would not catch it — check explicitly.
    assert not np.isnan(loss)
|