stable-baselines3/tests/test_vec_normalize.py

183 lines
6 KiB
Python
Raw Normal View History

2019-09-20 13:18:25 +00:00
import gym
import numpy as np
import pytest
2019-09-20 13:18:25 +00:00
2020-05-05 13:02:35 +00:00
from stable_baselines3 import SAC, TD3
from stable_baselines3.common.running_mean_std import RunningMeanStd
from stable_baselines3.common.vec_env import (
DummyVecEnv,
VecFrameStack,
VecNormalize,
sync_envs_normalization,
unwrap_vec_normalize,
)
2019-09-20 13:18:25 +00:00
ENV_ID = "Pendulum-v0"
2019-09-20 13:18:25 +00:00
2020-03-12 10:12:10 +00:00
def make_env():
return gym.make(ENV_ID)
2020-03-12 10:12:10 +00:00
def check_rms_equal(rmsa, rmsb):
assert np.all(rmsa.mean == rmsb.mean)
assert np.all(rmsa.var == rmsb.var)
assert np.all(rmsa.count == rmsb.count)
def check_vec_norm_equal(norma, normb):
assert norma.observation_space == normb.observation_space
assert norma.action_space == normb.action_space
assert norma.num_envs == normb.num_envs
check_rms_equal(norma.obs_rms, normb.obs_rms)
check_rms_equal(norma.ret_rms, normb.ret_rms)
assert norma.clip_obs == normb.clip_obs
assert norma.clip_reward == normb.clip_reward
assert norma.norm_obs == normb.norm_obs
assert norma.norm_reward == normb.norm_reward
assert np.all(norma.ret == normb.ret)
assert norma.gamma == normb.gamma
assert norma.epsilon == normb.epsilon
assert norma.training == normb.training
2020-03-12 10:12:10 +00:00
def _make_warmstart_cartpole():
"""Warm-start VecNormalize by stepping through CartPole"""
venv = DummyVecEnv([lambda: gym.make("CartPole-v1")])
venv = VecNormalize(venv)
venv.reset()
venv.get_original_obs()
for _ in range(100):
actions = [venv.action_space.sample()]
venv.step(actions)
return venv
2019-09-20 13:18:25 +00:00
def test_runningmeanstd():
"""Test RunningMeanStd object"""
for (x_1, x_2, x_3) in [
(np.random.randn(3), np.random.randn(4), np.random.randn(5)),
(np.random.randn(3, 2), np.random.randn(4, 2), np.random.randn(5, 2)),
]:
2019-09-20 13:18:25 +00:00
rms = RunningMeanStd(epsilon=0.0, shape=x_1.shape[1:])
x_cat = np.concatenate([x_1, x_2, x_3], axis=0)
moments_1 = [x_cat.mean(axis=0), x_cat.var(axis=0)]
rms.update(x_1)
rms.update(x_2)
rms.update(x_3)
moments_2 = [rms.mean, rms.var]
assert np.allclose(moments_1, moments_2)
Implement DQN (#28) * Created DQN template according to the paper. Next steps: - Create Policy - Complete Training - Debug * Changed Base Class * refactor save, to be consistence with overriding the excluded_save_params function. Do not try to exclude the parameters twice. * Added simple DQN policy * Finished learn and train function - missing correct loss computation * changed collect_rollouts to work with discrete space * moved discrete space collect_rollouts to dqn * basic dqn working * deleted SDE related code * added gradient clipping and moved greedy policy to policy * changed policy to implement target network and added soft update(in fact standart tau is 1 so hard update) * fixed policy setup * rebase target_update_intervall on _n_updates * adapted all tests all tests passing * Move to stable-baseline3 * Fixes for DQN * Fix tests + add CNNPolicy * Allow any optimizer for DQN * added some util functions to create a arbitrary linear schedule, fixed pickle problem with old exploration schedule * more documentation * changed buffer dtype * refactor and document * Added Sphinx Documentation Updated changelog.rst * removed custom collect_rollouts as it is no longer necessary * Implemented suggestions to clean code and documentation. * extracted some functions on tests to reduce duplicated code * added support for exploration_fraction * Fixed exploration_fraction * Added documentation * Fixed get_linear_fn -> proper progress scaling * Merged master * Added nature reference * Changed default parameters to https://www.nature.com/articles/nature14236/tables/1 * Fixed n_updates to be incremented correctly * Correct train_freq * Doc update * added special parameter for DQN in tests * different fix for test_discrete * Update docs/modules/dqn.rst Co-authored-by: Antonin RAFFIN <antonin.raffin@ensta.org> * Update docs/modules/dqn.rst Co-authored-by: Antonin RAFFIN <antonin.raffin@ensta.org> * Update docs/modules/dqn.rst Co-authored-by: Antonin RAFFIN <antonin.raffin@ensta.org> * Added RMSProp in optimizer_kwargs, as described in nature paper * Exploration fraction is inverse of 50.000.000 (total frames) / 1.000.000 (frames with linear schedule) according to nature paper * Changelog update for buffer dtype * standard exlude parameters should be always excluded to assure proper saving only if intentionally included by ``include`` parameter * slightly more iterations on test_discrete to pass the test * added param use_rms_prop instead of mutable default argument * forgot alpha * using huber loss, adam and learning rate 1e-4 * account for train_freq in update_target_network * Added memory check for both buffers * Doc updated for buffer allocation * Added psutil Requirement * Adapted test_identity.py * Fixes with new SB3 version * Fix for tensorboard name * Convert assert to warning and fix tests * Refactor off-policy algorithms * Fixes * test: remove next_obs in replay buffer * Update changelog * Fix tests and use tmp_path where possible * Fix sampling bug in buffer * Do not store next obs on episode termination * Fix replay buffer sampling * Update comment * moved epsilon from policy to model * Update predict method * Update atari wrappers to match SB2 * Minor edit in the buffers * Update changelog * Merge branch 'master' into dqn * Update DQN to new structure * Fix tests and remove hardcoded path * Fix for DQN * Disable memory efficient replay buffer by default * Fix docstring * Add tests for memory efficient buffer * Update changelog * Split collect rollout * Move target update outside `train()` for DQN * Update changelog * Update linear schedule doc * Cleanup DQN code * Minor edit * Update version and docker images Co-authored-by: Antonin RAFFIN <antonin.raffin@ensta.org>
2020-06-29 09:16:54 +00:00
def test_vec_env(tmp_path):
2019-09-20 13:18:25 +00:00
"""Test VecNormalize Object"""
clip_obs = 0.5
clip_reward = 5.0
2019-09-20 13:18:25 +00:00
orig_venv = DummyVecEnv([make_env])
norm_venv = VecNormalize(orig_venv, norm_obs=True, norm_reward=True, clip_obs=clip_obs, clip_reward=clip_reward)
_, done = norm_venv.reset(), [False]
2019-09-20 13:18:25 +00:00
while not done[0]:
actions = [norm_venv.action_space.sample()]
obs, rew, done, _ = norm_venv.step(actions)
assert np.max(np.abs(obs)) <= clip_obs
assert np.max(np.abs(rew)) <= clip_reward
Implement DQN (#28) * Created DQN template according to the paper. Next steps: - Create Policy - Complete Training - Debug * Changed Base Class * refactor save, to be consistence with overriding the excluded_save_params function. Do not try to exclude the parameters twice. * Added simple DQN policy * Finished learn and train function - missing correct loss computation * changed collect_rollouts to work with discrete space * moved discrete space collect_rollouts to dqn * basic dqn working * deleted SDE related code * added gradient clipping and moved greedy policy to policy * changed policy to implement target network and added soft update(in fact standart tau is 1 so hard update) * fixed policy setup * rebase target_update_intervall on _n_updates * adapted all tests all tests passing * Move to stable-baseline3 * Fixes for DQN * Fix tests + add CNNPolicy * Allow any optimizer for DQN * added some util functions to create a arbitrary linear schedule, fixed pickle problem with old exploration schedule * more documentation * changed buffer dtype * refactor and document * Added Sphinx Documentation Updated changelog.rst * removed custom collect_rollouts as it is no longer necessary * Implemented suggestions to clean code and documentation. * extracted some functions on tests to reduce duplicated code * added support for exploration_fraction * Fixed exploration_fraction * Added documentation * Fixed get_linear_fn -> proper progress scaling * Merged master * Added nature reference * Changed default parameters to https://www.nature.com/articles/nature14236/tables/1 * Fixed n_updates to be incremented correctly * Correct train_freq * Doc update * added special parameter for DQN in tests * different fix for test_discrete * Update docs/modules/dqn.rst Co-authored-by: Antonin RAFFIN <antonin.raffin@ensta.org> * Update docs/modules/dqn.rst Co-authored-by: Antonin RAFFIN <antonin.raffin@ensta.org> * Update docs/modules/dqn.rst Co-authored-by: Antonin RAFFIN <antonin.raffin@ensta.org> * Added RMSProp in optimizer_kwargs, as described in nature paper * Exploration fraction is inverse of 50.000.000 (total frames) / 1.000.000 (frames with linear schedule) according to nature paper * Changelog update for buffer dtype * standard exlude parameters should be always excluded to assure proper saving only if intentionally included by ``include`` parameter * slightly more iterations on test_discrete to pass the test * added param use_rms_prop instead of mutable default argument * forgot alpha * using huber loss, adam and learning rate 1e-4 * account for train_freq in update_target_network * Added memory check for both buffers * Doc updated for buffer allocation * Added psutil Requirement * Adapted test_identity.py * Fixes with new SB3 version * Fix for tensorboard name * Convert assert to warning and fix tests * Refactor off-policy algorithms * Fixes * test: remove next_obs in replay buffer * Update changelog * Fix tests and use tmp_path where possible * Fix sampling bug in buffer * Do not store next obs on episode termination * Fix replay buffer sampling * Update comment * moved epsilon from policy to model * Update predict method * Update atari wrappers to match SB2 * Minor edit in the buffers * Update changelog * Merge branch 'master' into dqn * Update DQN to new structure * Fix tests and remove hardcoded path * Fix for DQN * Disable memory efficient replay buffer by default * Fix docstring * Add tests for memory efficient buffer * Update changelog * Split collect rollout * Move target update outside `train()` for DQN * Update changelog * Update linear schedule doc * Cleanup DQN code * Minor edit * Update version and docker images Co-authored-by: Antonin RAFFIN <antonin.raffin@ensta.org>
2020-06-29 09:16:54 +00:00
path = tmp_path / "vec_normalize"
norm_venv.save(path)
deserialized = VecNormalize.load(path, venv=orig_venv)
check_vec_norm_equal(norm_venv, deserialized)
def test_get_original():
venv = _make_warmstart_cartpole()
for _ in range(3):
actions = [venv.action_space.sample()]
obs, rewards, _, _ = venv.step(actions)
obs = obs[0]
orig_obs = venv.get_original_obs()[0]
rewards = rewards[0]
orig_rewards = venv.get_original_reward()[0]
assert np.all(orig_rewards == 1)
assert orig_obs.shape == obs.shape
assert orig_rewards.dtype == rewards.dtype
assert not np.array_equal(orig_obs, obs)
assert not np.array_equal(orig_rewards, rewards)
np.testing.assert_allclose(venv.normalize_obs(orig_obs), obs)
np.testing.assert_allclose(venv.normalize_reward(orig_rewards), rewards)
def test_normalize_external():
venv = _make_warmstart_cartpole()
rewards = np.array([1, 1])
norm_rewards = venv.normalize_reward(rewards)
assert norm_rewards.shape == rewards.shape
# Episode return is almost always >= 1 in CartPole. So reward should shrink.
assert np.all(norm_rewards < 1)
2019-11-14 13:35:00 +00:00
2020-03-23 13:48:38 +00:00
@pytest.mark.parametrize("model_class", [SAC, TD3])
2019-11-14 13:35:00 +00:00
def test_offpolicy_normalization(model_class):
env = DummyVecEnv([make_env])
env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.0, clip_reward=10.0)
2019-11-14 13:35:00 +00:00
eval_env = DummyVecEnv([make_env])
eval_env = VecNormalize(eval_env, training=False, norm_obs=True, norm_reward=False, clip_obs=10.0, clip_reward=10.0)
2019-11-14 13:35:00 +00:00
model = model_class("MlpPolicy", env, verbose=1, policy_kwargs=dict(net_arch=[64]))
model.learn(total_timesteps=1000, eval_env=eval_env, eval_freq=500)
2020-02-12 10:34:29 +00:00
# Check getter
assert isinstance(model.get_vec_normalize_env(), VecNormalize)
def test_sync_vec_normalize():
env = DummyVecEnv([make_env])
2020-03-12 14:34:35 +00:00
assert unwrap_vec_normalize(env) is None
env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=100.0, clip_reward=100.0)
2020-03-12 14:34:35 +00:00
assert isinstance(unwrap_vec_normalize(env), VecNormalize)
env = VecFrameStack(env, 1)
2020-03-12 14:34:35 +00:00
assert isinstance(unwrap_vec_normalize(env), VecNormalize)
eval_env = DummyVecEnv([make_env])
eval_env = VecNormalize(eval_env, training=False, norm_obs=True, norm_reward=True, clip_obs=100.0, clip_reward=100.0)
eval_env = VecFrameStack(eval_env, 1)
env.seed(0)
env.action_space.seed(0)
env.reset()
# Initialize running mean
latest_reward = None
for _ in range(100):
_, latest_reward, _, _ = env.step([env.action_space.sample()])
# Check that unnormalized reward is same as original reward
original_latest_reward = env.get_original_reward()
assert np.allclose(original_latest_reward, env.unnormalize_reward(latest_reward))
obs = env.reset()
2020-03-12 14:34:35 +00:00
dummy_rewards = np.random.rand(10)
original_obs = env.get_original_obs()
# Check that unnormalization works
assert np.allclose(original_obs, env.unnormalize_obs(obs))
# Normalization must be different (between different environments)
assert not np.allclose(obs, eval_env.normalize_obs(original_obs))
# Test syncing of parameters
sync_envs_normalization(env, eval_env)
# Now they must be synced
assert np.allclose(obs, eval_env.normalize_obs(original_obs))
2020-03-12 14:34:35 +00:00
assert np.allclose(env.normalize_reward(dummy_rewards), eval_env.normalize_reward(dummy_rewards))