stable-baselines3/tests/test_sde.py

import pytest
import torch as th
from torch.distributions import Normal

from stable_baselines3 import A2C, PPO, SAC


def test_state_dependent_exploration_grad():
    """
    Check that the computed gradient corresponds to the expected analytical one
    """
    n_states = 2
    state_dim = 3
    action_dim = 10
    sigma_hat = th.ones(state_dim, action_dim, requires_grad=True)
    # Reduce the number of parameters
    # sigma_ = th.ones(state_dim, action_dim) * sigma_
    # weights_dist = Normal(th.zeros_like(log_sigma), th.exp(log_sigma))
    th.manual_seed(2)
    weights_dist = Normal(th.zeros_like(sigma_hat), sigma_hat)
    weights = weights_dist.rsample()

    state = th.rand(n_states, state_dim)
    mu = th.ones(action_dim)
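    # In gSDE, once the weight matrix has been sampled, the exploration noise
    # is a deterministic (linear) function of the state: noise = state @ weights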
    noise = th.mm(state, weights)

    action = mu + noise
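    # The weights are independent Gaussians, so each action component is Gaussian
    # with Var(action_j) = sum_i(state_i ** 2 * sigma_hat_ij ** 2)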
    variance = th.mm(state ** 2, sigma_hat ** 2)
    action_dist = Normal(mu, th.sqrt(variance))

    # Sum over the action dimension because we assume the components are independent
    loss = action_dist.log_prob(action.detach()).sum(dim=-1).mean()
    loss.backward()

    # From the Rueckstiess paper: check that the computed gradient
    # corresponds to the analytical form
    grad = th.zeros_like(sigma_hat)
    for j in range(action_dim):
        # sigma_hat is the std of the Gaussian distribution of the noise matrix weights
        # sigma_j ** 2 = sum_i(state_i ** 2 * sigma_hat_ij ** 2)
        # sigma_j is the standard deviation of the policy Gaussian distribution
        sigma_j = th.sqrt(variance[:, j])
        for i in range(state_dim):
            # Derivative of the log probability of the jth component of the action
            # w.r.t. the standard deviation sigma_j
            d_log_policy_j = (noise[:, j] ** 2 - sigma_j ** 2) / sigma_j ** 3
            # Derivative of sigma_j w.r.t. sigma_hat_ij
            d_log_sigma_j = (state[:, i] ** 2 * sigma_hat[i, j]) / sigma_j
            # Chain rule, average over the minibatch
            grad[i, j] = (d_log_policy_j * d_log_sigma_j).mean()

    # sigma_hat.grad should be equal to the analytical gradient
    assert sigma_hat.grad.allclose(grad)


def test_sde_check():
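    # gSDE only supports continuous action spaces,
    # so enabling it on a discrete env like CartPole must raise a ValueError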
    with pytest.raises(ValueError):
        PPO("MlpPolicy", "CartPole-v1", use_sde=True)
@pytest.mark.parametrize("model_class", [SAC, A2C, PPO])
@pytest.mark.parametrize("sde_net_arch", [None, [32, 16], []])
@pytest.mark.parametrize("use_expln", [False, True])
def test_state_dependent_offpolicy_noise(model_class, sde_net_arch, use_expln):
model = model_class(
"MlpPolicy",
"Pendulum-v0",
use_sde=True,
seed=None,
create_eval_env=True,
verbose=1,
policy_kwargs=dict(log_std_init=-2, sde_net_arch=sde_net_arch, use_expln=use_expln, net_arch=[64]),
)
model.learn(total_timesteps=int(300), eval_freq=250)
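    # Check that the exploration (noise weight) matrix can be resampled after training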
    model.policy.reset_noise()
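    # For SAC, additionally check that the state-dependent std can be retrieved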
    if model_class == SAC:
        model.policy.actor.get_std()