# stable-baselines3/tests/test_sde.py

import gymnasium as gym
import numpy as np
import pytest
import torch as th
from torch.distributions import Normal

from stable_baselines3 import A2C, PPO, SAC


def test_state_dependent_exploration_grad():
    """
    Check that the autograd gradient of the action log-likelihood
    w.r.t. ``sigma_hat`` corresponds to the analytical one.

    The exploration noise is ``state @ weights`` where
    ``weights ~ Normal(0, sigma_hat)``. The policy distribution built below
    is therefore a Gaussian whose variance, for action component ``j``, is
    ``sum_i(state_i ** 2 * sigma_hat_ij ** 2)``.
    """
    n_states = 2
    state_dim = 3
    action_dim = 10
    # sigma_hat is the std of the gaussian distribution of the noise matrix weights
    sigma_hat = th.ones(state_dim, action_dim, requires_grad=True)
    # Seed before sampling so that the weights and the states are reproducible
    th.manual_seed(2)
    weights_dist = Normal(th.zeros_like(sigma_hat), sigma_hat)
    weights = weights_dist.rsample()

    state = th.rand(n_states, state_dim)
    mu = th.ones(action_dim)
    noise = th.mm(state, weights)

    action = mu + noise

    variance = th.mm(state**2, sigma_hat**2)
    action_dist = Normal(mu, th.sqrt(variance))

    # Sum over the action dimension because we assume they are independent.
    # The action is detached: only the distribution parameters get a gradient.
    loss = action_dist.log_prob(action.detach()).sum(dim=-1).mean()
    loss.backward()

    # From Rueckstiess paper: check that the computed gradient
    # corresponds to the analytical form
    grad = th.zeros_like(sigma_hat)
    for j in range(action_dim):
        # sigma_j is the standard deviation of the policy gaussian distribution
        # for the jth action component:
        # sigma_j = sqrt(sum_i(state_i ** 2 * sigma_hat_ij ** 2))
        sigma_j = th.sqrt(variance[:, j])
        # Derivative of the log probability of the jth component of the action
        # w.r.t. the standard deviation sigma_j.
        # It does not depend on i, so it is computed once per action component.
        d_log_policy_j = (noise[:, j] ** 2 - sigma_j**2) / sigma_j**3
        for i in range(state_dim):
            # Derivative of sigma_j w.r.t. sigma_hat_ij
            d_sigma_j = (state[:, i] ** 2 * sigma_hat[i, j]) / sigma_j
            # Chain rule, average over the minibatch
            grad[i, j] = (d_log_policy_j * d_sigma_j).mean()

    # sigma_hat.grad should be equal to grad
    assert sigma_hat.grad.allclose(grad)


def test_sde_check():
    """
    Creating a PPO model with ``use_sde=True`` on CartPole-v1
    must raise a ``ValueError``.
    """
    # NOTE(review): presumably rejected because CartPole-v1 has a discrete
    # action space -- confirm against the check done in the algorithm.
    sde_kwargs = dict(use_sde=True)
    with pytest.raises(ValueError):
        PPO("MlpPolicy", "CartPole-v1", **sde_kwargs)


def test_only_sde_squashed():
    """
    Asking for ``squash_output=True`` while ``use_sde=False`` must fail with
    an ``AssertionError`` whose message mentions ``use_sde=True``.
    """
    squashed_policy_kwargs = {"squash_output": True}
    with pytest.raises(AssertionError, match="use_sde=True"):
        PPO("MlpPolicy", "Pendulum-v1", use_sde=False, policy_kwargs=squashed_policy_kwargs)


@pytest.mark.parametrize("model_class", [SAC, A2C, PPO])
@pytest.mark.parametrize("use_expln", [False, True])
@pytest.mark.parametrize("squash_output", [False, True])
def test_state_dependent_noise(model_class, use_expln, squash_output):
    """
    Train briefly with ``use_sde=True`` on Pendulum-v1 and check the actions.

    The actions kept in the buffer must stay within the action space bounds
    and, when the output is squashed, the actions sent to the env must have
    been unscaled to the env action range.

    :param model_class: Algorithm under test (SAC, A2C or PPO)
    :param use_expln: Value forwarded as the ``use_expln`` policy keyword
    :param squash_output: Value forwarded as the ``squash_output`` policy
        keyword for A2C/PPO; the SAC case is skipped when it is ``False``
    """
    kwargs = {"learning_starts": 0} if model_class == SAC else {"n_steps": 64}

    policy_kwargs = dict(log_std_init=-2, use_expln=use_expln, net_arch=[64])

    if model_class in [A2C, PPO]:
        policy_kwargs["squash_output"] = squash_output
    elif not squash_output:
        pytest.skip("SAC can only use squashed output")

    # The wrapper records every action that reaches the env
    env = StoreActionEnvWrapper(gym.make("Pendulum-v1"))
    model = model_class(
        "MlpPolicy",
        env,
        use_sde=True,
        seed=1,
        verbose=1,
        policy_kwargs=policy_kwargs,
        **kwargs,
    )
    model.learn(total_timesteps=255)
    buffer = model.replay_buffer if model_class == SAC else model.rollout_buffer
    # Check that only scaled actions are stored
    assert (buffer.actions <= model.action_space.high).all()
    assert (buffer.actions >= model.action_space.low).all()
    if squash_output:
        # Pendulum action range is [-2, 2]
        # we check that the actions are correctly unscaled:
        # a stored action beyond +/-0.5 must reach the env beyond +/-1.0
        if buffer.actions.max() > 0.5:
            assert np.max(env.actions) > 1.0
        # Lower bound: look at the smallest stored action (min, not max),
        # otherwise this branch is only taken when every stored action is negative
        if buffer.actions.min() < -0.5:
            assert np.min(env.actions) < -1.0
    # Smoke check: resetting the noise must work after training
    model.policy.reset_noise()
    if model_class == SAC:
        model.policy.actor.get_std()


class StoreActionEnvWrapper(gym.Wrapper):
    """
    Environment wrapper that records every action passed to ``step``.

    The recorded actions are available, in call order, in ``self.actions``.
    """

    def __init__(self, env):
        super().__init__(env)
        # Actions received so far, oldest first
        self.actions = list()

    def step(self, action):
        # Record the action first, then delegate to the wrapped env
        self.actions.append(action)
        outcome = super().step(action)
        return outcome
Fix squash output unscaling when using gSDE (#1652) * prevents squash_output if not use_sde, see #1592 * update changelog * add unscaling of actions taken during training * add test regarding squashing and unquashing * avoids try-except block * format Gymnasium code with black * makes mypy pass * makes pytype pass * sort imports * makes error message in assert statement clearer Co-authored-by: Antonin RAFFIN <antonin.raffin@ensta.org> * improves code commenting * replaces full env with wrapper * Cleanup code * Reformat --------- Co-authored-by: PatrickHelm <patrick.helm@gmx.net> Co-authored-by: Antonin RAFFIN <antonin.raffin@ensta.org> Co-authored-by: Antonin Raffin <antonin.raffin@dlr.de> 2023-09-01 15:58:15 +00:00			`import gymnasium as gym`
			`import numpy as np`
Add first draft of SDE 2019-10-28 17:24:13 +00:00			`import pytest`
			`import torch as th`
			`from torch.distributions import Normal`

Auto-formatting with black and isort (#97) * Add auto formatting with black and isort * Reformat code * Ignore typing errors * Add note about line length * Add minimum version for isort * Add commit-checks * Update docker image * Fixed lost import (during last merge) * Fix opencv dependency 2020-07-16 14:12:16 +00:00			`from stable_baselines3 import A2C, PPO, SAC`
Add first draft of SDE 2019-10-28 17:24:13 +00:00

Bug fix + add test for sde net arch 2019-12-02 13:14:48 +00:00			`def test_state_dependent_exploration_grad():`
Start cleanup + update docstrings 2019-11-18 13:09:31 +00:00			`"""`
			`Check that the gradient correspond to the expected one`
			`"""`
Add sde test + fix random seed 2019-10-31 13:14:30 +00:00			`n_states = 2`
Add first draft of SDE 2019-10-28 17:24:13 +00:00			`state_dim = 3`
Fix grad computation for sde test 2019-11-26 10:57:48 +00:00			`action_dim = 10`
			`sigma_hat = th.ones(state_dim, action_dim, requires_grad=True)`
Start cleanup + update docstrings 2019-11-18 13:09:31 +00:00			`# Reduce the number of parameters`
			`# sigma_ = th.ones(state_dim, action_dim) * sigma_`
Add first draft of SDE 2019-10-28 17:24:13 +00:00			`# weights_dist = Normal(th.zeros_like(log_sigma), th.exp(log_sigma))`
			`th.manual_seed(2)`
Fix grad computation for sde test 2019-11-26 10:57:48 +00:00			`weights_dist = Normal(th.zeros_like(sigma_hat), sigma_hat)`
Add first draft of SDE 2019-10-28 17:24:13 +00:00			`weights = weights_dist.rsample()`
Fix grad computation for sde test 2019-11-26 10:57:48 +00:00
Add sde test + fix random seed 2019-10-31 13:14:30 +00:00			`state = th.rand(n_states, state_dim)`
Add first draft of SDE 2019-10-28 17:24:13 +00:00			`mu = th.ones(action_dim)`
			`noise = th.mm(state, weights)`
Add sde test + fix random seed 2019-10-31 13:14:30 +00:00
Fix grad computation for sde test 2019-11-26 10:57:48 +00:00			`action = mu + noise`

Autoformat code with black (new version complains about new things) (#757) * Blacken code * Fix GitLab CI: switch to Docker container with new black version 2022-02-04 00:56:06 +00:00			`variance = th.mm(state**2, sigma_hat**2)`
Add first draft of SDE 2019-10-28 17:24:13 +00:00			`action_dist = Normal(mu, th.sqrt(variance))`

Fix grad computation for sde test 2019-11-26 10:57:48 +00:00			`# Sum over the action dimension because we assume they are independent`
Clean up code + bug fixes 2020-01-20 10:17:55 +00:00			`loss = action_dist.log_prob(action.detach()).sum(dim=-1).mean()`
Add first draft of SDE 2019-10-28 17:24:13 +00:00			`loss.backward()`

Fix grad computation for sde test 2019-11-26 10:57:48 +00:00			`# From Rueckstiess paper: check that the computed gradient`
			`# correspond to the analytical form`
			`grad = th.zeros_like(sigma_hat)`
Add first draft of SDE 2019-10-28 17:24:13 +00:00			`for j in range(action_dim):`
Fix grad computation for sde test 2019-11-26 10:57:48 +00:00			`# sigma_hat is the std of the gaussian distribution of the noise matrix weights`
			`# sigma_j = sum_j(state_i **2 * sigma_hat_ij ** 2)`
			`# sigma_j is the standard deviation of the policy gaussian distribution`
			`sigma_j = th.sqrt(variance[:, j])`
Add first draft of SDE 2019-10-28 17:24:13 +00:00			`for i in range(state_dim):`
Fix grad computation for sde test 2019-11-26 10:57:48 +00:00			`# Derivative of the log probability of the jth component of the action`
			`# w.r.t. the standard deviation sigma_j`
Autoformat code with black (new version complains about new things) (#757) * Blacken code * Fix GitLab CI: switch to Docker container with new black version 2022-02-04 00:56:06 +00:00			`d_log_policy_j = (noise[:, j] ** 2 - sigma_j**2) / sigma_j**3`
Fix grad computation for sde test 2019-11-26 10:57:48 +00:00			`# Derivative of sigma_j w.r.t. sigma_hat_ij`
			`d_log_sigma_j = (state[:, i] ** 2 * sigma_hat[i, j]) / sigma_j`
			`# Chain rule, average over the minibatch`
			`grad[i, j] = (d_log_policy_j * d_log_sigma_j).mean()`
Add first draft of SDE 2019-10-28 17:24:13 +00:00
			`# sigma.grad should be equal to grad`
Fix grad computation for sde test 2019-11-26 10:57:48 +00:00			`assert sigma_hat.grad.allclose(grad)`
Add first draft of SDE 2019-10-28 17:24:13 +00:00

Implement HER (#120) * Added working her version, Online sampling is missing. * Updated test_her. * Added first version of online her sampling. Still problems with tensor dimensions. * Reformat * Fixed tests * Added some comments. * Updated changelog. * Add missing init file * Fixed some small bugs. * Reduced arguments for HER, small changes. * Added getattr. Fixed bug for online sampling. * Updated save/load funtions. Small changes. * Added her to init. * Updated save method. * Updated her ratio. * Move obs_wrapper * Added DQN test. * Fix potential bug * Offline and online her share same sample_goal function. * Changed lists into arrays. * Updated her test. * Fix online sampling * Fixed action bug. Updated time limit for episodes. * Updated convert_dict method to take keys as arguments. * Renamed obs dict wrapper. * Seed bit flipping env * Remove get_episode_dict * Add fast online sampling version * Added documentation. * Vectorized reward computation * Vectorized goal sampling * Update time limit for episodes in online her sampling. * Fix max episode length inference * Bug fix for Fetch envs * Fix for HER + gSDE * Reformat (new black version) * Added info dict to compute new reward. Check her_replay_buffer again. * Fix info buffer * Updated done flag. * Fixes for gSDE * Offline her version uses now HerReplayBuffer as episode storage. * Fix num_timesteps computation * Fix get torch params * Vectorized version for offline sampling. * Modified offline her sampling to use sample method of her_replay_buffer * Updated HER tests. * Updated documentation * Cleanup docstrings * Updated to review comments * Fix pytype * Update according to review comments. * Removed random goal strategy. Updated sample transitions. * Updated migration. Removed time signal removal. * Update doc * Fix potential load issue * Add VecNormalize support for dict obs * Updated saving/loading replay buffer for HER. * Fix test memory usage * Fixed save/load replay buffer. 
* Fixed save/load replay buffer * Fixed transition index after loading replay buffer in online sampling * Better error handling * Add tests for get_time_limit * More tests for VecNormalize with dict obs * Update doc * Improve HER description * Add test for sde support * Add comments * Add comments * Remove check that was always valid * Fix for terminal observation * Updated buffer size in offline version and reset of HER buffer * Reformat * Update doc * Remove np.empty + add doc * Fix loading * Updated loading replay buffer * Separate online and offline sampling + bug fixes * Update tensorboard log name * Version bump * Bug fix for special case Co-authored-by: Antonin Raffin <antonin.raffin@dlr.de> Co-authored-by: Antonin RAFFIN <antonin.raffin@ensta.org> 2020-10-22 09:56:43 +00:00			`def test_sde_check():`
			`with pytest.raises(ValueError):`
			`PPO("MlpPolicy", "CartPole-v1", use_sde=True)`


Fix squash output unscaling when using gSDE (#1652) * prevents squash_output if not use_sde, see #1592 * update changelog * add unscaling of actions taken during training * add test regarding squashing and unquashing * avoids try-except block * format Gymnasium code with black * makes mypy pass * makes pytype pass * sort imports * makes error message in assert statement clearer Co-authored-by: Antonin RAFFIN <antonin.raffin@ensta.org> * improves code commenting * replaces full env with wrapper * Cleanup code * Reformat --------- Co-authored-by: PatrickHelm <patrick.helm@gmx.net> Co-authored-by: Antonin RAFFIN <antonin.raffin@ensta.org> Co-authored-by: Antonin Raffin <antonin.raffin@dlr.de> 2023-09-01 15:58:15 +00:00			`def test_only_sde_squashed():`
			`with pytest.raises(AssertionError, match="use_sde=True"):`
			`PPO("MlpPolicy", "Pendulum-v1", use_sde=False, policy_kwargs=dict(squash_output=True))`


Remove SDE support for TD3 2020-05-08 13:00:34 +00:00			`@pytest.mark.parametrize("model_class", [SAC, A2C, PPO])`
Add test for `expln` 2020-03-11 15:35:13 +00:00			`@pytest.mark.parametrize("use_expln", [False, True])`
Fix squash output unscaling when using gSDE (#1652) * prevents squash_output if not use_sde, see #1592 * update changelog * add unscaling of actions taken during training * add test regarding squashing and unquashing * avoids try-except block * format Gymnasium code with black * makes mypy pass * makes pytype pass * sort imports * makes error message in assert statement clearer Co-authored-by: Antonin RAFFIN <antonin.raffin@ensta.org> * improves code commenting * replaces full env with wrapper * Cleanup code * Reformat --------- Co-authored-by: PatrickHelm <patrick.helm@gmx.net> Co-authored-by: Antonin RAFFIN <antonin.raffin@ensta.org> Co-authored-by: Antonin Raffin <antonin.raffin@dlr.de> 2023-09-01 15:58:15 +00:00			`@pytest.mark.parametrize("squash_output", [False, True])`
			`def test_state_dependent_noise(model_class, use_expln, squash_output):`
Remove `sde_net_arch` + Simplify policy (#584) * Remove `sde_net_arch` + Simplify policy * Add warning at load time 2021-09-28 19:32:54 +00:00			`kwargs = {"learning_starts": 0} if model_class == SAC else {"n_steps": 64}`
Fix squash output unscaling when using gSDE (#1652) * prevents squash_output if not use_sde, see #1592 * update changelog * add unscaling of actions taken during training * add test regarding squashing and unquashing * avoids try-except block * format Gymnasium code with black * makes mypy pass * makes pytype pass * sort imports * makes error message in assert statement clearer Co-authored-by: Antonin RAFFIN <antonin.raffin@ensta.org> * improves code commenting * replaces full env with wrapper * Cleanup code * Reformat --------- Co-authored-by: PatrickHelm <patrick.helm@gmx.net> Co-authored-by: Antonin RAFFIN <antonin.raffin@ensta.org> Co-authored-by: Antonin Raffin <antonin.raffin@dlr.de> 2023-09-01 15:58:15 +00:00
			`policy_kwargs = dict(log_std_init=-2, use_expln=use_expln, net_arch=[64])`

			`if model_class in [A2C, PPO]:`
			`policy_kwargs["squash_output"] = squash_output`
			`elif not squash_output:`
			`pytest.skip("SAC can only use squashed output")`

			`env = StoreActionEnvWrapper(gym.make("Pendulum-v1"))`
Remove deprecated features and attributes (#1104) * Remove deprecated eval env * Remove deprecated ret attribute * Remove sde net arch * Remove unused code * Update test comment Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com> 2022-10-11 08:55:16 +00:00			`model = model_class(`
			`"MlpPolicy",`
Fix squash output unscaling when using gSDE (#1652) * prevents squash_output if not use_sde, see #1592 * update changelog * add unscaling of actions taken during training * add test regarding squashing and unquashing * avoids try-except block * format Gymnasium code with black * makes mypy pass * makes pytype pass * sort imports * makes error message in assert statement clearer Co-authored-by: Antonin RAFFIN <antonin.raffin@ensta.org> * improves code commenting * replaces full env with wrapper * Cleanup code * Reformat --------- Co-authored-by: PatrickHelm <patrick.helm@gmx.net> Co-authored-by: Antonin RAFFIN <antonin.raffin@ensta.org> Co-authored-by: Antonin Raffin <antonin.raffin@dlr.de> 2023-09-01 15:58:15 +00:00			`env,`
Remove deprecated features and attributes (#1104) * Remove deprecated eval env * Remove deprecated ret attribute * Remove sde net arch * Remove unused code * Update test comment Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com> 2022-10-11 08:55:16 +00:00			`use_sde=True,`
Fix squash output unscaling when using gSDE (#1652) * prevents squash_output if not use_sde, see #1592 * update changelog * add unscaling of actions taken during training * add test regarding squashing and unquashing * avoids try-except block * format Gymnasium code with black * makes mypy pass * makes pytype pass * sort imports * makes error message in assert statement clearer Co-authored-by: Antonin RAFFIN <antonin.raffin@ensta.org> * improves code commenting * replaces full env with wrapper * Cleanup code * Reformat --------- Co-authored-by: PatrickHelm <patrick.helm@gmx.net> Co-authored-by: Antonin RAFFIN <antonin.raffin@ensta.org> Co-authored-by: Antonin Raffin <antonin.raffin@dlr.de> 2023-09-01 15:58:15 +00:00			`seed=1,`
Remove deprecated features and attributes (#1104) * Remove deprecated eval env * Remove deprecated ret attribute * Remove sde net arch * Remove unused code * Update test comment Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com> 2022-10-11 08:55:16 +00:00			`verbose=1,`
Fix squash output unscaling when using gSDE (#1652) * prevents squash_output if not use_sde, see #1592 * update changelog * add unscaling of actions taken during training * add test regarding squashing and unquashing * avoids try-except block * format Gymnasium code with black * makes mypy pass * makes pytype pass * sort imports * makes error message in assert statement clearer Co-authored-by: Antonin RAFFIN <antonin.raffin@ensta.org> * improves code commenting * replaces full env with wrapper * Cleanup code * Reformat --------- Co-authored-by: PatrickHelm <patrick.helm@gmx.net> Co-authored-by: Antonin RAFFIN <antonin.raffin@ensta.org> Co-authored-by: Antonin Raffin <antonin.raffin@dlr.de> 2023-09-01 15:58:15 +00:00			`policy_kwargs=policy_kwargs,`
Remove deprecated features and attributes (#1104) * Remove deprecated eval env * Remove deprecated ret attribute * Remove sde net arch * Remove unused code * Update test comment Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com> 2022-10-11 08:55:16 +00:00			`**kwargs,`
			`)`
			`model.learn(total_timesteps=255)`
Fix squash output unscaling when using gSDE (#1652) * prevents squash_output if not use_sde, see #1592 * update changelog * add unscaling of actions taken during training * add test regarding squashing and unquashing * avoids try-except block * format Gymnasium code with black * makes mypy pass * makes pytype pass * sort imports * makes error message in assert statement clearer Co-authored-by: Antonin RAFFIN <antonin.raffin@ensta.org> * improves code commenting * replaces full env with wrapper * Cleanup code * Reformat --------- Co-authored-by: PatrickHelm <patrick.helm@gmx.net> Co-authored-by: Antonin RAFFIN <antonin.raffin@ensta.org> Co-authored-by: Antonin Raffin <antonin.raffin@dlr.de> 2023-09-01 15:58:15 +00:00			`buffer = model.replay_buffer if model_class == SAC else model.rollout_buffer`
			`# Check that only scaled actions are stored`
			`assert (buffer.actions <= model.action_space.high).all()`
			`assert (buffer.actions >= model.action_space.low).all()`
			`if squash_output:`
			`# Pendulum action range is [-2, 2]`
			`# we check that the action are correctly unscaled`
			`if buffer.actions.max() > 0.5:`
			`assert np.max(env.actions) > 1.0`
			`if buffer.actions.max() < -0.5:`
			`assert np.min(env.actions) < -1.0`
Add custom arch for off-policy actor/critic networks (#182) * Add custom arch for off-policy actor/critic networks * Fix type hints * Address comments * Make sure number of updated parameters match in polyak * Add zip_strict for strict-length zipping * Fix building docs * Add test for zip strict * Faster tests Co-authored-by: Anssi "Miffyli" Kanervisto <kaneran21@hotmail.com> 2020-10-13 10:01:33 +00:00			`model.policy.reset_noise()`
			`if model_class == SAC:`
			`model.policy.actor.get_std()`
Fix squash output unscaling when using gSDE (#1652) * prevents squash_output if not use_sde, see #1592 * update changelog * add unscaling of actions taken during training * add test regarding squashing and unquashing * avoids try-except block * format Gymnasium code with black * makes mypy pass * makes pytype pass * sort imports * makes error message in assert statement clearer Co-authored-by: Antonin RAFFIN <antonin.raffin@ensta.org> * improves code commenting * replaces full env with wrapper * Cleanup code * Reformat --------- Co-authored-by: PatrickHelm <patrick.helm@gmx.net> Co-authored-by: Antonin RAFFIN <antonin.raffin@ensta.org> Co-authored-by: Antonin Raffin <antonin.raffin@dlr.de> 2023-09-01 15:58:15 +00:00

			`class StoreActionEnvWrapper(gym.Wrapper):`
			`"""`
			`Keep track of which actions were sent to the env.`
			`"""`

			`def __init__(self, env):`
			`super().__init__(env)`
			`# defines list for tracking actions`
			`self.actions = []`

			`def step(self, action):`
			`# appends list for tracking actions`
			`self.actions.append(action)`
			`return super().step(action)`