Add sde test + fix random seed

2026-07-03 03:59:13 +00:00 · 2019-10-31 14:14:30 +01:00 · 2019-10-31 14:14:30 +01:00 · 72a6f18e43
commit 72a6f18e43
parent 925afe784c
3 changed files with 22 additions and 12 deletions
--- a/tests/test_sde.py
+++ b/tests/test_sde.py
@@ -7,24 +7,22 @@ from torchy_baselines import A2C


 def test_state_dependent_exploration():
+    n_states = 2
    state_dim = 3
    # TODO: fix for action_dim > 1
    action_dim = 1
    sigma = th.ones(state_dim, action_dim, requires_grad=True)

-    # log_sigma = th.ones(2, 1, requires_grad=True)
-
    # weights_dist = Normal(th.zeros_like(log_sigma), th.exp(log_sigma))
    th.manual_seed(2)
    weights_dist = Normal(th.zeros_like(sigma), sigma)

    weights = weights_dist.rsample()
-    state = th.rand(1, state_dim)
-    # state = (th.ones(state_dim,) * 2).view(1, -1)
+    state = th.rand(n_states, state_dim)
    mu = th.ones(action_dim)
    # print(weights.shape, state.shape)
    noise = th.mm(state, weights)
-    # variance = th.mm(state ** 2, th.exp(log_sigma) ** 2)
+
    variance = th.mm(state ** 2, sigma ** 2)
    action_dist = Normal(mu, th.sqrt(variance))

@@ -35,7 +33,8 @@ def test_state_dependent_exploration():
    grad = th.zeros_like(sigma)
    for j in range(action_dim):
        for i in range(state_dim):
-            grad[i, j] = ((noise[:, j] ** 2 - variance[:, j]) / (variance[:, j] ** 2)) * (state[:, i] ** 2 * sigma[i, j])
+            a = ((noise[:, j] ** 2 - variance[:, j]) / (variance[:, j] ** 2)) * (state[:, i] ** 2 * sigma[i, j])
+            grad[i, j] = a.mean()

    # sigma.grad should be equal to grad
    assert sigma.grad.allclose(grad)
@@ -43,6 +42,16 @@ def test_state_dependent_exploration():

@pytest.mark.parametrize("model_class", [A2C])
 def test_state_dependent_noise(model_class):
-    model = model_class('MlpPolicy', 'Pendulum-v0', n_steps=200,
-                        use_sde=True, ent_coef=0.0, verbose=1, create_eval_env=True)
-    model.learn(total_timesteps=int(1e6), log_interval=10, eval_freq=10000)
+    import gym
+    from torchy_baselines.common.vec_env import DummyVecEnv, VecNormalize
+    from torchy_baselines.common.monitor import Monitor
+
+    # env_id = 'Pendulum-v0'
+    env_id = 'MountainCarContinuous-v0'
+    # env_id = 'LunarLanderContinuous-v2'
+    env = VecNormalize(DummyVecEnv([lambda: Monitor(gym.make(env_id))]), norm_reward=True)
+    eval_env = VecNormalize(DummyVecEnv([lambda: Monitor(gym.make(env_id))]), training=False, norm_reward=False)
+    model = model_class('MlpPolicy', env, n_steps=200, max_grad_norm=1, use_rms_prop=False,
+                        use_sde=True, ent_coef=0.00, verbose=1, create_eval_env=True, learning_rate=3e-4,
+                        policy_kwargs=dict(log_std_init=0.0, ortho_init=False, net_arch=[256, dict(pi=[256], vf=[256])]), seed=None)
+    model.learn(total_timesteps=int(20000), log_interval=5, eval_freq=10000, eval_env=eval_env)
--- a/torchy_baselines/common/base_class.py
+++ b/torchy_baselines/common/base_class.py
@@ -282,7 +282,9 @@ class BaseRLModel(object):
        """
        raise NotImplementedError()

-    def set_random_seed(self, seed=0):
+    def set_random_seed(self, seed=None):
+        if seed is None:
+            return
        set_random_seed(seed, using_cuda=self.device == th.device('cuda'))
        self.action_space.seed(seed)
        if self.env is not None:
--- a/torchy_baselines/common/distributions.py
+++ b/torchy_baselines/common/distributions.py
@@ -199,9 +199,8 @@ class StateDependentNoiseDistribution(Distribution):
        self.weights_dist = Normal(th.zeros_like(log_std), self.get_std(log_std))
        self.exploration_mat = self.weights_dist.rsample()

-    def proba_distribution_net(self, latent_dim, log_std_init=-1):
+    def proba_distribution_net(self, latent_dim, log_std_init=0.0):
        mean_actions = nn.Linear(latent_dim, self.action_dim)
-        # TODO: log_std_init depending on the number of layers?
        log_std = nn.Parameter(th.ones(latent_dim, self.action_dim) * log_std_init)
        self.sample_weights(log_std)
        return mean_actions, log_std