diff --git a/tests/test_sde.py b/tests/test_sde.py index 3c5db43..f397558 100644 --- a/tests/test_sde.py +++ b/tests/test_sde.py @@ -7,24 +7,22 @@ from torchy_baselines import A2C def test_state_dependent_exploration(): + n_states = 2 state_dim = 3 # TODO: fix for action_dim > 1 action_dim = 1 sigma = th.ones(state_dim, action_dim, requires_grad=True) - # log_sigma = th.ones(2, 1, requires_grad=True) - # weights_dist = Normal(th.zeros_like(log_sigma), th.exp(log_sigma)) th.manual_seed(2) weights_dist = Normal(th.zeros_like(sigma), sigma) weights = weights_dist.rsample() - state = th.rand(1, state_dim) - # state = (th.ones(state_dim,) * 2).view(1, -1) + state = th.rand(n_states, state_dim) mu = th.ones(action_dim) # print(weights.shape, state.shape) noise = th.mm(state, weights) - # variance = th.mm(state ** 2, th.exp(log_sigma) ** 2) + variance = th.mm(state ** 2, sigma ** 2) action_dist = Normal(mu, th.sqrt(variance)) @@ -35,7 +33,8 @@ def test_state_dependent_exploration(): grad = th.zeros_like(sigma) for j in range(action_dim): for i in range(state_dim): - grad[i, j] = ((noise[:, j] ** 2 - variance[:, j]) / (variance[:, j] ** 2)) * (state[:, i] ** 2 * sigma[i, j]) + a = ((noise[:, j] ** 2 - variance[:, j]) / (variance[:, j] ** 2)) * (state[:, i] ** 2 * sigma[i, j]) + grad[i, j] = a.mean() # sigma.grad should be equal to grad assert sigma.grad.allclose(grad) @@ -43,6 +42,16 @@ def test_state_dependent_exploration(): @pytest.mark.parametrize("model_class", [A2C]) def test_state_dependent_noise(model_class): - model = model_class('MlpPolicy', 'Pendulum-v0', n_steps=200, - use_sde=True, ent_coef=0.0, verbose=1, create_eval_env=True) - model.learn(total_timesteps=int(1e6), log_interval=10, eval_freq=10000) + import gym + from torchy_baselines.common.vec_env import DummyVecEnv, VecNormalize + from torchy_baselines.common.monitor import Monitor + + # env_id = 'Pendulum-v0' + env_id = 'MountainCarContinuous-v0' + # env_id = 'LunarLanderContinuous-v2' + env = VecNormalize(DummyVecEnv([lambda: Monitor(gym.make(env_id))]), norm_reward=True) + eval_env = VecNormalize(DummyVecEnv([lambda: Monitor(gym.make(env_id))]), training=False, norm_reward=False) + model = model_class('MlpPolicy', env, n_steps=200, max_grad_norm=1, use_rms_prop=False, + use_sde=True, ent_coef=0.00, verbose=1, create_eval_env=True, learning_rate=3e-4, + policy_kwargs=dict(log_std_init=0.0, ortho_init=False, net_arch=[256, dict(pi=[256], vf=[256])]), seed=None) + model.learn(total_timesteps=int(20000), log_interval=5, eval_freq=10000, eval_env=eval_env) diff --git a/torchy_baselines/common/base_class.py b/torchy_baselines/common/base_class.py index 6adc45c..a6b9a41 100644 --- a/torchy_baselines/common/base_class.py +++ b/torchy_baselines/common/base_class.py @@ -282,7 +282,9 @@ class BaseRLModel(object): """ raise NotImplementedError() - def set_random_seed(self, seed=0): + def set_random_seed(self, seed=None): + if seed is None: + return set_random_seed(seed, using_cuda=self.device == th.device('cuda')) self.action_space.seed(seed) if self.env is not None: diff --git a/torchy_baselines/common/distributions.py b/torchy_baselines/common/distributions.py index 72bf7b2..90105ee 100644 --- a/torchy_baselines/common/distributions.py +++ b/torchy_baselines/common/distributions.py @@ -199,9 +199,8 @@ class StateDependentNoiseDistribution(Distribution): self.weights_dist = Normal(th.zeros_like(log_std), self.get_std(log_std)) self.exploration_mat = self.weights_dist.rsample() - def proba_distribution_net(self, latent_dim, log_std_init=-1): + def proba_distribution_net(self, latent_dim, log_std_init=0.0): mean_actions = nn.Linear(latent_dim, self.action_dim) - # TODO: log_std_init depending on the number of layers? log_std = nn.Parameter(th.ones(latent_dim, self.action_dim) * log_std_init) self.sample_weights(log_std) return mean_actions, log_std