Bug fix for off-policy normalization

Now working properly
2026-07-16 18:31:44 +00:00 · 2019-11-15 11:00:31 +01:00 · 2019-11-15 11:00:31 +01:00 · cdb62a93fe
commit cdb62a93fe
parent f719544386
3 changed files with 18 additions and 8 deletions
--- a/README.md
+++ b/README.md
@ -24,7 +24,6 @@ TODO:
 - SDE: reduce the number of parameters (only n_features instead of n_features x n_actions) for A2C
 (done for TD3)
 - SDE: learn the feature extractor?
- DEBUG normalization with replay buffer (apparently pb with observation normalization)

 Later:
 - get_parameters / set_parameters
--- a/tests/test_vec_normalize.py
+++ b/tests/test_vec_normalize.py
@ -46,10 +46,10 @@ def test_vec_env():
@pytest.mark.parametrize("model_class", [SAC, TD3, CEMRL])
 def test_offpolicy_normalization(model_class):
    env = DummyVecEnv([lambda: gym.make(ENV_ID)])
-    env = VecNormalize(env, norm_obs=False, norm_reward=False, clip_obs=10., clip_reward=10.)
+    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.)

    eval_env = DummyVecEnv([lambda: gym.make(ENV_ID)])
    eval_env = VecNormalize(eval_env, norm_obs=True, norm_reward=False, clip_obs=10., clip_reward=10.)

    model = model_class('MlpPolicy', env, verbose=1)
-    model.learn(total_timesteps=10000, eval_env=eval_env, eval_freq=1000)
+    model.learn(total_timesteps=1000, eval_env=eval_env, eval_freq=500)
--- a/torchy_baselines/common/base_class.py
+++ b/torchy_baselines/common/base_class.py
@ -333,6 +333,11 @@ class BaseRLModel(object):
        assert isinstance(env, VecEnv)
        assert env.num_envs == 1

+        # Retrieve unnormalized observation for saving into the buffer
+        if self._vec_normalize_env is not None:
+            obs_ = self._vec_normalize_env.get_original_obs()
+
+
        self.rollout_data = None
        if hasattr(self, 'use_sde') and self.use_sde:
            self.actor.reset_noise()
@ -377,12 +382,13 @@ class BaseRLModel(object):
                if replay_buffer is not None:
                    # Store only the unnormalized version
                    if self._vec_normalize_env is not None:
-                        # TODO: save it instead of unnormalizing
-                        obs = self._vec_normalize_env.unnormalize_obs(obs)
-                        new_obs = self._vec_normalize_env.get_original_obs()
-                        reward = self._vec_normalize_env.get_original_reward()
+                        new_obs_ = self._vec_normalize_env.get_original_obs()
+                        reward_ = self._vec_normalize_env.get_original_reward()
+                    else:
+                        # Avoid changing the original ones
+                        obs_, new_obs_, reward_ = obs, new_obs, reward

-                    replay_buffer.add(obs, new_obs, action, reward, done_bool)
+                    replay_buffer.add(obs_, new_obs_, action, reward_, done_bool)

                if self.rollout_data is not None:
                    # Assume only one env
@ -392,6 +398,11 @@ class BaseRLModel(object):
                    self.rollout_data['dones'].append(np.array(done_bool[0]).copy())

                obs = new_obs
+                # Save the true unnormalized observation
+                # otherwise obs_ = self._vec_normalize_env.unnormalize_obs(obs)
+                # is a good approximation
+                if self._vec_normalize_env is not None:
+                    obs_ = new_obs_

                num_timesteps += 1
                episode_timesteps += 1