mirror of
https://github.com/saymrwulf/stable-baselines3.git
synced 2026-05-29 23:07:07 +00:00
Hotfix PPO + gSDE (#53)
* Fix variable being passed with gradients * Update changelog * Bump version * Fixes #54
This commit is contained in:
parent
b833207142
commit
494ebfd20a
4 changed files with 18 additions and 13 deletions
|
|
@ -4,22 +4,24 @@ Changelog
|
|||
==========
|
||||
|
||||
|
||||
Pre-Release 0.7.0a1 (WIP)
|
||||
Pre-Release 0.7.0 (2020-06-10)
|
||||
------------------------------
|
||||
|
||||
**Hotfix for PPO/A2C + gSDE, internal refactoring and bug fixes**
|
||||
|
||||
Breaking Changes:
|
||||
^^^^^^^^^^^^^^^^^
|
||||
- ``render()`` method of ``VecEnvs`` now only accepts one argument: ``mode``
|
||||
- Created new file common/torch_layers.py, similar to SB refactoring
|
||||
|
||||
|
||||
- Contains all PyTorch network layer definitions and feature extractors: ``MlpExtractor``, ``create_mlp``, ``NatureCNN``
|
||||
|
||||
- Renamed ``BaseRLModel`` to ``BaseAlgorithm`` (along with offpolicy and onpolicy variants)
|
||||
- Moved on-policy and off-policy base algorithms to ``common/on_policy_algorithm.py`` and ``common/off_policy_algorithm.py``, respectively.
|
||||
- Moved ``PPOPolicy`` to ``ActorCriticPolicy`` in common/policies.py
|
||||
- Moved ``PPOPolicy`` to ``ActorCriticPolicy`` in common/policies.py
|
||||
- Moved ``PPO`` (algorithm class) into ``OnPolicyAlgorithm`` (``common/on_policy_algorithm.py``), to be shared with A2C
|
||||
- Moved following functions from ``BaseAlgorithm``:
|
||||
|
||||
- Moved following functions from ``BaseAlgorithm``:
|
||||
|
||||
- ``_load_from_file`` to ``load_from_zip_file`` (save_util.py)
|
||||
- ``_save_to_file_zip`` to ``save_to_zip_file`` (save_util.py)
|
||||
- ``safe_mean`` to ``safe_mean`` (utils.py)
|
||||
|
|
@ -28,7 +30,7 @@ Breaking Changes:
|
|||
- Moved static function ``_is_vectorized_observation`` from common/policies.py to common/utils.py under name ``is_vectorized_observation``.
|
||||
- Removed ``{save,load}_running_average`` functions of ``VecNormalize`` in favor of ``load/save``.
|
||||
- Removed ``use_gae`` parameter from ``RolloutBuffer.compute_returns_and_advantage``.
|
||||
|
||||
|
||||
New Features:
|
||||
^^^^^^^^^^^^^
|
||||
|
||||
|
|
@ -38,6 +40,7 @@ Bug Fixes:
|
|||
- Fixed ``seed()`` method for ``SubprocVecEnv``
|
||||
- Fixed loading on GPU for testing when using gSDE and ``deterministic=False``
|
||||
- Fixed ``register_policy`` to allow re-registering same policy for same sub-class (i.e. assign same value to same key).
|
||||
- Fixed a bug where the gradient was passed to the features when using ``gSDE`` with ``PPO``/``A2C`` (the variance was computed from the non-detached latent); this does not affect ``SAC``
|
||||
|
||||
Deprecations:
|
||||
^^^^^^^^^^^^^
|
||||
|
|
@ -67,7 +70,7 @@ Breaking Changes:
|
|||
^^^^^^^^^^^^^^^^^
|
||||
- Remove State-Dependent Exploration (SDE) support for ``TD3``
|
||||
- Methods were renamed in the logger:
|
||||
|
||||
|
||||
- ``logkv`` -> ``record``, ``writekvs`` -> ``write``, ``writeseq`` -> ``write_sequence``,
|
||||
- ``logkvs`` -> ``record_dict``, ``dumpkvs`` -> ``dump``,
|
||||
- ``getkvs`` -> ``get_log_dict``, ``logkv_mean`` -> ``record_mean``,
|
||||
|
|
|
|||
|
|
@ -525,7 +525,7 @@ class StateDependentNoiseDistribution(Distribution):
|
|||
"""
|
||||
# Stop gradient if we don't want to influence the features
|
||||
self._latent_sde = latent_sde if self.learn_features else latent_sde.detach()
|
||||
variance = th.mm(latent_sde ** 2, self.get_std(log_std) ** 2)
|
||||
variance = th.mm(self._latent_sde ** 2, self.get_std(log_std) ** 2)
|
||||
self.distribution = Normal(mean_actions, th.sqrt(variance + self.epsilon))
|
||||
return self
|
||||
|
||||
|
|
|
|||
|
|
@ -1 +1 @@
|
|||
0.7.0a1
|
||||
0.7.0
|
||||
|
|
|
|||
|
|
@ -1,5 +1,3 @@
|
|||
import os
|
||||
|
||||
import gym
|
||||
import pytest
|
||||
import numpy as np
|
||||
|
|
@ -138,7 +136,7 @@ def test_sync_vec_normalize():
|
|||
|
||||
assert unwrap_vec_normalize(env) is None
|
||||
|
||||
env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.)
|
||||
env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=100., clip_reward=100.)
|
||||
|
||||
assert isinstance(unwrap_vec_normalize(env), VecNormalize)
|
||||
|
||||
|
|
@ -147,9 +145,13 @@ def test_sync_vec_normalize():
|
|||
assert isinstance(unwrap_vec_normalize(env), VecNormalize)
|
||||
|
||||
eval_env = DummyVecEnv([make_env])
|
||||
eval_env = VecNormalize(eval_env, training=False, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.)
|
||||
eval_env = VecNormalize(eval_env, training=False, norm_obs=True, norm_reward=True,
|
||||
clip_obs=100., clip_reward=100.)
|
||||
eval_env = VecFrameStack(eval_env, 1)
|
||||
|
||||
env.seed(0)
|
||||
env.action_space.seed(0)
|
||||
|
||||
env.reset()
|
||||
# Initialize running mean
|
||||
latest_reward = None
|
||||
|
|
|
|||
Loading…
Reference in a new issue