mirror of
https://github.com/saymrwulf/stable-baselines3.git
synced 2026-05-27 22:55:17 +00:00
Fixes HER mixed ordering of desired_goal and achieved_goal (#1570)
* change ordering of achieved_goal and desired_goal to match expected compute_reward order
* Update changelog.rst
* Update version
* Update version.txt
* Update changelog.rst

---------

Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
This commit is contained in:
parent
4eed3e9769
commit
f667f086ea
4 changed files with 8 additions and 7 deletions
|
|
@ -3,7 +3,7 @@
|
|||
Changelog
|
||||
==========
|
||||
|
||||
Release 2.0.0a13 (WIP)
|
||||
Release 2.0.0a14 (WIP)
|
||||
--------------------------
|
||||
|
||||
**Gymnasium support**
|
||||
|
|
@ -42,6 +42,7 @@ Bug Fixes:
|
|||
- Fixed env checker to properly reset the env before calling ``step()`` when checking
|
||||
for ``Inf`` and ``NaN`` (@lutogniew)
|
||||
- Fixed HER ``truncate_last_trajectory()`` (@lbergmann1)
|
||||
- Fixed HER desired and achieved goal order in reward computation (@JonathanKuelz)
|
||||
|
||||
Deprecations:
|
||||
^^^^^^^^^^^^^
|
||||
|
|
@ -1347,7 +1348,7 @@ And all the contributors:
|
|||
@eleurent @ac-93 @cove9988 @theDebugger811 @hsuehch @Demetrio92 @thomasgubler @IperGiove @ScheiklP
|
||||
@simoninithomas @armandpl @manuel-delverme @Gautam-J @gianlucadecola @buoyancy99 @caburu @xy9485
|
||||
@Gregwar @ycheng517 @quantitative-technologies @bcollazo @git-thor @TibiGG @cool-RR @MWeltevrede
|
||||
@carlosluis @arjun-kg @tlpss
|
||||
@carlosluis @arjun-kg @tlpss @JonathanKuelz
|
||||
@Melanol @qgallouedec @francescoluciano @jlp-ue @burakdmb @timothe-chaumont @honglu2875
|
||||
@anand-bala @hughperkins @sidney-tio @AlexPasqua @dominicgkerr @Akhilez @Rocamonde @tobirohrer @ZikangXiong
|
||||
@DavyMorgan @luizapozzobon @Bonifatius94 @theSquaredError @harveybellini @DavyMorgan @FieteO @jonasreiher @npit @WeberSamuel @troiganto
|
||||
|
|
|
|||
|
|
@ -128,10 +128,10 @@ def _check_goal_env_obs(obs: dict, observation_space: spaces.Dict, method_name:
|
|||
"""
|
||||
Check that an environment implementing the `compute_reward()` method
|
||||
(previously known as GoalEnv in gym) contains at least three elements,
|
||||
namely `observation`, `desired_goal`, and `achieved_goal`.
|
||||
namely `observation`, `achieved_goal`, and `desired_goal`.
|
||||
"""
|
||||
assert len(observation_space.spaces) >= 3, (
|
||||
"A goal conditioned env must contain at least 3 observation keys: `observation`, `desired_goal`, and `achieved_goal`. "
|
||||
"A goal conditioned env must contain at least 3 observation keys: `observation`, `achieved_goal`, and `desired_goal`. "
|
||||
f"The current observation contains {len(observation_space.spaces)} keys: {list(observation_space.spaces.keys())}"
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -312,8 +312,6 @@ class HerReplayBuffer(DictReplayBuffer):
|
|||
# Compute new reward
|
||||
rewards = self.env.env_method(
|
||||
"compute_reward",
|
||||
# here we use the new desired goal
|
||||
obs["desired_goal"],
|
||||
# the new state depends on the previous state and action
|
||||
# s_{t+1} = f(s_t, a_t)
|
||||
# so the next achieved_goal depends also on the previous state and action
|
||||
|
|
@ -321,6 +319,8 @@ class HerReplayBuffer(DictReplayBuffer):
|
|||
# r_t = reward(s_t, a_t) = reward(next_achieved_goal, desired_goal)
|
||||
# therefore we have to use next_obs["achieved_goal"] and not obs["achieved_goal"]
|
||||
next_obs["achieved_goal"],
|
||||
# here we use the new desired goal
|
||||
obs["desired_goal"],
|
||||
infos,
|
||||
# we use the method of the first environment assuming that all environments are identical.
|
||||
indices=[0],
|
||||
|
|
|
|||
|
|
@ -1 +1 @@
|
|||
2.0.0a13
|
||||
2.0.0a14
|
||||
|
|
|
|||
Loading…
Reference in a new issue