Fixes HER mixed ordering of desired_goal and achieved_goal (#1570)

* Change the ordering of achieved_goal and desired_goal to match the argument order expected by compute_reward

* Update changelog.rst

* Update version

* Update version.txt

* Update changelog.rst

---------

Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
This commit is contained in:
Jonathan 2023-06-21 16:27:06 +02:00 committed by GitHub
parent 4eed3e9769
commit f667f086ea
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 8 additions and 7 deletions

View file

@ -3,7 +3,7 @@
Changelog
==========
Release 2.0.0a13 (WIP)
Release 2.0.0a14 (WIP)
--------------------------
**Gymnasium support**
@ -42,6 +42,7 @@ Bug Fixes:
- Fixed env checker to properly reset the env before calling ``step()`` when checking
for ``Inf`` and ``NaN`` (@lutogniew)
- Fixed HER ``truncate_last_trajectory()`` (@lbergmann1)
- Fixed HER desired and achieved goal order in reward computation (@JonathanKuelz)
Deprecations:
^^^^^^^^^^^^^
@ -1347,7 +1348,7 @@ And all the contributors:
@eleurent @ac-93 @cove9988 @theDebugger811 @hsuehch @Demetrio92 @thomasgubler @IperGiove @ScheiklP
@simoninithomas @armandpl @manuel-delverme @Gautam-J @gianlucadecola @buoyancy99 @caburu @xy9485
@Gregwar @ycheng517 @quantitative-technologies @bcollazo @git-thor @TibiGG @cool-RR @MWeltevrede
@carlosluis @arjun-kg @tlpss
@carlosluis @arjun-kg @tlpss @JonathanKuelz
@Melanol @qgallouedec @francescoluciano @jlp-ue @burakdmb @timothe-chaumont @honglu2875
@anand-bala @hughperkins @sidney-tio @AlexPasqua @dominicgkerr @Akhilez @Rocamonde @tobirohrer @ZikangXiong
@DavyMorgan @luizapozzobon @Bonifatius94 @theSquaredError @harveybellini @DavyMorgan @FieteO @jonasreiher @npit @WeberSamuel @troiganto

View file

@ -128,10 +128,10 @@ def _check_goal_env_obs(obs: dict, observation_space: spaces.Dict, method_name:
"""
Check that an environment implementing the `compute_reward()` method
(previously known as GoalEnv in gym) contains at least three elements,
namely `observation`, `desired_goal`, and `achieved_goal`.
namely `observation`, `achieved_goal`, and `desired_goal`.
"""
assert len(observation_space.spaces) >= 3, (
"A goal conditioned env must contain at least 3 observation keys: `observation`, `desired_goal`, and `achieved_goal`. "
"A goal conditioned env must contain at least 3 observation keys: `observation`, `achieved_goal`, and `desired_goal`. "
f"The current observation contains {len(observation_space.spaces)} keys: {list(observation_space.spaces.keys())}"
)

View file

@ -312,8 +312,6 @@ class HerReplayBuffer(DictReplayBuffer):
# Compute new reward
rewards = self.env.env_method(
"compute_reward",
# here we use the new desired goal
obs["desired_goal"],
# the new state depends on the previous state and action
# s_{t+1} = f(s_t, a_t)
# so the next achieved_goal depends also on the previous state and action
@ -321,6 +319,8 @@ class HerReplayBuffer(DictReplayBuffer):
# r_t = reward(s_t, a_t) = reward(next_achieved_goal, desired_goal)
# therefore we have to use next_obs["achieved_goal"] and not obs["achieved_goal"]
next_obs["achieved_goal"],
# here we use the new desired goal
obs["desired_goal"],
infos,
# we use the method of the first environment assuming that all environments are identical.
indices=[0],

View file

@ -1 +1 @@
2.0.0a13
2.0.0a14