Fix q-target in SAC (#77)

* Fix q-target in SAC

* [ci skip] Update version
This commit is contained in:
Antonin RAFFIN 2020-06-29 17:58:55 +02:00 committed by GitHub
parent 96b771f24e
commit 08e7519381
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 5 additions and 5 deletions

View file

@ -3,7 +3,7 @@
Changelog
==========
Pre-Release 0.8.0a1 (WIP)
Pre-Release 0.8.0a2 (WIP)
------------------------------
Breaking Changes:
@ -21,6 +21,7 @@ New Features:
Bug Fixes:
^^^^^^^^^^
- Fixed a bug in the ``close()`` method of ``SubprocVecEnv``, causing wrappers further down in the wrapper stack to not be closed. (@NeoExtended)
- Fixed the target for updating Q-values in SAC: the entropy term was not conditioned on terminal states
Deprecations:
^^^^^^^^^^^^^

View file

@ -202,10 +202,9 @@ class SAC(OffPolicyAlgorithm):
next_actions, next_log_prob = self.actor.action_log_prob(replay_data.next_observations)
# Compute the target Q value
target_q1, target_q2 = self.critic_target(replay_data.next_observations, next_actions)
target_q = th.min(target_q1, target_q2)
target_q = replay_data.rewards + (1 - replay_data.dones) * self.gamma * target_q
target_q = th.min(target_q1, target_q2) - ent_coef * next_log_prob.reshape(-1, 1)
# td error + entropy term
q_backup = target_q - ent_coef * next_log_prob.reshape(-1, 1)
q_backup = replay_data.rewards + (1 - replay_data.dones) * self.gamma * target_q
# Get current Q estimates
# using action from the replay buffer

View file

@ -1 +1 @@
0.8.0a1
0.8.0a2