mirror of
https://github.com/saymrwulf/stable-baselines3.git
synced 2026-06-09 00:31:14 +00:00
Fix q-target in SAC (#77)
* Fix q-target in SAC * [ci skip] Update version
This commit is contained in:
parent
96b771f24e
commit
08e7519381
3 changed files with 5 additions and 5 deletions
|
|
@ -3,7 +3,7 @@
|
|||
Changelog
|
||||
==========
|
||||
|
||||
Pre-Release 0.8.0a1 (WIP)
|
||||
Pre-Release 0.8.0a2 (WIP)
|
||||
------------------------------
|
||||
|
||||
Breaking Changes:
|
||||
|
|
@ -21,6 +21,7 @@ New Features:
|
|||
Bug Fixes:
|
||||
^^^^^^^^^^
|
||||
- Fixed a bug in the ``close()`` method of ``SubprocVecEnv``, causing wrappers further down in the wrapper stack to not be closed. (@NeoExtended)
|
||||
- Fixed the target for updating Q-values in SAC: the entropy term was not conditioned on terminal states (it is now masked by ``1 - dones`` together with the bootstrapped Q-value)
|
||||
|
||||
Deprecations:
|
||||
^^^^^^^^^^^^^
|
||||
|
|
|
|||
|
|
@ -202,10 +202,9 @@ class SAC(OffPolicyAlgorithm):
|
|||
next_actions, next_log_prob = self.actor.action_log_prob(replay_data.next_observations)
|
||||
# Compute the target Q value
|
||||
target_q1, target_q2 = self.critic_target(replay_data.next_observations, next_actions)
|
||||
target_q = th.min(target_q1, target_q2)
|
||||
target_q = replay_data.rewards + (1 - replay_data.dones) * self.gamma * target_q
|
||||
target_q = th.min(target_q1, target_q2) - ent_coef * next_log_prob.reshape(-1, 1)
|
||||
# td error + entropy term
|
||||
q_backup = target_q - ent_coef * next_log_prob.reshape(-1, 1)
|
||||
q_backup = replay_data.rewards + (1 - replay_data.dones) * self.gamma * target_q
|
||||
|
||||
# Get current Q estimates
|
||||
# using action from the replay buffer
|
||||
|
|
|
|||
|
|
@ -1 +1 @@
|
|||
0.8.0a1
|
||||
0.8.0a2
|
||||
|
|
|
|||
Loading…
Reference in a new issue