mirror of
https://github.com/saymrwulf/stable-baselines3.git
synced 2026-06-02 23:40:09 +00:00
First sign of life
This commit is contained in:
parent
ad089f5b19
commit
fe8b415cbf
4 changed files with 25 additions and 67 deletions
|
|
@ -11,7 +11,6 @@ def test_pendulum():
|
|||
model.load("test_save")
|
||||
os.remove("test_save.pth")
|
||||
|
||||
|
||||
def test_cemrl():
|
||||
model = CEMRL('MlpPolicy', 'Pendulum-v0', policy_kwargs=dict(net_arch=[16]), pop_size=2, n_grad=1,
|
||||
start_timesteps=100, verbose=1)
|
||||
|
|
|
|||
|
|
@ -1,11 +1,8 @@
|
|||
import numpy as np
|
||||
import torch as th
|
||||
|
||||
from torchy_baselines.common.utils import discount_cumsum
|
||||
|
||||
class BaseBuffer(object):
|
||||
"""docstring for BaseBuffer."""
|
||||
|
||||
def __init__(self, buffer_size, state_dim, action_dim, device='cpu'):
|
||||
super(BaseBuffer, self).__init__()
|
||||
self.buffer_size = buffer_size
|
||||
|
|
@ -88,25 +85,23 @@ class RolloutBuffer(BaseBuffer):
|
|||
self.values = th.zeros(self.buffer_size, 1)
|
||||
self.log_probs = th.zeros(self.buffer_size, 1)
|
||||
self.advantages = th.zeros(self.buffer_size, 1)
|
||||
self.path_start_idx = 0
|
||||
|
||||
def finish_path(self, last_value=0):
|
||||
def compute_returns_and_advantage(self, last_value, done=False):
|
||||
"""
|
||||
From https://github.com/openai/spinningup/blob/master/spinup/algos/ppo/ppo.py
|
||||
From PPO2
|
||||
"""
|
||||
# No use of dones?
|
||||
path_slice = slice(self.path_start_idx, self.pos)
|
||||
rewards = np.append(self.rewards[path_slice].detach().cpu().numpy(), last_value)
|
||||
values = np.append(self.values[path_slice].detach().cpu().numpy(), last_value)
|
||||
|
||||
# the next two lines implement GAE-Lambda advantage calculation
|
||||
deltas = rewards[:-1] + self.gamma * values[1:] - values[:-1]
|
||||
|
||||
self.advantages[path_slice, 0] = th.FloatTensor(discount_cumsum(deltas, self.gamma * self.lambda_).copy())
|
||||
# the next line computes rewards-to-go, to be targets for the value function
|
||||
self.returns[path_slice, 0] = th.FloatTensor(discount_cumsum(rewards, self.gamma)[:-1].copy())
|
||||
|
||||
self.path_start_idx = self.pos
|
||||
last_gae_lam = 0
|
||||
for step in reversed(range(self.buffer_size)):
|
||||
if step == self.buffer_size - 1:
|
||||
next_non_terminal = 1.0 - float(done)
|
||||
next_value = last_value
|
||||
else:
|
||||
next_non_terminal = 1.0 - self.dones[step + 1]
|
||||
next_value = self.values[step + 1]
|
||||
delta = self.rewards[step] + self.gamma * next_value * next_non_terminal - self.values[step]
|
||||
last_gae_lam = delta + self.gamma * self.lambda_ * next_non_terminal * last_gae_lam
|
||||
self.advantages[step] = last_gae_lam
|
||||
self.returns = self.advantages + self.values
|
||||
|
||||
def add(self, state, action, reward, done, value, log_prob):
|
||||
self.values[self.pos] = th.FloatTensor([value])
|
||||
|
|
@ -119,14 +114,9 @@ class RolloutBuffer(BaseBuffer):
|
|||
if self.pos == self.buffer_size:
|
||||
self.full = True
|
||||
|
||||
def reset(self):
|
||||
self.path_start_idx = 0
|
||||
super(RolloutBuffer, self).reset()
|
||||
|
||||
def get(self, batch_size):
|
||||
assert self.full
|
||||
indices = th.randperm(self.buffer_size)
|
||||
minibatch_indices = []
|
||||
start_idx = 0
|
||||
while start_idx < self.buffer_size:
|
||||
yield self._get_samples(indices[start_idx:start_idx + batch_size])
|
||||
|
|
@ -135,6 +125,6 @@ class RolloutBuffer(BaseBuffer):
|
|||
def _get_samples(self, batch_inds):
|
||||
return (self.states[batch_inds].to(self.device),
|
||||
self.actions[batch_inds].to(self.device),
|
||||
self.log_probs[batch_inds].to(self.device),
|
||||
self.advantages[batch_inds].to(self.device),
|
||||
self.returns[batch_inds].to(self.device))
|
||||
self.log_probs[batch_inds].flatten().to(self.device),
|
||||
self.advantages[batch_inds].flatten().to(self.device),
|
||||
self.returns[batch_inds].flatten().to(self.device))
|
||||
|
|
|
|||
|
|
@ -21,38 +21,6 @@ def set_random_seed(seed, using_cuda=False):
|
|||
th.cuda.manual_seed(seed)
|
||||
|
||||
|
||||
# From stable_baselines.common.math_util
|
||||
# def discount(vector, gamma):
|
||||
# """
|
||||
# computes discounted sums along 0th dimension of vector x.
|
||||
# y[t] = x[t] + gamma*x[t+1] + gamma^2*x[t+2] + ... + gamma^k x[t+k],
|
||||
# where k = len(x) - t - 1
|
||||
#
|
||||
# :param vector: (np.ndarray) the input vector
|
||||
# :param gamma: (float) the discount value
|
||||
# :return: (np.ndarray) the output vector
|
||||
# """
|
||||
# assert vector.ndim >= 1
|
||||
# return scipy.signal.lfilter([1], [1, -gamma], vector[::-1], axis=0)[::-1]
|
||||
|
||||
|
||||
def discount_cumsum(x, discount):
|
||||
"""
|
||||
magic from rllab for computing discounted cumulative sums of vectors.
|
||||
|
||||
input:
|
||||
vector x,
|
||||
[x0,
|
||||
x1,
|
||||
x2]
|
||||
|
||||
output:
|
||||
[x0 + discount * x1 + discount^2 * x2,
|
||||
x1 + discount * x2,
|
||||
x2]
|
||||
"""
|
||||
return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]
|
||||
|
||||
# From stable baselines
|
||||
def explained_variance(y_pred, y_true):
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -88,19 +88,19 @@ class PPO(BaseRLModel):
|
|||
# No grad ok?
|
||||
with th.no_grad():
|
||||
action, value, log_prob = self.policy.forward(obs)
|
||||
action = action[0].cpu().numpy()
|
||||
action = action.flatten().cpu().numpy()
|
||||
|
||||
# Rescale and perform action
|
||||
obs, reward, done, _ = env.step(np.clip(action, -self.max_action, self.max_action))
|
||||
new_obs, reward, done, _ = env.step(np.clip(action, -self.max_action, self.max_action))
|
||||
|
||||
n_steps += 1
|
||||
rollout_buffer.add(obs, action, reward, float(done), value, log_prob)
|
||||
|
||||
obs = new_obs
|
||||
if done:
|
||||
value = 0.0
|
||||
obs = None
|
||||
|
||||
rollout_buffer.finish_path(last_value=value)
|
||||
rollout_buffer.compute_returns_and_advantage(value, done=done)
|
||||
|
||||
return obs
|
||||
|
||||
|
|
@ -113,7 +113,6 @@ class PPO(BaseRLModel):
|
|||
# Unpack
|
||||
state, action, old_log_prob, advantage, return_batch = replay_data
|
||||
|
||||
# _, values, log_prob = self.policy.forward(state)
|
||||
values, log_prob, entropy = self.policy.get_policy_stats(state, action)
|
||||
|
||||
# Normalize advantage
|
||||
|
|
@ -121,11 +120,12 @@ class PPO(BaseRLModel):
|
|||
advantage = (advantage - advantage.mean()) / (advantage.std() + 1e-8)
|
||||
|
||||
ratio = th.exp(log_prob - old_log_prob)
|
||||
|
||||
policy_loss_1 = advantage * ratio
|
||||
policy_loss_2 = advantage * th.clamp(ratio, 1 - self.clip_range, 1 + self.clip_range)
|
||||
policy_loss = -th.min(policy_loss_1, policy_loss_2).mean()
|
||||
# value_loss = th.mean((return_batch - value)**2)
|
||||
value_loss = F.mse_loss(return_batch, values)
|
||||
value_loss = F.mse_loss(return_batch, values.flatten())
|
||||
entropy_loss = th.mean(entropy)
|
||||
loss = policy_loss + self.ent_coef * entropy_loss + self.vf_coef * value_loss
|
||||
# loss = policy_loss
|
||||
|
|
@ -138,7 +138,8 @@ class PPO(BaseRLModel):
|
|||
# TODO: clip grad norm?
|
||||
# nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm)
|
||||
self.policy.optimizer.step()
|
||||
print(explained_variance(return_batch.numpy()[:, 0], values[:, 0].detach().cpu().numpy()))
|
||||
# print(value_loss.item())
|
||||
# print(explained_variance(return_batch.numpy(), values.flatten().detach().cpu().numpy()))
|
||||
|
||||
def learn(self, total_timesteps, callback=None, log_interval=100,
|
||||
eval_freq=-1, n_eval_episodes=5, tb_log_name="PPO", reset_num_timesteps=True):
|
||||
|
|
|
|||
Loading…
Reference in a new issue