First sign of life

This commit is contained in:
Antonin RAFFIN 2019-09-19 16:21:28 +02:00
parent ad089f5b19
commit fe8b415cbf
4 changed files with 25 additions and 67 deletions

View file

@ -11,7 +11,6 @@ def test_pendulum():
model.load("test_save")
os.remove("test_save.pth")
def test_cemrl():
model = CEMRL('MlpPolicy', 'Pendulum-v0', policy_kwargs=dict(net_arch=[16]), pop_size=2, n_grad=1,
start_timesteps=100, verbose=1)

View file

@ -1,11 +1,8 @@
import numpy as np
import torch as th
from torchy_baselines.common.utils import discount_cumsum
class BaseBuffer(object):
"""docstring for BaseBuffer."""
def __init__(self, buffer_size, state_dim, action_dim, device='cpu'):
super(BaseBuffer, self).__init__()
self.buffer_size = buffer_size
@ -88,25 +85,23 @@ class RolloutBuffer(BaseBuffer):
self.values = th.zeros(self.buffer_size, 1)
self.log_probs = th.zeros(self.buffer_size, 1)
self.advantages = th.zeros(self.buffer_size, 1)
self.path_start_idx = 0
def finish_path(self, last_value=0):
def compute_returns_and_advantage(self, last_value, done=False):
"""
From https://github.com/openai/spinningup/blob/master/spinup/algos/ppo/ppo.py
From PPO2
"""
# No use of dones?
path_slice = slice(self.path_start_idx, self.pos)
rewards = np.append(self.rewards[path_slice].detach().cpu().numpy(), last_value)
values = np.append(self.values[path_slice].detach().cpu().numpy(), last_value)
# the next two lines implement GAE-Lambda advantage calculation
deltas = rewards[:-1] + self.gamma * values[1:] - values[:-1]
self.advantages[path_slice, 0] = th.FloatTensor(discount_cumsum(deltas, self.gamma * self.lambda_).copy())
# the next line computes rewards-to-go, to be targets for the value function
self.returns[path_slice, 0] = th.FloatTensor(discount_cumsum(rewards, self.gamma)[:-1].copy())
self.path_start_idx = self.pos
last_gae_lam = 0
for step in reversed(range(self.buffer_size)):
if step == self.buffer_size - 1:
next_non_terminal = 1.0 - float(done)
next_value = last_value
else:
next_non_terminal = 1.0 - self.dones[step + 1]
next_value = self.values[step + 1]
delta = self.rewards[step] + self.gamma * next_value * next_non_terminal - self.values[step]
last_gae_lam = delta + self.gamma * self.lambda_ * next_non_terminal * last_gae_lam
self.advantages[step] = last_gae_lam
self.returns = self.advantages + self.values
def add(self, state, action, reward, done, value, log_prob):
self.values[self.pos] = th.FloatTensor([value])
@ -119,14 +114,9 @@ class RolloutBuffer(BaseBuffer):
if self.pos == self.buffer_size:
self.full = True
def reset(self):
self.path_start_idx = 0
super(RolloutBuffer, self).reset()
def get(self, batch_size):
assert self.full
indices = th.randperm(self.buffer_size)
minibatch_indices = []
start_idx = 0
while start_idx < self.buffer_size:
yield self._get_samples(indices[start_idx:start_idx + batch_size])
@ -135,6 +125,6 @@ class RolloutBuffer(BaseBuffer):
def _get_samples(self, batch_inds):
return (self.states[batch_inds].to(self.device),
self.actions[batch_inds].to(self.device),
self.log_probs[batch_inds].to(self.device),
self.advantages[batch_inds].to(self.device),
self.returns[batch_inds].to(self.device))
self.log_probs[batch_inds].flatten().to(self.device),
self.advantages[batch_inds].flatten().to(self.device),
self.returns[batch_inds].flatten().to(self.device))

View file

@ -21,38 +21,6 @@ def set_random_seed(seed, using_cuda=False):
th.cuda.manual_seed(seed)
# From stable_baselines.common.math_util
# def discount(vector, gamma):
# """
# computes discounted sums along 0th dimension of vector x.
# y[t] = x[t] + gamma*x[t+1] + gamma^2*x[t+2] + ... + gamma^k x[t+k],
# where k = len(x) - t - 1
#
# :param vector: (np.ndarray) the input vector
# :param gamma: (float) the discount value
# :return: (np.ndarray) the output vector
# """
# assert vector.ndim >= 1
# return scipy.signal.lfilter([1], [1, -gamma], vector[::-1], axis=0)[::-1]
def discount_cumsum(x, discount):
"""
magic from rllab for computing discounted cumulative sums of vectors.
input:
vector x,
[x0,
x1,
x2]
output:
[x0 + discount * x1 + discount^2 * x2,
x1 + discount * x2,
x2]
"""
return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]
# From stable baselines
def explained_variance(y_pred, y_true):
"""

View file

@ -88,19 +88,19 @@ class PPO(BaseRLModel):
# No grad ok?
with th.no_grad():
action, value, log_prob = self.policy.forward(obs)
action = action[0].cpu().numpy()
action = action.flatten().cpu().numpy()
# Rescale and perform action
obs, reward, done, _ = env.step(np.clip(action, -self.max_action, self.max_action))
new_obs, reward, done, _ = env.step(np.clip(action, -self.max_action, self.max_action))
n_steps += 1
rollout_buffer.add(obs, action, reward, float(done), value, log_prob)
obs = new_obs
if done:
value = 0.0
obs = None
rollout_buffer.finish_path(last_value=value)
rollout_buffer.compute_returns_and_advantage(value, done=done)
return obs
@ -113,7 +113,6 @@ class PPO(BaseRLModel):
# Unpack
state, action, old_log_prob, advantage, return_batch = replay_data
# _, values, log_prob = self.policy.forward(state)
values, log_prob, entropy = self.policy.get_policy_stats(state, action)
# Normalize advantage
@ -121,11 +120,12 @@ class PPO(BaseRLModel):
advantage = (advantage - advantage.mean()) / (advantage.std() + 1e-8)
ratio = th.exp(log_prob - old_log_prob)
policy_loss_1 = advantage * ratio
policy_loss_2 = advantage * th.clamp(ratio, 1 - self.clip_range, 1 + self.clip_range)
policy_loss = -th.min(policy_loss_1, policy_loss_2).mean()
# value_loss = th.mean((return_batch - value)**2)
value_loss = F.mse_loss(return_batch, values)
value_loss = F.mse_loss(return_batch, values.flatten())
entropy_loss = th.mean(entropy)
loss = policy_loss + self.ent_coef * entropy_loss + self.vf_coef * value_loss
# loss = policy_loss
@ -138,7 +138,8 @@ class PPO(BaseRLModel):
# TODO: clip grad norm?
# nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm)
self.policy.optimizer.step()
print(explained_variance(return_batch.numpy()[:, 0], values[:, 0].detach().cpu().numpy()))
# print(value_loss.item())
# print(explained_variance(return_batch.numpy(), values.flatten().detach().cpu().numpy()))
def learn(self, total_timesteps, callback=None, log_interval=100,
eval_freq=-1, n_eval_episodes=5, tb_log_name="PPO", reset_num_timesteps=True):