First sign of life

2026-07-19 19:00:58 +00:00 · 2019-09-19 16:21:28 +02:00 · 2019-09-19 16:21:28 +02:00 · fe8b415cbf
commit fe8b415cbf
parent ad089f5b19
4 changed files with 25 additions and 67 deletions
--- a/tests/test_run.py
+++ b/tests/test_run.py
@ -11,7 +11,6 @@ def test_pendulum():
    model.load("test_save")
    os.remove("test_save.pth")

-
 def test_cemrl():
    model = CEMRL('MlpPolicy', 'Pendulum-v0', policy_kwargs=dict(net_arch=[16]), pop_size=2, n_grad=1,
                 start_timesteps=100, verbose=1)
--- a/torchy_baselines/common/buffers.py
+++ b/torchy_baselines/common/buffers.py
@ -1,11 +1,8 @@
 import numpy as np
 import torch as th

-from torchy_baselines.common.utils import  discount_cumsum

 class BaseBuffer(object):
-    """docstring for BaseBuffer."""
-
    def __init__(self, buffer_size, state_dim, action_dim, device='cpu'):
        super(BaseBuffer, self).__init__()
        self.buffer_size = buffer_size
@ -88,25 +85,23 @@ class RolloutBuffer(BaseBuffer):
        self.values = th.zeros(self.buffer_size, 1)
        self.log_probs = th.zeros(self.buffer_size, 1)
        self.advantages = th.zeros(self.buffer_size, 1)
-        self.path_start_idx = 0

-    def finish_path(self, last_value=0):
+    def compute_returns_and_advantage(self, last_value, done=False):
        """
-        From https://github.com/openai/spinningup/blob/master/spinup/algos/ppo/ppo.py
+        From PPO2
        """
-        # No use of dones?
-        path_slice = slice(self.path_start_idx, self.pos)
-        rewards = np.append(self.rewards[path_slice].detach().cpu().numpy(), last_value)
-        values = np.append(self.values[path_slice].detach().cpu().numpy(), last_value)
-
-        # the next two lines implement GAE-Lambda advantage calculation
-        deltas = rewards[:-1] + self.gamma * values[1:] - values[:-1]
-
-        self.advantages[path_slice, 0] = th.FloatTensor(discount_cumsum(deltas, self.gamma * self.lambda_).copy())
-        # the next line computes rewards-to-go, to be targets for the value function
-        self.returns[path_slice, 0] = th.FloatTensor(discount_cumsum(rewards, self.gamma)[:-1].copy())
-
-        self.path_start_idx = self.pos
+        last_gae_lam = 0
+        for step in reversed(range(self.buffer_size)):
+            if step == self.buffer_size - 1:
+                next_non_terminal = 1.0 - float(done)
+                next_value = last_value
+            else:
+                next_non_terminal = 1.0 - self.dones[step + 1]
+                next_value = self.values[step + 1]
+            delta = self.rewards[step] + self.gamma * next_value * next_non_terminal - self.values[step]
+            last_gae_lam = delta + self.gamma * self.lambda_ * next_non_terminal * last_gae_lam
+            self.advantages[step] = last_gae_lam
+        self.returns = self.advantages + self.values

    def add(self, state, action, reward, done, value, log_prob):
        self.values[self.pos] = th.FloatTensor([value])
@ -119,14 +114,9 @@ class RolloutBuffer(BaseBuffer):
        if self.pos == self.buffer_size:
            self.full = True

-    def reset(self):
-        self.path_start_idx = 0
-        super(RolloutBuffer, self).reset()
-
    def get(self, batch_size):
        assert self.full
        indices = th.randperm(self.buffer_size)
-        minibatch_indices = []
        start_idx = 0
        while start_idx < self.buffer_size:
            yield self._get_samples(indices[start_idx:start_idx + batch_size])
@ -135,6 +125,6 @@ class RolloutBuffer(BaseBuffer):
    def _get_samples(self, batch_inds):
        return (self.states[batch_inds].to(self.device),
                self.actions[batch_inds].to(self.device),
-                self.log_probs[batch_inds].to(self.device),
-                self.advantages[batch_inds].to(self.device),
-                self.returns[batch_inds].to(self.device))
+                self.log_probs[batch_inds].flatten().to(self.device),
+                self.advantages[batch_inds].flatten().to(self.device),
+                self.returns[batch_inds].flatten().to(self.device))
--- a/torchy_baselines/common/utils.py
+++ b/torchy_baselines/common/utils.py
@ -21,38 +21,6 @@ def set_random_seed(seed, using_cuda=False):
        th.cuda.manual_seed(seed)


-# From stable_baselines.common.math_util
-# def discount(vector, gamma):
-#     """
-#     computes discounted sums along 0th dimension of vector x.
-#         y[t] = x[t] + gamma*x[t+1] + gamma^2*x[t+2] + ... + gamma^k x[t+k],
-#                 where k = len(x) - t - 1
-#
-#     :param vector: (np.ndarray) the input vector
-#     :param gamma: (float) the discount value
-#     :return: (np.ndarray) the output vector
-#     """
-#     assert vector.ndim >= 1
-#     return scipy.signal.lfilter([1], [1, -gamma], vector[::-1], axis=0)[::-1]
-
-
-def discount_cumsum(x, discount):
-    """
-    magic from rllab for computing discounted cumulative sums of vectors.
-
-    input:
-        vector x,
-        [x0,
-         x1,
-         x2]
-
-    output:
-        [x0 + discount * x1 + discount^2 * x2,
-         x1 + discount * x2,
-         x2]
-    """
-    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]
-
 # From stable baselines
 def explained_variance(y_pred, y_true):
    """
--- a/torchy_baselines/ppo/ppo.py
+++ b/torchy_baselines/ppo/ppo.py
@ -88,19 +88,19 @@ class PPO(BaseRLModel):
            # No grad ok?
            with th.no_grad():
                action, value, log_prob = self.policy.forward(obs)
-            action = action[0].cpu().numpy()
+            action = action.flatten().cpu().numpy()

            # Rescale and perform action
-            obs, reward, done, _ = env.step(np.clip(action, -self.max_action, self.max_action))
+            new_obs, reward, done, _ = env.step(np.clip(action, -self.max_action, self.max_action))

            n_steps += 1
            rollout_buffer.add(obs, action, reward, float(done), value, log_prob)

+            obs = new_obs
            if done:
-                value = 0.0
                obs = None

-            rollout_buffer.finish_path(last_value=value)
+        rollout_buffer.compute_returns_and_advantage(value, done=done)

        return obs

@ -113,7 +113,6 @@ class PPO(BaseRLModel):
                # Unpack
                state, action, old_log_prob, advantage, return_batch = replay_data

-                # _, values, log_prob = self.policy.forward(state)
                values, log_prob, entropy = self.policy.get_policy_stats(state, action)

                # Normalize advantage
@ -121,11 +120,12 @@ class PPO(BaseRLModel):
                advantage = (advantage - advantage.mean()) / (advantage.std() + 1e-8)

                ratio = th.exp(log_prob - old_log_prob)
+
                policy_loss_1 = advantage * ratio
                policy_loss_2 = advantage * th.clamp(ratio, 1 - self.clip_range, 1 + self.clip_range)
                policy_loss = -th.min(policy_loss_1, policy_loss_2).mean()
                # value_loss = th.mean((return_batch - value)**2)
-                value_loss = F.mse_loss(return_batch, values)
+                value_loss = F.mse_loss(return_batch, values.flatten())
                entropy_loss = th.mean(entropy)
                loss = policy_loss + self.ent_coef * entropy_loss + self.vf_coef * value_loss
                # loss = policy_loss
@ -138,7 +138,8 @@ class PPO(BaseRLModel):
                # TODO: clip grad norm?
                # nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm)
                self.policy.optimizer.step()
-        print(explained_variance(return_batch.numpy()[:, 0], values[:, 0].detach().cpu().numpy()))
+        # print(value_loss.item())
+        # print(explained_variance(return_batch.numpy(), values.flatten().detach().cpu().numpy()))

    def learn(self, total_timesteps, callback=None, log_interval=100,
              eval_freq=-1, n_eval_episodes=5, tb_log_name="PPO", reset_num_timesteps=True):