From 255ff10bfffdccaf7193772d46e0c2df6cf36a9b Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Fri, 20 Sep 2019 15:19:04 +0200 Subject: [PATCH] PPO VecEnv compat --- setup.py | 5 +- torchy_baselines/common/base_class.py | 33 +++++++-- torchy_baselines/common/buffers.py | 99 ++++++++++++++++++--------- torchy_baselines/common/evaluation.py | 10 +-- torchy_baselines/ppo/ppo.py | 47 ++++++------- 5 files changed, 124 insertions(+), 70 deletions(-) diff --git a/setup.py b/setup.py index 4f40cde..994d949 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,8 @@ setup(name='torchy_baselines', install_requires=[ 'gym[classic_control]>=0.10.9', 'numpy', - 'torch>=1.2.0' + 'torch>=1.2.0', + 'cloudpickle' ], extras_require={ 'tests': [ @@ -33,7 +34,7 @@ setup(name='torchy_baselines', license="MIT", long_description="", long_description_content_type='text/markdown', - version="0.0.2", + version="0.0.3", ) # python setup.py sdist diff --git a/torchy_baselines/common/base_class.py b/torchy_baselines/common/base_class.py index 14e050f..11073e9 100644 --- a/torchy_baselines/common/base_class.py +++ b/torchy_baselines/common/base_class.py @@ -7,6 +7,7 @@ import numpy as np from torchy_baselines.common.policies import get_policy_from_name from torchy_baselines.common.utils import set_random_seed +from torchy_baselines.common.vec_env import DummyVecEnv, VecEnv class BaseRLModel(object): @@ -24,7 +25,8 @@ class BaseRLModel(object): """ __metaclass__ = ABCMeta - def __init__(self, policy, env, policy_base, policy_kwargs=None, verbose=0, device='auto'): + def __init__(self, policy, env, policy_base, policy_kwargs=None, + verbose=0, device='auto', support_multi_env=False): if isinstance(policy, str) and policy_base is not None: self.policy = get_policy_from_name(policy_base, policy) else: @@ -48,13 +50,32 @@ class BaseRLModel(object): self.replay_buffer = None if env is not None: - if env is not None: - if isinstance(env, str): - env = gym.make(env) - self.env = env - self.n_envs = 1 + if isinstance(env, str): + if self.verbose >= 1: + print("Creating environment from the given name, wrapped in a DummyVecEnv.") + env = DummyVecEnv([lambda: gym.make(env)]) + self.observation_space = env.observation_space self.action_space = env.action_space + if not isinstance(env, VecEnv): + if self.verbose >= 1: + print("Wrapping the env in a DummyVecEnv.") + env = DummyVecEnv([lambda: env]) + self.n_envs = env.num_envs + self.env = env + + if not support_multi_env and self.n_envs > 1: + raise ValueError("Error: the model does not support multiple envs requires a single vectorized" + " environment.") + + # if env is not None: + # if env is not None: + # if isinstance(env, str): + # env = gym.make(env) + # self.env = env + # self.n_envs = 1 + # self.observation_space = env.observation_space + # self.action_space = env.action_space def get_env(self): """ diff --git a/torchy_baselines/common/buffers.py b/torchy_baselines/common/buffers.py index ef13c94..8c4ccf6 100644 --- a/torchy_baselines/common/buffers.py +++ b/torchy_baselines/common/buffers.py @@ -3,7 +3,7 @@ import torch as th class BaseBuffer(object): - def __init__(self, buffer_size, state_dim, action_dim, device='cpu'): + def __init__(self, buffer_size, state_dim, action_dim, device='cpu', n_envs=1): super(BaseBuffer, self).__init__() self.buffer_size = buffer_size self.state_dim = state_dim @@ -11,6 +11,22 @@ class BaseBuffer(object): self.pos = 0 self.full = False self.device = device + self.n_envs = n_envs + + @staticmethod + def swap_and_flatten(tensor): + """ + Swap and then flatten axes 0 (buffer_size) and 1 (n_envs) + to convert shape from [n_steps, n_envs, ...] (when ... is the shape of the features) + to [n_steps * n_envs, ...] (which maintain the order) + + :param tensor: (th.Tensor) + :return: (th.Tensor) + """ + shape = tensor.shape + if len(shape) < 3: + shape = shape + (1,) + return tensor.transpose(0, 1).reshape(shape[0] * shape[1], *shape[2:]) def size(self): if self.full: @@ -40,21 +56,22 @@ class ReplayBuffer(BaseBuffer): """ Taken from https://github.com/apourchot/CEM-RL """ - def __init__(self, buffer_size, state_dim, action_dim, device='cpu'): - super(ReplayBuffer, self).__init__(buffer_size, state_dim, action_dim, device) - self.states = th.zeros(self.buffer_size, self.state_dim) - self.actions = th.zeros(self.buffer_size, self.action_dim) - self.next_states = th.zeros(self.buffer_size, self.state_dim) - self.rewards = th.zeros(self.buffer_size, 1) - self.dones = th.zeros(self.buffer_size, 1) + def __init__(self, buffer_size, state_dim, action_dim, device='cpu', n_envs=1): + super(ReplayBuffer, self).__init__(buffer_size, state_dim, action_dim, device, n_envs=n_envs) + + assert n_envs == 1 + self.states = th.zeros(self.buffer_size, self.n_envs, self.state_dim) + self.actions = th.zeros(self.buffer_size, self.n_envs, self.action_dim) + self.next_states = th.zeros(self.buffer_size, self.n_envs, self.state_dim) + self.rewards = th.zeros(self.buffer_size, self.n_envs) + self.dones = th.zeros(self.buffer_size, self.n_envs) def add(self, state, next_state, action, reward, done): - self.states[self.pos] = th.FloatTensor(state) self.next_states[self.pos] = th.FloatTensor(next_state) self.actions[self.pos] = th.FloatTensor(action) - self.rewards[self.pos] = th.FloatTensor([reward]) - self.dones[self.pos] = th.FloatTensor([done]) + self.rewards[self.pos] = th.FloatTensor(reward) + self.dones[self.pos] = th.FloatTensor(done) self.pos += 1 if self.pos == self.buffer_size: @@ -62,39 +79,46 @@ class ReplayBuffer(BaseBuffer): self.pos = 0 def _get_samples(self, batch_inds): - return (self.states[batch_inds].to(self.device), - self.actions[batch_inds].to(self.device), - self.next_states[batch_inds].to(self.device), + return (self.states[batch_inds, 0, :].to(self.device), + self.actions[batch_inds, 0, :].to(self.device), + self.next_states[batch_inds, 0, :].to(self.device), self.dones[batch_inds].to(self.device), self.rewards[batch_inds].to(self.device)) class RolloutBuffer(BaseBuffer): def __init__(self, buffer_size, state_dim, action_dim, device='cpu', - lambda_=1, gamma=0.99): - super(RolloutBuffer, self).__init__(buffer_size, state_dim, action_dim, device) + lambda_=1, gamma=0.99, n_envs=1): + super(RolloutBuffer, self).__init__(buffer_size, state_dim, action_dim, device, n_envs=n_envs) self.lambda_ = lambda_ self.gamma = gamma - # TODO: add n_envs - self.states = th.zeros(self.buffer_size, self.state_dim) - self.actions = th.zeros(self.buffer_size, self.action_dim) - self.rewards = th.zeros(self.buffer_size, 1) - self.returns = th.zeros(self.buffer_size, 1) - self.dones = th.zeros(self.buffer_size, 1) - self.values = th.zeros(self.buffer_size, 1) - self.log_probs = th.zeros(self.buffer_size, 1) - self.advantages = th.zeros(self.buffer_size, 1) + self.states, self.actions, self.rewards, self.advantages = None, None, None, None + self.returns, self.dones, self.values, self.log_probs = None, None, None, None + self.generator_ready = False + self.reset() - def compute_returns_and_advantage(self, last_value, done=False): + def reset(self): + self.states = th.zeros(self.buffer_size, self.n_envs, self.state_dim) + self.actions = th.zeros(self.buffer_size, self.n_envs, self.action_dim) + self.rewards = th.zeros(self.buffer_size, self.n_envs) + self.returns = th.zeros(self.buffer_size, self.n_envs) + self.dones = th.zeros(self.buffer_size, self.n_envs) + self.values = th.zeros(self.buffer_size, self.n_envs) + self.log_probs = th.zeros(self.buffer_size, self.n_envs) + self.advantages = th.zeros(self.buffer_size, self.n_envs) + self.generator_ready = False + super(RolloutBuffer, self).reset() + + def compute_returns_and_advantage(self, last_value, dones=False): """ From PPO2 """ last_gae_lam = 0 for step in reversed(range(self.buffer_size)): if step == self.buffer_size - 1: - next_non_terminal = 1.0 - float(done) - next_value = last_value + next_non_terminal = th.FloatTensor(1.0 - dones) + next_value = last_value.flatten() else: next_non_terminal = 1.0 - self.dones[step + 1] next_value = self.values[step + 1] @@ -104,21 +128,28 @@ class RolloutBuffer(BaseBuffer): self.returns = self.advantages + self.values def add(self, state, action, reward, done, value, log_prob): - self.values[self.pos] = th.FloatTensor([value]) - self.log_probs[self.pos] = th.FloatTensor([log_prob]) + self.values[self.pos] = th.FloatTensor(value.flatten()) + self.log_probs[self.pos] = th.FloatTensor(log_prob) self.states[self.pos] = th.FloatTensor(state) self.actions[self.pos] = th.FloatTensor(action) - self.rewards[self.pos] = th.FloatTensor([reward]) - self.dones[self.pos] = th.FloatTensor([done]) + self.rewards[self.pos] = th.FloatTensor(reward) + self.dones[self.pos] = th.FloatTensor(done) self.pos += 1 if self.pos == self.buffer_size: self.full = True def get(self, batch_size): assert self.full - indices = th.randperm(self.buffer_size) + indices = th.randperm(self.buffer_size * self.n_envs) + # Prepare the data + if not self.generator_ready: + for tensor in ['states', 'actions', 'values', + 'log_probs', 'advantages', 'returns']: + self.__dict__[tensor] = self.swap_and_flatten(self.__dict__[tensor]) + self.generator_ready = True + start_idx = 0 - while start_idx < self.buffer_size: + while start_idx < self.buffer_size * self.n_envs: yield self._get_samples(indices[start_idx:start_idx + batch_size]) start_idx += batch_size diff --git a/torchy_baselines/common/evaluation.py b/torchy_baselines/common/evaluation.py index fc0f38a..3d76d9f 100644 --- a/torchy_baselines/common/evaluation.py +++ b/torchy_baselines/common/evaluation.py @@ -5,18 +5,18 @@ def evaluate_policy(model, env, n_eval_episodes=10, deterministic=True, render=F """ Runs policy for n episodes and returns average reward """ - mean_reward, n_steps = 0.0, 0 + episode_rewards, n_steps = [], 0 for _ in range(n_eval_episodes): obs = env.reset() done = False + episode_reward = 0.0 while not done: action = model.predict(obs, deterministic=deterministic) obs, reward, done, _ = env.step(action) - mean_reward += reward + episode_reward += reward n_steps += 1 if render: env.render() + episode_rewards.append(episode_reward) - mean_reward /= n_eval_episodes - - return mean_reward, n_steps + return np.mean(episode_rewards), n_steps diff --git a/torchy_baselines/ppo/ppo.py b/torchy_baselines/ppo/ppo.py index 8470b89..463bf6b 100644 --- a/torchy_baselines/ppo/ppo.py +++ b/torchy_baselines/ppo/ppo.py @@ -1,5 +1,6 @@ import time +import gym import torch as th import torch.nn.functional as F import numpy as np @@ -9,6 +10,7 @@ from torchy_baselines.common.evaluation import evaluate_policy from torchy_baselines.ppo.policies import PPOPolicy from torchy_baselines.common.buffers import RolloutBuffer from torchy_baselines.common.utils import explained_variance +from torchy_baselines.common.vec_env import VecEnv, DummyVecEnv class PPO(BaseRLModel): @@ -28,9 +30,9 @@ class PPO(BaseRLModel): target_kl=None, clip_range_vf=None, _init_setup_model=True): - super(PPO, self).__init__(policy, env, PPOPolicy, policy_kwargs, verbose, device) + super(PPO, self).__init__(policy, env, PPOPolicy, policy_kwargs, + verbose, device, support_multi_env=True) - self.max_action = np.abs(self.action_space.high) self.learning_rate = learning_rate self._seed = seed self.batch_size = batch_size @@ -54,7 +56,7 @@ class PPO(BaseRLModel): self.seed(self._seed) self.rollout_buffer = RolloutBuffer(self.n_steps, state_dim, action_dim, self.device, - gamma=self.gamma, lambda_=self.lambda_) + gamma=self.gamma, lambda_=self.lambda_, n_envs=self.n_envs) self.policy = self.policy(self.observation_space, self.action_space, self.learning_rate, device=self.device, **self.policy_kwargs) @@ -63,7 +65,7 @@ class PPO(BaseRLModel): observation = np.array(observation) with th.no_grad(): observation = th.FloatTensor(observation.reshape(1, -1)).to(self.device) - return self.policy.actor_forward(observation, deterministic=False).flatten() + return self.policy.actor_forward(observation, deterministic=False) def predict(self, observation, state=None, mask=None, deterministic=True): """ @@ -75,37 +77,32 @@ class PPO(BaseRLModel): :param deterministic: (bool) Whether or not to return deterministic actions. :return: (np.ndarray, np.ndarray) the model's action and the next state (used in recurrent policies) """ - return np.clip(self.select_action(observation), -self.max_action, self.max_action) + return np.clip(self.select_action(observation), self.action_space.low, self.action_space.high) def collect_rollouts(self, env, rollout_buffer, n_rollout_steps=256, callback=None, obs=None): n_steps = 0 - done = obs is None rollout_buffer.reset() while n_steps < n_rollout_steps: - # Reset environment - if done: - obs = env.reset() - - # No grad ok? with th.no_grad(): - action, value, log_prob = self.policy.forward(obs) - action = action.flatten().cpu().numpy() + actions, values, log_probs = self.policy.forward(obs) + actions = actions.cpu().numpy() # Rescale and perform action - # TODO: clip only when using Box action space - new_obs, reward, done, _ = env.step(np.clip(action, -self.max_action, self.max_action)) + clipped_actions = actions + # Clip the actions to avoid out of bound error + if isinstance(self.action_space, gym.spaces.Box): + clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high) + new_obs, rewards, dones, _ = env.step(clipped_actions) n_steps += 1 - rollout_buffer.add(obs, action, reward, float(done), value, log_prob) + rollout_buffer.add(obs, actions, rewards, dones, values, log_probs) obs = new_obs - if done: - obs = None - rollout_buffer.compute_returns_and_advantage(value, done=done) + rollout_buffer.compute_returns_and_advantage(values, dones=dones) return obs @@ -121,7 +118,6 @@ class PPO(BaseRLModel): values, log_prob, entropy = self.policy.get_policy_stats(state, action) values = values.flatten() - # Normalize advantage advantage = (advantage - advantage.mean()) / (advantage.std() + 1e-8) @@ -173,7 +169,12 @@ class PPO(BaseRLModel): episode_num = 0 evaluations = [] start_time = time.time() - obs = None + obs = self.env.reset() + + if eval_env is not None and not isinstance(eval_env, VecEnv): + eval_env = DummyVecEnv([lambda: eval_env]) + + assert eval_env.num_envs == 1 while self.num_timesteps < total_timesteps: @@ -185,8 +186,8 @@ class PPO(BaseRLModel): obs = self.collect_rollouts(self.env, self.rollout_buffer, n_rollout_steps=self.n_steps, obs=obs) episode_num += 1 - self.num_timesteps += self.n_steps - timesteps_since_eval += self.n_steps + self.num_timesteps += self.n_steps * self.n_envs + timesteps_since_eval += self.n_steps * self.n_envs self.train(self.n_optim, batch_size=self.batch_size)