PPO VecEnv compat

This commit is contained in:
Antonin Raffin 2019-09-20 15:19:04 +02:00
parent 56053bc692
commit 255ff10bff
5 changed files with 124 additions and 70 deletions

View file

@ -9,7 +9,8 @@ setup(name='torchy_baselines',
install_requires=[
'gym[classic_control]>=0.10.9',
'numpy',
'torch>=1.2.0'
'torch>=1.2.0',
'cloudpickle'
],
extras_require={
'tests': [
@ -33,7 +34,7 @@ setup(name='torchy_baselines',
license="MIT",
long_description="",
long_description_content_type='text/markdown',
version="0.0.2",
version="0.0.3",
)
# python setup.py sdist

View file

@ -7,6 +7,7 @@ import numpy as np
from torchy_baselines.common.policies import get_policy_from_name
from torchy_baselines.common.utils import set_random_seed
from torchy_baselines.common.vec_env import DummyVecEnv, VecEnv
class BaseRLModel(object):
@ -24,7 +25,8 @@ class BaseRLModel(object):
"""
__metaclass__ = ABCMeta
def __init__(self, policy, env, policy_base, policy_kwargs=None, verbose=0, device='auto'):
def __init__(self, policy, env, policy_base, policy_kwargs=None,
verbose=0, device='auto', support_multi_env=False):
if isinstance(policy, str) and policy_base is not None:
self.policy = get_policy_from_name(policy_base, policy)
else:
@ -48,13 +50,32 @@ class BaseRLModel(object):
self.replay_buffer = None
if env is not None:
if env is not None:
if isinstance(env, str):
env = gym.make(env)
self.env = env
self.n_envs = 1
if isinstance(env, str):
if self.verbose >= 1:
print("Creating environment from the given name, wrapped in a DummyVecEnv.")
env = DummyVecEnv([lambda: gym.make(env)])
self.observation_space = env.observation_space
self.action_space = env.action_space
if not isinstance(env, VecEnv):
if self.verbose >= 1:
print("Wrapping the env in a DummyVecEnv.")
env = DummyVecEnv([lambda: env])
self.n_envs = env.num_envs
self.env = env
if not support_multi_env and self.n_envs > 1:
raise ValueError("Error: the model does not support multiple envs requires a single vectorized"
" environment.")
# if env is not None:
# if env is not None:
# if isinstance(env, str):
# env = gym.make(env)
# self.env = env
# self.n_envs = 1
# self.observation_space = env.observation_space
# self.action_space = env.action_space
def get_env(self):
"""

View file

@ -3,7 +3,7 @@ import torch as th
class BaseBuffer(object):
def __init__(self, buffer_size, state_dim, action_dim, device='cpu'):
def __init__(self, buffer_size, state_dim, action_dim, device='cpu', n_envs=1):
super(BaseBuffer, self).__init__()
self.buffer_size = buffer_size
self.state_dim = state_dim
@ -11,6 +11,22 @@ class BaseBuffer(object):
self.pos = 0
self.full = False
self.device = device
self.n_envs = n_envs
@staticmethod
def swap_and_flatten(tensor):
"""
Swap and then flatten axes 0 (buffer_size) and 1 (n_envs)
to convert shape from [n_steps, n_envs, ...] (when ... is the shape of the features)
to [n_steps * n_envs, ...] (which maintain the order)
:param tensor: (th.Tensor)
:return: (th.Tensor)
"""
shape = tensor.shape
if len(shape) < 3:
shape = shape + (1,)
return tensor.transpose(0, 1).reshape(shape[0] * shape[1], *shape[2:])
def size(self):
if self.full:
@ -40,21 +56,22 @@ class ReplayBuffer(BaseBuffer):
"""
Taken from https://github.com/apourchot/CEM-RL
"""
def __init__(self, buffer_size, state_dim, action_dim, device='cpu'):
super(ReplayBuffer, self).__init__(buffer_size, state_dim, action_dim, device)
self.states = th.zeros(self.buffer_size, self.state_dim)
self.actions = th.zeros(self.buffer_size, self.action_dim)
self.next_states = th.zeros(self.buffer_size, self.state_dim)
self.rewards = th.zeros(self.buffer_size, 1)
self.dones = th.zeros(self.buffer_size, 1)
def __init__(self, buffer_size, state_dim, action_dim, device='cpu', n_envs=1):
super(ReplayBuffer, self).__init__(buffer_size, state_dim, action_dim, device, n_envs=n_envs)
assert n_envs == 1
self.states = th.zeros(self.buffer_size, self.n_envs, self.state_dim)
self.actions = th.zeros(self.buffer_size, self.n_envs, self.action_dim)
self.next_states = th.zeros(self.buffer_size, self.n_envs, self.state_dim)
self.rewards = th.zeros(self.buffer_size, self.n_envs)
self.dones = th.zeros(self.buffer_size, self.n_envs)
def add(self, state, next_state, action, reward, done):
self.states[self.pos] = th.FloatTensor(state)
self.next_states[self.pos] = th.FloatTensor(next_state)
self.actions[self.pos] = th.FloatTensor(action)
self.rewards[self.pos] = th.FloatTensor([reward])
self.dones[self.pos] = th.FloatTensor([done])
self.rewards[self.pos] = th.FloatTensor(reward)
self.dones[self.pos] = th.FloatTensor(done)
self.pos += 1
if self.pos == self.buffer_size:
@ -62,39 +79,46 @@ class ReplayBuffer(BaseBuffer):
self.pos = 0
def _get_samples(self, batch_inds):
return (self.states[batch_inds].to(self.device),
self.actions[batch_inds].to(self.device),
self.next_states[batch_inds].to(self.device),
return (self.states[batch_inds, 0, :].to(self.device),
self.actions[batch_inds, 0, :].to(self.device),
self.next_states[batch_inds, 0, :].to(self.device),
self.dones[batch_inds].to(self.device),
self.rewards[batch_inds].to(self.device))
class RolloutBuffer(BaseBuffer):
def __init__(self, buffer_size, state_dim, action_dim, device='cpu',
lambda_=1, gamma=0.99):
super(RolloutBuffer, self).__init__(buffer_size, state_dim, action_dim, device)
lambda_=1, gamma=0.99, n_envs=1):
super(RolloutBuffer, self).__init__(buffer_size, state_dim, action_dim, device, n_envs=n_envs)
self.lambda_ = lambda_
self.gamma = gamma
# TODO: add n_envs
self.states = th.zeros(self.buffer_size, self.state_dim)
self.actions = th.zeros(self.buffer_size, self.action_dim)
self.rewards = th.zeros(self.buffer_size, 1)
self.returns = th.zeros(self.buffer_size, 1)
self.dones = th.zeros(self.buffer_size, 1)
self.values = th.zeros(self.buffer_size, 1)
self.log_probs = th.zeros(self.buffer_size, 1)
self.advantages = th.zeros(self.buffer_size, 1)
self.states, self.actions, self.rewards, self.advantages = None, None, None, None
self.returns, self.dones, self.values, self.log_probs = None, None, None, None
self.generator_ready = False
self.reset()
def compute_returns_and_advantage(self, last_value, done=False):
def reset(self):
self.states = th.zeros(self.buffer_size, self.n_envs, self.state_dim)
self.actions = th.zeros(self.buffer_size, self.n_envs, self.action_dim)
self.rewards = th.zeros(self.buffer_size, self.n_envs)
self.returns = th.zeros(self.buffer_size, self.n_envs)
self.dones = th.zeros(self.buffer_size, self.n_envs)
self.values = th.zeros(self.buffer_size, self.n_envs)
self.log_probs = th.zeros(self.buffer_size, self.n_envs)
self.advantages = th.zeros(self.buffer_size, self.n_envs)
self.generator_ready = False
super(RolloutBuffer, self).reset()
def compute_returns_and_advantage(self, last_value, dones=False):
"""
From PPO2
"""
last_gae_lam = 0
for step in reversed(range(self.buffer_size)):
if step == self.buffer_size - 1:
next_non_terminal = 1.0 - float(done)
next_value = last_value
next_non_terminal = th.FloatTensor(1.0 - dones)
next_value = last_value.flatten()
else:
next_non_terminal = 1.0 - self.dones[step + 1]
next_value = self.values[step + 1]
@ -104,21 +128,28 @@ class RolloutBuffer(BaseBuffer):
self.returns = self.advantages + self.values
def add(self, state, action, reward, done, value, log_prob):
self.values[self.pos] = th.FloatTensor([value])
self.log_probs[self.pos] = th.FloatTensor([log_prob])
self.values[self.pos] = th.FloatTensor(value.flatten())
self.log_probs[self.pos] = th.FloatTensor(log_prob)
self.states[self.pos] = th.FloatTensor(state)
self.actions[self.pos] = th.FloatTensor(action)
self.rewards[self.pos] = th.FloatTensor([reward])
self.dones[self.pos] = th.FloatTensor([done])
self.rewards[self.pos] = th.FloatTensor(reward)
self.dones[self.pos] = th.FloatTensor(done)
self.pos += 1
if self.pos == self.buffer_size:
self.full = True
def get(self, batch_size):
assert self.full
indices = th.randperm(self.buffer_size)
indices = th.randperm(self.buffer_size * self.n_envs)
# Prepare the data
if not self.generator_ready:
for tensor in ['states', 'actions', 'values',
'log_probs', 'advantages', 'returns']:
self.__dict__[tensor] = self.swap_and_flatten(self.__dict__[tensor])
self.generator_ready = True
start_idx = 0
while start_idx < self.buffer_size:
while start_idx < self.buffer_size * self.n_envs:
yield self._get_samples(indices[start_idx:start_idx + batch_size])
start_idx += batch_size

View file

@ -5,18 +5,18 @@ def evaluate_policy(model, env, n_eval_episodes=10, deterministic=True, render=F
"""
Runs policy for n episodes and returns average reward
"""
mean_reward, n_steps = 0.0, 0
episode_rewards, n_steps = [], 0
for _ in range(n_eval_episodes):
obs = env.reset()
done = False
episode_reward = 0.0
while not done:
action = model.predict(obs, deterministic=deterministic)
obs, reward, done, _ = env.step(action)
mean_reward += reward
episode_reward += reward
n_steps += 1
if render:
env.render()
episode_rewards.append(episode_reward)
mean_reward /= n_eval_episodes
return mean_reward, n_steps
return np.mean(episode_rewards), n_steps

View file

@ -1,5 +1,6 @@
import time
import gym
import torch as th
import torch.nn.functional as F
import numpy as np
@ -9,6 +10,7 @@ from torchy_baselines.common.evaluation import evaluate_policy
from torchy_baselines.ppo.policies import PPOPolicy
from torchy_baselines.common.buffers import RolloutBuffer
from torchy_baselines.common.utils import explained_variance
from torchy_baselines.common.vec_env import VecEnv, DummyVecEnv
class PPO(BaseRLModel):
@ -28,9 +30,9 @@ class PPO(BaseRLModel):
target_kl=None, clip_range_vf=None,
_init_setup_model=True):
super(PPO, self).__init__(policy, env, PPOPolicy, policy_kwargs, verbose, device)
super(PPO, self).__init__(policy, env, PPOPolicy, policy_kwargs,
verbose, device, support_multi_env=True)
self.max_action = np.abs(self.action_space.high)
self.learning_rate = learning_rate
self._seed = seed
self.batch_size = batch_size
@ -54,7 +56,7 @@ class PPO(BaseRLModel):
self.seed(self._seed)
self.rollout_buffer = RolloutBuffer(self.n_steps, state_dim, action_dim, self.device,
gamma=self.gamma, lambda_=self.lambda_)
gamma=self.gamma, lambda_=self.lambda_, n_envs=self.n_envs)
self.policy = self.policy(self.observation_space, self.action_space,
self.learning_rate, device=self.device, **self.policy_kwargs)
@ -63,7 +65,7 @@ class PPO(BaseRLModel):
observation = np.array(observation)
with th.no_grad():
observation = th.FloatTensor(observation.reshape(1, -1)).to(self.device)
return self.policy.actor_forward(observation, deterministic=False).flatten()
return self.policy.actor_forward(observation, deterministic=False)
def predict(self, observation, state=None, mask=None, deterministic=True):
"""
@ -75,37 +77,32 @@ class PPO(BaseRLModel):
:param deterministic: (bool) Whether or not to return deterministic actions.
:return: (np.ndarray, np.ndarray) the model's action and the next state (used in recurrent policies)
"""
return np.clip(self.select_action(observation), -self.max_action, self.max_action)
return np.clip(self.select_action(observation), self.action_space.low, self.action_space.high)
def collect_rollouts(self, env, rollout_buffer, n_rollout_steps=256, callback=None,
obs=None):
n_steps = 0
done = obs is None
rollout_buffer.reset()
while n_steps < n_rollout_steps:
# Reset environment
if done:
obs = env.reset()
# No grad ok?
with th.no_grad():
action, value, log_prob = self.policy.forward(obs)
action = action.flatten().cpu().numpy()
actions, values, log_probs = self.policy.forward(obs)
actions = actions.cpu().numpy()
# Rescale and perform action
# TODO: clip only when using Box action space
new_obs, reward, done, _ = env.step(np.clip(action, -self.max_action, self.max_action))
clipped_actions = actions
# Clip the actions to avoid out of bound error
if isinstance(self.action_space, gym.spaces.Box):
clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high)
new_obs, rewards, dones, _ = env.step(clipped_actions)
n_steps += 1
rollout_buffer.add(obs, action, reward, float(done), value, log_prob)
rollout_buffer.add(obs, actions, rewards, dones, values, log_probs)
obs = new_obs
if done:
obs = None
rollout_buffer.compute_returns_and_advantage(value, done=done)
rollout_buffer.compute_returns_and_advantage(values, dones=dones)
return obs
@ -121,7 +118,6 @@ class PPO(BaseRLModel):
values, log_prob, entropy = self.policy.get_policy_stats(state, action)
values = values.flatten()
# Normalize advantage
advantage = (advantage - advantage.mean()) / (advantage.std() + 1e-8)
@ -173,7 +169,12 @@ class PPO(BaseRLModel):
episode_num = 0
evaluations = []
start_time = time.time()
obs = None
obs = self.env.reset()
if eval_env is not None and not isinstance(eval_env, VecEnv):
eval_env = DummyVecEnv([lambda: eval_env])
assert eval_env.num_envs == 1
while self.num_timesteps < total_timesteps:
@ -185,8 +186,8 @@ class PPO(BaseRLModel):
obs = self.collect_rollouts(self.env, self.rollout_buffer, n_rollout_steps=self.n_steps,
obs=obs)
episode_num += 1
self.num_timesteps += self.n_steps
timesteps_since_eval += self.n_steps
self.num_timesteps += self.n_steps * self.n_envs
timesteps_since_eval += self.n_steps * self.n_envs
self.train(self.n_optim, batch_size=self.batch_size)