mirror of
https://github.com/saymrwulf/stable-baselines3.git
synced 2026-07-02 03:55:39 +00:00
PPO VecEnv compat
This commit is contained in:
parent
56053bc692
commit
255ff10bff
5 changed files with 124 additions and 70 deletions
5
setup.py
5
setup.py
|
|
@ -9,7 +9,8 @@ setup(name='torchy_baselines',
|
|||
install_requires=[
|
||||
'gym[classic_control]>=0.10.9',
|
||||
'numpy',
|
||||
'torch>=1.2.0'
|
||||
'torch>=1.2.0',
|
||||
'cloudpickle'
|
||||
],
|
||||
extras_require={
|
||||
'tests': [
|
||||
|
|
@ -33,7 +34,7 @@ setup(name='torchy_baselines',
|
|||
license="MIT",
|
||||
long_description="",
|
||||
long_description_content_type='text/markdown',
|
||||
version="0.0.2",
|
||||
version="0.0.3",
|
||||
)
|
||||
|
||||
# python setup.py sdist
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@ import numpy as np
|
|||
|
||||
from torchy_baselines.common.policies import get_policy_from_name
|
||||
from torchy_baselines.common.utils import set_random_seed
|
||||
from torchy_baselines.common.vec_env import DummyVecEnv, VecEnv
|
||||
|
||||
|
||||
class BaseRLModel(object):
|
||||
|
|
@ -24,7 +25,8 @@ class BaseRLModel(object):
|
|||
"""
|
||||
__metaclass__ = ABCMeta
|
||||
|
||||
def __init__(self, policy, env, policy_base, policy_kwargs=None, verbose=0, device='auto'):
|
||||
def __init__(self, policy, env, policy_base, policy_kwargs=None,
|
||||
verbose=0, device='auto', support_multi_env=False):
|
||||
if isinstance(policy, str) and policy_base is not None:
|
||||
self.policy = get_policy_from_name(policy_base, policy)
|
||||
else:
|
||||
|
|
@ -48,13 +50,32 @@ class BaseRLModel(object):
|
|||
self.replay_buffer = None
|
||||
|
||||
if env is not None:
|
||||
if env is not None:
|
||||
if isinstance(env, str):
|
||||
env = gym.make(env)
|
||||
self.env = env
|
||||
self.n_envs = 1
|
||||
if isinstance(env, str):
|
||||
if self.verbose >= 1:
|
||||
print("Creating environment from the given name, wrapped in a DummyVecEnv.")
|
||||
env = DummyVecEnv([lambda: gym.make(env)])
|
||||
|
||||
self.observation_space = env.observation_space
|
||||
self.action_space = env.action_space
|
||||
if not isinstance(env, VecEnv):
|
||||
if self.verbose >= 1:
|
||||
print("Wrapping the env in a DummyVecEnv.")
|
||||
env = DummyVecEnv([lambda: env])
|
||||
self.n_envs = env.num_envs
|
||||
self.env = env
|
||||
|
||||
if not support_multi_env and self.n_envs > 1:
|
||||
raise ValueError("Error: the model does not support multiple envs requires a single vectorized"
|
||||
" environment.")
|
||||
|
||||
# if env is not None:
|
||||
# if env is not None:
|
||||
# if isinstance(env, str):
|
||||
# env = gym.make(env)
|
||||
# self.env = env
|
||||
# self.n_envs = 1
|
||||
# self.observation_space = env.observation_space
|
||||
# self.action_space = env.action_space
|
||||
|
||||
def get_env(self):
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ import torch as th
|
|||
|
||||
|
||||
class BaseBuffer(object):
|
||||
def __init__(self, buffer_size, state_dim, action_dim, device='cpu'):
|
||||
def __init__(self, buffer_size, state_dim, action_dim, device='cpu', n_envs=1):
|
||||
super(BaseBuffer, self).__init__()
|
||||
self.buffer_size = buffer_size
|
||||
self.state_dim = state_dim
|
||||
|
|
@ -11,6 +11,22 @@ class BaseBuffer(object):
|
|||
self.pos = 0
|
||||
self.full = False
|
||||
self.device = device
|
||||
self.n_envs = n_envs
|
||||
|
||||
@staticmethod
|
||||
def swap_and_flatten(tensor):
|
||||
"""
|
||||
Swap and then flatten axes 0 (buffer_size) and 1 (n_envs)
|
||||
to convert shape from [n_steps, n_envs, ...] (when ... is the shape of the features)
|
||||
to [n_steps * n_envs, ...] (which maintain the order)
|
||||
|
||||
:param tensor: (th.Tensor)
|
||||
:return: (th.Tensor)
|
||||
"""
|
||||
shape = tensor.shape
|
||||
if len(shape) < 3:
|
||||
shape = shape + (1,)
|
||||
return tensor.transpose(0, 1).reshape(shape[0] * shape[1], *shape[2:])
|
||||
|
||||
def size(self):
|
||||
if self.full:
|
||||
|
|
@ -40,21 +56,22 @@ class ReplayBuffer(BaseBuffer):
|
|||
"""
|
||||
Taken from https://github.com/apourchot/CEM-RL
|
||||
"""
|
||||
def __init__(self, buffer_size, state_dim, action_dim, device='cpu'):
|
||||
super(ReplayBuffer, self).__init__(buffer_size, state_dim, action_dim, device)
|
||||
self.states = th.zeros(self.buffer_size, self.state_dim)
|
||||
self.actions = th.zeros(self.buffer_size, self.action_dim)
|
||||
self.next_states = th.zeros(self.buffer_size, self.state_dim)
|
||||
self.rewards = th.zeros(self.buffer_size, 1)
|
||||
self.dones = th.zeros(self.buffer_size, 1)
|
||||
def __init__(self, buffer_size, state_dim, action_dim, device='cpu', n_envs=1):
|
||||
super(ReplayBuffer, self).__init__(buffer_size, state_dim, action_dim, device, n_envs=n_envs)
|
||||
|
||||
assert n_envs == 1
|
||||
self.states = th.zeros(self.buffer_size, self.n_envs, self.state_dim)
|
||||
self.actions = th.zeros(self.buffer_size, self.n_envs, self.action_dim)
|
||||
self.next_states = th.zeros(self.buffer_size, self.n_envs, self.state_dim)
|
||||
self.rewards = th.zeros(self.buffer_size, self.n_envs)
|
||||
self.dones = th.zeros(self.buffer_size, self.n_envs)
|
||||
|
||||
def add(self, state, next_state, action, reward, done):
|
||||
|
||||
self.states[self.pos] = th.FloatTensor(state)
|
||||
self.next_states[self.pos] = th.FloatTensor(next_state)
|
||||
self.actions[self.pos] = th.FloatTensor(action)
|
||||
self.rewards[self.pos] = th.FloatTensor([reward])
|
||||
self.dones[self.pos] = th.FloatTensor([done])
|
||||
self.rewards[self.pos] = th.FloatTensor(reward)
|
||||
self.dones[self.pos] = th.FloatTensor(done)
|
||||
|
||||
self.pos += 1
|
||||
if self.pos == self.buffer_size:
|
||||
|
|
@ -62,39 +79,46 @@ class ReplayBuffer(BaseBuffer):
|
|||
self.pos = 0
|
||||
|
||||
def _get_samples(self, batch_inds):
|
||||
return (self.states[batch_inds].to(self.device),
|
||||
self.actions[batch_inds].to(self.device),
|
||||
self.next_states[batch_inds].to(self.device),
|
||||
return (self.states[batch_inds, 0, :].to(self.device),
|
||||
self.actions[batch_inds, 0, :].to(self.device),
|
||||
self.next_states[batch_inds, 0, :].to(self.device),
|
||||
self.dones[batch_inds].to(self.device),
|
||||
self.rewards[batch_inds].to(self.device))
|
||||
|
||||
|
||||
class RolloutBuffer(BaseBuffer):
|
||||
def __init__(self, buffer_size, state_dim, action_dim, device='cpu',
|
||||
lambda_=1, gamma=0.99):
|
||||
super(RolloutBuffer, self).__init__(buffer_size, state_dim, action_dim, device)
|
||||
lambda_=1, gamma=0.99, n_envs=1):
|
||||
super(RolloutBuffer, self).__init__(buffer_size, state_dim, action_dim, device, n_envs=n_envs)
|
||||
|
||||
self.lambda_ = lambda_
|
||||
self.gamma = gamma
|
||||
# TODO: add n_envs
|
||||
self.states = th.zeros(self.buffer_size, self.state_dim)
|
||||
self.actions = th.zeros(self.buffer_size, self.action_dim)
|
||||
self.rewards = th.zeros(self.buffer_size, 1)
|
||||
self.returns = th.zeros(self.buffer_size, 1)
|
||||
self.dones = th.zeros(self.buffer_size, 1)
|
||||
self.values = th.zeros(self.buffer_size, 1)
|
||||
self.log_probs = th.zeros(self.buffer_size, 1)
|
||||
self.advantages = th.zeros(self.buffer_size, 1)
|
||||
self.states, self.actions, self.rewards, self.advantages = None, None, None, None
|
||||
self.returns, self.dones, self.values, self.log_probs = None, None, None, None
|
||||
self.generator_ready = False
|
||||
self.reset()
|
||||
|
||||
def compute_returns_and_advantage(self, last_value, done=False):
|
||||
def reset(self):
|
||||
self.states = th.zeros(self.buffer_size, self.n_envs, self.state_dim)
|
||||
self.actions = th.zeros(self.buffer_size, self.n_envs, self.action_dim)
|
||||
self.rewards = th.zeros(self.buffer_size, self.n_envs)
|
||||
self.returns = th.zeros(self.buffer_size, self.n_envs)
|
||||
self.dones = th.zeros(self.buffer_size, self.n_envs)
|
||||
self.values = th.zeros(self.buffer_size, self.n_envs)
|
||||
self.log_probs = th.zeros(self.buffer_size, self.n_envs)
|
||||
self.advantages = th.zeros(self.buffer_size, self.n_envs)
|
||||
self.generator_ready = False
|
||||
super(RolloutBuffer, self).reset()
|
||||
|
||||
def compute_returns_and_advantage(self, last_value, dones=False):
|
||||
"""
|
||||
From PPO2
|
||||
"""
|
||||
last_gae_lam = 0
|
||||
for step in reversed(range(self.buffer_size)):
|
||||
if step == self.buffer_size - 1:
|
||||
next_non_terminal = 1.0 - float(done)
|
||||
next_value = last_value
|
||||
next_non_terminal = th.FloatTensor(1.0 - dones)
|
||||
next_value = last_value.flatten()
|
||||
else:
|
||||
next_non_terminal = 1.0 - self.dones[step + 1]
|
||||
next_value = self.values[step + 1]
|
||||
|
|
@ -104,21 +128,28 @@ class RolloutBuffer(BaseBuffer):
|
|||
self.returns = self.advantages + self.values
|
||||
|
||||
def add(self, state, action, reward, done, value, log_prob):
|
||||
self.values[self.pos] = th.FloatTensor([value])
|
||||
self.log_probs[self.pos] = th.FloatTensor([log_prob])
|
||||
self.values[self.pos] = th.FloatTensor(value.flatten())
|
||||
self.log_probs[self.pos] = th.FloatTensor(log_prob)
|
||||
self.states[self.pos] = th.FloatTensor(state)
|
||||
self.actions[self.pos] = th.FloatTensor(action)
|
||||
self.rewards[self.pos] = th.FloatTensor([reward])
|
||||
self.dones[self.pos] = th.FloatTensor([done])
|
||||
self.rewards[self.pos] = th.FloatTensor(reward)
|
||||
self.dones[self.pos] = th.FloatTensor(done)
|
||||
self.pos += 1
|
||||
if self.pos == self.buffer_size:
|
||||
self.full = True
|
||||
|
||||
def get(self, batch_size):
|
||||
assert self.full
|
||||
indices = th.randperm(self.buffer_size)
|
||||
indices = th.randperm(self.buffer_size * self.n_envs)
|
||||
# Prepare the data
|
||||
if not self.generator_ready:
|
||||
for tensor in ['states', 'actions', 'values',
|
||||
'log_probs', 'advantages', 'returns']:
|
||||
self.__dict__[tensor] = self.swap_and_flatten(self.__dict__[tensor])
|
||||
self.generator_ready = True
|
||||
|
||||
start_idx = 0
|
||||
while start_idx < self.buffer_size:
|
||||
while start_idx < self.buffer_size * self.n_envs:
|
||||
yield self._get_samples(indices[start_idx:start_idx + batch_size])
|
||||
start_idx += batch_size
|
||||
|
||||
|
|
|
|||
|
|
@ -5,18 +5,18 @@ def evaluate_policy(model, env, n_eval_episodes=10, deterministic=True, render=F
|
|||
"""
|
||||
Runs policy for n episodes and returns average reward
|
||||
"""
|
||||
mean_reward, n_steps = 0.0, 0
|
||||
episode_rewards, n_steps = [], 0
|
||||
for _ in range(n_eval_episodes):
|
||||
obs = env.reset()
|
||||
done = False
|
||||
episode_reward = 0.0
|
||||
while not done:
|
||||
action = model.predict(obs, deterministic=deterministic)
|
||||
obs, reward, done, _ = env.step(action)
|
||||
mean_reward += reward
|
||||
episode_reward += reward
|
||||
n_steps += 1
|
||||
if render:
|
||||
env.render()
|
||||
episode_rewards.append(episode_reward)
|
||||
|
||||
mean_reward /= n_eval_episodes
|
||||
|
||||
return mean_reward, n_steps
|
||||
return np.mean(episode_rewards), n_steps
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
import time
|
||||
|
||||
import gym
|
||||
import torch as th
|
||||
import torch.nn.functional as F
|
||||
import numpy as np
|
||||
|
|
@ -9,6 +10,7 @@ from torchy_baselines.common.evaluation import evaluate_policy
|
|||
from torchy_baselines.ppo.policies import PPOPolicy
|
||||
from torchy_baselines.common.buffers import RolloutBuffer
|
||||
from torchy_baselines.common.utils import explained_variance
|
||||
from torchy_baselines.common.vec_env import VecEnv, DummyVecEnv
|
||||
|
||||
|
||||
class PPO(BaseRLModel):
|
||||
|
|
@ -28,9 +30,9 @@ class PPO(BaseRLModel):
|
|||
target_kl=None, clip_range_vf=None,
|
||||
_init_setup_model=True):
|
||||
|
||||
super(PPO, self).__init__(policy, env, PPOPolicy, policy_kwargs, verbose, device)
|
||||
super(PPO, self).__init__(policy, env, PPOPolicy, policy_kwargs,
|
||||
verbose, device, support_multi_env=True)
|
||||
|
||||
self.max_action = np.abs(self.action_space.high)
|
||||
self.learning_rate = learning_rate
|
||||
self._seed = seed
|
||||
self.batch_size = batch_size
|
||||
|
|
@ -54,7 +56,7 @@ class PPO(BaseRLModel):
|
|||
self.seed(self._seed)
|
||||
|
||||
self.rollout_buffer = RolloutBuffer(self.n_steps, state_dim, action_dim, self.device,
|
||||
gamma=self.gamma, lambda_=self.lambda_)
|
||||
gamma=self.gamma, lambda_=self.lambda_, n_envs=self.n_envs)
|
||||
self.policy = self.policy(self.observation_space, self.action_space,
|
||||
self.learning_rate, device=self.device, **self.policy_kwargs)
|
||||
|
||||
|
|
@ -63,7 +65,7 @@ class PPO(BaseRLModel):
|
|||
observation = np.array(observation)
|
||||
with th.no_grad():
|
||||
observation = th.FloatTensor(observation.reshape(1, -1)).to(self.device)
|
||||
return self.policy.actor_forward(observation, deterministic=False).flatten()
|
||||
return self.policy.actor_forward(observation, deterministic=False)
|
||||
|
||||
def predict(self, observation, state=None, mask=None, deterministic=True):
|
||||
"""
|
||||
|
|
@ -75,37 +77,32 @@ class PPO(BaseRLModel):
|
|||
:param deterministic: (bool) Whether or not to return deterministic actions.
|
||||
:return: (np.ndarray, np.ndarray) the model's action and the next state (used in recurrent policies)
|
||||
"""
|
||||
return np.clip(self.select_action(observation), -self.max_action, self.max_action)
|
||||
return np.clip(self.select_action(observation), self.action_space.low, self.action_space.high)
|
||||
|
||||
def collect_rollouts(self, env, rollout_buffer, n_rollout_steps=256, callback=None,
|
||||
obs=None):
|
||||
|
||||
n_steps = 0
|
||||
done = obs is None
|
||||
rollout_buffer.reset()
|
||||
|
||||
while n_steps < n_rollout_steps:
|
||||
# Reset environment
|
||||
if done:
|
||||
obs = env.reset()
|
||||
|
||||
# No grad ok?
|
||||
with th.no_grad():
|
||||
action, value, log_prob = self.policy.forward(obs)
|
||||
action = action.flatten().cpu().numpy()
|
||||
actions, values, log_probs = self.policy.forward(obs)
|
||||
actions = actions.cpu().numpy()
|
||||
|
||||
# Rescale and perform action
|
||||
# TODO: clip only when using Box action space
|
||||
new_obs, reward, done, _ = env.step(np.clip(action, -self.max_action, self.max_action))
|
||||
clipped_actions = actions
|
||||
# Clip the actions to avoid out of bound error
|
||||
if isinstance(self.action_space, gym.spaces.Box):
|
||||
clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high)
|
||||
new_obs, rewards, dones, _ = env.step(clipped_actions)
|
||||
|
||||
n_steps += 1
|
||||
rollout_buffer.add(obs, action, reward, float(done), value, log_prob)
|
||||
rollout_buffer.add(obs, actions, rewards, dones, values, log_probs)
|
||||
|
||||
obs = new_obs
|
||||
if done:
|
||||
obs = None
|
||||
|
||||
rollout_buffer.compute_returns_and_advantage(value, done=done)
|
||||
rollout_buffer.compute_returns_and_advantage(values, dones=dones)
|
||||
|
||||
return obs
|
||||
|
||||
|
|
@ -121,7 +118,6 @@ class PPO(BaseRLModel):
|
|||
|
||||
values, log_prob, entropy = self.policy.get_policy_stats(state, action)
|
||||
values = values.flatten()
|
||||
|
||||
# Normalize advantage
|
||||
advantage = (advantage - advantage.mean()) / (advantage.std() + 1e-8)
|
||||
|
||||
|
|
@ -173,7 +169,12 @@ class PPO(BaseRLModel):
|
|||
episode_num = 0
|
||||
evaluations = []
|
||||
start_time = time.time()
|
||||
obs = None
|
||||
obs = self.env.reset()
|
||||
|
||||
if eval_env is not None and not isinstance(eval_env, VecEnv):
|
||||
eval_env = DummyVecEnv([lambda: eval_env])
|
||||
|
||||
assert eval_env.num_envs == 1
|
||||
|
||||
while self.num_timesteps < total_timesteps:
|
||||
|
||||
|
|
@ -185,8 +186,8 @@ class PPO(BaseRLModel):
|
|||
obs = self.collect_rollouts(self.env, self.rollout_buffer, n_rollout_steps=self.n_steps,
|
||||
obs=obs)
|
||||
episode_num += 1
|
||||
self.num_timesteps += self.n_steps
|
||||
timesteps_since_eval += self.n_steps
|
||||
self.num_timesteps += self.n_steps * self.n_envs
|
||||
timesteps_since_eval += self.n_steps * self.n_envs
|
||||
|
||||
self.train(self.n_optim, batch_size=self.batch_size)
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue