mirror of
https://github.com/saymrwulf/stable-baselines3.git
synced 2026-05-23 22:20:18 +00:00
Start PPO
This commit is contained in:
parent
2a660e9a41
commit
54dd7ea60d
8 changed files with 403 additions and 50 deletions
|
|
@ -6,6 +6,7 @@ import torch as th
|
|||
import numpy as np
|
||||
|
||||
from torchy_baselines.common.policies import get_policy_from_name
|
||||
from torchy_baselines.common.utils import set_random_seed
|
||||
|
||||
|
||||
class BaseRLModel(object):
|
||||
|
|
@ -182,8 +183,13 @@ class BaseRLModel(object):
|
|||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def seed(self, seed=0):
|
||||
set_random_seed(seed, using_cuda=self.device == th.device('cuda'))
|
||||
if self.env is not None:
|
||||
self.env.seed(seed)
|
||||
|
||||
def collect_rollouts(self, env, n_episodes=1, action_noise_std=0.0,
|
||||
deterministic=False, callback=None,
|
||||
deterministic=False, callback=None, remove_timelimits=True,
|
||||
start_timesteps=0, num_timesteps=0, replay_buffer=None):
|
||||
|
||||
episode_rewards = []
|
||||
|
|
@ -209,7 +215,7 @@ class BaseRLModel(object):
|
|||
# Rescale and perform action
|
||||
new_obs, reward, done, _ = env.step(self.max_action * action)
|
||||
|
||||
if hasattr(self.env, '_max_episode_steps'):
|
||||
if hasattr(self.env, '_max_episode_steps') and remove_timelimits:
|
||||
done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done)
|
||||
else:
|
||||
done_bool = float(done)
|
||||
|
|
|
|||
54
torchy_baselines/common/distributions.py
Normal file
54
torchy_baselines/common/distributions.py
Normal file
|
|
@ -0,0 +1,54 @@
|
|||
import torch as th
|
||||
from torch.distributions import Normal
|
||||
|
||||
class Distribution(object):
|
||||
def __init__(self):
|
||||
super(Distribution, self).__init__()
|
||||
|
||||
def log_prob(self, x):
|
||||
"""
|
||||
returns the log likelihood
|
||||
|
||||
:param x: (str) the labels of each index
|
||||
:return: ([float]) The log likelihood of the distribution
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def kl_div(self, other):
|
||||
"""
|
||||
Calculates the Kullback-Leibler divergence from the given probabilty distribution
|
||||
|
||||
:param other: ([float]) the distibution to compare with
|
||||
:return: (float) the KL divergence of the two distributions
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def entropy(self):
|
||||
"""
|
||||
Returns shannon's entropy of the probability
|
||||
|
||||
:return: (float) the entropy
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def sample(self):
|
||||
"""
|
||||
returns a sample from the probabilty distribution
|
||||
|
||||
:return: (Tensorflow Tensor) the stochastic action
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
class DiagGaussianDistribution(object):
|
||||
"""docstring for DiagGaussianDistribution."""
|
||||
|
||||
def __init__(self):
|
||||
super(DiagGaussianDistribution, self).__init__()
|
||||
self.distribution = None
|
||||
|
||||
def proba_distribution_from_latent(self, latent, init_scale=1.0, init_bias=0.0):
|
||||
self.distribution = Normal()
|
||||
|
||||
def sample(self):
|
||||
return self.distribution.rsample()
|
||||
|
|
@ -48,7 +48,46 @@ class BasePolicy(nn.Module):
|
|||
|
||||
:return: (np.ndarray)
|
||||
"""
|
||||
return th.nn.utils.parameters_to_vector(self.parameters())
|
||||
return th.nn.utils.parameters_to_vector(self.parameters()).detach().cpu().numpy()
|
||||
|
||||
|
||||
def create_mlp(input_dim, output_dim, net_arch,
|
||||
activation_fn=nn.ReLU, squash_out=False):
|
||||
modules = [nn.Linear(input_dim, net_arch[0]), activation_fn()]
|
||||
|
||||
for idx in range(len(net_arch) - 1):
|
||||
modules.append(nn.Linear(net_arch[idx], net_arch[idx + 1]))
|
||||
modules.append(activation_fn())
|
||||
|
||||
if output_dim > 0:
|
||||
modules.append(nn.Linear(net_arch[-1], output_dim))
|
||||
if squash_out:
|
||||
modules.append(nn.Tanh())
|
||||
return modules
|
||||
|
||||
|
||||
class BaseNetwork(nn.Module):
|
||||
"""docstring for BaseNetwork."""
|
||||
|
||||
def __init__(self):
|
||||
super(BaseNetwork, self).__init__()
|
||||
|
||||
def load_from_vector(self, vector):
|
||||
"""
|
||||
Load parameters from a 1D vector.
|
||||
|
||||
:param vector: (np.ndarray)
|
||||
"""
|
||||
device = next(self.parameters()).device
|
||||
th.nn.utils.vector_to_parameters(th.FloatTensor(vector).to(device), self.parameters())
|
||||
|
||||
def parameters_to_vector(self):
|
||||
"""
|
||||
Convert the parameters to a 1D vector.
|
||||
|
||||
:return: (np.ndarray)
|
||||
"""
|
||||
return th.nn.utils.parameters_to_vector(self.parameters()).detach().cpu().numpy()
|
||||
|
||||
|
||||
_policy_registry = dict()
|
||||
|
|
|
|||
0
torchy_baselines/ppo/__init__.py
Normal file
0
torchy_baselines/ppo/__init__.py
Normal file
107
torchy_baselines/ppo/policies.py
Normal file
107
torchy_baselines/ppo/policies.py
Normal file
|
|
@ -0,0 +1,107 @@
|
|||
import torch as th
|
||||
import torch.nn as nn
|
||||
from torch.distributions import Normal
|
||||
|
||||
from torchy_baselines.common.policies import BasePolicy, register_policy, create_mlp, BaseNetwork
|
||||
|
||||
|
||||
class Actor(BaseNetwork):
|
||||
def __init__(self, state_dim, action_dim, net_arch=None, activation_fn=nn.ReLU):
|
||||
super(Actor, self).__init__()
|
||||
|
||||
if net_arch is None:
|
||||
net_arch = [64, 64]
|
||||
|
||||
# TODO: orthogonal initialization?
|
||||
actor_net = create_mlp(state_dim, action_dim, net_arch, activation_fn, squash_out=True)
|
||||
self.actor_net = nn.Sequential(*actor_net)
|
||||
|
||||
def forward(self, x):
|
||||
return self.actor_net(x)
|
||||
|
||||
|
||||
class Critic(BaseNetwork):
|
||||
def __init__(self, state_dim, action_dim,
|
||||
net_arch=None, activation_fn=nn.ReLU):
|
||||
super(Critic, self).__init__()
|
||||
|
||||
if net_arch is None:
|
||||
net_arch = [400, 300]
|
||||
|
||||
# TODO: solve pytorch parameter registration
|
||||
# for _ in range(n_critics):
|
||||
# q_net = create_mlp(state_dim + action_dim, 1, net_arch, activation_fn)
|
||||
# self.q_net = nn.Sequential(*q_net)
|
||||
# self.q_networks.append(self.q_net)
|
||||
|
||||
q1_net = create_mlp(state_dim + action_dim, 1, net_arch, activation_fn)
|
||||
self.q1_net = nn.Sequential(*q1_net)
|
||||
|
||||
q2_net = create_mlp(state_dim + action_dim, 1, net_arch, activation_fn)
|
||||
self.q2_net = nn.Sequential(*q2_net)
|
||||
|
||||
self.q_networks = [self.q1_net, self.q2_net]
|
||||
|
||||
def forward(self, obs, action):
|
||||
qvalue_input = th.cat([obs, action], dim=1)
|
||||
return [q_net(qvalue_input) for q_net in self.q_networks]
|
||||
|
||||
def q1_forward(self, obs, action):
|
||||
return self.q_networks[0](th.cat([obs, action], dim=1))
|
||||
|
||||
|
||||
class PPOPolicy(BasePolicy):
|
||||
def __init__(self, observation_space, action_space,
|
||||
learning_rate=1e-3, net_arch=None, device='cpu',
|
||||
activation_fn=nn.Tanh):
|
||||
super(PPOPolicy, self).__init__(observation_space, action_space, device)
|
||||
self.state_dim = self.observation_space.shape[0]
|
||||
self.action_dim = self.action_space.shape[0]
|
||||
if net_arch is None:
|
||||
net_arch = [64, 64]
|
||||
self.net_arch = net_arch
|
||||
self.activation_fn = activation_fn
|
||||
self.net_args = {
|
||||
'input_dim': self.state_dim,
|
||||
'output_dim': -1,
|
||||
'net_arch': self.net_arch,
|
||||
'activation_fn': self.activation_fn
|
||||
}
|
||||
self.shared_net = None
|
||||
self._build(learning_rate)
|
||||
|
||||
def _build(self, learning_rate):
|
||||
shared_net = create_mlp(self.state_dim, output_dim=-1, self.net_arch, self.activation_fn)
|
||||
self.shared_net = nn.Sequential(*shared_net).to(self.device)
|
||||
self.actor_net = nn.Linear(self.net_arch[-1], self.action_dim)
|
||||
self.value_net = nn.Linear(self.net_arch[-1], 1)
|
||||
self.log_std = nn.Parameter(th.zeros(self.action_dim, 1))
|
||||
self.optimizer = th.optim.Adam(self.parameters(), lr=learning_rate)
|
||||
|
||||
def forward(self, state):
|
||||
latent = self.shared_net(state)
|
||||
# TODO: initialize pi_mean weights properly
|
||||
mean_actions = self.actor_net(latent)
|
||||
action_distribution = Normal(mean_actions, self.log_std)
|
||||
# Sample from the gaussian
|
||||
action = action_distribution.rsample()
|
||||
log_prob = action_distribution.log_prob()
|
||||
# entropy = action_distribution.entropy()
|
||||
value = self.value_net(latent)
|
||||
return action, value, log_prob
|
||||
|
||||
def actor_forward(self):
|
||||
latent = self.shared_net(state)
|
||||
# TODO: initialize pi_mean weights properly
|
||||
mean_actions = self.actor_net(latent)
|
||||
action_distribution = Normal(mean_actions, self.log_std)
|
||||
# Sample from the gaussian
|
||||
action = action_distribution.rsample()
|
||||
return action
|
||||
|
||||
def value_forward(self):
|
||||
pass
|
||||
|
||||
MlpPolicy = PPOPolicy
|
||||
|
||||
register_policy("MlpPolicy", MlpPolicy)
|
||||
190
torchy_baselines/ppo/ppo.py
Normal file
190
torchy_baselines/ppo/ppo.py
Normal file
|
|
@ -0,0 +1,190 @@
|
|||
import time
|
||||
|
||||
import torch as th
|
||||
import torch.nn.functional as F
|
||||
import numpy as np
|
||||
|
||||
from torchy_baselines.common.base_class import BaseRLModel
|
||||
from torchy_baselines.common.utils import set_random_seed
|
||||
from torchy_baselines.common.evaluation import evaluate_policy
|
||||
from torchy_baselines.ppo.policies import ActorCriticPolicy
|
||||
from torchy_baselines.common.replay_buffer import ReplayBuffer
|
||||
|
||||
|
||||
class PPO(BaseRLModel):
|
||||
"""
|
||||
Implementation of Proximal Policy Optimization (PPO) (clip version)
|
||||
Paper: https://arxiv.org/abs/1707.06347
|
||||
Code: https://github.com/openai/spinningup/
|
||||
"""
|
||||
|
||||
def __init__(self, policy, env, policy_kwargs=None, verbose=0,
|
||||
learning_rate=1e-3, seed=0, device='auto',
|
||||
n_optim=5, batch_size=100, n_steps=256,
|
||||
gamma=0.99, lambda_=0.95,
|
||||
_init_setup_model=True):
|
||||
|
||||
super(PPO, self).__init__(policy, env, ActorCriticPolicy, policy_kwargs, verbose, device)
|
||||
|
||||
self.max_action = np.abs(self.action_space.high)
|
||||
self.learning_rate = learning_rate
|
||||
self._seed = seed
|
||||
self.batch_size = batch_size
|
||||
self.n_optim = n_optim
|
||||
self.n_steps = n_steps
|
||||
self.gamma = gamma
|
||||
self.lambda_ = lambda_
|
||||
self.buffer_rollouts = None
|
||||
|
||||
if _init_setup_model:
|
||||
self._setup_model()
|
||||
|
||||
def _setup_model(self):
|
||||
state_dim, action_dim = self.observation_space.shape[0], self.action_space.shape[0]
|
||||
self.seed(self._seed)
|
||||
|
||||
self.policy = self.policy(self.observation_space, self.action_space,
|
||||
self.learning_rate, device=self.device, **self.policy_kwargs)
|
||||
|
||||
|
||||
def select_action(self, observation):
|
||||
# Normally not needed
|
||||
observation = np.array(observation)
|
||||
with th.no_grad():
|
||||
observation = th.FloatTensor(observation.reshape(1, -1)).to(self.device)
|
||||
return self.policy.actor_forward(observation).cpu().data.numpy().flatten()
|
||||
|
||||
def predict(self, observation, state=None, mask=None, deterministic=True):
|
||||
"""
|
||||
Get the model's action from an observation
|
||||
|
||||
:param observation: (np.ndarray) the input observation
|
||||
:param state: (np.ndarray) The last states (can be None, used in recurrent policies)
|
||||
:param mask: (np.ndarray) The last masks (can be None, used in recurrent policies)
|
||||
:param deterministic: (bool) Whether or not to return deterministic actions.
|
||||
:return: (np.ndarray, np.ndarray) the model's action and the next state (used in recurrent policies)
|
||||
"""
|
||||
return np.clip(self.select_action(observation), -self.max_action, self.max_action)
|
||||
|
||||
|
||||
def train_actor(self, n_iterations=1, batch_size=100, tau_actor=0.005, tau_critic=0.005, replay_data=None):
|
||||
|
||||
for it in range(n_iterations):
|
||||
# Sample replay buffer
|
||||
if replay_data is None:
|
||||
state, action, next_state, done, reward = self.replay_buffer.sample(batch_size)
|
||||
else:
|
||||
state, action, next_state, done, reward = replay_data
|
||||
|
||||
# Compute actor loss
|
||||
actor_loss = -self.critic.q1_forward(state, self.actor(state)).mean()
|
||||
|
||||
# Optimize the actor
|
||||
self.actor.optimizer.zero_grad()
|
||||
actor_loss.backward()
|
||||
self.actor.optimizer.step()
|
||||
|
||||
# Update the frozen target models
|
||||
if tau_critic > 0:
|
||||
for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
|
||||
target_param.data.copy_(tau_critic * param.data + (1 - tau_critic) * target_param.data)
|
||||
|
||||
for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
|
||||
target_param.data.copy_(tau_actor * param.data + (1 - tau_actor) * target_param.data)
|
||||
|
||||
def train(self, n_iterations, batch_size=100, discount=0.99,
|
||||
tau=0.005, policy_noise=0.2, noise_clip=0.5, policy_freq=2):
|
||||
|
||||
for it in range(n_iterations):
|
||||
|
||||
# Sample replay buffer
|
||||
replay_data = self.replay_buffer.sample(batch_size)
|
||||
self.train_critic(replay_data=replay_data)
|
||||
|
||||
# Delayed policy updates
|
||||
if it % policy_freq == 0:
|
||||
self.train_actor(replay_data=replay_data)
|
||||
|
||||
def learn(self, total_timesteps, callback=None, log_interval=100,
|
||||
eval_freq=-1, n_eval_episodes=5, tb_log_name="TD3", reset_num_timesteps=True):
|
||||
|
||||
timesteps_since_eval = 0
|
||||
episode_num = 0
|
||||
evaluations = []
|
||||
start_time = time.time()
|
||||
|
||||
while self.num_timesteps < total_timesteps:
|
||||
|
||||
if callback is not None:
|
||||
# Only stop training if return value is False, not when it is None.
|
||||
if callback(locals(), globals()) is False:
|
||||
break
|
||||
|
||||
episode_reward, episode_timesteps = self.collect_rollouts(self.env, n_episodes=1,
|
||||
action_noise_std=self.action_noise_std,
|
||||
deterministic=False, callback=None,
|
||||
start_timesteps=self.start_timesteps,
|
||||
num_timesteps=self.num_timesteps,
|
||||
replay_buffer=self.buffer_rollouts)
|
||||
episode_num += 1
|
||||
self.num_timesteps += episode_timesteps
|
||||
timesteps_since_eval += episode_timesteps
|
||||
|
||||
if self.num_timesteps > 0:
|
||||
if self.verbose > 1:
|
||||
print("Total T: {} Episode Num: {} Episode T: {} Reward: {}".format(
|
||||
self.num_timesteps, episode_num, episode_timesteps, episode_reward))
|
||||
self.train(episode_timesteps, batch_size=self.batch_size, policy_freq=self.policy_freq)
|
||||
|
||||
# Evaluate episode
|
||||
if 0 < eval_freq <= timesteps_since_eval:
|
||||
timesteps_since_eval %= eval_freq
|
||||
mean_reward, _ = evaluate_policy(self, self.env, n_eval_episodes)
|
||||
evaluations.append(mean_reward)
|
||||
if self.verbose > 0:
|
||||
print("Eval num_timesteps={}, mean_reward={:.2f}".format(self.num_timesteps, evaluations[-1]))
|
||||
print("FPS: {:.2f}".format(self.num_timesteps / (time.time() - start_time)))
|
||||
|
||||
return self
|
||||
|
||||
def save(self, path):
|
||||
if not path.endswith('.pth'):
|
||||
path += '.pth'
|
||||
th.save(self.policy.state_dict(), path)
|
||||
|
||||
def load(self, path, env=None, **_kwargs):
|
||||
if not path.endswith('.pth'):
|
||||
path += '.pth'
|
||||
if env is not None:
|
||||
pass
|
||||
self.policy.load_state_dict(th.load(path))
|
||||
|
||||
|
||||
class PPOBuffer(ReplayBuffer):
|
||||
"""docstring for PPOBuffer."""
|
||||
|
||||
def __init__(self, buffer_size, state_dim, action_dim, device='cpu',
|
||||
lambda=0.95):
|
||||
super(PPOBuffer, self).__init__(buffer_size, state_dim, action_dim, device)
|
||||
|
||||
self.returns = th.zeros(self.buffer_size, 1)
|
||||
self.values = th.zeros(self.buffer_size, 1)
|
||||
self.log_probs = th.zeros(self.buffer_size, 1)
|
||||
self.advantages = th.zeros(self.buffer_size, 1)
|
||||
|
||||
def compute_gae(self):
|
||||
"""
|
||||
From https://github.com/openai/spinningup/blob/master/spinup/algos/ppo/ppo.py
|
||||
"""
|
||||
path_slice = slice(self.path_start_idx, self.pos)
|
||||
rews = np.append(self.rewards[path_slice], last_val)
|
||||
vals = np.append(self.val_buf[path_slice], last_val)
|
||||
|
||||
# the next two lines implement GAE-Lambda advantage calculation
|
||||
deltas = rews[:-1] + self.gamma * vals[1:] - vals[:-1]
|
||||
self.advantages[path_slice] = core.discount_cumsum(deltas, self.gamma * self.lam)
|
||||
|
||||
# the next line computes rewards-to-go, to be targets for the value function
|
||||
self.ret_buf[path_slice] = core.discount_cumsum(rews, self.gamma)[:-1]
|
||||
|
||||
self.path_start_idx = self.pos
|
||||
|
|
@ -1,45 +1,7 @@
|
|||
import torch as th
|
||||
import torch.nn as nn
|
||||
|
||||
from torchy_baselines.common.policies import BasePolicy, register_policy
|
||||
|
||||
|
||||
def create_mlp(input_dim, output_dim, net_arch,
|
||||
activation_fn=nn.ReLU, squash_out=False):
|
||||
modules = [nn.Linear(input_dim, net_arch[0]), activation_fn()]
|
||||
|
||||
for idx in range(len(net_arch) - 1):
|
||||
modules.append(nn.Linear(net_arch[idx], net_arch[idx + 1]))
|
||||
modules.append(activation_fn())
|
||||
|
||||
modules.append(nn.Linear(net_arch[-1], output_dim))
|
||||
if squash_out:
|
||||
modules.append(nn.Tanh())
|
||||
return modules
|
||||
|
||||
|
||||
class BaseNetwork(nn.Module):
|
||||
"""docstring for BaseNetwork."""
|
||||
|
||||
def __init__(self):
|
||||
super(BaseNetwork, self).__init__()
|
||||
|
||||
def load_from_vector(self, vector):
|
||||
"""
|
||||
Load parameters from a 1D vector.
|
||||
|
||||
:param vector: (np.ndarray)
|
||||
"""
|
||||
device = next(self.parameters()).device
|
||||
th.nn.utils.vector_to_parameters(th.FloatTensor(vector).to(device), self.parameters())
|
||||
|
||||
def parameters_to_vector(self):
|
||||
"""
|
||||
Convert the parameters to a 1D vector.
|
||||
|
||||
:return: (np.ndarray)
|
||||
"""
|
||||
return th.nn.utils.parameters_to_vector(self.parameters()).detach().cpu().numpy()
|
||||
from torchy_baselines.common.policies import BasePolicy, register_policy, create_mlp, BaseNetwork
|
||||
|
||||
|
||||
class Actor(BaseNetwork):
|
||||
|
|
|
|||
|
|
@ -6,7 +6,6 @@ import numpy as np
|
|||
|
||||
from torchy_baselines.common.base_class import BaseRLModel
|
||||
from torchy_baselines.common.replay_buffer import ReplayBuffer
|
||||
from torchy_baselines.common.utils import set_random_seed
|
||||
from torchy_baselines.common.evaluation import evaluate_policy
|
||||
from torchy_baselines.td3.policies import TD3Policy
|
||||
|
||||
|
|
@ -31,20 +30,16 @@ class TD3(BaseRLModel):
|
|||
self.learning_rate = learning_rate
|
||||
self.buffer_size = buffer_size
|
||||
self.start_timesteps = start_timesteps
|
||||
self.seed = seed
|
||||
self._seed = seed
|
||||
self.policy_freq = policy_freq
|
||||
self.batch_size = batch_size
|
||||
|
||||
if _init_setup_model:
|
||||
self._setup_model()
|
||||
|
||||
def _setup_model(self, seed=None):
|
||||
def _setup_model(self):
|
||||
state_dim, action_dim = self.observation_space.shape[0], self.action_space.shape[0]
|
||||
set_random_seed(self.seed, using_cuda=self.device == th.device('cuda'))
|
||||
|
||||
if self.env is not None:
|
||||
self.env.seed(self.seed)
|
||||
|
||||
self.seed(self._seed)
|
||||
self.replay_buffer = ReplayBuffer(self.buffer_size, state_dim, action_dim, self.device)
|
||||
self.policy = self.policy(self.observation_space, self.action_space,
|
||||
self.learning_rate, device=self.device, **self.policy_kwargs)
|
||||
|
|
|
|||
Loading…
Reference in a new issue