diff --git a/torchy_baselines/common/base_class.py b/torchy_baselines/common/base_class.py index f51e9dc..17e829e 100644 --- a/torchy_baselines/common/base_class.py +++ b/torchy_baselines/common/base_class.py @@ -6,6 +6,7 @@ import torch as th import numpy as np from torchy_baselines.common.policies import get_policy_from_name +from torchy_baselines.common.utils import set_random_seed class BaseRLModel(object): @@ -182,8 +183,13 @@ class BaseRLModel(object): """ raise NotImplementedError() + def seed(self, seed=0): + set_random_seed(seed, using_cuda=self.device == th.device('cuda')) + if self.env is not None: + self.env.seed(seed) + def collect_rollouts(self, env, n_episodes=1, action_noise_std=0.0, - deterministic=False, callback=None, + deterministic=False, callback=None, remove_timelimits=True, start_timesteps=0, num_timesteps=0, replay_buffer=None): episode_rewards = [] @@ -209,7 +215,7 @@ class BaseRLModel(object): # Rescale and perform action new_obs, reward, done, _ = env.step(self.max_action * action) - if hasattr(self.env, '_max_episode_steps'): + if hasattr(self.env, '_max_episode_steps') and remove_timelimits: done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done) else: done_bool = float(done) diff --git a/torchy_baselines/common/distributions.py b/torchy_baselines/common/distributions.py new file mode 100644 index 0000000..e0a73a8 --- /dev/null +++ b/torchy_baselines/common/distributions.py @@ -0,0 +1,54 @@ +import torch as th +from torch.distributions import Normal + +class Distribution(object): + def __init__(self): + super(Distribution, self).__init__() + + def log_prob(self, x): + """ + returns the log likelihood + + :param x: (str) the labels of each index + :return: ([float]) The log likelihood of the distribution + """ + raise NotImplementedError + + def kl_div(self, other): + """ + Calculates the Kullback-Leibler divergence from the given probabilty distribution + + :param other: ([float]) the distibution to compare with + :return: (float) the KL divergence of the two distributions + """ + raise NotImplementedError + + def entropy(self): + """ + Returns shannon's entropy of the probability + + :return: (float) the entropy + """ + raise NotImplementedError + + def sample(self): + """ + returns a sample from the probabilty distribution + + :return: (Tensorflow Tensor) the stochastic action + """ + raise NotImplementedError + + +class DiagGaussianDistribution(object): + """docstring for DiagGaussianDistribution.""" + + def __init__(self): + super(DiagGaussianDistribution, self).__init__() + self.distribution = None + + def proba_distribution_from_latent(self, latent, init_scale=1.0, init_bias=0.0): + self.distribution = Normal() + + def sample(self): + return self.distribution.rsample() diff --git a/torchy_baselines/common/policies.py b/torchy_baselines/common/policies.py index 212dda8..33f143b 100644 --- a/torchy_baselines/common/policies.py +++ b/torchy_baselines/common/policies.py @@ -48,7 +48,46 @@ class BasePolicy(nn.Module): :return: (np.ndarray) """ - return th.nn.utils.parameters_to_vector(self.parameters()) + return th.nn.utils.parameters_to_vector(self.parameters()).detach().cpu().numpy() + + +def create_mlp(input_dim, output_dim, net_arch, + activation_fn=nn.ReLU, squash_out=False): + modules = [nn.Linear(input_dim, net_arch[0]), activation_fn()] + + for idx in range(len(net_arch) - 1): + modules.append(nn.Linear(net_arch[idx], net_arch[idx + 1])) + modules.append(activation_fn()) + + if output_dim > 0: + modules.append(nn.Linear(net_arch[-1], output_dim)) + if squash_out: + modules.append(nn.Tanh()) + return modules + + +class BaseNetwork(nn.Module): + """docstring for BaseNetwork.""" + + def __init__(self): + super(BaseNetwork, self).__init__() + + def load_from_vector(self, vector): + """ + Load parameters from a 1D vector. + + :param vector: (np.ndarray) + """ + device = next(self.parameters()).device + th.nn.utils.vector_to_parameters(th.FloatTensor(vector).to(device), self.parameters()) + + def parameters_to_vector(self): + """ + Convert the parameters to a 1D vector. + + :return: (np.ndarray) + """ + return th.nn.utils.parameters_to_vector(self.parameters()).detach().cpu().numpy() _policy_registry = dict() diff --git a/torchy_baselines/ppo/__init__.py b/torchy_baselines/ppo/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/torchy_baselines/ppo/policies.py b/torchy_baselines/ppo/policies.py new file mode 100644 index 0000000..53548c4 --- /dev/null +++ b/torchy_baselines/ppo/policies.py @@ -0,0 +1,107 @@ +import torch as th +import torch.nn as nn +from torch.distributions import Normal + +from torchy_baselines.common.policies import BasePolicy, register_policy, create_mlp, BaseNetwork + + +class Actor(BaseNetwork): + def __init__(self, state_dim, action_dim, net_arch=None, activation_fn=nn.ReLU): + super(Actor, self).__init__() + + if net_arch is None: + net_arch = [64, 64] + + # TODO: orthogonal initialization? + actor_net = create_mlp(state_dim, action_dim, net_arch, activation_fn, squash_out=True) + self.actor_net = nn.Sequential(*actor_net) + + def forward(self, x): + return self.actor_net(x) + + +class Critic(BaseNetwork): + def __init__(self, state_dim, action_dim, + net_arch=None, activation_fn=nn.ReLU): + super(Critic, self).__init__() + + if net_arch is None: + net_arch = [400, 300] + + # TODO: solve pytorch parameter registration + # for _ in range(n_critics): + # q_net = create_mlp(state_dim + action_dim, 1, net_arch, activation_fn) + # self.q_net = nn.Sequential(*q_net) + # self.q_networks.append(self.q_net) + + q1_net = create_mlp(state_dim + action_dim, 1, net_arch, activation_fn) + self.q1_net = nn.Sequential(*q1_net) + + q2_net = create_mlp(state_dim + action_dim, 1, net_arch, activation_fn) + self.q2_net = nn.Sequential(*q2_net) + + self.q_networks = [self.q1_net, self.q2_net] + + def forward(self, obs, action): + qvalue_input = th.cat([obs, action], dim=1) + return [q_net(qvalue_input) for q_net in self.q_networks] + + def q1_forward(self, obs, action): + return self.q_networks[0](th.cat([obs, action], dim=1)) + + +class PPOPolicy(BasePolicy): + def __init__(self, observation_space, action_space, + learning_rate=1e-3, net_arch=None, device='cpu', + activation_fn=nn.Tanh): + super(PPOPolicy, self).__init__(observation_space, action_space, device) + self.state_dim = self.observation_space.shape[0] + self.action_dim = self.action_space.shape[0] + if net_arch is None: + net_arch = [64, 64] + self.net_arch = net_arch + self.activation_fn = activation_fn + self.net_args = { + 'input_dim': self.state_dim, + 'output_dim': -1, + 'net_arch': self.net_arch, + 'activation_fn': self.activation_fn + } + self.shared_net = None + self._build(learning_rate) + + def _build(self, learning_rate): + shared_net = create_mlp(self.state_dim, output_dim=-1, self.net_arch, self.activation_fn) + self.shared_net = nn.Sequential(*shared_net).to(self.device) + self.actor_net = nn.Linear(self.net_arch[-1], self.action_dim) + self.value_net = nn.Linear(self.net_arch[-1], 1) + self.log_std = nn.Parameter(th.zeros(self.action_dim, 1)) + self.optimizer = th.optim.Adam(self.parameters(), lr=learning_rate) + + def forward(self, state): + latent = self.shared_net(state) + # TODO: initialize pi_mean weights properly + mean_actions = self.actor_net(latent) + action_distribution = Normal(mean_actions, self.log_std) + # Sample from the gaussian + action = action_distribution.rsample() + log_prob = action_distribution.log_prob() + # entropy = action_distribution.entropy() + value = self.value_net(latent) + return action, value, log_prob + + def actor_forward(self): + latent = self.shared_net(state) + # TODO: initialize pi_mean weights properly + mean_actions = self.actor_net(latent) + action_distribution = Normal(mean_actions, self.log_std) + # Sample from the gaussian + action = action_distribution.rsample() + return action + + def value_forward(self): + pass + +MlpPolicy = PPOPolicy + +register_policy("MlpPolicy", MlpPolicy) diff --git a/torchy_baselines/ppo/ppo.py b/torchy_baselines/ppo/ppo.py new file mode 100644 index 0000000..db12a03 --- /dev/null +++ b/torchy_baselines/ppo/ppo.py @@ -0,0 +1,190 @@ +import time + +import torch as th +import torch.nn.functional as F +import numpy as np + +from torchy_baselines.common.base_class import BaseRLModel +from torchy_baselines.common.utils import set_random_seed +from torchy_baselines.common.evaluation import evaluate_policy +from torchy_baselines.ppo.policies import ActorCriticPolicy +from torchy_baselines.common.replay_buffer import ReplayBuffer + + +class PPO(BaseRLModel): + """ + Implementation of Proximal Policy Optimization (PPO) (clip version) + Paper: https://arxiv.org/abs/1707.06347 + Code: https://github.com/openai/spinningup/ + """ + + def __init__(self, policy, env, policy_kwargs=None, verbose=0, + learning_rate=1e-3, seed=0, device='auto', + n_optim=5, batch_size=100, n_steps=256, + gamma=0.99, lambda_=0.95, + _init_setup_model=True): + + super(PPO, self).__init__(policy, env, ActorCriticPolicy, policy_kwargs, verbose, device) + + self.max_action = np.abs(self.action_space.high) + self.learning_rate = learning_rate + self._seed = seed + self.batch_size = batch_size + self.n_optim = n_optim + self.n_steps = n_steps + self.gamma = gamma + self.lambda_ = lambda_ + self.buffer_rollouts = None + + if _init_setup_model: + self._setup_model() + + def _setup_model(self): + state_dim, action_dim = self.observation_space.shape[0], self.action_space.shape[0] + self.seed(self._seed) + + self.policy = self.policy(self.observation_space, self.action_space, + self.learning_rate, device=self.device, **self.policy_kwargs) + + + def select_action(self, observation): + # Normally not needed + observation = np.array(observation) + with th.no_grad(): + observation = th.FloatTensor(observation.reshape(1, -1)).to(self.device) + return self.policy.actor_forward(observation).cpu().data.numpy().flatten() + + def predict(self, observation, state=None, mask=None, deterministic=True): + """ + Get the model's action from an observation + + :param observation: (np.ndarray) the input observation + :param state: (np.ndarray) The last states (can be None, used in recurrent policies) + :param mask: (np.ndarray) The last masks (can be None, used in recurrent policies) + :param deterministic: (bool) Whether or not to return deterministic actions. + :return: (np.ndarray, np.ndarray) the model's action and the next state (used in recurrent policies) + """ + return np.clip(self.select_action(observation), -self.max_action, self.max_action) + + + def train_actor(self, n_iterations=1, batch_size=100, tau_actor=0.005, tau_critic=0.005, replay_data=None): + + for it in range(n_iterations): + # Sample replay buffer + if replay_data is None: + state, action, next_state, done, reward = self.replay_buffer.sample(batch_size) + else: + state, action, next_state, done, reward = replay_data + + # Compute actor loss + actor_loss = -self.critic.q1_forward(state, self.actor(state)).mean() + + # Optimize the actor + self.actor.optimizer.zero_grad() + actor_loss.backward() + self.actor.optimizer.step() + + # Update the frozen target models + if tau_critic > 0: + for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()): + target_param.data.copy_(tau_critic * param.data + (1 - tau_critic) * target_param.data) + + for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()): + target_param.data.copy_(tau_actor * param.data + (1 - tau_actor) * target_param.data) + + def train(self, n_iterations, batch_size=100, discount=0.99, + tau=0.005, policy_noise=0.2, noise_clip=0.5, policy_freq=2): + + for it in range(n_iterations): + + # Sample replay buffer + replay_data = self.replay_buffer.sample(batch_size) + self.train_critic(replay_data=replay_data) + + # Delayed policy updates + if it % policy_freq == 0: + self.train_actor(replay_data=replay_data) + + def learn(self, total_timesteps, callback=None, log_interval=100, + eval_freq=-1, n_eval_episodes=5, tb_log_name="TD3", reset_num_timesteps=True): + + timesteps_since_eval = 0 + episode_num = 0 + evaluations = [] + start_time = time.time() + + while self.num_timesteps < total_timesteps: + + if callback is not None: + # Only stop training if return value is False, not when it is None. + if callback(locals(), globals()) is False: + break + + episode_reward, episode_timesteps = self.collect_rollouts(self.env, n_episodes=1, + action_noise_std=self.action_noise_std, + deterministic=False, callback=None, + start_timesteps=self.start_timesteps, + num_timesteps=self.num_timesteps, + replay_buffer=self.buffer_rollouts) + episode_num += 1 + self.num_timesteps += episode_timesteps + timesteps_since_eval += episode_timesteps + + if self.num_timesteps > 0: + if self.verbose > 1: + print("Total T: {} Episode Num: {} Episode T: {} Reward: {}".format( + self.num_timesteps, episode_num, episode_timesteps, episode_reward)) + self.train(episode_timesteps, batch_size=self.batch_size, policy_freq=self.policy_freq) + + # Evaluate episode + if 0 < eval_freq <= timesteps_since_eval: + timesteps_since_eval %= eval_freq + mean_reward, _ = evaluate_policy(self, self.env, n_eval_episodes) + evaluations.append(mean_reward) + if self.verbose > 0: + print("Eval num_timesteps={}, mean_reward={:.2f}".format(self.num_timesteps, evaluations[-1])) + print("FPS: {:.2f}".format(self.num_timesteps / (time.time() - start_time))) + + return self + + def save(self, path): + if not path.endswith('.pth'): + path += '.pth' + th.save(self.policy.state_dict(), path) + + def load(self, path, env=None, **_kwargs): + if not path.endswith('.pth'): + path += '.pth' + if env is not None: + pass + self.policy.load_state_dict(th.load(path)) + + +class PPOBuffer(ReplayBuffer): + """docstring for PPOBuffer.""" + + def __init__(self, buffer_size, state_dim, action_dim, device='cpu', + lambda=0.95): + super(PPOBuffer, self).__init__(buffer_size, state_dim, action_dim, device) + + self.returns = th.zeros(self.buffer_size, 1) + self.values = th.zeros(self.buffer_size, 1) + self.log_probs = th.zeros(self.buffer_size, 1) + self.advantages = th.zeros(self.buffer_size, 1) + + def compute_gae(self): + """ + From https://github.com/openai/spinningup/blob/master/spinup/algos/ppo/ppo.py + """ + path_slice = slice(self.path_start_idx, self.pos) + rews = np.append(self.rewards[path_slice], last_val) + vals = np.append(self.val_buf[path_slice], last_val) + + # the next two lines implement GAE-Lambda advantage calculation + deltas = rews[:-1] + self.gamma * vals[1:] - vals[:-1] + self.advantages[path_slice] = core.discount_cumsum(deltas, self.gamma * self.lam) + + # the next line computes rewards-to-go, to be targets for the value function + self.ret_buf[path_slice] = core.discount_cumsum(rews, self.gamma)[:-1] + + self.path_start_idx = self.pos diff --git a/torchy_baselines/td3/policies.py b/torchy_baselines/td3/policies.py index a886af7..23c3945 100644 --- a/torchy_baselines/td3/policies.py +++ b/torchy_baselines/td3/policies.py @@ -1,45 +1,7 @@ import torch as th import torch.nn as nn -from torchy_baselines.common.policies import BasePolicy, register_policy - - -def create_mlp(input_dim, output_dim, net_arch, - activation_fn=nn.ReLU, squash_out=False): - modules = [nn.Linear(input_dim, net_arch[0]), activation_fn()] - - for idx in range(len(net_arch) - 1): - modules.append(nn.Linear(net_arch[idx], net_arch[idx + 1])) - modules.append(activation_fn()) - - modules.append(nn.Linear(net_arch[-1], output_dim)) - if squash_out: - modules.append(nn.Tanh()) - return modules - - -class BaseNetwork(nn.Module): - """docstring for BaseNetwork.""" - - def __init__(self): - super(BaseNetwork, self).__init__() - - def load_from_vector(self, vector): - """ - Load parameters from a 1D vector. - - :param vector: (np.ndarray) - """ - device = next(self.parameters()).device - th.nn.utils.vector_to_parameters(th.FloatTensor(vector).to(device), self.parameters()) - - def parameters_to_vector(self): - """ - Convert the parameters to a 1D vector. - - :return: (np.ndarray) - """ - return th.nn.utils.parameters_to_vector(self.parameters()).detach().cpu().numpy() +from torchy_baselines.common.policies import BasePolicy, register_policy, create_mlp, BaseNetwork class Actor(BaseNetwork): diff --git a/torchy_baselines/td3/td3.py b/torchy_baselines/td3/td3.py index 50bb83f..8ef6b8e 100644 --- a/torchy_baselines/td3/td3.py +++ b/torchy_baselines/td3/td3.py @@ -6,7 +6,6 @@ import numpy as np from torchy_baselines.common.base_class import BaseRLModel from torchy_baselines.common.replay_buffer import ReplayBuffer -from torchy_baselines.common.utils import set_random_seed from torchy_baselines.common.evaluation import evaluate_policy from torchy_baselines.td3.policies import TD3Policy @@ -31,20 +30,16 @@ class TD3(BaseRLModel): self.learning_rate = learning_rate self.buffer_size = buffer_size self.start_timesteps = start_timesteps - self.seed = seed + self._seed = seed self.policy_freq = policy_freq self.batch_size = batch_size if _init_setup_model: self._setup_model() - def _setup_model(self, seed=None): + def _setup_model(self): state_dim, action_dim = self.observation_space.shape[0], self.action_space.shape[0] - set_random_seed(self.seed, using_cuda=self.device == th.device('cuda')) - - if self.env is not None: - self.env.seed(self.seed) - + self.seed(self._seed) self.replay_buffer = ReplayBuffer(self.buffer_size, state_dim, action_dim, self.device) self.policy = self.policy(self.observation_space, self.action_space, self.learning_rate, device=self.device, **self.policy_kwargs)