diff --git a/torchy_baselines/cem_rl/cem.py b/torchy_baselines/cem_rl/cem.py index 57513c5..bea75fc 100644 --- a/torchy_baselines/cem_rl/cem.py +++ b/torchy_baselines/cem_rl/cem.py @@ -1,24 +1,27 @@ import numpy as np - # TODO: add more from https://github.com/hardmaru/estool/blob/master/es.py # or https://github.com/facebookresearch/nevergrad class CEM(object): - - """ - Cross-entropy method with diagonal covariance (separable CEM) """ + Cross-entropy method with diagonal covariance (separable CEM). - def __init__(self, num_params, - mu_init=None, - sigma_init=1e-3, - pop_size=256, - damp=1e-3, - damp_limit=1e-5, - parents=None, - elitism=False, - antithetic=False): + :param num_params: (int) + :param mu_init: (np.ndarray) Initial mean of the population distribution + Taken to be zero if None is passed. + :param sigma_init: (float) Initial standard deviation of the population distribution + :param pop_size: (int) Number of individuals in the population + :param damp: (float) Damping for preventing from early convergence. + :param damp_limit: (float) Final value of damping + :param parents: (int) + :param elitism: (bool) + :param antithetic: (bool) Use a finite difference like method for sampling + (mu + epsilon, mu - epsilon) + """ + def __init__(self, num_params, mu_init=None, sigma_init=1e-3, + pop_size=256, damp=1e-3, damp_limit=1e-5, + parents=None, elitism=False, antithetic=False): super(CEM, self).__init__() # misc self.num_params = num_params @@ -31,6 +34,7 @@ class CEM(object): self.sigma = sigma_init self.damp = damp self.damp_limit = damp_limit + # Exponential moving average decay for damping self.tau = 0.95 self.cov = self.sigma * np.ones(self.num_params) @@ -56,6 +60,9 @@ class CEM(object): def ask(self, pop_size): """ Returns a list of candidates parameters + + :param pop_size: (int) + :return: ([np.ndarray]) """ if self.antithetic and not pop_size % 2: epsilon_half = np.random.randn(pop_size // 2, self.num_params) @@ -64,16 +71,20 @@ class CEM(object): else: epsilon = np.random.randn(pop_size, self.num_params) - inds = self.mu + epsilon * np.sqrt(self.cov) + individuals = self.mu + epsilon * np.sqrt(self.cov) if self.elitism: - inds[-1] = self.elite + individuals[-1] = self.elite - return inds + return individuals def tell(self, solutions, scores): """ Updates the distribution + + :param solutions: ([np.ndarray]) + :param scores: ([float]) episode reward. """ + # Convert rewards (we want to maximize) to cost (we want to minimize) scores = np.array(scores) scores *= -1 idx_sorted = np.argsort(scores) @@ -92,7 +103,9 @@ class CEM(object): def get_distrib_params(self): """ - Returns the parameters of the distrubtion: - the mean and sigma + Returns the parameters of the distribution: + the mean and standard deviation. + + :return: (np.ndarray, np.ndarray) """ return np.copy(self.mu), np.copy(self.cov) diff --git a/torchy_baselines/cem_rl/cem_rl.py b/torchy_baselines/cem_rl/cem_rl.py index ff7100d..024ac25 100644 --- a/torchy_baselines/cem_rl/cem_rl.py +++ b/torchy_baselines/cem_rl/cem_rl.py @@ -10,17 +10,48 @@ from torchy_baselines.common.vec_env import sync_envs_normalization class CEMRL(TD3): """ - Implementation of CEM-RL + Implementation of CEM-RL, in fact CEM combined with TD3. Paper: https://arxiv.org/abs/1810.01222 Code: https://github.com/apourchot/CEM-RL - """ + :param policy: (TD3Policy or str) The policy model to use (MlpPolicy, CnnPolicy, ...) + :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) + :param sigma_init: (float) Initial standard deviation of the population distribution + :param pop_size: (int) Number of individuals in the population + :param damp: (float) Damping for preventing from early convergence. + :param damp_limit: (float) Final value of damping + :param elitism: (bool) + :param n_grad: (int) Number of individuals that will receive a gradient update. + Half of the population size in the paper. + :param buffer_size: (int) size of the replay buffer + :param learning_rate: (float or callable) learning rate for adam optimizer, + the same learning rate will be used for all networks (Q-Values and Actor networks) + it can be a function of the current progress (from 1 to 0) + :param policy_delay: (int) Policy and target networks will only be updated once every policy_delay steps + per training steps. The Q values will be updated policy_delay more often (update every training step). + :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts + :param gamma: (float) the discount factor + :param batch_size: (int) Minibatch size for each gradient update + :param tau: (float) the soft update coefficient ("polyak update" of the target networks, between 0 and 1) + :param action_noise: (ActionNoise) the action noise type. Cf common.noise for the different action noise type. + :param target_policy_noise: (float) Standard deviation of gaussian noise added to target policy + (smoothing noise) + :param target_noise_clip: (float) Limit for absolute value of target policy smoothing noise. + :param create_eval_env: (bool) Whether to create a second environment that will be + used for evaluating the agent periodically. (Only available when passing string for the environment) + :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation + :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug + :param seed: (int) Seed for the pseudo random generators + :param device: (str or th.device) Device (cpu, cuda, ...) on which the code should be run. + Setting it to auto, the code will be run on the GPU if possible. + :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance + """ def __init__(self, policy, env, sigma_init=1e-3, pop_size=10, damp=1e-3, damp_limit=1e-5, elitism=False, n_grad=5, - policy_delay=2, batch_size=100, - buffer_size=int(1e6), learning_rate=1e-3, - action_noise=None, learning_starts=100, tau=0.005, + buffer_size=int(1e6), learning_rate=1e-3, policy_delay=2, + learning_starts=100, gamma=0.99, batch_size=100, tau=0.005, + action_noise=None, target_policy_noise=0.2, target_noise_clip=0.5, n_episodes_rollout=1, update_style='original', tensorboard_log=None, create_eval_env=False, policy_kwargs=None, verbose=0, seed=0, device='auto', @@ -28,13 +59,16 @@ class CEMRL(TD3): super(CEMRL, self).__init__(policy, env, buffer_size=buffer_size, learning_rate=learning_rate, seed=seed, device=device, - action_noise=action_noise, learning_starts=learning_starts, - n_episodes_rollout=n_episodes_rollout, tau=tau, + action_noise=action_noise, target_policy_noise=target_policy_noise, + target_noise_clip=target_noise_clip, learning_starts=learning_starts, + n_episodes_rollout=n_episodes_rollout, tau=tau, gamma=gamma, policy_kwargs=policy_kwargs, verbose=verbose, policy_delay=policy_delay, batch_size=batch_size, - create_eval_env=create_eval_env, + create_eval_env=create_eval_env, tensorboard_log=tensorboard_log, _init_setup_model=False) + # Evolution strategy method that follows cma-es interface (ask-tell) + # for now, only CEM is implemented self.es = None self.sigma_init = sigma_init self.pop_size = pop_size @@ -79,7 +113,8 @@ class CEMRL(TD3): # set params self.actor.load_from_vector(self.es_params[i]) self.actor_target.load_from_vector(self.es_params[i]) - self.actor.optimizer = th.optim.Adam(self.actor.parameters(), lr=self.learning_rate(self._current_progress)) + self.actor.optimizer = th.optim.Adam(self.actor.parameters(), + lr=self.learning_rate(self._current_progress)) # In the paper: 2 * actor_steps // self.n_grad # In the original implementation: actor_steps // self.n_grad diff --git a/torchy_baselines/ppo/ppo.py b/torchy_baselines/ppo/ppo.py index 771b5d1..4257c1e 100644 --- a/torchy_baselines/ppo/ppo.py +++ b/torchy_baselines/ppo/ppo.py @@ -5,6 +5,7 @@ import gym from gym import spaces import torch as th import torch.nn.functional as F + # Check if tensorboard is available for pytorch try: from torch.utils.tensorboard import SummaryWriter @@ -192,7 +193,6 @@ class PPO(BaseRLModel): clip_range_vf = self.clip_range_vf(self._current_progress) logger.logkv("clip_range_vf", clip_range_vf) - for gradient_step in range(gradient_steps): approx_kl_divs = [] # Sample replay buffer @@ -226,7 +226,6 @@ class PPO(BaseRLModel): # Value loss using the TD(gae_lambda) target value_loss = F.mse_loss(return_batch, values_pred) - # Entropy loss favor exploration entropy_loss = -th.mean(entropy) @@ -241,7 +240,8 @@ class PPO(BaseRLModel): approx_kl_divs.append(th.mean(old_log_prob - log_prob).detach().cpu().numpy()) if self.target_kl is not None and np.mean(approx_kl_divs) > 1.5 * self.target_kl: - print("Early stopping at step {} due to reaching max kl: {:.2f}".format(gradient_step, np.mean(approx_kl_divs))) + print("Early stopping at step {} due to reaching max kl: {:.2f}".format(gradient_step, + np.mean(approx_kl_divs))) break explained_var = explained_variance(self.rollout_buffer.returns.flatten().cpu().numpy(),