From c56865e10d2b232d523e7a06a2bfcd8e4ab870f5 Mon Sep 17 00:00:00 2001
From: Antonin Raffin <antonin.raffin@dlr.de>
Date: Fri, 22 Nov 2019 19:02:00 +0100
Subject: [PATCH] Cleanup CEM, rename variables + add comments

---
 torchy_baselines/cem_rl/cem.py    | 43 +++++++++++++++++++------------
 torchy_baselines/cem_rl/cem_rl.py | 14 +++++-----
 2 files changed, 34 insertions(+), 23 deletions(-)

diff --git a/torchy_baselines/cem_rl/cem.py b/torchy_baselines/cem_rl/cem.py
index bf6b6fa..ee4f484 100644
--- a/torchy_baselines/cem_rl/cem.py
+++ b/torchy_baselines/cem_rl/cem.py
@@ -8,36 +8,39 @@ class CEM(object):
     """
     Cross-entropy method with diagonal covariance (separable CEM).
 
-    :param num_params: (int)
+    :param num_params: (int) Number of parameters per individual (dimension of the problem)
     :param mu_init: (np.ndarray) Initial mean of the population distribution
         Taken to be zero if None is passed.
     :param sigma_init: (float) Initial standard deviation of the population distribution
     :param pop_size: (int) Number of individuals in the population
-    :param damp: (float) Damping for preventing from early convergence.
-    :param damp_limit: (float) Final value of damping
-    :param parents: (int)
-    :param elitism: (bool)
+    :param damping_init: (float) Initial value of damping for preventing from early convergence.
+    :param damping_final: (float) Final value of damping
+    :param parents: (int) Number of parents used to compute the new distribution
+        of individuals.
+    :param elitism: (bool) Keep the best known individual in the population
     :param antithetic: (bool) Use a finite difference like method for sampling
         (mu + epsilon, mu - epsilon)
     """
-
     def __init__(self, num_params, mu_init=None, sigma_init=1e-3,
-                 pop_size=256, damp=1e-3, damp_limit=1e-5,
+                 pop_size=256, damping_init=1e-3, damping_final=1e-5,
                  parents=None, elitism=False, antithetic=False):
         super(CEM, self).__init__()
-        # misc
+
         self.num_params = num_params
 
-        # distribution parameters
+        # Distribution parameters
         if mu_init is None:
             self.mu = np.zeros(self.num_params)
         else:
             self.mu = np.array(mu_init)
+
         self.sigma = sigma_init
-        self.damp = damp
-        self.damp_limit = damp_limit
+        # Damping parameters
+        self.damping = damping_init
+        self.damping_final = damping_final
         # Exponential moving average decay for damping
         self.tau = 0.95
+        # Covariance matrix, here only the diagonal
         self.cov = self.sigma * np.ones(self.num_params)
 
         # elite stuff
@@ -45,16 +48,20 @@ class CEM(object):
         self.elite = np.sqrt(self.sigma) * np.random.rand(self.num_params)
         self.elite_score = None
 
-        # sampling stuff
+        # sampling parameters
         self.pop_size = pop_size
         self.antithetic = antithetic
 
         if self.antithetic:
             assert (self.pop_size % 2 == 0), "Population size must be even"
+
         if parents is None or parents <= 0:
             self.parents = pop_size // 2
         else:
             self.parents = parents
+
+        # Weighting for computing the new mean of the distributions
+        # from the parents. The better the individual, the higher the weight
         self.weights = np.array([np.log((self.parents + 1) / i)
                                  for i in range(1, self.parents + 1)])
         self.weights /= self.weights.sum()
@@ -69,11 +76,12 @@ class CEM(object):
         if self.antithetic and not pop_size % 2:
             epsilon_half = np.random.randn(pop_size // 2, self.num_params)
             epsilon = np.concatenate([epsilon_half, - epsilon_half])
-
         else:
             epsilon = np.random.randn(pop_size, self.num_params)
 
         individuals = self.mu + epsilon * np.sqrt(self.cov)
+
+        # Keep the best known individual in the population
         if self.elitism:
             individuals[-1] = self.elite
 
@@ -89,19 +97,22 @@ class CEM(object):
         # Convert rewards (we want to maximize) to cost (we want to minimize)
         scores = np.array(scores)
         scores *= -1
+        # Sort the individuals by fitness
         idx_sorted = np.argsort(scores)
 
         old_mu = self.mu
-        self.damp = self.damp * self.tau + (1 - self.tau) * self.damp_limit
+        # Update damping using a moving average
+        self.damping = self.damping * self.tau + (1 - self.tau) * self.damping_final
         # self.mu = self.weights @ solutions[idx_sorted[:self.parents]]
         self.mu = self.weights.dot(solutions[idx_sorted[:self.parents]])
 
+        # CMA-ES style would be to use the new mean here
         z = (solutions[idx_sorted[:self.parents]] - old_mu)
-        self.cov = 1 / self.parents * self.weights.dot(z * z) + self.damp * np.ones(self.num_params)
+        self.cov = 1 / self.parents * self.weights.dot(z * z) + self.damping * np.ones(self.num_params)
 
+        # Retrieve the best individual
         self.elite = solutions[idx_sorted[0]]
         self.elite_score = scores[idx_sorted[0]]
-        # print(self.cov)
 
     def get_distrib_params(self):
         """
diff --git a/torchy_baselines/cem_rl/cem_rl.py b/torchy_baselines/cem_rl/cem_rl.py
index 0ada89c..9a432fa 100644
--- a/torchy_baselines/cem_rl/cem_rl.py
+++ b/torchy_baselines/cem_rl/cem_rl.py
@@ -19,9 +19,9 @@ class CEMRL(TD3):
     :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
     :param sigma_init: (float) Initial standard deviation of the population distribution
     :param pop_size: (int) Number of individuals in the population
-    :param damp: (float) Damping for preventing from early convergence.
-    :param damp_limit: (float) Final value of damping
-    :param elitism: (bool)
+    :param damping_init: (float)  Initial value of damping for preventing from early convergence.
+    :param damping_final: (float) Final value of damping
+    :param elitism: (bool) Keep the best known individual in the population
     :param n_grad: (int) Number of individuals that will receive a gradient update.
         Half of the population size in the paper.
     :param buffer_size: (int) size of the replay buffer
@@ -48,7 +48,7 @@ class CEMRL(TD3):
     :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance
     """
     def __init__(self, policy, env, sigma_init=1e-3, pop_size=10,
-                 damp=1e-3, damp_limit=1e-5, elitism=False, n_grad=5,
+                 damping_init=1e-3, damping_final=1e-5, elitism=False, n_grad=5,
                  buffer_size=int(1e6), learning_rate=1e-3, policy_delay=2,
                  learning_starts=100, gamma=0.99, batch_size=100, tau=0.005,
                  action_noise=None, target_policy_noise=0.2, target_noise_clip=0.5,
@@ -72,8 +72,8 @@ class CEMRL(TD3):
         self.es = None
         self.sigma_init = sigma_init
         self.pop_size = pop_size
-        self.damp = damp
-        self.damp_limit = damp_limit
+        self.damping_init = damping_init
+        self.damping_final = damping_final
         self.elitism = elitism
         self.n_grad = n_grad
         self.es_params = None
@@ -87,7 +87,7 @@ class CEMRL(TD3):
         super(CEMRL, self)._setup_model()
         params_vector = self.actor.parameters_to_vector()
         self.es = CEM(len(params_vector), mu_init=params_vector,
-                      sigma_init=self.sigma_init, damp=self.damp, damp_limit=self.damp_limit,
+                      sigma_init=self.sigma_init, damping_init=self.damping_init, damping_final=self.damping_final,
                       pop_size=self.pop_size, antithetic=not self.pop_size % 2, parents=self.pop_size // 2,
                       elitism=self.elitism)