From 9e250b6818ec7c41680119571662141600f19414 Mon Sep 17 00:00:00 2001
From: Antonin Raffin <antonin.raffin@dlr.de>
Date: Mon, 20 Jan 2020 16:19:35 +0100
Subject: [PATCH] Build doc

---
 README.md                                |  16 ++++
 docs/conf.py                             |  23 +++--
 docs/guide/quickstart.rst                |   3 +-
 docs/guide/vec_envs.rst                  |   2 +-
 docs/index.rst                           |   6 +-
 docs/misc/changelog.rst                  |  33 ++++++-
 docs/modules/a2c.rst                     |  73 ++++++++++++++
 docs/modules/cem_rl.rst                  |  96 +++++++++++++++++++
 docs/modules/td3.rst                     |   2 +-
 docs/spelling_wordlist.txt               | 115 +++++++++++++++++++++++
 setup.py                                 |   6 +-
 torchy_baselines/__init__.py             |   2 +-
 torchy_baselines/cem_rl/cem_rl.py        |  10 +-
 torchy_baselines/common/base_class.py    |  16 ++--
 torchy_baselines/common/distributions.py |   4 +-
 torchy_baselines/common/noise.py         |   2 +-
 torchy_baselines/sac/policies.py         |   2 +-
 torchy_baselines/sac/sac.py              |   2 +
 torchy_baselines/td3/td3.py              |   4 +-
 19 files changed, 379 insertions(+), 38 deletions(-)
 create mode 100644 docs/modules/a2c.rst
 create mode 100644 docs/modules/cem_rl.rst
 create mode 100644 docs/spelling_wordlist.txt

diff --git a/README.md b/README.md
index d8f5fa6..018eb35 100644
--- a/README.md
+++ b/README.md
@@ -20,3 +20,19 @@ PyTorch version of [Stable Baselines](https://github.com/hill-a/stable-baselines
 ## Roadmap
 
 - cf github Roadmap
+
+
+## Citing the Project
+
+To cite this repository in publications:
+
+```
+@misc{torchy-baselines,
+  author = {Raffin, Antonin and Dormann, Noah and Hill, Ashley and Ernestus, Maximilian and Gleave, Adam and Kanervisto, Anssi},
+  title = {Torchy Baselines},
+  year = {2019},
+  publisher = {GitHub},
+  journal = {GitHub repository},
+  howpublished = {\url{https://github.com/araffin/torchy-baselines}},
+}
+```
diff --git a/docs/conf.py b/docs/conf.py
index 4e4a2de..06c977b 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -16,6 +16,14 @@ import os
 import sys
 from unittest.mock import MagicMock
 
+# We CANNOT enable 'sphinxcontrib.spelling' because ReadTheDocs.org does not support
+# PyEnchant.
+try:
+    import sphinxcontrib.spelling
+    enable_spell_check = True
+except ImportError:
+    enable_spell_check = False
+
 # source code directory, relative to this file, for sphinx-autobuild
 sys.path.insert(0, os.path.abspath('..'))
 
@@ -31,16 +39,8 @@ class Mock(MagicMock):
 # Mock modules that requires C modules
 # Note: because of that we cannot test examples using CI
 # 'torch', 'torch.nn', 'torch.nn.functional',
-MOCK_MODULES = ['joblib', 'scipy', 'scipy.signal',
-                'pandas', 'mpi4py', 'mujoco-py', 'cv2',
-                'tensorflow', 'torch', 'torch.nn', 'torch.nn.functional',
-                'torch.distributions',
-                'tensorflow.contrib', 'tensorflow.contrib.layers',
-                'tensorflow.python', 'tensorflow.python.client', 'tensorflow.python.ops',
-                'tqdm', 'cloudpickle', 'matplotlib', 'matplotlib.pyplot',
-                'seaborn', 'gym', 'gym.spaces', 'gym.core',
-                'tensorflow.core', 'tensorflow.core.util', 'tensorflow.python.util',
-                'gym.wrappers', 'gym.wrappers.monitoring', 'zmq']
+# DO not mock modules for now, we will need to do that for read the docs later
+MOCK_MODULES = []
 sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES)
 
 
@@ -76,6 +76,9 @@ extensions = [
     'sphinx.ext.viewcode',
 ]
 
+if enable_spell_check:
+    extensions.append('sphinxcontrib.spelling')
+
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ['_templates']
 
diff --git a/docs/guide/quickstart.rst b/docs/guide/quickstart.rst
index 94e6601..58fca6d 100644
--- a/docs/guide/quickstart.rst
+++ b/docs/guide/quickstart.rst
@@ -16,8 +16,7 @@ Here is a quick example of how to train and run SAC on a Pendulum environment:
   from torchy_baselines.common.vec_env import DummyVecEnv
   from torchy_baselines import SAC
 
-  # The algorithms require a vectorized environment to run
-  env = DummyVecEnv([lambda: gym.make('Pendulum-v0')])
+  env = gym.make('Pendulum-v0')
 
   model = SAC(MlpPolicy, env, verbose=1)
   model.learn(total_timesteps=10000)
diff --git a/docs/guide/vec_envs.rst b/docs/guide/vec_envs.rst
index 0a989ea..e0e930c 100644
--- a/docs/guide/vec_envs.rst
+++ b/docs/guide/vec_envs.rst
@@ -27,7 +27,7 @@ SubprocVecEnv ✔️       ✔️           ✔️        ✔️         ✔️
 
 	When using vectorized environments, the environments are automatically reset at the end of each episode.
 	Thus, the observation returned for the i-th environment when ``done[i]`` is true will in fact be the first observation of the next episode, not the last observation of the episode that has just terminated.
-	You can access the "real" final observation of the terminated episode—that is, the one that accompanied the ``done`` event provided by the underlying environment—using the ``terminal_observation`` keys in the info dicts returned by the vecenv.
+	You can access the "real" final observation of the terminated episode—that is, the one that accompanied the ``done`` event provided by the underlying environment—using the ``terminal_observation`` keys in the info dicts returned by the `VecEnv`.
 
 .. warning::
 
diff --git a/docs/index.rst b/docs/index.rst
index 77dab25..0f17563 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -28,6 +28,8 @@ RL Baselines zoo also offers a simple interface to train, evaluate agents and do
   :caption: RL Algorithms
 
   modules/base
+  modules/a2c
+  modules/cem_rl
   modules/ppo
   modules/sac
   modules/td3
@@ -47,12 +49,12 @@ To cite this project in publications:
 .. code-block:: bibtex
 
     @misc{torchy-baselines,
-      author = {Raffin, Antonin and Hill, Ashley and Ernestus, Maximilian and Gleave, Adam and Kanervisto, Anssi},
+      author = {Raffin, Antonin and Dormann, Noah and Hill, Ashley and Ernestus, Maximilian and Gleave, Adam and Kanervisto, Anssi},
       title = {Torchy Baselines},
       year = {2019},
       publisher = {GitHub},
       journal = {GitHub repository},
-      howpublished = {\url{https://github.com/hill-a/stable-baselines}},
+      howpublished = {\url{https://github.com/araffin/torchy-baselines}},
     }
 
 Indices and tables
diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst
index 4af5310..39e30db 100644
--- a/docs/misc/changelog.rst
+++ b/docs/misc/changelog.rst
@@ -3,17 +3,14 @@
 Changelog
 ==========
 
-
-Pre-Release 0.0.3a0 (WIP)
--------------------------
-**Initial Release**
+Pre-Release 0.2.0a0 (WIP)
+------------------------------
 
 Breaking Changes:
 ^^^^^^^^^^^^^^^^^
 
 New Features:
 ^^^^^^^^^^^^^
-- Initial release of CEM-RL, PPO, SAC and TD3
 
 Bug Fixes:
 ^^^^^^^^^^
@@ -22,6 +19,32 @@ Deprecations:
 ^^^^^^^^^^^^^
 
 
+Others:
+^^^^^^^
+
+Documentation:
+^^^^^^^^^^^^^^
+- fix documentation build
+
+
+Pre-Release 0.1.0 (2020-01-20)
+------------------------------
+**First Release: base algorithms and state-dependent exploration**
+
+Breaking Changes:
+^^^^^^^^^^^^^^^^^
+
+New Features:
+^^^^^^^^^^^^^
+- Initial release of A2C, CEM-RL, PPO, SAC and TD3, working only with `Box` input space
+- State-Dependent Exploration (SDE) for A2C, PPO, SAC and TD3
+
+Bug Fixes:
+^^^^^^^^^^
+
+Deprecations:
+^^^^^^^^^^^^^
+
 Others:
 ^^^^^^^
 
diff --git a/docs/modules/a2c.rst b/docs/modules/a2c.rst
new file mode 100644
index 0000000..d8f5cf2
--- /dev/null
+++ b/docs/modules/a2c.rst
@@ -0,0 +1,73 @@
+.. _a2c:
+
+.. automodule:: torchy_baselines.a2c
+
+
+A2C
+====
+
+A synchronous, deterministic variant of `Asynchronous Advantage Actor Critic (A3C) <https://arxiv.org/abs/1602.01783>`_.
+It uses multiple workers to avoid the use of a replay buffer.
+
+
+Notes
+-----
+
+-  Original paper:  https://arxiv.org/abs/1602.01783
+-  OpenAI blog post: https://openai.com/blog/baselines-acktr-a2c/
+
+
+Can I use?
+----------
+
+-  Recurrent policies: ✔️
+-  Multi processing: ✔️
+-  Gym spaces:
+
+
+============= ====== ===========
+Space         Action Observation
+============= ====== ===========
+Discrete      ❌      ❌
+Box           ✔️      ✔️
+MultiDiscrete ❌     ❌
+MultiBinary   ❌      ❌
+============= ====== ===========
+
+
+Example
+-------
+
+Train a A2C agent on `CartPole-v1` using 4 processes.
+
+.. code-block:: python
+
+  import gym
+
+  from torchy_baselines.common.policies import MlpPolicy
+  from torchy_baselines.common import make_vec_env
+  from torchy_baselines import A2C
+
+  # Parallel environments
+  env = make_vec_env('CartPole-v1', n_envs=4)
+
+  model = A2C(MlpPolicy, env, verbose=1)
+  model.learn(total_timesteps=25000)
+  model.save("a2c_cartpole")
+
+  del model # remove to demonstrate saving and loading
+
+  model = A2C.load("a2c_cartpole")
+
+  obs = env.reset()
+  while True:
+      action, _states = model.predict(obs)
+      obs, rewards, dones, info = env.step(action)
+      env.render()
+
+Parameters
+----------
+
+.. autoclass:: A2C
+  :members:
+  :inherited-members:
diff --git a/docs/modules/cem_rl.rst b/docs/modules/cem_rl.rst
new file mode 100644
index 0000000..bc243b9
--- /dev/null
+++ b/docs/modules/cem_rl.rst
@@ -0,0 +1,96 @@
+.. _cem_rl:
+
+.. automodule:: torchy_baselines.cem_rl
+
+
+CEM RL
+======
+
+Combining cross-entropy method (CEM) and Twin Delayed Deep Deterministic policy gradient (TD3).
+
+
+.. rubric:: Available Policies
+
+.. autosummary::
+    :nosignatures:
+
+    MlpPolicy
+
+
+Notes
+-----
+
+- Original paper: https://arxiv.org/abs/1810.01222 and https://openreview.net/forum?id=BkeU5j0ctQ
+- Original Implementation: https://github.com/apourchot/CEM-RL
+
+
+.. note::
+
+	CEM RL is currently implemented for TD3
+
+
+.. note::
+
+    The default policies for CEM RL differ a bit from others MlpPolicy: it uses ReLU instead of tanh activation,
+    to match the original paper
+
+
+Can I use?
+----------
+
+-  Recurrent policies: ❌
+-  Multi processing: ❌
+-  Gym spaces:
+
+
+============= ====== ===========
+Space         Action Observation
+============= ====== ===========
+Discrete      ❌      ❌
+Box           ✔️       ✔️
+MultiDiscrete ❌      ❌
+MultiBinary   ❌      ❌
+============= ====== ===========
+
+
+Example
+-------
+
+.. code-block:: python
+
+  import numpy as np
+
+  from torchy_baselines import CEMRL
+  from torchy_baselines.td3.policies import MlpPolicy
+
+  # n_grad = 0 corresponds to CEM (in fact CMA-ES without history)
+  model = CEMRL(MlpPolicy, 'Pendulum-v0', pop_size=10, n_grad=5, verbose=1)
+  model.learn(total_timesteps=50000, log_interval=10)
+  model.save("td3_pendulum")
+  env = model.get_env()
+
+  del model # remove to demonstrate saving and loading
+
+  model = CEMRL.load("td3_pendulum")
+
+  obs = env.reset()
+  while True:
+      action, _states = model.predict(obs)
+      obs, rewards, dones, info = env.step(action)
+      env.render()
+
+Parameters
+----------
+
+.. autoclass:: CEMRL
+  :members:
+  :inherited-members:
+
+.. _cemrl_policies:
+
+CEM RL Policies
+---------------
+
+.. autoclass:: MlpPolicy
+  :members:
+  :inherited-members:
diff --git a/docs/modules/td3.rst b/docs/modules/td3.rst
index b476350..9fd6806 100644
--- a/docs/modules/td3.rst
+++ b/docs/modules/td3.rst
@@ -66,7 +66,7 @@ Example
 
   from torchy_baselines import TD3
   from torchy_baselines.td3.policies import MlpPolicy
-  from torchy_baselines.ddpg.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
+  from torchy_baselines.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
 
   # The noise objects for TD3
   n_actions = env.action_space.shape[-1]
diff --git a/docs/spelling_wordlist.txt b/docs/spelling_wordlist.txt
new file mode 100644
index 0000000..696ae55
--- /dev/null
+++ b/docs/spelling_wordlist.txt
@@ -0,0 +1,115 @@
+py
+env
+atari
+argparse
+Argparse
+TensorFlow
+feedforward
+envs
+VecEnv
+pretrain
+petrained
+tf
+np
+mujoco
+cpu
+ndarray
+ndarrays
+timestep
+timesteps
+stepsize
+dataset
+adam
+fn
+normalisation
+Kullback
+Leibler
+boolean
+deserialized
+pretrained
+minibatch
+subprocesses
+ArgumentParser
+Tensorflow
+Gaussian
+approximator
+minibatches
+hyperparameters
+hyperparameter
+vectorized
+rl
+colab
+dataloader
+npz
+datasets
+vf
+logits
+num
+Utils
+backpropagate
+prepend
+NaN
+preprocessing
+Cloudpickle
+async
+multiprocess
+tensorflow
+mlp
+cnn
+neglogp
+tanh
+coef
+repo
+Huber
+params
+ppo
+arxiv
+Arxiv
+func
+DQN
+Uhlenbeck
+Ornstein
+multithread
+cancelled
+Tensorboard
+parallelize
+customising
+serializable
+Multiprocessed
+cartpole
+toolset
+lstm
+rescale
+ffmpeg
+avconv
+unnormalized
+Github
+pre
+preprocess
+backend
+attr
+preprocess
+Antonin
+Raffin
+araffin
+Homebrew
+Numpy
+Theano
+rollout
+kfac
+Piecewise
+csv
+nvidia
+visdom
+tensorboard
+preprocessed
+namespace
+sklearn
+GoalEnv
+Torchy
+pytorch
+dicts
+optimizers
+Deprecations
+forkserver
+cuda
diff --git a/setup.py b/setup.py
index 316c136..393174f 100644
--- a/setup.py
+++ b/setup.py
@@ -22,7 +22,9 @@ setup(name='torchy_baselines',
         'docs': [
             'sphinx',
             'sphinx-autobuild',
-            'sphinx-rtd-theme'
+            'sphinx-rtd-theme',
+            # For spelling
+            'sphinxcontrib.spelling'
         ],
         'extra': [
             # For render
@@ -40,7 +42,7 @@ setup(name='torchy_baselines',
       license="MIT",
       long_description="",
       long_description_content_type='text/markdown',
-      version="0.1.0",
+      version="0.2.0a0",
       )
 
 # python setup.py sdist
diff --git a/torchy_baselines/__init__.py b/torchy_baselines/__init__.py
index 46d7956..90d98dc 100644
--- a/torchy_baselines/__init__.py
+++ b/torchy_baselines/__init__.py
@@ -4,4 +4,4 @@ from torchy_baselines.ppo import PPO
 from torchy_baselines.sac import SAC
 from torchy_baselines.td3 import TD3
 
-__version__ = "0.1.0"
+__version__ = "0.2.0a0"
diff --git a/torchy_baselines/cem_rl/cem_rl.py b/torchy_baselines/cem_rl/cem_rl.py
index e3a322b..1b84018 100644
--- a/torchy_baselines/cem_rl/cem_rl.py
+++ b/torchy_baselines/cem_rl/cem_rl.py
@@ -35,9 +35,17 @@ class CEMRL(TD3):
     :param batch_size: (int) Minibatch size for each gradient update
     :param tau: (float) the soft update coefficient ("polyak update" of the target networks, between 0 and 1)
     :param action_noise: (ActionNoise) the action noise type. Cf common.noise for the different action noise type.
-    :param target_policy_noise: (float) Standard deviation of gaussian noise added to target policy
+    :param target_policy_noise: (float) Standard deviation of Gaussian noise added to target policy
         (smoothing noise)
     :param target_noise_clip: (float) Limit for absolute value of target policy smoothing noise.
+    :param n_episodes_rollout: (int) Update the model every `n_episodes_rollout` episodes.
+        Note that this cannot be used at the same time as `train_freq`
+    :param update_style: (str) Update style for the individual that will use the gradient:
+        - original: original implementation (actor_steps // n_grad steps for the critic
+        and actor_steps gradient steps per individual)
+        - original_td3: same as before but the target networks are only update afterward
+        - td3_like: use policy delay and `actor_steps` steps for both the critic and the individual
+        - other: `2 * (actor_steps // self.n_grad)` for the critic and the individual
     :param create_eval_env: (bool) Whether to create a second environment that will be
         used for evaluating the agent periodically. (Only available when passing string for the environment)
     :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation
diff --git a/torchy_baselines/common/base_class.py b/torchy_baselines/common/base_class.py
index 7a29847..bdbef53 100644
--- a/torchy_baselines/common/base_class.py
+++ b/torchy_baselines/common/base_class.py
@@ -183,7 +183,7 @@ class BaseRLModel(object):
     def safe_mean(arr):
         """
         Compute the mean of an array if there is at least one element.
-        For empty array, return nan. It is used for logging only.
+        For empty array, return NaN. It is used for logging only.
 
         :param arr: (np.ndarray)
         :return: (float)
@@ -192,7 +192,7 @@ class BaseRLModel(object):
 
     def get_env(self):
         """
-        returns the current environment (can be None if not defined)
+        Returns the current environment (can be None if not defined).
 
         :return: (gym.Env) The current environment
         """
@@ -201,10 +201,10 @@ class BaseRLModel(object):
     @staticmethod
     def check_env(env, observation_space, action_space):
         """
-        Checks the validity of the environment and returns if it is coherent
+        Checks the validity of the environment and returns if it is consistent.
         Checked parameters:
-         - observation_space
-         - action_space
+        - observation_space
+        - action_space
         :return: (bool) True if environment seems to be coherent
         """
         if observation_space != env.observation_space:
@@ -219,8 +219,8 @@ class BaseRLModel(object):
         Checks the validity of the environment, and if it is coherent, set it as the current environment.
         Furthermore wrap any non vectorized env into a vectorized
         checked parameters:
-         - observation_space
-         - action_space
+        - observation_space
+        - action_space
 
         :param env: (gym.Env) The environment for learning a policy
         """
@@ -312,7 +312,7 @@ class BaseRLModel(object):
         Load the model from a zip-file
 
         :param load_path: (str) the location of the saved data
-        :param env: (Gym Envrionment) the new environment to run the loaded model on
+        :param env: (Gym Environment) the new environment to run the loaded model on
             (can be None if you only need prediction from a trained model) has priority over any saved environment
         :param kwargs: extra arguments to change the model when loading
         """
diff --git a/torchy_baselines/common/distributions.py b/torchy_baselines/common/distributions.py
index 9d6189c..e4ca9bc 100644
--- a/torchy_baselines/common/distributions.py
+++ b/torchy_baselines/common/distributions.py
@@ -162,7 +162,7 @@ class SquashedDiagGaussianDistribution(DiagGaussianDistribution):
             # It will be clipped to avoid NaN when inversing tanh
             gaussian_action = TanhBijector.inverse(action)
 
-        # Log likelihood for a gaussian distribution
+        # Log likelihood for a Gaussian distribution
         log_prob = super(SquashedDiagGaussianDistribution, self).log_prob(gaussian_action)
         # Squash correction (from original SAC implementation)
         # this comes from the fact that tanh is bijective and differentiable
@@ -289,7 +289,7 @@ class StateDependentNoiseDistribution(Distribution):
     def sample_weights(self, log_std, batch_size=1):
         """
         Sample weights for the noise exploration matrix,
-        using a centered gaussian distribution.
+        using a centered Gaussian distribution.
 
         :param log_std: (th.Tensor)
         :param batch_size: (int)
diff --git a/torchy_baselines/common/noise.py b/torchy_baselines/common/noise.py
index 4afef6b..09e07fb 100644
--- a/torchy_baselines/common/noise.py
+++ b/torchy_baselines/common/noise.py
@@ -17,7 +17,7 @@ class ActionNoise(object):
 
 class NormalActionNoise(ActionNoise):
     """
-    A gaussian action noise
+    A Gaussian action noise
 
     :param mean: (float) the mean value of the noise
     :param sigma: (float) the scale of the noise (std here)
diff --git a/torchy_baselines/sac/policies.py b/torchy_baselines/sac/policies.py
index 12b23cc..f40f298 100644
--- a/torchy_baselines/sac/policies.py
+++ b/torchy_baselines/sac/policies.py
@@ -74,7 +74,7 @@ class Actor(BaseNetwork):
             self.mu, self.log_std = self.action_dist.proba_distribution_net(latent_dim=net_arch[-1],
                                                                             latent_sde_dim=latent_sde_dim,
                                                                             log_std_init=log_std_init)
-            # Avoid saturation by limiting the mean of the gaussian to be in [-1, 1]
+            # Avoid saturation by limiting the mean of the Gaussian to be in [-1, 1]
             # self.mu = nn.Sequential(self.mu, nn.Tanh())
             self.mu = nn.Sequential(self.mu, nn.Hardtanh(min_val=-2.0, max_val=2.0))
             # Small positive slope to have non-zero gradient
diff --git a/torchy_baselines/sac/sac.py b/torchy_baselines/sac/sac.py
index f5a18d2..70ca621 100644
--- a/torchy_baselines/sac/sac.py
+++ b/torchy_baselines/sac/sac.py
@@ -37,6 +37,8 @@ class SAC(BaseRLModel):
     :param target_update_interval: (int) update the target network every `target_network_update_freq` steps.
     :param train_freq: (int) Update the model every `train_freq` steps.
     :param gradient_steps: (int) How many gradient update after each step
+    :param n_episodes_rollout: (int) Update the model every `n_episodes_rollout` episodes.
+        Note that this cannot be used at the same time as `train_freq`
     :param target_entropy: (str or float) target entropy when learning ent_coef (ent_coef = 'auto')
     :param action_noise: (ActionNoise) the action noise type (None by default), this can help
         for hard exploration problem. Cf common.noise for the different action noise type.
diff --git a/torchy_baselines/td3/td3.py b/torchy_baselines/td3/td3.py
index d9ce51b..8a19d8f 100644
--- a/torchy_baselines/td3/td3.py
+++ b/torchy_baselines/td3/td3.py
@@ -29,9 +29,11 @@ class TD3(BaseRLModel):
     :param batch_size: (int) Minibatch size for each gradient update
     :param train_freq: (int) Update the model every `train_freq` steps.
     :param gradient_steps: (int) How many gradient update after each step
+    :param n_episodes_rollout: (int) Update the model every `n_episodes_rollout` episodes.
+        Note that this cannot be used at the same time as `train_freq`
     :param tau: (float) the soft update coefficient ("polyak update" of the target networks, between 0 and 1)
     :param action_noise: (ActionNoise) the action noise type. Cf common.noise for the different action noise type.
-    :param target_policy_noise: (float) Standard deviation of gaussian noise added to target policy
+    :param target_policy_noise: (float) Standard deviation of Gaussian noise added to target policy
         (smoothing noise)
     :param target_noise_clip: (float) Limit for absolute value of target policy smoothing noise.
     :param use_sde: (bool) Whether to use State Dependent Exploration (SDE)