diff --git a/tests/test_run.py b/tests/test_run.py index 084176a..f409ab5 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -1,11 +1,17 @@ import os +import numpy as np + from torchy_baselines import TD3, CEMRL, PPO, SAC +from torchy_baselines.common.noise import NormalActionNoise + + +action_noise = NormalActionNoise(np.zeros(1), 0.1 * np.ones(1)) def test_td3(): model = TD3('MlpPolicy', 'Pendulum-v0', policy_kwargs=dict(net_arch=[64, 64]), - learning_starts=100, verbose=1, create_eval_env=True) + learning_starts=100, verbose=1, create_eval_env=True, action_noise=action_noise) model.learn(total_timesteps=1000, eval_freq=500) model.save("test_save") model.load("test_save") @@ -14,7 +20,7 @@ def test_td3(): def test_cemrl(): model = CEMRL('MlpPolicy', 'Pendulum-v0', policy_kwargs=dict(net_arch=[16]), pop_size=2, n_grad=1, - learning_starts=100, verbose=1, create_eval_env=True) + learning_starts=100, verbose=1, create_eval_env=True, action_noise=action_noise) model.learn(total_timesteps=1000, eval_freq=500) model.save("test_save") model.load("test_save") @@ -30,5 +36,6 @@ def test_ppo(): def test_sac(): model = SAC('MlpPolicy', 'Pendulum-v0', policy_kwargs=dict(net_arch=[64, 64]), - learning_starts=100, verbose=1, create_eval_env=True, ent_coef='auto') + learning_starts=100, verbose=1, create_eval_env=True, ent_coef='auto', + action_noise=NormalActionNoise(np.zeros(1), np.zeros(1))) model.learn(total_timesteps=1000, eval_freq=500) diff --git a/torchy_baselines/cem_rl/cem_rl.py b/torchy_baselines/cem_rl/cem_rl.py index 93453eb..e17cc40 100644 --- a/torchy_baselines/cem_rl/cem_rl.py +++ b/torchy_baselines/cem_rl/cem_rl.py @@ -19,14 +19,14 @@ class CEMRL(TD3): sigma_init=1e-3, pop_size=10, damp=1e-3, damp_limit=1e-5, elitism=False, n_grad=5, policy_delay=2, batch_size=100, buffer_size=int(1e6), learning_rate=1e-3, seed=0, device='auto', - action_noise_std=0.0, learning_starts=100, tau=0.005, + action_noise=None, learning_starts=100, tau=0.005, 
def scale_action(self, action):
    """
    Map an action expressed in the bounds of ``self.action_space``
    onto the normalized interval [-1, 1].
    The bounds do not have to be symmetric around zero.

    :param action: the action given in the [low, high] range
    :return: the same action expressed in the [-1, 1] range
    """
    space = self.action_space
    span = space.high - space.low
    # Position inside the bounds as a fraction in [0, 1], then stretch to [-1, 1]
    return 2.0 * ((action - space.low) / span) - 1.0

def unscale_action(self, scaled_action):
    """
    Map a normalized action from [-1, 1] back to the bounds
    of ``self.action_space`` (inverse of ``scale_action``).
    The bounds do not have to be symmetric around zero.

    :param scaled_action: the action given in the [-1, 1] range
    :return: the same action expressed in the [low, high] range
    """
    space = self.action_space
    span = space.high - space.low
    # Fraction in [0, 1] of the way from low to high
    fraction = 0.5 * (scaled_action + 1.0)
    return space.low + (fraction * span)
"""
Taken from stable-baselines
"""
import numpy as np


class ActionNoise(object):
    """
    The action noise base class.

    Subclasses implement ``__call__`` to draw one noise sample and may
    override ``reset`` when they keep state between calls.
    """

    def __call__(self):
        """
        Draw one noise sample.

        :return: (np.ndarray or float) the noise to add to the action
        """
        raise NotImplementedError()

    def reset(self):
        """
        call end of episode reset for the noise
        """
        pass


class NormalActionNoise(ActionNoise):
    """
    A gaussian action noise

    :param mean: (float or np.ndarray) the mean value of the noise
    :param sigma: (float or np.ndarray) the scale of the noise (std here)
    """

    def __init__(self, mean, sigma):
        self._mu = mean
        self._sigma = sigma

    def __call__(self):
        # The sample has the broadcast shape of mean and sigma
        return np.random.normal(self._mu, self._sigma)

    def __repr__(self):
        return 'NormalActionNoise(mu={}, sigma={})'.format(self._mu, self._sigma)


class OrnsteinUhlenbeckActionNoise(ActionNoise):
    """
    A Ornstein Uhlenbeck action noise, this is designed to approximate brownian motion with friction.

    Based on http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab

    :param mean: (float or np.ndarray) the mean of the noise
    :param sigma: (float or np.ndarray) the scale of the noise
    :param theta: (float) the rate of mean reversion
    :param dt: (float) the timestep for the noise
    :param initial_noise: ([float]) the initial value for the noise output, (if None: 0)
    """

    def __init__(self, mean, sigma, theta=.15, dt=1e-2, initial_noise=None):
        self._theta = theta
        self._mu = mean
        self._sigma = sigma
        self._dt = dt
        self.initial_noise = initial_noise
        self.noise_prev = None
        self.reset()

    def __call__(self):
        # np.shape (rather than self._mu.shape) also accepts a plain float mean,
        # as documented, instead of raising AttributeError
        gaussian = np.random.normal(size=np.shape(self._mu))
        # One discretized step: mean-reverting drift plus scaled random diffusion
        drift = self._theta * (self._mu - self.noise_prev) * self._dt
        diffusion = self._sigma * np.sqrt(self._dt) * gaussian
        noise = self.noise_prev + drift + diffusion
        # Remember the sample: the next one is correlated with it
        self.noise_prev = noise
        return noise

    def reset(self):
        """
        reset the Ornstein Uhlenbeck noise, to the initial position
        """
        if self.initial_noise is not None:
            self.noise_prev = self.initial_noise
        else:
            self.noise_prev = np.zeros_like(self._mu)

    def __repr__(self):
        return 'OrnsteinUhlenbeckActionNoise(mu={}, sigma={})'.format(self._mu, self._sigma)
Cf DDPG for the different action noise type. + for hard exploration problem. Cf common.noise for the different action noise type. :param gamma: (float) the discount factor :param create_eval_env: (bool) Whether to create a second environment that will be used for evaluating the agent periodically. (Only available when passing string for the environment) @@ -57,15 +57,13 @@ class SAC(BaseRLModel): tau=0.005, ent_coef='auto', target_update_interval=1, train_freq=1, gradient_steps=1, n_episodes_rollout=-1, target_entropy='auto', action_noise=None, - gamma=0.99, action_noise_std=0.0, create_eval_env=False, + gamma=0.99, create_eval_env=False, policy_kwargs=None, verbose=0, seed=0, device='auto', _init_setup_model=True): super(SAC, self).__init__(policy, env, SACPolicy, policy_kwargs, verbose, device, create_eval_env=create_eval_env) - self.max_action = np.abs(self.action_space.high) - self.action_noise_std = action_noise_std self.learning_rate = learning_rate self.seed = seed self.target_entropy = target_entropy @@ -84,7 +82,7 @@ class SAC(BaseRLModel): self.train_freq = train_freq self.gradient_steps = gradient_steps self.n_episodes_rollout = n_episodes_rollout - # self.action_noise = action_noise + self.action_noise = action_noise self.gamma = gamma if _init_setup_model: @@ -151,7 +149,7 @@ class SAC(BaseRLModel): :param deterministic: (bool) Whether or not to return deterministic actions. 
:return: (np.ndarray, np.ndarray) the model's action and the next state (used in recurrent policies) """ - return self.max_action * self.select_action(observation) + return self.unscale_action(self.select_action(observation)) def train(self, gradient_steps, batch_size=64): for gradient_step in range(gradient_steps): @@ -238,7 +236,7 @@ class SAC(BaseRLModel): break rollout = self.collect_rollouts(self.env, n_episodes=self.n_episodes_rollout, - n_steps=self.train_freq, action_noise_std=self.action_noise_std, + n_steps=self.train_freq, action_noise=self.action_noise, deterministic=False, callback=None, learning_starts=self.learning_starts, num_timesteps=self.num_timesteps, diff --git a/torchy_baselines/td3/td3.py b/torchy_baselines/td3/td3.py index 7f34970..36a221f 100644 --- a/torchy_baselines/td3/td3.py +++ b/torchy_baselines/td3/td3.py @@ -33,7 +33,7 @@ class TD3(BaseRLModel): :param train_freq: (int) Update the model every `train_freq` steps. :param gradient_steps: (int) How many gradient update after each step :param tau: (float) the soft update coefficient ("polyak update" of the target networks, between 0 and 1) - :param action_noise: (ActionNoise) the action noise type. Cf DDPG for the different action noise type. + :param action_noise: (ActionNoise) the action noise type. Cf common.noise for the different action noise type. :param target_policy_noise: (float) Standard deviation of gaussian noise added to target policy (smoothing noise) :param target_noise_clip: (float) Limit for absolute value of target policy smoothing noise. 
@@ -47,8 +47,7 @@ class TD3(BaseRLModel): :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance """ def __init__(self, policy, env, buffer_size=int(1e6), learning_rate=1e-3, - action_noise_std=0.1, policy_delay=2, learning_starts=100, - gamma=0.99, batch_size=100, + policy_delay=2, learning_starts=100, gamma=0.99, batch_size=100, train_freq=-1, gradient_steps=-1, n_episodes_rollout=1, tau=0.005, action_noise=None, target_policy_noise=0.2, target_noise_clip=0.5, create_eval_env=False, policy_kwargs=None, verbose=0, @@ -57,8 +56,6 @@ class TD3(BaseRLModel): super(TD3, self).__init__(policy, env, TD3Policy, policy_kwargs, verbose, device, create_eval_env=create_eval_env) - self.max_action = np.abs(self.action_space.high) - self.action_noise_std = action_noise_std self.buffer_size = buffer_size self.seed = seed @@ -72,7 +69,7 @@ class TD3(BaseRLModel): self.batch_size = batch_size self.tau = tau self.gamma = gamma - # self.action_noise = action_noise + self.action_noise = action_noise self.policy_delay = policy_delay self.target_noise_clip = target_noise_clip self.target_policy_noise = target_policy_noise @@ -112,10 +109,7 @@ class TD3(BaseRLModel): :param deterministic: (bool) Whether or not to return deterministic actions. 
:return: (np.ndarray, np.ndarray) the model's action and the next state (used in recurrent policies) """ - # Rescale the action (no need for symmetric action space) - # return self.action_space.low +\ - # (0.5 * (self.select_action(observation) + 1.0) * (self.action_space.high - self.action_space.low)) - return self.max_action * self.select_action(observation) + return self.unscale_action(self.select_action(observation)) def train_critic(self, gradient_steps=1, batch_size=100, replay_data=None, tau=0.0): @@ -200,6 +194,8 @@ class TD3(BaseRLModel): start_time = time.time() eval_env = self._get_eval_env(eval_env) obs = self.env.reset() + if self.action_noise is not None: + self.action_noise.reset() while self.num_timesteps < total_timesteps: @@ -209,7 +205,7 @@ class TD3(BaseRLModel): break rollout = self.collect_rollouts(self.env, n_episodes=self.n_episodes_rollout, - n_steps=self.train_freq, action_noise_std=self.action_noise_std, + n_steps=self.train_freq, action_noise=self.action_noise, deterministic=False, callback=None, learning_starts=self.learning_starts, num_timesteps=self.num_timesteps,