Rescale actions and add action noise

This commit is contained in:
Antonin Raffin 2019-10-07 16:26:03 +02:00
parent 12f854e1aa
commit 37ab9d10f1
6 changed files with 120 additions and 31 deletions

View file

@ -1,11 +1,17 @@
import os
import numpy as np
from torchy_baselines import TD3, CEMRL, PPO, SAC
from torchy_baselines.common.noise import NormalActionNoise
action_noise = NormalActionNoise(np.zeros(1), 0.1 * np.ones(1))
def test_td3():
model = TD3('MlpPolicy', 'Pendulum-v0', policy_kwargs=dict(net_arch=[64, 64]),
learning_starts=100, verbose=1, create_eval_env=True)
learning_starts=100, verbose=1, create_eval_env=True, action_noise=action_noise)
model.learn(total_timesteps=1000, eval_freq=500)
model.save("test_save")
model.load("test_save")
@ -14,7 +20,7 @@ def test_td3():
def test_cemrl():
model = CEMRL('MlpPolicy', 'Pendulum-v0', policy_kwargs=dict(net_arch=[16]), pop_size=2, n_grad=1,
learning_starts=100, verbose=1, create_eval_env=True)
learning_starts=100, verbose=1, create_eval_env=True, action_noise=action_noise)
model.learn(total_timesteps=1000, eval_freq=500)
model.save("test_save")
model.load("test_save")
@ -30,5 +36,6 @@ def test_ppo():
def test_sac():
model = SAC('MlpPolicy', 'Pendulum-v0', policy_kwargs=dict(net_arch=[64, 64]),
learning_starts=100, verbose=1, create_eval_env=True, ent_coef='auto')
learning_starts=100, verbose=1, create_eval_env=True, ent_coef='auto',
action_noise=NormalActionNoise(np.zeros(1), np.zeros(1)))
model.learn(total_timesteps=1000, eval_freq=500)

View file

@ -19,14 +19,14 @@ class CEMRL(TD3):
sigma_init=1e-3, pop_size=10, damp=1e-3, damp_limit=1e-5,
elitism=False, n_grad=5, policy_delay=2, batch_size=100,
buffer_size=int(1e6), learning_rate=1e-3, seed=0, device='auto',
action_noise_std=0.0, learning_starts=100, tau=0.005,
action_noise=None, learning_starts=100, tau=0.005,
n_episodes_rollout=1, update_style='original',
create_eval_env=False,
_init_setup_model=True):
super(CEMRL, self).__init__(policy, env,
buffer_size=buffer_size, learning_rate=learning_rate, seed=seed, device=device,
action_noise_std=action_noise_std, learning_starts=learning_starts,
action_noise=action_noise, learning_starts=learning_starts,
n_episodes_rollout=n_episodes_rollout, tau=tau,
policy_kwargs=policy_kwargs, verbose=verbose,
policy_delay=policy_delay, batch_size=batch_size,
@ -136,7 +136,7 @@ class CEMRL(TD3):
self.actor.load_from_vector(params)
rollout = self.collect_rollouts(self.env, n_episodes=self.n_episodes_rollout,
n_steps=-1, action_noise_std=self.action_noise_std,
n_steps=-1, action_noise=self.action_noise,
deterministic=False, callback=None,
learning_starts=self.learning_starts,
num_timesteps=self.num_timesteps,

View file

@ -80,6 +80,22 @@ class BaseRLModel(object):
assert eval_env.num_envs == 1
return eval_env
def scale_action(self, action):
    """
    Map an action expressed in the bounds of the action space,
    i.e. [low, high], onto the normalized interval [-1, 1].
    The action space does not have to be symmetric.

    :param action: the action to normalize, expressed in [low, high]
    :return: the action rescaled to [-1, 1]
    """
    space = self.action_space
    # Position of the action inside the interval, as a fraction in [0, 1]
    fraction = (action - space.low) / (space.high - space.low)
    return 2.0 * fraction - 1.0
def unscale_action(self, scaled_action):
    """
    Map a normalized action in [-1, 1] back to the bounds of the
    action space, i.e. [low, high].
    The action space does not have to be symmetric.

    :param scaled_action: the action to convert, expressed in [-1, 1]
    :return: the action rescaled to [low, high]
    """
    space = self.action_space
    # Shift [-1, 1] to [0, 1] before stretching over the action range
    fraction = 0.5 * (scaled_action + 1.0)
    return space.low + (fraction * (space.high - space.low))
def get_env(self):
"""
returns the current environment (can be None if not defined)
@ -215,7 +231,7 @@ class BaseRLModel(object):
if self.eval_env is not None:
self.eval_env.seed(seed)
def collect_rollouts(self, env, n_episodes=1, n_steps=-1, action_noise_std=0.0,
def collect_rollouts(self, env, n_episodes=1, n_steps=-1, action_noise=None,
deterministic=False, callback=None,
learning_starts=0, num_timesteps=0,
replay_buffer=None, obs=None):
@ -237,16 +253,15 @@ class BaseRLModel(object):
if num_timesteps < learning_starts:
action = [self.action_space.sample()]
else:
action = self.predict(obs, deterministic=deterministic) / self.max_action
action = self.scale_action(self.predict(obs, deterministic=deterministic))
if action_noise_std > 0:
# Add noise to the action (improve exploration)
if action_noise is not None:
# NOTE: in the original implementation of TD3, the noise was applied to the unscaled action
action_noise = np.random.normal(0, action_noise_std, size=self.action_space.shape[0])
action = (action + action_noise).clip(-1, 1)
action = np.clip(action + action_noise(), -1, 1)
# Rescale and perform action
# TODO: better rescale
new_obs, reward, done, _ = env.step(self.max_action * action)
new_obs, reward, done, _ = env.step(self.unscale_action(action))
done_bool = [float(done[0])]
episode_reward += reward
@ -267,6 +282,8 @@ class BaseRLModel(object):
total_episodes += 1
episode_rewards.append(episode_reward)
total_timesteps.append(episode_timesteps)
if action_noise is not None:
action_noise.reset()
mean_reward = np.mean(episode_rewards) if total_episodes > 0 else 0.0

View file

@ -0,0 +1,71 @@
"""
Taken from stable-baselines
"""
import numpy as np
class ActionNoise(object):
    """
    Base class shared by the action noise generators.
    """

    def reset(self):
        """
        End-of-episode reset hook for the noise.
        The base implementation has no state and therefore does nothing.
        """
        return None
class NormalActionNoise(ActionNoise):
    """
    Gaussian action noise.

    :param mean: (float or np.ndarray) the mean value of the noise
    :param sigma: (float or np.ndarray) the scale of the noise (std here)
    """

    def __init__(self, mean, sigma):
        self._mu, self._sigma = mean, sigma

    def __call__(self):
        # A fresh sample is drawn on every call
        return np.random.normal(loc=self._mu, scale=self._sigma)

    def __repr__(self):
        template = 'NormalActionNoise(mu={}, sigma={})'
        return template.format(self._mu, self._sigma)
class OrnsteinUhlenbeckActionNoise(ActionNoise):
    """
    An Ornstein-Uhlenbeck action noise, designed to approximate Brownian motion with friction.

    Based on http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab

    :param mean: (float or np.ndarray) the mean of the noise
    :param sigma: (float or np.ndarray) the scale of the noise
    :param theta: (float) the rate of mean reversion
    :param dt: (float) the timestep for the noise
    :param initial_noise: ([float]) the initial value for the noise output, (if None: 0)
    """

    def __init__(self, mean, sigma, theta=.15, dt=1e-2, initial_noise=None):
        self._theta = theta
        self._mu = mean
        self._sigma = sigma
        self._dt = dt
        self.initial_noise = initial_noise
        self.noise_prev = None
        self.reset()

    def __call__(self):
        """
        Advance the process by one timestep and return the new noise value.

        :return: the new noise value (also stored as ``noise_prev``)
        """
        # np.shape() handles plain floats and sequences as well as arrays,
        # whereas reading ``.shape`` directly fails for a scalar mean.
        gaussian = np.random.normal(size=np.shape(self._mu))
        # Pull towards the mean, proportional to the current distance
        drift = self._theta * (self._mu - self.noise_prev) * self._dt
        diffusion = self._sigma * np.sqrt(self._dt) * gaussian
        noise = self.noise_prev + drift + diffusion
        self.noise_prev = noise
        return noise

    def reset(self):
        """
        reset the Ornstein Uhlenbeck noise, to the initial position
        """
        if self.initial_noise is not None:
            self.noise_prev = self.initial_noise
        else:
            self.noise_prev = np.zeros_like(self._mu)

    def __repr__(self):
        return 'OrnsteinUhlenbeckActionNoise(mu={}, sigma={})'.format(self._mu, self._sigma)

View file

@ -41,7 +41,7 @@ class SAC(BaseRLModel):
:param gradient_steps: (int) How many gradient update after each step
:param target_entropy: (str or float) target entropy when learning ent_coef (ent_coef = 'auto')
:param action_noise: (ActionNoise) the action noise type (None by default), this can help
for hard exploration problem. Cf DDPG for the different action noise type.
for hard exploration problem. Cf common.noise for the different action noise type.
:param gamma: (float) the discount factor
:param create_eval_env: (bool) Whether to create a second environment that will be
used for evaluating the agent periodically. (Only available when passing string for the environment)
@ -57,15 +57,13 @@ class SAC(BaseRLModel):
tau=0.005, ent_coef='auto', target_update_interval=1,
train_freq=1, gradient_steps=1, n_episodes_rollout=-1,
target_entropy='auto', action_noise=None,
gamma=0.99, action_noise_std=0.0, create_eval_env=False,
gamma=0.99, create_eval_env=False,
policy_kwargs=None, verbose=0, seed=0, device='auto',
_init_setup_model=True):
super(SAC, self).__init__(policy, env, SACPolicy, policy_kwargs, verbose, device,
create_eval_env=create_eval_env)
self.max_action = np.abs(self.action_space.high)
self.action_noise_std = action_noise_std
self.learning_rate = learning_rate
self.seed = seed
self.target_entropy = target_entropy
@ -84,7 +82,7 @@ class SAC(BaseRLModel):
self.train_freq = train_freq
self.gradient_steps = gradient_steps
self.n_episodes_rollout = n_episodes_rollout
# self.action_noise = action_noise
self.action_noise = action_noise
self.gamma = gamma
if _init_setup_model:
@ -151,7 +149,7 @@ class SAC(BaseRLModel):
:param deterministic: (bool) Whether or not to return deterministic actions.
:return: (np.ndarray, np.ndarray) the model's action and the next state (used in recurrent policies)
"""
return self.max_action * self.select_action(observation)
return self.unscale_action(self.select_action(observation))
def train(self, gradient_steps, batch_size=64):
for gradient_step in range(gradient_steps):
@ -238,7 +236,7 @@ class SAC(BaseRLModel):
break
rollout = self.collect_rollouts(self.env, n_episodes=self.n_episodes_rollout,
n_steps=self.train_freq, action_noise_std=self.action_noise_std,
n_steps=self.train_freq, action_noise=self.action_noise,
deterministic=False, callback=None,
learning_starts=self.learning_starts,
num_timesteps=self.num_timesteps,

View file

@ -33,7 +33,7 @@ class TD3(BaseRLModel):
:param train_freq: (int) Update the model every `train_freq` steps.
:param gradient_steps: (int) How many gradient update after each step
:param tau: (float) the soft update coefficient ("polyak update" of the target networks, between 0 and 1)
:param action_noise: (ActionNoise) the action noise type. Cf DDPG for the different action noise type.
:param action_noise: (ActionNoise) the action noise type. Cf common.noise for the different action noise type.
:param target_policy_noise: (float) Standard deviation of gaussian noise added to target policy
(smoothing noise)
:param target_noise_clip: (float) Limit for absolute value of target policy smoothing noise.
@ -47,8 +47,7 @@ class TD3(BaseRLModel):
:param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance
"""
def __init__(self, policy, env, buffer_size=int(1e6), learning_rate=1e-3,
action_noise_std=0.1, policy_delay=2, learning_starts=100,
gamma=0.99, batch_size=100,
policy_delay=2, learning_starts=100, gamma=0.99, batch_size=100,
train_freq=-1, gradient_steps=-1, n_episodes_rollout=1,
tau=0.005, action_noise=None, target_policy_noise=0.2, target_noise_clip=0.5,
create_eval_env=False, policy_kwargs=None, verbose=0,
@ -57,8 +56,6 @@ class TD3(BaseRLModel):
super(TD3, self).__init__(policy, env, TD3Policy, policy_kwargs, verbose, device,
create_eval_env=create_eval_env)
self.max_action = np.abs(self.action_space.high)
self.action_noise_std = action_noise_std
self.buffer_size = buffer_size
self.seed = seed
@ -72,7 +69,7 @@ class TD3(BaseRLModel):
self.batch_size = batch_size
self.tau = tau
self.gamma = gamma
# self.action_noise = action_noise
self.action_noise = action_noise
self.policy_delay = policy_delay
self.target_noise_clip = target_noise_clip
self.target_policy_noise = target_policy_noise
@ -112,10 +109,7 @@ class TD3(BaseRLModel):
:param deterministic: (bool) Whether or not to return deterministic actions.
:return: (np.ndarray, np.ndarray) the model's action and the next state (used in recurrent policies)
"""
# Rescale the action (no need for symmetric action space)
# return self.action_space.low +\
# (0.5 * (self.select_action(observation) + 1.0) * (self.action_space.high - self.action_space.low))
return self.max_action * self.select_action(observation)
return self.unscale_action(self.select_action(observation))
def train_critic(self, gradient_steps=1, batch_size=100, replay_data=None, tau=0.0):
@ -200,6 +194,8 @@ class TD3(BaseRLModel):
start_time = time.time()
eval_env = self._get_eval_env(eval_env)
obs = self.env.reset()
if self.action_noise is not None:
self.action_noise.reset()
while self.num_timesteps < total_timesteps:
@ -209,7 +205,7 @@ class TD3(BaseRLModel):
break
rollout = self.collect_rollouts(self.env, n_episodes=self.n_episodes_rollout,
n_steps=self.train_freq, action_noise_std=self.action_noise_std,
n_steps=self.train_freq, action_noise=self.action_noise,
deterministic=False, callback=None,
learning_starts=self.learning_starts,
num_timesteps=self.num_timesteps,