mirror of
https://github.com/saymrwulf/stable-baselines3.git
synced 2026-05-18 21:30:19 +00:00
Rescale actions and add action noise
This commit is contained in:
parent
12f854e1aa
commit
37ab9d10f1
6 changed files with 120 additions and 31 deletions
|
|
@ -1,11 +1,17 @@
|
|||
import os
|
||||
|
||||
import numpy as np
|
||||
|
||||
from torchy_baselines import TD3, CEMRL, PPO, SAC
|
||||
from torchy_baselines.common.noise import NormalActionNoise
|
||||
|
||||
|
||||
action_noise = NormalActionNoise(np.zeros(1), 0.1 * np.ones(1))
|
||||
|
||||
|
||||
def test_td3():
|
||||
model = TD3('MlpPolicy', 'Pendulum-v0', policy_kwargs=dict(net_arch=[64, 64]),
|
||||
learning_starts=100, verbose=1, create_eval_env=True)
|
||||
learning_starts=100, verbose=1, create_eval_env=True, action_noise=action_noise)
|
||||
model.learn(total_timesteps=1000, eval_freq=500)
|
||||
model.save("test_save")
|
||||
model.load("test_save")
|
||||
|
|
@ -14,7 +20,7 @@ def test_td3():
|
|||
|
||||
def test_cemrl():
|
||||
model = CEMRL('MlpPolicy', 'Pendulum-v0', policy_kwargs=dict(net_arch=[16]), pop_size=2, n_grad=1,
|
||||
learning_starts=100, verbose=1, create_eval_env=True)
|
||||
learning_starts=100, verbose=1, create_eval_env=True, action_noise=action_noise)
|
||||
model.learn(total_timesteps=1000, eval_freq=500)
|
||||
model.save("test_save")
|
||||
model.load("test_save")
|
||||
|
|
@ -30,5 +36,6 @@ def test_ppo():
|
|||
|
||||
def test_sac():
|
||||
model = SAC('MlpPolicy', 'Pendulum-v0', policy_kwargs=dict(net_arch=[64, 64]),
|
||||
learning_starts=100, verbose=1, create_eval_env=True, ent_coef='auto')
|
||||
learning_starts=100, verbose=1, create_eval_env=True, ent_coef='auto',
|
||||
action_noise=NormalActionNoise(np.zeros(1), np.zeros(1)))
|
||||
model.learn(total_timesteps=1000, eval_freq=500)
|
||||
|
|
|
|||
|
|
@ -19,14 +19,14 @@ class CEMRL(TD3):
|
|||
sigma_init=1e-3, pop_size=10, damp=1e-3, damp_limit=1e-5,
|
||||
elitism=False, n_grad=5, policy_delay=2, batch_size=100,
|
||||
buffer_size=int(1e6), learning_rate=1e-3, seed=0, device='auto',
|
||||
action_noise_std=0.0, learning_starts=100, tau=0.005,
|
||||
action_noise=None, learning_starts=100, tau=0.005,
|
||||
n_episodes_rollout=1, update_style='original',
|
||||
create_eval_env=False,
|
||||
_init_setup_model=True):
|
||||
|
||||
super(CEMRL, self).__init__(policy, env,
|
||||
buffer_size=buffer_size, learning_rate=learning_rate, seed=seed, device=device,
|
||||
action_noise_std=action_noise_std, learning_starts=learning_starts,
|
||||
action_noise=action_noise, learning_starts=learning_starts,
|
||||
n_episodes_rollout=n_episodes_rollout, tau=tau,
|
||||
policy_kwargs=policy_kwargs, verbose=verbose,
|
||||
policy_delay=policy_delay, batch_size=batch_size,
|
||||
|
|
@ -136,7 +136,7 @@ class CEMRL(TD3):
|
|||
self.actor.load_from_vector(params)
|
||||
|
||||
rollout = self.collect_rollouts(self.env, n_episodes=self.n_episodes_rollout,
|
||||
n_steps=-1, action_noise_std=self.action_noise_std,
|
||||
n_steps=-1, action_noise=self.action_noise,
|
||||
deterministic=False, callback=None,
|
||||
learning_starts=self.learning_starts,
|
||||
num_timesteps=self.num_timesteps,
|
||||
|
|
|
|||
|
|
@ -80,6 +80,22 @@ class BaseRLModel(object):
|
|||
assert eval_env.num_envs == 1
|
||||
return eval_env
|
||||
|
||||
def scale_action(self, action):
    """
    Map an action expressed in the environment bounds [low, high]
    onto the normalized interval [-1, 1].
    The bounds are read from ``self.action_space`` and do not need
    to be symmetric around zero.

    :param action: the action expressed in [low, high]
    :return: the same action expressed in [-1, 1]
    """
    bounds = self.action_space
    lower, upper = bounds.low, bounds.high
    # Position of the action inside the interval, as a fraction in [0, 1]
    fraction = (action - lower) / (upper - lower)
    return 2.0 * fraction - 1.0
|
||||
|
||||
def unscale_action(self, scaled_action):
    """
    Map a normalized action in [-1, 1] back onto the environment
    bounds [low, high].
    The bounds are read from ``self.action_space`` and do not need
    to be symmetric around zero.

    :param scaled_action: the action expressed in [-1, 1]
    :return: the same action expressed in [low, high]
    """
    bounds = self.action_space
    lower, upper = bounds.low, bounds.high
    # Position of the action inside the interval, as a fraction in [0, 1]
    fraction = 0.5 * (scaled_action + 1.0)
    return lower + (fraction * (upper - lower))
|
||||
|
||||
def get_env(self):
|
||||
"""
|
||||
returns the current environment (can be None if not defined)
|
||||
|
|
@ -215,7 +231,7 @@ class BaseRLModel(object):
|
|||
if self.eval_env is not None:
|
||||
self.eval_env.seed(seed)
|
||||
|
||||
def collect_rollouts(self, env, n_episodes=1, n_steps=-1, action_noise_std=0.0,
|
||||
def collect_rollouts(self, env, n_episodes=1, n_steps=-1, action_noise=None,
|
||||
deterministic=False, callback=None,
|
||||
learning_starts=0, num_timesteps=0,
|
||||
replay_buffer=None, obs=None):
|
||||
|
|
@ -237,16 +253,15 @@ class BaseRLModel(object):
|
|||
if num_timesteps < learning_starts:
|
||||
action = [self.action_space.sample()]
|
||||
else:
|
||||
action = self.predict(obs, deterministic=deterministic) / self.max_action
|
||||
action = self.scale_action(self.predict(obs, deterministic=deterministic))
|
||||
|
||||
if action_noise_std > 0:
|
||||
# Add noise to the action (improve exploration)
|
||||
if action_noise is not None:
|
||||
# NOTE: in the original implementation of TD3, the noise was applied to the unscaled action
|
||||
action_noise = np.random.normal(0, action_noise_std, size=self.action_space.shape[0])
|
||||
action = (action + action_noise).clip(-1, 1)
|
||||
action = np.clip(action + action_noise(), -1, 1)
|
||||
|
||||
# Rescale and perform action
|
||||
# TODO: better rescale
|
||||
new_obs, reward, done, _ = env.step(self.max_action * action)
|
||||
new_obs, reward, done, _ = env.step(self.unscale_action(action))
|
||||
|
||||
done_bool = [float(done[0])]
|
||||
episode_reward += reward
|
||||
|
|
@ -267,6 +282,8 @@ class BaseRLModel(object):
|
|||
total_episodes += 1
|
||||
episode_rewards.append(episode_reward)
|
||||
total_timesteps.append(episode_timesteps)
|
||||
if action_noise is not None:
|
||||
action_noise.reset()
|
||||
|
||||
mean_reward = np.mean(episode_rewards) if total_episodes > 0 else 0.0
|
||||
|
||||
|
|
|
|||
71
torchy_baselines/common/noise.py
Normal file
71
torchy_baselines/common/noise.py
Normal file
|
|
@ -0,0 +1,71 @@
|
|||
"""
|
||||
Taken from stable-baselines
|
||||
"""
|
||||
import numpy as np
|
||||
|
||||
|
||||
class ActionNoise(object):
    """
    Base class shared by the action noise generators.
    """

    def reset(self):
        """
        Hook called at the end of an episode to reset the noise.
        The base implementation does nothing.
        """
        return None
|
||||
|
||||
|
||||
class NormalActionNoise(ActionNoise):
    """
    Gaussian action noise: every call draws a fresh sample
    from a normal distribution.

    :param mean: (float) the mean value of the noise
    :param sigma: (float) the scale of the noise (std here)
    """

    def __init__(self, mean, sigma):
        self._mu, self._sigma = mean, sigma

    def __call__(self):
        # Draw one sample; mean and std are forwarded unchanged to numpy
        sample = np.random.normal(self._mu, self._sigma)
        return sample

    def __repr__(self):
        template = 'NormalActionNoise(mu={}, sigma={})'
        return template.format(self._mu, self._sigma)
|
||||
|
||||
|
||||
class OrnsteinUhlenbeckActionNoise(ActionNoise):
    """
    An Ornstein-Uhlenbeck action noise, designed to approximate Brownian motion with friction.

    Based on http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab

    :param mean: (float or np.ndarray) the mean of the noise
    :param sigma: (float or np.ndarray) the scale of the noise
    :param theta: (float) the rate of mean reversion
    :param dt: (float) the timestep for the noise
    :param initial_noise: ([float]) the initial value for the noise output, (if None: 0)
    """

    def __init__(self, mean, sigma, theta=.15, dt=1e-2, initial_noise=None):
        self._theta = theta
        # __call__ reads `self._mu.shape`, so a plain Python float would raise
        # AttributeError there. np.asarray makes scalars usable and hands back
        # an existing ndarray unchanged.
        self._mu = np.asarray(mean)
        self._sigma = sigma
        self._dt = dt
        self.initial_noise = initial_noise
        self.noise_prev = None
        self.reset()

    def __call__(self):
        """
        Advance the process by one step of size ``dt`` and return the new noise value.

        :return: (np.ndarray) the noise sample, same shape as the mean
        """
        noise = self.noise_prev + self._theta * (self._mu - self.noise_prev) * self._dt + \
            self._sigma * np.sqrt(self._dt) * np.random.normal(size=self._mu.shape)
        # Keep the sample: the next step starts from it
        self.noise_prev = noise
        return noise

    def reset(self):
        """
        Reset the Ornstein-Uhlenbeck noise to its initial position
        (``initial_noise`` if it was given, zeros shaped like the mean otherwise).
        """
        self.noise_prev = self.initial_noise if self.initial_noise is not None else np.zeros_like(self._mu)

    def __repr__(self):
        return 'OrnsteinUhlenbeckActionNoise(mu={}, sigma={})'.format(self._mu, self._sigma)
|
||||
|
|
@ -41,7 +41,7 @@ class SAC(BaseRLModel):
|
|||
:param gradient_steps: (int) How many gradient update after each step
|
||||
:param target_entropy: (str or float) target entropy when learning ent_coef (ent_coef = 'auto')
|
||||
:param action_noise: (ActionNoise) the action noise type (None by default), this can help
|
||||
for hard exploration problem. Cf DDPG for the different action noise type.
|
||||
for hard exploration problem. Cf common.noise for the different action noise type.
|
||||
:param gamma: (float) the discount factor
|
||||
:param create_eval_env: (bool) Whether to create a second environment that will be
|
||||
used for evaluating the agent periodically. (Only available when passing string for the environment)
|
||||
|
|
@ -57,15 +57,13 @@ class SAC(BaseRLModel):
|
|||
tau=0.005, ent_coef='auto', target_update_interval=1,
|
||||
train_freq=1, gradient_steps=1, n_episodes_rollout=-1,
|
||||
target_entropy='auto', action_noise=None,
|
||||
gamma=0.99, action_noise_std=0.0, create_eval_env=False,
|
||||
gamma=0.99, create_eval_env=False,
|
||||
policy_kwargs=None, verbose=0, seed=0, device='auto',
|
||||
_init_setup_model=True):
|
||||
|
||||
super(SAC, self).__init__(policy, env, SACPolicy, policy_kwargs, verbose, device,
|
||||
create_eval_env=create_eval_env)
|
||||
|
||||
self.max_action = np.abs(self.action_space.high)
|
||||
self.action_noise_std = action_noise_std
|
||||
self.learning_rate = learning_rate
|
||||
self.seed = seed
|
||||
self.target_entropy = target_entropy
|
||||
|
|
@ -84,7 +82,7 @@ class SAC(BaseRLModel):
|
|||
self.train_freq = train_freq
|
||||
self.gradient_steps = gradient_steps
|
||||
self.n_episodes_rollout = n_episodes_rollout
|
||||
# self.action_noise = action_noise
|
||||
self.action_noise = action_noise
|
||||
self.gamma = gamma
|
||||
|
||||
if _init_setup_model:
|
||||
|
|
@ -151,7 +149,7 @@ class SAC(BaseRLModel):
|
|||
:param deterministic: (bool) Whether or not to return deterministic actions.
|
||||
:return: (np.ndarray, np.ndarray) the model's action and the next state (used in recurrent policies)
|
||||
"""
|
||||
return self.max_action * self.select_action(observation)
|
||||
return self.unscale_action(self.select_action(observation))
|
||||
|
||||
def train(self, gradient_steps, batch_size=64):
|
||||
for gradient_step in range(gradient_steps):
|
||||
|
|
@ -238,7 +236,7 @@ class SAC(BaseRLModel):
|
|||
break
|
||||
|
||||
rollout = self.collect_rollouts(self.env, n_episodes=self.n_episodes_rollout,
|
||||
n_steps=self.train_freq, action_noise_std=self.action_noise_std,
|
||||
n_steps=self.train_freq, action_noise=self.action_noise,
|
||||
deterministic=False, callback=None,
|
||||
learning_starts=self.learning_starts,
|
||||
num_timesteps=self.num_timesteps,
|
||||
|
|
|
|||
|
|
@ -33,7 +33,7 @@ class TD3(BaseRLModel):
|
|||
:param train_freq: (int) Update the model every `train_freq` steps.
|
||||
:param gradient_steps: (int) How many gradient update after each step
|
||||
:param tau: (float) the soft update coefficient ("polyak update" of the target networks, between 0 and 1)
|
||||
:param action_noise: (ActionNoise) the action noise type. Cf DDPG for the different action noise type.
|
||||
:param action_noise: (ActionNoise) the action noise type. Cf common.noise for the different action noise type.
|
||||
:param target_policy_noise: (float) Standard deviation of gaussian noise added to target policy
|
||||
(smoothing noise)
|
||||
:param target_noise_clip: (float) Limit for absolute value of target policy smoothing noise.
|
||||
|
|
@ -47,8 +47,7 @@ class TD3(BaseRLModel):
|
|||
:param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance
|
||||
"""
|
||||
def __init__(self, policy, env, buffer_size=int(1e6), learning_rate=1e-3,
|
||||
action_noise_std=0.1, policy_delay=2, learning_starts=100,
|
||||
gamma=0.99, batch_size=100,
|
||||
policy_delay=2, learning_starts=100, gamma=0.99, batch_size=100,
|
||||
train_freq=-1, gradient_steps=-1, n_episodes_rollout=1,
|
||||
tau=0.005, action_noise=None, target_policy_noise=0.2, target_noise_clip=0.5,
|
||||
create_eval_env=False, policy_kwargs=None, verbose=0,
|
||||
|
|
@ -57,8 +56,6 @@ class TD3(BaseRLModel):
|
|||
super(TD3, self).__init__(policy, env, TD3Policy, policy_kwargs, verbose, device,
|
||||
create_eval_env=create_eval_env)
|
||||
|
||||
self.max_action = np.abs(self.action_space.high)
|
||||
self.action_noise_std = action_noise_std
|
||||
self.buffer_size = buffer_size
|
||||
self.seed = seed
|
||||
|
||||
|
|
@ -72,7 +69,7 @@ class TD3(BaseRLModel):
|
|||
self.batch_size = batch_size
|
||||
self.tau = tau
|
||||
self.gamma = gamma
|
||||
# self.action_noise = action_noise
|
||||
self.action_noise = action_noise
|
||||
self.policy_delay = policy_delay
|
||||
self.target_noise_clip = target_noise_clip
|
||||
self.target_policy_noise = target_policy_noise
|
||||
|
|
@ -112,10 +109,7 @@ class TD3(BaseRLModel):
|
|||
:param deterministic: (bool) Whether or not to return deterministic actions.
|
||||
:return: (np.ndarray, np.ndarray) the model's action and the next state (used in recurrent policies)
|
||||
"""
|
||||
# Rescale the action (no need for symmetric action space)
|
||||
# return self.action_space.low +\
|
||||
# (0.5 * (self.select_action(observation) + 1.0) * (self.action_space.high - self.action_space.low))
|
||||
return self.max_action * self.select_action(observation)
|
||||
return self.unscale_action(self.select_action(observation))
|
||||
|
||||
def train_critic(self, gradient_steps=1, batch_size=100, replay_data=None, tau=0.0):
|
||||
|
||||
|
|
@ -200,6 +194,8 @@ class TD3(BaseRLModel):
|
|||
start_time = time.time()
|
||||
eval_env = self._get_eval_env(eval_env)
|
||||
obs = self.env.reset()
|
||||
if self.action_noise is not None:
|
||||
self.action_noise.reset()
|
||||
|
||||
while self.num_timesteps < total_timesteps:
|
||||
|
||||
|
|
@ -209,7 +205,7 @@ class TD3(BaseRLModel):
|
|||
break
|
||||
|
||||
rollout = self.collect_rollouts(self.env, n_episodes=self.n_episodes_rollout,
|
||||
n_steps=self.train_freq, action_noise_std=self.action_noise_std,
|
||||
n_steps=self.train_freq, action_noise=self.action_noise,
|
||||
deterministic=False, callback=None,
|
||||
learning_starts=self.learning_starts,
|
||||
num_timesteps=self.num_timesteps,
|
||||
|
|
|
|||
Loading…
Reference in a new issue