mirror of
https://github.com/saymrwulf/stable-baselines3.git
synced 2026-06-03 23:49:57 +00:00
Update collect rollout
This commit is contained in:
parent
6bfbb7198a
commit
322399e8fe
8 changed files with 109 additions and 72 deletions
|
|
@ -8,13 +8,13 @@ PyTorch version of [Stable Baselines](https://github.com/hill-a/stable-baselines
|
|||
|
||||
TODO:
|
||||
- save/load
|
||||
- automatic choice for action distribution
|
||||
- predict
|
||||
- better rescale (min + action * range)
|
||||
- documentation
|
||||
- flexible mlp
|
||||
- logger
|
||||
- better monitor wrapper?
|
||||
- automatic choice for action distribution
|
||||
|
||||
Later:
|
||||
- get_parameters / set_parameters
|
||||
|
|
|
|||
|
|
@ -19,13 +19,15 @@ class CEMRL(TD3):
|
|||
sigma_init=1e-3, pop_size=10, damp=1e-3, damp_limit=1e-5,
|
||||
elitism=False, n_grad=5, policy_delay=2, batch_size=100,
|
||||
buffer_size=int(1e6), learning_rate=1e-3, seed=0, device='auto',
|
||||
action_noise_std=0.0, learning_starts=100, update_style='original',
|
||||
action_noise_std=0.0, learning_starts=100, tau=0.005,
|
||||
n_episodes_rollout=1, update_style='original',
|
||||
create_eval_env=False,
|
||||
_init_setup_model=True):
|
||||
|
||||
super(CEMRL, self).__init__(policy, env,
|
||||
buffer_size=buffer_size, learning_rate=learning_rate, seed=seed, device=device,
|
||||
action_noise_std=action_noise_std, learning_starts=learning_starts,
|
||||
n_episodes_rollout=n_episodes_rollout, tau=tau,
|
||||
policy_kwargs=policy_kwargs, verbose=verbose,
|
||||
policy_delay=policy_delay, batch_size=batch_size,
|
||||
create_eval_env=create_eval_env,
|
||||
|
|
@ -61,6 +63,7 @@ class CEMRL(TD3):
|
|||
evaluations = []
|
||||
start_time = time.time()
|
||||
eval_env = self._get_eval_env(eval_env)
|
||||
obs = self.env.reset()
|
||||
|
||||
while self.num_timesteps < total_timesteps:
|
||||
|
||||
|
|
@ -88,11 +91,11 @@ class CEMRL(TD3):
|
|||
# instead of the train_actor() and no policy delay
|
||||
# Issue with this update style: the bigger the population, the slower the code
|
||||
if self.update_style == 'original':
|
||||
self.train_critic(actor_steps // self.n_grad, tau=0.005)
|
||||
self.train_actor(actor_steps, tau_critic=0.0)
|
||||
self.train_critic(actor_steps // self.n_grad, tau=self.tau)
|
||||
self.train_actor(actor_steps, tau_actor=self.tau, tau_critic=0.0)
|
||||
elif self.update_style == 'original_td3':
|
||||
self.train_critic(actor_steps // self.n_grad, tau=0.0)
|
||||
self.train_actor(actor_steps)
|
||||
self.train_actor(actor_steps, tau_actor=self.tau, tau_critic=self.tau)
|
||||
else:
|
||||
# Closer to td3: with policy delay
|
||||
if self.update_style == 'td3_like':
|
||||
|
|
@ -108,7 +111,7 @@ class CEMRL(TD3):
|
|||
|
||||
# Delayed policy updates
|
||||
if it % self.policy_delay == 0:
|
||||
self.train_actor(replay_data=replay_data)
|
||||
self.train_actor(replay_data=replay_data, tau_actor=self.tau, tau_critic=self.tau)
|
||||
|
||||
# Get the params back in the population
|
||||
self.es_params[i] = self.actor.parameters_to_vector()
|
||||
|
|
@ -132,13 +135,18 @@ class CEMRL(TD3):
|
|||
|
||||
self.actor.load_from_vector(params)
|
||||
|
||||
episode_reward, episode_timesteps = self.collect_rollouts(self.env, n_episodes=1,
|
||||
action_noise_std=self.action_noise_std,
|
||||
deterministic=False, callback=None,
|
||||
learning_starts=self.learning_starts,
|
||||
num_timesteps=self.num_timesteps,
|
||||
replay_buffer=self.replay_buffer)
|
||||
episode_num += 1
|
||||
rollout = self.collect_rollouts(self.env, n_episodes=self.n_episodes_rollout,
|
||||
n_steps=-1, action_noise_std=self.action_noise_std,
|
||||
deterministic=False, callback=None,
|
||||
learning_starts=self.learning_starts,
|
||||
num_timesteps=self.num_timesteps,
|
||||
replay_buffer=self.replay_buffer,
|
||||
obs=obs)
|
||||
|
||||
# Unpack
|
||||
episode_reward, episode_timesteps, n_episodes, obs = rollout
|
||||
|
||||
episode_num += n_episodes
|
||||
self.num_timesteps += episode_timesteps
|
||||
timesteps_since_eval += episode_timesteps
|
||||
actor_steps += episode_timesteps
|
||||
|
|
|
|||
|
|
@ -215,20 +215,23 @@ class BaseRLModel(object):
|
|||
if self.eval_env is not None:
|
||||
self.eval_env.seed(seed)
|
||||
|
||||
def collect_rollouts(self, env, n_episodes=1, action_noise_std=0.0,
|
||||
def collect_rollouts(self, env, n_episodes=1, n_steps=-1, action_noise_std=0.0,
|
||||
deterministic=False, callback=None,
|
||||
learning_starts=0, num_timesteps=0, replay_buffer=None):
|
||||
learning_starts=0, num_timesteps=0,
|
||||
replay_buffer=None, obs=None):
|
||||
|
||||
episode_rewards = []
|
||||
total_timesteps = []
|
||||
total_steps, total_episodes = 0, 0
|
||||
assert isinstance(env, VecEnv)
|
||||
assert env.num_envs == 1
|
||||
|
||||
for _ in range(n_episodes):
|
||||
while total_steps < n_steps or total_episodes < n_episodes:
|
||||
done = False
|
||||
# Reset environment
|
||||
obs = env.reset()
|
||||
# Reset environment: not needed for VecEnv
|
||||
# obs = env.reset()
|
||||
episode_reward, episode_timesteps = 0.0, 0
|
||||
|
||||
while not done:
|
||||
# Select action randomly or according to policy
|
||||
if num_timesteps < learning_starts:
|
||||
|
|
@ -242,6 +245,7 @@ class BaseRLModel(object):
|
|||
action = (action + action_noise).clip(-1, 1)
|
||||
|
||||
# Rescale and perform action
|
||||
# TODO: better rescale
|
||||
new_obs, reward, done, _ = env.step(self.max_action * action)
|
||||
|
||||
done_bool = [float(done[0])]
|
||||
|
|
@ -255,8 +259,15 @@ class BaseRLModel(object):
|
|||
|
||||
num_timesteps += 1
|
||||
episode_timesteps += 1
|
||||
total_steps += 1
|
||||
if n_steps > 0 and total_steps >= n_steps:
|
||||
break
|
||||
|
||||
episode_rewards.append(episode_reward)
|
||||
total_timesteps.append(episode_timesteps)
|
||||
if done:
|
||||
total_episodes += 1
|
||||
episode_rewards.append(episode_reward)
|
||||
total_timesteps.append(episode_timesteps)
|
||||
|
||||
return np.mean(episode_rewards), np.sum(total_timesteps)
|
||||
mean_reward = np.mean(episode_rewards) if total_episodes > 0 else 0.0
|
||||
|
||||
return mean_reward, total_steps, total_episodes, obs
|
||||
|
|
|
|||
|
|
@ -16,6 +16,12 @@ class BasePolicy(nn.Module):
|
|||
self.action_space = action_space
|
||||
self.device = device
|
||||
|
||||
@staticmethod
|
||||
def init_weights(module, gain=1):
|
||||
if type(module) == nn.Linear:
|
||||
nn.init.orthogonal_(module.weight, gain=gain)
|
||||
module.bias.data.fill_(0.0)
|
||||
|
||||
def forward(self, *_args, **kwargs):
|
||||
raise NotImplementedError()
|
||||
|
||||
|
|
|
|||
|
|
@ -33,12 +33,6 @@ class PPOPolicy(BasePolicy):
|
|||
# self.action_dist = SquashedDiagGaussianDistribution(self.action_dim)
|
||||
self._build(learning_rate)
|
||||
|
||||
@staticmethod
|
||||
def init_weights(module, gain=1):
|
||||
if type(module) == nn.Linear:
|
||||
nn.init.orthogonal_(module.weight, gain=gain)
|
||||
module.bias.data.fill_(0.0)
|
||||
|
||||
def _build(self, learning_rate):
|
||||
# TODO: support shared network
|
||||
# shared_net = create_mlp(self.obs_dim, output_dim=-1, net_arch=self.net_arch, activation_fn=self.activation_fn)
|
||||
|
|
|
|||
|
|
@ -116,10 +116,9 @@ class PPO(BaseRLModel):
|
|||
|
||||
return obs
|
||||
|
||||
def train(self, n_iterations, batch_size=64):
|
||||
def train(self, gradient_steps, batch_size=64):
|
||||
|
||||
# TODO: replace with iterator?
|
||||
for it in range(n_iterations):
|
||||
for gradient_step in range(gradient_steps):
|
||||
approx_kl_divs = []
|
||||
# Sample replay buffer
|
||||
for replay_data in self.rollout_buffer.get(batch_size):
|
||||
|
|
|
|||
|
|
@ -35,9 +35,9 @@ class SAC(BaseRLModel):
|
|||
:param ent_coef: (str or float) Entropy regularization coefficient. (Equivalent to
|
||||
inverse of reward scale in the original SAC paper.) Controlling exploration/exploitation trade-off.
|
||||
Set it to 'auto' to learn it automatically (and 'auto_0.1' for using 0.1 as initial value)
|
||||
:param train_freq: (int) Update the model every `train_freq` steps.
|
||||
:param learning_starts: (int) how many steps of the model to collect transitions for before learning starts
|
||||
:param target_update_interval: (int) update the target network every `target_update_interval` steps.
|
||||
:param train_freq: (int) Update the model every `train_freq` steps.
|
||||
:param gradient_steps: (int) How many gradient updates to perform after each step
|
||||
:param target_entropy: (str or float) target entropy when learning ent_coef (ent_coef = 'auto')
|
||||
:param action_noise: (ActionNoise) the action noise type (None by default), this can help
|
||||
|
|
@ -51,9 +51,10 @@ class SAC(BaseRLModel):
|
|||
:param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance
|
||||
"""
|
||||
def __init__(self, policy, env, learning_rate=3e-4, buffer_size=int(1e6),
|
||||
learning_starts=100, train_freq=1, batch_size=64,
|
||||
learning_starts=100, batch_size=64,
|
||||
tau=0.005, ent_coef='auto', target_update_interval=1,
|
||||
gradient_steps=1, target_entropy='auto', action_noise=None,
|
||||
train_freq=1, gradient_steps=1, n_episodes_rollout=-1,
|
||||
target_entropy='auto', action_noise=None,
|
||||
gamma=0.99, action_noise_std=0.0, create_eval_env=False,
|
||||
policy_kwargs=None, verbose=0, seed=0, device='auto',
|
||||
_init_setup_model=True):
|
||||
|
|
@ -67,8 +68,7 @@ class SAC(BaseRLModel):
|
|||
self.seed = seed
|
||||
self.target_entropy = target_entropy
|
||||
self.log_ent_coef = None
|
||||
# self.target_update_interval = target_update_interval
|
||||
# self.gradient_steps = gradient_steps
|
||||
self.target_update_interval = target_update_interval
|
||||
self.buffer_size = buffer_size
|
||||
# In the original paper, same learning rate is used for all networks
|
||||
self.learning_rate = learning_rate
|
||||
|
|
@ -79,8 +79,9 @@ class SAC(BaseRLModel):
|
|||
# Inverse of the reward scale
|
||||
self.ent_coef = ent_coef
|
||||
self.target_update_interval = target_update_interval
|
||||
# self.train_freq = train_freq
|
||||
# self.gradient_steps = gradient_steps
|
||||
self.train_freq = train_freq
|
||||
self.gradient_steps = gradient_steps
|
||||
self.n_episodes_rollout = n_episodes_rollout
|
||||
# self.action_noise = action_noise
|
||||
self.gamma = gamma
|
||||
|
||||
|
|
@ -154,9 +155,9 @@ class SAC(BaseRLModel):
|
|||
"""
|
||||
return self.max_action * self.select_action(observation)
|
||||
|
||||
def train(self, n_iterations, batch_size=64):
|
||||
def train(self, gradient_steps, batch_size=64):
|
||||
|
||||
for it in range(n_iterations):
|
||||
for gradient_step in range(gradient_steps):
|
||||
|
||||
# Sample replay buffer
|
||||
replay_data = self.replay_buffer.sample(batch_size)
|
||||
|
|
@ -211,17 +212,19 @@ class SAC(BaseRLModel):
|
|||
self.actor.optimizer.step()
|
||||
|
||||
# Update target networks
|
||||
for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
|
||||
target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
|
||||
if gradient_step % self.target_update_interval == 0:
|
||||
for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
|
||||
target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
|
||||
|
||||
def learn(self, total_timesteps, callback=None, log_interval=100,
|
||||
eval_env=None, eval_freq=-1, n_eval_episodes=5, tb_log_name="TD3", reset_num_timesteps=True):
|
||||
eval_env=None, eval_freq=-1, n_eval_episodes=5, tb_log_name="SAC", reset_num_timesteps=True):
|
||||
|
||||
timesteps_since_eval = 0
|
||||
episode_num = 0
|
||||
evaluations = []
|
||||
start_time = time.time()
|
||||
eval_env = self._get_eval_env(eval_env)
|
||||
obs = self.env.reset()
|
||||
|
||||
while self.num_timesteps < total_timesteps:
|
||||
|
||||
|
|
@ -230,21 +233,27 @@ class SAC(BaseRLModel):
|
|||
if callback(locals(), globals()) is False:
|
||||
break
|
||||
|
||||
episode_reward, episode_timesteps = self.collect_rollouts(self.env, n_episodes=1,
|
||||
action_noise_std=self.action_noise_std,
|
||||
deterministic=False, callback=None,
|
||||
learning_starts=self.learning_starts,
|
||||
num_timesteps=self.num_timesteps,
|
||||
replay_buffer=self.replay_buffer)
|
||||
episode_num += 1
|
||||
rollout = self.collect_rollouts(self.env, n_episodes=self.n_episodes_rollout,
|
||||
n_steps=self.train_freq, action_noise_std=self.action_noise_std,
|
||||
deterministic=False, callback=None,
|
||||
learning_starts=self.learning_starts,
|
||||
num_timesteps=self.num_timesteps,
|
||||
replay_buffer=self.replay_buffer,
|
||||
obs=obs)
|
||||
# Unpack
|
||||
episode_reward, episode_timesteps, n_episodes, obs = rollout
|
||||
|
||||
self.num_timesteps += episode_timesteps
|
||||
episode_num += n_episodes
|
||||
timesteps_since_eval += episode_timesteps
|
||||
|
||||
if self.num_timesteps > 0:
|
||||
if self.verbose > 1:
|
||||
print("Total T: {} Episode Num: {} Episode T: {} Reward: {}".format(
|
||||
self.num_timesteps, episode_num, episode_timesteps, episode_reward))
|
||||
self.train(episode_timesteps, batch_size=self.batch_size)
|
||||
gradient_steps = self.gradient_steps if self.gradient_steps > 0 else episode_timesteps
|
||||
|
||||
self.train(gradient_steps, batch_size=self.batch_size)
|
||||
|
||||
# Evaluate episode
|
||||
if 0 < eval_freq <= timesteps_since_eval and eval_env is not None:
|
||||
|
|
|
|||
|
|
@ -46,7 +46,8 @@ class TD3(BaseRLModel):
|
|||
"""
|
||||
def __init__(self, policy, env, buffer_size=int(1e6), learning_rate=1e-3,
|
||||
action_noise_std=0.1, policy_delay=2, learning_starts=100,
|
||||
gamma=0.99, batch_size=100, train_freq=1000, gradient_steps=1000,
|
||||
gamma=0.99, batch_size=100,
|
||||
train_freq=-1, gradient_steps=-1, n_episodes_rollout=1,
|
||||
tau=0.005, action_noise=None, target_policy_noise=0.2, target_noise_clip=0.5,
|
||||
create_eval_env=False, policy_kwargs=None, verbose=0,
|
||||
seed=0, device='auto', _init_setup_model=True):
|
||||
|
|
@ -63,10 +64,11 @@ class TD3(BaseRLModel):
|
|||
# TODO: accept callables
|
||||
self.learning_rate = learning_rate
|
||||
self.learning_starts = learning_starts
|
||||
# self.train_freq = train_freq
|
||||
# self.gradient_steps = gradient_steps
|
||||
self.train_freq = train_freq
|
||||
self.gradient_steps = gradient_steps
|
||||
self.n_episodes_rollout = n_episodes_rollout
|
||||
self.batch_size = batch_size
|
||||
# self.tau = tau
|
||||
self.tau = tau
|
||||
self.gamma = gamma
|
||||
# self.action_noise = action_noise
|
||||
self.policy_delay = policy_delay
|
||||
|
|
@ -109,12 +111,13 @@ class TD3(BaseRLModel):
|
|||
:return: (np.ndarray, np.ndarray) the model's action and the next state (used in recurrent policies)
|
||||
"""
|
||||
# Rescale the action (no need for symmetric action space)
|
||||
return self.action_space.low +\
|
||||
(0.5 * (self.select_action(observation) + 1.0) * (self.action_space.high - self.action_space.low))
|
||||
# return self.action_space.low +\
|
||||
# (0.5 * (self.select_action(observation) + 1.0) * (self.action_space.high - self.action_space.low))
|
||||
return self.max_action * self.select_action(observation)
|
||||
|
||||
def train_critic(self, n_iterations=1, batch_size=100, replay_data=None, tau=0.0):
|
||||
def train_critic(self, gradient_steps=1, batch_size=100, replay_data=None, tau=0.0):
|
||||
|
||||
for it in range(n_iterations):
|
||||
for gradient_step in range(gradient_steps):
|
||||
# Sample replay buffer
|
||||
if replay_data is None:
|
||||
obs, action, next_obs, done, reward = self.replay_buffer.sample(batch_size)
|
||||
|
|
@ -149,9 +152,9 @@ class TD3(BaseRLModel):
|
|||
for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
|
||||
target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
|
||||
|
||||
def train_actor(self, n_iterations=1, batch_size=100, tau_actor=0.005, tau_critic=0.005, replay_data=None):
|
||||
def train_actor(self, gradient_steps=1, batch_size=100, tau_actor=0.005, tau_critic=0.005, replay_data=None):
|
||||
|
||||
for it in range(n_iterations):
|
||||
for gradient_step in range(gradient_steps):
|
||||
# Sample replay buffer
|
||||
if replay_data is None:
|
||||
obs, _, next_obs, done, reward = self.replay_buffer.sample(batch_size)
|
||||
|
|
@ -174,17 +177,17 @@ class TD3(BaseRLModel):
|
|||
for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
|
||||
target_param.data.copy_(tau_actor * param.data + (1 - tau_actor) * target_param.data)
|
||||
|
||||
def train(self, n_iterations, batch_size=100, policy_delay=2):
|
||||
def train(self, gradient_steps, batch_size=100, policy_delay=2):
|
||||
|
||||
for it in range(n_iterations):
|
||||
for gradient_step in range(gradient_steps):
|
||||
|
||||
# Sample replay buffer
|
||||
replay_data = self.replay_buffer.sample(batch_size)
|
||||
self.train_critic(replay_data=replay_data)
|
||||
|
||||
# Delayed policy updates
|
||||
if it % policy_delay == 0:
|
||||
self.train_actor(replay_data=replay_data)
|
||||
if gradient_step % policy_delay == 0:
|
||||
self.train_actor(replay_data=replay_data, tau_actor=self.tau, tau_critic=self.tau)
|
||||
|
||||
def learn(self, total_timesteps, callback=None, log_interval=100,
|
||||
eval_env=None, eval_freq=-1, n_eval_episodes=5, tb_log_name="TD3", reset_num_timesteps=True):
|
||||
|
|
@ -194,6 +197,7 @@ class TD3(BaseRLModel):
|
|||
evaluations = []
|
||||
start_time = time.time()
|
||||
eval_env = self._get_eval_env(eval_env)
|
||||
obs = self.env.reset()
|
||||
|
||||
while self.num_timesteps < total_timesteps:
|
||||
|
||||
|
|
@ -202,13 +206,17 @@ class TD3(BaseRLModel):
|
|||
if callback(locals(), globals()) is False:
|
||||
break
|
||||
|
||||
episode_reward, episode_timesteps = self.collect_rollouts(self.env, n_episodes=1,
|
||||
action_noise_std=self.action_noise_std,
|
||||
deterministic=False, callback=None,
|
||||
learning_starts=self.learning_starts,
|
||||
num_timesteps=self.num_timesteps,
|
||||
replay_buffer=self.replay_buffer)
|
||||
episode_num += 1
|
||||
rollout = self.collect_rollouts(self.env, n_episodes=self.n_episodes_rollout,
|
||||
n_steps=self.train_freq, action_noise_std=self.action_noise_std,
|
||||
deterministic=False, callback=None,
|
||||
learning_starts=self.learning_starts,
|
||||
num_timesteps=self.num_timesteps,
|
||||
replay_buffer=self.replay_buffer,
|
||||
obs=obs)
|
||||
# Unpack
|
||||
episode_reward, episode_timesteps, n_episodes, obs = rollout
|
||||
|
||||
episode_num += n_episodes
|
||||
self.num_timesteps += episode_timesteps
|
||||
timesteps_since_eval += episode_timesteps
|
||||
|
||||
|
|
@ -216,7 +224,9 @@ class TD3(BaseRLModel):
|
|||
if self.verbose > 1:
|
||||
print("Total T: {} Episode Num: {} Episode T: {} Reward: {}".format(
|
||||
self.num_timesteps, episode_num, episode_timesteps, episode_reward))
|
||||
self.train(episode_timesteps, batch_size=self.batch_size, policy_delay=self.policy_delay)
|
||||
|
||||
gradient_steps = self.gradient_steps if self.gradient_steps > 0 else episode_timesteps
|
||||
self.train(gradient_steps, batch_size=self.batch_size, policy_delay=self.policy_delay)
|
||||
|
||||
# Evaluate episode
|
||||
if 0 < eval_freq <= timesteps_since_eval and eval_env is not None:
|
||||
|
|
|
|||
Loading…
Reference in a new issue