diff --git a/README.md b/README.md index 64dfbb9..3388c3b 100644 --- a/README.md +++ b/README.md @@ -8,13 +8,13 @@ PyTorch version of [Stable Baselines](https://github.com/hill-a/stable-baselines TODO: - save/load -- automatic choice for action distribution - predict - better rescale (min + action * range) - documentation - flexible mlp - logger - better monitor wrapper? +- automatic choice for action distribution Later: - get_parameters / set_parameters diff --git a/torchy_baselines/cem_rl/cem_rl.py b/torchy_baselines/cem_rl/cem_rl.py index 4008e8a..93453eb 100644 --- a/torchy_baselines/cem_rl/cem_rl.py +++ b/torchy_baselines/cem_rl/cem_rl.py @@ -19,13 +19,15 @@ class CEMRL(TD3): sigma_init=1e-3, pop_size=10, damp=1e-3, damp_limit=1e-5, elitism=False, n_grad=5, policy_delay=2, batch_size=100, buffer_size=int(1e6), learning_rate=1e-3, seed=0, device='auto', - action_noise_std=0.0, learning_starts=100, update_style='original', + action_noise_std=0.0, learning_starts=100, tau=0.005, + n_episodes_rollout=1, update_style='original', create_eval_env=False, _init_setup_model=True): super(CEMRL, self).__init__(policy, env, buffer_size=buffer_size, learning_rate=learning_rate, seed=seed, device=device, action_noise_std=action_noise_std, learning_starts=learning_starts, + n_episodes_rollout=n_episodes_rollout, tau=tau, policy_kwargs=policy_kwargs, verbose=verbose, policy_delay=policy_delay, batch_size=batch_size, create_eval_env=create_eval_env, @@ -61,6 +63,7 @@ class CEMRL(TD3): evaluations = [] start_time = time.time() eval_env = self._get_eval_env(eval_env) + obs = self.env.reset() while self.num_timesteps < total_timesteps: @@ -88,11 +91,11 @@ class CEMRL(TD3): # instead of the train_actor() and no policy delay # Issue with this update style: the bigger the population, the slower the code if self.update_style == 'original': - self.train_critic(actor_steps // self.n_grad, tau=0.005) - self.train_actor(actor_steps, tau_critic=0.0) + self.train_critic(actor_steps // self.n_grad, tau=self.tau) + self.train_actor(actor_steps, tau_actor=self.tau, tau_critic=0.0) elif self.update_style == 'original_td3': self.train_critic(actor_steps // self.n_grad, tau=0.0) - self.train_actor(actor_steps) + self.train_actor(actor_steps, tau_actor=self.tau, tau_critic=self.tau) else: # Closer to td3: with policy delay if self.update_style == 'td3_like': @@ -108,7 +111,7 @@ class CEMRL(TD3): # Delayed policy updates if it % self.policy_delay == 0: - self.train_actor(replay_data=replay_data) + self.train_actor(replay_data=replay_data, tau_actor=self.tau, tau_critic=self.tau) # Get the params back in the population self.es_params[i] = self.actor.parameters_to_vector() @@ -132,13 +135,18 @@ class CEMRL(TD3): self.actor.load_from_vector(params) - episode_reward, episode_timesteps = self.collect_rollouts(self.env, n_episodes=1, - action_noise_std=self.action_noise_std, - deterministic=False, callback=None, - learning_starts=self.learning_starts, - num_timesteps=self.num_timesteps, - replay_buffer=self.replay_buffer) - episode_num += 1 + rollout = self.collect_rollouts(self.env, n_episodes=self.n_episodes_rollout, + n_steps=-1, action_noise_std=self.action_noise_std, + deterministic=False, callback=None, + learning_starts=self.learning_starts, + num_timesteps=self.num_timesteps, + replay_buffer=self.replay_buffer, + obs=obs) + + # Unpack + episode_reward, episode_timesteps, n_episodes, obs = rollout + + episode_num += n_episodes self.num_timesteps += episode_timesteps timesteps_since_eval += episode_timesteps actor_steps += episode_timesteps diff --git a/torchy_baselines/common/base_class.py b/torchy_baselines/common/base_class.py index 00d8093..c9c0ded 100644 --- a/torchy_baselines/common/base_class.py +++ b/torchy_baselines/common/base_class.py @@ -215,20 +215,23 @@ class BaseRLModel(object): if self.eval_env is not None: self.eval_env.seed(seed) - def collect_rollouts(self, env, n_episodes=1, action_noise_std=0.0, + def collect_rollouts(self, env, n_episodes=1, n_steps=-1, action_noise_std=0.0, deterministic=False, callback=None, - learning_starts=0, num_timesteps=0, replay_buffer=None): + learning_starts=0, num_timesteps=0, + replay_buffer=None, obs=None): episode_rewards = [] total_timesteps = [] + total_steps, total_episodes = 0, 0 assert isinstance(env, VecEnv) assert env.num_envs == 1 - for _ in range(n_episodes): + while total_steps < n_steps or total_episodes < n_episodes: done = False - # Reset environment - obs = env.reset() + # Reset environment: not needed for VecEnv + # obs = env.reset() episode_reward, episode_timesteps = 0.0, 0 + while not done: # Select action randomly or according to policy if num_timesteps < learning_starts: @@ -242,6 +245,7 @@ class BaseRLModel(object): action = (action + action_noise).clip(-1, 1) # Rescale and perform action + # TODO: better rescale new_obs, reward, done, _ = env.step(self.max_action * action) done_bool = [float(done[0])] @@ -255,8 +259,15 @@ class BaseRLModel(object): num_timesteps += 1 episode_timesteps += 1 + total_steps += 1 + if n_steps > 0 and total_steps >= n_steps: + break - episode_rewards.append(episode_reward) - total_timesteps.append(episode_timesteps) + if done: + total_episodes += 1 + episode_rewards.append(episode_reward) + total_timesteps.append(episode_timesteps) - return np.mean(episode_rewards), np.sum(total_timesteps) + mean_reward = np.mean(episode_rewards) if total_episodes > 0 else 0.0 + + return mean_reward, total_steps, total_episodes, obs diff --git a/torchy_baselines/common/policies.py b/torchy_baselines/common/policies.py index c1ab00f..38922ba 100644 --- a/torchy_baselines/common/policies.py +++ b/torchy_baselines/common/policies.py @@ -16,6 +16,12 @@ class BasePolicy(nn.Module): self.action_space = action_space self.device = device + @staticmethod + def init_weights(module, gain=1): + if type(module) == nn.Linear: + nn.init.orthogonal_(module.weight, gain=gain) + module.bias.data.fill_(0.0) + def forward(self, *_args, **kwargs): raise NotImplementedError() diff --git a/torchy_baselines/ppo/policies.py b/torchy_baselines/ppo/policies.py index e773251..73ccf2b 100644 --- a/torchy_baselines/ppo/policies.py +++ b/torchy_baselines/ppo/policies.py @@ -33,12 +33,6 @@ class PPOPolicy(BasePolicy): # self.action_dist = SquashedDiagGaussianDistribution(self.action_dim) self._build(learning_rate) - @staticmethod - def init_weights(module, gain=1): - if type(module) == nn.Linear: - nn.init.orthogonal_(module.weight, gain=gain) - module.bias.data.fill_(0.0) - def _build(self, learning_rate): # TODO: support shared network # shared_net = create_mlp(self.obs_dim, output_dim=-1, net_arch=self.net_arch, activation_fn=self.activation_fn) diff --git a/torchy_baselines/ppo/ppo.py b/torchy_baselines/ppo/ppo.py index 5ef0793..2dc738c 100644 --- a/torchy_baselines/ppo/ppo.py +++ b/torchy_baselines/ppo/ppo.py @@ -116,10 +116,9 @@ class PPO(BaseRLModel): return obs - def train(self, n_iterations, batch_size=64): + def train(self, gradient_steps, batch_size=64): - # TODO: replace with iterator? - for it in range(n_iterations): + for gradient_step in range(gradient_steps): approx_kl_divs = [] # Sample replay buffer for replay_data in self.rollout_buffer.get(batch_size): diff --git a/torchy_baselines/sac/sac.py b/torchy_baselines/sac/sac.py index 1465e3a..66ca400 100644 --- a/torchy_baselines/sac/sac.py +++ b/torchy_baselines/sac/sac.py @@ -35,9 +35,9 @@ class SAC(BaseRLModel): :param ent_coef: (str or float) Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) Controlling exploration/exploitation trade-off. Set it to 'auto' to learn it automatically (and 'auto_0.1' for using 0.1 as initial value) - :param train_freq: (int) Update the model every `train_freq` steps. :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts :param target_update_interval: (int) update the target network every `target_network_update_freq` steps. + :param train_freq: (int) Update the model every `train_freq` steps. :param gradient_steps: (int) How many gradient update after each step :param target_entropy: (str or float) target entropy when learning ent_coef (ent_coef = 'auto') :param action_noise: (ActionNoise) the action noise type (None by default), this can help @@ -51,9 +51,10 @@ class SAC(BaseRLModel): :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance """ def __init__(self, policy, env, learning_rate=3e-4, buffer_size=int(1e6), - learning_starts=100, train_freq=1, batch_size=64, + learning_starts=100, batch_size=64, tau=0.005, ent_coef='auto', target_update_interval=1, - gradient_steps=1, target_entropy='auto', action_noise=None, + train_freq=1, gradient_steps=1, n_episodes_rollout=-1, + target_entropy='auto', action_noise=None, gamma=0.99, action_noise_std=0.0, create_eval_env=False, policy_kwargs=None, verbose=0, seed=0, device='auto', _init_setup_model=True): @@ -67,8 +68,7 @@ class SAC(BaseRLModel): self.seed = seed self.target_entropy = target_entropy self.log_ent_coef = None - # self.target_update_interval = target_update_interval - # self.gradient_steps = gradient_steps + self.target_update_interval = target_update_interval self.buffer_size = buffer_size # In the original paper, same learning rate is used for all networks self.learning_rate = learning_rate @@ -79,8 +79,9 @@ class SAC(BaseRLModel): # Inverse of the reward scale self.ent_coef = ent_coef self.target_update_interval = target_update_interval - # self.train_freq = train_freq - # self.gradient_steps = gradient_steps + self.train_freq = train_freq + self.gradient_steps = gradient_steps + self.n_episodes_rollout = n_episodes_rollout # self.action_noise = action_noise self.gamma = gamma @@ -154,9 +155,9 @@ class SAC(BaseRLModel): """ return self.max_action * self.select_action(observation) - def train(self, n_iterations, batch_size=64): + def train(self, gradient_steps, batch_size=64): - for it in range(n_iterations): + for gradient_step in range(gradient_steps): # Sample replay buffer replay_data = self.replay_buffer.sample(batch_size) @@ -211,17 +212,19 @@ class SAC(BaseRLModel): self.actor.optimizer.step() # Update target networks - for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()): - target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data) + if gradient_step % self.target_update_interval == 0: + for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()): + target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data) def learn(self, total_timesteps, callback=None, log_interval=100, - eval_env=None, eval_freq=-1, n_eval_episodes=5, tb_log_name="TD3", reset_num_timesteps=True): + eval_env=None, eval_freq=-1, n_eval_episodes=5, tb_log_name="SAC", reset_num_timesteps=True): timesteps_since_eval = 0 episode_num = 0 evaluations = [] start_time = time.time() eval_env = self._get_eval_env(eval_env) + obs = self.env.reset() while self.num_timesteps < total_timesteps: @@ -230,21 +233,27 @@ class SAC(BaseRLModel): if callback(locals(), globals()) is False: break - episode_reward, episode_timesteps = self.collect_rollouts(self.env, n_episodes=1, - action_noise_std=self.action_noise_std, - deterministic=False, callback=None, - learning_starts=self.learning_starts, - num_timesteps=self.num_timesteps, - replay_buffer=self.replay_buffer) - episode_num += 1 + rollout = self.collect_rollouts(self.env, n_episodes=self.n_episodes_rollout, + n_steps=self.train_freq, action_noise_std=self.action_noise_std, + deterministic=False, callback=None, + learning_starts=self.learning_starts, + num_timesteps=self.num_timesteps, + replay_buffer=self.replay_buffer, + obs=obs) + # Unpack + episode_reward, episode_timesteps, n_episodes, obs = rollout + self.num_timesteps += episode_timesteps + episode_num += n_episodes timesteps_since_eval += episode_timesteps if self.num_timesteps > 0: if self.verbose > 1: print("Total T: {} Episode Num: {} Episode T: {} Reward: {}".format( self.num_timesteps, episode_num, episode_timesteps, episode_reward)) - self.train(episode_timesteps, batch_size=self.batch_size) + gradient_steps = self.gradient_steps if self.gradient_steps > 0 else episode_timesteps + + self.train(gradient_steps, batch_size=self.batch_size) # Evaluate episode if 0 < eval_freq <= timesteps_since_eval and eval_env is not None: diff --git a/torchy_baselines/td3/td3.py b/torchy_baselines/td3/td3.py index 34f1051..4a8b69c 100644 --- a/torchy_baselines/td3/td3.py +++ b/torchy_baselines/td3/td3.py @@ -46,7 +46,8 @@ class TD3(BaseRLModel): """ def __init__(self, policy, env, buffer_size=int(1e6), learning_rate=1e-3, action_noise_std=0.1, policy_delay=2, learning_starts=100, - gamma=0.99, batch_size=100, train_freq=1000, gradient_steps=1000, + gamma=0.99, batch_size=100, + train_freq=-1, gradient_steps=-1, n_episodes_rollout=1, tau=0.005, action_noise=None, target_policy_noise=0.2, target_noise_clip=0.5, create_eval_env=False, policy_kwargs=None, verbose=0, seed=0, device='auto', _init_setup_model=True): @@ -63,10 +64,11 @@ class TD3(BaseRLModel): # TODO: accept callables self.learning_rate = learning_rate self.learning_starts = learning_starts - # self.train_freq = train_freq - # self.gradient_steps = gradient_steps + self.train_freq = train_freq + self.gradient_steps = gradient_steps + self.n_episodes_rollout = n_episodes_rollout self.batch_size = batch_size - # self.tau = tau + self.tau = tau self.gamma = gamma # self.action_noise = action_noise self.policy_delay = policy_delay @@ -109,12 +111,13 @@ class TD3(BaseRLModel): :return: (np.ndarray, np.ndarray) the model's action and the next state (used in recurrent policies) """ # Rescale the action (no need for symmetric action space) - return self.action_space.low +\ - (0.5 * (self.select_action(observation) + 1.0) * (self.action_space.high - self.action_space.low)) + # return self.action_space.low +\ + # (0.5 * (self.select_action(observation) + 1.0) * (self.action_space.high - self.action_space.low)) + return self.max_action * self.select_action(observation) - def train_critic(self, n_iterations=1, batch_size=100, replay_data=None, tau=0.0): + def train_critic(self, gradient_steps=1, batch_size=100, replay_data=None, tau=0.0): - for it in range(n_iterations): + for gradient_step in range(gradient_steps): # Sample replay buffer if replay_data is None: obs, action, next_obs, done, reward = self.replay_buffer.sample(batch_size) @@ -149,9 +152,9 @@ class TD3(BaseRLModel): for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()): target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data) - def train_actor(self, n_iterations=1, batch_size=100, tau_actor=0.005, tau_critic=0.005, replay_data=None): + def train_actor(self, gradient_steps=1, batch_size=100, tau_actor=0.005, tau_critic=0.005, replay_data=None): - for it in range(n_iterations): + for gradient_step in range(gradient_steps): # Sample replay buffer if replay_data is None: obs, _, next_obs, done, reward = self.replay_buffer.sample(batch_size) @@ -174,17 +177,17 @@ class TD3(BaseRLModel): for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()): target_param.data.copy_(tau_actor * param.data + (1 - tau_actor) * target_param.data) - def train(self, n_iterations, batch_size=100, policy_delay=2): + def train(self, gradient_steps, batch_size=100, policy_delay=2): - for it in range(n_iterations): + for gradient_step in range(gradient_steps): # Sample replay buffer replay_data = self.replay_buffer.sample(batch_size) self.train_critic(replay_data=replay_data) # Delayed policy updates - if it % policy_delay == 0: - self.train_actor(replay_data=replay_data) + if gradient_step % policy_delay == 0: + self.train_actor(replay_data=replay_data, tau_actor=self.tau, tau_critic=self.tau) def learn(self, total_timesteps, callback=None, log_interval=100, eval_env=None, eval_freq=-1, n_eval_episodes=5, tb_log_name="TD3", reset_num_timesteps=True): @@ -194,6 +197,7 @@ class TD3(BaseRLModel): evaluations = [] start_time = time.time() eval_env = self._get_eval_env(eval_env) + obs = self.env.reset() while self.num_timesteps < total_timesteps: @@ -202,13 +206,17 @@ class TD3(BaseRLModel): if callback(locals(), globals()) is False: break - episode_reward, episode_timesteps = self.collect_rollouts(self.env, n_episodes=1, - action_noise_std=self.action_noise_std, - deterministic=False, callback=None, - learning_starts=self.learning_starts, - num_timesteps=self.num_timesteps, - replay_buffer=self.replay_buffer) - episode_num += 1 + rollout = self.collect_rollouts(self.env, n_episodes=self.n_episodes_rollout, + n_steps=self.train_freq, action_noise_std=self.action_noise_std, + deterministic=False, callback=None, + learning_starts=self.learning_starts, + num_timesteps=self.num_timesteps, + replay_buffer=self.replay_buffer, + obs=obs) + # Unpack + episode_reward, episode_timesteps, n_episodes, obs = rollout + + episode_num += n_episodes self.num_timesteps += episode_timesteps timesteps_since_eval += episode_timesteps @@ -216,7 +224,9 @@ class TD3(BaseRLModel): if self.verbose > 1: print("Total T: {} Episode Num: {} Episode T: {} Reward: {}".format( self.num_timesteps, episode_num, episode_timesteps, episode_reward)) - self.train(episode_timesteps, batch_size=self.batch_size, policy_delay=self.policy_delay) + + gradient_steps = self.gradient_steps if self.gradient_steps > 0 else episode_timesteps + self.train(gradient_steps, batch_size=self.batch_size, policy_delay=self.policy_delay) # Evaluate episode if 0 < eval_freq <= timesteps_since_eval and eval_env is not None: