diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index 98d1a87..56cf18f 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -35,6 +35,24 @@ Others: Documentation: ^^^^^^^^^^^^^^ +Pre-Release 0.5.0a0 (WIP) +------------------------------ + +Breaking Changes: +^^^^^^^^^^^^^^^^^ + +New Features: +^^^^^^^^^^^^^ + +Bug Fixes: +^^^^^^^^^^ +- Fixed ``reset_num_timesteps`` behavior, so ``env.reset()`` is not called if ``reset_num_timesteps=True`` + +Others: +^^^^^^^ +- Cleanup rollout return + + Pre-Release 0.3.0 (2020-02-14) ------------------------------ @@ -57,9 +75,6 @@ Bug Fixes: - Fixed colors in ``results_plotter`` - Fix entropy computation (now summed over action dim) -Deprecations: -^^^^^^^^^^^^^ - Others: ^^^^^^^ - SAC with SDE now sample only one matrix @@ -106,9 +121,6 @@ Bug Fixes: - Fix entropy computation for squashed Gaussian (approximate it now) - Fix seeding when using multiple environments (different seed per env) -Deprecations: -^^^^^^^^^^^^^ - Others: ^^^^^^^ - Add type check @@ -125,25 +137,11 @@ Pre-Release 0.1.0 (2020-01-20) ------------------------------ **First Release: base algorithms and state-dependent exploration** -Breaking Changes: -^^^^^^^^^^^^^^^^^ - New Features: ^^^^^^^^^^^^^ - Initial release of A2C, CEM-RL, PPO, SAC and TD3, working only with ``Box`` input space - State-Dependent Exploration (SDE) for A2C, PPO, SAC and TD3 -Bug Fixes: -^^^^^^^^^^ - -Deprecations: -^^^^^^^^^^^^^ - -Others: -^^^^^^^ - -Documentation: -^^^^^^^^^^^^^^ Maintainers diff --git a/torchy_baselines/common/base_class.py b/torchy_baselines/common/base_class.py index 3c36d49..fe8f97e 100644 --- a/torchy_baselines/common/base_class.py +++ b/torchy_baselines/common/base_class.py @@ -93,7 +93,11 @@ class BaseRLModel(ABC): self.start_time = None self.policy = None self.learning_rate = learning_rate - self.lr_schedule = None # type: Optional[Callable] + self.lr_schedule = None # type: Optional[Callable] + self._last_obs = None # type: Optional[np.ndarray] + # When using VecNormalize: + self._last_original_obs = None # type: Optional[np.ndarray] + self._episode_num = 0 # Used for SDE only self.use_sde = use_sde self.sde_sample_freq = sde_sample_freq @@ -486,7 +490,7 @@ class BaseRLModel(ABC): n_eval_episodes: int = 5, log_path: Optional[str] = None, reset_num_timesteps: bool = True, - ) -> Tuple[int, np.ndarray, BaseCallback]: + ) -> 'BaseCallback': """ Initialize different variables needed for training. @@ -496,7 +500,7 @@ class BaseRLModel(ABC): :param n_eval_episodes: (int) :param log_path (Optional[str]): Path to a log folder :param reset_num_timesteps: (bool) Whether to reset or not the `num_timesteps` attribute - :return: (Tuple[int, np.ndarray, BaseCallback]) + :return: (BaseCallback) """ self.start_time = time.time() self.ep_info_buffer = deque(maxlen=100) @@ -505,21 +509,26 @@ class BaseRLModel(ABC): if self.action_noise is not None: self.action_noise.reset() - timesteps_since_eval, episode_num = 0, 0 - if reset_num_timesteps: self.num_timesteps = 0 + self._episode_num = 0 + + # Avoid resetting the environment when calling `.learn()` consecutive times + if reset_num_timesteps or self._last_obs is None: + self._last_obs = self.env.reset() + # Retrieve unnormalized observation for saving into the buffer + if self._vec_normalize_env is not None: + self._last_original_obs = self._vec_normalize_env.get_original_obs() if eval_env is not None and self.seed is not None: eval_env.seed(self.seed) eval_env = self._get_eval_env(eval_env) - obs = self.env.reset() # Create eval callback if needed callback = self._init_callback(callback, eval_env, eval_freq, n_eval_episodes, log_path) - return episode_num, obs, callback + return callback def _update_info_buffer(self, infos: List[Dict[str, Any]], dones: Optional[np.ndarray] = None) -> None: """ @@ -744,8 +753,6 @@ class OffPolicyRLModel(BaseRLModel): action_noise: Optional[ActionNoise] = None, learning_starts: int = 0, replay_buffer: Optional[ReplayBuffer] = None, - obs: Optional[np.ndarray] = None, - episode_num: int = 0, log_interval: Optional[int] = None) -> RolloutReturn: """ Collect rollout using the current policy (and possibly fill the replay buffer) @@ -762,8 +769,6 @@ class OffPolicyRLModel(BaseRLModel): (and at the beginning and end of the rollout) :param learning_starts: (int) Number of steps before learning for the warm-up phase. :param replay_buffer: (ReplayBuffer) - :param obs: (np.ndarray) Last observation from the environment - :param episode_num: (int) Episode index :param log_interval: (int) Log data every `log_interval` episodes :return: (RolloutReturn) """ @@ -773,10 +778,6 @@ class OffPolicyRLModel(BaseRLModel): assert isinstance(env, VecEnv), "You must pass a VecEnv" assert env.num_envs == 1, "OffPolicyRLModel only support single environment" - # Retrieve unnormalized observation for saving into the buffer - if self._vec_normalize_env is not None: - obs_ = self._vec_normalize_env.get_original_obs() - self.rollout_data = None if self.use_sde: self.actor.reset_noise() @@ -804,7 +805,7 @@ class OffPolicyRLModel(BaseRLModel): else: # Note: we assume that the policy uses tanh to scale the action # We use non-deterministic action in the case of SAC, for TD3, it does not matter - unscaled_action, _ = self.predict(obs, deterministic=False) + unscaled_action, _ = self.predict(self._last_obs, deterministic=False) # Rescale the action from [low, high] to [-1, 1] scaled_action = self.policy.scale_action(unscaled_action) @@ -827,7 +828,7 @@ class OffPolicyRLModel(BaseRLModel): # Only stop training if return value is False, not when it is None. if callback.on_step() is False: - return RolloutReturn(0.0, total_steps, total_episodes, None, continue_training=False) + return RolloutReturn(0.0, total_steps, total_episodes, continue_training=False) episode_reward += reward @@ -842,25 +843,23 @@ class OffPolicyRLModel(BaseRLModel): reward_ = self._vec_normalize_env.get_original_reward() else: # Avoid changing the original ones - obs_, new_obs_, reward_ = obs, new_obs, reward + self._last_original_obs, new_obs_, reward_ = self._last_obs, new_obs, reward - replay_buffer.add(obs_, new_obs_, clipped_action, reward_, done) + replay_buffer.add(self._last_original_obs, new_obs_, clipped_action, reward_, done) if self.rollout_data is not None: # Assume only one env - self.rollout_data['observations'].append(obs[0].copy()) + self.rollout_data['observations'].append(self._last_obs[0].copy()) self.rollout_data['actions'].append(scaled_action[0].copy()) self.rollout_data['rewards'].append(reward[0].copy()) self.rollout_data['dones'].append(done[0].copy()) - obs_tensor = th.FloatTensor(obs).to(self.device) + obs_tensor = th.FloatTensor(self._last_obs).to(self.device) self.rollout_data['values'].append(self.vf_net(obs_tensor)[0].cpu().detach().numpy()) - obs = new_obs - # Save the true unnormalized observation - # otherwise obs_ = self._vec_normalize_env.unnormalize_obs(obs) - # is a good approximation + self._last_obs = new_obs + # Save the unnormalized observation if self._vec_normalize_env is not None: - obs_ = new_obs_ + self._last_original_obs = new_obs_ self.num_timesteps += 1 episode_timesteps += 1 @@ -870,16 +869,16 @@ class OffPolicyRLModel(BaseRLModel): if done: total_episodes += 1 + self._episode_num += 1 episode_rewards.append(episode_reward) total_timesteps.append(episode_timesteps) if action_noise is not None: action_noise.reset() # Display training infos - if self.verbose >= 1 and log_interval is not None and ( - episode_num + total_episodes) % log_interval == 0: + if self.verbose >= 1 and log_interval is not None and (self._episode_num) % log_interval == 0: fps = int(self.num_timesteps / (time.time() - self.start_time)) - logger.logkv("episodes", episode_num + total_episodes) + logger.logkv("episodes", self._episode_num) if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0: logger.logkv('ep_rew_mean', self.safe_mean([ep_info['r'] for ep_info in self.ep_info_buffer])) logger.logkv('ep_len_mean', self.safe_mean([ep_info['l'] for ep_info in self.ep_info_buffer])) @@ -909,7 +908,7 @@ class OffPolicyRLModel(BaseRLModel): for step in reversed(range(len(self.rollout_data['rewards']))): if step == len(self.rollout_data['rewards']) - 1: next_non_terminal = 1.0 - done[0] - next_value = self.vf_net(th.FloatTensor(obs).to(self.device))[0].detach() + next_value = self.vf_net(th.FloatTensor(self._last_obs).to(self.device))[0].detach() last_return = self.rollout_data['rewards'][step] + next_non_terminal * next_value else: next_non_terminal = 1.0 - self.rollout_data['dones'][step + 1] @@ -919,4 +918,4 @@ class OffPolicyRLModel(BaseRLModel): callback.on_rollout_end() - return RolloutReturn(mean_reward, total_steps, total_episodes, obs, continue_training) + return RolloutReturn(mean_reward, total_steps, total_episodes, continue_training) diff --git a/torchy_baselines/common/type_aliases.py b/torchy_baselines/common/type_aliases.py index d116ac6..db6b453 100644 --- a/torchy_baselines/common/type_aliases.py +++ b/torchy_baselines/common/type_aliases.py @@ -38,5 +38,4 @@ class RolloutReturn(NamedTuple): episode_reward: float episode_timesteps: int n_episodes: int - obs: Optional[np.ndarray] continue_training: bool diff --git a/torchy_baselines/ppo/ppo.py b/torchy_baselines/ppo/ppo.py index 51473af..e9d8bdf 100644 --- a/torchy_baselines/ppo/ppo.py +++ b/torchy_baselines/ppo/ppo.py @@ -141,12 +141,10 @@ class PPO(BaseRLModel): env: VecEnv, callback: BaseCallback, rollout_buffer: RolloutBuffer, - n_rollout_steps: int = 256, - obs: Optional[np.ndarray] = None) -> Tuple[Optional[np.ndarray], bool]: + n_rollout_steps: int = 256) -> bool: - assert obs is not None, "No previous observation was provided" + assert self._last_obs is not None, "No previous observation was provided" n_steps = 0 - continue_training = True rollout_buffer.reset() # Sample new weights for the state dependent exploration if self.use_sde: @@ -162,7 +160,7 @@ class PPO(BaseRLModel): with th.no_grad(): # Convert to pytorch tensor - obs_tensor = th.as_tensor(obs).to(self.device) + obs_tensor = th.as_tensor(self._last_obs).to(self.device) actions, values, log_probs = self.policy.forward(obs_tensor) actions = actions.cpu().numpy() @@ -171,11 +169,11 @@ class PPO(BaseRLModel): # Clip the actions to avoid out of bound error if isinstance(self.action_space, gym.spaces.Box): clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high) + new_obs, rewards, dones, infos = env.step(clipped_actions) if callback.on_step() is False: - continue_training = False - return None, continue_training + return False self._update_info_buffer(infos) n_steps += 1 @@ -184,14 +182,14 @@ class PPO(BaseRLModel): if isinstance(self.action_space, gym.spaces.Discrete): # Reshape in case of discrete action actions = actions.reshape(-1, 1) - rollout_buffer.add(obs, actions, rewards, dones, values, log_probs) - obs = new_obs + rollout_buffer.add(self._last_obs, actions, rewards, dones, values, log_probs) + self._last_obs = new_obs rollout_buffer.compute_returns_and_advantage(values, dones=dones) callback.on_rollout_end() - return obs, continue_training + return True def train(self, n_epochs: int, batch_size: int = 64) -> None: # Update optimizer learning rate @@ -307,9 +305,9 @@ class PPO(BaseRLModel): eval_log_path: Optional[str] = None, reset_num_timesteps: bool = True) -> 'PPO': - episode_num, obs, callback = self._setup_learn(eval_env, callback, eval_freq, - n_eval_episodes, eval_log_path, reset_num_timesteps) iteration = 0 + callback = self._setup_learn(eval_env, callback, eval_freq, + n_eval_episodes, eval_log_path, reset_num_timesteps) # if self.tensorboard_log is not None and SummaryWriter is not None: # self.tb_writer = SummaryWriter(log_dir=os.path.join(self.tensorboard_log, tb_log_name)) @@ -318,10 +316,9 @@ class PPO(BaseRLModel): while self.num_timesteps < total_timesteps: - obs, continue_training = self.collect_rollouts(self.env, callback, - self.rollout_buffer, - n_rollout_steps=self.n_steps, - obs=obs) + continue_training = self.collect_rollouts(self.env, callback, + self.rollout_buffer, + n_rollout_steps=self.n_steps) if continue_training is False: break diff --git a/torchy_baselines/sac/sac.py b/torchy_baselines/sac/sac.py index d81bbbb..b37658e 100644 --- a/torchy_baselines/sac/sac.py +++ b/torchy_baselines/sac/sac.py @@ -256,8 +256,8 @@ class SAC(OffPolicyRLModel): eval_log_path: Optional[str] = None, reset_num_timesteps: bool = True) -> OffPolicyRLModel: - episode_num, obs, callback = self._setup_learn(eval_env, callback, eval_freq, - n_eval_episodes, eval_log_path, reset_num_timesteps) + callback = self._setup_learn(eval_env, callback, eval_freq, + n_eval_episodes, eval_log_path, reset_num_timesteps) callback.on_training_start(locals(), globals()) while self.num_timesteps < total_timesteps: @@ -266,14 +266,11 @@ class SAC(OffPolicyRLModel): callback=callback, learning_starts=self.learning_starts, replay_buffer=self.replay_buffer, - obs=obs, episode_num=episode_num, log_interval=log_interval) if rollout.continue_training is False: break - obs = rollout.obs - episode_num += rollout.n_episodes self._update_current_progress(self.num_timesteps, total_timesteps) if self.num_timesteps > 0 and self.num_timesteps > self.learning_starts: diff --git a/torchy_baselines/td3/td3.py b/torchy_baselines/td3/td3.py index d39b9f5..09742df 100644 --- a/torchy_baselines/td3/td3.py +++ b/torchy_baselines/td3/td3.py @@ -235,8 +235,8 @@ class TD3(OffPolicyRLModel): eval_log_path: Optional[str] = None, reset_num_timesteps: bool = True) -> OffPolicyRLModel: - episode_num, obs, callback = self._setup_learn(eval_env, callback, eval_freq, - n_eval_episodes, eval_log_path, reset_num_timesteps) + callback = self._setup_learn(eval_env, callback, eval_freq, + n_eval_episodes, eval_log_path, reset_num_timesteps) callback.on_training_start(locals(), globals()) @@ -247,14 +247,11 @@ class TD3(OffPolicyRLModel): callback=callback, learning_starts=self.learning_starts, replay_buffer=self.replay_buffer, - obs=obs, episode_num=episode_num, log_interval=log_interval) if rollout.continue_training is False: break - obs = rollout.obs - episode_num += rollout.n_episodes self._update_current_progress(self.num_timesteps, total_timesteps) if self.num_timesteps > 0 and self.num_timesteps > self.learning_starts: diff --git a/torchy_baselines/version.txt b/torchy_baselines/version.txt index 1d0ba9e..515423e 100644 --- a/torchy_baselines/version.txt +++ b/torchy_baselines/version.txt @@ -1 +1 @@ -0.4.0 +0.5.0a0