From f04754afec5e1f96642e6cdde998fbe1990eed32 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Thu, 12 Sep 2019 14:00:55 +0200 Subject: [PATCH] Refactor for collecting rollout --- torchy_baselines/cem_rl/cem_rl.py | 52 ++++--------------- torchy_baselines/common/base_class.py | 49 +++++++++++++++++ torchy_baselines/common/evaluation.py | 2 +- torchy_baselines/td3/td3.py | 75 +++++++++------------------ 4 files changed, 85 insertions(+), 93 deletions(-) diff --git a/torchy_baselines/cem_rl/cem_rl.py b/torchy_baselines/cem_rl/cem_rl.py index ef27124..bd02d0f 100644 --- a/torchy_baselines/cem_rl/cem_rl.py +++ b/torchy_baselines/cem_rl/cem_rl.py @@ -54,8 +54,7 @@ class CEMRL(TD3): def learn(self, total_timesteps, callback=None, log_interval=100, eval_freq=-1, n_eval_episodes=5, tb_log_name="CEMRL", reset_num_timesteps=True): - timesteps_since_eval = 0 - actor_steps = 0 + timesteps_since_eval, actor_steps = 0, 0 episode_num = 0 evaluations = [] start_time = time.time() @@ -127,53 +126,24 @@ class CEMRL(TD3): self.actor.load_from_vector(params) - # Reset environment - obs = self.env.reset() - episode_reward = 0 - episode_timesteps = 0 + episode_reward, episode_timesteps = self.collect_rollouts(self.env, n_episodes=1, + action_noise_std=self.action_noise_std, + deterministic=False, callback=None, + start_timesteps=self.start_timesteps, + num_timesteps=self.num_timesteps, + replay_buffer=self.replay_buffer) episode_num += 1 - done = False - - while not done: - # Select action randomly or according to policy - if self.num_timesteps < self.start_timesteps: - action = self.env.action_space.sample() - else: - action = self.select_action(np.array(obs)) - - if self.action_noise_std > 0: - # NOTE: in the original implementation, the noise is applied to the unscaled action - action_noise = np.random.normal(0, self.action_noise_std, size=self.action_space.shape[0]) - action = (action + action_noise).clip(-1, 1) - - # Rescale and perform action - new_obs, reward, done, _ = self.env.step(self.max_action * action) - - if hasattr(self.env, '_max_episode_steps'): - done_bool = 0 if episode_timesteps + 1 == self.env._max_episode_steps else float(done) - else: - done_bool = float(done) - - episode_reward += reward - - # Store data in replay buffer - self.replay_buffer.add(obs, new_obs, action, reward, done_bool) - - obs = new_obs - episode_timesteps += 1 - # Note: if put on the outer, it will explore start_timesteps for each actor - self.num_timesteps += 1 + self.num_timesteps += episode_timesteps + timesteps_since_eval += episode_timesteps + actor_steps += episode_timesteps + self.fitnesses.append(episode_reward) if self.verbose > 1: print("Total T: {} Episode Num: {} Episode T: {} Reward: {}".format( self.num_timesteps, episode_num, episode_timesteps, episode_reward)) - actor_steps += episode_timesteps - self.fitnesses.append(episode_reward) self.es.tell(self.es_params, self.fitnesses) - - # self.num_timesteps += actor_steps timesteps_since_eval += actor_steps return self diff --git a/torchy_baselines/common/base_class.py b/torchy_baselines/common/base_class.py index a7f6bdb..f51e9dc 100644 --- a/torchy_baselines/common/base_class.py +++ b/torchy_baselines/common/base_class.py @@ -44,6 +44,7 @@ class BaseRLModel(object): self.n_envs = None self.num_timesteps = 0 self.params = None + self.replay_buffer = None if env is not None: if env is not None: @@ -180,3 +181,51 @@ class BaseRLModel(object): :param kwargs: extra arguments to change the model when loading """ raise NotImplementedError() + + def collect_rollouts(self, env, n_episodes=1, action_noise_std=0.0, + deterministic=False, callback=None, + start_timesteps=0, num_timesteps=0, replay_buffer=None): + + episode_rewards = [] + total_timesteps = [] + + for _ in range(n_episodes): + done = False + # Reset environment + obs = env.reset() + episode_reward, episode_timesteps = 0.0, 0 + while not done: + # Select action randomly or according to policy + if num_timesteps < start_timesteps: + action = env.action_space.sample() + else: + action = self.predict(obs, deterministic=deterministic) / self.max_action + + if action_noise_std > 0: + # NOTE: in the original implementation, the noise is applied to the unscaled action + action_noise = np.random.normal(0, action_noise_std, size=self.action_space.shape[0]) + action = (action + action_noise).clip(-1, 1) + + # Rescale and perform action + new_obs, reward, done, _ = env.step(self.max_action * action) + + if hasattr(self.env, '_max_episode_steps'): + done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done) + else: + done_bool = float(done) + + episode_reward += reward + + # Store data in replay buffer + if replay_buffer is not None: + replay_buffer.add(obs, new_obs, action, reward, done_bool) + + obs = new_obs + + num_timesteps += 1 + episode_timesteps += 1 + + episode_rewards.append(episode_reward) + total_timesteps.append(episode_timesteps) + + return np.mean(episode_rewards), np.sum(total_timesteps) diff --git a/torchy_baselines/common/evaluation.py b/torchy_baselines/common/evaluation.py index 441cb1c..fc0f38a 100644 --- a/torchy_baselines/common/evaluation.py +++ b/torchy_baselines/common/evaluation.py @@ -10,7 +10,7 @@ def evaluate_policy(model, env, n_eval_episodes=10, deterministic=True, render=F obs = env.reset() done = False while not done: - action = model.predict(np.array(obs), deterministic=deterministic) + action = model.predict(obs, deterministic=deterministic) obs, reward, done, _ = env.step(action) mean_reward += reward n_steps += 1 diff --git a/torchy_baselines/td3/td3.py b/torchy_baselines/td3/td3.py index 6cee111..3bb1bae 100644 --- a/torchy_baselines/td3/td3.py +++ b/torchy_baselines/td3/td3.py @@ -27,7 +27,6 @@ class TD3(BaseRLModel): super(TD3, self).__init__(policy, env, TD3Policy, policy_kwargs, verbose, device) self.max_action = np.abs(self.action_space.high) - self.replay_buffer = None self.action_noise_std = action_noise_std self.learning_rate = learning_rate self.buffer_size = buffer_size @@ -58,6 +57,8 @@ class TD3(BaseRLModel): self.critic_target = self.policy.critic_target def select_action(self, observation): + # Normally not needed + observation = np.array(observation) with th.no_grad(): observation = th.FloatTensor(observation.reshape(1, -1)).to(self.device) return self.actor(observation).cpu().data.numpy().flatten() @@ -147,7 +148,6 @@ class TD3(BaseRLModel): timesteps_since_eval = 0 episode_num = 0 - done = True evaluations = [] start_time = time.time() @@ -158,58 +158,31 @@ class TD3(BaseRLModel): if callback(locals(), globals()) is False: break - if done: - if self.num_timesteps > 0: - if self.verbose > 1: - print("Total T: {} Episode Num: {} Episode T: {} Reward: {}".format( - self.num_timesteps, episode_num, episode_timesteps, episode_reward)) - self.train(episode_timesteps, batch_size=self.batch_size, policy_freq=self.policy_freq) + episode_reward, episode_timesteps = self.collect_rollouts(self.env, n_episodes=1, + action_noise_std=self.action_noise_std, + deterministic=False, callback=None, + start_timesteps=self.start_timesteps, + num_timesteps=self.num_timesteps, + replay_buffer=self.replay_buffer) + episode_num += 1 + self.num_timesteps += episode_timesteps + timesteps_since_eval += episode_timesteps - # Evaluate episode - if 0 < eval_freq <= timesteps_since_eval: - timesteps_since_eval %= eval_freq - mean_reward, _ = evaluate_policy(self, self.env, n_eval_episodes) - evaluations.append(mean_reward) - if self.verbose > 0: - print("Eval num_timesteps={}, mean_reward={:.2f}".format(self.num_timesteps, evaluations[-1])) - print("FPS: {:.2f}".format(self.num_timesteps / (time.time() - start_time))) + if self.num_timesteps > 0: + if self.verbose > 1: + print("Total T: {} Episode Num: {} Episode T: {} Reward: {}".format( + self.num_timesteps, episode_num, episode_timesteps, episode_reward)) + self.train(episode_timesteps, batch_size=self.batch_size, policy_freq=self.policy_freq) - # Reset environment - obs = self.env.reset() - episode_reward = 0 - episode_timesteps = 0 - episode_num += 1 + # Evaluate episode + if 0 < eval_freq <= timesteps_since_eval: + timesteps_since_eval %= eval_freq + mean_reward, _ = evaluate_policy(self, self.env, n_eval_episodes) + evaluations.append(mean_reward) + if self.verbose > 0: + print("Eval num_timesteps={}, mean_reward={:.2f}".format(self.num_timesteps, evaluations[-1])) + print("FPS: {:.2f}".format(self.num_timesteps / (time.time() - start_time))) - # Select action randomly or according to policy - if self.num_timesteps < self.start_timesteps: - action = self.env.action_space.sample() - else: - action = self.select_action(np.array(obs)) - - if self.action_noise_std > 0: - # NOTE: in the original implementation, the noise is applied to the unscaled action - action_noise = np.random.normal(0, self.action_noise_std, size=self.action_space.shape[0]) - action = (action + action_noise).clip(-1, 1) - - # Rescale and perform action - new_obs, reward, done, _ = self.env.step(self.max_action * action) - - if hasattr(self.env, '_max_episode_steps'): - done_bool = 0 if episode_timesteps + 1 == self.env._max_episode_steps else float(done) - else: - done_bool = float(done) - - episode_reward += reward - - # Store data in replay buffer - # self.replay_buffer.add(state, next_state, action, reward, done) - self.replay_buffer.add(obs, new_obs, action, reward, done_bool) - - obs = new_obs - - episode_timesteps += 1 - self.num_timesteps += 1 - timesteps_since_eval += 1 return self def save(self, path):