mirror of
https://github.com/saymrwulf/stable-baselines3.git
synced 2026-05-24 22:25:13 +00:00
Refactor for collecting rollout
This commit is contained in:
parent
5e3a84d551
commit
f04754afec
4 changed files with 85 additions and 93 deletions
|
|
@ -54,8 +54,7 @@ class CEMRL(TD3):
|
|||
def learn(self, total_timesteps, callback=None, log_interval=100,
|
||||
eval_freq=-1, n_eval_episodes=5, tb_log_name="CEMRL", reset_num_timesteps=True):
|
||||
|
||||
timesteps_since_eval = 0
|
||||
actor_steps = 0
|
||||
timesteps_since_eval, actor_steps = 0, 0
|
||||
episode_num = 0
|
||||
evaluations = []
|
||||
start_time = time.time()
|
||||
|
|
@ -127,53 +126,24 @@ class CEMRL(TD3):
|
|||
|
||||
self.actor.load_from_vector(params)
|
||||
|
||||
# Reset environment
|
||||
obs = self.env.reset()
|
||||
episode_reward = 0
|
||||
episode_timesteps = 0
|
||||
episode_reward, episode_timesteps = self.collect_rollouts(self.env, n_episodes=1,
|
||||
action_noise_std=self.action_noise_std,
|
||||
deterministic=False, callback=None,
|
||||
start_timesteps=self.start_timesteps,
|
||||
num_timesteps=self.num_timesteps,
|
||||
replay_buffer=self.replay_buffer)
|
||||
episode_num += 1
|
||||
done = False
|
||||
|
||||
while not done:
|
||||
# Select action randomly or according to policy
|
||||
if self.num_timesteps < self.start_timesteps:
|
||||
action = self.env.action_space.sample()
|
||||
else:
|
||||
action = self.select_action(np.array(obs))
|
||||
|
||||
if self.action_noise_std > 0:
|
||||
# NOTE: in the original implementation, the noise is applied to the unscaled action
|
||||
action_noise = np.random.normal(0, self.action_noise_std, size=self.action_space.shape[0])
|
||||
action = (action + action_noise).clip(-1, 1)
|
||||
|
||||
# Rescale and perform action
|
||||
new_obs, reward, done, _ = self.env.step(self.max_action * action)
|
||||
|
||||
if hasattr(self.env, '_max_episode_steps'):
|
||||
done_bool = 0 if episode_timesteps + 1 == self.env._max_episode_steps else float(done)
|
||||
else:
|
||||
done_bool = float(done)
|
||||
|
||||
episode_reward += reward
|
||||
|
||||
# Store data in replay buffer
|
||||
self.replay_buffer.add(obs, new_obs, action, reward, done_bool)
|
||||
|
||||
obs = new_obs
|
||||
episode_timesteps += 1
|
||||
# Note: if put on the outer, it will explore start_timesteps for each actor
|
||||
self.num_timesteps += 1
|
||||
self.num_timesteps += episode_timesteps
|
||||
timesteps_since_eval += episode_timesteps
|
||||
actor_steps += episode_timesteps
|
||||
self.fitnesses.append(episode_reward)
|
||||
|
||||
if self.verbose > 1:
|
||||
print("Total T: {} Episode Num: {} Episode T: {} Reward: {}".format(
|
||||
self.num_timesteps, episode_num, episode_timesteps, episode_reward))
|
||||
|
||||
actor_steps += episode_timesteps
|
||||
self.fitnesses.append(episode_reward)
|
||||
|
||||
self.es.tell(self.es_params, self.fitnesses)
|
||||
|
||||
# self.num_timesteps += actor_steps
|
||||
timesteps_since_eval += actor_steps
|
||||
return self
|
||||
|
||||
|
|
|
|||
|
|
@ -44,6 +44,7 @@ class BaseRLModel(object):
|
|||
self.n_envs = None
|
||||
self.num_timesteps = 0
|
||||
self.params = None
|
||||
self.replay_buffer = None
|
||||
|
||||
if env is not None:
|
||||
if env is not None:
|
||||
|
|
@ -180,3 +181,51 @@ class BaseRLModel(object):
|
|||
:param kwargs: extra arguments to change the model when loading
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def collect_rollouts(self, env, n_episodes=1, action_noise_std=0.0,
                     deterministic=False, callback=None,
                     start_timesteps=0, num_timesteps=0, replay_buffer=None):
    """
    Run complete episodes in ``env`` and optionally store every transition
    in ``replay_buffer``.

    :param env: environment that is reset and stepped; this method calls
        ``env.reset()``, ``env.step()`` and ``env.action_space.sample()``
    :param n_episodes: (int) number of episodes to run
    :param action_noise_std: (float) standard deviation of the Gaussian noise
        added to the action; no noise is added when it is not positive
    :param deterministic: (bool) forwarded to ``self.predict``
    :param callback: currently unused by this method
    :param start_timesteps: (int) while the local step counter is below this
        value, actions are sampled from ``env.action_space`` instead of
        being predicted by the model
    :param num_timesteps: (int) starting value of the local step counter.
        It is only incremented locally: the caller has to add the returned
        number of steps to its own counter.
    :param replay_buffer: object exposing
        ``add(obs, new_obs, action, reward, done)``, or None to skip storage
    :return: (float, int) mean reward per episode and total number of steps
        taken over all episodes
    """
    episode_rewards = []
    total_timesteps = []

    # The step limit must be read from the environment that is actually
    # stepped (`env`), not from `self.env`: the two can differ and
    # `self.env` may be None.
    max_episode_steps = getattr(env, '_max_episode_steps', None)

    for _ in range(n_episodes):
        done = False
        # Reset environment
        obs = env.reset()
        episode_reward, episode_timesteps = 0.0, 0

        while not done:
            # Select action randomly or according to policy
            if num_timesteps < start_timesteps:
                # NOTE(review): the sampled action is multiplied by
                # `self.max_action` below like a policy action -- confirm
                # this is the intended scale.
                action = env.action_space.sample()
            else:
                # The division is undone by the multiplication in `env.step`;
                # presumably `predict` returns an already scaled action.
                action = self.predict(obs, deterministic=deterministic) / self.max_action

            if action_noise_std > 0:
                # NOTE: in the original implementation, the noise is applied to the unscaled action
                action_noise = np.random.normal(0, action_noise_std, size=self.action_space.shape[0])
                action = (action + action_noise).clip(-1, 1)

            # Rescale and perform action
            new_obs, reward, done, _ = env.step(self.max_action * action)

            # The step that reaches the step limit is stored as non-terminal
            # (0) whatever `done` says; any other step stores float(done).
            if max_episode_steps is not None and episode_timesteps + 1 == max_episode_steps:
                done_bool = 0
            else:
                done_bool = float(done)

            episode_reward += reward

            # Store data in replay buffer
            if replay_buffer is not None:
                replay_buffer.add(obs, new_obs, action, reward, done_bool)

            obs = new_obs

            num_timesteps += 1
            episode_timesteps += 1

        episode_rewards.append(episode_reward)
        total_timesteps.append(episode_timesteps)

    return np.mean(episode_rewards), np.sum(total_timesteps)
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@ def evaluate_policy(model, env, n_eval_episodes=10, deterministic=True, render=F
|
|||
obs = env.reset()
|
||||
done = False
|
||||
while not done:
|
||||
action = model.predict(np.array(obs), deterministic=deterministic)
|
||||
action = model.predict(obs, deterministic=deterministic)
|
||||
obs, reward, done, _ = env.step(action)
|
||||
mean_reward += reward
|
||||
n_steps += 1
|
||||
|
|
|
|||
|
|
@ -27,7 +27,6 @@ class TD3(BaseRLModel):
|
|||
super(TD3, self).__init__(policy, env, TD3Policy, policy_kwargs, verbose, device)
|
||||
|
||||
self.max_action = np.abs(self.action_space.high)
|
||||
self.replay_buffer = None
|
||||
self.action_noise_std = action_noise_std
|
||||
self.learning_rate = learning_rate
|
||||
self.buffer_size = buffer_size
|
||||
|
|
@ -58,6 +57,8 @@ class TD3(BaseRLModel):
|
|||
self.critic_target = self.policy.critic_target
|
||||
|
||||
def select_action(self, observation):
    """
    Run the actor network on a single observation.

    :param observation: observation to feed to the actor; converted to a
        numpy array and reshaped to one row before the forward pass
    :return: (np.ndarray) flattened output of ``self.actor``
    """
    # Converting here is normally not needed, it only guards against
    # inputs that are not numpy arrays yet
    obs_array = np.array(observation)
    with th.no_grad():
        obs_tensor = th.FloatTensor(obs_array.reshape(1, -1)).to(self.device)
        actor_output = self.actor(obs_tensor)
        return actor_output.cpu().data.numpy().flatten()
|
||||
|
|
@ -147,7 +148,6 @@ class TD3(BaseRLModel):
|
|||
|
||||
timesteps_since_eval = 0
|
||||
episode_num = 0
|
||||
done = True
|
||||
evaluations = []
|
||||
start_time = time.time()
|
||||
|
||||
|
|
@ -158,58 +158,31 @@ class TD3(BaseRLModel):
|
|||
if callback(locals(), globals()) is False:
|
||||
break
|
||||
|
||||
if done:
|
||||
if self.num_timesteps > 0:
|
||||
if self.verbose > 1:
|
||||
print("Total T: {} Episode Num: {} Episode T: {} Reward: {}".format(
|
||||
self.num_timesteps, episode_num, episode_timesteps, episode_reward))
|
||||
self.train(episode_timesteps, batch_size=self.batch_size, policy_freq=self.policy_freq)
|
||||
episode_reward, episode_timesteps = self.collect_rollouts(self.env, n_episodes=1,
|
||||
action_noise_std=self.action_noise_std,
|
||||
deterministic=False, callback=None,
|
||||
start_timesteps=self.start_timesteps,
|
||||
num_timesteps=self.num_timesteps,
|
||||
replay_buffer=self.replay_buffer)
|
||||
episode_num += 1
|
||||
self.num_timesteps += episode_timesteps
|
||||
timesteps_since_eval += episode_timesteps
|
||||
|
||||
# Evaluate episode
|
||||
if 0 < eval_freq <= timesteps_since_eval:
|
||||
timesteps_since_eval %= eval_freq
|
||||
mean_reward, _ = evaluate_policy(self, self.env, n_eval_episodes)
|
||||
evaluations.append(mean_reward)
|
||||
if self.verbose > 0:
|
||||
print("Eval num_timesteps={}, mean_reward={:.2f}".format(self.num_timesteps, evaluations[-1]))
|
||||
print("FPS: {:.2f}".format(self.num_timesteps / (time.time() - start_time)))
|
||||
if self.num_timesteps > 0:
|
||||
if self.verbose > 1:
|
||||
print("Total T: {} Episode Num: {} Episode T: {} Reward: {}".format(
|
||||
self.num_timesteps, episode_num, episode_timesteps, episode_reward))
|
||||
self.train(episode_timesteps, batch_size=self.batch_size, policy_freq=self.policy_freq)
|
||||
|
||||
# Reset environment
|
||||
obs = self.env.reset()
|
||||
episode_reward = 0
|
||||
episode_timesteps = 0
|
||||
episode_num += 1
|
||||
# Evaluate episode
|
||||
if 0 < eval_freq <= timesteps_since_eval:
|
||||
timesteps_since_eval %= eval_freq
|
||||
mean_reward, _ = evaluate_policy(self, self.env, n_eval_episodes)
|
||||
evaluations.append(mean_reward)
|
||||
if self.verbose > 0:
|
||||
print("Eval num_timesteps={}, mean_reward={:.2f}".format(self.num_timesteps, evaluations[-1]))
|
||||
print("FPS: {:.2f}".format(self.num_timesteps / (time.time() - start_time)))
|
||||
|
||||
# Select action randomly or according to policy
|
||||
if self.num_timesteps < self.start_timesteps:
|
||||
action = self.env.action_space.sample()
|
||||
else:
|
||||
action = self.select_action(np.array(obs))
|
||||
|
||||
if self.action_noise_std > 0:
|
||||
# NOTE: in the original implementation, the noise is applied to the unscaled action
|
||||
action_noise = np.random.normal(0, self.action_noise_std, size=self.action_space.shape[0])
|
||||
action = (action + action_noise).clip(-1, 1)
|
||||
|
||||
# Rescale and perform action
|
||||
new_obs, reward, done, _ = self.env.step(self.max_action * action)
|
||||
|
||||
if hasattr(self.env, '_max_episode_steps'):
|
||||
done_bool = 0 if episode_timesteps + 1 == self.env._max_episode_steps else float(done)
|
||||
else:
|
||||
done_bool = float(done)
|
||||
|
||||
episode_reward += reward
|
||||
|
||||
# Store data in replay buffer
|
||||
# self.replay_buffer.add(state, next_state, action, reward, done)
|
||||
self.replay_buffer.add(obs, new_obs, action, reward, done_bool)
|
||||
|
||||
obs = new_obs
|
||||
|
||||
episode_timesteps += 1
|
||||
self.num_timesteps += 1
|
||||
timesteps_since_eval += 1
|
||||
return self
|
||||
|
||||
def save(self, path):
|
||||
|
|
|
|||
Loading…
Reference in a new issue