mirror of
https://github.com/saymrwulf/stable-baselines3.git
synced 2026-05-23 22:20:18 +00:00
Full compat for VecEnv + bug fixes for cuda
This commit is contained in:
parent
255ff10bff
commit
0e727a5f72
8 changed files with 59 additions and 57 deletions
|
|
@ -4,8 +4,9 @@ import gym
|
|||
|
||||
from torchy_baselines import TD3, CEMRL, PPO
|
||||
|
||||
def test_pendulum():
|
||||
model = TD3('MlpPolicy', 'Pendulum-v0', policy_kwargs=dict(net_arch=[64, 64]), start_timesteps=100, verbose=1)
|
||||
def test_td3():
|
||||
model = TD3('MlpPolicy', 'Pendulum-v0', policy_kwargs=dict(net_arch=[64, 64]),
|
||||
start_timesteps=100, verbose=1, create_eval_env=True)
|
||||
model.learn(total_timesteps=500, eval_freq=100)
|
||||
model.save("test_save")
|
||||
model.load("test_save")
|
||||
|
|
@ -13,14 +14,14 @@ def test_pendulum():
|
|||
|
||||
def test_cemrl():
|
||||
model = CEMRL('MlpPolicy', 'Pendulum-v0', policy_kwargs=dict(net_arch=[16]), pop_size=2, n_grad=1,
|
||||
start_timesteps=100, verbose=1)
|
||||
start_timesteps=100, verbose=1, create_eval_env=True)
|
||||
model.learn(total_timesteps=1000, eval_freq=500)
|
||||
model.save("test_save")
|
||||
model.load("test_save")
|
||||
os.remove("test_save.pth")
|
||||
|
||||
def test_ppo():
|
||||
model = PPO('MlpPolicy', 'Pendulum-v0', policy_kwargs=dict(net_arch=[16]), verbose=1)
|
||||
model = PPO('MlpPolicy', 'Pendulum-v0', policy_kwargs=dict(net_arch=[16]), verbose=1, create_eval_env=True)
|
||||
model.learn(total_timesteps=1000, eval_freq=500)
|
||||
# model.save("test_save")
|
||||
# model.load("test_save")
|
||||
|
|
|
|||
|
|
@ -21,12 +21,14 @@ class CEMRL(TD3):
|
|||
elitism=False, n_grad=5, policy_freq=2, batch_size=100,
|
||||
buffer_size=int(1e6), learning_rate=1e-3, seed=0, device='auto',
|
||||
action_noise_std=0.0, start_timesteps=100, update_style='original',
|
||||
create_eval_env=False,
|
||||
_init_setup_model=True):
|
||||
|
||||
super(CEMRL, self).__init__(policy, env, policy_kwargs, verbose,
|
||||
buffer_size, learning_rate, seed, device,
|
||||
action_noise_std, start_timesteps,
|
||||
policy_freq=policy_freq, batch_size=batch_size,
|
||||
create_eval_env=create_eval_env,
|
||||
_init_setup_model=False)
|
||||
|
||||
self.es = None
|
||||
|
|
@ -58,6 +60,7 @@ class CEMRL(TD3):
|
|||
episode_num = 0
|
||||
evaluations = []
|
||||
start_time = time.time()
|
||||
eval_env = self._get_eval_env(eval_env)
|
||||
|
||||
while self.num_timesteps < total_timesteps:
|
||||
|
||||
|
|
|
|||
|
|
@ -26,7 +26,7 @@ class BaseRLModel(object):
|
|||
__metaclass__ = ABCMeta
|
||||
|
||||
def __init__(self, policy, env, policy_base, policy_kwargs=None,
|
||||
verbose=0, device='auto', support_multi_env=False):
|
||||
verbose=0, device='auto', support_multi_env=False, create_eval_env=False):
|
||||
if isinstance(policy, str) and policy_base is not None:
|
||||
self.policy = get_policy_from_name(policy_base, policy)
|
||||
else:
|
||||
|
|
@ -47,10 +47,13 @@ class BaseRLModel(object):
|
|||
self.n_envs = None
|
||||
self.num_timesteps = 0
|
||||
self.params = None
|
||||
self.eval_env = None
|
||||
self.replay_buffer = None
|
||||
|
||||
if env is not None:
|
||||
if isinstance(env, str):
|
||||
if create_eval_env:
|
||||
self.eval_env = DummyVecEnv([lambda: gym.make(env)])
|
||||
if self.verbose >= 1:
|
||||
print("Creating environment from the given name, wrapped in a DummyVecEnv.")
|
||||
env = DummyVecEnv([lambda: gym.make(env)])
|
||||
|
|
@ -68,14 +71,15 @@ class BaseRLModel(object):
|
|||
raise ValueError("Error: the model does not support multiple envs requires a single vectorized"
|
||||
" environment.")
|
||||
|
||||
# if env is not None:
|
||||
# if env is not None:
|
||||
# if isinstance(env, str):
|
||||
# env = gym.make(env)
|
||||
# self.env = env
|
||||
# self.n_envs = 1
|
||||
# self.observation_space = env.observation_space
|
||||
# self.action_space = env.action_space
|
||||
def _get_eval_env(self, eval_env):
|
||||
if eval_env is None:
|
||||
eval_env = self.eval_env
|
||||
|
||||
if eval_env is not None:
|
||||
if not isinstance(eval_env, VecEnv):
|
||||
eval_env = DummyVecEnv([lambda: eval_env])
|
||||
assert eval_env.num_envs == 1
|
||||
return eval_env
|
||||
|
||||
def get_env(self):
|
||||
"""
|
||||
|
|
@ -216,6 +220,9 @@ class BaseRLModel(object):
|
|||
episode_rewards = []
|
||||
total_timesteps = []
|
||||
|
||||
assert isinstance(env, VecEnv)
|
||||
assert env.num_envs == 1
|
||||
|
||||
for _ in range(n_episodes):
|
||||
done = False
|
||||
# Reset environment
|
||||
|
|
@ -224,7 +231,7 @@ class BaseRLModel(object):
|
|||
while not done:
|
||||
# Select action randomly or according to policy
|
||||
if num_timesteps < start_timesteps:
|
||||
action = env.action_space.sample()
|
||||
action = [env.action_space.sample()]
|
||||
else:
|
||||
action = self.predict(obs, deterministic=deterministic) / self.max_action
|
||||
|
||||
|
|
@ -236,11 +243,12 @@ class BaseRLModel(object):
|
|||
# Rescale and perform action
|
||||
new_obs, reward, done, _ = env.step(self.max_action * action)
|
||||
|
||||
if hasattr(self.env, '_max_episode_steps') and remove_timelimits:
|
||||
done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done)
|
||||
else:
|
||||
done_bool = float(done)
|
||||
|
||||
# TODO: fix for VecEnv
|
||||
# if hasattr(self.env, '_max_episode_steps') and remove_timelimits:
|
||||
# done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done)
|
||||
# else:
|
||||
# done_bool = float(done)
|
||||
done_bool = [float(done[0])]
|
||||
episode_reward += reward
|
||||
|
||||
# Store data in replay buffer
|
||||
|
|
|
|||
|
|
@ -67,11 +67,12 @@ class ReplayBuffer(BaseBuffer):
|
|||
self.dones = th.zeros(self.buffer_size, self.n_envs)
|
||||
|
||||
def add(self, state, next_state, action, reward, done):
|
||||
self.states[self.pos] = th.FloatTensor(state)
|
||||
self.next_states[self.pos] = th.FloatTensor(next_state)
|
||||
self.actions[self.pos] = th.FloatTensor(action)
|
||||
self.rewards[self.pos] = th.FloatTensor(reward)
|
||||
self.dones[self.pos] = th.FloatTensor(done)
|
||||
# Copy to avoid modification by reference
|
||||
self.states[self.pos] = th.FloatTensor(np.array(state))
|
||||
self.next_states[self.pos] = th.FloatTensor(np.array(next_state))
|
||||
self.actions[self.pos] = th.FloatTensor(np.array(action))
|
||||
self.rewards[self.pos] = th.FloatTensor(np.array(reward))
|
||||
self.dones[self.pos] = th.FloatTensor(np.array(done))
|
||||
|
||||
self.pos += 1
|
||||
if self.pos == self.buffer_size:
|
||||
|
|
@ -90,7 +91,7 @@ class RolloutBuffer(BaseBuffer):
|
|||
def __init__(self, buffer_size, state_dim, action_dim, device='cpu',
|
||||
lambda_=1, gamma=0.99, n_envs=1):
|
||||
super(RolloutBuffer, self).__init__(buffer_size, state_dim, action_dim, device, n_envs=n_envs)
|
||||
|
||||
# TODO: try the buffer on the gpu?
|
||||
self.lambda_ = lambda_
|
||||
self.gamma = gamma
|
||||
self.states, self.actions, self.rewards, self.advantages = None, None, None, None
|
||||
|
|
@ -118,7 +119,7 @@ class RolloutBuffer(BaseBuffer):
|
|||
for step in reversed(range(self.buffer_size)):
|
||||
if step == self.buffer_size - 1:
|
||||
next_non_terminal = th.FloatTensor(1.0 - dones)
|
||||
next_value = last_value.flatten()
|
||||
next_value = last_value.clone().cpu().flatten()
|
||||
else:
|
||||
next_non_terminal = 1.0 - self.dones[step + 1]
|
||||
next_value = self.values[step + 1]
|
||||
|
|
@ -128,12 +129,12 @@ class RolloutBuffer(BaseBuffer):
|
|||
self.returns = self.advantages + self.values
|
||||
|
||||
def add(self, state, action, reward, done, value, log_prob):
|
||||
self.values[self.pos] = th.FloatTensor(value.flatten())
|
||||
self.log_probs[self.pos] = th.FloatTensor(log_prob)
|
||||
self.states[self.pos] = th.FloatTensor(state)
|
||||
self.actions[self.pos] = th.FloatTensor(action)
|
||||
self.rewards[self.pos] = th.FloatTensor(reward)
|
||||
self.dones[self.pos] = th.FloatTensor(done)
|
||||
self.values[self.pos] = th.FloatTensor(value.clone().cpu().flatten())
|
||||
self.log_probs[self.pos] = th.FloatTensor(log_prob.cpu().clone())
|
||||
self.states[self.pos] = th.FloatTensor(np.array(state))
|
||||
self.actions[self.pos] = th.FloatTensor(np.array(action))
|
||||
self.rewards[self.pos] = th.FloatTensor(np.array(reward))
|
||||
self.dones[self.pos] = th.FloatTensor(np.array(done))
|
||||
self.pos += 1
|
||||
if self.pos == self.buffer_size:
|
||||
self.full = True
|
||||
|
|
|
|||
|
|
@ -54,7 +54,7 @@ class PPOPolicy(BasePolicy):
|
|||
|
||||
def _get_action_dist_from_latent(self, latent, deterministic=False):
|
||||
mean_actions = self.actor_net(latent)
|
||||
action_std = th.ones(mean_actions.size()) * self.log_std.exp()
|
||||
action_std = th.ones_like(mean_actions) * self.log_std.exp()
|
||||
action_distribution = Normal(mean_actions, action_std)
|
||||
# Sample from the gaussian
|
||||
if deterministic:
|
||||
|
|
@ -73,13 +73,11 @@ class PPOPolicy(BasePolicy):
|
|||
return log_prob
|
||||
|
||||
def actor_forward(self, state, deterministic=False):
|
||||
state = th.FloatTensor(state).to(self.device)
|
||||
latent = self.shared_net(state)
|
||||
action, _ = self._get_action_dist_from_latent(latent, deterministic=deterministic)
|
||||
return action.detach().cpu().numpy()
|
||||
|
||||
def get_policy_stats(self, state, action):
|
||||
state = th.FloatTensor(state).to(self.device)
|
||||
latent = self.shared_net(state)
|
||||
_, action_distribution = self._get_action_dist_from_latent(latent)
|
||||
log_prob = self._get_log_prob(action_distribution, action)
|
||||
|
|
|
|||
|
|
@ -10,7 +10,6 @@ from torchy_baselines.common.evaluation import evaluate_policy
|
|||
from torchy_baselines.ppo.policies import PPOPolicy
|
||||
from torchy_baselines.common.buffers import RolloutBuffer
|
||||
from torchy_baselines.common.utils import explained_variance
|
||||
from torchy_baselines.common.vec_env import VecEnv, DummyVecEnv
|
||||
|
||||
|
||||
class PPO(BaseRLModel):
|
||||
|
|
@ -27,11 +26,11 @@ class PPO(BaseRLModel):
|
|||
n_optim=5, batch_size=64, n_steps=256,
|
||||
gamma=0.99, lambda_=0.95, clip_range=0.2,
|
||||
ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5,
|
||||
target_kl=None, clip_range_vf=None,
|
||||
target_kl=None, clip_range_vf=None, create_eval_env=False,
|
||||
_init_setup_model=True):
|
||||
|
||||
super(PPO, self).__init__(policy, env, PPOPolicy, policy_kwargs,
|
||||
verbose, device, support_multi_env=True)
|
||||
verbose, device, create_eval_env=create_eval_env, support_multi_env=True)
|
||||
|
||||
self.learning_rate = learning_rate
|
||||
self._seed = seed
|
||||
|
|
@ -53,12 +52,15 @@ class PPO(BaseRLModel):
|
|||
|
||||
def _setup_model(self):
|
||||
state_dim, action_dim = self.observation_space.shape[0], self.action_space.shape[0]
|
||||
self.seed(self._seed)
|
||||
# TODO: different seed for each env when n_envs > 1
|
||||
if self.n_envs == 1:
|
||||
self.seed(self._seed)
|
||||
|
||||
self.rollout_buffer = RolloutBuffer(self.n_steps, state_dim, action_dim, self.device,
|
||||
gamma=self.gamma, lambda_=self.lambda_, n_envs=self.n_envs)
|
||||
self.policy = self.policy(self.observation_space, self.action_space,
|
||||
self.learning_rate, device=self.device, **self.policy_kwargs)
|
||||
self.policy = self.policy.to(self.device)
|
||||
|
||||
def select_action(self, observation):
|
||||
# Normally not needed
|
||||
|
|
@ -99,7 +101,6 @@ class PPO(BaseRLModel):
|
|||
|
||||
n_steps += 1
|
||||
rollout_buffer.add(obs, actions, rewards, dones, values, log_probs)
|
||||
|
||||
obs = new_obs
|
||||
|
||||
rollout_buffer.compute_returns_and_advantage(values, dones=dones)
|
||||
|
|
@ -123,7 +124,6 @@ class PPO(BaseRLModel):
|
|||
|
||||
# ratio between old and new policy, should be one at the first iteration
|
||||
ratio = th.exp(log_prob - old_log_prob)
|
||||
|
||||
# clipped surrogate loss
|
||||
policy_loss_1 = advantage * ratio
|
||||
policy_loss_2 = advantage * th.clamp(ratio, 1 - self.clip_range, 1 + self.clip_range)
|
||||
|
|
@ -136,7 +136,6 @@ class PPO(BaseRLModel):
|
|||
# Clip the difference between old and new value
|
||||
# NOTE: this depends on the reward scaling
|
||||
values_pred = old_values + th.clamp(values - old_values, -self.clip_range_vf, self.clip_range_vf)
|
||||
|
||||
# Value loss using the TD(lambda_) target
|
||||
value_loss = F.mse_loss(return_batch, values_pred)
|
||||
|
||||
|
|
@ -152,7 +151,6 @@ class PPO(BaseRLModel):
|
|||
# Clip grad norm
|
||||
th.nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm)
|
||||
self.policy.optimizer.step()
|
||||
|
||||
approx_kl_divs.append(th.mean(old_log_prob - log_prob).detach().cpu().numpy())
|
||||
|
||||
if self.target_kl is not None and np.mean(approx_kl_divs) > 1.5 * self.target_kl:
|
||||
|
|
@ -170,11 +168,7 @@ class PPO(BaseRLModel):
|
|||
evaluations = []
|
||||
start_time = time.time()
|
||||
obs = self.env.reset()
|
||||
|
||||
if eval_env is not None and not isinstance(eval_env, VecEnv):
|
||||
eval_env = DummyVecEnv([lambda: eval_env])
|
||||
|
||||
assert eval_env.num_envs == 1
|
||||
eval_env = self._get_eval_env(eval_env)
|
||||
|
||||
while self.num_timesteps < total_timesteps:
|
||||
|
||||
|
|
|
|||
|
|
@ -27,12 +27,6 @@ class Critic(BaseNetwork):
|
|||
if net_arch is None:
|
||||
net_arch = [400, 300]
|
||||
|
||||
# TODO: solve pytorch parameter registration
|
||||
# for _ in range(n_critics):
|
||||
# q_net = create_mlp(state_dim + action_dim, 1, net_arch, activation_fn)
|
||||
# self.q_net = nn.Sequential(*q_net)
|
||||
# self.q_networks.append(self.q_net)
|
||||
|
||||
q1_net = create_mlp(state_dim + action_dim, 1, net_arch, activation_fn)
|
||||
self.q1_net = nn.Sequential(*q1_net)
|
||||
|
||||
|
|
|
|||
|
|
@ -20,10 +20,11 @@ class TD3(BaseRLModel):
|
|||
def __init__(self, policy, env, policy_kwargs=None, verbose=0,
|
||||
buffer_size=int(1e6), learning_rate=1e-3, seed=0, device='auto',
|
||||
action_noise_std=0.1, start_timesteps=100, policy_freq=2,
|
||||
batch_size=100,
|
||||
batch_size=100, create_eval_env=False,
|
||||
_init_setup_model=True):
|
||||
|
||||
super(TD3, self).__init__(policy, env, TD3Policy, policy_kwargs, verbose, device)
|
||||
super(TD3, self).__init__(policy, env, TD3Policy, policy_kwargs, verbose, device,
|
||||
create_eval_env=create_eval_env)
|
||||
|
||||
self.max_action = np.abs(self.action_space.high)
|
||||
self.action_noise_std = action_noise_std
|
||||
|
|
@ -43,6 +44,7 @@ class TD3(BaseRLModel):
|
|||
self.replay_buffer = ReplayBuffer(self.buffer_size, state_dim, action_dim, self.device)
|
||||
self.policy = self.policy(self.observation_space, self.action_space,
|
||||
self.learning_rate, device=self.device, **self.policy_kwargs)
|
||||
self.policy = self.policy.to(self.device)
|
||||
self._create_aliases()
|
||||
|
||||
def _create_aliases(self):
|
||||
|
|
@ -56,7 +58,7 @@ class TD3(BaseRLModel):
|
|||
observation = np.array(observation)
|
||||
with th.no_grad():
|
||||
observation = th.FloatTensor(observation.reshape(1, -1)).to(self.device)
|
||||
return self.actor(observation).cpu().data.numpy().flatten()
|
||||
return self.actor(observation).cpu().data.numpy()
|
||||
|
||||
def predict(self, observation, state=None, mask=None, deterministic=True):
|
||||
"""
|
||||
|
|
@ -153,6 +155,7 @@ class TD3(BaseRLModel):
|
|||
episode_num = 0
|
||||
evaluations = []
|
||||
start_time = time.time()
|
||||
eval_env = self._get_eval_env(eval_env)
|
||||
|
||||
while self.num_timesteps < total_timesteps:
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue