Full compatibility with VecEnv + bug fixes for CUDA

This commit is contained in:
Antonin Raffin 2019-09-20 16:43:19 +02:00
parent 255ff10bff
commit 0e727a5f72
8 changed files with 59 additions and 57 deletions

View file

@ -4,8 +4,9 @@ import gym
from torchy_baselines import TD3, CEMRL, PPO
def test_pendulum():
model = TD3('MlpPolicy', 'Pendulum-v0', policy_kwargs=dict(net_arch=[64, 64]), start_timesteps=100, verbose=1)
def test_td3():
model = TD3('MlpPolicy', 'Pendulum-v0', policy_kwargs=dict(net_arch=[64, 64]),
start_timesteps=100, verbose=1, create_eval_env=True)
model.learn(total_timesteps=500, eval_freq=100)
model.save("test_save")
model.load("test_save")
@ -13,14 +14,14 @@ def test_pendulum():
def test_cemrl():
model = CEMRL('MlpPolicy', 'Pendulum-v0', policy_kwargs=dict(net_arch=[16]), pop_size=2, n_grad=1,
start_timesteps=100, verbose=1)
start_timesteps=100, verbose=1, create_eval_env=True)
model.learn(total_timesteps=1000, eval_freq=500)
model.save("test_save")
model.load("test_save")
os.remove("test_save.pth")
def test_ppo():
model = PPO('MlpPolicy', 'Pendulum-v0', policy_kwargs=dict(net_arch=[16]), verbose=1)
model = PPO('MlpPolicy', 'Pendulum-v0', policy_kwargs=dict(net_arch=[16]), verbose=1, create_eval_env=True)
model.learn(total_timesteps=1000, eval_freq=500)
# model.save("test_save")
# model.load("test_save")

View file

@ -21,12 +21,14 @@ class CEMRL(TD3):
elitism=False, n_grad=5, policy_freq=2, batch_size=100,
buffer_size=int(1e6), learning_rate=1e-3, seed=0, device='auto',
action_noise_std=0.0, start_timesteps=100, update_style='original',
create_eval_env=False,
_init_setup_model=True):
super(CEMRL, self).__init__(policy, env, policy_kwargs, verbose,
buffer_size, learning_rate, seed, device,
action_noise_std, start_timesteps,
policy_freq=policy_freq, batch_size=batch_size,
create_eval_env=create_eval_env,
_init_setup_model=False)
self.es = None
@ -58,6 +60,7 @@ class CEMRL(TD3):
episode_num = 0
evaluations = []
start_time = time.time()
eval_env = self._get_eval_env(eval_env)
while self.num_timesteps < total_timesteps:

View file

@ -26,7 +26,7 @@ class BaseRLModel(object):
__metaclass__ = ABCMeta
def __init__(self, policy, env, policy_base, policy_kwargs=None,
verbose=0, device='auto', support_multi_env=False):
verbose=0, device='auto', support_multi_env=False, create_eval_env=False):
if isinstance(policy, str) and policy_base is not None:
self.policy = get_policy_from_name(policy_base, policy)
else:
@ -47,10 +47,13 @@ class BaseRLModel(object):
self.n_envs = None
self.num_timesteps = 0
self.params = None
self.eval_env = None
self.replay_buffer = None
if env is not None:
if isinstance(env, str):
if create_eval_env:
self.eval_env = DummyVecEnv([lambda: gym.make(env)])
if self.verbose >= 1:
print("Creating environment from the given name, wrapped in a DummyVecEnv.")
env = DummyVecEnv([lambda: gym.make(env)])
@ -68,14 +71,15 @@ class BaseRLModel(object):
raise ValueError("Error: the model does not support multiple envs requires a single vectorized"
" environment.")
# if env is not None:
# if env is not None:
# if isinstance(env, str):
# env = gym.make(env)
# self.env = env
# self.n_envs = 1
# self.observation_space = env.observation_space
# self.action_space = env.action_space
def _get_eval_env(self, eval_env):
if eval_env is None:
eval_env = self.eval_env
if eval_env is not None:
if not isinstance(eval_env, VecEnv):
eval_env = DummyVecEnv([lambda: eval_env])
assert eval_env.num_envs == 1
return eval_env
def get_env(self):
"""
@ -216,6 +220,9 @@ class BaseRLModel(object):
episode_rewards = []
total_timesteps = []
assert isinstance(env, VecEnv)
assert env.num_envs == 1
for _ in range(n_episodes):
done = False
# Reset environment
@ -224,7 +231,7 @@ class BaseRLModel(object):
while not done:
# Select action randomly or according to policy
if num_timesteps < start_timesteps:
action = env.action_space.sample()
action = [env.action_space.sample()]
else:
action = self.predict(obs, deterministic=deterministic) / self.max_action
@ -236,11 +243,12 @@ class BaseRLModel(object):
# Rescale and perform action
new_obs, reward, done, _ = env.step(self.max_action * action)
if hasattr(self.env, '_max_episode_steps') and remove_timelimits:
done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done)
else:
done_bool = float(done)
# TODO: fix for VecEnv
# if hasattr(self.env, '_max_episode_steps') and remove_timelimits:
# done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done)
# else:
# done_bool = float(done)
done_bool = [float(done[0])]
episode_reward += reward
# Store data in replay buffer

View file

@ -67,11 +67,12 @@ class ReplayBuffer(BaseBuffer):
self.dones = th.zeros(self.buffer_size, self.n_envs)
def add(self, state, next_state, action, reward, done):
self.states[self.pos] = th.FloatTensor(state)
self.next_states[self.pos] = th.FloatTensor(next_state)
self.actions[self.pos] = th.FloatTensor(action)
self.rewards[self.pos] = th.FloatTensor(reward)
self.dones[self.pos] = th.FloatTensor(done)
# Copy to avoid modification by reference
self.states[self.pos] = th.FloatTensor(np.array(state))
self.next_states[self.pos] = th.FloatTensor(np.array(next_state))
self.actions[self.pos] = th.FloatTensor(np.array(action))
self.rewards[self.pos] = th.FloatTensor(np.array(reward))
self.dones[self.pos] = th.FloatTensor(np.array(done))
self.pos += 1
if self.pos == self.buffer_size:
@ -90,7 +91,7 @@ class RolloutBuffer(BaseBuffer):
def __init__(self, buffer_size, state_dim, action_dim, device='cpu',
lambda_=1, gamma=0.99, n_envs=1):
super(RolloutBuffer, self).__init__(buffer_size, state_dim, action_dim, device, n_envs=n_envs)
# TODO: try the buffer on the gpu?
self.lambda_ = lambda_
self.gamma = gamma
self.states, self.actions, self.rewards, self.advantages = None, None, None, None
@ -118,7 +119,7 @@ class RolloutBuffer(BaseBuffer):
for step in reversed(range(self.buffer_size)):
if step == self.buffer_size - 1:
next_non_terminal = th.FloatTensor(1.0 - dones)
next_value = last_value.flatten()
next_value = last_value.clone().cpu().flatten()
else:
next_non_terminal = 1.0 - self.dones[step + 1]
next_value = self.values[step + 1]
@ -128,12 +129,12 @@ class RolloutBuffer(BaseBuffer):
self.returns = self.advantages + self.values
def add(self, state, action, reward, done, value, log_prob):
self.values[self.pos] = th.FloatTensor(value.flatten())
self.log_probs[self.pos] = th.FloatTensor(log_prob)
self.states[self.pos] = th.FloatTensor(state)
self.actions[self.pos] = th.FloatTensor(action)
self.rewards[self.pos] = th.FloatTensor(reward)
self.dones[self.pos] = th.FloatTensor(done)
self.values[self.pos] = th.FloatTensor(value.clone().cpu().flatten())
self.log_probs[self.pos] = th.FloatTensor(log_prob.cpu().clone())
self.states[self.pos] = th.FloatTensor(np.array(state))
self.actions[self.pos] = th.FloatTensor(np.array(action))
self.rewards[self.pos] = th.FloatTensor(np.array(reward))
self.dones[self.pos] = th.FloatTensor(np.array(done))
self.pos += 1
if self.pos == self.buffer_size:
self.full = True

View file

@ -54,7 +54,7 @@ class PPOPolicy(BasePolicy):
def _get_action_dist_from_latent(self, latent, deterministic=False):
mean_actions = self.actor_net(latent)
action_std = th.ones(mean_actions.size()) * self.log_std.exp()
action_std = th.ones_like(mean_actions) * self.log_std.exp()
action_distribution = Normal(mean_actions, action_std)
# Sample from the gaussian
if deterministic:
@ -73,13 +73,11 @@ class PPOPolicy(BasePolicy):
return log_prob
def actor_forward(self, state, deterministic=False):
state = th.FloatTensor(state).to(self.device)
latent = self.shared_net(state)
action, _ = self._get_action_dist_from_latent(latent, deterministic=deterministic)
return action.detach().cpu().numpy()
def get_policy_stats(self, state, action):
state = th.FloatTensor(state).to(self.device)
latent = self.shared_net(state)
_, action_distribution = self._get_action_dist_from_latent(latent)
log_prob = self._get_log_prob(action_distribution, action)

View file

@ -10,7 +10,6 @@ from torchy_baselines.common.evaluation import evaluate_policy
from torchy_baselines.ppo.policies import PPOPolicy
from torchy_baselines.common.buffers import RolloutBuffer
from torchy_baselines.common.utils import explained_variance
from torchy_baselines.common.vec_env import VecEnv, DummyVecEnv
class PPO(BaseRLModel):
@ -27,11 +26,11 @@ class PPO(BaseRLModel):
n_optim=5, batch_size=64, n_steps=256,
gamma=0.99, lambda_=0.95, clip_range=0.2,
ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5,
target_kl=None, clip_range_vf=None,
target_kl=None, clip_range_vf=None, create_eval_env=False,
_init_setup_model=True):
super(PPO, self).__init__(policy, env, PPOPolicy, policy_kwargs,
verbose, device, support_multi_env=True)
verbose, device, create_eval_env=create_eval_env, support_multi_env=True)
self.learning_rate = learning_rate
self._seed = seed
@ -53,12 +52,15 @@ class PPO(BaseRLModel):
def _setup_model(self):
state_dim, action_dim = self.observation_space.shape[0], self.action_space.shape[0]
self.seed(self._seed)
# TODO: different seed for each env when n_envs > 1
if self.n_envs == 1:
self.seed(self._seed)
self.rollout_buffer = RolloutBuffer(self.n_steps, state_dim, action_dim, self.device,
gamma=self.gamma, lambda_=self.lambda_, n_envs=self.n_envs)
self.policy = self.policy(self.observation_space, self.action_space,
self.learning_rate, device=self.device, **self.policy_kwargs)
self.policy = self.policy.to(self.device)
def select_action(self, observation):
# Normally not needed
@ -99,7 +101,6 @@ class PPO(BaseRLModel):
n_steps += 1
rollout_buffer.add(obs, actions, rewards, dones, values, log_probs)
obs = new_obs
rollout_buffer.compute_returns_and_advantage(values, dones=dones)
@ -123,7 +124,6 @@ class PPO(BaseRLModel):
# ratio between old and new policy, should be one at the first iteration
ratio = th.exp(log_prob - old_log_prob)
# clipped surrogate loss
policy_loss_1 = advantage * ratio
policy_loss_2 = advantage * th.clamp(ratio, 1 - self.clip_range, 1 + self.clip_range)
@ -136,7 +136,6 @@ class PPO(BaseRLModel):
# Clip the difference between old and new value
# NOTE: this depends on the reward scaling
values_pred = old_values + th.clamp(values - old_values, -self.clip_range_vf, self.clip_range_vf)
# Value loss using the TD(lambda_) target
value_loss = F.mse_loss(return_batch, values_pred)
@ -152,7 +151,6 @@ class PPO(BaseRLModel):
# Clip grad norm
th.nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm)
self.policy.optimizer.step()
approx_kl_divs.append(th.mean(old_log_prob - log_prob).detach().cpu().numpy())
if self.target_kl is not None and np.mean(approx_kl_divs) > 1.5 * self.target_kl:
@ -170,11 +168,7 @@ class PPO(BaseRLModel):
evaluations = []
start_time = time.time()
obs = self.env.reset()
if eval_env is not None and not isinstance(eval_env, VecEnv):
eval_env = DummyVecEnv([lambda: eval_env])
assert eval_env.num_envs == 1
eval_env = self._get_eval_env(eval_env)
while self.num_timesteps < total_timesteps:

View file

@ -27,12 +27,6 @@ class Critic(BaseNetwork):
if net_arch is None:
net_arch = [400, 300]
# TODO: solve pytorch parameter registration
# for _ in range(n_critics):
# q_net = create_mlp(state_dim + action_dim, 1, net_arch, activation_fn)
# self.q_net = nn.Sequential(*q_net)
# self.q_networks.append(self.q_net)
q1_net = create_mlp(state_dim + action_dim, 1, net_arch, activation_fn)
self.q1_net = nn.Sequential(*q1_net)

View file

@ -20,10 +20,11 @@ class TD3(BaseRLModel):
def __init__(self, policy, env, policy_kwargs=None, verbose=0,
buffer_size=int(1e6), learning_rate=1e-3, seed=0, device='auto',
action_noise_std=0.1, start_timesteps=100, policy_freq=2,
batch_size=100,
batch_size=100, create_eval_env=False,
_init_setup_model=True):
super(TD3, self).__init__(policy, env, TD3Policy, policy_kwargs, verbose, device)
super(TD3, self).__init__(policy, env, TD3Policy, policy_kwargs, verbose, device,
create_eval_env=create_eval_env)
self.max_action = np.abs(self.action_space.high)
self.action_noise_std = action_noise_std
@ -43,6 +44,7 @@ class TD3(BaseRLModel):
self.replay_buffer = ReplayBuffer(self.buffer_size, state_dim, action_dim, self.device)
self.policy = self.policy(self.observation_space, self.action_space,
self.learning_rate, device=self.device, **self.policy_kwargs)
self.policy = self.policy.to(self.device)
self._create_aliases()
def _create_aliases(self):
@ -56,7 +58,7 @@ class TD3(BaseRLModel):
observation = np.array(observation)
with th.no_grad():
observation = th.FloatTensor(observation.reshape(1, -1)).to(self.device)
return self.actor(observation).cpu().data.numpy().flatten()
return self.actor(observation).cpu().data.numpy()
def predict(self, observation, state=None, mask=None, deterministic=True):
"""
@ -153,6 +155,7 @@ class TD3(BaseRLModel):
episode_num = 0
evaluations = []
start_time = time.time()
eval_env = self._get_eval_env(eval_env)
while self.num_timesteps < total_timesteps: