From 0e727a5f7263b855bb6b8ebcde47f404fc036f36 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Fri, 20 Sep 2019 16:43:19 +0200 Subject: [PATCH] Full compat for VecEnv + bug fixes for cuda --- tests/test_run.py | 9 ++++--- torchy_baselines/cem_rl/cem_rl.py | 3 +++ torchy_baselines/common/base_class.py | 38 ++++++++++++++++----------- torchy_baselines/common/buffers.py | 27 ++++++++++--------- torchy_baselines/ppo/policies.py | 4 +-- torchy_baselines/ppo/ppo.py | 20 +++++--------- torchy_baselines/td3/policies.py | 6 ----- torchy_baselines/td3/td3.py | 9 ++++--- 8 files changed, 59 insertions(+), 57 deletions(-) diff --git a/tests/test_run.py b/tests/test_run.py index b3ed1a1..d2f2165 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -4,8 +4,9 @@ import gym from torchy_baselines import TD3, CEMRL, PPO -def test_pendulum(): - model = TD3('MlpPolicy', 'Pendulum-v0', policy_kwargs=dict(net_arch=[64, 64]), start_timesteps=100, verbose=1) +def test_td3(): + model = TD3('MlpPolicy', 'Pendulum-v0', policy_kwargs=dict(net_arch=[64, 64]), + start_timesteps=100, verbose=1, create_eval_env=True) model.learn(total_timesteps=500, eval_freq=100) model.save("test_save") model.load("test_save") @@ -13,14 +14,14 @@ def test_pendulum(): def test_cemrl(): model = CEMRL('MlpPolicy', 'Pendulum-v0', policy_kwargs=dict(net_arch=[16]), pop_size=2, n_grad=1, - start_timesteps=100, verbose=1) + start_timesteps=100, verbose=1, create_eval_env=True) model.learn(total_timesteps=1000, eval_freq=500) model.save("test_save") model.load("test_save") os.remove("test_save.pth") def test_ppo(): - model = PPO('MlpPolicy', 'Pendulum-v0', policy_kwargs=dict(net_arch=[16]), verbose=1) + model = PPO('MlpPolicy', 'Pendulum-v0', policy_kwargs=dict(net_arch=[16]), verbose=1, create_eval_env=True) model.learn(total_timesteps=1000, eval_freq=500) # model.save("test_save") # model.load("test_save") diff --git a/torchy_baselines/cem_rl/cem_rl.py b/torchy_baselines/cem_rl/cem_rl.py index fd53ee1..aec935d 100644 --- a/torchy_baselines/cem_rl/cem_rl.py +++ b/torchy_baselines/cem_rl/cem_rl.py @@ -21,12 +21,14 @@ class CEMRL(TD3): elitism=False, n_grad=5, policy_freq=2, batch_size=100, buffer_size=int(1e6), learning_rate=1e-3, seed=0, device='auto', action_noise_std=0.0, start_timesteps=100, update_style='original', + create_eval_env=False, _init_setup_model=True): super(CEMRL, self).__init__(policy, env, policy_kwargs, verbose, buffer_size, learning_rate, seed, device, action_noise_std, start_timesteps, policy_freq=policy_freq, batch_size=batch_size, + create_eval_env=create_eval_env, _init_setup_model=False) self.es = None @@ -58,6 +60,7 @@ class CEMRL(TD3): episode_num = 0 evaluations = [] start_time = time.time() + eval_env = self._get_eval_env(eval_env) while self.num_timesteps < total_timesteps: diff --git a/torchy_baselines/common/base_class.py b/torchy_baselines/common/base_class.py index 11073e9..1d0c773 100644 --- a/torchy_baselines/common/base_class.py +++ b/torchy_baselines/common/base_class.py @@ -26,7 +26,7 @@ class BaseRLModel(object): __metaclass__ = ABCMeta def __init__(self, policy, env, policy_base, policy_kwargs=None, - verbose=0, device='auto', support_multi_env=False): + verbose=0, device='auto', support_multi_env=False, create_eval_env=False): if isinstance(policy, str) and policy_base is not None: self.policy = get_policy_from_name(policy_base, policy) else: @@ -47,10 +47,13 @@ class BaseRLModel(object): self.n_envs = None self.num_timesteps = 0 self.params = None + self.eval_env = None self.replay_buffer = None if env is not None: if isinstance(env, str): + if create_eval_env: + self.eval_env = DummyVecEnv([lambda: gym.make(env)]) if self.verbose >= 1: print("Creating environment from the given name, wrapped in a DummyVecEnv.") env = DummyVecEnv([lambda: gym.make(env)]) @@ -68,14 +71,15 @@ class BaseRLModel(object): raise ValueError("Error: the model does not support multiple envs requires a single vectorized" " environment.") - # if env is not None: - # if env is not None: - # if isinstance(env, str): - # env = gym.make(env) - # self.env = env - # self.n_envs = 1 - # self.observation_space = env.observation_space - # self.action_space = env.action_space + def _get_eval_env(self, eval_env): + if eval_env is None: + eval_env = self.eval_env + + if eval_env is not None: + if not isinstance(eval_env, VecEnv): + eval_env = DummyVecEnv([lambda: eval_env]) + assert eval_env.num_envs == 1 + return eval_env def get_env(self): """ @@ -216,6 +220,9 @@ class BaseRLModel(object): episode_rewards = [] total_timesteps = [] + assert isinstance(env, VecEnv) + assert env.num_envs == 1 + for _ in range(n_episodes): done = False # Reset environment @@ -224,7 +231,7 @@ class BaseRLModel(object): while not done: # Select action randomly or according to policy if num_timesteps < start_timesteps: - action = env.action_space.sample() + action = [env.action_space.sample()] else: action = self.predict(obs, deterministic=deterministic) / self.max_action @@ -236,11 +243,12 @@ class BaseRLModel(object): # Rescale and perform action new_obs, reward, done, _ = env.step(self.max_action * action) - if hasattr(self.env, '_max_episode_steps') and remove_timelimits: - done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done) - else: - done_bool = float(done) - + # TODO: fix for VecEnv + # if hasattr(self.env, '_max_episode_steps') and remove_timelimits: + # done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done) + # else: + # done_bool = float(done) + done_bool = [float(done[0])] episode_reward += reward # Store data in replay buffer diff --git a/torchy_baselines/common/buffers.py b/torchy_baselines/common/buffers.py index 8c4ccf6..304384b 100644 --- a/torchy_baselines/common/buffers.py +++ b/torchy_baselines/common/buffers.py @@ -67,11 +67,12 @@ class ReplayBuffer(BaseBuffer): self.dones = th.zeros(self.buffer_size, self.n_envs) def add(self, state, next_state, action, reward, done): - self.states[self.pos] = th.FloatTensor(state) - self.next_states[self.pos] = th.FloatTensor(next_state) - self.actions[self.pos] = th.FloatTensor(action) - self.rewards[self.pos] = th.FloatTensor(reward) - self.dones[self.pos] = th.FloatTensor(done) + # Copy to avoid modification by reference + self.states[self.pos] = th.FloatTensor(np.array(state)) + self.next_states[self.pos] = th.FloatTensor(np.array(next_state)) + self.actions[self.pos] = th.FloatTensor(np.array(action)) + self.rewards[self.pos] = th.FloatTensor(np.array(reward)) + self.dones[self.pos] = th.FloatTensor(np.array(done)) self.pos += 1 if self.pos == self.buffer_size: @@ -90,7 +91,7 @@ class RolloutBuffer(BaseBuffer): def __init__(self, buffer_size, state_dim, action_dim, device='cpu', lambda_=1, gamma=0.99, n_envs=1): super(RolloutBuffer, self).__init__(buffer_size, state_dim, action_dim, device, n_envs=n_envs) - + # TODO: try the buffer on the gpu? self.lambda_ = lambda_ self.gamma = gamma self.states, self.actions, self.rewards, self.advantages = None, None, None, None @@ -118,7 +119,7 @@ class RolloutBuffer(BaseBuffer): for step in reversed(range(self.buffer_size)): if step == self.buffer_size - 1: next_non_terminal = th.FloatTensor(1.0 - dones) - next_value = last_value.flatten() + next_value = last_value.clone().cpu().flatten() else: next_non_terminal = 1.0 - self.dones[step + 1] next_value = self.values[step + 1] @@ -128,12 +129,12 @@ class RolloutBuffer(BaseBuffer): self.returns = self.advantages + self.values def add(self, state, action, reward, done, value, log_prob): - self.values[self.pos] = th.FloatTensor(value.flatten()) - self.log_probs[self.pos] = th.FloatTensor(log_prob) - self.states[self.pos] = th.FloatTensor(state) - self.actions[self.pos] = th.FloatTensor(action) - self.rewards[self.pos] = th.FloatTensor(reward) - self.dones[self.pos] = th.FloatTensor(done) + self.values[self.pos] = th.FloatTensor(value.clone().cpu().flatten()) + self.log_probs[self.pos] = th.FloatTensor(log_prob.cpu().clone()) + self.states[self.pos] = th.FloatTensor(np.array(state)) + self.actions[self.pos] = th.FloatTensor(np.array(action)) + self.rewards[self.pos] = th.FloatTensor(np.array(reward)) + self.dones[self.pos] = th.FloatTensor(np.array(done)) self.pos += 1 if self.pos == self.buffer_size: self.full = True diff --git a/torchy_baselines/ppo/policies.py b/torchy_baselines/ppo/policies.py index eb20f3f..f542788 100644 --- a/torchy_baselines/ppo/policies.py +++ b/torchy_baselines/ppo/policies.py @@ -54,7 +54,7 @@ class PPOPolicy(BasePolicy): def _get_action_dist_from_latent(self, latent, deterministic=False): mean_actions = self.actor_net(latent) - action_std = th.ones(mean_actions.size()) * self.log_std.exp() + action_std = th.ones_like(mean_actions) * self.log_std.exp() action_distribution = Normal(mean_actions, action_std) # Sample from the gaussian if deterministic: @@ -73,13 +73,11 @@ class PPOPolicy(BasePolicy): return log_prob def actor_forward(self, state, deterministic=False): - state = th.FloatTensor(state).to(self.device) latent = self.shared_net(state) action, _ = self._get_action_dist_from_latent(latent, deterministic=deterministic) return action.detach().cpu().numpy() def get_policy_stats(self, state, action): - state = th.FloatTensor(state).to(self.device) latent = self.shared_net(state) _, action_distribution = self._get_action_dist_from_latent(latent) log_prob = self._get_log_prob(action_distribution, action) diff --git a/torchy_baselines/ppo/ppo.py b/torchy_baselines/ppo/ppo.py index 463bf6b..9367403 100644 --- a/torchy_baselines/ppo/ppo.py +++ b/torchy_baselines/ppo/ppo.py @@ -10,7 +10,6 @@ from torchy_baselines.common.evaluation import evaluate_policy from torchy_baselines.ppo.policies import PPOPolicy from torchy_baselines.common.buffers import RolloutBuffer from torchy_baselines.common.utils import explained_variance -from torchy_baselines.common.vec_env import VecEnv, DummyVecEnv class PPO(BaseRLModel): @@ -27,11 +26,11 @@ class PPO(BaseRLModel): n_optim=5, batch_size=64, n_steps=256, gamma=0.99, lambda_=0.95, clip_range=0.2, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, - target_kl=None, clip_range_vf=None, + target_kl=None, clip_range_vf=None, create_eval_env=False, _init_setup_model=True): super(PPO, self).__init__(policy, env, PPOPolicy, policy_kwargs, - verbose, device, support_multi_env=True) + verbose, device, create_eval_env=create_eval_env, support_multi_env=True) self.learning_rate = learning_rate self._seed = seed @@ -53,12 +52,15 @@ class PPO(BaseRLModel): def _setup_model(self): state_dim, action_dim = self.observation_space.shape[0], self.action_space.shape[0] - self.seed(self._seed) + # TODO: different seed for each env when n_envs > 1 + if self.n_envs == 1: + self.seed(self._seed) self.rollout_buffer = RolloutBuffer(self.n_steps, state_dim, action_dim, self.device, gamma=self.gamma, lambda_=self.lambda_, n_envs=self.n_envs) self.policy = self.policy(self.observation_space, self.action_space, self.learning_rate, device=self.device, **self.policy_kwargs) + self.policy = self.policy.to(self.device) def select_action(self, observation): # Normally not needed @@ -99,7 +101,6 @@ class PPO(BaseRLModel): n_steps += 1 rollout_buffer.add(obs, actions, rewards, dones, values, log_probs) - obs = new_obs rollout_buffer.compute_returns_and_advantage(values, dones=dones) @@ -123,7 +124,6 @@ class PPO(BaseRLModel): # ratio between old and new policy, should be one at the first iteration ratio = th.exp(log_prob - old_log_prob) - # clipped surrogate loss policy_loss_1 = advantage * ratio policy_loss_2 = advantage * th.clamp(ratio, 1 - self.clip_range, 1 + self.clip_range) @@ -136,7 +136,6 @@ class PPO(BaseRLModel): # Clip the different between old and new value # NOTE: this depends on the reward scaling values_pred = old_values + th.clamp(values - old_values, -self.clip_range_vf, self.clip_range_vf) - # Value loss using the TD(lambda_) target value_loss = F.mse_loss(return_batch, values_pred) @@ -152,7 +151,6 @@ class PPO(BaseRLModel): # Clip grad norm th.nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm) self.policy.optimizer.step() - approx_kl_divs.append(th.mean(old_log_prob - log_prob).detach().cpu().numpy()) if self.target_kl is not None and np.mean(approx_kl_divs) > 1.5 * self.target_kl: @@ -170,11 +168,7 @@ class PPO(BaseRLModel): evaluations = [] start_time = time.time() obs = self.env.reset() - - if eval_env is not None and not isinstance(eval_env, VecEnv): - eval_env = DummyVecEnv([lambda: eval_env]) - - assert eval_env.num_envs == 1 + eval_env = self._get_eval_env(eval_env) while self.num_timesteps < total_timesteps: diff --git a/torchy_baselines/td3/policies.py b/torchy_baselines/td3/policies.py index 23c3945..99eca0c 100644 --- a/torchy_baselines/td3/policies.py +++ b/torchy_baselines/td3/policies.py @@ -27,12 +27,6 @@ class Critic(BaseNetwork): if net_arch is None: net_arch = [400, 300] - # TODO: solve pytorch parameter registration - # for _ in range(n_critics): - # q_net = create_mlp(state_dim + action_dim, 1, net_arch, activation_fn) - # self.q_net = nn.Sequential(*q_net) - # self.q_networks.append(self.q_net) - q1_net = create_mlp(state_dim + action_dim, 1, net_arch, activation_fn) self.q1_net = nn.Sequential(*q1_net) diff --git a/torchy_baselines/td3/td3.py b/torchy_baselines/td3/td3.py index 6215b75..e47fc79 100644 --- a/torchy_baselines/td3/td3.py +++ b/torchy_baselines/td3/td3.py @@ -20,10 +20,11 @@ class TD3(BaseRLModel): def __init__(self, policy, env, policy_kwargs=None, verbose=0, buffer_size=int(1e6), learning_rate=1e-3, seed=0, device='auto', action_noise_std=0.1, start_timesteps=100, policy_freq=2, - batch_size=100, + batch_size=100, create_eval_env=False, _init_setup_model=True): - super(TD3, self).__init__(policy, env, TD3Policy, policy_kwargs, verbose, device) + super(TD3, self).__init__(policy, env, TD3Policy, policy_kwargs, verbose, device, + create_eval_env=create_eval_env) self.max_action = np.abs(self.action_space.high) self.action_noise_std = action_noise_std @@ -43,6 +44,7 @@ class TD3(BaseRLModel): self.replay_buffer = ReplayBuffer(self.buffer_size, state_dim, action_dim, self.device) self.policy = self.policy(self.observation_space, self.action_space, self.learning_rate, device=self.device, **self.policy_kwargs) + self.policy = self.policy.to(self.device) self._create_aliases() def _create_aliases(self): @@ -56,7 +58,7 @@ class TD3(BaseRLModel): observation = np.array(observation) with th.no_grad(): observation = th.FloatTensor(observation.reshape(1, -1)).to(self.device) - return self.actor(observation).cpu().data.numpy().flatten() + return self.actor(observation).cpu().data.numpy() def predict(self, observation, state=None, mask=None, deterministic=True): """ @@ -153,6 +155,7 @@ class TD3(BaseRLModel): episode_num = 0 evaluations = [] start_time = time.time() + eval_env = self._get_eval_env(eval_env) while self.num_timesteps < total_timesteps: