Add save/load weights for policies and refactor action distributions

Antonin RAFFIN 2020-03-31 16:29:13 +02:00
parent b782f3a208
commit fdecd512db
11 changed files with 319 additions and 211 deletions

View file

@ -10,10 +10,12 @@ Pre-Release 0.4.0a0 (WIP)
Breaking Changes:
^^^^^^^^^^^^^^^^^
- Removed CEMRL
- Models saved with previous versions cannot be loaded (because of the pre-processing changes)
New Features:
^^^^^^^^^^^^^
- Add support for Discrete observation spaces
- Add saving/loading for policy weights, so the policy can be used without the model
Bug Fixes:
^^^^^^^^^^
@ -26,6 +28,8 @@ Others:
^^^^^^^
- Refactor handling of observation and action spaces
- Refactored features extraction to have proper preprocessing
- Refactored action distributions
Documentation:
^^^^^^^^^^^^^^

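As a quick illustration of the policy-weight saving/loading listed under New Features above, the sketch below shows the intended round trip: train a model, save only its policy weights, load them into a freshly built policy of the same architecture, and query actions without the full model. The import path, environment id and file name are placeholders for the example, not part of this commit.

    import numpy as np
    from stable_baselines3 import PPO  # placeholder import; use this repository's actual package name

    model = PPO('MlpPolicy', 'CartPole-v1', policy_kwargs=dict(net_arch=[16]))
    model.learn(total_timesteps=1000)

    # Save only the policy weights (a PyTorch state_dict), not the whole model
    model.policy.save("policy_weights.pkl")

    # Later: build a model/policy with the same architecture and load the weights into it
    new_model = PPO('MlpPolicy', 'CartPole-v1', policy_kwargs=dict(net_arch=[16]))
    new_model.policy.load("policy_weights.pkl")

    # The policy alone can now produce actions (CartPole observations have 4 dimensions)
    action, _ = new_model.policy.predict(np.zeros((1, 4), dtype=np.float32), deterministic=True)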
View file

@ -38,7 +38,8 @@ def test_squashed_gaussian(model_class):
gaussian_mean = th.rand(N_SAMPLES, N_ACTIONS)
dist = SquashedDiagGaussianDistribution(N_ACTIONS)
_, log_std = dist.proba_distribution_net(N_FEATURES)
actions, _ = dist.proba_distribution(gaussian_mean, log_std)
dist = dist.proba_distribution(gaussian_mean, log_std)
actions = dist.get_action()
assert th.max(th.abs(actions)) <= 1.0
def test_sde_distribution():
@ -51,7 +52,8 @@ def test_sde_distribution():
_, log_std = dist.proba_distribution_net(N_FEATURES)
dist.sample_weights(log_std, batch_size=N_SAMPLES)
actions, _ = dist.proba_distribution(deterministic_actions, log_std, state)
dist = dist.proba_distribution(deterministic_actions, log_std, state)
actions = dist.get_action()
assert th.allclose(actions.mean(), dist.distribution.mean.mean(), rtol=1e-3)
assert th.allclose(actions.std(), dist.distribution.scale.mean(), rtol=1e-3)
@ -71,11 +73,12 @@ def test_entropy(dist):
_, log_std = dist.proba_distribution_net(N_FEATURES, log_std_init=th.log(th.tensor(0.2)))
if isinstance(dist, DiagGaussianDistribution):
actions, dist = dist.proba_distribution(deterministic_actions, log_std)
dist = dist.proba_distribution(deterministic_actions, log_std)
else:
dist.sample_weights(log_std, batch_size=N_SAMPLES)
actions, dist = dist.proba_distribution(deterministic_actions, log_std, state)
dist = dist.proba_distribution(deterministic_actions, log_std, state)
actions = dist.get_action()
entropy = dist.entropy()
log_prob = dist.log_prob(actions)
assert th.allclose(entropy.mean(), -log_prob.mean(), rtol=5e-3)
@ -88,8 +91,9 @@ def test_categorical():
set_random_seed(1)
state = th.rand(N_SAMPLES, N_FEATURES)
action_logits = th.rand(N_SAMPLES, N_ACTIONS)
actions, dist = dist.proba_distribution(action_logits)
dist = dist.proba_distribution(action_logits)
actions = dist.get_action()
entropy = dist.entropy()
log_prob = dist.log_prob(actions)
assert th.allclose(entropy.mean(), -log_prob.mean(), rtol=1e-4)
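The change these tests exercise: proba_distribution() no longer samples, it only builds the distribution object and returns it, and sampling becomes an explicit get_action() call. A minimal sketch of the new flow using the DiagGaussianDistribution API from this commit (the import path is a placeholder and the shapes are arbitrary):

    import torch as th
    from stable_baselines3.common.distributions import DiagGaussianDistribution  # placeholder path

    N_ACTIONS, N_FEATURES, N_SAMPLES = 2, 3, 5
    dist = DiagGaussianDistribution(N_ACTIONS)
    # proba_distribution_net returns the layer mapping latent features to the mean, plus log_std
    action_net, log_std = dist.proba_distribution_net(N_FEATURES)
    mean_actions = action_net(th.rand(N_SAMPLES, N_FEATURES))

    # Old API: actions, dist = dist.proba_distribution(mean_actions, log_std, deterministic=False)
    # New API: build the distribution first, then ask it for actions
    dist = dist.proba_distribution(mean_actions, log_std)
    actions = dist.get_action()                        # stochastic sample
    det_actions = dist.get_action(deterministic=True)  # distribution mode
    log_prob = dist.log_prob(actions)
    entropy = dist.entropy()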

View file

@ -20,7 +20,7 @@ def test_continuous(model_class):
env = IdentityEnvBox(eps=0.5)
n_steps = {
A2C: 3000,
A2C: 3500,
PPO: 3000,
SAC: 700,
TD3: 500

View file

@ -16,7 +16,7 @@ MODEL_LIST = [
SAC,
]
#
@pytest.mark.parametrize("model_class", MODEL_LIST)
def test_save_load(model_class):
"""
@ -160,3 +160,61 @@ def test_save_load_replay_buffer(model_class):
# clear file from os
os.remove(replay_path)
@pytest.mark.parametrize("model_class", MODEL_LIST)
def test_save_load_policy(model_class):
"""
Test saving and loading policy only.
:param model_class: (BaseRLModel) An RL model
"""
env = DummyVecEnv([lambda: IdentityEnvBox(10)])
# create model
model = model_class('MlpPolicy', env, policy_kwargs=dict(net_arch=[16]), verbose=1, create_eval_env=True)
model.learn(total_timesteps=500, eval_freq=250)
env.reset()
observations = np.array([env.step(env.action_space.sample())[0] for _ in range(10)])
observations = observations.reshape(10, -1)
policy = model.policy
# Get dictionary of current parameters
params = deepcopy(policy.state_dict())
# Modify all parameters to be random values
random_params = dict((param_name, th.rand_like(param)) for param_name, param in params.items())
# Update model parameters with the new random values
policy.load_state_dict(random_params)
new_params = policy.state_dict()
# Check that all params are different now
for k in params:
assert not th.allclose(params[k], new_params[k]), "Parameters did not change as expected."
params = new_params
# get selected actions
selected_actions, _ = policy.predict(observations, deterministic=True)
# Save and load policy
policy.save("./logs/policy_weights.pkl")
# del policy
policy.load("./logs/policy_weights.pkl")
# check if params are still the same after load
new_params = policy.state_dict()
# Check that all params are the same as before save load procedure now
for key in params:
assert th.allclose(params[key], new_params[key]), "Policy parameters not the same after save and load."
# check if model still selects the same actions
new_selected_actions, _ = policy.predict(observations, deterministic=True)
assert np.allclose(selected_actions, new_selected_actions, 1e-4)
# clear file from os
os.remove("./logs/policy_weights.pkl")
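The mechanics this test relies on are plain PyTorch: the policy's weights live in its state_dict, and BasePolicy.save()/load() (see the policies diff below) are thin wrappers around th.save() and load_state_dict(). A standalone sketch of the same round-trip check, with no RL code involved:

    import os
    from copy import deepcopy

    import torch as th
    import torch.nn as nn

    net = nn.Sequential(nn.Linear(4, 16), nn.ReLU(), nn.Linear(16, 2))
    params_before = deepcopy(net.state_dict())

    th.save(net.state_dict(), "weights.pkl")      # what BasePolicy.save() boils down to
    net.load_state_dict(th.load("weights.pkl"))   # what BasePolicy.load() boils down to

    # Every weight tensor survives the round trip unchanged
    for key, value in net.state_dict().items():
        assert th.allclose(params_before[key], value)
    os.remove("weights.pkl")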

View file

@ -158,27 +158,6 @@ class BaseRLModel(ABC):
assert eval_env.num_envs == 1
return eval_env
def scale_action(self, action: np.ndarray) -> np.ndarray:
"""
Rescale the action from [low, high] to [-1, 1]
(no need for symmetric action space)
:param action: (np.ndarray) Action to scale
:return: (np.ndarray) Scaled action
"""
low, high = self.action_space.low, self.action_space.high
return 2.0 * ((action - low) / (high - low)) - 1.0
def unscale_action(self, scaled_action: np.ndarray) -> np.ndarray:
"""
Rescale the action from [-1, 1] to [low, high]
(no need for symmetric action space)
:param scaled_action: Action to un-scale
"""
low, high = self.action_space.low, self.action_space.high
return low + (0.5 * (scaled_action + 1.0) * (high - low))
def _setup_lr_schedule(self) -> None:
"""Transform to callable if needed."""
self.lr_schedule = get_schedule_fn(self.learning_rate)
@ -318,57 +297,6 @@ class BaseRLModel(ABC):
"""
raise NotImplementedError()
@staticmethod
def _is_vectorized_observation(observation: np.ndarray, observation_space: gym.spaces.Space) -> bool:
"""
For every observation type, detects and validates the shape,
then returns whether or not the observation is vectorized.
:param observation: (np.ndarray) the input observation to validate
:param observation_space: (gym.spaces) the observation space
:return: (bool) whether the given observation is vectorized or not
"""
if isinstance(observation_space, gym.spaces.Box):
if observation.shape == observation_space.shape:
return False
elif observation.shape[1:] == observation_space.shape:
return True
else:
raise ValueError("Error: Unexpected observation shape {} for ".format(observation.shape) +
"Box environment, please use {} ".format(observation_space.shape) +
"or (n_env, {}) for the observation shape."
.format(", ".join(map(str, observation_space.shape))))
elif isinstance(observation_space, gym.spaces.Discrete):
if observation.shape == (): # A numpy array of a number, has shape empty tuple '()'
return False
elif len(observation.shape) == 1:
return True
else:
raise ValueError("Error: Unexpected observation shape {} for ".format(observation.shape) +
"Discrete environment, please use (1,) or (n_env, 1) for the observation shape.")
# TODO: add support for MultiDiscrete and MultiBinary observation spaces
# elif isinstance(observation_space, gym.spaces.MultiDiscrete):
# if observation.shape == (len(observation_space.nvec),):
# return False
# elif len(observation.shape) == 2 and observation.shape[1] == len(observation_space.nvec):
# return True
# else:
# raise ValueError("Error: Unexpected observation shape {} for MultiDiscrete ".format(observation.shape) +
# "environment, please use ({},) or ".format(len(observation_space.nvec)) +
# "(n_env, {}) for the observation shape.".format(len(observation_space.nvec)))
# elif isinstance(observation_space, gym.spaces.MultiBinary):
# if observation.shape == (observation_space.n,):
# return False
# elif len(observation.shape) == 2 and observation.shape[1] == observation_space.n:
# return True
# else:
# raise ValueError("Error: Unexpected observation shape {} for MultiBinary ".format(observation.shape) +
# "environment, please use ({},) or ".format(observation_space.n) +
# "(n_env, {}) for the observation shape.".format(observation_space.n))
else:
raise ValueError("Error: Cannot determine if the observation is vectorized with the space type {}."
.format(observation_space))
def predict(self, observation: np.ndarray,
state: Optional[np.ndarray] = None,
mask: Optional[np.ndarray] = None,
@ -383,36 +311,7 @@ class BaseRLModel(ABC):
:return: (Tuple[np.ndarray, Optional[np.ndarray]]) the model's action and the next state
(used in recurrent policies)
"""
# TODO: move this block to BasePolicy
# if state is None:
# state = self.initial_state
# if mask is None:
# mask = [False for _ in range(self.n_envs)]
observation = np.array(observation)
vectorized_env = self._is_vectorized_observation(observation, self.observation_space)
observation = observation.reshape((-1,) + self.observation_space.shape)
observation = th.as_tensor(observation).to(self.device)
with th.no_grad():
actions = self.policy.predict(observation, deterministic=deterministic)
# Convert to numpy
actions = actions.cpu().numpy()
# Rescale to proper domain when using squashing
if isinstance(self.action_space, gym.spaces.Box) and self.policy.squash_output:
actions = self.unscale_action(actions)
clipped_actions = actions
# Clip the actions to avoid out of bound error when using gaussian distribution
if isinstance(self.action_space, gym.spaces.Box) and not self.policy.squash_output:
clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high)
if not vectorized_env:
if state is not None:
raise ValueError("Error: The environment must be vectorized when using recurrent policies.")
clipped_actions = clipped_actions[0]
return clipped_actions, state
return self.policy.predict(observation, state, mask, deterministic)
@classmethod
def load(cls, load_path: str, env: Optional[GymEnv] = None, **kwargs):
@ -484,10 +383,7 @@ class BaseRLModel(ABC):
raise ValueError(f"Error: the file {load_path} could not be found")
# set device to cpu if cuda is not available
if th.cuda.is_available():
device = th.device('cuda')
else:
device = th.device('cpu')
device = th.device('cuda') if th.cuda.is_available() else th.device('cpu')
# Open the zip archive and load data
try:
@ -534,20 +430,6 @@ class BaseRLModel(ABC):
# load the parameters with the right `map_location`
params[os.path.splitext(file_path)[0]] = th.load(file_content, map_location=device)
# for backward compatibility
if params.get('params') is not None:
params_copy = {}
for name in params:
if name == 'params':
params_copy['policy'] = params[name]
elif name == 'opt':
params_copy['policy.optimizer'] = params[name]
# Special case for SAC
elif name == 'ent_coef_optimizer':
params_copy[name] = params[name]
else:
params_copy[name + '.optimizer'] = params[name]
params = params_copy
except zipfile.BadZipFile:
# load_path wasn't a zip file
raise ValueError(f"Error: the file {load_path} wasn't a zip-file")
@ -925,7 +807,7 @@ class OffPolicyRLModel(BaseRLModel):
unscaled_action, _ = self.predict(obs, deterministic=False)
# Rescale the action from [low, high] to [-1, 1]
scaled_action = self.scale_action(unscaled_action)
scaled_action = self.policy.scale_action(unscaled_action)
if self.use_sde:
# When using SDE, the action can be out of bounds
@ -941,7 +823,7 @@ class OffPolicyRLModel(BaseRLModel):
clipped_action = np.clip(clipped_action + action_noise(), -1, 1)
# Rescale and perform action
new_obs, reward, done, infos = env.step(self.unscale_action(clipped_action))
new_obs, reward, done, infos = env.step(self.policy.unscale_action(clipped_action))
# Only stop training if return value is False, not when it is None.
if callback.on_step() is False:

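The scale_action/unscale_action helpers removed from BaseRLModel above reappear unchanged on BasePolicy (see the policies diff below); the off-policy rollout keeps the same flow, only routed through self.policy. A numpy sketch of that flow using the exact affine formulas from the diff, with made-up Box bounds and noise for illustration:

    import numpy as np

    low, high = np.array([-2.0, 0.0]), np.array([2.0, 1.0])  # example Box bounds

    def scale_action(action):
        # [low, high] -> [-1, 1], as in BasePolicy.scale_action
        return 2.0 * ((action - low) / (high - low)) - 1.0

    def unscale_action(scaled_action):
        # [-1, 1] -> [low, high], as in BasePolicy.unscale_action
        return low + 0.5 * (scaled_action + 1.0) * (high - low)

    unscaled_action = np.array([1.5, 0.25])          # e.g. what predict() returned
    scaled_action = scale_action(unscaled_action)
    # Add exploration noise in the [-1, 1] space and clip, as in the rollout above
    scaled_action = np.clip(scaled_action + np.random.normal(0.0, 0.1, size=2), -1, 1)
    env_action = unscale_action(scaled_action)       # back to the env's bounds for env.step()

    # The two helpers are exact inverses of each other
    assert np.allclose(unscale_action(scale_action(unscaled_action)), unscaled_action)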
View file

@ -33,12 +33,52 @@ class Distribution(object):
def sample(self) -> th.Tensor:
"""
returns a sample from the probabilty distribution
Returns a sample from the probability distribution
:return: (th.Tensor) the stochastic action
"""
raise NotImplementedError
def mode(self) -> th.Tensor:
"""
Returns the most likely action (deterministic output)
from the probability distribution
:return: (th.Tensor) the deterministic action
"""
raise NotImplementedError
def get_action(self, deterministic: bool = False) -> th.Tensor:
"""
Return an action according to the probability distribution.
:param deterministic: (bool)
:return: (th.Tensor)
"""
if deterministic:
return self.mode()
else:
return self.sample()
def action_from_params(self, *args, **kwargs) -> th.Tensor:
"""
Returns a sample from the probability distribution
given its parameters.
:return: (th.Tensor) the action
"""
raise NotImplementedError
def log_prob_from_params(self, *args, **kwargs) -> Tuple[th.Tensor, th.Tensor]:
"""
Returns a sample and the associated log probability
from the probability distribution
given its parameters.
:return: (Tuple[th.Tensor, th.Tensor]) action and log prob
"""
raise NotImplementedError
def sum_independent_dims(tensor: th.Tensor) -> th.Tensor:
"""
@ -88,23 +128,17 @@ class DiagGaussianDistribution(Distribution):
return mean_actions, log_std
def proba_distribution(self, mean_actions: th.Tensor,
log_std: th.Tensor,
deterministic: bool = False) -> Tuple[th.Tensor, 'DiagGaussianDistribution']:
log_std: th.Tensor) -> 'DiagGaussianDistribution':
"""
Create and sample for the distribution given its parameters (mean, std)
Create the distribution given its parameters (mean, std)
:param mean_actions: (th.Tensor)
:param log_std: (th.Tensor)
:param deterministic: (bool)
:return: (th.Tensor)
:return: (DiagGaussianDistribution)
"""
action_std = th.ones_like(mean_actions) * log_std.exp()
self.distribution = Normal(mean_actions, action_std)
if deterministic:
action = self.mode()
else:
action = self.sample()
return action, self
return self
def mode(self) -> th.Tensor:
return self.distribution.mean
@ -115,7 +149,15 @@ class DiagGaussianDistribution(Distribution):
def entropy(self) -> th.Tensor:
return sum_independent_dims(self.distribution.entropy())
def log_prob_from_params(self, mean_actions: th.Tensor, log_std: th.Tensor) -> Tuple[th.Tensor, th.Tensor]:
def action_from_params(self, mean_actions: th.Tensor,
log_std: th.Tensor,
deterministic: bool = False) -> th.Tensor:
# Update the proba distribution
self.proba_distribution(mean_actions, log_std)
return self.get_action(deterministic=deterministic)
def log_prob_from_params(self, mean_actions: th.Tensor,
log_std: th.Tensor) -> Tuple[th.Tensor, th.Tensor]:
"""
Compute the log probability of taking an action
given the distribution parameters.
@ -124,7 +166,7 @@ class DiagGaussianDistribution(Distribution):
:param log_std: (th.Tensor)
:return: (Tuple[th.Tensor, th.Tensor])
"""
action, _ = self.proba_distribution(mean_actions, log_std)
action = self.action_from_params(mean_actions, log_std)
log_prob = self.log_prob(action)
return action, log_prob
@ -156,10 +198,10 @@ class SquashedDiagGaussianDistribution(DiagGaussianDistribution):
self.epsilon = epsilon
self.gaussian_action = None
def proba_distribution(self, mean_actions, log_std, deterministic=False):
action, _ = super(SquashedDiagGaussianDistribution, self).proba_distribution(mean_actions, log_std,
deterministic)
return action, self
def proba_distribution(self, mean_actions: th.Tensor,
log_std: th.Tensor) -> 'SquashedDiagGaussianDistribution':
super(SquashedDiagGaussianDistribution, self).proba_distribution(mean_actions, log_std)
return self
def mode(self) -> th.Tensor:
self.gaussian_action = self.distribution.mean
@ -175,12 +217,14 @@ class SquashedDiagGaussianDistribution(DiagGaussianDistribution):
self.gaussian_action = self.distribution.rsample()
return th.tanh(self.gaussian_action)
def log_prob_from_params(self, mean_actions, log_std) -> Tuple[th.Tensor, th.Tensor]:
action, _ = self.proba_distribution(mean_actions, log_std)
def log_prob_from_params(self, mean_actions: th.Tensor,
log_std: th.Tensor) -> Tuple[th.Tensor, th.Tensor]:
action = self.action_from_params(mean_actions, log_std)
log_prob = self.log_prob(action, self.gaussian_action)
return action, log_prob
def log_prob(self, action: th.Tensor, gaussian_action: Optional[th.Tensor] = None) -> th.Tensor:
def log_prob(self, action: th.Tensor,
gaussian_action: Optional[th.Tensor] = None) -> th.Tensor:
# Inverse tanh
# Naive implementation (not stable): 0.5 * torch.log((1 + x) / (1 - x))
# We use numpy to avoid numerical instability
@ -220,14 +264,9 @@ class CategoricalDistribution(Distribution):
action_logits = nn.Linear(latent_dim, self.action_dim)
return action_logits
def proba_distribution(self, action_logits: th.Tensor,
deterministic: bool = False) -> Tuple[th.Tensor, 'CategoricalDistribution']:
def proba_distribution(self, action_logits: th.Tensor) -> 'CategoricalDistribution':
self.distribution = Categorical(logits=action_logits)
if deterministic:
action = self.mode()
else:
action = self.sample()
return action, self
return self
def mode(self) -> th.Tensor:
return th.argmax(self.distribution.probs, dim=1)
@ -238,8 +277,14 @@ class CategoricalDistribution(Distribution):
def entropy(self) -> th.Tensor:
return self.distribution.entropy()
def action_from_params(self, action_logits: th.Tensor,
deterministic: bool = False) -> th.Tensor:
# Update the proba distribution
self.proba_distribution(action_logits)
return self.get_action(deterministic=deterministic)
def log_prob_from_params(self, action_logits: th.Tensor) -> Tuple[th.Tensor, th.Tensor]:
action, _ = self.proba_distribution(action_logits)
action = self.action_from_params(action_logits)
log_prob = self.log_prob(action)
return action, log_prob
@ -283,6 +328,7 @@ class StateDependentNoiseDistribution(Distribution):
self.weights_dist = None
self.exploration_mat = None
self.exploration_matrices = None
self._latent_sde = None
self.use_expln = use_expln
self.full_std = full_std
self.epsilon = epsilon
@ -358,27 +404,26 @@ class StateDependentNoiseDistribution(Distribution):
def proba_distribution(self, mean_actions: th.Tensor,
log_std: th.Tensor,
latent_sde: th.Tensor,
deterministic: bool = False) -> Tuple[th.Tensor, 'StateDependentNoiseDistribution']:
latent_sde: th.Tensor) -> 'StateDependentNoiseDistribution':
"""
Create the distribution given its parameters (mean, std, latent_sde)
:param mean_actions: (th.Tensor)
:param log_std: (th.Tensor)
:param latent_sde: (th.Tensor)
:param deterministic: (bool)
:return: (Tuple[th.Tensor, Distribution])
:return: (StateDependentNoiseDistribution)
"""
# Stop gradient if we don't want to influence the features
latent_sde = latent_sde if self.learn_features else latent_sde.detach()
self._latent_sde = latent_sde if self.learn_features else latent_sde.detach()
variance = th.mm(latent_sde ** 2, self.get_std(log_std) ** 2)
self.distribution = Normal(mean_actions, th.sqrt(variance + self.epsilon))
return self
def get_action(self, deterministic: bool = False) -> th.Tensor:
if deterministic:
action = self.mode()
return self.mode()
else:
action = self.sample(latent_sde)
return action, self
return self.sample(self._latent_sde)
def mode(self) -> th.Tensor:
action = self.distribution.mean
@ -412,10 +457,18 @@ class StateDependentNoiseDistribution(Distribution):
return None
return sum_independent_dims(self.distribution.entropy())
def action_from_params(self, mean_actions: th.Tensor,
log_std: th.Tensor,
latent_sde: th.Tensor,
deterministic: bool = False) -> th.Tensor:
# Update the proba distribution
self.proba_distribution(mean_actions, log_std, latent_sde)
return self.get_action(deterministic=deterministic)
def log_prob_from_params(self, mean_actions: th.Tensor,
log_std: th.Tensor,
latent_sde: th.Tensor) -> Tuple[th.Tensor, th.Tensor]:
action, _ = self.proba_distribution(mean_actions, log_std, latent_sde)
action = self.action_from_params(mean_actions, log_std, latent_sde)
log_prob = self.log_prob(action)
return action, log_prob
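To summarize the refactored contract in one place: proba_distribution() only updates self.distribution and returns self; mode(), sample() and get_action() read from it; action_from_params() and log_prob_from_params() are the one-shot helpers that combine both steps. The toy class below mirrors that contract with a plain torch Normal; it is an illustration of the interface, not the library implementation:

    import torch as th
    from torch.distributions import Normal

    # Stripped-down illustration of the refactored Distribution interface (not the library class)
    class TinyGaussianDistribution:
        def __init__(self):
            self.distribution = None

        def proba_distribution(self, mean_actions, log_std):
            # Only builds the distribution and returns self; no sampling here anymore
            self.distribution = Normal(mean_actions, log_std.exp())
            return self

        def mode(self):
            return self.distribution.mean

        def sample(self):
            return self.distribution.rsample()

        def get_action(self, deterministic=False):
            return self.mode() if deterministic else self.sample()

        def action_from_params(self, mean_actions, log_std, deterministic=False):
            # One-shot helper: update the distribution, then act
            self.proba_distribution(mean_actions, log_std)
            return self.get_action(deterministic=deterministic)

        def log_prob_from_params(self, mean_actions, log_std):
            action = self.action_from_params(mean_actions, log_std)
            return action, self.distribution.log_prob(action).sum(dim=1)

    mean, log_std = th.zeros(4, 2), th.zeros(2)
    dist = TinyGaussianDistribution().proba_distribution(mean, log_std)
    stochastic_action = dist.get_action()
    deterministic_action = dist.get_action(deterministic=True)
    action, log_prob = dist.log_prob_from_params(mean, log_std)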

View file

@ -63,7 +63,7 @@ class BasePolicy(nn.Module):
def forward(self, *_args, **kwargs):
raise NotImplementedError()
def predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor:
def _predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor:
"""
Get the action according to the policy for a given observation.
@ -73,21 +73,145 @@ class BasePolicy(nn.Module):
"""
raise NotImplementedError()
def predict(self, observation: np.ndarray,
state: Optional[np.ndarray] = None,
mask: Optional[np.ndarray] = None,
deterministic: bool = False) -> Tuple[np.ndarray, Optional[np.ndarray]]:
"""
Get the policy action and state from an observation (and optional state).
:param observation: (np.ndarray) the input observation
:param state: (Optional[np.ndarray]) The last states (can be None, used in recurrent policies)
:param mask: (Optional[np.ndarray]) The last masks (can be None, used in recurrent policies)
:param deterministic: (bool) Whether or not to return deterministic actions.
:return: (Tuple[np.ndarray, Optional[np.ndarray]]) the model's action and the next state
(used in recurrent policies)
"""
# if state is None:
# state = self.initial_state
# if mask is None:
# mask = [False for _ in range(self.n_envs)]
observation = np.array(observation)
vectorized_env = self._is_vectorized_observation(observation, self.observation_space)
observation = observation.reshape((-1,) + self.observation_space.shape)
observation = th.as_tensor(observation).to(self.device)
with th.no_grad():
actions = self._predict(observation, deterministic=deterministic)
# Convert to numpy
actions = actions.cpu().numpy()
# Rescale to proper domain when using squashing
if isinstance(self.action_space, gym.spaces.Box) and self.squash_output:
actions = self.unscale_action(actions)
clipped_actions = actions
# Clip the actions to avoid out of bound error when using gaussian distribution
if isinstance(self.action_space, gym.spaces.Box) and not self.squash_output:
clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high)
if not vectorized_env:
if state is not None:
raise ValueError("Error: The environment must be vectorized when using recurrent policies.")
clipped_actions = clipped_actions[0]
return clipped_actions, state
def scale_action(self, action: np.ndarray) -> np.ndarray:
"""
Rescale the action from [low, high] to [-1, 1]
(no need for symmetric action space)
:param action: (np.ndarray) Action to scale
:return: (np.ndarray) Scaled action
"""
low, high = self.action_space.low, self.action_space.high
return 2.0 * ((action - low) / (high - low)) - 1.0
def unscale_action(self, scaled_action: np.ndarray) -> np.ndarray:
"""
Rescale the action from [-1, 1] to [low, high]
(no need for symmetric action space)
:param scaled_action: Action to un-scale
"""
low, high = self.action_space.low, self.action_space.high
return low + (0.5 * (scaled_action + 1.0) * (high - low))
@staticmethod
def _is_vectorized_observation(observation: np.ndarray, observation_space: gym.spaces.Space) -> bool:
"""
For every observation type, detects and validates the shape,
then returns whether or not the observation is vectorized.
:param observation: (np.ndarray) the input observation to validate
:param observation_space: (gym.spaces) the observation space
:return: (bool) whether the given observation is vectorized or not
"""
if isinstance(observation_space, gym.spaces.Box):
if observation.shape == observation_space.shape:
return False
elif observation.shape[1:] == observation_space.shape:
return True
else:
raise ValueError("Error: Unexpected observation shape {} for ".format(observation.shape) +
"Box environment, please use {} ".format(observation_space.shape) +
"or (n_env, {}) for the observation shape."
.format(", ".join(map(str, observation_space.shape))))
elif isinstance(observation_space, gym.spaces.Discrete):
if observation.shape == (): # A numpy array of a number, has shape empty tuple '()'
return False
elif len(observation.shape) == 1:
return True
else:
raise ValueError("Error: Unexpected observation shape {} for ".format(observation.shape) +
"Discrete environment, please use (1,) or (n_env, 1) for the observation shape.")
# TODO: add support for MultiDiscrete and MultiBinary observation spaces
# elif isinstance(observation_space, gym.spaces.MultiDiscrete):
# if observation.shape == (len(observation_space.nvec),):
# return False
# elif len(observation.shape) == 2 and observation.shape[1] == len(observation_space.nvec):
# return True
# else:
# raise ValueError("Error: Unexpected observation shape {} for MultiDiscrete ".format(observation.shape) +
# "environment, please use ({},) or ".format(len(observation_space.nvec)) +
# "(n_env, {}) for the observation shape.".format(len(observation_space.nvec)))
# elif isinstance(observation_space, gym.spaces.MultiBinary):
# if observation.shape == (observation_space.n,):
# return False
# elif len(observation.shape) == 2 and observation.shape[1] == observation_space.n:
# return True
# else:
# raise ValueError("Error: Unexpected observation shape {} for MultiBinary ".format(observation.shape) +
# "environment, please use ({},) or ".format(observation_space.n) +
# "(n_env, {}) for the observation shape.".format(observation_space.n))
else:
raise ValueError("Error: Cannot determine if the observation is vectorized with the space type {}."
.format(observation_space))
def save(self, path: str) -> None:
"""
Save model to a given location.
Save policy weights to a given location.
NOTE: only the network weights (state_dict) are saved, not the optimizer state or the constructor parameters
:param path: (str)
"""
previous_device = self.device
# Convert to cpu before saving
self = self.to('cpu')
th.save(self.state_dict(), path)
self = self.to(previous_device)
def load(self, path: str) -> None:
"""
Load saved model from path.
Load policy weights from path.
NOTE: only the network weights are loaded; the policy must already be instantiated with matching constructor parameters
:param path: (str)
"""
self.load_state_dict(th.load(path))
self = self.to(self.device)
def load_from_vector(self, vector: np.ndarray):
"""

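predict() moved here together with the vectorization check it depends on: the observation is compared against the observation space to decide whether it is a single observation or a batch, then flattened into an (n_env, *obs_shape) batch either way. A small sketch of the Box branch of that check (bounds and shapes are arbitrary):

    import numpy as np
    import gym

    obs_space = gym.spaces.Box(low=-1.0, high=1.0, shape=(3,), dtype=np.float32)

    single_obs = np.zeros(3)        # shape == space.shape     -> not vectorized
    batched_obs = np.zeros((5, 3))  # shape[1:] == space.shape -> vectorized

    for obs in (single_obs, batched_obs):
        vectorized = obs.shape != obs_space.shape
        # predict() then reshapes everything into a batch before calling the network
        batch = obs.reshape((-1,) + obs_space.shape)
        print(vectorized, batch.shape)  # False (1, 3), then True (5, 3)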
View file

@ -156,9 +156,9 @@ class PPOPolicy(BasePolicy):
latent_pi, latent_vf, latent_sde = self._get_latent(obs)
# Evaluate the values for the given observations
value = self.value_net(latent_vf)
action, action_distribution = self._get_action_dist_from_latent(latent_pi, latent_sde=latent_sde,
deterministic=deterministic)
log_prob = action_distribution.log_prob(action)
distribution = self._get_action_dist_from_latent(latent_pi, latent_sde=latent_sde)
action = distribution.get_action(deterministic=deterministic)
log_prob = distribution.log_prob(action)
return action, value, log_prob
def _get_latent(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor, th.Tensor]:
@ -180,33 +180,29 @@ class PPOPolicy(BasePolicy):
return latent_pi, latent_vf, latent_sde
def _get_action_dist_from_latent(self, latent_pi: th.Tensor,
latent_sde: Optional[th.Tensor] = None,
deterministic: bool = False) -> Tuple[th.Tensor, Distribution]:
latent_sde: Optional[th.Tensor] = None) -> Distribution:
"""
Retrieve action and associated action distribution
given the latent codes.
Retrieve action distribution given the latent codes.
:param latent_pi: (th.Tensor) Latent code for the actor
:param latent_sde: (Optional[th.Tensor]) Latent code for the SDE exploration function
:param deterministic: (bool) Whether to sample or use deterministic actions
:return: (Tuple[th.Tensor, Distribution]) Action and action distribution
:return: (Distribution) Action distribution
"""
mean_actions = self.action_net(latent_pi)
if isinstance(self.action_dist, DiagGaussianDistribution):
return self.action_dist.proba_distribution(mean_actions, self.log_std, deterministic=deterministic)
return self.action_dist.proba_distribution(mean_actions, self.log_std)
elif isinstance(self.action_dist, CategoricalDistribution):
# Here mean_actions are the logits before the softmax
return self.action_dist.proba_distribution(mean_actions, deterministic=deterministic)
return self.action_dist.proba_distribution(action_logits=mean_actions)
elif isinstance(self.action_dist, StateDependentNoiseDistribution):
return self.action_dist.proba_distribution(mean_actions, self.log_std, latent_sde,
deterministic=deterministic)
return self.action_dist.proba_distribution(mean_actions, self.log_std, latent_sde)
else:
raise ValueError('Invalid action distribution')
def predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor:
def _predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor:
"""
Get the action according to the policy for a given observation.
@ -215,27 +211,25 @@ class PPOPolicy(BasePolicy):
:return: (th.Tensor) Taken action according to the policy
"""
latent_pi, _, latent_sde = self._get_latent(observation)
action, _ = self._get_action_dist_from_latent(latent_pi, latent_sde, deterministic=deterministic)
return action
distribution = self._get_action_dist_from_latent(latent_pi, latent_sde)
return distribution.get_action(deterministic=deterministic)
def evaluate_actions(self, obs: th.Tensor,
actions: th.Tensor,
deterministic: bool = False) -> Tuple[th.Tensor, th.Tensor, th.Tensor]:
actions: th.Tensor) -> Tuple[th.Tensor, th.Tensor, th.Tensor]:
"""
Evaluate actions according to the current policy,
given the observations.
:param obs: (th.Tensor)
:param actions: (th.Tensor)
:param deterministic: (bool)
:return: (th.Tensor, th.Tensor, th.Tensor) estimated value, log likelihood of taking those actions
and entropy of the action distribution.
"""
latent_pi, latent_vf, latent_sde = self._get_latent(obs)
_, action_distribution = self._get_action_dist_from_latent(latent_pi, latent_sde, deterministic=deterministic)
log_prob = action_distribution.log_prob(actions)
distribution = self._get_action_dist_from_latent(latent_pi, latent_sde)
log_prob = distribution.log_prob(actions)
values = self.value_net(latent_vf)
return values, log_prob, action_distribution.entropy()
return values, log_prob, distribution.entropy()
MlpPolicy = PPOPolicy
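The pattern in this file after the refactor: build the action distribution once from the latent code, then derive whatever is needed from that single object: a sampled action and its log-probability in forward(), the log-probability of given actions plus the entropy in evaluate_actions(). The toy discrete-action module below mirrors that structure with a plain torch Categorical; it is a sketch of the shape of the code, not the PPOPolicy class itself:

    import torch as th
    import torch.nn as nn
    from torch.distributions import Categorical

    class TinyActorCritic(nn.Module):
        def __init__(self, n_features=8, n_actions=3):
            super().__init__()
            self.action_net = nn.Linear(n_features, n_actions)  # outputs logits
            self.value_net = nn.Linear(n_features, 1)

        def _get_distribution(self, latent):
            # Single place where the distribution is built from the latent code
            return Categorical(logits=self.action_net(latent))

        def forward(self, latent, deterministic=False):
            # Rollout path: distribution -> action, value, log_prob
            distribution = self._get_distribution(latent)
            action = distribution.probs.argmax(dim=1) if deterministic else distribution.sample()
            return action, self.value_net(latent), distribution.log_prob(action)

        def evaluate_actions(self, latent, actions):
            # Training path: same distribution, log_prob of the given actions and entropy
            distribution = self._get_distribution(latent)
            return self.value_net(latent), distribution.log_prob(actions), distribution.entropy()

    policy = TinyActorCritic()
    latent = th.randn(4, 8)
    actions, values, log_prob = policy(latent)
    values, log_prob, entropy = policy.evaluate_actions(latent, actions)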

View file

@ -108,14 +108,11 @@ class Actor(BasePolicy):
'reset_noise() is only available when using SDE'
self.action_dist.sample_weights(self.log_std, batch_size=batch_size)
def _get_latent(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor]:
def get_action_dist_params(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor, th.Tensor]:
features = self.extract_features(obs)
latent_pi = self.latent_pi(features)
latent_sde = self.sde_features_extractor(features) if self.sde_features_extractor is not None else latent_pi
return latent_pi, latent_sde
def get_action_dist_params(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor, th.Tensor]:
latent_pi, latent_sde = self._get_latent(obs)
mean_actions = self.mu(latent_pi)
if self.use_sde:
@ -130,9 +127,8 @@ class Actor(BasePolicy):
mean_actions, log_std, latent_sde = self.get_action_dist_params(obs)
kwargs = dict(latent_sde=latent_sde) if self.use_sde else {}
# Note: the action is squashed
action, _ = self.action_dist.proba_distribution(mean_actions, log_std,
deterministic=deterministic, **kwargs)
return action
return self.action_dist.action_from_params(mean_actions, log_std,
deterministic=deterministic, **kwargs)
def action_log_prob(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor]:
mean_actions, log_std, latent_sde = self.get_action_dist_params(obs)
@ -268,7 +264,7 @@ class SACPolicy(BasePolicy):
def forward(self, obs: th.Tensor) -> th.Tensor:
return self.predict(obs, deterministic=False)
def predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor:
def _predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor:
return self.actor(observation, deterministic)

View file

@ -104,11 +104,6 @@ class Actor(BasePolicy):
"""
return self.action_dist.get_std(self.log_std)
def _get_action_dist_from_latent(self, latent_pi: th.Tensor,
latent_sde: th.Tensor) -> Tuple[th.Tensor, Distribution]:
mean_actions = self.mu(latent_pi)
return self.action_dist.proba_distribution(mean_actions, self.log_std, latent_sde)
def _get_latent(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor]:
features = self.extract_features(obs)
latent_pi = self.latent_pi(features)
@ -126,9 +121,9 @@ class Actor(BasePolicy):
and entropy of the action distribution.
"""
latent_pi, latent_sde = self._get_latent(obs)
_, distribution = self._get_action_dist_from_latent(latent_pi, latent_sde)
mean_actions = self.mu(latent_pi)
distribution = self.action_dist.proba_distribution(mean_actions, self.log_std, latent_sde)
log_prob = distribution.log_prob(action)
# value = self.value_net(latent_vf)
return log_prob, distribution.entropy()
def reset_noise(self) -> None:
@ -150,8 +145,6 @@ class Actor(BasePolicy):
# -> set squash_output=True in the action_dist?
# NOTE: the clipping is done in the rollout for now
return self.mu(latent_pi) + noise
# action, _ = self._get_action_dist_from_latent(latent_pi)
# return action
else:
features = self.extract_features(obs)
return self.mu(features)
@ -338,9 +331,9 @@ class TD3Policy(BasePolicy):
return Critic(**self.net_args).to(self.device)
def forward(self, observation: th.Tensor, deterministic: bool = False):
return self.predict(observation, deterministic=deterministic)
return self._predict(observation, deterministic=deterministic)
def predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor:
def _predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor:
return self.actor(observation, deterministic=deterministic)

View file

@ -1 +1 @@
0.4.0a2
0.4.0a3