Mirror of https://github.com/saymrwulf/stable-baselines3.git, synced 2026-05-14 20:58:03 +00:00

Add save/load weights for policies and refactor action distributions

parent b782f3a208
commit fdecd512db

11 changed files with 319 additions and 211 deletions
@@ -10,10 +10,12 @@ Pre-Release 0.4.0a0 (WIP)
 Breaking Changes:
 ^^^^^^^^^^^^^^^^^
 - Removed CEMRL
+- Models saved with previous versions cannot be loaded (because of the pre-processing)

 New Features:
 ^^^^^^^^^^^^^
 - Add support for Discrete observation spaces
+- Add saving/loading for policy weights, so the policy can be used without the model

 Bug Fixes:
 ^^^^^^^^^^
@@ -26,6 +28,8 @@ Others:
 ^^^^^^^
 - Refactor handling of observation and action spaces
+- Refactored features extraction to have proper preprocessing
+- Refactored action distributions


 Documentation:
 ^^^^^^^^^^^^^^
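A rough usage sketch of the new policy save/load feature listed above (the import path and env id are assumptions; policy.save, policy.load, and policy.predict are the methods added later in this diff):

    import numpy as np
    from stable_baselines3 import SAC  # assumed import path for this mirror

    model = SAC('MlpPolicy', 'Pendulum-v0')
    model.learn(total_timesteps=500)

    # Save only the policy weights (a torch state_dict), not the whole model
    model.policy.save("policy_weights.pkl")

    # ...later, restore the weights into a compatible policy object
    model.policy.load("policy_weights.pkl")
    action, _ = model.policy.predict(np.zeros(3), deterministic=True)  # Pendulum obs is 3-dim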
@@ -38,7 +38,8 @@ def test_squashed_gaussian(model_class):
     gaussian_mean = th.rand(N_SAMPLES, N_ACTIONS)
     dist = SquashedDiagGaussianDistribution(N_ACTIONS)
     _, log_std = dist.proba_distribution_net(N_FEATURES)
-    actions, _ = dist.proba_distribution(gaussian_mean, log_std)
+    dist = dist.proba_distribution(gaussian_mean, log_std)
+    actions = dist.get_action()
     assert th.max(th.abs(actions)) <= 1.0

 def test_sde_distribution():
@@ -51,7 +52,8 @@ def test_sde_distribution():
     _, log_std = dist.proba_distribution_net(N_FEATURES)
     dist.sample_weights(log_std, batch_size=N_SAMPLES)

-    actions, _ = dist.proba_distribution(deterministic_actions, log_std, state)
+    dist = dist.proba_distribution(deterministic_actions, log_std, state)
+    actions = dist.get_action()

     assert th.allclose(actions.mean(), dist.distribution.mean.mean(), rtol=1e-3)
     assert th.allclose(actions.std(), dist.distribution.scale.mean(), rtol=1e-3)
@@ -71,11 +73,12 @@ def test_entropy(dist):
     _, log_std = dist.proba_distribution_net(N_FEATURES, log_std_init=th.log(th.tensor(0.2)))

     if isinstance(dist, DiagGaussianDistribution):
-        actions, dist = dist.proba_distribution(deterministic_actions, log_std)
+        dist = dist.proba_distribution(deterministic_actions, log_std)
     else:
         dist.sample_weights(log_std, batch_size=N_SAMPLES)
-        actions, dist = dist.proba_distribution(deterministic_actions, log_std, state)
+        dist = dist.proba_distribution(deterministic_actions, log_std, state)

+    actions = dist.get_action()
     entropy = dist.entropy()
     log_prob = dist.log_prob(actions)
     assert th.allclose(entropy.mean(), -log_prob.mean(), rtol=5e-3)
@@ -88,8 +91,9 @@ def test_categorical():
     set_random_seed(1)
     state = th.rand(N_SAMPLES, N_FEATURES)
     action_logits = th.rand(N_SAMPLES, N_ACTIONS)
-    actions, dist = dist.proba_distribution(action_logits)
+    dist = dist.proba_distribution(action_logits)

+    actions = dist.get_action()
     entropy = dist.entropy()
     log_prob = dist.log_prob(actions)
     assert th.allclose(entropy.mean(), -log_prob.mean(), rtol=1e-4)
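The pattern these tests now follow — build the distribution first, then draw from it — can be sketched standalone in plain torch (class and method names mirror the diff; this is an illustration, not the library class):

    import torch as th
    from torch.distributions import Normal

    class DiagGaussian:
        def proba_distribution(self, mean_actions, log_std):
            # Create the distribution from its parameters and return self,
            # so the call can be chained with get_action()
            self.distribution = Normal(mean_actions, th.ones_like(mean_actions) * log_std.exp())
            return self

        def get_action(self, deterministic=False):
            # Deterministic -> the mean; stochastic -> a reparameterized sample
            return self.distribution.mean if deterministic else self.distribution.rsample()

    dist = DiagGaussian().proba_distribution(th.zeros(4, 2), th.log(th.tensor(0.5)))
    actions = dist.get_action()
    assert actions.shape == (4, 2)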
@@ -20,7 +20,7 @@ def test_continuous(model_class):
     env = IdentityEnvBox(eps=0.5)

     n_steps = {
-        A2C: 3000,
+        A2C: 3500,
         PPO: 3000,
         SAC: 700,
         TD3: 500
@@ -16,7 +16,7 @@ MODEL_LIST = [
     SAC,
 ]

-
+#
 @pytest.mark.parametrize("model_class", MODEL_LIST)
 def test_save_load(model_class):
     """
@@ -160,3 +160,61 @@ def test_save_load_replay_buffer(model_class):

     # clear file from os
     os.remove(replay_path)
+
+
+@pytest.mark.parametrize("model_class", MODEL_LIST)
+def test_save_load_policy(model_class):
+    """
+    Test saving and loading policy only.
+
+    :param model_class: (BaseRLModel) an RL model
+    """
+    env = DummyVecEnv([lambda: IdentityEnvBox(10)])
+
+    # create model
+    model = model_class('MlpPolicy', env, policy_kwargs=dict(net_arch=[16]), verbose=1, create_eval_env=True)
+    model.learn(total_timesteps=500, eval_freq=250)
+
+    env.reset()
+    observations = np.array([env.step(env.action_space.sample())[0] for _ in range(10)])
+    observations = observations.reshape(10, -1)
+
+    policy = model.policy
+
+    # Get dictionary of current parameters
+    params = deepcopy(policy.state_dict())
+
+    # Modify all parameters to be random values
+    random_params = dict((param_name, th.rand_like(param)) for param_name, param in params.items())
+
+    # Update model parameters with the new random values
+    policy.load_state_dict(random_params)
+
+    new_params = policy.state_dict()
+    # Check that all params are different now
+    for k in params:
+        assert not th.allclose(params[k], new_params[k]), "Parameters did not change as expected."
+
+    params = new_params
+
+    # get selected actions
+    selected_actions, _ = policy.predict(observations, deterministic=True)
+
+    # Save and load policy
+    policy.save("./logs/policy_weights.pkl")
+    # del policy
+    policy.load("./logs/policy_weights.pkl")
+
+    # check if params are still the same after load
+    new_params = policy.state_dict()
+
+    # Check that all params are the same as before the save/load procedure
+    for key in params:
+        assert th.allclose(params[key], new_params[key]), "Policy parameters not the same after save and load."
+
+    # check if model still selects the same actions
+    new_selected_actions, _ = policy.predict(observations, deterministic=True)
+    assert np.allclose(selected_actions, new_selected_actions, 1e-4)
+
+    # clear file from os
+    os.remove("./logs/policy_weights.pkl")
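A note on the deepcopy in this test: state_dict() returns references to the live tensors, so a snapshot must be copied before the parameters are overwritten. The save/load round-trip the test relies on reduces to plain torch calls (standalone sketch; the file name is arbitrary):

    from copy import deepcopy

    import torch as th
    import torch.nn as nn

    net = nn.Linear(3, 2)
    snapshot = deepcopy(net.state_dict())      # a real copy, not references
    th.save(net.state_dict(), "weights.pkl")   # what policy.save() does internally
    net.load_state_dict(th.load("weights.pkl"))
    assert all(th.equal(snapshot[k], net.state_dict()[k]) for k in snapshot)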
@@ -158,27 +158,6 @@ class BaseRLModel(ABC):
         assert eval_env.num_envs == 1
         return eval_env

-    def scale_action(self, action: np.ndarray) -> np.ndarray:
-        """
-        Rescale the action from [low, high] to [-1, 1]
-        (no need for symmetric action space)
-
-        :param action: (np.ndarray) Action to scale
-        :return: (np.ndarray) Scaled action
-        """
-        low, high = self.action_space.low, self.action_space.high
-        return 2.0 * ((action - low) / (high - low)) - 1.0
-
-    def unscale_action(self, scaled_action: np.ndarray) -> np.ndarray:
-        """
-        Rescale the action from [-1, 1] to [low, high]
-        (no need for symmetric action space)
-
-        :param scaled_action: Action to un-scale
-        """
-        low, high = self.action_space.low, self.action_space.high
-        return low + (0.5 * (scaled_action + 1.0) * (high - low))
-
     def _setup_lr_schedule(self) -> None:
         """Transform to callable if needed."""
        self.lr_schedule = get_schedule_fn(self.learning_rate)
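The two helpers moved here are a plain affine map between the Box bounds and [-1, 1]; a quick standalone check of the formulas (numpy only, with example bounds low=-2, high=2):

    import numpy as np

    low, high = np.array([-2.0]), np.array([2.0])

    def scale_action(action):
        # [low, high] -> [-1, 1]
        return 2.0 * ((action - low) / (high - low)) - 1.0

    def unscale_action(scaled_action):
        # [-1, 1] -> [low, high]
        return low + (0.5 * (scaled_action + 1.0) * (high - low))

    a = np.array([1.0])
    assert np.allclose(scale_action(a), 0.5)                 # 1.0 sits 3/4 of the way up [-2, 2]
    assert np.allclose(unscale_action(scale_action(a)), a)   # round-trip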
@@ -318,57 +297,6 @@ class BaseRLModel(ABC):
         """
         raise NotImplementedError()

-    @staticmethod
-    def _is_vectorized_observation(observation: np.ndarray, observation_space: gym.spaces.Space) -> bool:
-        """
-        For every observation type, detects and validates the shape,
-        then returns whether or not the observation is vectorized.
-
-        :param observation: (np.ndarray) the input observation to validate
-        :param observation_space: (gym.spaces) the observation space
-        :return: (bool) whether the given observation is vectorized or not
-        """
-        if isinstance(observation_space, gym.spaces.Box):
-            if observation.shape == observation_space.shape:
-                return False
-            elif observation.shape[1:] == observation_space.shape:
-                return True
-            else:
-                raise ValueError("Error: Unexpected observation shape {} for ".format(observation.shape) +
-                                 "Box environment, please use {} ".format(observation_space.shape) +
-                                 "or (n_env, {}) for the observation shape."
-                                 .format(", ".join(map(str, observation_space.shape))))
-        elif isinstance(observation_space, gym.spaces.Discrete):
-            if observation.shape == ():  # A numpy array of a number, has shape empty tuple '()'
-                return False
-            elif len(observation.shape) == 1:
-                return True
-            else:
-                raise ValueError("Error: Unexpected observation shape {} for ".format(observation.shape) +
-                                 "Discrete environment, please use (1,) or (n_env, 1) for the observation shape.")
-        # TODO: add support for MultiDiscrete and MultiBinary observation spaces
-        # elif isinstance(observation_space, gym.spaces.MultiDiscrete):
-        #     if observation.shape == (len(observation_space.nvec),):
-        #         return False
-        #     elif len(observation.shape) == 2 and observation.shape[1] == len(observation_space.nvec):
-        #         return True
-        #     else:
-        #         raise ValueError("Error: Unexpected observation shape {} for MultiDiscrete ".format(observation.shape) +
-        #                          "environment, please use ({},) or ".format(len(observation_space.nvec)) +
-        #                          "(n_env, {}) for the observation shape.".format(len(observation_space.nvec)))
-        # elif isinstance(observation_space, gym.spaces.MultiBinary):
-        #     if observation.shape == (observation_space.n,):
-        #         return False
-        #     elif len(observation.shape) == 2 and observation.shape[1] == observation_space.n:
-        #         return True
-        #     else:
-        #         raise ValueError("Error: Unexpected observation shape {} for MultiBinary ".format(observation.shape) +
-        #                          "environment, please use ({},) or ".format(observation_space.n) +
-        #                          "(n_env, {}) for the observation shape.".format(observation_space.n))
-        else:
-            raise ValueError("Error: Cannot determine if the observation is vectorized with the space type {}."
-                             .format(observation_space))
-
     def predict(self, observation: np.ndarray,
                 state: Optional[np.ndarray] = None,
                 mask: Optional[np.ndarray] = None,
@@ -383,36 +311,7 @@ class BaseRLModel(ABC):
         :return: (Tuple[np.ndarray, Optional[np.ndarray]]) the model's action and the next state
             (used in recurrent policies)
         """
-        # TODO: move this block to BasePolicy
-        # if state is None:
-        #     state = self.initial_state
-        # if mask is None:
-        #     mask = [False for _ in range(self.n_envs)]
-        observation = np.array(observation)
-        vectorized_env = self._is_vectorized_observation(observation, self.observation_space)
-
-        observation = observation.reshape((-1,) + self.observation_space.shape)
-        observation = th.as_tensor(observation).to(self.device)
-        with th.no_grad():
-            actions = self.policy.predict(observation, deterministic=deterministic)
-        # Convert to numpy
-        actions = actions.cpu().numpy()
-
-        # Rescale to proper domain when using squashing
-        if isinstance(self.action_space, gym.spaces.Box) and self.policy.squash_output:
-            actions = self.unscale_action(actions)
-
-        clipped_actions = actions
-        # Clip the actions to avoid out of bound error when using gaussian distribution
-        if isinstance(self.action_space, gym.spaces.Box) and not self.policy.squash_output:
-            clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high)
-
-        if not vectorized_env:
-            if state is not None:
-                raise ValueError("Error: The environment must be vectorized when using recurrent policies.")
-            clipped_actions = clipped_actions[0]
-
-        return clipped_actions, state
+        return self.policy.predict(observation, state, mask, deterministic)

     @classmethod
     def load(cls, load_path: str, env: Optional[GymEnv] = None, **kwargs):
@@ -484,10 +383,7 @@ class BaseRLModel(ABC):
             raise ValueError(f"Error: the file {load_path} could not be found")

         # set device to cpu if cuda is not available
-        if th.cuda.is_available():
-            device = th.device('cuda')
-        else:
-            device = th.device('cpu')
+        device = th.device('cuda') if th.cuda.is_available() else th.device('cpu')

         # Open the zip archive and load data
         try:
@@ -534,20 +430,6 @@ class BaseRLModel(ABC):
                     # load the parameters with the right `map_location`
                     params[os.path.splitext(file_path)[0]] = th.load(file_content, map_location=device)

-            # for backward compatibility
-            if params.get('params') is not None:
-                params_copy = {}
-                for name in params:
-                    if name == 'params':
-                        params_copy['policy'] = params[name]
-                    elif name == 'opt':
-                        params_copy['policy.optimizer'] = params[name]
-                    # Special case for SAC
-                    elif name == 'ent_coef_optimizer':
-                        params_copy[name] = params[name]
-                    else:
-                        params_copy[name + '.optimizer'] = params[name]
-                params = params_copy
         except zipfile.BadZipFile:
             # load_path wasn't a zip file
             raise ValueError(f"Error: the file {load_path} wasn't a zip-file")
@@ -925,7 +807,7 @@ class OffPolicyRLModel(BaseRLModel):
             unscaled_action, _ = self.predict(obs, deterministic=False)

             # Rescale the action from [low, high] to [-1, 1]
-            scaled_action = self.scale_action(unscaled_action)
+            scaled_action = self.policy.scale_action(unscaled_action)

             if self.use_sde:
                 # When using SDE, the action can be out of bounds
@@ -941,7 +823,7 @@ class OffPolicyRLModel(BaseRLModel):
                 clipped_action = np.clip(clipped_action + action_noise(), -1, 1)

             # Rescale and perform action
-            new_obs, reward, done, infos = env.step(self.unscale_action(clipped_action))
+            new_obs, reward, done, infos = env.step(self.policy.unscale_action(clipped_action))

             # Only stop training if return value is False, not when it is None.
             if callback.on_step() is False:
@@ -33,12 +33,52 @@ class Distribution(object):

     def sample(self) -> th.Tensor:
         """
-        returns a sample from the probability distribution
+        Returns a sample from the probability distribution

         :return: (th.Tensor) the stochastic action
         """
         raise NotImplementedError

+    def mode(self) -> th.Tensor:
+        """
+        Returns the most likely action (deterministic output)
+        from the probability distribution
+
+        :return: (th.Tensor) the stochastic action
+        """
+        raise NotImplementedError
+
+    def get_action(self, deterministic: bool = False) -> th.Tensor:
+        """
+        Return an action according to the probability distribution.
+
+        :param deterministic: (bool)
+        :return: (th.Tensor)
+        """
+        if deterministic:
+            return self.mode()
+        else:
+            return self.sample()
+
+    def action_from_params(self, *args, **kwargs) -> th.Tensor:
+        """
+        Returns a sample from the probability distribution
+        given its parameters.
+
+        :return: (th.Tensor) the action
+        """
+        raise NotImplementedError
+
+    def log_prob_from_params(self, *args, **kwargs) -> Tuple[th.Tensor, th.Tensor]:
+        """
+        Returns a sample and the associated log probability
+        from the probability distribution
+        given its parameters.
+
+        :return: (th.Tuple[th.Tensor, th.Tensor]) action and log prob
+        """
+        raise NotImplementedError
+

 def sum_independent_dims(tensor: th.Tensor) -> th.Tensor:
     """
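How the new template methods fit together can be shown with a tiny standalone categorical example (a sketch mirroring the interface above, not the library class):

    import torch as th
    from torch.distributions import Categorical

    class CatDist:
        def proba_distribution(self, action_logits):
            self.distribution = Categorical(logits=action_logits)
            return self

        def mode(self):
            return th.argmax(self.distribution.probs, dim=1)

        def sample(self):
            return self.distribution.sample()

        def get_action(self, deterministic=False):
            return self.mode() if deterministic else self.sample()

        def action_from_params(self, action_logits, deterministic=False):
            # update the distribution, then draw from it
            self.proba_distribution(action_logits)
            return self.get_action(deterministic=deterministic)

        def log_prob_from_params(self, action_logits):
            action = self.action_from_params(action_logits)
            return action, self.distribution.log_prob(action)

    action, log_prob = CatDist().log_prob_from_params(th.rand(5, 3))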
@@ -88,23 +128,17 @@ class DiagGaussianDistribution(Distribution):
         return mean_actions, log_std

     def proba_distribution(self, mean_actions: th.Tensor,
-                           log_std: th.Tensor,
-                           deterministic: bool = False) -> Tuple[th.Tensor, 'DiagGaussianDistribution']:
+                           log_std: th.Tensor) -> 'DiagGaussianDistribution':
         """
-        Create and sample for the distribution given its parameters (mean, std)
+        Create the distribution given its parameters (mean, std)

         :param mean_actions: (th.Tensor)
         :param log_std: (th.Tensor)
-        :param deterministic: (bool)
-        :return: (th.Tensor)
+        :return: (DiagGaussianDistribution)
         """
         action_std = th.ones_like(mean_actions) * log_std.exp()
         self.distribution = Normal(mean_actions, action_std)
-        if deterministic:
-            action = self.mode()
-        else:
-            action = self.sample()
-        return action, self
+        return self

     def mode(self) -> th.Tensor:
         return self.distribution.mean
@@ -115,7 +149,15 @@ class DiagGaussianDistribution(Distribution):
     def entropy(self) -> th.Tensor:
         return sum_independent_dims(self.distribution.entropy())

-    def log_prob_from_params(self, mean_actions: th.Tensor, log_std: th.Tensor) -> Tuple[th.Tensor, th.Tensor]:
+    def action_from_params(self, mean_actions: th.Tensor,
+                           log_std: th.Tensor,
+                           deterministic: bool = False) -> th.Tensor:
+        # Update the proba distribution
+        self.proba_distribution(mean_actions, log_std)
+        return self.get_action(deterministic=deterministic)
+
+    def log_prob_from_params(self, mean_actions: th.Tensor,
+                             log_std: th.Tensor) -> Tuple[th.Tensor, th.Tensor]:
         """
         Compute the log probability of taking an action
         given the distribution parameters.
@@ -124,7 +166,7 @@ class DiagGaussianDistribution(Distribution):
         :param log_std: (th.Tensor)
         :return: (Tuple[th.Tensor, th.Tensor])
         """
-        action, _ = self.proba_distribution(mean_actions, log_std)
+        action = self.action_from_params(mean_actions, log_std)
         log_prob = self.log_prob(action)
         return action, log_prob
|
@ -156,10 +198,10 @@ class SquashedDiagGaussianDistribution(DiagGaussianDistribution):
|
|||
self.epsilon = epsilon
|
||||
self.gaussian_action = None
|
||||
|
||||
def proba_distribution(self, mean_actions, log_std, deterministic=False):
|
||||
action, _ = super(SquashedDiagGaussianDistribution, self).proba_distribution(mean_actions, log_std,
|
||||
deterministic)
|
||||
return action, self
|
||||
def proba_distribution(self, mean_actions: th.Tensor,
|
||||
log_std: th.Tensor) -> 'SquashedDiagGaussianDistribution':
|
||||
super(SquashedDiagGaussianDistribution, self).proba_distribution(mean_actions, log_std)
|
||||
return self
|
||||
|
||||
def mode(self) -> th.Tensor:
|
||||
self.gaussian_action = self.distribution.mean
|
||||
|
|
@@ -175,12 +217,14 @@ class SquashedDiagGaussianDistribution(DiagGaussianDistribution):
         self.gaussian_action = self.distribution.rsample()
         return th.tanh(self.gaussian_action)

-    def log_prob_from_params(self, mean_actions, log_std) -> Tuple[th.Tensor, th.Tensor]:
-        action, _ = self.proba_distribution(mean_actions, log_std)
+    def log_prob_from_params(self, mean_actions: th.Tensor,
+                             log_std: th.Tensor) -> Tuple[th.Tensor, th.Tensor]:
+        action = self.action_from_params(mean_actions, log_std)
         log_prob = self.log_prob(action, self.gaussian_action)
         return action, log_prob

-    def log_prob(self, action: th.Tensor, gaussian_action: Optional[th.Tensor] = None) -> th.Tensor:
+    def log_prob(self, action: th.Tensor,
+                 gaussian_action: Optional[th.Tensor] = None) -> th.Tensor:
         # Inverse tanh
         # Naive implementation (not stable): 0.5 * torch.log((1 + x) / (1 - x))
         # We use numpy to avoid numerical instability
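For reference, the squashing correction this log_prob applies is the standard change of variables for action = tanh(gaussian_action): the Gaussian log-density minus sum log(1 - tanh(u)^2 + epsilon) over the action dimensions. A standalone numeric sketch (the epsilon value here is an assumption):

    import torch as th
    from torch.distributions import Normal

    eps = 1e-6
    gaussian = Normal(th.zeros(4, 2), th.ones(4, 2))
    u = gaussian.rsample()                  # unsquashed gaussian action
    a = th.tanh(u)                          # squashed action in (-1, 1)
    log_prob = gaussian.log_prob(u).sum(dim=1) - th.sum(th.log(1.0 - a ** 2 + eps), dim=1)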
@@ -220,14 +264,9 @@ class CategoricalDistribution(Distribution):
         action_logits = nn.Linear(latent_dim, self.action_dim)
         return action_logits

-    def proba_distribution(self, action_logits: th.Tensor,
-                           deterministic: bool = False) -> Tuple[th.Tensor, 'CategoricalDistribution']:
+    def proba_distribution(self, action_logits: th.Tensor) -> 'CategoricalDistribution':
         self.distribution = Categorical(logits=action_logits)
-        if deterministic:
-            action = self.mode()
-        else:
-            action = self.sample()
-        return action, self
+        return self

     def mode(self) -> th.Tensor:
         return th.argmax(self.distribution.probs, dim=1)
@@ -238,8 +277,14 @@ class CategoricalDistribution(Distribution):
     def entropy(self) -> th.Tensor:
         return self.distribution.entropy()

+    def action_from_params(self, action_logits: th.Tensor,
+                           deterministic: bool = False) -> th.Tensor:
+        # Update the proba distribution
+        self.proba_distribution(action_logits)
+        return self.get_action(deterministic=deterministic)
+
     def log_prob_from_params(self, action_logits: th.Tensor) -> Tuple[th.Tensor, th.Tensor]:
-        action, _ = self.proba_distribution(action_logits)
+        action = self.action_from_params(action_logits)
         log_prob = self.log_prob(action)
         return action, log_prob
@@ -283,6 +328,7 @@ class StateDependentNoiseDistribution(Distribution):
         self.weights_dist = None
         self.exploration_mat = None
         self.exploration_matrices = None
+        self._latent_sde = None
         self.use_expln = use_expln
         self.full_std = full_std
         self.epsilon = epsilon
@@ -358,27 +404,26 @@ class StateDependentNoiseDistribution(Distribution):

     def proba_distribution(self, mean_actions: th.Tensor,
                            log_std: th.Tensor,
-                           latent_sde: th.Tensor,
-                           deterministic: bool = False) -> Tuple[th.Tensor, 'StateDependentNoiseDistribution']:
+                           latent_sde: th.Tensor) -> 'StateDependentNoiseDistribution':
         """
         Create and sample for the distribution given its parameters (mean, std)

         :param mean_actions: (th.Tensor)
         :param log_std: (th.Tensor)
         :param latent_sde: (th.Tensor)
-        :param deterministic: (bool)
-        :return: (Tuple[th.Tensor, Distribution])
+        :return: (StateDependentNoiseDistribution)
         """
         # Stop gradient if we don't want to influence the features
-        latent_sde = latent_sde if self.learn_features else latent_sde.detach()
+        self._latent_sde = latent_sde if self.learn_features else latent_sde.detach()
         variance = th.mm(latent_sde ** 2, self.get_std(log_std) ** 2)
         self.distribution = Normal(mean_actions, th.sqrt(variance + self.epsilon))
+        return self
+
+    def get_action(self, deterministic: bool = False) -> th.Tensor:
         if deterministic:
-            action = self.mode()
+            return self.mode()
         else:
-            action = self.sample(latent_sde)
-        return action, self
+            return self.sample(self._latent_sde)

     def mode(self) -> th.Tensor:
         action = self.distribution.mean
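The gSDE variance line above computes, per action dimension, a sum over features of latent^2 * std^2; a standalone shape-check sketch (dimensions chosen arbitrarily):

    import torch as th
    from torch.distributions import Normal

    latent_sde = th.rand(8, 16)            # batch of exploration features
    std = th.ones(16, 2) * 0.5             # one std per (feature, action) pair
    variance = th.mm(latent_sde ** 2, std ** 2)
    dist = Normal(th.zeros(8, 2), th.sqrt(variance + 1e-6))
    assert dist.sample().shape == (8, 2)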
@@ -412,10 +457,18 @@ class StateDependentNoiseDistribution(Distribution):
             return None
         return sum_independent_dims(self.distribution.entropy())

+    def action_from_params(self, mean_actions: th.Tensor,
+                           log_std: th.Tensor,
+                           latent_sde: th.Tensor,
+                           deterministic: bool = False) -> th.Tensor:
+        # Update the proba distribution
+        self.proba_distribution(mean_actions, log_std, latent_sde)
+        return self.get_action(deterministic=deterministic)
+
     def log_prob_from_params(self, mean_actions: th.Tensor,
                              log_std: th.Tensor,
                              latent_sde: th.Tensor) -> Tuple[th.Tensor, th.Tensor]:
-        action, _ = self.proba_distribution(mean_actions, log_std, latent_sde)
+        action = self.action_from_params(mean_actions, log_std, latent_sde)
         log_prob = self.log_prob(action)
         return action, log_prob
@@ -63,7 +63,7 @@ class BasePolicy(nn.Module):
     def forward(self, *_args, **kwargs):
         raise NotImplementedError()

-    def predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor:
+    def _predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor:
         """
         Get the action according to the policy for a given observation.

@@ -73,21 +73,145 @@ class BasePolicy(nn.Module):
         """
         raise NotImplementedError()

+    def predict(self, observation: np.ndarray,
+                state: Optional[np.ndarray] = None,
+                mask: Optional[np.ndarray] = None,
+                deterministic: bool = False) -> Tuple[np.ndarray, Optional[np.ndarray]]:
+        """
+        Get the policy action and state from an observation (and optional state).
+
+        :param observation: (np.ndarray) the input observation
+        :param state: (Optional[np.ndarray]) The last states (can be None, used in recurrent policies)
+        :param mask: (Optional[np.ndarray]) The last masks (can be None, used in recurrent policies)
+        :param deterministic: (bool) Whether or not to return deterministic actions.
+        :return: (Tuple[np.ndarray, Optional[np.ndarray]]) the model's action and the next state
+            (used in recurrent policies)
+        """
+        # if state is None:
+        #     state = self.initial_state
+        # if mask is None:
+        #     mask = [False for _ in range(self.n_envs)]
+        observation = np.array(observation)
+        vectorized_env = self._is_vectorized_observation(observation, self.observation_space)
+
+        observation = observation.reshape((-1,) + self.observation_space.shape)
+        observation = th.as_tensor(observation).to(self.device)
+        with th.no_grad():
+            actions = self._predict(observation, deterministic=deterministic)
+        # Convert to numpy
+        actions = actions.cpu().numpy()
+
+        # Rescale to proper domain when using squashing
+        if isinstance(self.action_space, gym.spaces.Box) and self.squash_output:
+            actions = self.unscale_action(actions)
+
+        clipped_actions = actions
+        # Clip the actions to avoid out of bound error when using gaussian distribution
+        if isinstance(self.action_space, gym.spaces.Box) and not self.squash_output:
+            clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high)
+
+        if not vectorized_env:
+            if state is not None:
+                raise ValueError("Error: The environment must be vectorized when using recurrent policies.")
+            clipped_actions = clipped_actions[0]
+
+        return clipped_actions, state
+
+    def scale_action(self, action: np.ndarray) -> np.ndarray:
+        """
+        Rescale the action from [low, high] to [-1, 1]
+        (no need for symmetric action space)
+
+        :param action: (np.ndarray) Action to scale
+        :return: (np.ndarray) Scaled action
+        """
+        low, high = self.action_space.low, self.action_space.high
+        return 2.0 * ((action - low) / (high - low)) - 1.0
+
+    def unscale_action(self, scaled_action: np.ndarray) -> np.ndarray:
+        """
+        Rescale the action from [-1, 1] to [low, high]
+        (no need for symmetric action space)
+
+        :param scaled_action: Action to un-scale
+        """
+        low, high = self.action_space.low, self.action_space.high
+        return low + (0.5 * (scaled_action + 1.0) * (high - low))
+
+    @staticmethod
+    def _is_vectorized_observation(observation: np.ndarray, observation_space: gym.spaces.Space) -> bool:
+        """
+        For every observation type, detects and validates the shape,
+        then returns whether or not the observation is vectorized.
+
+        :param observation: (np.ndarray) the input observation to validate
+        :param observation_space: (gym.spaces) the observation space
+        :return: (bool) whether the given observation is vectorized or not
+        """
+        if isinstance(observation_space, gym.spaces.Box):
+            if observation.shape == observation_space.shape:
+                return False
+            elif observation.shape[1:] == observation_space.shape:
+                return True
+            else:
+                raise ValueError("Error: Unexpected observation shape {} for ".format(observation.shape) +
+                                 "Box environment, please use {} ".format(observation_space.shape) +
+                                 "or (n_env, {}) for the observation shape."
+                                 .format(", ".join(map(str, observation_space.shape))))
+        elif isinstance(observation_space, gym.spaces.Discrete):
+            if observation.shape == ():  # A numpy array of a number, has shape empty tuple '()'
+                return False
+            elif len(observation.shape) == 1:
+                return True
+            else:
+                raise ValueError("Error: Unexpected observation shape {} for ".format(observation.shape) +
+                                 "Discrete environment, please use (1,) or (n_env, 1) for the observation shape.")
+        # TODO: add support for MultiDiscrete and MultiBinary observation spaces
+        # elif isinstance(observation_space, gym.spaces.MultiDiscrete):
+        #     if observation.shape == (len(observation_space.nvec),):
+        #         return False
+        #     elif len(observation.shape) == 2 and observation.shape[1] == len(observation_space.nvec):
+        #         return True
+        #     else:
+        #         raise ValueError("Error: Unexpected observation shape {} for MultiDiscrete ".format(observation.shape) +
+        #                          "environment, please use ({},) or ".format(len(observation_space.nvec)) +
+        #                          "(n_env, {}) for the observation shape.".format(len(observation_space.nvec)))
+        # elif isinstance(observation_space, gym.spaces.MultiBinary):
+        #     if observation.shape == (observation_space.n,):
+        #         return False
+        #     elif len(observation.shape) == 2 and observation.shape[1] == observation_space.n:
+        #         return True
+        #     else:
+        #         raise ValueError("Error: Unexpected observation shape {} for MultiBinary ".format(observation.shape) +
+        #                          "environment, please use ({},) or ".format(observation_space.n) +
+        #                          "(n_env, {}) for the observation shape.".format(observation_space.n))
+        else:
+            raise ValueError("Error: Cannot determine if the observation is vectorized with the space type {}."
+                             .format(observation_space))
+
     def save(self, path: str) -> None:
         """
-        Save model to a given location.
+        Save policy weights to a given location.
+        NOTE: we don't save policy parameters

         :param path: (str)
         """
         previous_device = self.device
         # Convert to cpu before saving
         self = self.to('cpu')
         th.save(self.state_dict(), path)
         self = self.to(previous_device)

     def load(self, path: str) -> None:
         """
-        Load saved model from path.
+        Load policy weights from path.
+        NOTE: we don't load policy parameters

         :param path: (str)
         """
         self.load_state_dict(th.load(path))
         self = self.to(self.device)

     def load_from_vector(self, vector: np.ndarray):
         """
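The Box branch of _is_vectorized_observation distinguishes a single observation from a batch purely by shape; a standalone illustration of the two accepted shapes (assuming gym is available):

    import numpy as np
    import gym

    space = gym.spaces.Box(low=-1.0, high=1.0, shape=(3,))
    single = np.zeros(3)           # shape == space.shape      -> not vectorized
    batch = np.zeros((5, 3))       # shape[1:] == space.shape  -> vectorized
    assert single.shape == space.shape
    assert batch.shape[1:] == space.shape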
@@ -156,9 +156,9 @@ class PPOPolicy(BasePolicy):
         latent_pi, latent_vf, latent_sde = self._get_latent(obs)
         # Evaluate the values for the given observations
         value = self.value_net(latent_vf)
-        action, action_distribution = self._get_action_dist_from_latent(latent_pi, latent_sde=latent_sde,
-                                                                        deterministic=deterministic)
-        log_prob = action_distribution.log_prob(action)
+        distribution = self._get_action_dist_from_latent(latent_pi, latent_sde=latent_sde)
+        action = distribution.get_action(deterministic=deterministic)
+        log_prob = distribution.log_prob(action)
         return action, value, log_prob

     def _get_latent(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor, th.Tensor]:
@@ -180,33 +180,29 @@ class PPOPolicy(BasePolicy):
         return latent_pi, latent_vf, latent_sde

     def _get_action_dist_from_latent(self, latent_pi: th.Tensor,
-                                     latent_sde: Optional[th.Tensor] = None,
-                                     deterministic: bool = False) -> Tuple[th.Tensor, Distribution]:
+                                     latent_sde: Optional[th.Tensor] = None) -> Distribution:
         """
-        Retrieve action and associated action distribution
-        given the latent codes.
+        Retrieve action distribution given the latent codes.

         :param latent_pi: (th.Tensor) Latent code for the actor
         :param latent_sde: (Optional[th.Tensor]) Latent code for the SDE exploration function
-        :param deterministic: (bool) Whether to sample or use deterministic actions
-        :return: (Tuple[th.Tensor, Distribution]) Action and action distribution
+        :return: (Distribution) Action distribution
         """
         mean_actions = self.action_net(latent_pi)

         if isinstance(self.action_dist, DiagGaussianDistribution):
-            return self.action_dist.proba_distribution(mean_actions, self.log_std, deterministic=deterministic)
+            return self.action_dist.proba_distribution(mean_actions, self.log_std)

         elif isinstance(self.action_dist, CategoricalDistribution):
             # Here mean_actions are the logits before the softmax
-            return self.action_dist.proba_distribution(mean_actions, deterministic=deterministic)
+            return self.action_dist.proba_distribution(action_logits=mean_actions)

         elif isinstance(self.action_dist, StateDependentNoiseDistribution):
-            return self.action_dist.proba_distribution(mean_actions, self.log_std, latent_sde,
-                                                       deterministic=deterministic)
+            return self.action_dist.proba_distribution(mean_actions, self.log_std, latent_sde)
         else:
             raise ValueError('Invalid action distribution')

-    def predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor:
+    def _predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor:
         """
         Get the action according to the policy for a given observation.

@@ -215,27 +211,25 @@ class PPOPolicy(BasePolicy):
         :return: (th.Tensor) Taken action according to the policy
         """
         latent_pi, _, latent_sde = self._get_latent(observation)
-        action, _ = self._get_action_dist_from_latent(latent_pi, latent_sde, deterministic=deterministic)
-        return action
+        distribution = self._get_action_dist_from_latent(latent_pi, latent_sde)
+        return distribution.get_action(deterministic=deterministic)

     def evaluate_actions(self, obs: th.Tensor,
-                         actions: th.Tensor,
-                         deterministic: bool = False) -> Tuple[th.Tensor, th.Tensor, th.Tensor]:
+                         actions: th.Tensor) -> Tuple[th.Tensor, th.Tensor, th.Tensor]:
         """
         Evaluate actions according to the current policy,
         given the observations.

         :param obs: (th.Tensor)
         :param actions: (th.Tensor)
-        :param deterministic: (bool)
         :return: (th.Tensor, th.Tensor, th.Tensor) estimated value, log likelihood of taking those actions
             and entropy of the action distribution.
         """
         latent_pi, latent_vf, latent_sde = self._get_latent(obs)
-        _, action_distribution = self._get_action_dist_from_latent(latent_pi, latent_sde, deterministic=deterministic)
-        log_prob = action_distribution.log_prob(actions)
+        distribution = self._get_action_dist_from_latent(latent_pi, latent_sde)
+        log_prob = distribution.log_prob(actions)
         values = self.value_net(latent_vf)
-        return values, log_prob, action_distribution.entropy()
+        return values, log_prob, distribution.entropy()


 MlpPolicy = PPOPolicy
@@ -108,14 +108,11 @@ class Actor(BasePolicy):
             'reset_noise() is only available when using SDE'
         self.action_dist.sample_weights(self.log_std, batch_size=batch_size)

-    def _get_latent(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor]:
+    def get_action_dist_params(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor, th.Tensor]:
         features = self.extract_features(obs)
         latent_pi = self.latent_pi(features)
         latent_sde = self.sde_features_extractor(features) if self.sde_features_extractor is not None else latent_pi
-        return latent_pi, latent_sde
-
-    def get_action_dist_params(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor, th.Tensor]:
-        latent_pi, latent_sde = self._get_latent(obs)
         mean_actions = self.mu(latent_pi)

         if self.use_sde:
@@ -130,9 +127,8 @@ class Actor(BasePolicy):
         mean_actions, log_std, latent_sde = self.get_action_dist_params(obs)
         kwargs = dict(latent_sde=latent_sde) if self.use_sde else {}
         # Note: the action is squashed
-        action, _ = self.action_dist.proba_distribution(mean_actions, log_std,
-                                                        deterministic=deterministic, **kwargs)
-        return action
+        return self.action_dist.action_from_params(mean_actions, log_std,
+                                                   deterministic=deterministic, **kwargs)

     def action_log_prob(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor]:
         mean_actions, log_std, latent_sde = self.get_action_dist_params(obs)
@@ -268,7 +264,7 @@ class SACPolicy(BasePolicy):
     def forward(self, obs: th.Tensor) -> th.Tensor:
         return self.predict(obs, deterministic=False)

-    def predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor:
+    def _predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor:
         return self.actor(observation, deterministic)
@@ -104,11 +104,6 @@ class Actor(BasePolicy):
         """
         return self.action_dist.get_std(self.log_std)

-    def _get_action_dist_from_latent(self, latent_pi: th.Tensor,
-                                     latent_sde: th.Tensor) -> Tuple[th.Tensor, Distribution]:
-        mean_actions = self.mu(latent_pi)
-        return self.action_dist.proba_distribution(mean_actions, self.log_std, latent_sde)
-
     def _get_latent(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor]:
         features = self.extract_features(obs)
         latent_pi = self.latent_pi(features)
@@ -126,9 +121,9 @@ class Actor(BasePolicy):
             and entropy of the action distribution.
         """
         latent_pi, latent_sde = self._get_latent(obs)
-        _, distribution = self._get_action_dist_from_latent(latent_pi, latent_sde)
+        mean_actions = self.mu(latent_pi)
+        distribution = self.action_dist.proba_distribution(mean_actions, self.log_std, latent_sde)
         log_prob = distribution.log_prob(action)
         # value = self.value_net(latent_vf)
         return log_prob, distribution.entropy()

     def reset_noise(self) -> None:
@@ -150,8 +145,6 @@ class Actor(BasePolicy):
             # -> set squash_output=True in the action_dist?
             # NOTE: the clipping is done in the rollout for now
             return self.mu(latent_pi) + noise
-            # action, _ = self._get_action_dist_from_latent(latent_pi)
-            # return action
         else:
             features = self.extract_features(obs)
             return self.mu(features)
@@ -338,9 +331,9 @@ class TD3Policy(BasePolicy):
         return Critic(**self.net_args).to(self.device)

     def forward(self, observation: th.Tensor, deterministic: bool = False):
-        return self.predict(observation, deterministic=deterministic)
+        return self._predict(observation, deterministic=deterministic)

-    def predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor:
+    def _predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor:
         return self.actor(observation, deterministic=deterministic)
@@ -1 +1 @@
-0.4.0a2
+0.4.0a3