Mirror of https://github.com/saymrwulf/stable-baselines3.git, synced 2026-05-14 20:58:03 +00:00

Add save/load weights for policies and refactor action distributions

parent b782f3a208
commit fdecd512db

11 changed files with 319 additions and 211 deletions
@@ -10,10 +10,12 @@ Pre-Release 0.4.0a0 (WIP)
 Breaking Changes:
 ^^^^^^^^^^^^^^^^^
 - Removed CEMRL
+- Models saved with previous versions cannot be loaded (because of the pre-processing)

 New Features:
 ^^^^^^^^^^^^^
 - Add support for Discrete observation spaces
+- Add saving/loading for policy weights, so the policy can be used without the model

 Bug Fixes:
 ^^^^^^^^^^
@@ -26,6 +28,8 @@ Others:
 ^^^^^^^
 - Refactor handling of observation and action spaces
+- Refactored features extraction to have proper preprocessing
+- Refactored action distributions


 Documentation:
 ^^^^^^^^^^^^^^
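A rough usage sketch of the new policy save/load feature listed above (the import path and env id are assumptions; policy.save, policy.load, and policy.predict are the methods added later in this diff):

    import numpy as np
    from stable_baselines3 import SAC  # assumed import path for this mirror

    model = SAC('MlpPolicy', 'Pendulum-v0')
    model.learn(total_timesteps=500)

    # Save only the policy weights (a torch state_dict), not the whole model
    model.policy.save("policy_weights.pkl")

    # ...later, restore the weights into a compatible policy object
    model.policy.load("policy_weights.pkl")
    action, _ = model.policy.predict(np.zeros(3), deterministic=True)  # Pendulum obs is 3-dim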
@@ -38,7 +38,8 @@ def test_squashed_gaussian(model_class):
     gaussian_mean = th.rand(N_SAMPLES, N_ACTIONS)
     dist = SquashedDiagGaussianDistribution(N_ACTIONS)
     _, log_std = dist.proba_distribution_net(N_FEATURES)
-    actions, _ = dist.proba_distribution(gaussian_mean, log_std)
+    dist = dist.proba_distribution(gaussian_mean, log_std)
+    actions = dist.get_action()
     assert th.max(th.abs(actions)) <= 1.0

 def test_sde_distribution():
@@ -51,7 +52,8 @@ def test_sde_distribution():
     _, log_std = dist.proba_distribution_net(N_FEATURES)
     dist.sample_weights(log_std, batch_size=N_SAMPLES)

-    actions, _ = dist.proba_distribution(deterministic_actions, log_std, state)
+    dist = dist.proba_distribution(deterministic_actions, log_std, state)
+    actions = dist.get_action()

     assert th.allclose(actions.mean(), dist.distribution.mean.mean(), rtol=1e-3)
     assert th.allclose(actions.std(), dist.distribution.scale.mean(), rtol=1e-3)
@@ -71,11 +73,12 @@ def test_entropy(dist):
     _, log_std = dist.proba_distribution_net(N_FEATURES, log_std_init=th.log(th.tensor(0.2)))

     if isinstance(dist, DiagGaussianDistribution):
-        actions, dist = dist.proba_distribution(deterministic_actions, log_std)
+        dist = dist.proba_distribution(deterministic_actions, log_std)
     else:
         dist.sample_weights(log_std, batch_size=N_SAMPLES)
-        actions, dist = dist.proba_distribution(deterministic_actions, log_std, state)
+        dist = dist.proba_distribution(deterministic_actions, log_std, state)

+    actions = dist.get_action()
     entropy = dist.entropy()
     log_prob = dist.log_prob(actions)
     assert th.allclose(entropy.mean(), -log_prob.mean(), rtol=5e-3)
@@ -88,8 +91,9 @@ def test_categorical():
     set_random_seed(1)
     state = th.rand(N_SAMPLES, N_FEATURES)
     action_logits = th.rand(N_SAMPLES, N_ACTIONS)
-    actions, dist = dist.proba_distribution(action_logits)
+    dist = dist.proba_distribution(action_logits)

+    actions = dist.get_action()
     entropy = dist.entropy()
     log_prob = dist.log_prob(actions)
     assert th.allclose(entropy.mean(), -log_prob.mean(), rtol=1e-4)
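The pattern these tests now follow — build the distribution first, then draw from it — can be sketched standalone in plain torch (class and method names mirror the diff; this is an illustration, not the library class):

    import torch as th
    from torch.distributions import Normal

    class DiagGaussian:
        def proba_distribution(self, mean_actions, log_std):
            # Create the distribution from its parameters and return self,
            # so the call can be chained with get_action()
            self.distribution = Normal(mean_actions, th.ones_like(mean_actions) * log_std.exp())
            return self

        def get_action(self, deterministic=False):
            # Deterministic -> the mean; stochastic -> a reparameterized sample
            return self.distribution.mean if deterministic else self.distribution.rsample()

    dist = DiagGaussian().proba_distribution(th.zeros(4, 2), th.log(th.tensor(0.5)))
    actions = dist.get_action()
    assert actions.shape == (4, 2)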
@@ -20,7 +20,7 @@ def test_continuous(model_class):
     env = IdentityEnvBox(eps=0.5)

     n_steps = {
-        A2C: 3000,
+        A2C: 3500,
         PPO: 3000,
         SAC: 700,
         TD3: 500
@@ -16,7 +16,7 @@ MODEL_LIST = [
     SAC,
 ]

-
+#
 @pytest.mark.parametrize("model_class", MODEL_LIST)
 def test_save_load(model_class):
     """
@@ -160,3 +160,61 @@ def test_save_load_replay_buffer(model_class):

     # clear file from os
     os.remove(replay_path)
+
+
+@pytest.mark.parametrize("model_class", MODEL_LIST)
+def test_save_load_policy(model_class):
+    """
+    Test saving and loading policy only.
+
+    :param model_class: (BaseRLModel) an RL model
+    """
+    env = DummyVecEnv([lambda: IdentityEnvBox(10)])
+
+    # create model
+    model = model_class('MlpPolicy', env, policy_kwargs=dict(net_arch=[16]), verbose=1, create_eval_env=True)
+    model.learn(total_timesteps=500, eval_freq=250)
+
+    env.reset()
+    observations = np.array([env.step(env.action_space.sample())[0] for _ in range(10)])
+    observations = observations.reshape(10, -1)
+
+    policy = model.policy
+
+    # Get dictionary of current parameters
+    params = deepcopy(policy.state_dict())
+
+    # Modify all parameters to be random values
+    random_params = dict((param_name, th.rand_like(param)) for param_name, param in params.items())
+
+    # Update model parameters with the new random values
+    policy.load_state_dict(random_params)
+
+    new_params = policy.state_dict()
+    # Check that all params are different now
+    for k in params:
+        assert not th.allclose(params[k], new_params[k]), "Parameters did not change as expected."
+
+    params = new_params
+
+    # get selected actions
+    selected_actions, _ = policy.predict(observations, deterministic=True)
+
+    # Save and load policy
+    policy.save("./logs/policy_weights.pkl")
+    # del policy
+    policy.load("./logs/policy_weights.pkl")
+
+    # check if params are still the same after load
+    new_params = policy.state_dict()
+
+    # Check that all params are the same as before the save/load procedure
+    for key in params:
+        assert th.allclose(params[key], new_params[key]), "Policy parameters not the same after save and load."
+
+    # check if model still selects the same actions
+    new_selected_actions, _ = policy.predict(observations, deterministic=True)
+    assert np.allclose(selected_actions, new_selected_actions, 1e-4)
+
+    # clear file from os
+    os.remove("./logs/policy_weights.pkl")
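A note on the deepcopy in this test: state_dict() returns references to the live tensors, so a snapshot must be copied before the parameters are overwritten. The save/load round-trip the test relies on reduces to plain torch calls (standalone sketch; the file name is arbitrary):

    from copy import deepcopy

    import torch as th
    import torch.nn as nn

    net = nn.Linear(3, 2)
    snapshot = deepcopy(net.state_dict())      # a real copy, not references
    th.save(net.state_dict(), "weights.pkl")   # what policy.save() does internally
    net.load_state_dict(th.load("weights.pkl"))
    assert all(th.equal(snapshot[k], net.state_dict()[k]) for k in snapshot)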
@@ -158,27 +158,6 @@ class BaseRLModel(ABC):
         assert eval_env.num_envs == 1
         return eval_env

-    def scale_action(self, action: np.ndarray) -> np.ndarray:
-        """
-        Rescale the action from [low, high] to [-1, 1]
-        (no need for symmetric action space)
-
-        :param action: (np.ndarray) Action to scale
-        :return: (np.ndarray) Scaled action
-        """
-        low, high = self.action_space.low, self.action_space.high
-        return 2.0 * ((action - low) / (high - low)) - 1.0
-
-    def unscale_action(self, scaled_action: np.ndarray) -> np.ndarray:
-        """
-        Rescale the action from [-1, 1] to [low, high]
-        (no need for symmetric action space)
-
-        :param scaled_action: Action to un-scale
-        """
-        low, high = self.action_space.low, self.action_space.high
-        return low + (0.5 * (scaled_action + 1.0) * (high - low))
-
     def _setup_lr_schedule(self) -> None:
         """Transform to callable if needed."""
        self.lr_schedule = get_schedule_fn(self.learning_rate)
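The two helpers moved here are a plain affine map between the Box bounds and [-1, 1]; a quick standalone check of the formulas (numpy only, with example bounds low=-2, high=2):

    import numpy as np

    low, high = np.array([-2.0]), np.array([2.0])

    def scale_action(action):
        # [low, high] -> [-1, 1]
        return 2.0 * ((action - low) / (high - low)) - 1.0

    def unscale_action(scaled_action):
        # [-1, 1] -> [low, high]
        return low + (0.5 * (scaled_action + 1.0) * (high - low))

    a = np.array([1.0])
    assert np.allclose(scale_action(a), 0.5)                 # 1.0 sits 3/4 of the way up [-2, 2]
    assert np.allclose(unscale_action(scale_action(a)), a)   # round-trip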
@@ -318,57 +297,6 @@ class BaseRLModel(ABC):
         """
         raise NotImplementedError()

-    @staticmethod
-    def _is_vectorized_observation(observation: np.ndarray, observation_space: gym.spaces.Space) -> bool:
-        """
-        For every observation type, detects and validates the shape,
-        then returns whether or not the observation is vectorized.
-
-        :param observation: (np.ndarray) the input observation to validate
-        :param observation_space: (gym.spaces) the observation space
-        :return: (bool) whether the given observation is vectorized or not
-        """
-        if isinstance(observation_space, gym.spaces.Box):
-            if observation.shape == observation_space.shape:
-                return False
-            elif observation.shape[1:] == observation_space.shape:
-                return True
-            else:
-                raise ValueError("Error: Unexpected observation shape {} for ".format(observation.shape) +
-                                 "Box environment, please use {} ".format(observation_space.shape) +
-                                 "or (n_env, {}) for the observation shape."
-                                 .format(", ".join(map(str, observation_space.shape))))
-        elif isinstance(observation_space, gym.spaces.Discrete):
-            if observation.shape == ():  # A numpy array of a number, has shape empty tuple '()'
-                return False
-            elif len(observation.shape) == 1:
-                return True
-            else:
-                raise ValueError("Error: Unexpected observation shape {} for ".format(observation.shape) +
-                                 "Discrete environment, please use (1,) or (n_env, 1) for the observation shape.")
-        # TODO: add support for MultiDiscrete and MultiBinary observation spaces
-        # elif isinstance(observation_space, gym.spaces.MultiDiscrete):
-        #     if observation.shape == (len(observation_space.nvec),):
-        #         return False
-        #     elif len(observation.shape) == 2 and observation.shape[1] == len(observation_space.nvec):
-        #         return True
-        #     else:
-        #         raise ValueError("Error: Unexpected observation shape {} for MultiDiscrete ".format(observation.shape) +
-        #                          "environment, please use ({},) or ".format(len(observation_space.nvec)) +
-        #                          "(n_env, {}) for the observation shape.".format(len(observation_space.nvec)))
-        # elif isinstance(observation_space, gym.spaces.MultiBinary):
-        #     if observation.shape == (observation_space.n,):
-        #         return False
-        #     elif len(observation.shape) == 2 and observation.shape[1] == observation_space.n:
-        #         return True
-        #     else:
-        #         raise ValueError("Error: Unexpected observation shape {} for MultiBinary ".format(observation.shape) +
-        #                          "environment, please use ({},) or ".format(observation_space.n) +
-        #                          "(n_env, {}) for the observation shape.".format(observation_space.n))
-        else:
-            raise ValueError("Error: Cannot determine if the observation is vectorized with the space type {}."
-                             .format(observation_space))
-
     def predict(self, observation: np.ndarray,
                 state: Optional[np.ndarray] = None,
                 mask: Optional[np.ndarray] = None,
@@ -383,36 +311,7 @@ class BaseRLModel(ABC):
         :return: (Tuple[np.ndarray, Optional[np.ndarray]]) the model's action and the next state
             (used in recurrent policies)
         """
-        # TODO: move this block to BasePolicy
-        # if state is None:
-        #     state = self.initial_state
-        # if mask is None:
-        #     mask = [False for _ in range(self.n_envs)]
-        observation = np.array(observation)
-        vectorized_env = self._is_vectorized_observation(observation, self.observation_space)
-
-        observation = observation.reshape((-1,) + self.observation_space.shape)
-        observation = th.as_tensor(observation).to(self.device)
-        with th.no_grad():
-            actions = self.policy.predict(observation, deterministic=deterministic)
-        # Convert to numpy
-        actions = actions.cpu().numpy()
-
-        # Rescale to proper domain when using squashing
-        if isinstance(self.action_space, gym.spaces.Box) and self.policy.squash_output:
-            actions = self.unscale_action(actions)
-
-        clipped_actions = actions
-        # Clip the actions to avoid out of bound error when using gaussian distribution
-        if isinstance(self.action_space, gym.spaces.Box) and not self.policy.squash_output:
-            clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high)
-
-        if not vectorized_env:
-            if state is not None:
-                raise ValueError("Error: The environment must be vectorized when using recurrent policies.")
-            clipped_actions = clipped_actions[0]
-
-        return clipped_actions, state
+        return self.policy.predict(observation, state, mask, deterministic)

     @classmethod
     def load(cls, load_path: str, env: Optional[GymEnv] = None, **kwargs):
@@ -484,10 +383,7 @@ class BaseRLModel(ABC):
             raise ValueError(f"Error: the file {load_path} could not be found")

         # set device to cpu if cuda is not available
-        if th.cuda.is_available():
-            device = th.device('cuda')
-        else:
-            device = th.device('cpu')
+        device = th.device('cuda') if th.cuda.is_available() else th.device('cpu')

         # Open the zip archive and load data
         try:
@@ -534,20 +430,6 @@ class BaseRLModel(ABC):
                     # load the parameters with the right `map_location`
                     params[os.path.splitext(file_path)[0]] = th.load(file_content, map_location=device)

-            # for backward compatibility
-            if params.get('params') is not None:
-                params_copy = {}
-                for name in params:
-                    if name == 'params':
-                        params_copy['policy'] = params[name]
-                    elif name == 'opt':
-                        params_copy['policy.optimizer'] = params[name]
-                    # Special case for SAC
-                    elif name == 'ent_coef_optimizer':
-                        params_copy[name] = params[name]
-                    else:
-                        params_copy[name + '.optimizer'] = params[name]
-                params = params_copy
         except zipfile.BadZipFile:
             # load_path wasn't a zip file
             raise ValueError(f"Error: the file {load_path} wasn't a zip-file")
@@ -925,7 +807,7 @@ class OffPolicyRLModel(BaseRLModel):
             unscaled_action, _ = self.predict(obs, deterministic=False)

             # Rescale the action from [low, high] to [-1, 1]
-            scaled_action = self.scale_action(unscaled_action)
+            scaled_action = self.policy.scale_action(unscaled_action)

             if self.use_sde:
                 # When using SDE, the action can be out of bounds
@@ -941,7 +823,7 @@ class OffPolicyRLModel(BaseRLModel):
                 clipped_action = np.clip(clipped_action + action_noise(), -1, 1)

             # Rescale and perform action
-            new_obs, reward, done, infos = env.step(self.unscale_action(clipped_action))
+            new_obs, reward, done, infos = env.step(self.policy.unscale_action(clipped_action))

             # Only stop training if return value is False, not when it is None.
             if callback.on_step() is False:
@@ -33,12 +33,52 @@ class Distribution(object):

     def sample(self) -> th.Tensor:
         """
-        returns a sample from the probability distribution
+        Returns a sample from the probability distribution

         :return: (th.Tensor) the stochastic action
         """
         raise NotImplementedError

+    def mode(self) -> th.Tensor:
+        """
+        Returns the most likely action (deterministic output)
+        from the probability distribution
+
+        :return: (th.Tensor) the stochastic action
+        """
+        raise NotImplementedError
+
+    def get_action(self, deterministic: bool = False) -> th.Tensor:
+        """
+        Return an action according to the probability distribution.
+
+        :param deterministic: (bool)
+        :return: (th.Tensor)
+        """
+        if deterministic:
+            return self.mode()
+        else:
+            return self.sample()
+
+    def action_from_params(self, *args, **kwargs) -> th.Tensor:
+        """
+        Returns a sample from the probability distribution
+        given its parameters.
+
+        :return: (th.Tensor) the action
+        """
+        raise NotImplementedError
+
+    def log_prob_from_params(self, *args, **kwargs) -> Tuple[th.Tensor, th.Tensor]:
+        """
+        Returns a sample and the associated log probability
+        from the probability distribution
+        given its parameters.
+
+        :return: (th.Tuple[th.Tensor, th.Tensor]) action and log prob
+        """
+        raise NotImplementedError
+

 def sum_independent_dims(tensor: th.Tensor) -> th.Tensor:
     """
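How the new template methods fit together can be shown with a tiny standalone categorical example (a sketch mirroring the interface above, not the library class):

    import torch as th
    from torch.distributions import Categorical

    class CatDist:
        def proba_distribution(self, action_logits):
            self.distribution = Categorical(logits=action_logits)
            return self

        def mode(self):
            return th.argmax(self.distribution.probs, dim=1)

        def sample(self):
            return self.distribution.sample()

        def get_action(self, deterministic=False):
            return self.mode() if deterministic else self.sample()

        def action_from_params(self, action_logits, deterministic=False):
            # update the distribution, then draw from it
            self.proba_distribution(action_logits)
            return self.get_action(deterministic=deterministic)

        def log_prob_from_params(self, action_logits):
            action = self.action_from_params(action_logits)
            return action, self.distribution.log_prob(action)

    action, log_prob = CatDist().log_prob_from_params(th.rand(5, 3))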
@@ -88,23 +128,17 @@ class DiagGaussianDistribution(Distribution):
         return mean_actions, log_std

     def proba_distribution(self, mean_actions: th.Tensor,
-                           log_std: th.Tensor,
-                           deterministic: bool = False) -> Tuple[th.Tensor, 'DiagGaussianDistribution']:
+                           log_std: th.Tensor) -> 'DiagGaussianDistribution':
         """
-        Create and sample for the distribution given its parameters (mean, std)
+        Create the distribution given its parameters (mean, std)

         :param mean_actions: (th.Tensor)
         :param log_std: (th.Tensor)
-        :param deterministic: (bool)
-        :return: (th.Tensor)
+        :return: (DiagGaussianDistribution)
         """
         action_std = th.ones_like(mean_actions) * log_std.exp()
         self.distribution = Normal(mean_actions, action_std)
-        if deterministic:
-            action = self.mode()
-        else:
-            action = self.sample()
-        return action, self
+        return self

     def mode(self) -> th.Tensor:
         return self.distribution.mean
@@ -115,7 +149,15 @@ class DiagGaussianDistribution(Distribution):
     def entropy(self) -> th.Tensor:
         return sum_independent_dims(self.distribution.entropy())

-    def log_prob_from_params(self, mean_actions: th.Tensor, log_std: th.Tensor) -> Tuple[th.Tensor, th.Tensor]:
+    def action_from_params(self, mean_actions: th.Tensor,
+                           log_std: th.Tensor,
+                           deterministic: bool = False) -> th.Tensor:
+        # Update the proba distribution
+        self.proba_distribution(mean_actions, log_std)
+        return self.get_action(deterministic=deterministic)
+
+    def log_prob_from_params(self, mean_actions: th.Tensor,
+                             log_std: th.Tensor) -> Tuple[th.Tensor, th.Tensor]:
         """
         Compute the log probability of taking an action
         given the distribution parameters.
@@ -124,7 +166,7 @@ class DiagGaussianDistribution(Distribution):
         :param log_std: (th.Tensor)
         :return: (Tuple[th.Tensor, th.Tensor])
         """
-        action, _ = self.proba_distribution(mean_actions, log_std)
+        action = self.action_from_params(mean_actions, log_std)
         log_prob = self.log_prob(action)
         return action, log_prob
|
@ -156,10 +198,10 @@ class SquashedDiagGaussianDistribution(DiagGaussianDistribution):
|
|||
self.epsilon = epsilon
|
||||
self.gaussian_action = None
|
||||
|
||||
def proba_distribution(self, mean_actions, log_std, deterministic=False):
|
||||
action, _ = super(SquashedDiagGaussianDistribution, self).proba_distribution(mean_actions, log_std,
|
||||
deterministic)
|
||||
return action, self
|
||||
def proba_distribution(self, mean_actions: th.Tensor,
|
||||
log_std: th.Tensor) -> 'SquashedDiagGaussianDistribution':
|
||||
super(SquashedDiagGaussianDistribution, self).proba_distribution(mean_actions, log_std)
|
||||
return self
|
||||
|
||||
def mode(self) -> th.Tensor:
|
||||
self.gaussian_action = self.distribution.mean
|
||||
|
|
@@ -175,12 +217,14 @@ class SquashedDiagGaussianDistribution(DiagGaussianDistribution):
         self.gaussian_action = self.distribution.rsample()
         return th.tanh(self.gaussian_action)

-    def log_prob_from_params(self, mean_actions, log_std) -> Tuple[th.Tensor, th.Tensor]:
-        action, _ = self.proba_distribution(mean_actions, log_std)
+    def log_prob_from_params(self, mean_actions: th.Tensor,
+                             log_std: th.Tensor) -> Tuple[th.Tensor, th.Tensor]:
+        action = self.action_from_params(mean_actions, log_std)
         log_prob = self.log_prob(action, self.gaussian_action)
         return action, log_prob

-    def log_prob(self, action: th.Tensor, gaussian_action: Optional[th.Tensor] = None) -> th.Tensor:
+    def log_prob(self, action: th.Tensor,
+                 gaussian_action: Optional[th.Tensor] = None) -> th.Tensor:
         # Inverse tanh
         # Naive implementation (not stable): 0.5 * torch.log((1 + x) / (1 - x))
         # We use numpy to avoid numerical instability
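For reference, the squashing correction this log_prob applies is the standard change of variables for action = tanh(gaussian_action): the Gaussian log-density minus sum log(1 - tanh(u)^2 + epsilon) over the action dimensions. A standalone numeric sketch (the epsilon value here is an assumption):

    import torch as th
    from torch.distributions import Normal

    eps = 1e-6
    gaussian = Normal(th.zeros(4, 2), th.ones(4, 2))
    u = gaussian.rsample()                  # unsquashed gaussian action
    a = th.tanh(u)                          # squashed action in (-1, 1)
    log_prob = gaussian.log_prob(u).sum(dim=1) - th.sum(th.log(1.0 - a ** 2 + eps), dim=1)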
@@ -220,14 +264,9 @@ class CategoricalDistribution(Distribution):
         action_logits = nn.Linear(latent_dim, self.action_dim)
         return action_logits

-    def proba_distribution(self, action_logits: th.Tensor,
-                           deterministic: bool = False) -> Tuple[th.Tensor, 'CategoricalDistribution']:
+    def proba_distribution(self, action_logits: th.Tensor) -> 'CategoricalDistribution':
         self.distribution = Categorical(logits=action_logits)
-        if deterministic:
-            action = self.mode()
-        else:
-            action = self.sample()
-        return action, self
+        return self

     def mode(self) -> th.Tensor:
         return th.argmax(self.distribution.probs, dim=1)
@@ -238,8 +277,14 @@ class CategoricalDistribution(Distribution):
     def entropy(self) -> th.Tensor:
         return self.distribution.entropy()

+    def action_from_params(self, action_logits: th.Tensor,
+                           deterministic: bool = False) -> th.Tensor:
+        # Update the proba distribution
+        self.proba_distribution(action_logits)
+        return self.get_action(deterministic=deterministic)
+
     def log_prob_from_params(self, action_logits: th.Tensor) -> Tuple[th.Tensor, th.Tensor]:
-        action, _ = self.proba_distribution(action_logits)
+        action = self.action_from_params(action_logits)
         log_prob = self.log_prob(action)
         return action, log_prob
@@ -283,6 +328,7 @@ class StateDependentNoiseDistribution(Distribution):
         self.weights_dist = None
         self.exploration_mat = None
         self.exploration_matrices = None
+        self._latent_sde = None
         self.use_expln = use_expln
         self.full_std = full_std
         self.epsilon = epsilon
@@ -358,27 +404,26 @@ class StateDependentNoiseDistribution(Distribution):

     def proba_distribution(self, mean_actions: th.Tensor,
                            log_std: th.Tensor,
-                           latent_sde: th.Tensor,
-                           deterministic: bool = False) -> Tuple[th.Tensor, 'StateDependentNoiseDistribution']:
+                           latent_sde: th.Tensor) -> 'StateDependentNoiseDistribution':
         """
         Create and sample for the distribution given its parameters (mean, std)

         :param mean_actions: (th.Tensor)
         :param log_std: (th.Tensor)
         :param latent_sde: (th.Tensor)
-        :param deterministic: (bool)
-        :return: (Tuple[th.Tensor, Distribution])
+        :return: (StateDependentNoiseDistribution)
         """
         # Stop gradient if we don't want to influence the features
-        latent_sde = latent_sde if self.learn_features else latent_sde.detach()
+        self._latent_sde = latent_sde if self.learn_features else latent_sde.detach()
         variance = th.mm(latent_sde ** 2, self.get_std(log_std) ** 2)
         self.distribution = Normal(mean_actions, th.sqrt(variance + self.epsilon))
+        return self
+
+    def get_action(self, deterministic: bool = False) -> th.Tensor:
         if deterministic:
-            action = self.mode()
+            return self.mode()
         else:
-            action = self.sample(latent_sde)
-        return action, self
+            return self.sample(self._latent_sde)

     def mode(self) -> th.Tensor:
         action = self.distribution.mean
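The gSDE variance line above computes, per action dimension, a sum over features of latent^2 * std^2; a standalone shape-check sketch (dimensions chosen arbitrarily):

    import torch as th
    from torch.distributions import Normal

    latent_sde = th.rand(8, 16)            # batch of exploration features
    std = th.ones(16, 2) * 0.5             # one std per (feature, action) pair
    variance = th.mm(latent_sde ** 2, std ** 2)
    dist = Normal(th.zeros(8, 2), th.sqrt(variance + 1e-6))
    assert dist.sample().shape == (8, 2)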
@@ -412,10 +457,18 @@ class StateDependentNoiseDistribution(Distribution):
             return None
         return sum_independent_dims(self.distribution.entropy())

+    def action_from_params(self, mean_actions: th.Tensor,
+                           log_std: th.Tensor,
+                           latent_sde: th.Tensor,
+                           deterministic: bool = False) -> th.Tensor:
+        # Update the proba distribution
+        self.proba_distribution(mean_actions, log_std, latent_sde)
+        return self.get_action(deterministic=deterministic)
+
     def log_prob_from_params(self, mean_actions: th.Tensor,
                              log_std: th.Tensor,
                              latent_sde: th.Tensor) -> Tuple[th.Tensor, th.Tensor]:
-        action, _ = self.proba_distribution(mean_actions, log_std, latent_sde)
+        action = self.action_from_params(mean_actions, log_std, latent_sde)
         log_prob = self.log_prob(action)
         return action, log_prob
@@ -63,7 +63,7 @@ class BasePolicy(nn.Module):
     def forward(self, *_args, **kwargs):
         raise NotImplementedError()

-    def predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor:
+    def _predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor:
         """
         Get the action according to the policy for a given observation.

@@ -73,21 +73,145 @@ class BasePolicy(nn.Module):
         """
         raise NotImplementedError()

+    def predict(self, observation: np.ndarray,
+                state: Optional[np.ndarray] = None,
+                mask: Optional[np.ndarray] = None,
+                deterministic: bool = False) -> Tuple[np.ndarray, Optional[np.ndarray]]:
+        """
+        Get the policy action and state from an observation (and optional state).
+
+        :param observation: (np.ndarray) the input observation
+        :param state: (Optional[np.ndarray]) The last states (can be None, used in recurrent policies)
+        :param mask: (Optional[np.ndarray]) The last masks (can be None, used in recurrent policies)
+        :param deterministic: (bool) Whether or not to return deterministic actions.
+        :return: (Tuple[np.ndarray, Optional[np.ndarray]]) the model's action and the next state
+            (used in recurrent policies)
+        """
+        # if state is None:
+        #     state = self.initial_state
+        # if mask is None:
+        #     mask = [False for _ in range(self.n_envs)]
+        observation = np.array(observation)
+        vectorized_env = self._is_vectorized_observation(observation, self.observation_space)
+
+        observation = observation.reshape((-1,) + self.observation_space.shape)
+        observation = th.as_tensor(observation).to(self.device)
+        with th.no_grad():
+            actions = self._predict(observation, deterministic=deterministic)
+        # Convert to numpy
+        actions = actions.cpu().numpy()
+
+        # Rescale to proper domain when using squashing
+        if isinstance(self.action_space, gym.spaces.Box) and self.squash_output:
+            actions = self.unscale_action(actions)
+
+        clipped_actions = actions
+        # Clip the actions to avoid out of bound error when using gaussian distribution
+        if isinstance(self.action_space, gym.spaces.Box) and not self.squash_output:
+            clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high)
+
+        if not vectorized_env:
+            if state is not None:
+                raise ValueError("Error: The environment must be vectorized when using recurrent policies.")
+            clipped_actions = clipped_actions[0]
+
+        return clipped_actions, state
+
+    def scale_action(self, action: np.ndarray) -> np.ndarray:
+        """
+        Rescale the action from [low, high] to [-1, 1]
+        (no need for symmetric action space)
+
+        :param action: (np.ndarray) Action to scale
+        :return: (np.ndarray) Scaled action
+        """
+        low, high = self.action_space.low, self.action_space.high
+        return 2.0 * ((action - low) / (high - low)) - 1.0
+
+    def unscale_action(self, scaled_action: np.ndarray) -> np.ndarray:
+        """
+        Rescale the action from [-1, 1] to [low, high]
+        (no need for symmetric action space)
+
+        :param scaled_action: Action to un-scale
+        """
+        low, high = self.action_space.low, self.action_space.high
+        return low + (0.5 * (scaled_action + 1.0) * (high - low))
+
+    @staticmethod
+    def _is_vectorized_observation(observation: np.ndarray, observation_space: gym.spaces.Space) -> bool:
+        """
+        For every observation type, detects and validates the shape,
+        then returns whether or not the observation is vectorized.
+
+        :param observation: (np.ndarray) the input observation to validate
+        :param observation_space: (gym.spaces) the observation space
+        :return: (bool) whether the given observation is vectorized or not
+        """
+        if isinstance(observation_space, gym.spaces.Box):
+            if observation.shape == observation_space.shape:
+                return False
+            elif observation.shape[1:] == observation_space.shape:
+                return True
+            else:
+                raise ValueError("Error: Unexpected observation shape {} for ".format(observation.shape) +
+                                 "Box environment, please use {} ".format(observation_space.shape) +
+                                 "or (n_env, {}) for the observation shape."
+                                 .format(", ".join(map(str, observation_space.shape))))
+        elif isinstance(observation_space, gym.spaces.Discrete):
+            if observation.shape == ():  # A numpy array of a number, has shape empty tuple '()'
+                return False
+            elif len(observation.shape) == 1:
+                return True
+            else:
+                raise ValueError("Error: Unexpected observation shape {} for ".format(observation.shape) +
+                                 "Discrete environment, please use (1,) or (n_env, 1) for the observation shape.")
+        # TODO: add support for MultiDiscrete and MultiBinary observation spaces
+        # elif isinstance(observation_space, gym.spaces.MultiDiscrete):
+        #     if observation.shape == (len(observation_space.nvec),):
+        #         return False
+        #     elif len(observation.shape) == 2 and observation.shape[1] == len(observation_space.nvec):
+        #         return True
+        #     else:
+        #         raise ValueError("Error: Unexpected observation shape {} for MultiDiscrete ".format(observation.shape) +
+        #                          "environment, please use ({},) or ".format(len(observation_space.nvec)) +
+        #                          "(n_env, {}) for the observation shape.".format(len(observation_space.nvec)))
+        # elif isinstance(observation_space, gym.spaces.MultiBinary):
+        #     if observation.shape == (observation_space.n,):
+        #         return False
+        #     elif len(observation.shape) == 2 and observation.shape[1] == observation_space.n:
+        #         return True
+        #     else:
+        #         raise ValueError("Error: Unexpected observation shape {} for MultiBinary ".format(observation.shape) +
+        #                          "environment, please use ({},) or ".format(observation_space.n) +
+        #                          "(n_env, {}) for the observation shape.".format(observation_space.n))
+        else:
+            raise ValueError("Error: Cannot determine if the observation is vectorized with the space type {}."
+                             .format(observation_space))
+
     def save(self, path: str) -> None:
         """
-        Save model to a given location.
+        Save policy weights to a given location.
+        NOTE: we don't save policy parameters

         :param path: (str)
         """
         previous_device = self.device
         # Convert to cpu before saving
         self = self.to('cpu')
         th.save(self.state_dict(), path)
         self = self.to(previous_device)

     def load(self, path: str) -> None:
         """
-        Load saved model from path.
+        Load policy weights from path.
+        NOTE: we don't load policy parameters

         :param path: (str)
         """
         self.load_state_dict(th.load(path))
         self = self.to(self.device)

     def load_from_vector(self, vector: np.ndarray):
         """
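The Box branch of _is_vectorized_observation distinguishes a single observation from a batch purely by shape; a standalone illustration of the two accepted shapes (assuming gym is available):

    import numpy as np
    import gym

    space = gym.spaces.Box(low=-1.0, high=1.0, shape=(3,))
    single = np.zeros(3)           # shape == space.shape      -> not vectorized
    batch = np.zeros((5, 3))       # shape[1:] == space.shape  -> vectorized
    assert single.shape == space.shape
    assert batch.shape[1:] == space.shape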
@@ -156,9 +156,9 @@ class PPOPolicy(BasePolicy):
         latent_pi, latent_vf, latent_sde = self._get_latent(obs)
         # Evaluate the values for the given observations
         value = self.value_net(latent_vf)
-        action, action_distribution = self._get_action_dist_from_latent(latent_pi, latent_sde=latent_sde,
-                                                                        deterministic=deterministic)
-        log_prob = action_distribution.log_prob(action)
+        distribution = self._get_action_dist_from_latent(latent_pi, latent_sde=latent_sde)
+        action = distribution.get_action(deterministic=deterministic)
+        log_prob = distribution.log_prob(action)
         return action, value, log_prob

     def _get_latent(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor, th.Tensor]:
@@ -180,33 +180,29 @@ class PPOPolicy(BasePolicy):
         return latent_pi, latent_vf, latent_sde

     def _get_action_dist_from_latent(self, latent_pi: th.Tensor,
-                                     latent_sde: Optional[th.Tensor] = None,
-                                     deterministic: bool = False) -> Tuple[th.Tensor, Distribution]:
+                                     latent_sde: Optional[th.Tensor] = None) -> Distribution:
         """
-        Retrieve action and associated action distribution
-        given the latent codes.
+        Retrieve action distribution given the latent codes.

         :param latent_pi: (th.Tensor) Latent code for the actor
         :param latent_sde: (Optional[th.Tensor]) Latent code for the SDE exploration function
-        :param deterministic: (bool) Whether to sample or use deterministic actions
-        :return: (Tuple[th.Tensor, Distribution]) Action and action distribution
+        :return: (Distribution) Action distribution
         """
         mean_actions = self.action_net(latent_pi)

         if isinstance(self.action_dist, DiagGaussianDistribution):
-            return self.action_dist.proba_distribution(mean_actions, self.log_std, deterministic=deterministic)
+            return self.action_dist.proba_distribution(mean_actions, self.log_std)

         elif isinstance(self.action_dist, CategoricalDistribution):
             # Here mean_actions are the logits before the softmax
-            return self.action_dist.proba_distribution(mean_actions, deterministic=deterministic)
+            return self.action_dist.proba_distribution(action_logits=mean_actions)

         elif isinstance(self.action_dist, StateDependentNoiseDistribution):
-            return self.action_dist.proba_distribution(mean_actions, self.log_std, latent_sde,
-                                                       deterministic=deterministic)
+            return self.action_dist.proba_distribution(mean_actions, self.log_std, latent_sde)
         else:
             raise ValueError('Invalid action distribution')

-    def predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor:
+    def _predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor:
         """
         Get the action according to the policy for a given observation.

@@ -215,27 +211,25 @@ class PPOPolicy(BasePolicy):
         :return: (th.Tensor) Taken action according to the policy
         """
         latent_pi, _, latent_sde = self._get_latent(observation)
-        action, _ = self._get_action_dist_from_latent(latent_pi, latent_sde, deterministic=deterministic)
-        return action
+        distribution = self._get_action_dist_from_latent(latent_pi, latent_sde)
+        return distribution.get_action(deterministic=deterministic)

     def evaluate_actions(self, obs: th.Tensor,
-                         actions: th.Tensor,
-                         deterministic: bool = False) -> Tuple[th.Tensor, th.Tensor, th.Tensor]:
+                         actions: th.Tensor) -> Tuple[th.Tensor, th.Tensor, th.Tensor]:
         """
         Evaluate actions according to the current policy,
         given the observations.

         :param obs: (th.Tensor)
         :param actions: (th.Tensor)
-        :param deterministic: (bool)
         :return: (th.Tensor, th.Tensor, th.Tensor) estimated value, log likelihood of taking those actions
             and entropy of the action distribution.
         """
         latent_pi, latent_vf, latent_sde = self._get_latent(obs)
-        _, action_distribution = self._get_action_dist_from_latent(latent_pi, latent_sde, deterministic=deterministic)
-        log_prob = action_distribution.log_prob(actions)
+        distribution = self._get_action_dist_from_latent(latent_pi, latent_sde)
+        log_prob = distribution.log_prob(actions)
         values = self.value_net(latent_vf)
-        return values, log_prob, action_distribution.entropy()
+        return values, log_prob, distribution.entropy()


 MlpPolicy = PPOPolicy
@@ -108,14 +108,11 @@ class Actor(BasePolicy):
             'reset_noise() is only available when using SDE'
         self.action_dist.sample_weights(self.log_std, batch_size=batch_size)

-    def _get_latent(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor]:
+    def get_action_dist_params(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor, th.Tensor]:
         features = self.extract_features(obs)
         latent_pi = self.latent_pi(features)
         latent_sde = self.sde_features_extractor(features) if self.sde_features_extractor is not None else latent_pi
-        return latent_pi, latent_sde
-
-    def get_action_dist_params(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor, th.Tensor]:
-        latent_pi, latent_sde = self._get_latent(obs)
         mean_actions = self.mu(latent_pi)

         if self.use_sde:
@@ -130,9 +127,8 @@ class Actor(BasePolicy):
         mean_actions, log_std, latent_sde = self.get_action_dist_params(obs)
         kwargs = dict(latent_sde=latent_sde) if self.use_sde else {}
         # Note: the action is squashed
-        action, _ = self.action_dist.proba_distribution(mean_actions, log_std,
-                                                        deterministic=deterministic, **kwargs)
-        return action
+        return self.action_dist.action_from_params(mean_actions, log_std,
+                                                   deterministic=deterministic, **kwargs)

     def action_log_prob(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor]:
         mean_actions, log_std, latent_sde = self.get_action_dist_params(obs)
@@ -268,7 +264,7 @@ class SACPolicy(BasePolicy):
     def forward(self, obs: th.Tensor) -> th.Tensor:
         return self.predict(obs, deterministic=False)

-    def predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor:
+    def _predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor:
         return self.actor(observation, deterministic)
@@ -104,11 +104,6 @@ class Actor(BasePolicy):
         """
         return self.action_dist.get_std(self.log_std)

-    def _get_action_dist_from_latent(self, latent_pi: th.Tensor,
-                                     latent_sde: th.Tensor) -> Tuple[th.Tensor, Distribution]:
-        mean_actions = self.mu(latent_pi)
-        return self.action_dist.proba_distribution(mean_actions, self.log_std, latent_sde)
-
     def _get_latent(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor]:
         features = self.extract_features(obs)
         latent_pi = self.latent_pi(features)
@@ -126,9 +121,9 @@ class Actor(BasePolicy):
             and entropy of the action distribution.
         """
         latent_pi, latent_sde = self._get_latent(obs)
-        _, distribution = self._get_action_dist_from_latent(latent_pi, latent_sde)
+        mean_actions = self.mu(latent_pi)
+        distribution = self.action_dist.proba_distribution(mean_actions, self.log_std, latent_sde)
         log_prob = distribution.log_prob(action)
         # value = self.value_net(latent_vf)
         return log_prob, distribution.entropy()

     def reset_noise(self) -> None:
@@ -150,8 +145,6 @@ class Actor(BasePolicy):
             # -> set squash_output=True in the action_dist?
             # NOTE: the clipping is done in the rollout for now
             return self.mu(latent_pi) + noise
-            # action, _ = self._get_action_dist_from_latent(latent_pi)
-            # return action
         else:
             features = self.extract_features(obs)
             return self.mu(features)
@@ -338,9 +331,9 @@ class TD3Policy(BasePolicy):
         return Critic(**self.net_args).to(self.device)

     def forward(self, observation: th.Tensor, deterministic: bool = False):
-        return self.predict(observation, deterministic=deterministic)
+        return self._predict(observation, deterministic=deterministic)

-    def predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor:
+    def _predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor:
         return self.actor(observation, deterministic=deterministic)
@@ -1 +1 @@
-0.4.0a2
+0.4.0a3