mirror of
https://github.com/saymrwulf/stable-baselines3.git
synced 2026-05-14 20:58:03 +00:00
* sets replay buffer device after loading * update changelog * update changelog * correct changelog * add test for replay buffer device * Fix test to actually test the bug fix * [ci skip] Update version * [ci skip] Update docker images --------- Co-authored-by: PatrickHelm <patrick.helm@gmx.net> Co-authored-by: Antonin RAFFIN <antonin.raffin@ensta.org>
749 lines
27 KiB
Python
749 lines
27 KiB
Python
import base64
|
|
import io
|
|
import json
|
|
import os
|
|
import pathlib
|
|
import warnings
|
|
import zipfile
|
|
from collections import OrderedDict
|
|
from copy import deepcopy
|
|
|
|
import gymnasium as gym
|
|
import numpy as np
|
|
import pytest
|
|
import torch as th
|
|
|
|
from stable_baselines3 import A2C, DDPG, DQN, PPO, SAC, TD3
|
|
from stable_baselines3.common.base_class import BaseAlgorithm
|
|
from stable_baselines3.common.env_util import make_vec_env
|
|
from stable_baselines3.common.envs import FakeImageEnv, IdentityEnv, IdentityEnvBox
|
|
from stable_baselines3.common.save_util import load_from_pkl, open_path, save_to_pkl
|
|
from stable_baselines3.common.utils import get_device
|
|
from stable_baselines3.common.vec_env import DummyVecEnv
|
|
|
|
MODEL_LIST = [PPO, A2C, TD3, SAC, DQN, DDPG]
|
|
|
|
|
|
def select_env(model_class: BaseAlgorithm) -> gym.Env:
|
|
"""
|
|
Selects an environment with the correct action space as DQN only supports discrete action space
|
|
"""
|
|
if model_class == DQN:
|
|
return IdentityEnv(10)
|
|
else:
|
|
return IdentityEnvBox(-10, 10)
|
|
|
|
|
|
@pytest.mark.parametrize("model_class", MODEL_LIST)
|
|
def test_save_load(tmp_path, model_class):
|
|
"""
|
|
Test if 'save' and 'load' saves and loads model correctly
|
|
and if 'get_parameters' and 'set_parameters' and work correctly.
|
|
|
|
''warning does not test function of optimizer parameter load
|
|
|
|
:param model_class: (BaseAlgorithm) A RL model
|
|
"""
|
|
|
|
env = DummyVecEnv([lambda: select_env(model_class)])
|
|
|
|
# create model
|
|
model = model_class("MlpPolicy", env, policy_kwargs=dict(net_arch=[16]), verbose=1)
|
|
model.learn(total_timesteps=500)
|
|
|
|
env.reset()
|
|
observations = np.concatenate([env.step([env.action_space.sample()])[0] for _ in range(10)], axis=0)
|
|
|
|
# Get parameters of different objects
|
|
# deepcopy to avoid referencing to tensors we are about to modify
|
|
original_params = deepcopy(model.get_parameters())
|
|
|
|
# Test different error cases of set_parameters.
|
|
# Test that invalid object names throw errors
|
|
invalid_object_params = deepcopy(original_params)
|
|
invalid_object_params["I_should_not_be_a_valid_object"] = "and_I_am_an_invalid_tensor"
|
|
with pytest.raises(ValueError):
|
|
model.set_parameters(invalid_object_params, exact_match=True)
|
|
with pytest.raises(ValueError):
|
|
model.set_parameters(invalid_object_params, exact_match=False)
|
|
|
|
# Test that exact_match catches when something was missed.
|
|
missing_object_params = {k: v for k, v in list(original_params.items())[:-1]}
|
|
with pytest.raises(ValueError):
|
|
model.set_parameters(missing_object_params, exact_match=True)
|
|
|
|
# Test that exact_match catches when something inside state-dict
|
|
# is missing but we have exact_match.
|
|
missing_state_dict_tensor_params = {}
|
|
for object_name in original_params:
|
|
object_params = {}
|
|
missing_state_dict_tensor_params[object_name] = object_params
|
|
# Skip last item in state-dict
|
|
for k, v in list(original_params[object_name].items())[:-1]:
|
|
object_params[k] = v
|
|
with pytest.raises(RuntimeError):
|
|
# PyTorch load_state_dict throws RuntimeError if strict but
|
|
# invalid state-dict.
|
|
model.set_parameters(missing_state_dict_tensor_params, exact_match=True)
|
|
|
|
# Test that parameters do indeed change.
|
|
random_params = {}
|
|
for object_name, params in original_params.items():
|
|
# Do not randomize optimizer parameters (custom layout)
|
|
if "optim" in object_name:
|
|
random_params[object_name] = params
|
|
else:
|
|
# Again, skip the last item in state-dict
|
|
random_params[object_name] = OrderedDict(
|
|
(param_name, th.rand_like(param)) for param_name, param in list(params.items())[:-1]
|
|
)
|
|
|
|
# Update model parameters with the new random values
|
|
model.set_parameters(random_params, exact_match=False)
|
|
|
|
new_params = model.get_parameters()
|
|
# Check that all params except the final item in each state-dict are different.
|
|
for object_name in original_params:
|
|
# Skip optimizers (no valid comparison with just th.allclose)
|
|
if "optim" in object_name:
|
|
continue
|
|
# state-dicts use ordered dictionaries, so key order
|
|
# is guaranteed.
|
|
last_key = list(original_params[object_name].keys())[-1]
|
|
for k in original_params[object_name]:
|
|
if k == last_key:
|
|
# Should be same as before
|
|
assert th.allclose(
|
|
original_params[object_name][k], new_params[object_name][k]
|
|
), "Parameter changed despite not included in the loaded parameters."
|
|
else:
|
|
# Should be different
|
|
assert not th.allclose(
|
|
original_params[object_name][k], new_params[object_name][k]
|
|
), "Parameters did not change as expected."
|
|
|
|
params = new_params
|
|
|
|
# get selected actions
|
|
selected_actions, _ = model.predict(observations, deterministic=True)
|
|
|
|
# Check
|
|
model.save(tmp_path / "test_save.zip")
|
|
del model
|
|
|
|
# Check if the model loads as expected for every possible choice of device:
|
|
for device in ["auto", "cpu", "cuda"]:
|
|
model = model_class.load(str(tmp_path / "test_save.zip"), env=env, device=device)
|
|
|
|
# check if the model was loaded to the correct device
|
|
assert model.device.type == get_device(device).type
|
|
assert model.policy.device.type == get_device(device).type
|
|
|
|
# check if params are still the same after load
|
|
new_params = model.get_parameters()
|
|
|
|
# Check that all params are the same as before save load procedure now
|
|
for object_name in new_params:
|
|
# Skip optimizers (no valid comparison with just th.allclose)
|
|
if "optim" in object_name:
|
|
continue
|
|
for key in params[object_name]:
|
|
assert new_params[object_name][key].device.type == get_device(device).type
|
|
assert th.allclose(
|
|
params[object_name][key].to("cpu"), new_params[object_name][key].to("cpu")
|
|
), "Model parameters not the same after save and load."
|
|
|
|
# check if model still selects the same actions
|
|
new_selected_actions, _ = model.predict(observations, deterministic=True)
|
|
assert np.allclose(selected_actions, new_selected_actions, 1e-4)
|
|
|
|
# check if learn still works
|
|
model.learn(total_timesteps=500)
|
|
|
|
del model
|
|
|
|
# clear file from os
|
|
os.remove(tmp_path / "test_save.zip")
|
|
|
|
|
|
@pytest.mark.parametrize("model_class", MODEL_LIST)
|
|
def test_set_env(tmp_path, model_class):
|
|
"""
|
|
Test if set_env function does work correct
|
|
|
|
:param model_class: (BaseAlgorithm) A RL model
|
|
"""
|
|
|
|
# use discrete for DQN
|
|
env = DummyVecEnv([lambda: select_env(model_class)])
|
|
env2 = DummyVecEnv([lambda: select_env(model_class)])
|
|
env3 = select_env(model_class)
|
|
env4 = DummyVecEnv([lambda: select_env(model_class) for _ in range(2)])
|
|
|
|
kwargs = {}
|
|
if model_class in {DQN, DDPG, SAC, TD3}:
|
|
kwargs = dict(learning_starts=50, train_freq=4)
|
|
elif model_class in {A2C, PPO}:
|
|
kwargs = dict(n_steps=64)
|
|
|
|
# create model
|
|
model = model_class("MlpPolicy", env, policy_kwargs=dict(net_arch=[16]), **kwargs)
|
|
# learn
|
|
model.learn(total_timesteps=64)
|
|
|
|
# change env
|
|
model.set_env(env2, force_reset=True)
|
|
# Check that last obs was discarded
|
|
assert model._last_obs is None
|
|
# learn again
|
|
model.learn(total_timesteps=64, reset_num_timesteps=True)
|
|
assert model.num_timesteps == 64
|
|
|
|
# change env test wrapping
|
|
model.set_env(env3)
|
|
# learn again
|
|
model.learn(total_timesteps=64)
|
|
|
|
# num_env must be the same
|
|
with pytest.raises(AssertionError):
|
|
model.set_env(env4)
|
|
|
|
# Keep the same env, disable reset
|
|
model.set_env(model.get_env(), force_reset=False)
|
|
assert model._last_obs is not None
|
|
# learn again
|
|
model.learn(total_timesteps=64, reset_num_timesteps=False)
|
|
assert model.num_timesteps == 2 * 64
|
|
|
|
current_env = model.get_env()
|
|
model.save(tmp_path / "test_save.zip")
|
|
del model
|
|
# Check that we can keep the number of timesteps after loading
|
|
# Here the env kept its state so we don't have to reset
|
|
model = model_class.load(tmp_path / "test_save.zip", env=current_env, force_reset=False)
|
|
assert model._last_obs is not None
|
|
model.learn(total_timesteps=64, reset_num_timesteps=False)
|
|
assert model.num_timesteps == 3 * 64
|
|
|
|
del model
|
|
# We are changing the env, the env must reset but we should keep the number of timesteps
|
|
model = model_class.load(tmp_path / "test_save.zip", env=env3, force_reset=True)
|
|
assert model._last_obs is None
|
|
model.learn(total_timesteps=64, reset_num_timesteps=False)
|
|
assert model.num_timesteps == 3 * 64
|
|
|
|
del model
|
|
# Load the model with a different number of environments
|
|
model = model_class.load(tmp_path / "test_save.zip", env=env4)
|
|
model.learn(total_timesteps=64)
|
|
|
|
# Clear saved file
|
|
os.remove(tmp_path / "test_save.zip")
|
|
|
|
|
|
@pytest.mark.parametrize("model_class", MODEL_LIST)
|
|
def test_exclude_include_saved_params(tmp_path, model_class):
|
|
"""
|
|
Test if exclude and include parameters of save() work
|
|
|
|
:param model_class: (BaseAlgorithm) A RL model
|
|
"""
|
|
env = DummyVecEnv([lambda: select_env(model_class)])
|
|
|
|
# create model, set verbose as 2, which is not standard
|
|
model = model_class("MlpPolicy", env, policy_kwargs=dict(net_arch=[16]), verbose=2)
|
|
|
|
# Check if exclude works
|
|
model.save(tmp_path / "test_save", exclude=["verbose"])
|
|
del model
|
|
model = model_class.load(str(tmp_path / "test_save.zip"))
|
|
# check if verbose was not saved
|
|
assert model.verbose != 2
|
|
|
|
# set verbose as something different then standard settings
|
|
model.verbose = 2
|
|
# Check if include works
|
|
model.save(tmp_path / "test_save", exclude=["verbose"], include=["verbose"])
|
|
del model
|
|
# Load with custom objects
|
|
custom_objects = dict(learning_rate=2e-5, dummy=1.0)
|
|
model = model_class.load(
|
|
str(tmp_path / "test_save.zip"),
|
|
custom_objects=custom_objects,
|
|
print_system_info=True,
|
|
)
|
|
assert model.verbose == 2
|
|
# Check that the custom object was taken into account
|
|
assert model.learning_rate == custom_objects["learning_rate"]
|
|
# Check that only parameters that are here already are replaced
|
|
assert not hasattr(model, "dummy")
|
|
|
|
# clear file from os
|
|
os.remove(tmp_path / "test_save.zip")
|
|
|
|
|
|
def test_save_load_pytorch_var(tmp_path):
|
|
model = SAC("MlpPolicy", "Pendulum-v1", seed=3, policy_kwargs=dict(net_arch=[64], n_critics=1))
|
|
model.learn(200)
|
|
save_path = str(tmp_path / "sac_pendulum")
|
|
model.save(save_path)
|
|
env = model.get_env()
|
|
log_ent_coef_before = model.log_ent_coef
|
|
|
|
del model
|
|
|
|
model = SAC.load(save_path, env=env)
|
|
assert th.allclose(log_ent_coef_before, model.log_ent_coef)
|
|
model.learn(200)
|
|
log_ent_coef_after = model.log_ent_coef
|
|
# Check that the entropy coefficient is still optimized
|
|
assert not th.allclose(log_ent_coef_before, log_ent_coef_after)
|
|
|
|
# With a fixed entropy coef
|
|
model = SAC("MlpPolicy", "Pendulum-v1", seed=3, ent_coef=0.01, policy_kwargs=dict(net_arch=[64], n_critics=1))
|
|
model.learn(200)
|
|
save_path = str(tmp_path / "sac_pendulum")
|
|
model.save(save_path)
|
|
env = model.get_env()
|
|
assert model.log_ent_coef is None
|
|
ent_coef_before = model.ent_coef_tensor
|
|
|
|
del model
|
|
|
|
model = SAC.load(save_path, env=env)
|
|
assert th.allclose(ent_coef_before, model.ent_coef_tensor)
|
|
model.learn(200)
|
|
ent_coef_after = model.ent_coef_tensor
|
|
assert model.log_ent_coef is None
|
|
# Check that the entropy coefficient is still the same
|
|
assert th.allclose(ent_coef_before, ent_coef_after)
|
|
|
|
|
|
@pytest.mark.parametrize("model_class", [A2C, TD3])
|
|
def test_save_load_env_cnn(tmp_path, model_class):
|
|
"""
|
|
Test loading with an env that requires a ``CnnPolicy``.
|
|
This is to test wrapping and observation space check.
|
|
We test one on-policy and one off-policy
|
|
algorithm as the rest share the loading part.
|
|
"""
|
|
env = FakeImageEnv(screen_height=40, screen_width=40, n_channels=2, discrete=False)
|
|
kwargs = dict(policy_kwargs=dict(net_arch=[32]))
|
|
if model_class == TD3:
|
|
kwargs.update(dict(buffer_size=100, learning_starts=50, train_freq=4))
|
|
|
|
model = model_class("CnnPolicy", env, **kwargs).learn(100)
|
|
model.save(tmp_path / "test_save")
|
|
# Test loading with env and continuing training
|
|
model = model_class.load(str(tmp_path / "test_save.zip"), env=env, **kwargs).learn(100)
|
|
# clear file from os
|
|
os.remove(tmp_path / "test_save.zip")
|
|
|
|
# Check we can load models saved with SB3 < 1.7.0
|
|
if model_class == A2C:
|
|
del model.policy.pi_features_extractor
|
|
model.save(tmp_path / "test_save")
|
|
with pytest.warns(UserWarning):
|
|
model_class.load(str(tmp_path / "test_save.zip"), env=env, **kwargs).learn(100)
|
|
os.remove(tmp_path / "test_save.zip")
|
|
|
|
|
|
@pytest.mark.parametrize("model_class", [SAC, TD3, DQN])
|
|
def test_save_load_replay_buffer(tmp_path, model_class):
|
|
path = pathlib.Path(tmp_path / "logs/replay_buffer.pkl")
|
|
path.parent.mkdir(exist_ok=True, parents=True) # to not raise a warning
|
|
model = model_class(
|
|
"MlpPolicy", select_env(model_class), buffer_size=1000, policy_kwargs=dict(net_arch=[64]), learning_starts=200
|
|
)
|
|
model.learn(300)
|
|
old_replay_buffer = deepcopy(model.replay_buffer)
|
|
model.save_replay_buffer(path)
|
|
model.replay_buffer = None
|
|
for device in ["cpu", "cuda"]:
|
|
# Manually force device to check that the replay buffer device
|
|
# is correctly updated
|
|
model.device = th.device(device)
|
|
model.load_replay_buffer(path)
|
|
assert model.replay_buffer.device.type == model.device.type
|
|
|
|
assert np.allclose(old_replay_buffer.observations, model.replay_buffer.observations)
|
|
assert np.allclose(old_replay_buffer.actions, model.replay_buffer.actions)
|
|
assert np.allclose(old_replay_buffer.rewards, model.replay_buffer.rewards)
|
|
assert np.allclose(old_replay_buffer.dones, model.replay_buffer.dones)
|
|
assert np.allclose(old_replay_buffer.timeouts, model.replay_buffer.timeouts)
|
|
infos = [[{"TimeLimit.truncated": truncated}] for truncated in old_replay_buffer.timeouts]
|
|
|
|
# test extending replay buffer
|
|
model.replay_buffer.extend(
|
|
old_replay_buffer.observations,
|
|
old_replay_buffer.observations,
|
|
old_replay_buffer.actions,
|
|
old_replay_buffer.rewards,
|
|
old_replay_buffer.dones,
|
|
infos,
|
|
)
|
|
|
|
|
|
@pytest.mark.parametrize("model_class", [DQN, SAC, TD3])
|
|
@pytest.mark.parametrize("optimize_memory_usage", [False, True])
|
|
def test_warn_buffer(recwarn, model_class, optimize_memory_usage):
|
|
"""
|
|
When using memory efficient replay buffer,
|
|
a warning must be emitted when calling `.learn()`
|
|
multiple times.
|
|
See https://github.com/DLR-RM/stable-baselines3/issues/46
|
|
"""
|
|
# remove gym warnings
|
|
warnings.filterwarnings(action="ignore", category=DeprecationWarning)
|
|
warnings.filterwarnings(action="ignore", category=UserWarning, module="gym")
|
|
|
|
model = model_class(
|
|
"MlpPolicy",
|
|
select_env(model_class),
|
|
buffer_size=100,
|
|
optimize_memory_usage=optimize_memory_usage,
|
|
# we cannot use optimize_memory_usage and handle_timeout_termination
|
|
# at the same time
|
|
replay_buffer_kwargs={"handle_timeout_termination": not optimize_memory_usage},
|
|
policy_kwargs=dict(net_arch=[64]),
|
|
learning_starts=10,
|
|
)
|
|
|
|
model.learn(150)
|
|
|
|
model.learn(150, reset_num_timesteps=False)
|
|
|
|
# Check that there is no warning
|
|
assert len(recwarn) == 0
|
|
|
|
model.learn(150)
|
|
|
|
if optimize_memory_usage:
|
|
assert len(recwarn) == 1
|
|
warning = recwarn.pop(UserWarning)
|
|
assert "The last trajectory in the replay buffer will be truncated" in str(warning.message)
|
|
else:
|
|
assert len(recwarn) == 0
|
|
|
|
|
|
@pytest.mark.parametrize("model_class", MODEL_LIST)
|
|
@pytest.mark.parametrize("policy_str", ["MlpPolicy", "CnnPolicy"])
|
|
@pytest.mark.parametrize("use_sde", [False, True])
|
|
def test_save_load_policy(tmp_path, model_class, policy_str, use_sde):
|
|
"""
|
|
Test saving and loading policy only.
|
|
|
|
:param model_class: (BaseAlgorithm) A RL model
|
|
:param policy_str: (str) Name of the policy.
|
|
"""
|
|
kwargs = dict(policy_kwargs=dict(net_arch=[16]))
|
|
|
|
# gSDE is only applicable for A2C, PPO and SAC
|
|
if use_sde and model_class not in [A2C, PPO, SAC]:
|
|
pytest.skip()
|
|
|
|
if policy_str == "MlpPolicy":
|
|
env = select_env(model_class)
|
|
else:
|
|
if model_class in [SAC, TD3, DQN, DDPG]:
|
|
# Avoid memory error when using replay buffer
|
|
# Reduce the size of the features
|
|
kwargs = dict(
|
|
buffer_size=250, learning_starts=100, policy_kwargs=dict(features_extractor_kwargs=dict(features_dim=32))
|
|
)
|
|
env = FakeImageEnv(screen_height=40, screen_width=40, n_channels=2, discrete=model_class == DQN)
|
|
|
|
if use_sde:
|
|
kwargs["use_sde"] = True
|
|
|
|
env = DummyVecEnv([lambda: env])
|
|
|
|
# create model
|
|
model = model_class(policy_str, env, verbose=1, **kwargs)
|
|
model.learn(total_timesteps=300)
|
|
|
|
env.reset()
|
|
observations = np.concatenate([env.step([env.action_space.sample()])[0] for _ in range(10)], axis=0)
|
|
|
|
policy = model.policy
|
|
policy_class = policy.__class__
|
|
actor, actor_class = None, None
|
|
if model_class in [SAC, TD3]:
|
|
actor = policy.actor
|
|
actor_class = actor.__class__
|
|
|
|
# Get dictionary of current parameters
|
|
params = deepcopy(policy.state_dict())
|
|
|
|
# Modify all parameters to be random values
|
|
random_params = {param_name: th.rand_like(param) for param_name, param in params.items()}
|
|
|
|
# Update model parameters with the new random values
|
|
policy.load_state_dict(random_params)
|
|
|
|
new_params = policy.state_dict()
|
|
# Check that all params are different now
|
|
for k in params:
|
|
assert not th.allclose(params[k], new_params[k]), "Parameters did not change as expected."
|
|
|
|
params = new_params
|
|
|
|
# get selected actions
|
|
selected_actions, _ = policy.predict(observations, deterministic=True)
|
|
# Should also work with the actor only
|
|
if actor is not None:
|
|
selected_actions_actor, _ = actor.predict(observations, deterministic=True)
|
|
|
|
# Save and load policy
|
|
policy.save(tmp_path / "policy.pkl")
|
|
# Save and load actor
|
|
if actor is not None:
|
|
actor.save(tmp_path / "actor.pkl")
|
|
|
|
del policy, actor
|
|
|
|
policy = policy_class.load(tmp_path / "policy.pkl")
|
|
if actor_class is not None:
|
|
actor = actor_class.load(tmp_path / "actor.pkl")
|
|
|
|
# check if params are still the same after load
|
|
new_params = policy.state_dict()
|
|
|
|
# Check that all params are the same as before save load procedure now
|
|
for key in params:
|
|
assert th.allclose(params[key], new_params[key]), "Policy parameters not the same after save and load."
|
|
|
|
# check if model still selects the same actions
|
|
new_selected_actions, _ = policy.predict(observations, deterministic=True)
|
|
assert np.allclose(selected_actions, new_selected_actions, 1e-4)
|
|
|
|
if actor_class is not None:
|
|
new_selected_actions_actor, _ = actor.predict(observations, deterministic=True)
|
|
assert np.allclose(selected_actions_actor, new_selected_actions_actor, 1e-4)
|
|
assert np.allclose(selected_actions_actor, new_selected_actions, 1e-4)
|
|
|
|
# clear file from os
|
|
os.remove(tmp_path / "policy.pkl")
|
|
if actor_class is not None:
|
|
os.remove(tmp_path / "actor.pkl")
|
|
|
|
|
|
@pytest.mark.parametrize("model_class", [DQN])
|
|
@pytest.mark.parametrize("policy_str", ["MlpPolicy", "CnnPolicy"])
|
|
def test_save_load_q_net(tmp_path, model_class, policy_str):
|
|
"""
|
|
Test saving and loading q-network/quantile net only.
|
|
|
|
:param model_class: (BaseAlgorithm) A RL model
|
|
:param policy_str: (str) Name of the policy.
|
|
"""
|
|
kwargs = dict(policy_kwargs=dict(net_arch=[16]))
|
|
if policy_str == "MlpPolicy":
|
|
env = select_env(model_class)
|
|
else:
|
|
if model_class in [DQN]:
|
|
# Avoid memory error when using replay buffer
|
|
# Reduce the size of the features
|
|
kwargs = dict(
|
|
buffer_size=250,
|
|
learning_starts=100,
|
|
policy_kwargs=dict(features_extractor_kwargs=dict(features_dim=32)),
|
|
)
|
|
env = FakeImageEnv(screen_height=40, screen_width=40, n_channels=2, discrete=model_class == DQN)
|
|
|
|
env = DummyVecEnv([lambda: env])
|
|
|
|
# create model
|
|
model = model_class(policy_str, env, verbose=1, **kwargs)
|
|
model.learn(total_timesteps=300)
|
|
|
|
env.reset()
|
|
observations = np.concatenate([env.step([env.action_space.sample()])[0] for _ in range(10)], axis=0)
|
|
|
|
q_net = model.q_net
|
|
q_net_class = q_net.__class__
|
|
|
|
# Get dictionary of current parameters
|
|
params = deepcopy(q_net.state_dict())
|
|
|
|
# Modify all parameters to be random values
|
|
random_params = {param_name: th.rand_like(param) for param_name, param in params.items()}
|
|
|
|
# Update model parameters with the new random values
|
|
q_net.load_state_dict(random_params)
|
|
|
|
new_params = q_net.state_dict()
|
|
# Check that all params are different now
|
|
for k in params:
|
|
assert not th.allclose(params[k], new_params[k]), "Parameters did not change as expected."
|
|
|
|
params = new_params
|
|
|
|
# get selected actions
|
|
selected_actions, _ = q_net.predict(observations, deterministic=True)
|
|
|
|
# Save and load q_net
|
|
q_net.save(tmp_path / "q_net.pkl")
|
|
|
|
del q_net
|
|
|
|
q_net = q_net_class.load(tmp_path / "q_net.pkl")
|
|
|
|
# check if params are still the same after load
|
|
new_params = q_net.state_dict()
|
|
|
|
# Check that all params are the same as before save load procedure now
|
|
for key in params:
|
|
assert th.allclose(params[key], new_params[key]), "Policy parameters not the same after save and load."
|
|
|
|
# check if model still selects the same actions
|
|
new_selected_actions, _ = q_net.predict(observations, deterministic=True)
|
|
assert np.allclose(selected_actions, new_selected_actions, 1e-4)
|
|
|
|
# clear file from os
|
|
os.remove(tmp_path / "q_net.pkl")
|
|
|
|
|
|
@pytest.mark.parametrize("pathtype", [str, pathlib.Path])
|
|
def test_open_file_str_pathlib(tmp_path, pathtype):
|
|
# check that suffix isn't added because we used open_path first
|
|
with open_path(pathtype(f"{tmp_path}/t1"), "w") as fp1:
|
|
save_to_pkl(fp1, "foo")
|
|
assert fp1.closed
|
|
with warnings.catch_warnings(record=True) as record:
|
|
assert load_from_pkl(pathtype(f"{tmp_path}/t1")) == "foo"
|
|
assert not record
|
|
|
|
# test custom suffix
|
|
with open_path(pathtype(f"{tmp_path}/t1.custom_ext"), "w") as fp1:
|
|
save_to_pkl(fp1, "foo")
|
|
assert fp1.closed
|
|
with warnings.catch_warnings(record=True) as record:
|
|
assert load_from_pkl(pathtype(f"{tmp_path}/t1.custom_ext")) == "foo"
|
|
assert not record
|
|
|
|
# test without suffix
|
|
with open_path(pathtype(f"{tmp_path}/t1"), "w", suffix="pkl") as fp1:
|
|
save_to_pkl(fp1, "foo")
|
|
assert fp1.closed
|
|
with warnings.catch_warnings(record=True) as record:
|
|
assert load_from_pkl(pathtype(f"{tmp_path}/t1.pkl")) == "foo"
|
|
assert not record
|
|
|
|
# test that a warning is raised when the path doesn't exist
|
|
with open_path(pathtype(f"{tmp_path}/t2.pkl"), "w") as fp1:
|
|
save_to_pkl(fp1, "foo")
|
|
assert fp1.closed
|
|
with warnings.catch_warnings(record=True) as record:
|
|
assert load_from_pkl(open_path(pathtype(f"{tmp_path}/t2"), "r", suffix="pkl")) == "foo"
|
|
assert len(record) == 0
|
|
|
|
with warnings.catch_warnings(record=True) as record:
|
|
assert load_from_pkl(open_path(pathtype(f"{tmp_path}/t2"), "r", suffix="pkl", verbose=2)) == "foo"
|
|
assert len(record) == 1
|
|
|
|
fp = pathlib.Path(f"{tmp_path}/t2").open("w")
|
|
fp.write("rubbish")
|
|
fp.close()
|
|
# test that a warning is only raised when verbose = 0
|
|
with warnings.catch_warnings(record=True) as record:
|
|
open_path(pathtype(f"{tmp_path}/t2"), "w", suffix="pkl", verbose=0).close()
|
|
open_path(pathtype(f"{tmp_path}/t2"), "w", suffix="pkl", verbose=1).close()
|
|
open_path(pathtype(f"{tmp_path}/t2"), "w", suffix="pkl", verbose=2).close()
|
|
assert len(record) == 1
|
|
|
|
|
|
def test_open_file(tmp_path):
|
|
# path must much the type
|
|
with pytest.raises(TypeError):
|
|
open_path(123, None, None, None)
|
|
|
|
p1 = tmp_path / "test1"
|
|
fp = p1.open("wb")
|
|
|
|
# provided path must match the mode
|
|
with pytest.raises(ValueError):
|
|
open_path(fp, "r")
|
|
with pytest.raises(ValueError):
|
|
open_path(fp, "randomstuff")
|
|
|
|
# test identity
|
|
_ = open_path(fp, "w")
|
|
assert _ is not None
|
|
assert fp is _
|
|
|
|
# Can't use a closed path
|
|
with pytest.raises(ValueError):
|
|
fp.close()
|
|
open_path(fp, "w")
|
|
|
|
buff = io.BytesIO()
|
|
assert buff.writable()
|
|
assert buff.readable() is ("w" == "w")
|
|
opened_buffer = open_path(buff, "w")
|
|
assert opened_buffer is buff
|
|
with pytest.raises(ValueError):
|
|
buff.close()
|
|
open_path(buff, "w")
|
|
|
|
|
|
@pytest.mark.expensive
|
|
def test_save_load_large_model(tmp_path):
|
|
"""
|
|
Test saving and loading a model with a large policy that is greater than 2GB. We
|
|
test only one algorithm since all algorithms share the same code for loading and
|
|
saving the model.
|
|
"""
|
|
env = select_env(TD3)
|
|
kwargs = dict(policy_kwargs=dict(net_arch=[8192, 8192, 8192]), device="cpu")
|
|
model = TD3("MlpPolicy", env, **kwargs)
|
|
|
|
# test saving
|
|
model.save(tmp_path / "test_save")
|
|
|
|
# test loading
|
|
model = TD3.load(str(tmp_path / "test_save.zip"), env=env, **kwargs)
|
|
|
|
# clear file from os
|
|
os.remove(tmp_path / "test_save.zip")
|
|
|
|
|
|
def test_load_invalid_object(tmp_path):
|
|
# See GH Issue #1122 for an example
|
|
# of invalid object loading
|
|
path = str(tmp_path / "ppo_pendulum.zip")
|
|
PPO("MlpPolicy", "Pendulum-v1", learning_rate=lambda _: 1.0).save(path)
|
|
|
|
with zipfile.ZipFile(path, mode="r") as archive:
|
|
json_data = json.loads(archive.read("data").decode())
|
|
|
|
# Intentionally corrupt the data
|
|
serialization = json_data["learning_rate"][":serialized:"]
|
|
base64_object = base64.b64decode(serialization.encode())
|
|
new_bytes = base64_object.replace(b"CodeType", b"CodeTyps")
|
|
base64_encoded = base64.b64encode(new_bytes).decode()
|
|
json_data["learning_rate"][":serialized:"] = base64_encoded
|
|
serialized_data = json.dumps(json_data, indent=4)
|
|
|
|
with open(tmp_path / "data", "w") as f:
|
|
f.write(serialized_data)
|
|
# Replace with the corrupted file
|
|
# probably doesn't work on windows
|
|
os.system(f"cd {tmp_path}; zip ppo_pendulum.zip data")
|
|
with pytest.warns(UserWarning, match=r"custom_objects"):
|
|
PPO.load(path)
|
|
# Load with custom object, no warnings
|
|
with warnings.catch_warnings(record=True) as record:
|
|
PPO.load(path, custom_objects=dict(learning_rate=lambda _: 1.0))
|
|
assert len(record) == 0
|
|
|
|
|
|
def test_dqn_target_update_interval(tmp_path):
|
|
# `target_update_interval` should not change when reloading the model. See GH Issue #1373.
|
|
env = make_vec_env(env_id="CartPole-v1", n_envs=2)
|
|
model = DQN("MlpPolicy", env, verbose=1, target_update_interval=100)
|
|
model.save(tmp_path / "dqn_cartpole")
|
|
del model
|
|
model = DQN.load(tmp_path / "dqn_cartpole")
|
|
os.remove(tmp_path / "dqn_cartpole.zip")
|
|
assert model.target_update_interval == 100
|