diff --git a/docs/guide/callbacks.rst b/docs/guide/callbacks.rst
index 07b78d7..098f7c4 100644
--- a/docs/guide/callbacks.rst
+++ b/docs/guide/callbacks.rst
@@ -216,13 +216,13 @@ It will save the best model if ``best_model_save_path`` folder is specified and
     from stable_baselines3.common.callbacks import EvalCallback
     # Separate evaluation env
-    eval_env = gym.make('Pendulum-v1')
+    eval_env = gym.make("Pendulum-v1")
     # Use deterministic actions for evaluation
-    eval_callback = EvalCallback(eval_env, best_model_save_path='./logs/',
-                                 log_path='./logs/', eval_freq=500,
+    eval_callback = EvalCallback(eval_env, best_model_save_path="./logs/",
+                                 log_path="./logs/", eval_freq=500,
                                  deterministic=True, render=False)
-    model = SAC('MlpPolicy', 'Pendulum-v1')
+    model = SAC("MlpPolicy", "Pendulum-v1")
     model.learn(5000, callback=eval_callback)
@@ -242,15 +242,15 @@ Alternatively, you can pass directly a list of callbacks to the ``learn()`` meth
     from stable_baselines3 import SAC
     from stable_baselines3.common.callbacks import CallbackList, CheckpointCallback, EvalCallback
-    checkpoint_callback = CheckpointCallback(save_freq=1000, save_path='./logs/')
+    checkpoint_callback = CheckpointCallback(save_freq=1000, save_path="./logs/")
     # Separate evaluation env
-    eval_env = gym.make('Pendulum-v1')
-    eval_callback = EvalCallback(eval_env, best_model_save_path='./logs/best_model',
-                                 log_path='./logs/results', eval_freq=500)
+    eval_env = gym.make("Pendulum-v1")
+    eval_callback = EvalCallback(eval_env, best_model_save_path="./logs/best_model",
+                                 log_path="./logs/results", eval_freq=500)
     # Create the callback list
     callback = CallbackList([checkpoint_callback, eval_callback])
-    model = SAC('MlpPolicy', 'Pendulum-v1')
+    model = SAC("MlpPolicy", "Pendulum-v1")
     # Equivalent to:
     # model.learn(5000, callback=[checkpoint_callback, eval_callback])
     model.learn(5000, callback=callback)
@@ -273,12 +273,12 @@ It must be used with the :ref:`EvalCallback` and use the event triggered by a ne
     from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold
     # Separate evaluation env
-    eval_env = gym.make('Pendulum-v1')
+    eval_env = gym.make("Pendulum-v1")
     # Stop training when the model reaches the reward threshold
     callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-200, verbose=1)
     eval_callback = EvalCallback(eval_env, callback_on_new_best=callback_on_best, verbose=1)
-    model = SAC('MlpPolicy', 'Pendulum-v1', verbose=1)
+    model = SAC("MlpPolicy", "Pendulum-v1", verbose=1)
     # Almost infinite number of timesteps, but the training will stop
     # early as soon as the reward threshold is reached
     model.learn(int(1e10), callback=eval_callback)
@@ -306,10 +306,10 @@ An :ref:`EventCallback` that will trigger its child callback every ``n_steps`` t
     # this is equivalent to defining CheckpointCallback(save_freq=500)
     # checkpoint_callback will be triggered every 500 steps
-    checkpoint_on_event = CheckpointCallback(save_freq=1, save_path='./logs/')
+    checkpoint_on_event = CheckpointCallback(save_freq=1, save_path="./logs/")
     event_callback = EveryNTimesteps(n_steps=500, callback=checkpoint_on_event)
-    model = PPO('MlpPolicy', 'Pendulum-v1', verbose=1)
+    model = PPO("MlpPolicy", "Pendulum-v1", verbose=1)
     model.learn(int(2e4), callback=event_callback)
@@ -338,7 +338,7 @@ and in total for ``max_episodes * n_envs`` episodes.
     # Stops training when the model reaches the maximum number of episodes
     callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=5, verbose=1)
-    model = A2C('MlpPolicy', 'Pendulum-v1', verbose=1)
+    model = A2C("MlpPolicy", "Pendulum-v1", verbose=1)
     # Almost infinite number of timesteps, but the training will stop
     # early as soon as the max number of episodes is reached
     model.learn(int(1e10), callback=callback_max_episodes)
diff --git a/docs/guide/checking_nan.rst b/docs/guide/checking_nan.rst
index 92f8093..29f9e31 100644
--- a/docs/guide/checking_nan.rst
+++ b/docs/guide/checking_nan.rst
@@ -51,7 +51,7 @@ which defines for the python process, how it should handle floating point error.
     import numpy as np
-    np.seterr(all='raise') # define before your code.
+    np.seterr(all="raise") # define before your code.
     print("numpy test:")
@@ -66,7 +66,7 @@ but this will also avoid overflow issues on floating point numbers:
     import numpy as np
-    np.seterr(all='raise') # define before your code.
+    np.seterr(all="raise") # define before your code.
     print("numpy overflow test:")
@@ -81,11 +81,11 @@ but will not avoid the propagation issues:
     import numpy as np
-    np.seterr(all='raise') # define before your code.
+    np.seterr(all="raise") # define before your code.
     print("numpy propagation test:")
-    a = np.float64('NaN')
+    a = np.float64("NaN")
     b = np.float64(1.0)
     val = a + b # this will neither warn nor raise anything
     print(val)
@@ -109,7 +109,7 @@ It will monitor the actions, observations, and rewards, indicating what action o
     class NanAndInfEnv(gym.Env):
         """Custom Environment that raised NaNs and Infs"""
-        metadata = {'render.modes': ['human']}
+        metadata = {"render.modes": ["human"]}
         def __init__(self):
             super(NanAndInfEnv, self).__init__()
@@ -119,9 +119,9 @@ It will monitor the actions, observations, and rewards, indicating what action o
         def step(self, _action):
            randf = np.random.rand()
            if randf > 0.99:
-                obs = float('NaN')
+                obs = float("NaN")
            elif randf > 0.98:
-                obs = float('inf')
+                obs = float("inf")
            else:
                obs = randf
            return [obs], 0.0, False, {}
@@ -129,7 +129,7 @@ It will monitor the actions, observations, and rewards, indicating what action o
         def reset(self):
             return [0.0]
-        def render(self, mode='human', close=False):
+        def render(self, mode="human", close=False):
             pass
     # Create environment
@@ -137,7 +137,7 @@ It will monitor the actions, observations, and rewards, indicating what action o
     env = VecCheckNan(env, raise_exception=True)
     # Instantiate the agent
-    model = PPO('MlpPolicy', env)
+    model = PPO("MlpPolicy", env)
     # Train the agent
     model.learn(total_timesteps=int(2e5)) # this will crash explaining that the invalid value originated from the environment.
diff --git a/docs/guide/custom_env.rst b/docs/guide/custom_env.rst
index 2e2d1f7..9fbb527 100644
--- a/docs/guide/custom_env.rst
+++ b/docs/guide/custom_env.rst
@@ -27,7 +27,7 @@ That is to say, your environment must implement the following methods (and inher
     class CustomEnv(gym.Env):
         """Custom Environment that follows gym interface"""
-        metadata = {'render.modes': ['human']}
+        metadata = {"render.modes": ["human"]}
         def __init__(self, arg1, arg2, ...):
             super(CustomEnv, self).__init__()
@@ -45,7 +45,7 @@ That is to say, your environment must implement the following methods (and inher
         def reset(self):
             ...
             return observation # reward, done, info can't be included
-        def render(self, mode='human'):
+        def render(self, mode="human"):
             ...
         def close (self):
             ...
@@ -58,7 +58,7 @@ Then you can define and train a RL agent with:
     # Instantiate the env
     env = CustomEnv(arg1, ...)
     # Define and Train the agent
-    model = A2C('CnnPolicy', env).learn(total_timesteps=1000)
+    model = A2C("CnnPolicy", env).learn(total_timesteps=1000)
 To check that your environment follows the Gym interface that SB3 supports, please use:
diff --git a/docs/guide/examples.rst b/docs/guide/examples.rst
index 247c86c..bcc206a 100644
--- a/docs/guide/examples.rst
+++ b/docs/guide/examples.rst
@@ -71,10 +71,10 @@ In the following example, we will train, save and load a DQN model on the Lunar
     # Create environment
-    env = gym.make('LunarLander-v2')
+    env = gym.make("LunarLander-v2")
     # Instantiate the agent
-    model = DQN('MlpPolicy', env, verbose=1)
+    model = DQN("MlpPolicy", env, verbose=1)
     # Train the agent
     model.learn(total_timesteps=int(2e5))
     # Save the agent
@@ -138,7 +138,7 @@ Multiprocessing: Unleashing the Power of Vectorized Environments
         set_random_seed(seed)
         return _init
-    if __name__ == '__main__':
+    if __name__ == "__main__":
         env_id = "CartPole-v1"
         num_cpu = 4 # Number of processes to use
         # Create the vectorized environment
@@ -149,7 +149,7 @@ Multiprocessing: Unleashing the Power of Vectorized Environments
         # You can choose between `DummyVecEnv` (usually faster) and `SubprocVecEnv`
         # env = make_vec_env(env_id, n_envs=num_cpu, seed=0, vec_env_cls=SubprocVecEnv)
-        model = PPO('MlpPolicy', env, verbose=1)
+        model = PPO("MlpPolicy", env, verbose=1)
         model.learn(total_timesteps=25_000)
         obs = env.reset()
@@ -182,7 +182,7 @@ Multiprocessing with off-policy algorithms
     # We collect 4 transitions per call to `ènv.step()`
     # and performs 2 gradient steps per call to `ènv.step()`
     # if gradient_steps=-1, then we would do 4 gradients steps per call to `ènv.step()`
-    model = SAC('MlpPolicy', env, train_freq=1, gradient_steps=2, verbose=1)
+    model = SAC("MlpPolicy", env, train_freq=1, gradient_steps=2, verbose=1)
     model.learn(total_timesteps=10_000)
@@ -254,7 +254,7 @@ If your callback returns False, training is aborted early.
             super(SaveOnBestTrainingRewardCallback, self).__init__(verbose)
             self.check_freq = check_freq
             self.log_dir = log_dir
-            self.save_path = os.path.join(log_dir, 'best_model')
+            self.save_path = os.path.join(log_dir, "best_model")
             self.best_mean_reward = -np.inf
         def _init_callback(self) -> None:
@@ -266,7 +266,7 @@ If your callback returns False, training is aborted early.
             if self.n_calls % self.check_freq == 0:
                 # Retrieve training reward
-                x, y = ts2xy(load_results(self.log_dir), 'timesteps')
+                x, y = ts2xy(load_results(self.log_dir), "timesteps")
                 if len(x) > 0:
                     # Mean training reward over the last 100 episodes
                     mean_reward = np.mean(y[-100:])
@@ -289,14 +289,14 @@ If your callback returns False, training is aborted early.
     os.makedirs(log_dir, exist_ok=True)
     # Create and wrap the environment
-    env = gym.make('LunarLanderContinuous-v2')
+    env = gym.make("LunarLanderContinuous-v2")
     env = Monitor(env, log_dir)
     # Add some action noise for exploration
     n_actions = env.action_space.shape[-1]
     action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
     # Because we use parameter noise, we should use a MlpPolicy with layer normalization
-    model = TD3('MlpPolicy', env, action_noise=action_noise, verbose=0)
+    model = TD3("MlpPolicy", env, action_noise=action_noise, verbose=0)
     # Create the callback: check every 1000 steps
     callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)
     # Train the agent
@@ -336,11 +336,11 @@ and multiprocessing for you. To install the Atari environments, run the command
     # There already exists an environment generator
     # that will make and wrap atari environments correctly.
     # Here we are also multi-worker training (n_envs=4 => 4 environments)
-    env = make_atari_env('PongNoFrameskip-v4', n_envs=4, seed=0)
+    env = make_atari_env("PongNoFrameskip-v4", n_envs=4, seed=0)
     # Frame-stacking with 4 frames
     env = VecFrameStack(env, n_stack=4)
-    model = A2C('CnnPolicy', env, verbose=1)
+    model = A2C("CnnPolicy", env, verbose=1)
     model.learn(total_timesteps=25_000)
     obs = env.reset()
@@ -382,7 +382,7 @@ will compute a running average and standard deviation of input features (it can
     env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
-    model = PPO('MlpPolicy', env)
+    model = PPO("MlpPolicy", env)
     model.learn(total_timesteps=2000)
     # Don't forget to save the VecNormalize statistics when saving the agent
@@ -564,7 +564,7 @@ Behind the scene, SB3 uses an :ref:`EvalCallback `.
     # Create the model, the training environment
     # and the test environment (for evaluation)
-    model = SAC('MlpPolicy', 'Pendulum-v1', verbose=1,
+    model = SAC("MlpPolicy", "Pendulum-v1", verbose=1,
                 learning_rate=1e-3, create_eval_env=True)
     # Evaluate the model every 1000 steps on 5 test episodes
@@ -717,7 +717,7 @@ to keep track of the agent progress.
     from stable_baselines3.common.vec_env import VecExtractDictObs, VecMonitor
     # ProcgenEnv is already vectorized
-    venv = ProcgenEnv(num_envs=2, env_name='starpilot')
+    venv = ProcgenEnv(num_envs=2, env_name="starpilot")
     # To use only part of the observation:
     # venv = VecExtractDictObs(venv, "rgb")
@@ -753,8 +753,8 @@ Record a mp4 video (here using a random agent).
     import gym
     from stable_baselines3.common.vec_env import VecVideoRecorder, DummyVecEnv
-    env_id = 'CartPole-v1'
-    video_folder = 'logs/videos/'
+    env_id = "CartPole-v1"
+    video_folder = "logs/videos/"
     video_length = 100
     env = DummyVecEnv([lambda: gym.make(env_id)])
@@ -792,11 +792,11 @@ Bonus: Make a GIF of a Trained Agent
     images = []
     obs = model.env.reset()
-    img = model.env.render(mode='rgb_array')
+    img = model.env.render(mode="rgb_array")
     for i in range(350):
         images.append(img)
         action, _ = model.predict(obs)
         obs, _, _ ,_ = model.env.step(action)
-        img = model.env.render(mode='rgb_array')
+        img = model.env.render(mode="rgb_array")
-    imageio.mimsave('lander_a2c.gif', [np.array(img) for i, img in enumerate(images) if i%2 == 0], fps=29)
+    imageio.mimsave("lander_a2c.gif", [np.array(img) for i, img in enumerate(images) if i%2 == 0], fps=29)
diff --git a/docs/guide/quickstart.rst b/docs/guide/quickstart.rst
index 064139d..3365b49 100644
--- a/docs/guide/quickstart.rst
+++ b/docs/guide/quickstart.rst
@@ -14,9 +14,9 @@ Here is a quick example of how to train and run A2C on a CartPole environment:
     from stable_baselines3 import A2C
-    env = gym.make('CartPole-v1')
+    env = gym.make("CartPole-v1")
-    model = A2C('MlpPolicy', env, verbose=1)
+    model = A2C("MlpPolicy", env, verbose=1)
     model.learn(total_timesteps=10000)
     obs = env.reset()
@@ -40,4 +40,4 @@ the policy is registered:
     from stable_baselines3 import A2C
-    model = A2C('MlpPolicy', 'CartPole-v1').learn(10000)
+    model = A2C("MlpPolicy", "CartPole-v1").learn(10000)
diff --git a/docs/guide/tensorboard.rst b/docs/guide/tensorboard.rst
index 89681d1..15cc67e 100644
--- a/docs/guide/tensorboard.rst
+++ b/docs/guide/tensorboard.rst
@@ -12,7 +12,7 @@ To use Tensorboard with stable baselines3, you simply need to pass the location
     from stable_baselines3 import A2C
-    model = A2C('MlpPolicy', 'CartPole-v1', verbose=1, tensorboard_log="./a2c_cartpole_tensorboard/")
+    model = A2C("MlpPolicy", "CartPole-v1", verbose=1, tensorboard_log="./a2c_cartpole_tensorboard/")
     model.learn(total_timesteps=10_000)
@@ -22,7 +22,7 @@ You can also define custom logging name when training (by default it is the algo
     from stable_baselines3 import A2C
-    model = A2C('MlpPolicy', 'CartPole-v1', verbose=1, tensorboard_log="./a2c_cartpole_tensorboard/")
+    model = A2C("MlpPolicy", "CartPole-v1", verbose=1, tensorboard_log="./a2c_cartpole_tensorboard/")
     model.learn(total_timesteps=10_000, tb_log_name="first_run")
     # Pass reset_num_timesteps=False to continue the training curve in tensorboard
     # By default, it will create a new curve
@@ -91,7 +91,7 @@ Here is a simple example on how to log both additional tensor or arbitrary scala
         def _on_step(self) -> bool:
             # Log scalar value (here a random variable)
             value = np.random.random()
-            self.logger.record('random_value', value)
+            self.logger.record("random_value", value)
             return True
diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst
index 738b676..36b77e5 100644
--- a/docs/misc/changelog.rst
+++ b/docs/misc/changelog.rst
@@ -58,6 +58,7 @@ Documentation:
 - Added link to a GitHub issue in the custom policy documentation (@AlexPasqua)
 - Update doc on exporting models (fixes and added torch jit)
 - Fixed typos (@Akhilez)
+- Standardized the use of ``"`` for string representation in documentation
 Release 1.6.0 (2022-07-11)
 ---------------------------
diff --git a/docs/modules/her.rst b/docs/modules/her.rst
index 82bf745..0b73351 100644
--- a/docs/modules/her.rst
+++ b/docs/modules/her.rst
@@ -73,7 +73,7 @@ This example is only to demonstrate the use of the library and its functions, an
     env = BitFlippingEnv(n_bits=N_BITS, continuous=model_class in [DDPG, SAC, TD3], max_steps=N_BITS)
     # Available strategies (cf paper): future, final, episode
-    goal_selection_strategy = 'future' # equivalent to GoalSelectionStrategy.FUTURE
+    goal_selection_strategy = "future" # equivalent to GoalSelectionStrategy.FUTURE
     # If True the HER transitions will get sampled online
     online_sampling = True
@@ -101,7 +101,7 @@ This example is only to demonstrate the use of the library and its functions, an
     model.save("./her_bit_env")
     # Because it needs access to `env.compute_reward()`
     # HER must be loaded with the env
-    model = model_class.load('./her_bit_env', env=env)
+    model = model_class.load("./her_bit_env", env=env)
     obs = env.reset()
     for _ in range(100):