From fd0cd82339511b54cd3907df228a656f2a32f0b8 Mon Sep 17 00:00:00 2001
From: Antonin RAFFIN
Date: Mon, 8 May 2023 13:48:26 +0200
Subject: [PATCH] Update outdated custom env doc (#1490)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Update outdated custom env doc

* fix render_mode and term/trunc/reset_info

* gym -> gymnasium

---------

Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
---
 README.md                 |  2 +-
 docs/guide/custom_env.rst | 29 +++++++++++++++--------------
 docs/guide/examples.rst   |  7 +++----
 docs/guide/quickstart.rst |  8 ++++----
 docs/guide/rl_tips.rst    |  6 +++---
 docs/misc/changelog.rst   |  1 +
 docs/modules/a2c.rst      | 12 +++++-------
 docs/modules/ddpg.rst     | 10 +++++-----
 docs/modules/dqn.rst      | 11 +++++------
 docs/modules/her.rst      | 10 ++++------
 docs/modules/ppo.rst      | 10 +++++-----
 docs/modules/sac.rst      | 12 +++++-------
 docs/modules/td3.rst      | 10 +++++-----
 setup.py                  |  2 +-
 14 files changed, 62 insertions(+), 68 deletions(-)

diff --git a/README.md b/README.md
index f7b9adc..8ceb052 100644
--- a/README.md
+++ b/README.md
@@ -139,7 +139,7 @@ for i in range(1000):
 env.close()
 ```
 
-Or just train a model with a one liner if [the environment is registered in Gym](https://github.com/openai/gym/wiki/Environments) and if [the policy is registered](https://stable-baselines3.readthedocs.io/en/master/guide/custom_policy.html):
+Or just train a model with a one liner if [the environment is registered in Gymnasium](https://gymnasium.farama.org/tutorials/gymnasium_basics/environment_creation/#registering-envs) and if [the policy is registered](https://stable-baselines3.readthedocs.io/en/master/guide/custom_policy.html):
 
 ```python
 from stable_baselines3 import PPO
diff --git a/docs/guide/custom_env.rst b/docs/guide/custom_env.rst
index 822c821..a449963 100644
--- a/docs/guide/custom_env.rst
+++ b/docs/guide/custom_env.rst
@@ -3,18 +3,19 @@
 Using Custom Environments
 ==========================
 
-To use the RL baselines with custom environments, they just need to follow the *gym* interface.
-That is to say, your environment must implement the following methods (and inherits from OpenAI Gym Class):
+To use the RL baselines with custom environments, they just need to follow the *gymnasium* `interface `_.
+That is to say, your environment must implement the following methods (and inherit from the Gym class):
 
 
 .. note::
-  If you are using images as input, the observation must be of type ``np.uint8`` and be contained in [0, 255].
-  By default, the observation is normalized by SB3 pre-processing (dividing by 255 to have values in [0, 1]) when using CNN policies.
-  Images can be either channel-first or channel-last.
+
+  If you are using images as input, the observation must be of type ``np.uint8`` and be contained in [0, 255].
+  By default, the observation is normalized by SB3 pre-processing (dividing by 255 to have values in [0, 1]) when using CNN policies.
+  Images can be either channel-first or channel-last.
 
   If you want to use ``CnnPolicy`` or ``MultiInputPolicy`` with image-like observation (3D tensor) that are already normalized, you must pass ``normalize_images=False``
-  to the policy (using ``policy_kwargs`` parameter, ``policy_kwargs=dict(normalize_images=False)``)
-  and make sure your image is in the **channel-first** format.
+  to the policy (using ``policy_kwargs`` parameter, ``policy_kwargs=dict(normalize_images=False)``)
+  and make sure your image is in the **channel-first** format.
 
 
 .. note::
@@ -34,7 +35,7 @@ That is to say, your environment must implement the following methods (and inher
     class CustomEnv(gym.Env):
         """Custom Environment that follows gym interface."""
 
-        metadata = {"render.modes": ["human"]}
+        metadata = {"render_modes": ["human"], "render_fps": 30}
 
         def __init__(self, arg1, arg2, ...):
             super().__init__()
@@ -48,11 +49,11 @@ That is to say, your environment must implement the following methods (and inher
         def step(self, action):
             ...
-            return observation, reward, done, info
+            return observation, reward, terminated, truncated, info
 
-        def reset(self):
+        def reset(self, seed=None, options=None):
             ...
-            return observation  # reward, done, info can't be included
+            return observation, info
 
         def render(self):
             ...
@@ -81,11 +82,11 @@ To check that your environment follows the Gym interface that SB3 supports, plea
     # It will check your custom environment and output additional warnings if needed
     check_env(env)
 
-Gym also have its own `env checker `_ but it checks a superset of what SB3 supports (SB3 does not support all Gym features).
+Gymnasium also has its own `env checker `_ but it checks a superset of what SB3 supports (SB3 does not support all Gym features).
 
-We have created a `colab notebook `_ for a concrete example on creating a custom environment along with an example of using it with Stable-Baselines3 interface.
+We have created a `colab notebook `_ for a concrete example of creating a custom environment, along with an example of using it with the Stable-Baselines3 interface.
 
-Alternatively, you may look at OpenAI Gym `built-in environments `_. However, the readers are cautioned as per OpenAI Gym `official wiki `_, its advised not to customize their built-in environments. It is better to copy and create new ones if you need to modify them.
+Alternatively, you may look at Gymnasium `built-in environments `_.
 
 Optionally, you can also register the environment with gym, that will allow you to create the RL agent in one line (and use ``gym.make()`` to instantiate the env):
 
diff --git a/docs/guide/examples.rst b/docs/guide/examples.rst
index 818f282..caefdbe 100644
--- a/docs/guide/examples.rst
+++ b/docs/guide/examples.rst
@@ -71,7 +71,7 @@ In the following example, we will train, save and load a DQN model on the Lunar
 
     # Create environment
-    env = gym.make("LunarLander-v2")
+    env = gym.make("LunarLander-v2", render_mode="rgb_array")
 
     # Instantiate the agent
     model = DQN("MlpPolicy", env, verbose=1)
@@ -99,7 +99,7 @@ In the following example, we will train, save and load a DQN model on the Lunar
     for i in range(1000):
         action, _states = model.predict(obs, deterministic=True)
         obs, rewards, dones, info = vec_env.step(action)
-        vec_env.render()
+        vec_env.render("human")
 
 
 Multiprocessing: Unleashing the Power of Vectorized Environments
@@ -116,7 +116,6 @@ Multiprocessing: Unleashing the Power of Vectorized Environments
 
 .. code-block:: python
 
     import gymnasium as gym
-    import numpy as np
 
     from stable_baselines3 import PPO
     from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
@@ -512,6 +511,7 @@ The parking env is a goal-conditioned continuous control task, in which the vehi
     # Load saved model
     # Because it needs access to `env.compute_reward()`
     # HER must be loaded with the env
+    env = gym.make("parking-v0", render_mode="human")  # Change the render mode
     model = SAC.load("her_sac_highway", env=env)
 
     obs, info = env.reset()
@@ -521,7 +521,6 @@ The parking env is a goal-conditioned continuous control task, in which the vehi
     for _ in range(100):
         action, _ = model.predict(obs, deterministic=True)
         obs, reward, terminated, truncated, info = env.step(action)
-        env.render()
         episode_reward += reward
         if terminated or truncated or info.get("is_success", False):
             print("Reward:", episode_reward, "Success?", info.get("is_success", False))
diff --git a/docs/guide/quickstart.rst b/docs/guide/quickstart.rst
index b22ac54..ba0a988 100644
--- a/docs/guide/quickstart.rst
+++ b/docs/guide/quickstart.rst
@@ -20,7 +20,7 @@ Here is a quick example of how to train and run A2C on a CartPole environment:
 
     from stable_baselines3 import A2C
 
-    env = gym.make("CartPole-v1")
+    env = gym.make("CartPole-v1", render_mode="rgb_array")
 
     model = A2C("MlpPolicy", env, verbose=1)
     model.learn(total_timesteps=10_000)
@@ -30,7 +30,7 @@ Here is a quick example of how to train and run A2C on a CartPole environment:
     for i in range(1000):
         action, _state = model.predict(obs, deterministic=True)
         obs, reward, done, info = vec_env.step(action)
-        vec_env.render()
+        vec_env.render("human")
         # VecEnv resets automatically
         # if done:
         #   obs = vec_env.reset()
@@ -40,8 +40,8 @@ Here is a quick example of how to train and run A2C on a CartPole environment:
 
 You can find explanations about the logger output and names in the :ref:`Logger ` section.
 
-Or just train a model with a one liner if
-`the environment is registered in Gym `_ and if
+Or just train a model with a one liner if
+`the environment is registered in Gymnasium `_ and if
 the policy is registered:
 
 .. code-block:: python
diff --git a/docs/guide/rl_tips.rst b/docs/guide/rl_tips.rst
index aa06a11..f82c163 100644
--- a/docs/guide/rl_tips.rst
+++ b/docs/guide/rl_tips.rst
@@ -210,14 +210,14 @@ If you want to quickly try a random agent on your environment, you can also do:
 .. code-block:: python
 
     env = YourEnv()
-    obs = env.reset()
+    obs, info = env.reset()
     n_steps = 10
     for _ in range(n_steps):
         # Random action
         action = env.action_space.sample()
-        obs, reward, done, info = env.step(action)
-        if done:
-            obs = env.reset()
+        obs, reward, terminated, truncated, info = env.step(action)
+        if terminated or truncated:
+            obs, info = env.reset()
 
 
 **Why should I normalize the action space?**
diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst
index 096648a..2edf3ad 100644
--- a/docs/misc/changelog.rst
+++ b/docs/misc/changelog.rst
@@ -70,6 +70,7 @@ Documentation:
 - Make it more explicit when using ``VecEnv`` vs Gym env
 - Added UAV_Navigation_DRL_AirSim to the project page (@heleidsn)
 - Added ``EvalCallback`` example (@sidney-tio)
+- Update custom env documentation
 
 
 Release 1.8.0 (2023-04-07)
diff --git a/docs/modules/a2c.rst b/docs/modules/a2c.rst
index 84a94ea..08e0222 100644
--- a/docs/modules/a2c.rst
+++ b/docs/modules/a2c.rst
@@ -53,15 +53,13 @@ Train a A2C agent on ``CartPole-v1`` using 4 environments.
 
 .. code-block:: python
 
-    import gymnasium as gym
-
     from stable_baselines3 import A2C
     from stable_baselines3.common.env_util import make_vec_env
 
     # Parallel environments
-    env = make_vec_env("CartPole-v1", n_envs=4)
+    vec_env = make_vec_env("CartPole-v1", n_envs=4)
 
-    model = A2C("MlpPolicy", env, verbose=1)
+    model = A2C("MlpPolicy", vec_env, verbose=1)
     model.learn(total_timesteps=25000)
     model.save("a2c_cartpole")
 
@@ -69,11 +67,11 @@ Train a A2C agent on ``CartPole-v1`` using 4 environments.
 
     model = A2C.load("a2c_cartpole")
 
-    obs = env.reset()
+    obs = vec_env.reset()
     while True:
         action, _states = model.predict(obs)
-        obs, rewards, dones, info = env.step(action)
-        env.render()
+        obs, rewards, dones, info = vec_env.step(action)
+        vec_env.render("human")
 
 .. note::
 
diff --git a/docs/modules/ddpg.rst b/docs/modules/ddpg.rst
index 4ac28cc..d1a2311 100644
--- a/docs/modules/ddpg.rst
+++ b/docs/modules/ddpg.rst
@@ -67,7 +67,7 @@ This example is only to demonstrate the use of the library and its functions, an
     from stable_baselines3 import DDPG
     from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
 
-    env = gym.make("Pendulum-v1")
+    env = gym.make("Pendulum-v1", render_mode="rgb_array")
 
     # The noise objects for DDPG
     n_actions = env.action_space.shape[-1]
@@ -76,17 +76,17 @@ This example is only to demonstrate the use of the library and its functions, an
     model = DDPG("MlpPolicy", env, action_noise=action_noise, verbose=1)
     model.learn(total_timesteps=10000, log_interval=10)
     model.save("ddpg_pendulum")
-    env = model.get_env()
+    vec_env = model.get_env()
 
     del model # remove to demonstrate saving and loading
 
     model = DDPG.load("ddpg_pendulum")
 
-    obs = env.reset()
+    obs = vec_env.reset()
     while True:
         action, _states = model.predict(obs)
-        obs, rewards, dones, info = env.step(action)
-        env.render()
+        obs, rewards, dones, info = vec_env.step(action)
+        vec_env.render("human")
 
 Results
 -------
diff --git a/docs/modules/dqn.rst b/docs/modules/dqn.rst
index 0569aa5..85d4866 100644
--- a/docs/modules/dqn.rst
+++ b/docs/modules/dqn.rst
@@ -60,7 +60,7 @@ This example is only to demonstrate the use of the library and its functions, an
 
     from stable_baselines3 import DQN
 
-    env = gym.make("CartPole-v1")
+    env = gym.make("CartPole-v1", render_mode="human")
 
     model = DQN("MlpPolicy", env, verbose=1)
     model.learn(total_timesteps=10000, log_interval=4)
@@ -70,13 +70,12 @@ This example is only to demonstrate the use of the library and its functions, an
 
     model = DQN.load("dqn_cartpole")
 
-    obs = env.reset()
+    obs, info = env.reset()
     while True:
         action, _states = model.predict(obs, deterministic=True)
-        obs, reward, done, info = env.step(action)
-        env.render()
-        if done:
-            obs = env.reset()
+        obs, reward, terminated, truncated, info = env.step(action)
+        if terminated or truncated:
+            obs, info = env.reset()
 
 
 Results
diff --git a/docs/modules/her.rst b/docs/modules/her.rst
index f23c76c..81b68d8 100644
--- a/docs/modules/her.rst
+++ b/docs/modules/her.rst
@@ -65,7 +65,6 @@ This example is only to demonstrate the use of the library and its functions, an
     from stable_baselines3 import HerReplayBuffer, DDPG, DQN, SAC, TD3
     from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy
     from stable_baselines3.common.envs import BitFlippingEnv
-    from stable_baselines3.common.vec_env import DummyVecEnv
 
     model_class = DQN  # works also with SAC, DDPG and TD3
     N_BITS = 15
@@ -96,13 +95,12 @@ This example is only to demonstrate the use of the library and its functions, an
     # HER must be loaded with the env
     model = model_class.load("./her_bit_env", env=env)
 
-    obs = env.reset()
+    obs, info = env.reset()
     for _ in range(100):
         action, _ = model.predict(obs, deterministic=True)
-        obs, reward, done, _ = env.step(action)
-
-        if done:
-            obs = env.reset()
+        obs, reward, terminated, truncated, _ = env.step(action)
+        if terminated or truncated:
+            obs, info = env.reset()
 
 
 Results
diff --git a/docs/modules/ppo.rst b/docs/modules/ppo.rst
index a822cb4..ace2fcc 100644
--- a/docs/modules/ppo.rst
+++ b/docs/modules/ppo.rst
@@ -71,9 +71,9 @@ Train a PPO agent on ``CartPole-v1`` using 4 environments.
     from stable_baselines3.common.env_util import make_vec_env
 
     # Parallel environments
-    env = make_vec_env("CartPole-v1", n_envs=4)
+    vec_env = make_vec_env("CartPole-v1", n_envs=4)
 
-    model = PPO("MlpPolicy", env, verbose=1)
+    model = PPO("MlpPolicy", vec_env, verbose=1)
     model.learn(total_timesteps=25000)
     model.save("ppo_cartpole")
 
@@ -81,11 +81,11 @@ Train a PPO agent on ``CartPole-v1`` using 4 environments.
 
     model = PPO.load("ppo_cartpole")
 
-    obs = env.reset()
+    obs = vec_env.reset()
     while True:
         action, _states = model.predict(obs)
-        obs, rewards, dones, info = env.step(action)
-        env.render()
+        obs, rewards, dones, info = vec_env.step(action)
+        vec_env.render("human")
 
 
 Results
diff --git a/docs/modules/sac.rst b/docs/modules/sac.rst
index 0e9bb3f..960a282 100644
--- a/docs/modules/sac.rst
+++ b/docs/modules/sac.rst
@@ -69,11 +69,10 @@ This example is only to demonstrate the use of the library and its functions, an
 .. code-block:: python
 
     import gymnasium as gym
-    import numpy as np
 
     from stable_baselines3 import SAC
 
-    env = gym.make("Pendulum-v1")
+    env = gym.make("Pendulum-v1", render_mode="human")
 
     model = SAC("MlpPolicy", env, verbose=1)
     model.learn(total_timesteps=10000, log_interval=4)
@@ -83,13 +82,12 @@ This example is only to demonstrate the use of the library and its functions, an
 
     model = SAC.load("sac_pendulum")
 
-    obs = env.reset()
+    obs, info = env.reset()
     while True:
         action, _states = model.predict(obs, deterministic=True)
-        obs, reward, done, info = env.step(action)
-        env.render()
-        if done:
-            obs = env.reset()
+        obs, reward, terminated, truncated, info = env.step(action)
+        if terminated or truncated:
+            obs, info = env.reset()
 
 
 Results
diff --git a/docs/modules/td3.rst b/docs/modules/td3.rst
index 7c17e64..7f9a154 100644
--- a/docs/modules/td3.rst
+++ b/docs/modules/td3.rst
@@ -67,7 +67,7 @@ This example is only to demonstrate the use of the library and its functions, an
     from stable_baselines3 import TD3
     from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
 
-    env = gym.make("Pendulum-v1")
+    env = gym.make("Pendulum-v1", render_mode="rgb_array")
 
     # The noise objects for TD3
     n_actions = env.action_space.shape[-1]
@@ -76,17 +76,17 @@ This example is only to demonstrate the use of the library and its functions, an
     model = TD3("MlpPolicy", env, action_noise=action_noise, verbose=1)
     model.learn(total_timesteps=10000, log_interval=10)
     model.save("td3_pendulum")
-    env = model.get_env()
+    vec_env = model.get_env()
 
     del model # remove to demonstrate saving and loading
 
     model = TD3.load("td3_pendulum")
 
-    obs = env.reset()
+    obs = vec_env.reset()
     while True:
         action, _states = model.predict(obs)
-        obs, rewards, dones, info = env.step(action)
-        env.render()
+        obs, rewards, dones, info = vec_env.step(action)
+        vec_env.render("human")
 
 Results
 -------
diff --git a/setup.py b/setup.py
index d72e77d..b170e6e 100644
--- a/setup.py
+++ b/setup.py
@@ -149,7 +149,7 @@ setup(
     url="https://github.com/DLR-RM/stable-baselines3",
     author_email="antonin.raffin@dlr.de",
     keywords="reinforcement-learning-algorithms reinforcement-learning machine-learning "
-    "gym openai stable baselines toolbox python data-science",
+    "gymnasium gym openai stable baselines toolbox python data-science",
     license="MIT",
     long_description=long_description,
     long_description_content_type="text/markdown",
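
All of the documentation changes above follow the same Gymnasium conventions: the render mode is chosen at construction time (``gym.make(..., render_mode=...)``), ``reset()`` accepts ``seed`` and ``options`` and returns ``(observation, info)``, ``step()`` returns ``(observation, reward, terminated, truncated, info)``, and ``metadata`` declares ``render_modes``/``render_fps``. The snippet below is a minimal sketch of a custom environment written against that interface; the ``GoRightEnv`` name, space definitions, and grid-world dynamics are illustrative assumptions rather than code from the patch, and it assumes ``gymnasium`` and ``stable_baselines3`` are installed.

.. code-block:: python

    import gymnasium as gym
    import numpy as np
    from gymnasium import spaces

    from stable_baselines3.common.env_checker import check_env


    class GoRightEnv(gym.Env):
        """Toy environment following the Gymnasium interface used in the patched docs."""

        metadata = {"render_modes": ["human"], "render_fps": 30}

        def __init__(self, grid_size=10, render_mode=None):
            super().__init__()
            self.grid_size = grid_size
            self.render_mode = render_mode
            self.agent_pos = 0
            self.action_space = spaces.Discrete(2)  # 0 = move left, 1 = move right
            self.observation_space = spaces.Box(low=0.0, high=grid_size, shape=(1,), dtype=np.float32)

        def _get_obs(self):
            return np.array([self.agent_pos], dtype=np.float32)

        def reset(self, seed=None, options=None):
            # Gymnasium API: seed the RNG through the parent class and return (obs, info)
            super().reset(seed=seed)
            self.agent_pos = 0
            return self._get_obs(), {}

        def step(self, action):
            self.agent_pos += 1 if action == 1 else -1
            self.agent_pos = int(np.clip(self.agent_pos, 0, self.grid_size))
            # Gymnasium API: report natural episode end and time-limit cutoff separately
            terminated = self.agent_pos == self.grid_size
            truncated = False
            reward = 1.0 if terminated else 0.0
            return self._get_obs(), reward, terminated, truncated, {}

        def render(self):
            if self.render_mode == "human":
                print("." * self.agent_pos + "x" + "." * (self.grid_size - self.agent_pos))

        def close(self):
            pass


    if __name__ == "__main__":
        # SB3's checker warns about anything the library does not support
        check_env(GoRightEnv(), warn=True)

Once the checker passes, the environment can be trained on like any registered one, e.g. ``PPO("MlpPolicy", GoRightEnv(), verbose=1).learn(5_000)`` (hypothetical usage, mirroring the examples in the patched docs).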