diff --git a/docs/guide/examples.rst b/docs/guide/examples.rst
index 929e587..a8f8de3 100644
--- a/docs/guide/examples.rst
+++ b/docs/guide/examples.rst
@@ -149,7 +149,7 @@ Multiprocessing: Unleashing the Power of Vectorized Environments
       # env = make_vec_env(env_id, n_envs=num_cpu, seed=0, vec_env_cls=SubprocVecEnv)
 
       model = PPO('MlpPolicy', env, verbose=1)
-      model.learn(total_timesteps=25000)
+      model.learn(total_timesteps=25_000)
 
       obs = env.reset()
       for _ in range(1000):
@@ -177,7 +177,7 @@ These dictionaries are randomly initilaized on the creation of the environment a
   env = SimpleMultiObsEnv(random_start=False)
 
   model = PPO("MultiInputPolicy", env, verbose=1)
-  model.learn(total_timesteps=1e5)
+  model.learn(total_timesteps=100_000)
 
 
 Using Callback: Monitoring Training
@@ -217,12 +217,12 @@ If your callback returns False, training is aborted early.
       Callback for saving a model (the check is done every ``check_freq`` steps)
       based on the training reward (in practice, we recommend using ``EvalCallback``).
 
-      :param check_freq: (int)
-      :param log_dir: (str) Path to the folder where the model will be saved.
+      :param check_freq: Number of steps between two checks of the training reward.
+      :param log_dir: Path to the folder where the model will be saved.
         It must contains the file created by the ``Monitor`` wrapper.
-      :param verbose: (int)
+      :param verbose: Verbosity level.
       """
-      def __init__(self, check_freq: int, log_dir: str, verbose=1):
+      def __init__(self, check_freq: int, log_dir: str, verbose: int = 1):
           super(SaveOnBestTrainingRewardCallback, self).__init__(verbose)
           self.check_freq = check_freq
           self.log_dir = log_dir
@@ -243,15 +243,15 @@ If your callback returns False, training is aborted early.
                 # Mean training reward over the last 100 episodes
                 mean_reward = np.mean(y[-100:])
                 if self.verbose > 0:
-                  print("Num timesteps: {}".format(self.num_timesteps))
-                  print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(self.best_mean_reward, mean_reward))
+                  print(f"Num timesteps: {self.num_timesteps}")
+                  print(f"Best mean reward: {self.best_mean_reward:.2f} - Last mean reward per episode: {mean_reward:.2f}")
 
                 # New best model, you could save the agent here
                 if mean_reward > self.best_mean_reward:
                     self.best_mean_reward = mean_reward
                     # Example for saving best model
                     if self.verbose > 0:
-                      print("Saving new best model to {}".format(self.save_path))
+                      print(f"Saving new best model to {self.save_path}")
                     self.model.save(self.save_path)
 
           return True
@@ -313,7 +313,7 @@ and multiprocessing for you.
   env = VecFrameStack(env, n_stack=4)
 
   model = A2C('CnnPolicy', env, verbose=1)
-  model.learn(total_timesteps=25000)
+  model.learn(total_timesteps=25_000)
 
   obs = env.reset()
   while True:
@@ -495,10 +495,10 @@ linear and constant schedules.
 
   # Initial learning rate of 0.001
   model = PPO("MlpPolicy", "CartPole-v1", learning_rate=linear_schedule(0.001), verbose=1)
-  model.learn(total_timesteps=20000)
+  model.learn(total_timesteps=20_000)
   # By default, `reset_num_timesteps` is True, in which case the learning rate schedule resets.
   # progress_remaining = 1.0 - (num_timesteps / total_timesteps)
-  model.learn(total_timesteps=10000, reset_num_timesteps=True)
+  model.learn(total_timesteps=10_000, reset_num_timesteps=True)
 
 
 Advanced Saving and Loading
@@ -630,7 +630,7 @@ A2C policy gradient updates on the model.
 
   # Use traditional actor-critic policy gradient updates to
   # find good initial parameters
-  model.learn(total_timesteps=10000)
+  model.learn(total_timesteps=10_000)
 
   # Include only variables with "policy", "action" (policy) or "shared_net" (shared layers)
   # in their name: only these ones affect the action.
@@ -698,7 +698,7 @@ to keep track of the agent progress.
   venv = VecMonitor(venv=venv)
 
   model = PPO("MultiInputPolicy", venv, verbose=1)
-  model.learn(10000)
+  model.learn(10_000)
 
 
 Record a Video
@@ -726,7 +726,7 @@ Record a mp4 video (here using a random agent).
   # Record the video starting at the first step
   env = VecVideoRecorder(env, video_folder,
                          record_video_trigger=lambda x: x == 0, video_length=video_length,
-                         name_prefix="random-agent-{}".format(env_id))
+                         name_prefix=f"random-agent-{env_id}")
 
   env.reset()
   for _ in range(video_length + 1):
@@ -750,7 +750,7 @@ Bonus: Make a GIF of a Trained Agent
 
   from stable_baselines3 import A2C
 
-  model = A2C("MlpPolicy", "LunarLander-v2").learn(100000)
+  model = A2C("MlpPolicy", "LunarLander-v2").learn(100_000)
 
   images = []
   obs = model.env.reset()
diff --git a/docs/guide/tensorboard.rst b/docs/guide/tensorboard.rst
index 833ab1b..0929b9e 100644
--- a/docs/guide/tensorboard.rst
+++ b/docs/guide/tensorboard.rst
@@ -13,7 +13,7 @@ To use Tensorboard with stable baselines3, you simply need to pass the location
     from stable_baselines3 import A2C
 
     model = A2C('MlpPolicy', 'CartPole-v1', verbose=1, tensorboard_log="./a2c_cartpole_tensorboard/")
-    model.learn(total_timesteps=10000)
+    model.learn(total_timesteps=10_000)
 
 
 You can also define custom logging name when training (by default it is the algorithm name)
@@ -23,11 +23,11 @@ You can also define custom logging name when training (by default it is the algo
     from stable_baselines3 import A2C
 
     model = A2C('MlpPolicy', 'CartPole-v1', verbose=1, tensorboard_log="./a2c_cartpole_tensorboard/")
-    model.learn(total_timesteps=10000, tb_log_name="first_run")
+    model.learn(total_timesteps=10_000, tb_log_name="first_run")
     # Pass reset_num_timesteps=False to continue the training curve in tensorboard
     # By default, it will create a new curve
-    model.learn(total_timesteps=10000, tb_log_name="second_run", reset_num_timesteps=False)
-    model.learn(total_timesteps=10000, tb_log_name="third_run", reset_num_timesteps=False)
+    model.learn(total_timesteps=10_000, tb_log_name="second_run", reset_num_timesteps=False)
+    model.learn(total_timesteps=10_000, tb_log_name="third_run", reset_num_timesteps=False)
 
 
 Once the learn function is called, you can monitor the RL agent during or after the training, with the following bash command:
diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst
index 8ff6173..3be83f8 100644
--- a/docs/misc/changelog.rst
+++ b/docs/misc/changelog.rst
@@ -4,9 +4,15 @@ Changelog
 ==========
 
 
-Release 1.2.1a4 (WIP)
+Release 1.2.1a5 (WIP)
 ---------------------------
 
+.. warning::
+
+  This version will be the last one supporting Python 3.6 (end of life in Dec 2021).
+  We highly recommend that you upgrade to Python >= 3.7.
+
+
 Breaking Changes:
 ^^^^^^^^^^^^^^^^^
 - ``sde_net_arch`` argument in policies is deprecated and will be removed in a future version.
@@ -31,6 +37,7 @@ Bug Fixes:
   when observation normalization is disabled.
 - Fixed a bug where ``DQN`` would throw an error when using ``Discrete`` observation and stochastic actions
 - Fixed a bug where sub-classed observation spaces could not be used
+- Added ``force_reset`` argument to ``load()`` and ``set_env()`` in order to be able to call ``learn(reset_num_timesteps=False)`` with a new environment
 
 Deprecations:
 ^^^^^^^^^^^^^
@@ -40,6 +47,7 @@ Others:
 - Cap gym max version to 0.19 to avoid issues with atari-py and other breaking changes
 - Improved error message when using dict observation with the wrong policy
 - Improved error message when using ``EvalCallback`` with two envs not wrapped the same way.
+- Added additional information about supported Python versions for PyPI in ``setup.py``
 
 Documentation:
 ^^^^^^^^^^^^^^
@@ -51,7 +59,7 @@ Documentation:
 - Fix PPO environment name (@IljaAvadiev)
 - Fix custom env doc and add env registration example
 - Update algorithms from SB3 Contrib
-
+- Use underscores for numeric literals in examples to improve clarity
 
 Release 1.2.0 (2021-09-03)
 ---------------------------
diff --git a/setup.py b/setup.py
index cb57859..c5b15f5 100644
--- a/setup.py
+++ b/setup.py
@@ -134,6 +134,15 @@ setup(
     long_description=long_description,
     long_description_content_type="text/markdown",
     version=__version__,
+    python_requires=">=3.6",
+    # PyPI package information.
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.6",
+        "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+    ],
 )
 
 # python setup.py sdist
diff --git a/stable_baselines3/common/base_class.py b/stable_baselines3/common/base_class.py
index 1b38555..8872f41 100644
--- a/stable_baselines3/common/base_class.py
+++ b/stable_baselines3/common/base_class.py
@@ -478,7 +478,7 @@ class BaseAlgorithm(ABC):
         """
         return self._vec_normalize_env
 
-    def set_env(self, env: GymEnv) -> None:
+    def set_env(self, env: GymEnv, force_reset: bool = True) -> None:
         """
         Checks the validity of the environment, and if it is coherent, set it as the current environment.
         Furthermore wrap any non vectorized env into a vectorized
@@ -487,12 +487,19 @@ class BaseAlgorithm(ABC):
         - action_space
 
         :param env: The environment for learning a policy
+        :param force_reset: Force call to ``reset()`` before training
+            to avoid unexpected behavior.
+            See issue https://github.com/DLR-RM/stable-baselines3/issues/597
         """
         # if it is not a VecEnv, make it a VecEnv
         # and do other transformations (dict obs, image transpose) if needed
         env = self._wrap_env(env, self.verbose)
         # Check that the observation spaces match
         check_for_correct_spaces(env, self.observation_space, self.action_space)
+        # Discard `_last_obs`, this will force the env to reset before training
+        # See issue https://github.com/DLR-RM/stable-baselines3/issues/597
+        if force_reset:
+            self._last_obs = None
 
         self.n_envs = env.num_envs
         self.env = env
@@ -636,6 +643,7 @@ class BaseAlgorithm(ABC):
         device: Union[th.device, str] = "auto",
         custom_objects: Optional[Dict[str, Any]] = None,
         print_system_info: bool = False,
+        force_reset: bool = True,
         **kwargs,
     ) -> "BaseAlgorithm":
         """
@@ -654,6 +662,9 @@ class BaseAlgorithm(ABC):
             file that can not be deserialized.
         :param print_system_info: Whether to print system info from the saved model
             and the current system info (useful to debug loading issues)
+        :param force_reset: Force call to ``reset()`` before training
+            to avoid unexpected behavior.
+            See https://github.com/DLR-RM/stable-baselines3/issues/597
         :param kwargs: extra arguments to change the model when loading
         """
         if print_system_info:
@@ -683,6 +694,10 @@ class BaseAlgorithm(ABC):
             env = cls._wrap_env(env, data["verbose"])
             # Check if given env is valid
             check_for_correct_spaces(env, data["observation_space"], data["action_space"])
+            # Discard `_last_obs`, this will force the env to reset before training
+            # See issue https://github.com/DLR-RM/stable-baselines3/issues/597
+            if force_reset and data is not None:
+                data["_last_obs"] = None
         else:
             # Use stored env, if one exists. If not, continue as is (can be used for predict)
             if "env" in data:
diff --git a/stable_baselines3/version.txt b/stable_baselines3/version.txt
index 16156e3..5bcefe3 100644
--- a/stable_baselines3/version.txt
+++ b/stable_baselines3/version.txt
@@ -1 +1 @@
-1.2.1a4
+1.2.1a5
diff --git a/tests/test_save_load.py b/tests/test_save_load.py
index 1454b98..69a3f48 100644
--- a/tests/test_save_load.py
+++ b/tests/test_save_load.py
@@ -163,9 +163,10 @@ def test_save_load(tmp_path, model_class):
 
 
 @pytest.mark.parametrize("model_class", MODEL_LIST)
-def test_set_env(model_class):
+def test_set_env(tmp_path, model_class):
     """
     Test if set_env function does work correct
+
     :param model_class: (BaseAlgorithm) A RL model
     """
 
@@ -176,24 +177,54 @@ def test_set_env(model_class):
 
     kwargs = {}
     if model_class in {DQN, DDPG, SAC, TD3}:
-        kwargs = dict(learning_starts=100, train_freq=4)
+        kwargs = dict(learning_starts=50, train_freq=4)
     elif model_class in {A2C, PPO}:
         kwargs = dict(n_steps=64)
 
     # create model
     model = model_class("MlpPolicy", env, policy_kwargs=dict(net_arch=[16]), **kwargs)
     # learn
-    model.learn(total_timesteps=128)
+    model.learn(total_timesteps=64)
 
     # change env
-    model.set_env(env2)
+    model.set_env(env2, force_reset=True)
+    # Check that last obs was discarded
+    assert model._last_obs is None
     # learn again
-    model.learn(total_timesteps=128)
+    model.learn(total_timesteps=64, reset_num_timesteps=True)
+    assert model.num_timesteps == 64
 
     # change env test wrapping
     model.set_env(env3)
     # learn again
-    model.learn(total_timesteps=128)
+    model.learn(total_timesteps=64)
+
+    # Keep the same env, disable reset
+    model.set_env(model.get_env(), force_reset=False)
+    assert model._last_obs is not None
+    # learn again
+    model.learn(total_timesteps=64, reset_num_timesteps=False)
+    assert model.num_timesteps == 2 * 64
+
+    current_env = model.get_env()
+    model.save(tmp_path / "test_save.zip")
+    del model
+    # Check that we can keep the number of timesteps after loading
+    # Here the env kept its state so we don't have to reset
+    model = model_class.load(tmp_path / "test_save.zip", env=current_env, force_reset=False)
+    assert model._last_obs is not None
+    model.learn(total_timesteps=64, reset_num_timesteps=False)
+    assert model.num_timesteps == 3 * 64
+
+    del model
+    # We are changing the env, the env must reset but we should keep the number of timesteps
+    model = model_class.load(tmp_path / "test_save.zip", env=env3, force_reset=True)
+    assert model._last_obs is None
+    model.learn(total_timesteps=64, reset_num_timesteps=False)
+    assert model.num_timesteps == 3 * 64
+
+    # Clear saved file
+    os.remove(tmp_path / "test_save.zip")
 
 
 @pytest.mark.parametrize("model_class", MODEL_LIST)