Closed

Changes from 16 commits (46 commits in total)

Commits
692492c
Update requirements
araffin May 29, 2022
f9b0fb5
Merge branch 'master' into feat/gym-0.24
araffin May 29, 2022
3172ee6
Updates for newest gym version
araffin May 29, 2022
ba37700
Load the env only when needed
araffin May 31, 2022
dd65644
Merge branch 'master' into feat/gym-0.24
araffin Jun 10, 2022
aefcea6
Pin seaborn dependency
araffin Jun 10, 2022
d685d32
Update dependencies
araffin Jun 14, 2022
e5ef2b3
Fix pytorch download
araffin Jun 14, 2022
7f4ecbb
Merge branch 'master' into feat/gym-0.24
araffin Jun 23, 2022
6829642
Update requirements
araffin Jun 23, 2022
60ada0e
Update scipy requirement
araffin Jun 23, 2022
90b27b4
No scipy needed
araffin Jun 23, 2022
cdb8474
Merge branch 'master' of https://github.com/DLR-RM/rl-baselines3-zoo …
qgallouedec Oct 26, 2022
65f2bbc
Load the env only when needed
qgallouedec Oct 26, 2022
15fe4a0
Remove duplicated seaborn dependency
qgallouedec Oct 26, 2022
9a7fedd
done to terminated, truncated
qgallouedec Oct 26, 2022
e98c2a2
Revert VecEnv step modification (still return done)
qgallouedec Oct 26, 2022
b5b8c24
gym26 reset format
qgallouedec Oct 26, 2022
b17afd4
lint step wrappers
qgallouedec Oct 26, 2022
cb8e1b7
update render
qgallouedec Oct 26, 2022
865c5e0
black
qgallouedec Oct 26, 2022
f425058
gym 0.24 -> gym 0.26
qgallouedec Oct 26, 2022
478198c
Update ale-py dependency
qgallouedec Oct 26, 2022
9937d2a
Merge branch 'master' into feat/gym-0.24
araffin Oct 31, 2022
d689a90
Fixes bundle for gym 0.26
araffin Oct 31, 2022
bbdf8a6
Fix requirement
araffin Oct 31, 2022
d508eea
Update highway env
araffin Nov 14, 2022
8144345
Merge branch 'master' into feat/gym-0.24
araffin Nov 28, 2022
50fb759
Merge branch 'master' into feat/gym-0.24
araffin Dec 13, 2022
022e64b
Merge branch 'master' into feat/gym-0.24
araffin Dec 18, 2022
aad58ab
Fix panda env check
araffin Dec 18, 2022
edf185b
Pass render mode for no vel env
araffin Dec 19, 2022
70ddfb1
Declared exported methods
araffin Dec 19, 2022
a7bdc5e
Skip panda env for now
araffin Dec 19, 2022
d258dab
Fixes due to render_mode
araffin Dec 19, 2022
f272bad
Upgrade highway env version
araffin Dec 20, 2022
068adc8
Add gym patches
araffin Dec 22, 2022
54aa5d1
Another patch for gym
araffin Dec 22, 2022
0affb2c
Fix lint warning
araffin Dec 22, 2022
4e019e8
Patch Atari game video recording
araffin Dec 23, 2022
23f7c1a
Enable tests when display is available
araffin Dec 23, 2022
c54b547
Merge branch 'master' into feat/gym-0.24
araffin Jan 2, 2023
7bac0b3
Fix type checker
araffin Jan 2, 2023
2e1ce9b
Merge branch 'master' into feat/gym-0.24
araffin Jan 5, 2023
8a69bf5
Merge branch 'master' into feat/gym-0.24
araffin Feb 13, 2023
305e0cc
Remove gitlab file
araffin Feb 13, 2023
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -29,7 +29,7 @@ jobs:
run: |
python -m pip install --upgrade pip
# cpu version of pytorch - faster to download
- pip install torch==1.11+cpu -f https://download.pytorch.org/whl/torch_stable.html
+ pip install torch==1.11.0+cpu --extra-index-url https://download.pytorch.org/whl/cpu
pip install pybullet==3.1.9
pip install -r requirements.txt
# Use headless version
2 changes: 1 addition & 1 deletion .github/workflows/trained_agents.yml
@@ -29,7 +29,7 @@ jobs:
run: |
python -m pip install --upgrade pip
# cpu version of pytorch - faster to download
- pip install torch==1.11+cpu -f https://download.pytorch.org/whl/torch_stable.html
+ pip install torch==1.11.0+cpu --extra-index-url https://download.pytorch.org/whl/cpu
pip install pybullet==3.1.9
pip install -r requirements.txt
# Use headless version
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -1,6 +1,7 @@
## Release 1.7.0a1 (WIP)

### Breaking Changes
+- Upgraded to gym 0.24

### New Features
- Specifying custom policies in yaml file is now supported (@Rick-v-E)
2 changes: 1 addition & 1 deletion README.md
@@ -289,7 +289,7 @@ for multiple, specify a list:

```yaml
env_wrapper:
-  - rl_zoo3.wrappers.DoneOnSuccessWrapper:
+  - rl_zoo3.wrappers.TruncatedOnSuccessWrapper:
reward_offset: 1.0
- sb3_contrib.common.wrappers.TimeFeatureWrapper
```
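As a rough illustration of what that `env_wrapper` list amounts to, the wrappers are applied in order around the created env. This is a sketch only, not the zoo's actual construction code, and the goal-based env id is hypothetical (`FetchPush-v1` assumes a robotics env package is installed):

```python
# Illustrative only: rl_zoo3 builds this wrapper chain internally from the YAML.
import gym
from sb3_contrib.common.wrappers import TimeFeatureWrapper

from rl_zoo3.wrappers import TruncatedOnSuccessWrapper

env = gym.make("FetchPush-v1")  # hypothetical goal-based env id
env = TruncatedOnSuccessWrapper(env, reward_offset=1.0)  # first entry in the list
env = TimeFeatureWrapper(env)  # second entry
```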
2 changes: 1 addition & 1 deletion hyperparams/her.yml
@@ -59,7 +59,7 @@ FetchSlide-v1:
FetchPickAndPlace-v1:
env_wrapper:
- sb3_contrib.common.wrappers.TimeFeatureWrapper
-# - rl_zoo3.wrappers.DoneOnSuccessWrapper:
+# - rl_zoo3.wrappers.TruncatedOnSuccessWrapper:
# reward_offset: 0
# n_successes: 4
# - stable_baselines3.common.monitor.Monitor
2 changes: 1 addition & 1 deletion hyperparams/ppo.yml
@@ -317,7 +317,7 @@ MiniGrid-FourRooms-v0:
learning_rate: 2.5e-4
clip_range: 0.2

-CarRacing-v0:
+CarRacing-v1:
env_wrapper:
- rl_zoo3.wrappers.FrameSkip:
skip: 2
2 changes: 1 addition & 1 deletion hyperparams/sac.yml
@@ -161,7 +161,7 @@ MinitaurBulletDuckEnv-v0:
learning_starts: 10000

# To be tuned
-CarRacing-v0:
+CarRacing-v1:
env_wrapper:
- rl_zoo3.wrappers.FrameSkip:
skip: 2
17 changes: 10 additions & 7 deletions requirements.txt
@@ -1,20 +1,23 @@
-gym==0.21
-stable-baselines3[extra,tests,docs]>=1.6.2
-sb3-contrib>=1.6.2
+gym==0.24.1
+# stable-baselines3[extra,tests,docs]>=1.5.1a7
+git+https://github.com/carlosluis/stable-baselines3@fix_tests#egg=stable_baselines3[extra,tests,docs]
+# sb3-contrib>=1.5.0
+git+https://github.com/Stable-Baselines-Team/stable-baselines3-contrib@feat/new-gym-version
box2d-py==2.3.8
pybullet
gym-minigrid
-scikit-optimize
-optuna
+# scikit-optimize
+optuna~=2.10.1
pytablewriter~=0.64
pyyaml>=5.1
cloudpickle>=1.5.0
plotly
-panda-gym==1.1.1 # tmp fix: until compatibility with panda-gym v2
+panda-gym~=2.0.2
rliable>=1.0.5
wandb
ale-py==0.7.5
huggingface_sb3>=2.2.1, <3.*
-seaborn
+seaborn~=0.11.2
tqdm
rich
+importlib-metadata~=4.13 # flake8 not compatible with importlib-metadata>5.0
15 changes: 9 additions & 6 deletions rl_zoo3/enjoy.py
@@ -188,8 +188,10 @@ def enjoy(): # noqa: C901
"clip_range": lambda _: 0.0,
}

-model = ALGOS[algo].load(model_path, env=env, custom_objects=custom_objects, device=args.device, **kwargs)
+if "HerReplayBuffer" in hyperparams.get("replay_buffer_class", ""):
+kwargs["env"] = env

+model = ALGOS[algo].load(model_path, custom_objects=custom_objects, device=args.device, **kwargs)
obs = env.reset()

# Deterministic by default except for atari games
@@ -218,9 +220,9 @@ def enjoy(): # noqa: C901
episode_start=episode_start,
deterministic=deterministic,
)
-obs, reward, done, infos = env.step(action)
+obs, reward, termination, truncation, infos = env.step(action)

-episode_start = done
+episode_start = termination or truncation

if not args.no_render:
env.render("human")
@@ -236,8 +238,8 @@ def enjoy(): # noqa: C901
if episode_infos is not None:
print(f"Atari Episode Score: {episode_infos['r']:.2f}")
print("Atari Episode Length", episode_infos["l"])

-if done and not is_atari and args.verbose > 0:
+# TODO: episode_start is a confusing name here, should we rename to episode_end?
+if episode_start and not is_atari and args.verbose > 0:
# NOTE: for env using VecNormalize, the mean reward
# is a normalized reward when `--norm_reward` flag is passed
print(f"Episode Reward: {episode_reward:.2f}")
@@ -248,7 +250,8 @@ def enjoy(): # noqa: C901
ep_len = 0

# Reset also when the goal is achieved when using HER
-if done and infos[0].get("is_success") is not None:
+# TODO: episode_start is a confusing name here, should we rename to episode_end?
+if episode_start and infos[0].get("is_success") is not None:
if args.verbose > 1:
print("Success?", infos[0].get("is_success", False))

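For reference, the loop above follows the gym 0.26-style API, where `reset()` returns `(obs, info)` and `step()` returns five values. A minimal sketch of that convention on a plain, non-vectorized env, assuming gym>=0.26 and the built-in `CartPole-v1` (this is not the zoo's code, which goes through VecEnv):

```python
import gym

env = gym.make("CartPole-v1")
obs, info = env.reset(seed=0)  # reset now takes a seed and returns (obs, info)
done = False
while not done:
    action = env.action_space.sample()
    # gym>=0.26: five return values instead of (obs, reward, done, info)
    obs, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated  # the episode ends on either flag
env.close()
```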
6 changes: 5 additions & 1 deletion rl_zoo3/exp_manager.py
@@ -4,6 +4,7 @@
import time
import warnings
from collections import OrderedDict
+from copy import deepcopy
from pathlib import Path
from pprint import pprint
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
@@ -570,11 +571,14 @@ def create_envs(self, n_envs: int, eval_env: bool = False, no_log: bool = False)

# On most env, SubprocVecEnv does not help and is quite memory hungry
# therefore we use DummyVecEnv by default
+# Fix for gym 0.24, to keep old behavior
+env_kwargs = deepcopy(self.env_kwargs)
+env_kwargs.update(disable_env_checker=True)
env = make_vec_env(
env_id=self.env_name.gym_id,
n_envs=n_envs,
seed=self.seed,
-env_kwargs=self.env_kwargs,
+env_kwargs=env_kwargs,
monitor_dir=log_dir,
wrapper_class=self.env_wrapper,
vec_env_cls=self.vec_env_class,
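The `disable_env_checker=True` kwarg keeps the pre-0.24 behaviour: from gym 0.24 on, `gym.make()` wraps new envs in a passive env checker unless told otherwise. A rough sketch of the same idea outside `ExperimentManager`, assuming SB3's `make_vec_env` forwards `env_kwargs` to `gym.make()` (the empty `env_kwargs` dict stands in for `self.env_kwargs`):

```python
from copy import deepcopy

from stable_baselines3.common.env_util import make_vec_env

env_kwargs = {}  # stand-in for self.env_kwargs
env_kwargs = deepcopy(env_kwargs)  # copy so the stored kwargs are not mutated
env_kwargs.update(disable_env_checker=True)  # forwarded to gym.make()

vec_env = make_vec_env("CartPole-v1", n_envs=2, env_kwargs=env_kwargs)
```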
4 changes: 2 additions & 2 deletions rl_zoo3/record_video.py
@@ -153,8 +153,8 @@
episode_start=episode_starts,
deterministic=deterministic,
)
-obs, _, dones, _ = env.step(action)
-episode_starts = dones
+obs, _, terminated, truncated, _ = env.step(action)
+episode_starts = np.logical_or(terminated, truncated)
if not args.no_render:
env.render()
except KeyboardInterrupt:
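Unlike the scalar flags in `enjoy.py`, the recording script steps a vectorized env, so `terminated` and `truncated` come back as boolean arrays and `np.logical_or` combines them element-wise. A small illustration with made-up values:

```python
import numpy as np

# one entry per parallel environment
terminated = np.array([False, True, False])
truncated = np.array([False, False, True])  # e.g. stopped by a TimeLimit

episode_starts = np.logical_or(terminated, truncated)
print(episode_starts)  # [False  True  True]
```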
5 changes: 5 additions & 0 deletions rl_zoo3/utils.py
@@ -2,6 +2,7 @@
import glob
import importlib
import os
+from copy import deepcopy
from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union

import gym
@@ -231,6 +232,10 @@ def create_test_env(
vec_env_cls = SubprocVecEnv
# start_method = 'spawn' for thread safe

+# Fix for gym 0.24, to keep old behavior
+env_kwargs = deepcopy(env_kwargs)
+env_kwargs.update(disable_env_checker=True)

env = make_vec_env(
env_id,
n_envs=n_envs,
48 changes: 26 additions & 22 deletions rl_zoo3/wrappers.py
@@ -1,9 +1,11 @@
+from typing import Optional

import gym
import numpy as np
from sb3_contrib.common.wrappers import TimeFeatureWrapper # noqa: F401 (backward compatibility)


-class DoneOnSuccessWrapper(gym.Wrapper):
+class TruncatedOnSuccessWrapper(gym.Wrapper):
"""
Reset on success and offsets the reward.
Useful for GoalEnv.
@@ -15,20 +17,21 @@ def __init__(self, env: gym.Env, reward_offset: float = 0.0, n_successes: int =
self.n_successes = n_successes
self.current_successes = 0

-def reset(self):
+def reset(self, seed: Optional[int] = None):
self.current_successes = 0
-return self.env.reset()
+kwargs = {} if seed is None else {"seed": seed}
+return self.env.reset(**kwargs)

def step(self, action):
-obs, reward, done, info = self.env.step(action)
+obs, reward, terminated, truncated, info = self.env.step(action)
if info.get("is_success", False):
self.current_successes += 1
else:
self.current_successes = 0
# number of successes in a row
-done = done or self.current_successes >= self.n_successes
+truncated = truncated or self.current_successes >= self.n_successes
reward += self.reward_offset
-return obs, reward, done, info
+return obs, reward, terminated, truncated, info

def compute_reward(self, achieved_goal, desired_goal, info):
reward = self.env.compute_reward(achieved_goal, desired_goal, info)
@@ -103,17 +106,17 @@ def reset(self):
return self.env.reset()

def step(self, action):
-obs, reward, done, info = self.env.step(action)
+obs, reward, terminated, truncated, info = self.env.step(action)

self.accumulated_reward += reward
self.current_step += 1

-if self.current_step % self.delay == 0 or done:
+if self.current_step % self.delay == 0 or terminated or truncated:
reward = self.accumulated_reward
self.accumulated_reward = 0.0
else:
reward = 0.0
-return obs, reward, done, info
+return obs, reward, terminated, truncated, info


class HistoryWrapper(gym.Wrapper):
@@ -155,24 +158,25 @@ def __init__(self, env: gym.Env, horizon: int = 2):
def _create_obs_from_history(self):
return np.concatenate((self.obs_history, self.action_history))

-def reset(self):
+def reset(self, seed: Optional[int] = None):
# Flush the history
self.obs_history[...] = 0
self.action_history[...] = 0
-obs = self.env.reset()
+kwargs = {} if seed is None else {"seed": seed}
+obs = self.env.reset(**kwargs)
self.obs_history[..., -obs.shape[-1] :] = obs
return self._create_obs_from_history()

def step(self, action):
-obs, reward, done, info = self.env.step(action)
+obs, reward, terminated, truncated, info = self.env.step(action)
last_ax_size = obs.shape[-1]

self.obs_history = np.roll(self.obs_history, shift=-last_ax_size, axis=-1)
self.obs_history[..., -obs.shape[-1] :] = obs

self.action_history = np.roll(self.action_history, shift=-action.shape[-1], axis=-1)
self.action_history[..., -action.shape[-1] :] = action
-return self._create_obs_from_history(), reward, done, info
+return self._create_obs_from_history(), reward, terminated, truncated, info


class HistoryWrapperObsDict(gym.Wrapper):
@@ -214,11 +218,12 @@ def __init__(self, env: gym.Env, horizon: int = 2):
def _create_obs_from_history(self):
return np.concatenate((self.obs_history, self.action_history))

-def reset(self):
+def reset(self, seed: Optional[int] = None):
# Flush the history
self.obs_history[...] = 0
self.action_history[...] = 0
-obs_dict = self.env.reset()
+kwargs = {} if seed is None else {"seed": seed}
+obs_dict = self.env.reset(**kwargs)
obs = obs_dict["observation"]
self.obs_history[..., -obs.shape[-1] :] = obs

@@ -227,7 +232,7 @@ def reset(self):
return obs_dict

def step(self, action):
-obs_dict, reward, done, info = self.env.step(action)
+obs_dict, reward, terminated, truncated, info = self.env.step(action)
obs = obs_dict["observation"]
last_ax_size = obs.shape[-1]

@@ -239,7 +244,7 @@ def reset(self):

obs_dict["observation"] = self._create_obs_from_history()

-return obs_dict, reward, done, info
+return obs_dict, reward, terminated, truncated, info


class FrameSkip(gym.Wrapper):
Expand All @@ -260,17 +265,16 @@ def step(self, action: np.ndarray):
Repeat action, sum reward.

:param action: the action
-:return: observation, reward, done, information
+:return: observation, reward, terminated, truncated, information
"""
total_reward = 0.0
-done = None
for _ in range(self._skip):
-obs, reward, done, info = self.env.step(action)
+obs, reward, terminated, truncated, info = self.env.step(action)
total_reward += reward
-if done:
+if terminated or truncated:
break

-return obs, total_reward, done, info
+return obs, total_reward, terminated, truncated, info

def reset(self):
return self.env.reset()
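The wrapper changes above all follow the same two patterns: `reset()` optionally forwards a `seed`, and `step()` passes the `(terminated, truncated)` pair through instead of a single `done`. A minimal sketch of that pattern under the gym>=0.26 API; the class name is invented for illustration:

```python
from typing import Optional

import gym


class PassThroughWrapper(gym.Wrapper):  # invented name, not part of rl_zoo3
    def reset(self, seed: Optional[int] = None):
        # forward the seed only when one was given, as the wrappers above do
        kwargs = {} if seed is None else {"seed": seed}
        return self.env.reset(**kwargs)

    def step(self, action):
        obs, reward, terminated, truncated, info = self.env.step(action)
        # a wrapper that ends episodes early (e.g. on success) should set
        # truncated rather than terminated, mirroring TruncatedOnSuccessWrapper
        return obs, reward, terminated, truncated, info
```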
1 change: 1 addition & 0 deletions tests/test_hyperparams_opt.py
@@ -47,6 +47,7 @@ def test_optimize(tmp_path, sampler, pruner, experiment):
args = ["-n", str(N_STEPS), "--algo", algo, "--env", env_id, "-params", 'policy_kwargs:"dict(net_arch=[32])"', "n_envs:1"]
args += ["n_steps:10"] if algo == "ppo" else []
args += [
"--no-optim-plots",
"--seed",
"14",
"--log-folder",