diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 622bba139..4ac3ce536 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -33,19 +33,18 @@ jobs:
         python -m pip install --upgrade pip
         pip install setuptools==65.5.0
         # cpu version of pytorch - faster to download
-        pip install torch==1.11+cpu -f https://download.pytorch.org/whl/torch_stable.html
-        pip install pybullet==3.1.9
+        pip install torch==1.11.0+cpu --extra-index-url https://download.pytorch.org/whl/cpu
+        pip install pybullet==3.2.5
         pip install -r requirements.txt
         # Use headless version
         pip install opencv-python-headless
-        # install parking-env to test HER (pinned so it works with gym 0.21)
-        pip install highway-env==1.5.0
+        pip install highway-env==1.7.1
         pip install -e .
     - name: Type check
       run: |
         make type
       # skip mypy type check for python3.7 (last forever for some reason)
-      if: "!(matrix.python-version == '3.7')"
+      if: "!(matrix.python-version == '3.7')"
     - name: Check codestyle
       run: |
         make check-codestyle
diff --git a/.github/workflows/trained_agents.yml b/.github/workflows/trained_agents.yml
index 3e2d6d27b..bc54afeaf 100644
--- a/.github/workflows/trained_agents.yml
+++ b/.github/workflows/trained_agents.yml
@@ -33,13 +33,12 @@ jobs:
         python -m pip install --upgrade pip
         pip install setuptools==65.5.0
         # cpu version of pytorch - faster to download
-        pip install torch==1.11+cpu -f https://download.pytorch.org/whl/torch_stable.html
-        pip install pybullet==3.1.9
+        pip install torch==1.11.0+cpu --extra-index-url https://download.pytorch.org/whl/cpu
+        pip install pybullet==3.2.5
         pip install -r requirements.txt
         # Use headless version
         pip install opencv-python-headless
-        # install parking-env to test HER (pinned so it works with gym 0.21)
-        pip install highway-env==1.5.0
+        pip install highway-env==1.7.1
         # Add support for pickle5 protocol
         # TODO: remove me when dropping python 3.7
         pip install pickle5
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
deleted file mode 100644
index 290c793ce..000000000
--- a/.gitlab-ci.yml
+++ /dev/null
@@ -1,26 +0,0 @@
-image: stablebaselines/rl-baselines3-zoo-cpu:latest
-
-# Recursive clone
-variables:
-  GIT_SUBMODULE_STRATEGY: recursive
-
-type-check:
-  script:
-    - make type
-
-pytest:
-  script:
-    # MKL_THREADING_LAYER=GNU to avoid MKL_THREADING_LAYER=INTEL incompatibility error
-    - MKL_THREADING_LAYER=GNU make pytest
-  coverage: '/^TOTAL.+?(\d+\%)$/'
-
-check-trained-agents:
-  script:
-    # MKL_THREADING_LAYER=GNU to avoid MKL_THREADING_LAYER=INTEL incompatibility error
-    - pip install pickle5 # Add support for pickle5 protocol
-    - MKL_THREADING_LAYER=GNU make check-trained-agents
-
-lint:
-  script:
-    - make check-codestyle
-    - make lint
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2f27d2db6..b84af63c1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,14 @@
+## Release 2.0.0a0 (WIP)
+
+### Breaking Changes
+- Upgraded to gym 0.26+
+- Fixed bug in HistoryWrapper, now returns the correct obs space limits
+
+### New Features
+- Gym 0.26+ patches to continue working with pybullet and TimeLimit wrapper
+
+### Bug fixes
+
 ## Release 1.8.0a2 (WIP)
 
 ### Breaking Changes
diff --git a/README.md b/README.md
index a6a4c98e3..d90e4711a 100644
--- a/README.md
+++ b/README.md
@@ -301,7 +301,7 @@ for multiple, specify a list:
 
 ```yaml
 env_wrapper:
-    - rl_zoo3.wrappers.DoneOnSuccessWrapper:
+    - rl_zoo3.wrappers.TruncatedOnSuccessWrapper:
         reward_offset: 1.0
     - sb3_contrib.common.wrappers.TimeFeatureWrapper
 ```
diff --git a/docker/Dockerfile b/docker/Dockerfile
index bd4d85967..9e4e6d767 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -21,7 +21,7 @@ RUN \
     mkdir -p ${CODE_DIR}/rl_zoo3 && \
     pip uninstall -y stable-baselines3 && \
     pip install -r /tmp/requirements.txt && \
-    pip install pip install highway-env==1.5 && \
+    pip install highway-env>=1.7.1 && \
     rm -rf $HOME/.cache/pip
 
 ENV PATH=$VENV/bin:$PATH
diff --git a/hyperparams/her.yml b/hyperparams/her.yml
index a8249f46d..1235314b9 100644
--- a/hyperparams/her.yml
+++ b/hyperparams/her.yml
@@ -59,7 +59,7 @@ FetchSlide-v1:
 FetchPickAndPlace-v1:
   env_wrapper:
     - sb3_contrib.common.wrappers.TimeFeatureWrapper
-    # - rl_zoo3.wrappers.DoneOnSuccessWrapper:
+    # - rl_zoo3.wrappers.TruncatedOnSuccessWrapper:
     #     reward_offset: 0
     #     n_successes: 4
     # - stable_baselines3.common.monitor.Monitor
diff --git a/hyperparams/ppo.yml b/hyperparams/ppo.yml
index 321ab0fdd..004feb50d 100644
--- a/hyperparams/ppo.yml
+++ b/hyperparams/ppo.yml
@@ -317,7 +317,7 @@ MiniGrid-FourRooms-v0:
   learning_rate: 2.5e-4
   clip_range: 0.2
 
-CarRacing-v0:
+CarRacing-v1:
   env_wrapper:
     - rl_zoo3.wrappers.FrameSkip:
         skip: 2
diff --git a/hyperparams/sac.yml b/hyperparams/sac.yml
index 9d41262e4..4457f50fb 100644
--- a/hyperparams/sac.yml
+++ b/hyperparams/sac.yml
@@ -161,7 +161,7 @@ MinitaurBulletDuckEnv-v0:
   learning_starts: 10000
 
 # To be tuned
-CarRacing-v0:
+CarRacing-v1:
   env_wrapper:
     - rl_zoo3.wrappers.FrameSkip:
         skip: 2
diff --git a/requirements.txt b/requirements.txt
index 5bb2a0460..dc8db9fb9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,20 +1,25 @@
-gym==0.21
-stable-baselines3[extra,tests,docs]>=1.8.0a2
-sb3-contrib>=1.8.0a2
+gym==0.26.2
+# stable-baselines3[extra,tests,docs]>=1.7.0
+git+https://github.com/carlosluis/stable-baselines3@fix_tests#egg=stable_baselines3[extra,tests,docs]
+# sb3-contrib>=1.7.0
+git+https://github.com/Stable-Baselines-Team/stable-baselines3-contrib@feat/new-gym-version
 box2d-py==2.3.8
 pybullet
 gym-minigrid
-scikit-optimize
-optuna
+# scikit-optimize
+optuna~=2.10.1
 pytablewriter~=0.64
 pyyaml>=5.1
 cloudpickle>=1.5.0
 plotly
-panda-gym==1.1.1  # tmp fix: until compatibility with panda-gym v2
+# need to upgrade to gymnasium:
+# panda-gym~=3.0.1
 rliable>=1.0.5
 wandb
+ale-py~=0.8.0
 huggingface_sb3>=2.2.1, <3.*
-seaborn
+seaborn~=0.11.2
 tqdm
 rich
 importlib-metadata~=4.13 # flake8 not compatible with importlib-metadata>5.0
+moviepy
diff --git a/rl_zoo3/__init__.py b/rl_zoo3/__init__.py
index 4d91fdf5a..b88384fb5 100644
--- a/rl_zoo3/__init__.py
+++ b/rl_zoo3/__init__.py
@@ -1,5 +1,12 @@
 import os
 
+# Important: import gym patches before everything
+# isort: off
+
+import rl_zoo3.gym_patches  # noqa: F401
+
+# isort: on
+
 from rl_zoo3.utils import (
     ALGOS,
     create_test_env,
diff --git a/rl_zoo3/enjoy.py b/rl_zoo3/enjoy.py
index 82471d764..d6c3964be 100644
--- a/rl_zoo3/enjoy.py
+++ b/rl_zoo3/enjoy.py
@@ -188,8 +188,10 @@ def enjoy() -> None:  # noqa: C901
             "clip_range": lambda _: 0.0,
         }
 
-    model = ALGOS[algo].load(model_path, env=env, custom_objects=custom_objects, device=args.device, **kwargs)
+    if "HerReplayBuffer" in hyperparams.get("replay_buffer_class", ""):
+        kwargs["env"] = env
 
+    model = ALGOS[algo].load(model_path, custom_objects=custom_objects, device=args.device, **kwargs)
     obs = env.reset()
 
     # Deterministic by default except for atari games
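In the `rl_zoo3/enjoy.py` hunk above, the environment is now passed to `load()` only when the saved model uses `HerReplayBuffer`: a HER model cannot be loaded without an environment (its replay buffer relies on `env.compute_reward()`), while other agents only need the env later, at prediction time. A minimal sketch of that pattern, assuming highway-env is installed and a trained HER agent exists at a hypothetical path:

```python
# Sketch only: the model path is hypothetical, highway-env and SB3 are assumed installed.
import highway_env  # noqa: F401  # registers the goal-conditioned parking-v0 env
from stable_baselines3 import SAC
from stable_baselines3.common.env_util import make_vec_env

env = make_vec_env("parking-v0")  # stand-in for the VecEnv built by create_test_env()
hyperparams = {"replay_buffer_class": "HerReplayBuffer"}  # e.g. read from the saved config

kwargs = {}
if "HerReplayBuffer" in hyperparams.get("replay_buffer_class", ""):
    # HER needs the env at load time so the replay buffer can call env.compute_reward()
    kwargs["env"] = env

model = SAC.load("path/to/her_model.zip", device="cpu", **kwargs)
```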
diff --git a/rl_zoo3/exp_manager.py b/rl_zoo3/exp_manager.py
index 4503121b0..a9bb9a1b9 100644
--- a/rl_zoo3/exp_manager.py
+++ b/rl_zoo3/exp_manager.py
@@ -513,22 +513,22 @@ def create_callbacks(self):
 
     @staticmethod
     def is_atari(env_id: str) -> bool:
-        entry_point = gym.envs.registry.env_specs[env_id].entry_point  # pytype: disable=module-attr
+        entry_point = gym.envs.registry[env_id].entry_point  # pytype: disable=module-attr
         return "AtariEnv" in str(entry_point)
 
     @staticmethod
     def is_bullet(env_id: str) -> bool:
-        entry_point = gym.envs.registry.env_specs[env_id].entry_point  # pytype: disable=module-attr
+        entry_point = gym.envs.registry[env_id].entry_point  # pytype: disable=module-attr
         return "pybullet_envs" in str(entry_point)
 
     @staticmethod
     def is_robotics_env(env_id: str) -> bool:
-        entry_point = gym.envs.registry.env_specs[env_id].entry_point  # pytype: disable=module-attr
+        entry_point = gym.envs.registry[env_id].entry_point  # pytype: disable=module-attr
         return "gym.envs.robotics" in str(entry_point) or "panda_gym.envs" in str(entry_point)
 
     @staticmethod
     def is_panda_gym(env_id: str) -> bool:
-        entry_point = gym.envs.registry.env_specs[env_id].entry_point  # pytype: disable=module-attr
+        entry_point = gym.envs.registry[env_id].entry_point  # pytype: disable=module-attr
         return "panda_gym.envs" in str(entry_point)
 
     def _maybe_normalize(self, env: VecEnv, eval_env: bool) -> VecEnv:
@@ -595,6 +595,10 @@ def create_envs(self, n_envs: int, eval_env: bool = False, no_log: bool = False)
         # See https://github.com/HumanCompatibleAI/imitation/pull/160
         spec = gym.spec(self.env_name.gym_id)
 
+        # Make Pybullet compatible with gym 0.26
+        if self.is_bullet(self.env_name.gym_id):
+            self.env_kwargs.update(dict(apply_api_compatibility=True))
+
         def make_env(**kwargs) -> gym.Env:
             env = spec.make(**kwargs)
             return env
diff --git a/rl_zoo3/gym_patches.py b/rl_zoo3/gym_patches.py
new file mode 100644
index 000000000..8aca37f68
--- /dev/null
+++ b/rl_zoo3/gym_patches.py
@@ -0,0 +1,58 @@
+"""
+Patches for gym 0.26+ so RL Zoo3 keeps working as before
+(notably TimeLimit wrapper and Pybullet envs)
+"""
+from typing import Any, Dict
+
+import numpy as np
+
+# Deprecation warning with gym 0.26 and numpy 1.24
+np.bool8 = np.bool_  # type: ignore[attr-defined]
+
+import gym  # noqa: E402
+
+
+class PatchedRegistry(dict):
+    """
+    gym.envs.registration.registry
+    is now a dictionary and no longer an EnvRegistry() object.
+    """
+
+    @property
+    def env_specs(self) -> Dict[str, Any]:
+        return self
+
+
+class PatchedTimeLimit(gym.wrappers.TimeLimit):
+    """
+    See https://github.com/openai/gym/issues/3102
+    and https://github.com/Farama-Foundation/Gymnasium/pull/101:
+    keep the behavior as before and provide additional info
+    that the episode reached a timeout, but only
+    when the episode is over because of that.
+    """
+
+    def step(self, action):
+        observation, reward, terminated, truncated, info = self.env.step(action)
+        self._elapsed_steps += 1
+
+        if self._elapsed_steps >= self._max_episode_steps:
+            done = truncated or terminated
+            # TimeLimit.truncated key may have been already set by the environment
+            # do not overwrite it
+            # only set it when the episode is not over for other reasons
+            episode_truncated = not done or info.get("TimeLimit.truncated", False)
+            info["TimeLimit.truncated"] = episode_truncated
+            # truncated may have been set by the env too
+            truncated = truncated or episode_truncated
+
+        return observation, reward, terminated, truncated, info
+
+
+patched_registry = PatchedRegistry()
+patched_registry.update(gym.envs.registration.registry)
+gym.envs.registry = patched_registry
+gym.envs.registration.registry = patched_registry
+gym.wrappers.TimeLimit = PatchedTimeLimit  # type: ignore[misc]
+gym.wrappers.time_limit.TimeLimit = PatchedTimeLimit  # type: ignore[misc]
+gym.envs.registration.TimeLimit = PatchedTimeLimit  # type: ignore[misc]
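The new `rl_zoo3/gym_patches.py` module above works purely by side effect at import time, which is why `rl_zoo3/__init__.py` (earlier in this diff) imports it before everything else. A small sketch of what the patches preserve, assuming gym 0.26+ and rl_zoo3 are installed; the checks are illustrative and not part of the zoo:

```python
# Illustrative usage of the patches above (assumes gym 0.26+ and rl_zoo3 are installed).
import gym

import rl_zoo3.gym_patches  # noqa: F401  # applies the patches at import time

# Old-style registry access keeps working: the patched registry is a dict
# that also exposes itself through the `env_specs` property.
assert "CartPole-v1" in gym.envs.registry.env_specs

env = gym.make("CartPole-v1")  # wrapped by the patched TimeLimit
obs, info = env.reset(seed=0)
terminated = truncated = False
while not (terminated or truncated):
    obs, reward, terminated, truncated, info = env.step(env.action_space.sample())

# With the patched TimeLimit, info["TimeLimit.truncated"] is only set to True
# when the episode ended because of the time limit, not when it terminated on its own.
print(terminated, truncated, info.get("TimeLimit.truncated", False))
```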
diff --git a/rl_zoo3/import_envs.py b/rl_zoo3/import_envs.py
index f918e2e0c..36b3af06f 100644
--- a/rl_zoo3/import_envs.py
+++ b/rl_zoo3/import_envs.py
@@ -1,3 +1,5 @@
+from typing import Optional
+
 import gym
 from gym.envs.registration import register
 
@@ -51,8 +53,8 @@
 
 
 # Register no vel envs
 def create_no_vel_env(env_id: str):
-    def make_env():
-        env = gym.make(env_id)
+    def make_env(render_mode: Optional[str] = None):
+        env = gym.make(env_id, render_mode=render_mode)
         env = MaskVelocityWrapper(env)
         return env
diff --git a/rl_zoo3/record_video.py b/rl_zoo3/record_video.py
index 90528738b..632093805 100644
--- a/rl_zoo3/record_video.py
+++ b/rl_zoo3/record_video.py
@@ -89,6 +89,9 @@
     if args.env_kwargs is not None:
         env_kwargs.update(args.env_kwargs)
 
+    # Force rgb_array rendering (gym 0.26+)
+    env_kwargs.update(render_mode="rgb_array")
+
     env = create_test_env(
         env_name.gym_id,
         n_envs=n_envs,
@@ -133,6 +136,12 @@
     if video_folder is None:
         video_folder = os.path.join(log_path, "videos")
 
+    if is_atari:
+        # Patch Atari for rendering
+        # see https://github.com/mgbellemare/Arcade-Learning-Environment/issues/473
+        env.unwrapped.render_mode = env_kwargs.get("render_mode")
+        env.render_mode = env_kwargs.get("render_mode")
+    # Note: apparently it renders by default
     env = VecVideoRecorder(
         env,
@@ -153,10 +162,10 @@
             episode_start=episode_starts,
             deterministic=deterministic,
         )
-        obs, _, dones, _ = env.step(action)  # type: ignore[assignment]
-        episode_starts = dones
         if not args.no_render:
             env.render()
+        obs, _, dones, _ = env.step(action)  # type: ignore[assignment]
+        episode_starts = dones
 except KeyboardInterrupt:
     pass
diff --git a/rl_zoo3/train.py b/rl_zoo3/train.py
index 1e52a5fc0..55149b7f2 100644
--- a/rl_zoo3/train.py
+++ b/rl_zoo3/train.py
@@ -164,7 +164,7 @@ def train() -> None:
         importlib.import_module(env_module)
 
     env_id = args.env
-    registered_envs = set(gym.envs.registry.env_specs.keys())  # pytype: disable=module-attr
+    registered_envs = set(gym.envs.registry.keys())  # pytype: disable=module-attr
 
     if args.yaml_file is not None:
         raise ValueError(
diff --git a/rl_zoo3/utils.py b/rl_zoo3/utils.py
index f23265883..d43dc8f05 100644
--- a/rl_zoo3/utils.py
+++ b/rl_zoo3/utils.py
@@ -2,6 +2,7 @@
 import glob
 import importlib
 import os
+from copy import deepcopy
 from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
 
 import gym
@@ -39,12 +40,8 @@
 
 
 def flatten_dict_observations(env: gym.Env) -> gym.Env:
-    assert isinstance(env.observation_space, spaces.Dict)
-    try:
-        return gym.wrappers.FlattenObservation(env)
-    except AttributeError:
-        keys = env.observation_space.spaces.keys()
-        return gym.wrappers.FlattenDictWrapper(env, dict_keys=list(keys))
+    assert isinstance(env.observation_space, gym.spaces.Dict)
+    return gym.wrappers.FlattenObservation(env)
 
 
 def get_wrapper_class(hyperparams: Dict[str, Any], key: str = "env_wrapper") -> Optional[Callable[[gym.Env], gym.Env]]:
@@ -234,12 +231,14 @@ def create_test_env(
         vec_env_cls = SubprocVecEnv  # type: ignore[assignment]
         # start_method = 'spawn' for thread safe
 
-    # panda-gym is based on pybullet, whose rendering requires to be configure at initialization
-    if ExperimentManager.is_panda_gym(env_id) and should_render:
-        if env_kwargs is None:
-            env_kwargs = {"render": True}
-        else:
-            env_kwargs["render"] = True
+    # Fix for gym 0.26, to keep old behavior
+    env_kwargs = env_kwargs or {}
+    env_kwargs = deepcopy(env_kwargs)
+    if "render_mode" not in env_kwargs and should_render:
+        env_kwargs.update(render_mode="human")
+
+    if ExperimentManager.is_bullet(env_id):
+        env_kwargs.update(apply_api_compatibility=True)
 
     env = make_vec_env(
         env_id,
@@ -255,7 +254,7 @@ def create_test_env(
     if "vec_env_wrapper" in hyperparams.keys():
         vec_env_wrapper = get_wrapper_class(hyperparams, "vec_env_wrapper")
         assert vec_env_wrapper is not None
-        env = vec_env_wrapper(env)
+        env = vec_env_wrapper(env)  # type: ignore[assignment, arg-type]
         del hyperparams["vec_env_wrapper"]
 
     # Load saved stats for normalizing input and rewards
diff --git a/rl_zoo3/version.txt b/rl_zoo3/version.txt
index c3d22c01c..35a785a76 100644
--- a/rl_zoo3/version.txt
+++ b/rl_zoo3/version.txt
@@ -1 +1 @@
-1.8.0a2
+2.0.0a0
diff --git a/rl_zoo3/wrappers.py b/rl_zoo3/wrappers.py
index e94e51a70..e3763cbaf 100644
--- a/rl_zoo3/wrappers.py
+++ b/rl_zoo3/wrappers.py
@@ -1,10 +1,13 @@
+from typing import Dict, Optional, Tuple
+
 import gym
 import numpy as np
 from gym import spaces
 from sb3_contrib.common.wrappers import TimeFeatureWrapper  # noqa: F401 (backward compatibility)
+from stable_baselines3.common.type_aliases import Gym26ResetReturn, Gym26StepReturn
 
 
-class DoneOnSuccessWrapper(gym.Wrapper):
+class TruncatedOnSuccessWrapper(gym.Wrapper):
     """
     Reset on success and offsets the reward.
     Useful for GoalEnv.
@@ -16,20 +19,21 @@ def __init__(self, env: gym.Env, reward_offset: float = 0.0, n_successes: int =
         self.n_successes = n_successes
         self.current_successes = 0
 
-    def reset(self):
+    def reset(self, seed: Optional[int] = None, options: Optional[dict] = None) -> Gym26ResetReturn:
         self.current_successes = 0
-        return self.env.reset()
+        assert options is None, "Options not supported for now"
+        return self.env.reset(seed=seed)
 
-    def step(self, action):
-        obs, reward, done, info = self.env.step(action)
+    def step(self, action) -> Gym26StepReturn:
+        obs, reward, terminated, truncated, info = self.env.step(action)
         if info.get("is_success", False):
             self.current_successes += 1
         else:
             self.current_successes = 0
         # number of successes in a row
-        done = done or self.current_successes >= self.n_successes
+        truncated = truncated or self.current_successes >= self.n_successes
         reward += self.reward_offset
-        return obs, reward, done, info
+        return obs, reward, terminated, truncated, info
 
     def compute_reward(self, achieved_goal, desired_goal, info):
         reward = self.env.compute_reward(achieved_goal, desired_goal, info)
@@ -49,9 +53,9 @@ def __init__(self, env: gym.Env, noise_std: float = 0.1):
         super().__init__(env)
         self.noise_std = noise_std
 
-    def step(self, action):
+    def step(self, action) -> Gym26StepReturn:
         noise = np.random.normal(np.zeros_like(action), np.ones_like(action) * self.noise_std)
-        noisy_action = action + noise
+        noisy_action = np.clip(action + noise, self.action_space.low, self.action_space.high)
         return self.env.step(noisy_action)
 
 
@@ -72,11 +76,12 @@ def __init__(self, env: gym.Env, smoothing_coef: float = 0.0):
         # self.alpha = self.smoothing_coef
         # self.beta = np.sqrt(1 - self.alpha ** 2) / (1 - self.alpha)
 
-    def reset(self):
+    def reset(self, seed: Optional[int] = None, options: Optional[dict] = None) -> Gym26ResetReturn:
         self.smoothed_action = None
-        return self.env.reset()
+        assert options is None, "Options not supported for now"
+        return self.env.reset(seed=seed)
 
-    def step(self, action):
+    def step(self, action) -> Gym26StepReturn:
         if self.smoothed_action is None:
             self.smoothed_action = np.zeros_like(action)
         self.smoothed_action = self.smoothing_coef * self.smoothed_action + (1 - self.smoothing_coef) * action
@@ -98,23 +103,24 @@ def __init__(self, env: gym.Env, delay: int = 10):
         self.current_step = 0
         self.accumulated_reward = 0.0
 
-    def reset(self):
+    def reset(self, seed: Optional[int] = None, options: Optional[dict] = None) -> Gym26ResetReturn:
         self.current_step = 0
         self.accumulated_reward = 0.0
-        return self.env.reset()
+        assert options is None, "Options not supported for now"
+        return self.env.reset(seed=seed)
 
-    def step(self, action):
-        obs, reward, done, info = self.env.step(action)
+    def step(self, action) -> Gym26StepReturn:
+        obs, reward, terminated, truncated, info = self.env.step(action)
         self.accumulated_reward += reward
         self.current_step += 1
 
-        if self.current_step % self.delay == 0 or done:
+        if self.current_step % self.delay == 0 or terminated or truncated:
             reward = self.accumulated_reward
             self.accumulated_reward = 0.0
         else:
             reward = 0.0
-        return obs, reward, done, info
+        return obs, reward, terminated, truncated, info
 
 
 class HistoryWrapper(gym.Wrapper):
@@ -131,12 +137,11 @@ def __init__(self, env: gym.Env, horizon: int = 2):
         wrapped_obs_space = env.observation_space
         wrapped_action_space = env.action_space
 
-        # TODO: double check, it seems wrong when we have different low and highs
-        low_obs = np.repeat(wrapped_obs_space.low, horizon, axis=-1)
-        high_obs = np.repeat(wrapped_obs_space.high, horizon, axis=-1)
+        low_obs = np.tile(wrapped_obs_space.low, horizon)
+        high_obs = np.tile(wrapped_obs_space.high, horizon)
 
-        low_action = np.repeat(wrapped_action_space.low, horizon, axis=-1)
-        high_action = np.repeat(wrapped_action_space.high, horizon, axis=-1)
+        low_action = np.tile(wrapped_action_space.low, horizon)
+        high_action = np.tile(wrapped_action_space.high, horizon)
 
         low = np.concatenate((low_obs, low_action))
         high = np.concatenate((high_obs, high_action))
@@ -153,19 +158,20 @@ def __init__(self, env: gym.Env, horizon: int = 2):
         self.obs_history = np.zeros(low_obs.shape, low_obs.dtype)
         self.action_history = np.zeros(low_action.shape, low_action.dtype)
 
-    def _create_obs_from_history(self):
+    def _create_obs_from_history(self) -> np.ndarray:
         return np.concatenate((self.obs_history, self.action_history))
 
-    def reset(self):
+    def reset(self, seed: Optional[int] = None, options: Optional[dict] = None) -> Tuple[np.ndarray, Dict]:
         # Flush the history
         self.obs_history[...] = 0
         self.action_history[...] = 0
-        obs = self.env.reset()
+        assert options is None, "Options not supported for now"
+        obs, info = self.env.reset(seed=seed)
         self.obs_history[..., -obs.shape[-1] :] = obs
-        return self._create_obs_from_history()
+        return self._create_obs_from_history(), info
 
-    def step(self, action):
-        obs, reward, done, info = self.env.step(action)
+    def step(self, action) -> Tuple[np.ndarray, float, bool, bool, Dict]:
+        obs, reward, terminated, truncated, info = self.env.step(action)
         last_ax_size = obs.shape[-1]
 
         self.obs_history = np.roll(self.obs_history, shift=-last_ax_size, axis=-1)
@@ -173,7 +179,7 @@ def step(self, action):
         self.action_history = np.roll(self.action_history, shift=-action.shape[-1], axis=-1)
         self.action_history[..., -action.shape[-1] :] = action
 
-        return self._create_obs_from_history(), reward, done, info
+        return self._create_obs_from_history(), reward, terminated, truncated, info
 
 
 class HistoryWrapperObsDict(gym.Wrapper):
@@ -190,12 +196,11 @@ def __init__(self, env: gym.Env, horizon: int = 2):
         wrapped_obs_space = env.observation_space.spaces["observation"]
         wrapped_action_space = env.action_space
 
-        # TODO: double check, it seems wrong when we have different low and highs
-        low_obs = np.repeat(wrapped_obs_space.low, horizon, axis=-1)
-        high_obs = np.repeat(wrapped_obs_space.high, horizon, axis=-1)
+        low_obs = np.tile(wrapped_obs_space.low, horizon)
+        high_obs = np.tile(wrapped_obs_space.high, horizon)
 
-        low_action = np.repeat(wrapped_action_space.low, horizon, axis=-1)
-        high_action = np.repeat(wrapped_action_space.high, horizon, axis=-1)
+        low_action = np.tile(wrapped_action_space.low, horizon)
+        high_action = np.tile(wrapped_action_space.high, horizon)
 
         low = np.concatenate((low_obs, low_action))
         high = np.concatenate((high_obs, high_action))
@@ -212,23 +217,24 @@ def __init__(self, env: gym.Env, horizon: int = 2):
         self.obs_history = np.zeros(low_obs.shape, low_obs.dtype)
         self.action_history = np.zeros(low_action.shape, low_action.dtype)
 
-    def _create_obs_from_history(self):
+    def _create_obs_from_history(self) -> np.ndarray:
         return np.concatenate((self.obs_history, self.action_history))
 
-    def reset(self):
+    def reset(self, seed: Optional[int] = None, options: Optional[dict] = None) -> Tuple[Dict[str, np.ndarray], Dict]:
         # Flush the history
         self.obs_history[...] = 0
         self.action_history[...] = 0
-        obs_dict = self.env.reset()
+        assert options is None, "Options not supported for now"
+        obs_dict, info = self.env.reset(seed=seed)
         obs = obs_dict["observation"]
         self.obs_history[..., -obs.shape[-1] :] = obs
 
         obs_dict["observation"] = self._create_obs_from_history()
 
-        return obs_dict
+        return obs_dict, info
 
-    def step(self, action):
-        obs_dict, reward, done, info = self.env.step(action)
+    def step(self, action) -> Tuple[Dict[str, np.ndarray], float, bool, bool, Dict]:
+        obs_dict, reward, terminated, truncated, info = self.env.step(action)
         obs = obs_dict["observation"]
         last_ax_size = obs.shape[-1]
@@ -240,7 +246,7 @@ def step(self, action):
 
         obs_dict["observation"] = self._create_obs_from_history()
 
-        return obs_dict, reward, done, info
+        return obs_dict, reward, terminated, truncated, info
 
 
 class FrameSkip(gym.Wrapper):
@@ -255,26 +261,22 @@ def __init__(self, env: gym.Env, skip: int = 4):
         super().__init__(env)
         self._skip = skip
 
-    def step(self, action: np.ndarray):
+    def step(self, action) -> Gym26StepReturn:
         """
         Step the environment with the given action
         Repeat action, sum reward.
 
         :param action: the action
-        :return: observation, reward, done, information
+        :return: observation, reward, terminated, truncated, information
         """
         total_reward = 0.0
-        done = None
         for _ in range(self._skip):
-            obs, reward, done, info = self.env.step(action)
+            obs, reward, terminated, truncated, info = self.env.step(action)
             total_reward += reward
-            if done:
+            if terminated or truncated:
                 break
 
-        return obs, total_reward, done, info
-
-    def reset(self):
-        return self.env.reset()
+        return obs, total_reward, terminated, truncated, info
 
 
 class MaskVelocityWrapper(gym.ObservationWrapper):
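The switch from `np.repeat` to `np.tile` in `HistoryWrapper` and `HistoryWrapperObsDict` above is the observation-space fix listed in the changelog at the top of this diff: the wrapper concatenates whole observations along the last axis, so the bounds of the stacked space must be whole copies of `low`/`high`, not element-wise repeats. A tiny illustration with hypothetical per-dimension bounds:

```python
# Hypothetical bounds, only to show the layout difference between repeat and tile.
import numpy as np

low = np.array([-1.0, 0.0])  # per-dimension lower bounds of the wrapped env
horizon = 2

print(np.repeat(low, horizon))  # [-1. -1.  0.  0.] -> elements repeated in place (old, wrong layout)
print(np.tile(low, horizon))    # [-1.  0. -1.  0.] -> one full copy per history step, matching the concatenation
```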
"gym_minigrid"] @@ -92,7 +98,8 @@ def test_record_video(tmp_path): args = ["-n", "100", "--algo", "sac", "--env", "Pendulum-v1", "-o", str(tmp_path)] # Skip if no X-Server - pytest.importorskip("pyglet.gl") + if not os.environ.get("DISPLAY"): + pytest.skip("No X-Server") return_code = subprocess.call(["python", "-m", "rl_zoo3.record_video"] + args) _assert_eq(return_code, 0) @@ -130,7 +137,8 @@ def test_record_training(tmp_path): ] # Skip if no X-Server - pytest.importorskip("pyglet.gl") + if not os.environ.get("DISPLAY"): + pytest.skip("No X-Server") return_code = subprocess.call(["python", "train.py"] + args_training) _assert_eq(return_code, 0) diff --git a/tests/test_hyperparams_opt.py b/tests/test_hyperparams_opt.py index 89396a1f1..9c6bf9a61 100644 --- a/tests/test_hyperparams_opt.py +++ b/tests/test_hyperparams_opt.py @@ -47,6 +47,7 @@ def test_optimize(tmp_path, sampler, pruner, experiment): args = ["-n", str(N_STEPS), "--algo", algo, "--env", env_id, "-params", 'policy_kwargs:"dict(net_arch=[32])"', "n_envs:1"] args += ["n_steps:10"] if algo == "ppo" else [] args += [ + "--no-optim-plots", "--seed", "14", "--log-folder", diff --git a/tests/test_wrappers.py b/tests/test_wrappers.py index c494a0076..bf8cf0325 100644 --- a/tests/test_wrappers.py +++ b/tests/test_wrappers.py @@ -1,16 +1,16 @@ import gym -import pybullet_envs # noqa: F401 import pytest from stable_baselines3 import A2C from stable_baselines3.common.env_checker import check_env from stable_baselines3.common.env_util import DummyVecEnv +import rl_zoo3.import_envs # noqa: F401 from rl_zoo3.utils import get_wrapper_class from rl_zoo3.wrappers import ActionNoiseWrapper, DelayedRewardWrapper, HistoryWrapper, TimeFeatureWrapper def test_wrappers(): - env = gym.make("AntBulletEnv-v0") + env = gym.make("AntBulletEnv-v0", apply_api_compatibility=True) env = DelayedRewardWrapper(env) env = ActionNoiseWrapper(env) env = HistoryWrapper(env) @@ -27,7 +27,7 @@ def test_wrappers(): ], ) def test_get_wrapper(env_wrapper): - env = gym.make("AntBulletEnv-v0") + env = gym.make("AntBulletEnv-v0", apply_api_compatibility=True) hyperparams = {"env_wrapper": env_wrapper} wrapper_class = get_wrapper_class(hyperparams) if env_wrapper is not None: @@ -44,7 +44,8 @@ def test_get_wrapper(env_wrapper): ], ) def test_get_vec_env_wrapper(vec_env_wrapper): - env = DummyVecEnv([lambda: gym.make("AntBulletEnv-v0")]) + env = gym.make("AntBulletEnv-v0", apply_api_compatibility=True) + env = DummyVecEnv([lambda: env]) hyperparams = {"vec_env_wrapper": vec_env_wrapper} wrapper_class = get_wrapper_class(hyperparams, "vec_env_wrapper") if wrapper_class is not None: