Commit 16fa142

Merge pull request #541 from yandexdataschool/spring_2024_week_04_approx_rl
spring_2024_week_04_approx_rl: a major update
2 parents 7aa7069 + 71a5612 commit 16fa142

16 files changed: +2312 / -1723 lines

week04_approx_rl/dqn/__init__.py

Whitespace-only changes.

week04_approx_rl/dqn/analysis.py

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
+from __future__ import annotations
+
+from collections.abc import Reversible
+import numpy as np
+
+
+def play_and_log_episode(env, agent, t_max=10000):
+    """
+    Plays an episode using the greedy policy and logs for each timestep:
+    - state
+    - qvalues (estimated by the agent)
+    - actions
+    - rewards
+
+    Also logs:
+    - the final (usually terminal) state
+    - whether the episode was terminated
+
+    Uses the greedy policy.
+    """
+    assert t_max > 0, t_max
+
+    states = []
+    qvalues_all = []
+    actions = []
+    rewards = []
+
+    s, _ = env.reset()
+    for step in range(t_max):
+        s = np.array(s)
+        states.append(s)
+        qvalues = agent.get_qvalues(s[None])[0]
+        qvalues_all.append(qvalues)
+        action = np.argmax(qvalues)
+        actions.append(action)
+        s, r, terminated, truncated, _ = env.step(action)
+        rewards.append(r)
+        if terminated or truncated:
+            break
+    states.append(s)  # the last state
+
+    return_pack = {
+        "states": np.array(states),
+        "qvalues": np.array(qvalues_all),
+        "actions": np.array(actions),
+        "rewards": np.array(rewards),
+        "episode_finished": terminated,
+    }
+
+    return return_pack
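For orientation, a minimal usage sketch of the new helper. `make_env` and `my_agent` are placeholders for whatever the week's notebook constructs; only `play_and_log_episode` itself comes from this commit.

    import numpy as np

    env = make_env()    # hypothetical factory for the gymnasium env used in the notebook
    agent = my_agent    # hypothetical object exposing get_qvalues(batch_of_states) -> (batch, n_actions)

    log = play_and_log_episode(env, agent, t_max=10_000)
    print("episode return:", log["rewards"].sum())
    print("terminated:", log["episode_finished"])

    # "states" holds one extra entry (the final state), so states[:-1] aligns
    # with qvalues, actions and rewards timestep by timestep.
    assert len(log["states"]) == len(log["actions"]) + 1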
Lines changed: 3 additions & 59 deletions
@@ -1,53 +1,13 @@
 # taken from stable_baselines3.

-import numpy as np
-from gymnasium import Wrapper, RewardWrapper, ObservationWrapper
-from gymnasium.spaces import Box
-
-
-class MaxAndSkipEnv(Wrapper):
-    def __init__(self, env, skip=4):
-        """Return only every `skip`-th frame"""
-        super().__init__(env)
-        # most recent raw observations (for max pooling across time steps)
-        self._obs_buffer = np.zeros(
-            (2,) + env.observation_space.shape, dtype=np.uint8)
-        self._skip = skip
-
-    def step(self, action):
-        """Repeat action, sum reward, and max over last observations."""
-        total_reward = 0.0
-        terminated = truncated = False
-        for i in range(self._skip):
-            obs, reward, terminated, truncated, info = self.env.step(action)
-            if i == self._skip - 2:
-                self._obs_buffer[0] = obs
-            if i == self._skip - 1:
-                self._obs_buffer[1] = obs
-            total_reward += reward
-            if terminated or truncated:
-                break
-        # Note that the observation on the terminated=True frame
-        # doesn't matter
-        max_frame = self._obs_buffer.max(axis=0)
-
-        return max_frame, total_reward, terminated, truncated, info
-
-
-class ClipRewardEnv(RewardWrapper):
-    def __init__(self, env):
-        super().__init__(env)
-
-    def reward(self, reward):
-        """Bin reward to {+1, 0, -1} by its sign."""
-        return np.sign(reward)
+from gymnasium import Wrapper


 class FireResetEnv(Wrapper):
     def __init__(self, env):
         """Take action on reset for environments that are fixed until firing."""
         super().__init__(env)
-        assert env.unwrapped.get_action_meanings()[1] == 'FIRE'
+        assert env.unwrapped.get_action_meanings()[1] == "FIRE"
         assert len(env.unwrapped.get_action_meanings()) >= 3

     def reset(self, **kwargs):
@@ -94,27 +54,11 @@ def reset(self, **kwargs):
         else:
             # no-op step to advance from terminal/lost life state
             obs, _, terminated, truncated, info = self.env.step(0)
-
+
             # The no-op step can lead to a game over, so we need to check it again
             # to see if we should reset the environment and avoid the
             # monitor.py `RuntimeError: Tried to step environment that needs reset`
             if terminated or truncated:
                 obs, info = self.env.reset(**kwargs)
         self.lives = self.env.unwrapped.ale.lives()
         return obs, info
-
-
-# in torch imgs have shape [c, h, w] instead of common [h, w, c]
-class AntiTorchWrapper(ObservationWrapper):
-    def __init__(self, env):
-        super().__init__(env)
-
-        self.img_size = [env.observation_space.shape[i]
-                         for i in [1, 2, 0]
-                         ]
-        self.observation_space = Box(0.0, 1.0, self.img_size)
-
-    def observation(self, img):
-        """what happens to each observation"""
-        img = img.transpose(1, 2, 0)
-        return img
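For context, a minimal sketch of how FireResetEnv is typically applied. The game choice is an assumption, not something named in this diff; any ALE game whose action 1 is FIRE works.

    import gymnasium as gym

    # assumes ale-py is installed so that the ALE/* environments are registered
    env = gym.make("ALE/Breakout-v5")   # hypothetical example game
    env = FireResetEnv(env)

    obs, info = env.reset()  # the wrapper presses FIRE so the episode starts immediately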

week04_approx_rl/replay_buffer.py renamed to week04_approx_rl/dqn/replay_buffer.py

Lines changed: 51 additions & 5 deletions
@@ -44,7 +44,7 @@ def _encode_sample(self, idxes):
             np.array(actions),
             np.array(rewards),
             np.array(obses_tp1),
-            np.array(dones)
+            np.array(dones),
         )

     def sample(self, batch_size):
@@ -67,8 +67,54 @@ def sample(self, batch_size):
             done_mask[i] = 1 if executing act_batch[i] resulted in
                 the end of an episode and 0 otherwise.
         """
-        idxes = [
-            random.randint(0, len(self._storage) - 1)
-            for _ in range(batch_size)
-        ]
+        idxes = [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)]
         return self._encode_sample(idxes)
+
+
+class LazyFramesVectorReplayBuffer(ReplayBuffer):
+    """
+    ReplayBuffer for vectorized environments which are wrapped into FrameBuffers.
+
+    If an environment is first wrapped into a FrameBuffer and then vectorized,
+    the resulting VecEnv will not use LazyFrames but plain np.ndarrays,
+    which greatly increases the buffer's RAM consumption.
+
+    Instead, we first vectorize the environment and only then wrap it into FrameBuffers.
+    It's not as convenient, but it keeps the memory advantage of LazyFrames.
+
+    So:
+    observations and next_observations are stored as LazyFrames
+        of shape (n_frames, n_envs, ...);
+    actions, rewards and dones are stored as np.ndarrays of shape (n_envs,).
+
+    """
+
+    # (n_frames, n_envs, *)
+
+    def _encode_sample(self, idxes):
+        """
+        For each index in idxes, samples a (s, a, r, s', done) transition
+        from a randomly chosen environment of the corresponding VecEnv.
+        """
+        obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], []
+        for i in idxes:
+            data = self._storage[i]
+            obs_t, action, reward, obs_tp1, done = data
+            n_envs = action.shape[0]
+            env_idx_chosen_for_sample = random.randint(0, n_envs - 1)
+            obses_t.append(
+                np.array(obs_t, copy=False)[:, env_idx_chosen_for_sample],
+            )
+            actions.append(np.array(action, copy=False)[env_idx_chosen_for_sample])
+            rewards.append(reward[env_idx_chosen_for_sample])
+            obses_tp1.append(
+                np.array(obs_tp1, copy=False)[:, env_idx_chosen_for_sample],
+            )
+            dones.append(done[env_idx_chosen_for_sample])
+        return (
+            np.array(obses_t),
+            np.array(actions),
+            np.array(rewards),
+            np.array(obses_tp1),
+            np.array(dones),
+        )
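To make the expected storage layout concrete, here is a small illustrative sketch; plain np.ndarrays stand in for LazyFrames, and all shapes below are examples rather than values from this commit.

    import random
    import numpy as np

    n_frames, n_envs, h, w = 4, 8, 84, 84

    # one stored transition: observations are stacked over frames and batched over envs
    obs_t   = np.zeros((n_frames, n_envs, h, w), dtype=np.uint8)  # would be a LazyFrames object
    obs_tp1 = np.zeros((n_frames, n_envs, h, w), dtype=np.uint8)
    action  = np.zeros(n_envs, dtype=np.int64)
    reward  = np.zeros(n_envs, dtype=np.float32)
    done    = np.zeros(n_envs, dtype=bool)

    # _encode_sample picks one environment per sampled transition:
    env_idx = random.randint(0, n_envs - 1)
    single_obs = np.array(obs_t, copy=False)[:, env_idx]
    assert single_obs.shape == (n_frames, h, w)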

week04_approx_rl/dqn/utils.py

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+import psutil  # type: ignore
+
+
+def is_enough_ram(min_available_gb=0.1):
+    mem = psutil.virtual_memory()
+    return mem.available >= min_available_gb * (1024**3)
+
+
+def linear_decay(
+    init_val: float, final_val: float, cur_step: int, total_steps: int
+) -> float:
+    if cur_step >= total_steps:
+        return final_val
+    return (init_val * (total_steps - cur_step) + final_val * cur_step) / total_steps
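linear_decay interpolates linearly from init_val to final_val over total_steps and then holds final_val; a typical use is an epsilon-greedy exploration schedule. The concrete numbers below are illustrative only.

    total_steps = 100_000
    for step in (0, 50_000, 100_000, 200_000):
        eps = linear_decay(init_val=1.0, final_val=0.1, cur_step=step, total_steps=total_steps)
        print(step, round(eps, 3))
    # 0       1.0
    # 50000   0.55
    # 100000  0.1
    # 200000  0.1   (clamped once cur_step >= total_steps)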

week04_approx_rl/framebuffer.py

Lines changed: 0 additions & 45 deletions
This file was deleted.
