verl-project
diff --git a/docs/advance/rollout_is_migration.md b/docs/advance/rollout_is_migration.md
Lines changed: 27 additions & 21 deletions
diff --git a/examples/rollout_importance_sampling/README.md b/examples/rollout_importance_sampling/README.md
Lines changed: 22 additions & 22 deletions
diff --git a/examples/rollout_importance_sampling/run_with_rollout_is.sh b/examples/rollout_importance_sampling/run_with_rollout_is.sh
Lines changed: 7 additions & 5 deletions
diff --git a/recipe/dapo/dapo_ray_trainer.py b/recipe/dapo/dapo_ray_trainer.py
Lines changed: 5 additions & 5 deletions
diff --git a/recipe/one_step_off_policy/ray_trainer.py b/recipe/one_step_off_policy/ray_trainer.py
Lines changed: 5 additions & 5 deletions
diff --git a/tests/trainer/ppo/test_rollout_is_integration.py b/tests/trainer/ppo/test_rollout_is_integration.py
Lines changed: 55 additions & 8 deletions
diff --git a/verl/trainer/config/_generated_ppo_megatron_trainer.yaml b/verl/trainer/config/_generated_ppo_megatron_trainer.yaml
Lines changed: 6 additions & 6 deletions
@@ -36,10 +36,12 @@ The old implementation:
 ### **Added (New Implementation)**
 
 ```yaml
-# New Rollout IS configuration
+# New Rollout IS configuration (all in algorithm config)
 algorithm:
-  rollout_is: true
+  # Main control: set threshold to enable (null = disabled)
   rollout_is_threshold: 2.0
+  # Whether to apply weights to loss (default: false = metrics only)
+  rollout_is: true
   rollout_is_threshold_lower: null  # Auto-reciprocal
   rollout_is_level: token
   rollout_is_mode: truncate
@@ -121,11 +123,17 @@ The new implementation:
 
 ## Configuration Parameters
 
+### `algorithm.rollout_is_threshold` (float or null)
+**Main on/off switch.** Upper threshold for IS weights.
+- `null` = disabled (no computation, no metrics)
+- `float` value (e.g., 2.0) = enabled (compute weights and metrics)
+
 ### `algorithm.rollout_is` (bool)
-Enable/disable IS correction. Default: `False`
+Whether to apply IS weights to the policy loss. Only takes effect when `rollout_is_threshold` is set. Default: `False`
+- `true` = apply weights to loss (full IS correction)
+- `false` = compute metrics only (useful for monitoring before enabling)
 
-### `algorithm.rollout_is_threshold` (float or null)
-Upper threshold for IS weights. Set to `null` to disable IS completely.
+**Recommended `rollout_is_threshold` ranges:**
 - Token level: 1.5 - 5.0
 - Sequence level: 2.0 - 10.0
 - Geometric level: 1.0002 - 1.001
@@ -164,8 +172,8 @@ actor_rollout_ref:
 **After (New):**
 ```yaml
 algorithm:
-  rollout_is: true
-  rollout_is_threshold: 2.0
+  rollout_is_threshold: 2.0  # Main control
+  rollout_is: true           # Apply to loss (default: false)
   rollout_is_level: token
   rollout_is_mode: truncate
 
@@ -430,41 +438,39 @@ Monitor metrics for 1-2 epochs before adjusting parameters.
 
 ## Configuration Examples
 
-### Example 1: Token-level with Truncate
+### Example 1: Full IS Correction
 ```yaml
 algorithm:
-  rollout_is: true
   rollout_is_threshold: 2.0
+  rollout_is: true  # Apply weights to loss
   rollout_is_level: token
   rollout_is_mode: truncate
 ```
 
-### Example 2: Geometric Mean with Clip
+### Example 2: Metrics Only (Monitoring Mode)
 ```yaml
 algorithm:
-  rollout_is: true
-  rollout_is_threshold: 1.0002
-  rollout_is_threshold_lower: 0.9998
-  rollout_is_level: geometric
-  rollout_is_mode: clip
+  rollout_is_threshold: 2.0
+  rollout_is: false  # Compute metrics, don't apply weights
+  rollout_is_level: token
+  rollout_is_mode: truncate
 ```
 
-### Example 3: Wider Threshold with Clip
+### Example 3: Geometric Mean with Clip
 ```yaml
 algorithm:
+  rollout_is_threshold: 1.0002
   rollout_is: true
-  rollout_is_threshold: 3.0
-  rollout_is_threshold_lower: 0.33
-  rollout_is_level: token
+  rollout_is_threshold_lower: 0.9998
+  rollout_is_level: geometric
   rollout_is_mode: clip
-  rollout_is_veto_threshold: 1e-5
 ```
 
 ### Example 4: Asymmetric Thresholds
 ```yaml
 algorithm:
-  rollout_is: true
   rollout_is_threshold: 5.0
+  rollout_is: true
   rollout_is_threshold_lower: 0.8
   rollout_is_level: token
   rollout_is_mode: clip
 
@@ -19,8 +19,10 @@ Rollout Importance Sampling corrects for distribution mismatch when:
 
 ```yaml
 algorithm:
-  rollout_is: true
+  # Main control: set threshold to enable (null = disabled)
   rollout_is_threshold: 2.0
+  # Whether to apply weights to policy loss (true) or just compute metrics (false)
+  rollout_is: true
   rollout_is_level: token
   rollout_is_mode: truncate
 
@@ -56,66 +58,64 @@ bash examples/rollout_importance_sampling/run_with_rollout_is.sh
 
 ### Key Parameters
 
-- `rollout_is`: Enable/disable IS correction (boolean)
-- `rollout_is_threshold`: Upper threshold for IS weights (float or null to disable)
+- `rollout_is_threshold`: Upper threshold for IS weights (null = disabled, float = enabled). **Main on/off switch.**
+- `rollout_is`: Whether to apply weights to loss (true) or just compute metrics (false). Default: false.
 - `rollout_is_threshold_lower`: Lower threshold (null = auto 1/upper)
 - `rollout_is_veto_threshold`: Catastrophic outlier threshold (default: 1e-4)
 
 ## Configuration Examples
 
-### Example 1: Token-level with Truncate
+### Example 1: Full IS Correction (Apply Weights)
 
 ```yaml
 algorithm:
-  rollout_is: true
   rollout_is_threshold: 2.0
+  rollout_is: true  # Apply to loss
   rollout_is_level: token
   rollout_is_mode: truncate
   rollout_is_veto_threshold: 1e-4
 ```
 
-### Example 2: Geometric Mean with Clip
+### Example 2: Metrics Only (No Weight Application)
+
+```yaml
+algorithm:
+  rollout_is_threshold: 2.0
+  rollout_is: false  # Compute metrics only, don't apply to loss
+  rollout_is_level: token
+  rollout_is_mode: truncate
+```
+
+### Example 3: Geometric Mean with Clip
 
 ```yaml
 algorithm:
-  rollout_is: true
   rollout_is_threshold: 1.0002
+  rollout_is: true
   rollout_is_threshold_lower: 0.9998
   rollout_is_level: geometric
   rollout_is_mode: clip
   rollout_is_veto_threshold: 1e-4
 ```
 
-### Example 3: Sequence-level with Truncate
+### Example 4: Sequence-level with Truncate
 
 ```yaml
 algorithm:
-  rollout_is: true
   rollout_is_threshold: 5.0
+  rollout_is: true
   rollout_is_threshold_lower: null  # Auto-reciprocal: 0.2
   rollout_is_level: sequence
   rollout_is_mode: truncate
   rollout_is_veto_threshold: 1e-4
 ```
 
-### Example 4: Wider Threshold with Clip
-
-```yaml
-algorithm:
-  rollout_is: true
-  rollout_is_threshold: 3.0
-  rollout_is_threshold_lower: 0.33
-  rollout_is_level: token
-  rollout_is_mode: clip
-  rollout_is_veto_threshold: 1e-5
-```
-
 ### Example 5: Asymmetric Thresholds
 
 ```yaml
 algorithm:
-  rollout_is: true
   rollout_is_threshold: 5.0
+  rollout_is: true
   rollout_is_threshold_lower: 0.8
   rollout_is_level: token
   rollout_is_mode: clip
 
@@ -8,12 +8,13 @@ set -xeuo pipefail
 # Rollout Importance Sampling Configuration
 # ==============================================================================
 
-# Enable rollout IS
-rollout_is=True
-
-# Upper threshold for IS weights
+# Main control: Upper threshold for IS weights (null = disabled, float = enabled)
 rollout_is_threshold=2.0
 
+# Whether to apply IS weights to policy loss
+# true = apply weights to loss, false = compute metrics only
+rollout_is=true
+
 # Lower threshold (null = auto-reciprocal, i.e., 1/upper = 0.5)
 rollout_is_threshold_lower=null
 
@@ -87,9 +88,10 @@ python3 -m verl.trainer.main_ppo \
 echo "Training completed!"
 echo ""
 echo "Rollout IS Configuration:"
+echo "  - Threshold: ${rollout_is_threshold}"
+echo "  - Apply to loss: ${rollout_is}"
 echo "  - Level: ${rollout_is_level}"
 echo "  - Mode: ${rollout_is_mode}"
-echo "  - Threshold: ${rollout_is_threshold}"
 echo ""
 echo "Monitor these key metrics in wandb:"
 echo "  - mismatch/rollout_is_mean (should be ~1.0)"
 
@@ -304,6 +304,11 @@ def fit(self):
                             values = self.critic_wg.compute_values(batch)
                             batch = batch.union(values)
 
+                    # Compute rollout IS weights and mismatch metrics (inherited from RayPPOTrainer)
+                    batch, is_metrics = self.compute_rollout_importance_weights_and_add_to_batch(batch)
+                    # IS and mismatch metrics already have mismatch/ prefix
+                    metrics.update(is_metrics)
+
                     with marked_timer("adv", timing_raw, "brown"):
                         # compute advantages, executed on the driver process
                         norm_adv_by_std_in_grpo = self.config.algorithm.get("norm_adv_by_std_in_grpo", True)
@@ -316,11 +321,6 @@ def fit(self):
                             norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo,
                         )
 
-                        # Compute rollout IS weights and mismatch metrics (inherited from RayPPOTrainer)
-                        batch, is_metrics = self.compute_rollout_importance_weights_and_add_to_batch(batch)
-                        # IS and mismatch metrics already have mismatch/ prefix
-                        metrics.update(is_metrics)
-
                     # update critic
                     if self.use_critic:
                         with marked_timer("update_critic", timing_raw, "pink"):
 
@@ -577,6 +577,11 @@ def fit(self):
                     else:
                         batch.batch["token_level_rewards"] = batch.batch["token_level_scores"]
 
+                    # Compute rollout IS weights and mismatch metrics (inherited from RayPPOTrainer)
+                    batch, is_metrics = self.compute_rollout_importance_weights_and_add_to_batch(batch)
+                    # IS and mismatch metrics already have mismatch/ prefix
+                    metrics.update(is_metrics)
+
                     # compute advantages, executed on the driver process
 
                     norm_adv_by_std_in_grpo = self.config.algorithm.get(
@@ -593,11 +598,6 @@ def fit(self):
                         config=self.config.algorithm,
                     )
 
-                    # Compute rollout IS weights and mismatch metrics (inherited from RayPPOTrainer)
-                    batch, is_metrics = self.compute_rollout_importance_weights_and_add_to_batch(batch)
-                    # IS and mismatch metrics already have mismatch/ prefix
-                    metrics.update(is_metrics)
-
                 # update critic
                 if self.use_critic:
                     with marked_timer("update_critic", timing_raw, color="pink"):
 
@@ -40,25 +40,25 @@ def sample_data(self):
 
     @pytest.fixture
     def config_with_rollout_is(self):
-        """Create config with rollout IS enabled."""
+        """Create config for policy loss computation.
+
+        Note: rollout_is config has been moved to algorithm config.
+        This config only needs fields used by policy loss (clip_ratio, etc).
+        """
         config = ActorConfig(
             strategy="fsdp",
             rollout_n=1,
             ppo_micro_batch_size=2,
-            rollout_is=True,
-            rollout_is_threshold=2.0,
-            rollout_is_level="token",
-            rollout_is_mode="truncate",
-            rollout_is_veto_threshold=1e-4,
             clip_ratio=0.2,
         )
         return config
 
     def test_policy_loss_with_rollout_is(self, sample_data, config_with_rollout_is):
         """Test that policy loss computation works with rollout IS weights.
 
-        Note: In production, IS weights are computed centrally in the trainer.
-        This test simulates that by computing weights before passing to policy loss.
+        Note: In production, IS weights are computed centrally in the trainer
+        (before advantage computation) and passed to policy loss.
+        This test simulates that workflow.
         """
         # First compute IS weights (as trainer would do centrally)
         rollout_is_weights_proto, _ = compute_rollout_importance_weights(
@@ -189,6 +189,53 @@ def test_veto_mechanism(self):
         assert metrics["mismatch/rollout_is_veto_fraction"] > 0
         assert metrics["mismatch/rollout_is_veto_fraction"] <= 1.0
 
+    def test_metrics_only_mode(self, sample_data, config_with_rollout_is):
+        """Test metrics-only mode: compute IS weights/metrics but don't apply to loss.
+
+        This tests the use case where rollout_is_threshold is set (enables computation)
+        but rollout_is=False (disables weight application to policy loss).
+        """
+        # Compute IS weights (as trainer would do)
+        rollout_is_weights_proto, is_metrics = compute_rollout_importance_weights(
+            old_log_prob=sample_data["old_log_prob"],
+            rollout_log_prob=sample_data["rollout_log_prob"],
+            response_mask=sample_data["response_mask"],
+            rollout_is_level="token",
+            rollout_is_mode="truncate",
+            rollout_is_threshold=2.0,
+        )
+
+        # Metrics should be computed
+        assert len(is_metrics) > 0
+        assert "mismatch/rollout_is_mean" in is_metrics
+
+        # In metrics-only mode, we compute loss WITHOUT applying weights
+        # (simulating rollout_is=False)
+        pg_loss_no_weights, _, _, _ = compute_policy_loss_vanilla(
+            old_log_prob=sample_data["old_log_prob"],
+            log_prob=sample_data["log_prob"],
+            advantages=sample_data["advantages"],
+            response_mask=sample_data["response_mask"],
+            loss_agg_mode="token-mean",
+            config=config_with_rollout_is,
+            rollout_is_weights=None,  # Don't apply weights
+        )
+
+        # Compare to loss WITH weights (rollout_is=True)
+        rollout_is_weights = rollout_is_weights_proto.batch["rollout_is_weights"]
+        pg_loss_with_weights, _, _, _ = compute_policy_loss_vanilla(
+            old_log_prob=sample_data["old_log_prob"],
+            log_prob=sample_data["log_prob"],
+            advantages=sample_data["advantages"],
+            response_mask=sample_data["response_mask"],
+            loss_agg_mode="token-mean",
+            config=config_with_rollout_is,
+            rollout_is_weights=rollout_is_weights,
+        )
+
+        # Losses should be different (weights have an effect)
+        assert not torch.allclose(pg_loss_no_weights, pg_loss_with_weights)
+
 
 if __name__ == "__main__":
     pytest.main([__file__, "-v", "-s"])
@@ -75,12 +75,6 @@ actor_rollout_ref:
     clip_ratio_c: 3.0
     loss_agg_mode: token-mean
     entropy_coeff: 0
-    rollout_is: false
-    rollout_is_threshold: null
-    rollout_is_threshold_lower: null
-    rollout_is_level: token
-    rollout_is_mode: truncate
-    rollout_is_veto_threshold: 0.0001
     use_kl_loss: false
     use_torch_compile: true
     kl_loss_coef: 0.001
@@ -487,6 +481,12 @@ algorithm:
   pf_ppo:
     reweight_method: pow
     weight_pow: 2.0
+  rollout_is_threshold: null
+  rollout_is_threshold_lower: null
+  rollout_is_level: token
+  rollout_is_mode: truncate
+  rollout_is_veto_threshold: 0.0001
+  rollout_is: false
 trainer:
   balance_batch: true
   total_epochs: 30