Merged
Changes from 1 commit
Commits (36):
4e1c05e  [trainer, cfg] feat: Add AlgorithmConfig dataclass for type-safe algo… (openhands-agent, Jun 20, 2025)
9ed10fa  Complete algorithm config dataclass migration (openhands-agent, Jun 21, 2025)
646a1e7  Remove temporary test files (openhands-agent, Jun 21, 2025)
d7aa12b  Revert changes and rename algorithm config to algo config (openhands-agent, Jun 21, 2025)
109340d  Update compute_advantage type annotations and fix linting (openhands-agent, Jun 21, 2025)
89e4b34  Update all core_algos.py functions to use AlgoConfig type annotations (openhands-agent, Jun 21, 2025)
f0f406f  Fix compute_grpo_outcome_advantage function signature to include Algo… (openhands-agent, Jun 21, 2025)
637a358  Merge main into feat/algorithm-config-dataclass (openhands-agent, Jun 22, 2025)
9eeab2e  init frozen adaptor (eric-haibin-lin, Jun 29, 2025)
1b85290  move to profiler folder (eric-haibin-lin, Jun 30, 2025)
ba93223  backward compat namespace move (eric-haibin-lin, Jun 30, 2025)
da8d771  fix lint (eric-haibin-lin, Jun 30, 2025)
0b1cb62  remove omega_conf_to_dataclass type (eric-haibin-lin, Jun 30, 2025)
2c25c76  Refactor algorithm config classes to use frozen dataclasses and BaseC… (devin-ai-integration[bot], Jun 30, 2025)
520b23d  Revert documentation changes and fix omega_conf_to_dataclass call (devin-ai-integration[bot], Jun 30, 2025)
80685b4  Fix config.get() call in compute_advantage function (devin-ai-integration[bot], Jun 30, 2025)
2df1773  Merge main branch and resolve conflicts (devin-ai-integration[bot], Jun 30, 2025)
52c62b3  Fix lint issues after merge (devin-ai-integration[bot], Jun 30, 2025)
562a111  Fix type annotation and docstring coverage issues (devin-ai-integration[bot], Jun 30, 2025)
81d7edf  Add test_base_config_on_cpu.py to allow list and update omega_conf_to… (devin-ai-integration[bot], Jun 30, 2025)
a6df414  fix test (eric-haibin-lin, Jun 30, 2025)
6e743a5  fix litn (eric-haibin-lin, Jun 30, 2025)
ffa8d77  convert to dataclass upfront (eric-haibin-lin, Jun 30, 2025)
12c22b8  Merge branch 'feat/algorithm-config-dataclass' of code.byted.org:data… (eric-haibin-lin, Jun 30, 2025)
e2fac2c  update import stmt (eric-haibin-lin, Jun 30, 2025)
969a734  merge with main (eric-haibin-lin, Jun 30, 2025)
69a1a17  fix lint (eric-haibin-lin, Jun 30, 2025)
f1f4047  add _target_ to megatron config (eric-haibin-lin, Jun 30, 2025)
7bcd0fe  fix ranks init (eric-haibin-lin, Jun 30, 2025)
0eacb9f  adjust line-len (eric-haibin-lin, Jul 1, 2025)
ac19891  adjust len=120 (eric-haibin-lin, Jul 1, 2025)
c907607  merge with main (eric-haibin-lin, Jul 1, 2025)
e63bbb0  fix lint (eric-haibin-lin, Jul 1, 2025)
8bce67d  merge with master (eric-haibin-lin, Jul 3, 2025)
fb93f20  merge with main (eric-haibin-lin, Jul 4, 2025)
c195f00  Merge remote-tracking branch 'oss/main' into feat/algorithm-config-da… (eric-haibin-lin, Jul 4, 2025)
Complete algorithm config dataclass migration
- Updated split_monkey_patch.py: converted all 6 config.algorithm references to algorithm_config
- Updated main_ppo_split.py: added imports and algorithm config conversion
- Updated documentation files: ray_trainer.rst and dapo.md to use algorithm_config
- Fixed remaining config.algorithm references in sppo_ray_trainer.py __init__ method
- Extended AlgorithmConfig with FilterGroupsConfig and sppo_eta fields for full compatibility
- Added verification tests to ensure all changes work correctly
- All trainer files now consistently use self.algorithm_config pattern
- All main files include proper algorithm config conversion
- Documentation updated to reflect new usage patterns
openhands-agent committed Jun 21, 2025
commit 9ed10fad98e4953e58de30ba1f23e30702f13d1d
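
The same pattern is applied in every file touched by this commit: convert the OmegaConf node config.algorithm into an AlgorithmConfig dataclass once, then read typed fields from it instead of dotted config lookups. The following is a minimal sketch of that pattern, not verl's actual implementation; only the two-argument call omega_conf_to_dataclass(config.algorithm, AlgorithmConfig) and the field names come from this diff, while the defaults and the helper body are illustrative assumptions.

# Illustrative sketch only; field defaults and the helper body are assumptions.
from dataclasses import dataclass, field
from typing import Optional

from omegaconf import OmegaConf


@dataclass
class FilterGroupsConfig:
    enable: bool = False
    metric: Optional[str] = None
    max_num_gen_batches: int = 0


@dataclass
class AlgorithmConfig:
    adv_estimator: str = "gae"
    gamma: float = 1.0
    lam: float = 1.0
    use_kl_in_reward: bool = False
    kl_penalty: str = "kl"
    norm_adv_by_std_in_grpo: bool = True
    sppo_eta: float = 1.0
    filter_groups: FilterGroupsConfig = field(default_factory=FilterGroupsConfig)


def omega_conf_to_dataclass(cfg, dataclass_type):
    """Stand-in for verl.utils.config.omega_conf_to_dataclass: validate the raw
    OmegaConf node against the dataclass schema and materialize an instance."""
    schema = OmegaConf.structured(dataclass_type)
    return OmegaConf.to_object(OmegaConf.merge(schema, cfg))


# Usage mirroring the main_* entry points in this diff:
#   algorithm_config = omega_conf_to_dataclass(config.algorithm, AlgorithmConfig)
#   if algorithm_config.use_kl_in_reward: ...
# Trainers then read self.algorithm_config.<field> instead of self.config.algorithm.<field>.
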
2 changes: 1 addition & 1 deletion docs/algo/dapo.md
@@ -90,7 +90,7 @@ prompt_bsz = self.config.data.train_batch_size
if num_prompt_in_batch < prompt_bsz:
print(f'{num_prompt_in_batch=} < {prompt_bsz=}')
num_gen_batches += 1
-max_num_gen_batches = self.config.algorithm.filter_groups.max_num_gen_batches
+max_num_gen_batches = self.algorithm_config.filter_groups.max_num_gen_batches
if max_num_gen_batches <= 0 or num_gen_batches < max_num_gen_batches:
print(f'{num_gen_batches=} < {max_num_gen_batches=}. Keep generating...')
continue
8 changes: 4 additions & 4 deletions docs/workers/ray_trainer.rst
@@ -177,14 +177,14 @@ To extend to other RLHF algorithms, such as DPO, GRPO, please refer to
# compute rewards. apply_kl_penalty if available
batch, kl_metrics = apply_kl_penalty(batch,
kl_ctrl=self.kl_ctrl_in_reward,
-kl_penalty=self.config.algorithm.kl_penalty)
+kl_penalty=self.algorithm_config.kl_penalty)
metrics.update(kl_metrics)

# compute advantages, executed on the driver process
batch = compute_advantage(batch,
-self.config.algorithm.gamma,
-self.config.algorithm.lam,
-adv_estimator=self.config.algorithm.adv_estimator)
+self.algorithm_config.gamma,
+self.algorithm_config.lam,
+adv_estimator=self.algorithm_config.adv_estimator)
metrics['timing/adv'] = timer.last

# update critic
7 changes: 6 additions & 1 deletion examples/split_placement/main_ppo_split.py
@@ -22,6 +22,8 @@

from verl import DataProto
from verl.trainer.ppo.ray_trainer import RayPPOTrainer
+from verl.trainer.config.algorithm_config import AlgorithmConfig
+from verl.utils.config import omega_conf_to_dataclass
from verl.utils.reward_score import gsm8k, math


@@ -166,8 +168,11 @@ def main_task(config):
Role.Critic: critic_pool_id,
}

+# Convert algorithm config to dataclass
+algorithm_config = omega_conf_to_dataclass(config.algorithm, AlgorithmConfig)
+
# use reference model
-if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
+if algorithm_config.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker)
mapping[Role.RefPolicy] = actor_rollout_ref_pool_id

16 changes: 9 additions & 7 deletions examples/split_placement/split_monkey_patch.py
@@ -31,6 +31,8 @@
compute_data_metrics,
compute_timing_metrics,
)
+from verl.trainer.config.algorithm_config import AlgorithmConfig
+from verl.utils.config import omega_conf_to_dataclass
from verl.utils.metric import reduce_metrics


@@ -88,7 +90,7 @@ def fit(self):
timing_raw.update(gen_batch_output.meta_info["timing"])
gen_batch_output.meta_info.pop("timing", None)

-if self.config.algorithm.adv_estimator == AdvantageEstimator.REMAX:
+if self.algorithm_config.adv_estimator == AdvantageEstimator.REMAX:
with _timer("gen_max", timing_raw):
gen_baseline_batch = deepcopy(gen_batch)
gen_baseline_batch.meta_info["do_sample"] = False
@@ -150,19 +152,19 @@ def fit(self):
batch.batch["token_level_scores"] = reward_tensor

# compute rewards. apply_kl_penalty if available
-if self.config.algorithm.use_kl_in_reward:
-batch, kl_metrics = apply_kl_penalty(batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.config.algorithm.kl_penalty)
+if self.algorithm_config.use_kl_in_reward:
+batch, kl_metrics = apply_kl_penalty(batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.algorithm_config.kl_penalty)
metrics.update(kl_metrics)
else:
batch.batch["token_level_rewards"] = batch.batch["token_level_scores"]

# compute advantages, executed on the driver process
-norm_adv_by_std_in_grpo = self.config.algorithm.get("norm_adv_by_std_in_grpo", True)
+norm_adv_by_std_in_grpo = self.algorithm_config.norm_adv_by_std_in_grpo
batch = compute_advantage(
batch,
-adv_estimator=self.config.algorithm.adv_estimator,
-gamma=self.config.algorithm.gamma,
-lam=self.config.algorithm.lam,
+adv_estimator=self.algorithm_config.adv_estimator,
+gamma=self.algorithm_config.gamma,
+lam=self.algorithm_config.lam,
num_repeat=self.config.actor_rollout_ref.rollout.n,
norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo,
)
22 changes: 12 additions & 10 deletions recipe/dapo/dapo_ray_trainer.py
@@ -34,6 +34,8 @@
reduce_metrics,
)
from verl.trainer.ppo.ray_trainer import AdvantageEstimator, RayPPOTrainer, apply_kl_penalty, compute_advantage, compute_response_mask
+from verl.trainer.config.algorithm_config import AlgorithmConfig
+from verl.utils.config import omega_conf_to_dataclass
from verl.utils.debug import marked_timer


@@ -123,7 +125,7 @@ def fit(self):
timing_raw.update(gen_batch_output.meta_info["timing"])
gen_batch_output.meta_info.pop("timing", None)

-if self.config.algorithm.adv_estimator == AdvantageEstimator.REMAX:
+if self.algorithm_config.adv_estimator == AdvantageEstimator.REMAX:
with marked_timer("gen_max", timing_raw, "red"):
gen_baseline_batch = deepcopy(gen_batch)
gen_baseline_batch.meta_info["do_sample"] = False
@@ -170,17 +172,17 @@ def fit(self):
new_batch.non_tensor_batch.update({k: np.array(v) for k, v in reward_extra_infos_dict.items()})

# compute rewards. apply_kl_penalty if available
-if self.config.algorithm.use_kl_in_reward:
-new_batch, kl_metrics = apply_kl_penalty(new_batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.config.algorithm.kl_penalty)
+if self.algorithm_config.use_kl_in_reward:
+new_batch, kl_metrics = apply_kl_penalty(new_batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.algorithm_config.kl_penalty)
metrics.update(kl_metrics) # TODO: This will be cleared if we use multiple genenration batches
else:
new_batch.batch["token_level_rewards"] = new_batch.batch["token_level_scores"]

-if not self.config.algorithm.filter_groups.enable:
+if not self.algorithm_config.filter_groups.enable:
batch = new_batch
else: # NOTE: When prompts after filtering is less than train batch size,
# we skip to the next generation batch
-metric_name = self.config.algorithm.filter_groups.metric
+metric_name = self.algorithm_config.filter_groups.metric
if metric_name == "seq_final_reward":
# Turn to numpy for easier filtering
new_batch.non_tensor_batch["seq_final_reward"] = new_batch.batch["token_level_rewards"].sum(dim=-1).numpy()
@@ -210,7 +212,7 @@ def fit(self):
prompt_bsz = self.config.data.train_batch_size
if num_prompt_in_batch < prompt_bsz:
print(f"{num_prompt_in_batch=} < {prompt_bsz=}")
-max_num_gen_batches = self.config.algorithm.filter_groups.max_num_gen_batches
+max_num_gen_batches = self.algorithm_config.filter_groups.max_num_gen_batches
if max_num_gen_batches <= 0 or num_gen_batches < max_num_gen_batches:
print(f"{num_gen_batches=}. Keep generating...")
progress_bar.update(1)
@@ -263,12 +265,12 @@ def fit(self):

with marked_timer("adv", timing_raw, "brown"):
# compute advantages, executed on the driver process
-norm_adv_by_std_in_grpo = self.config.algorithm.get("norm_adv_by_std_in_grpo", True)
+norm_adv_by_std_in_grpo = self.algorithm_config.norm_adv_by_std_in_grpo
batch = compute_advantage(
batch,
-adv_estimator=self.config.algorithm.adv_estimator,
-gamma=self.config.algorithm.gamma,
-lam=self.config.algorithm.lam,
+adv_estimator=self.algorithm_config.adv_estimator,
+gamma=self.algorithm_config.gamma,
+lam=self.algorithm_config.lam,
num_repeat=self.config.actor_rollout_ref.rollout.n,
norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo,
)
7 changes: 6 additions & 1 deletion recipe/dapo/main_dapo.py
@@ -23,6 +23,8 @@
from omegaconf import OmegaConf

from verl.trainer.ppo.reward import get_custom_reward_fn
+from verl.trainer.config.algorithm_config import AlgorithmConfig
+from verl.utils.config import omega_conf_to_dataclass

from .dapo_ray_trainer import RayDAPOTrainer

@@ -122,8 +124,11 @@ def run(self, config):
role_worker_mapping[Role.RewardModel] = ray.remote(RewardModelWorker)
mapping[Role.RewardModel] = global_pool_id

+# Convert algorithm config to dataclass
+algorithm_config = omega_conf_to_dataclass(config.algorithm, AlgorithmConfig)
+
# reference model
-if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
+if algorithm_config.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker)
mapping[Role.RefPolicy] = global_pool_id

22 changes: 12 additions & 10 deletions recipe/entropy/entropy_ray_trainer.py
@@ -33,6 +33,8 @@
reduce_metrics,
)
from verl.trainer.ppo.ray_trainer import AdvantageEstimator, RayPPOTrainer, apply_kl_penalty, compute_advantage, compute_response_mask
+from verl.trainer.config.algorithm_config import AlgorithmConfig
+from verl.utils.config import omega_conf_to_dataclass
from verl.utils.debug import simple_timer


@@ -117,7 +119,7 @@ def fit(self):
gen_batch_output = self.async_rollout_manager.generate_sequences(gen_batch)
self.async_rollout_manager.sleep()

-if self.config.algorithm.adv_estimator == AdvantageEstimator.REMAX:
+if self.algorithm_config.adv_estimator == AdvantageEstimator.REMAX:
with simple_timer("gen_max", timing_raw):
gen_baseline_batch = deepcopy(gen_batch)
gen_baseline_batch.meta_info["do_sample"] = False
@@ -165,17 +167,17 @@ def fit(self):
new_batch.non_tensor_batch.update({k: np.array(v) for k, v in reward_extra_infos_dict.items()})

# compute rewards. apply_kl_penalty if available
-if self.config.algorithm.use_kl_in_reward:
-new_batch, kl_metrics = apply_kl_penalty(new_batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.config.algorithm.kl_penalty)
+if self.algorithm_config.use_kl_in_reward:
+new_batch, kl_metrics = apply_kl_penalty(new_batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.algorithm_config.kl_penalty)
metrics.update(kl_metrics) # TODO: This will be cleared if we use multiple genenration batches
else:
new_batch.batch["token_level_rewards"] = new_batch.batch["token_level_scores"]

-if not self.config.algorithm.filter_groups.enable:
+if not self.algorithm_config.filter_groups.enable:
batch = new_batch
else: # NOTE: When prompts after filtering is less than train batch size,
# we skip to the next generation batch
-metric_name = self.config.algorithm.filter_groups.metric
+metric_name = self.algorithm_config.filter_groups.metric
if metric_name == "seq_final_reward":
# Turn to numpy for easier filtering
new_batch.non_tensor_batch["seq_final_reward"] = new_batch.batch["token_level_rewards"].sum(dim=-1).numpy()
@@ -205,7 +207,7 @@ def fit(self):
prompt_bsz = self.config.data.train_batch_size
if num_prompt_in_batch < prompt_bsz:
print(f"{num_prompt_in_batch=} < {prompt_bsz=}")
-max_num_gen_batches = self.config.algorithm.filter_groups.max_num_gen_batches
+max_num_gen_batches = self.algorithm_config.filter_groups.max_num_gen_batches
if max_num_gen_batches <= 0 or num_gen_batches < max_num_gen_batches:
print(f"{num_gen_batches=}. Keep generating...")
continue
@@ -249,12 +251,12 @@ def fit(self):

with simple_timer("adv", timing_raw):
# compute advantages, executed on the driver process
-norm_adv_by_std_in_grpo = self.config.algorithm.get("norm_adv_by_std_in_grpo", True)
+norm_adv_by_std_in_grpo = self.algorithm_config.norm_adv_by_std_in_grpo
batch = compute_advantage(
batch,
-adv_estimator=self.config.algorithm.adv_estimator,
-gamma=self.config.algorithm.gamma,
-lam=self.config.algorithm.lam,
+adv_estimator=self.algorithm_config.adv_estimator,
+gamma=self.algorithm_config.gamma,
+lam=self.algorithm_config.lam,
num_repeat=self.config.actor_rollout_ref.rollout.n,
norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo,
)
7 changes: 6 additions & 1 deletion recipe/entropy/main_entropy.py
@@ -20,6 +20,8 @@

from .entropy_ray_trainer import RayEntropyTrainer
from .reward import load_reward_manager
+from verl.trainer.config.algorithm_config import AlgorithmConfig
+from verl.utils.config import omega_conf_to_dataclass


@hydra.main(config_path="config", config_name="entropy_trainer", version_base=None)
@@ -129,8 +131,11 @@ def run(self, config):
role_worker_mapping[Role.RewardModel] = ray.remote(RewardModelWorker)
mapping[Role.RewardModel] = global_pool_id

+# Convert algorithm config to dataclass
+algorithm_config = omega_conf_to_dataclass(config.algorithm, AlgorithmConfig)
+
# use reference model
-if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
+if algorithm_config.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker)
mapping[Role.RefPolicy] = global_pool_id

7 changes: 6 additions & 1 deletion recipe/prime/main_prime.py
@@ -33,6 +33,8 @@
import ray

from .prime_ray_trainer import RayPRIMETrainer
+from verl.trainer.config.algorithm_config import AlgorithmConfig
+from verl.utils.config import omega_conf_to_dataclass


@hydra.main(config_path="config", config_name="prime_trainer", version_base=None)
@@ -103,8 +105,11 @@ def main_task(config, compute_score=None):
Role.ActorRollout: global_pool_id,
}

+# Convert algorithm config to dataclass
+algorithm_config = omega_conf_to_dataclass(config.algorithm, AlgorithmConfig)
+
# use reference model
-if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
+if algorithm_config.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker)
mapping[Role.RefPolicy] = global_pool_id

6 changes: 4 additions & 2 deletions recipe/prime/prime_ray_trainer.py
@@ -31,7 +31,9 @@
from verl.trainer.ppo.core_algos import agg_loss
from verl.trainer.ppo.metric_utils import _compute_response_info
from verl.trainer.ppo.ray_trainer import RayPPOTrainer, ResourcePoolManager, Role, WorkerType
+from verl.trainer.config.algorithm_config import AlgorithmConfig
from verl.utils.checkpoint.checkpoint_manager import find_latest_ckpt_path
+from verl.utils.config import omega_conf_to_dataclass
from verl.utils.dataset.rl_dataset import RLHFDataset, collate_fn
from verl.utils.debug.performance import simple_timer
from verl.utils.metric import reduce_metrics
@@ -357,7 +359,7 @@ def fit(self):
timing_raw.update(gen_batch_output.meta_info["timing"])
gen_batch_output.meta_info.pop("timing", None)

-if self.config.algorithm.adv_estimator == "remax":
+if self.algorithm_config.adv_estimator == "remax":
with simple_timer("gen_max", timing_raw):
gen_baseline_batch = deepcopy(gen_batch)
gen_baseline_batch.meta_info["do_sample"] = False
@@ -452,7 +454,7 @@ def fit(self):
metrics.update(reward_output_metrics)

# compute advantages, executed on the driver process
-batch = compute_advantage(batch, adv_estimator=self.config.algorithm.adv_estimator, config=self.config)
+batch = compute_advantage(batch, adv_estimator=self.algorithm_config.adv_estimator, config=self.config)

# update actor
with simple_timer("update_actor", timing_raw):
7 changes: 6 additions & 1 deletion recipe/sppo/main_sppo.py
@@ -23,6 +23,8 @@
import ray

from verl.trainer.ppo.reward import load_reward_manager
+from verl.trainer.config.algorithm_config import AlgorithmConfig
+from verl.utils.config import omega_conf_to_dataclass

from .sppo_ray_trainer import RaySPPOTrainer

@@ -122,8 +124,11 @@ def run(self, config):
role_worker_mapping[Role.RewardModel] = ray.remote(RewardModelWorker)
mapping[Role.RewardModel] = global_pool_id

+# Convert algorithm config to dataclass
+algorithm_config = omega_conf_to_dataclass(config.algorithm, AlgorithmConfig)
+
# use reference model
-if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
+if algorithm_config.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
role_worker_mapping[Role.RefPolicy] = ray.remote(SPPOActorRolloutRefWorker)
mapping[Role.RefPolicy] = global_pool_id
