Merged
Changes from 1 commit
36 commits
4e1c05e
[trainer, cfg] feat: Add AlgorithmConfig dataclass for type-safe algo…
openhands-agent Jun 20, 2025
9ed10fa
Complete algorithm config dataclass migration
openhands-agent Jun 21, 2025
646a1e7
Remove temporary test files
openhands-agent Jun 21, 2025
d7aa12b
Revert changes and rename algorithm config to algo config
openhands-agent Jun 21, 2025
109340d
Update compute_advantage type annotations and fix linting
openhands-agent Jun 21, 2025
89e4b34
Update all core_algos.py functions to use AlgoConfig type annotations
openhands-agent Jun 21, 2025
f0f406f
Fix compute_grpo_outcome_advantage function signature to include Algo…
openhands-agent Jun 21, 2025
637a358
Merge main into feat/algorithm-config-dataclass
openhands-agent Jun 22, 2025
9eeab2e
init frozen adaptor
eric-haibin-lin Jun 29, 2025
1b85290
move to profiler folder
eric-haibin-lin Jun 30, 2025
ba93223
backward compat namespace move
eric-haibin-lin Jun 30, 2025
da8d771
fix lint
eric-haibin-lin Jun 30, 2025
0b1cb62
remove omega_conf_to_dataclass type
eric-haibin-lin Jun 30, 2025
2c25c76
Refactor algorithm config classes to use frozen dataclasses and BaseC…
devin-ai-integration[bot] Jun 30, 2025
520b23d
Revert documentation changes and fix omega_conf_to_dataclass call
devin-ai-integration[bot] Jun 30, 2025
80685b4
Fix config.get() call in compute_advantage function
devin-ai-integration[bot] Jun 30, 2025
2df1773
Merge main branch and resolve conflicts
devin-ai-integration[bot] Jun 30, 2025
52c62b3
Fix lint issues after merge
devin-ai-integration[bot] Jun 30, 2025
562a111
Fix type annotation and docstring coverage issues
devin-ai-integration[bot] Jun 30, 2025
81d7edf
Add test_base_config_on_cpu.py to allow list and update omega_conf_to…
devin-ai-integration[bot] Jun 30, 2025
a6df414
fix test
eric-haibin-lin Jun 30, 2025
6e743a5
fix litn
eric-haibin-lin Jun 30, 2025
ffa8d77
convert to dataclass upfront
eric-haibin-lin Jun 30, 2025
12c22b8
Merge branch 'feat/algorithm-config-dataclass' of code.byted.org:data…
eric-haibin-lin Jun 30, 2025
e2fac2c
update import stmt
eric-haibin-lin Jun 30, 2025
969a734
merge with main
eric-haibin-lin Jun 30, 2025
69a1a17
fix lint
eric-haibin-lin Jun 30, 2025
f1f4047
add _target_ to megatron config
eric-haibin-lin Jun 30, 2025
7bcd0fe
fix ranks init
eric-haibin-lin Jun 30, 2025
0eacb9f
adjust line-len
eric-haibin-lin Jul 1, 2025
ac19891
adjust len=120
eric-haibin-lin Jul 1, 2025
c907607
merge with main
eric-haibin-lin Jul 1, 2025
e63bbb0
fix lint
eric-haibin-lin Jul 1, 2025
8bce67d
merge with master
eric-haibin-lin Jul 3, 2025
fb93f20
merge with main
eric-haibin-lin Jul 4, 2025
c195f00
Merge remote-tracking branch 'oss/main' into feat/algorithm-config-da…
eric-haibin-lin Jul 4, 2025
Revert changes and rename algorithm config to algo config
- Reverted changes in entropy_ray_trainer.py, prime_ray_trainer.py, sppo_ray_trainer.py, split_monkey_patch.py
- Reverted corresponding main files: main_entropy.py, main_prime.py, main_sppo.py, main_ppo_split.py
- Renamed algorithm_config.py → algo_config.py
- Renamed AlgorithmConfig class → AlgoConfig class
- Updated all remaining references to use algo_config namespace
- Updated imports from algorithm_config to algo_config
- Renamed test file: test_algorithm_config_on_cpu.py → test_algo_config_on_cpu.py
- Updated documentation files to use algo_config
- Maintained functionality in files that were kept: main_ppo.py, ray_trainer.py, dapo files
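
In practical terms, the commit leaves two access patterns in the tree. A minimal sketch of both, assuming only the import paths and attribute names that appear in the diffs below (the helper function itself is hypothetical, not code from this PR):

from verl.trainer.config.algo_config import AlgoConfig
from verl.utils.config import omega_conf_to_dataclass


def choose_ref_policy(config):
    # Kept entry points (main_ppo.py, main_dapo.py): convert the algorithm node to a typed
    # dataclass once, then use attribute access.
    algo_config = omega_conf_to_dataclass(config.algorithm, AlgoConfig)
    use_ref_from_dataclass = algo_config.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss

    # Reverted recipes (entropy, prime, sppo, split placement): read the raw OmegaConf node.
    use_ref_from_omegaconf = config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss

    # Both routes should agree; the dataclass route only adds type safety.
    assert use_ref_from_dataclass == use_ref_from_omegaconf
    return use_ref_from_dataclass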
openhands-agent committed Jun 21, 2025
commit d7aa12b8bc96125f6acb540f91dd476833d66a95
2 changes: 1 addition & 1 deletion docs/algo/dapo.md
@@ -90,7 +90,7 @@ prompt_bsz = self.config.data.train_batch_size
 if num_prompt_in_batch < prompt_bsz:
     print(f'{num_prompt_in_batch=} < {prompt_bsz=}')
     num_gen_batches += 1
-    max_num_gen_batches = self.algorithm_config.filter_groups.max_num_gen_batches
+    max_num_gen_batches = self.algo_config.filter_groups.max_num_gen_batches
     if max_num_gen_batches <= 0 or num_gen_batches < max_num_gen_batches:
         print(f'{num_gen_batches=} < {max_num_gen_batches=}. Keep generating...')
         continue
8 changes: 4 additions & 4 deletions docs/workers/ray_trainer.rst
@@ -177,14 +177,14 @@ To extend to other RLHF algorithms, such as DPO, GRPO, please refer to
 # compute rewards. apply_kl_penalty if available
 batch, kl_metrics = apply_kl_penalty(batch,
                                      kl_ctrl=self.kl_ctrl_in_reward,
-                                     kl_penalty=self.algorithm_config.kl_penalty)
+                                     kl_penalty=self.algo_config.kl_penalty)
 metrics.update(kl_metrics)

 # compute advantages, executed on the driver process
 batch = compute_advantage(batch,
-                          self.algorithm_config.gamma,
-                          self.algorithm_config.lam,
-                          adv_estimator=self.algorithm_config.adv_estimator)
+                          self.algo_config.gamma,
+                          self.algo_config.lam,
+                          adv_estimator=self.algo_config.adv_estimator)
 metrics['timing/adv'] = timer.last

 # update critic
7 changes: 1 addition & 6 deletions examples/split_placement/main_ppo_split.py
@@ -22,8 +22,6 @@

 from verl import DataProto
 from verl.trainer.ppo.ray_trainer import RayPPOTrainer
-from verl.trainer.config.algorithm_config import AlgorithmConfig
-from verl.utils.config import omega_conf_to_dataclass
 from verl.utils.reward_score import gsm8k, math


@@ -168,11 +166,8 @@ def main_task(config):
     Role.Critic: critic_pool_id,
 }

-# Convert algorithm config to dataclass
-algorithm_config = omega_conf_to_dataclass(config.algorithm, AlgorithmConfig)
-
 # use reference model
-if algorithm_config.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
+if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
     role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker)
     mapping[Role.RefPolicy] = actor_rollout_ref_pool_id

16 changes: 7 additions & 9 deletions examples/split_placement/split_monkey_patch.py
@@ -31,8 +31,6 @@
     compute_data_metrics,
     compute_timing_metrics,
 )
-from verl.trainer.config.algorithm_config import AlgorithmConfig
-from verl.utils.config import omega_conf_to_dataclass
 from verl.utils.metric import reduce_metrics


@@ -90,7 +88,7 @@ def fit(self):
 timing_raw.update(gen_batch_output.meta_info["timing"])
 gen_batch_output.meta_info.pop("timing", None)

-if self.algorithm_config.adv_estimator == AdvantageEstimator.REMAX:
+if self.config.algorithm.adv_estimator == AdvantageEstimator.REMAX:
     with _timer("gen_max", timing_raw):
         gen_baseline_batch = deepcopy(gen_batch)
         gen_baseline_batch.meta_info["do_sample"] = False
@@ -152,19 +150,19 @@ def fit(self):
 batch.batch["token_level_scores"] = reward_tensor

 # compute rewards. apply_kl_penalty if available
-if self.algorithm_config.use_kl_in_reward:
-    batch, kl_metrics = apply_kl_penalty(batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.algorithm_config.kl_penalty)
+if self.config.algorithm.use_kl_in_reward:
+    batch, kl_metrics = apply_kl_penalty(batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.config.algorithm.kl_penalty)
     metrics.update(kl_metrics)
 else:
     batch.batch["token_level_rewards"] = batch.batch["token_level_scores"]

 # compute advantages, executed on the driver process
-norm_adv_by_std_in_grpo = self.algorithm_config.norm_adv_by_std_in_grpo
+norm_adv_by_std_in_grpo = self.config.algorithm.get("norm_adv_by_std_in_grpo", True)
 batch = compute_advantage(
     batch,
-    adv_estimator=self.algorithm_config.adv_estimator,
-    gamma=self.algorithm_config.gamma,
-    lam=self.algorithm_config.lam,
+    adv_estimator=self.config.algorithm.adv_estimator,
+    gamma=self.config.algorithm.gamma,
+    lam=self.config.algorithm.lam,
     num_repeat=self.config.actor_rollout_ref.rollout.n,
     norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo,
 )
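Side note on the .get(...) fallback introduced in the reverted files: a raw OmegaConf node may simply not define norm_adv_by_std_in_grpo, so the code asks OmegaConf for a default instead of relying on a dataclass field. Illustrative snippet only; the config contents are made up:

from omegaconf import OmegaConf

# Hypothetical algorithm node that omits the optional key.
algorithm = OmegaConf.create({"adv_estimator": "grpo", "gamma": 1.0, "lam": 1.0})

# DictConfig.get returns the supplied default when the key is absent.
norm_adv_by_std_in_grpo = algorithm.get("norm_adv_by_std_in_grpo", True)
assert norm_adv_by_std_in_grpo is True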
22 changes: 11 additions & 11 deletions recipe/dapo/dapo_ray_trainer.py
@@ -34,7 +34,7 @@
     reduce_metrics,
 )
 from verl.trainer.ppo.ray_trainer import AdvantageEstimator, RayPPOTrainer, apply_kl_penalty, compute_advantage, compute_response_mask
-from verl.trainer.config.algorithm_config import AlgorithmConfig
+from verl.trainer.config.algo_config import AlgoConfig
 from verl.utils.config import omega_conf_to_dataclass
 from verl.utils.debug import marked_timer

@@ -125,7 +125,7 @@ def fit(self):
 timing_raw.update(gen_batch_output.meta_info["timing"])
 gen_batch_output.meta_info.pop("timing", None)

-if self.algorithm_config.adv_estimator == AdvantageEstimator.REMAX:
+if self.algo_config.adv_estimator == AdvantageEstimator.REMAX:
     with marked_timer("gen_max", timing_raw, "red"):
         gen_baseline_batch = deepcopy(gen_batch)
         gen_baseline_batch.meta_info["do_sample"] = False
@@ -172,17 +172,17 @@ def fit(self):
 new_batch.non_tensor_batch.update({k: np.array(v) for k, v in reward_extra_infos_dict.items()})

 # compute rewards. apply_kl_penalty if available
-if self.algorithm_config.use_kl_in_reward:
-    new_batch, kl_metrics = apply_kl_penalty(new_batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.algorithm_config.kl_penalty)
+if self.algo_config.use_kl_in_reward:
+    new_batch, kl_metrics = apply_kl_penalty(new_batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.algo_config.kl_penalty)
     metrics.update(kl_metrics)  # TODO: This will be cleared if we use multiple genenration batches
 else:
     new_batch.batch["token_level_rewards"] = new_batch.batch["token_level_scores"]

-if not self.algorithm_config.filter_groups.enable:
+if not self.algo_config.filter_groups.enable:
     batch = new_batch
 else:  # NOTE: When prompts after filtering is less than train batch size,
     # we skip to the next generation batch
-    metric_name = self.algorithm_config.filter_groups.metric
+    metric_name = self.algo_config.filter_groups.metric
     if metric_name == "seq_final_reward":
         # Turn to numpy for easier filtering
         new_batch.non_tensor_batch["seq_final_reward"] = new_batch.batch["token_level_rewards"].sum(dim=-1).numpy()
@@ -212,7 +212,7 @@ def fit(self):
 prompt_bsz = self.config.data.train_batch_size
 if num_prompt_in_batch < prompt_bsz:
     print(f"{num_prompt_in_batch=} < {prompt_bsz=}")
-    max_num_gen_batches = self.algorithm_config.filter_groups.max_num_gen_batches
+    max_num_gen_batches = self.algo_config.filter_groups.max_num_gen_batches
     if max_num_gen_batches <= 0 or num_gen_batches < max_num_gen_batches:
         print(f"{num_gen_batches=}. Keep generating...")
         progress_bar.update(1)
@@ -265,12 +265,12 @@ def fit(self):

 with marked_timer("adv", timing_raw, "brown"):
     # compute advantages, executed on the driver process
-    norm_adv_by_std_in_grpo = self.algorithm_config.norm_adv_by_std_in_grpo
+    norm_adv_by_std_in_grpo = self.algo_config.norm_adv_by_std_in_grpo
     batch = compute_advantage(
         batch,
-        adv_estimator=self.algorithm_config.adv_estimator,
-        gamma=self.algorithm_config.gamma,
-        lam=self.algorithm_config.lam,
+        adv_estimator=self.algo_config.adv_estimator,
+        gamma=self.algo_config.gamma,
+        lam=self.algo_config.lam,
         num_repeat=self.config.actor_rollout_ref.rollout.n,
         norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo,
     )
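For orientation, the filter-groups gating that these hunks touch, condensed into a standalone sketch. It relies only on the enable and max_num_gen_batches fields visible in this diff and is not the trainer's actual control flow:

def should_keep_generating(algo_config, num_prompt_in_batch, prompt_bsz, num_gen_batches):
    # Dynamic filtering disabled: train on whatever the current generation batch produced.
    if not algo_config.filter_groups.enable:
        return False
    # Enough prompts survived filtering to fill one train batch.
    if num_prompt_in_batch >= prompt_bsz:
        return False
    max_num_gen_batches = algo_config.filter_groups.max_num_gen_batches
    # A non-positive cap means "no limit"; otherwise keep generating until the cap is hit.
    return max_num_gen_batches <= 0 or num_gen_batches < max_num_gen_batches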
6 changes: 3 additions & 3 deletions recipe/dapo/main_dapo.py
@@ -23,7 +23,7 @@
 from omegaconf import OmegaConf

 from verl.trainer.ppo.reward import get_custom_reward_fn
-from verl.trainer.config.algorithm_config import AlgorithmConfig
+from verl.trainer.config.algo_config import AlgoConfig
 from verl.utils.config import omega_conf_to_dataclass

 from .dapo_ray_trainer import RayDAPOTrainer
@@ -125,10 +125,10 @@ def run(self, config):
 mapping[Role.RewardModel] = global_pool_id

 # Convert algorithm config to dataclass
-algorithm_config = omega_conf_to_dataclass(config.algorithm, AlgorithmConfig)
+algo_config = omega_conf_to_dataclass(config.algorithm, AlgoConfig)

 # reference model
-if algorithm_config.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
+if algo_config.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
     role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker)
     mapping[Role.RefPolicy] = global_pool_id

22 changes: 10 additions & 12 deletions recipe/entropy/entropy_ray_trainer.py
@@ -33,8 +33,6 @@
     reduce_metrics,
 )
 from verl.trainer.ppo.ray_trainer import AdvantageEstimator, RayPPOTrainer, apply_kl_penalty, compute_advantage, compute_response_mask
-from verl.trainer.config.algorithm_config import AlgorithmConfig
-from verl.utils.config import omega_conf_to_dataclass
 from verl.utils.debug import simple_timer


@@ -119,7 +117,7 @@ def fit(self):
 gen_batch_output = self.async_rollout_manager.generate_sequences(gen_batch)
 self.async_rollout_manager.sleep()

-if self.algorithm_config.adv_estimator == AdvantageEstimator.REMAX:
+if self.config.algorithm.adv_estimator == AdvantageEstimator.REMAX:
     with simple_timer("gen_max", timing_raw):
         gen_baseline_batch = deepcopy(gen_batch)
         gen_baseline_batch.meta_info["do_sample"] = False
@@ -167,17 +165,17 @@ def fit(self):
 new_batch.non_tensor_batch.update({k: np.array(v) for k, v in reward_extra_infos_dict.items()})

 # compute rewards. apply_kl_penalty if available
-if self.algorithm_config.use_kl_in_reward:
-    new_batch, kl_metrics = apply_kl_penalty(new_batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.algorithm_config.kl_penalty)
+if self.config.algorithm.use_kl_in_reward:
+    new_batch, kl_metrics = apply_kl_penalty(new_batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.config.algorithm.kl_penalty)
     metrics.update(kl_metrics)  # TODO: This will be cleared if we use multiple genenration batches
 else:
     new_batch.batch["token_level_rewards"] = new_batch.batch["token_level_scores"]

-if not self.algorithm_config.filter_groups.enable:
+if not self.config.algorithm.filter_groups.enable:
     batch = new_batch
 else:  # NOTE: When prompts after filtering is less than train batch size,
     # we skip to the next generation batch
-    metric_name = self.algorithm_config.filter_groups.metric
+    metric_name = self.config.algorithm.filter_groups.metric
     if metric_name == "seq_final_reward":
         # Turn to numpy for easier filtering
         new_batch.non_tensor_batch["seq_final_reward"] = new_batch.batch["token_level_rewards"].sum(dim=-1).numpy()
@@ -207,7 +205,7 @@ def fit(self):
 prompt_bsz = self.config.data.train_batch_size
 if num_prompt_in_batch < prompt_bsz:
     print(f"{num_prompt_in_batch=} < {prompt_bsz=}")
-    max_num_gen_batches = self.algorithm_config.filter_groups.max_num_gen_batches
+    max_num_gen_batches = self.config.algorithm.filter_groups.max_num_gen_batches
     if max_num_gen_batches <= 0 or num_gen_batches < max_num_gen_batches:
         print(f"{num_gen_batches=}. Keep generating...")
         continue
@@ -251,12 +249,12 @@ def fit(self):

 with simple_timer("adv", timing_raw):
     # compute advantages, executed on the driver process
-    norm_adv_by_std_in_grpo = self.algorithm_config.norm_adv_by_std_in_grpo
+    norm_adv_by_std_in_grpo = self.config.algorithm.get("norm_adv_by_std_in_grpo", True)
     batch = compute_advantage(
         batch,
-        adv_estimator=self.algorithm_config.adv_estimator,
-        gamma=self.algorithm_config.gamma,
-        lam=self.algorithm_config.lam,
+        adv_estimator=self.config.algorithm.adv_estimator,
+        gamma=self.config.algorithm.gamma,
+        lam=self.config.algorithm.lam,
         num_repeat=self.config.actor_rollout_ref.rollout.n,
         norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo,
     )
7 changes: 1 addition & 6 deletions recipe/entropy/main_entropy.py
@@ -20,8 +20,6 @@

 from .entropy_ray_trainer import RayEntropyTrainer
 from .reward import load_reward_manager
-from verl.trainer.config.algorithm_config import AlgorithmConfig
-from verl.utils.config import omega_conf_to_dataclass


 @hydra.main(config_path="config", config_name="entropy_trainer", version_base=None)
@@ -131,11 +129,8 @@ def run(self, config):
 role_worker_mapping[Role.RewardModel] = ray.remote(RewardModelWorker)
 mapping[Role.RewardModel] = global_pool_id

-# Convert algorithm config to dataclass
-algorithm_config = omega_conf_to_dataclass(config.algorithm, AlgorithmConfig)
-
 # use reference model
-if algorithm_config.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
+if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
     role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker)
     mapping[Role.RefPolicy] = global_pool_id

7 changes: 1 addition & 6 deletions recipe/prime/main_prime.py
@@ -33,8 +33,6 @@
 import ray

 from .prime_ray_trainer import RayPRIMETrainer
-from verl.trainer.config.algorithm_config import AlgorithmConfig
-from verl.utils.config import omega_conf_to_dataclass


 @hydra.main(config_path="config", config_name="prime_trainer", version_base=None)
@@ -105,11 +103,8 @@ def main_task(config, compute_score=None):
     Role.ActorRollout: global_pool_id,
 }

-# Convert algorithm config to dataclass
-algorithm_config = omega_conf_to_dataclass(config.algorithm, AlgorithmConfig)
-
 # use reference model
-if algorithm_config.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
+if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
     role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker)
     mapping[Role.RefPolicy] = global_pool_id

6 changes: 2 additions & 4 deletions recipe/prime/prime_ray_trainer.py
@@ -31,9 +31,7 @@
 from verl.trainer.ppo.core_algos import agg_loss
 from verl.trainer.ppo.metric_utils import _compute_response_info
 from verl.trainer.ppo.ray_trainer import RayPPOTrainer, ResourcePoolManager, Role, WorkerType
-from verl.trainer.config.algorithm_config import AlgorithmConfig
 from verl.utils.checkpoint.checkpoint_manager import find_latest_ckpt_path
-from verl.utils.config import omega_conf_to_dataclass
 from verl.utils.dataset.rl_dataset import RLHFDataset, collate_fn
 from verl.utils.debug.performance import simple_timer
 from verl.utils.metric import reduce_metrics
@@ -359,7 +357,7 @@ def fit(self):
 timing_raw.update(gen_batch_output.meta_info["timing"])
 gen_batch_output.meta_info.pop("timing", None)

-if self.algorithm_config.adv_estimator == "remax":
+if self.config.algorithm.adv_estimator == "remax":
     with simple_timer("gen_max", timing_raw):
         gen_baseline_batch = deepcopy(gen_batch)
         gen_baseline_batch.meta_info["do_sample"] = False
@@ -454,7 +452,7 @@ def fit(self):
 metrics.update(reward_output_metrics)

 # compute advantages, executed on the driver process
-batch = compute_advantage(batch, adv_estimator=self.algorithm_config.adv_estimator, config=self.config)
+batch = compute_advantage(batch, adv_estimator=self.config.algorithm.adv_estimator, config=self.config)

 # update actor
 with simple_timer("update_actor", timing_raw):
7 changes: 1 addition & 6 deletions recipe/sppo/main_sppo.py
@@ -23,8 +23,6 @@
 import ray

 from verl.trainer.ppo.reward import load_reward_manager
-from verl.trainer.config.algorithm_config import AlgorithmConfig
-from verl.utils.config import omega_conf_to_dataclass

 from .sppo_ray_trainer import RaySPPOTrainer

@@ -124,11 +122,8 @@ def run(self, config):
 role_worker_mapping[Role.RewardModel] = ray.remote(RewardModelWorker)
 mapping[Role.RewardModel] = global_pool_id

-# Convert algorithm config to dataclass
-algorithm_config = omega_conf_to_dataclass(config.algorithm, AlgorithmConfig)
-
 # use reference model
-if algorithm_config.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
+if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
     role_worker_mapping[Role.RefPolicy] = ray.remote(SPPOActorRolloutRefWorker)
     mapping[Role.RefPolicy] = global_pool_id
