Merged
Changes from 1 commit
Commits (36):
4e1c05e  [trainer, cfg] feat: Add AlgorithmConfig dataclass for type-safe algo… (openhands-agent, Jun 20, 2025)
9ed10fa  Complete algorithm config dataclass migration (openhands-agent, Jun 21, 2025)
646a1e7  Remove temporary test files (openhands-agent, Jun 21, 2025)
d7aa12b  Revert changes and rename algorithm config to algo config (openhands-agent, Jun 21, 2025)
109340d  Update compute_advantage type annotations and fix linting (openhands-agent, Jun 21, 2025)
89e4b34  Update all core_algos.py functions to use AlgoConfig type annotations (openhands-agent, Jun 21, 2025)
f0f406f  Fix compute_grpo_outcome_advantage function signature to include Algo… (openhands-agent, Jun 21, 2025)
637a358  Merge main into feat/algorithm-config-dataclass (openhands-agent, Jun 22, 2025)
9eeab2e  init frozen adaptor (eric-haibin-lin, Jun 29, 2025)
1b85290  move to profiler folder (eric-haibin-lin, Jun 30, 2025)
ba93223  backward compat namespace move (eric-haibin-lin, Jun 30, 2025)
da8d771  fix lint (eric-haibin-lin, Jun 30, 2025)
0b1cb62  remove omega_conf_to_dataclass type (eric-haibin-lin, Jun 30, 2025)
2c25c76  Refactor algorithm config classes to use frozen dataclasses and BaseC… (devin-ai-integration[bot], Jun 30, 2025)
520b23d  Revert documentation changes and fix omega_conf_to_dataclass call (devin-ai-integration[bot], Jun 30, 2025)
80685b4  Fix config.get() call in compute_advantage function (devin-ai-integration[bot], Jun 30, 2025)
2df1773  Merge main branch and resolve conflicts (devin-ai-integration[bot], Jun 30, 2025)
52c62b3  Fix lint issues after merge (devin-ai-integration[bot], Jun 30, 2025)
562a111  Fix type annotation and docstring coverage issues (devin-ai-integration[bot], Jun 30, 2025)
81d7edf  Add test_base_config_on_cpu.py to allow list and update omega_conf_to… (devin-ai-integration[bot], Jun 30, 2025)
a6df414  fix test (eric-haibin-lin, Jun 30, 2025)
6e743a5  fix litn (eric-haibin-lin, Jun 30, 2025)
ffa8d77  convert to dataclass upfront (eric-haibin-lin, Jun 30, 2025)
12c22b8  Merge branch 'feat/algorithm-config-dataclass' of code.byted.org:data… (eric-haibin-lin, Jun 30, 2025)
e2fac2c  update import stmt (eric-haibin-lin, Jun 30, 2025)
969a734  merge with main (eric-haibin-lin, Jun 30, 2025)
69a1a17  fix lint (eric-haibin-lin, Jun 30, 2025)
f1f4047  add _target_ to megatron config (eric-haibin-lin, Jun 30, 2025)
7bcd0fe  fix ranks init (eric-haibin-lin, Jun 30, 2025)
0eacb9f  adjust line-len (eric-haibin-lin, Jul 1, 2025)
ac19891  adjust len=120 (eric-haibin-lin, Jul 1, 2025)
c907607  merge with main (eric-haibin-lin, Jul 1, 2025)
e63bbb0  fix lint (eric-haibin-lin, Jul 1, 2025)
8bce67d  merge with master (eric-haibin-lin, Jul 3, 2025)
fb93f20  merge with main (eric-haibin-lin, Jul 4, 2025)
c195f00  Merge remote-tracking branch 'oss/main' into feat/algorithm-config-da… (eric-haibin-lin, Jul 4, 2025)
Complete algorithm config dataclass migration
- Updated split_monkey_patch.py: converted all 6 config.algorithm references to algorithm_config
- Updated main_ppo_split.py: added imports and algorithm config conversion
- Updated documentation files: ray_trainer.rst and dapo.md to use algorithm_config
- Fixed remaining config.algorithm references in sppo_ray_trainer.py __init__ method
- Extended AlgorithmConfig with FilterGroupsConfig and sppo_eta fields for full compatibility
- Added verification tests to ensure all changes work correctly
- All trainer files now consistently use self.algorithm_config pattern
- All main files include proper algorithm config conversion
- Documentation updated to reflect new usage patterns
openhands-agent committed Jun 21, 2025
commit 9ed10fad98e4953e58de30ba1f23e30702f13d1d
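
The same pattern is applied in every file touched by this commit: convert the OmegaConf node config.algorithm into an AlgorithmConfig dataclass once, then read typed fields from it instead of dotted config lookups. The following is a minimal sketch of that pattern, not verl's actual implementation; only the two-argument call omega_conf_to_dataclass(config.algorithm, AlgorithmConfig) and the field names come from this diff, while the defaults and the helper body are illustrative assumptions.

# Illustrative sketch only; field defaults and the helper body are assumptions.
from dataclasses import dataclass, field
from typing import Optional

from omegaconf import OmegaConf


@dataclass
class FilterGroupsConfig:
    enable: bool = False
    metric: Optional[str] = None
    max_num_gen_batches: int = 0


@dataclass
class AlgorithmConfig:
    adv_estimator: str = "gae"
    gamma: float = 1.0
    lam: float = 1.0
    use_kl_in_reward: bool = False
    kl_penalty: str = "kl"
    norm_adv_by_std_in_grpo: bool = True
    sppo_eta: float = 1.0
    filter_groups: FilterGroupsConfig = field(default_factory=FilterGroupsConfig)


def omega_conf_to_dataclass(cfg, dataclass_type):
    """Stand-in for verl.utils.config.omega_conf_to_dataclass: validate the raw
    OmegaConf node against the dataclass schema and materialize an instance."""
    schema = OmegaConf.structured(dataclass_type)
    return OmegaConf.to_object(OmegaConf.merge(schema, cfg))


# Usage mirroring the main_* entry points in this diff:
#   algorithm_config = omega_conf_to_dataclass(config.algorithm, AlgorithmConfig)
#   if algorithm_config.use_kl_in_reward: ...
# Trainers then read self.algorithm_config.<field> instead of self.config.algorithm.<field>.
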
2 changes: 1 addition & 1 deletion docs/algo/dapo.md
@@ -90,7 +90,7 @@ prompt_bsz = self.config.data.train_batch_size
if num_prompt_in_batch < prompt_bsz:
print(f'{num_prompt_in_batch=} < {prompt_bsz=}')
num_gen_batches += 1
-max_num_gen_batches = self.config.algorithm.filter_groups.max_num_gen_batches
+max_num_gen_batches = self.algorithm_config.filter_groups.max_num_gen_batches
if max_num_gen_batches <= 0 or num_gen_batches < max_num_gen_batches:
print(f'{num_gen_batches=} < {max_num_gen_batches=}. Keep generating...')
continue
8 changes: 4 additions & 4 deletions docs/workers/ray_trainer.rst
@@ -177,14 +177,14 @@ To extend to other RLHF algorithms, such as DPO, GRPO, please refer to
# compute rewards. apply_kl_penalty if available
batch, kl_metrics = apply_kl_penalty(batch,
kl_ctrl=self.kl_ctrl_in_reward,
-kl_penalty=self.config.algorithm.kl_penalty)
+kl_penalty=self.algorithm_config.kl_penalty)
metrics.update(kl_metrics)

# compute advantages, executed on the driver process
batch = compute_advantage(batch,
-self.config.algorithm.gamma,
-self.config.algorithm.lam,
-adv_estimator=self.config.algorithm.adv_estimator)
+self.algorithm_config.gamma,
+self.algorithm_config.lam,
+adv_estimator=self.algorithm_config.adv_estimator)
metrics['timing/adv'] = timer.last

# update critic
7 changes: 6 additions & 1 deletion examples/split_placement/main_ppo_split.py
@@ -22,6 +22,8 @@

from verl import DataProto
from verl.trainer.ppo.ray_trainer import RayPPOTrainer
+from verl.trainer.config.algorithm_config import AlgorithmConfig
+from verl.utils.config import omega_conf_to_dataclass
from verl.utils.reward_score import gsm8k, math


@@ -166,8 +168,11 @@ def main_task(config):
Role.Critic: critic_pool_id,
}

+# Convert algorithm config to dataclass
+algorithm_config = omega_conf_to_dataclass(config.algorithm, AlgorithmConfig)
+
# use reference model
-if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
+if algorithm_config.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker)
mapping[Role.RefPolicy] = actor_rollout_ref_pool_id

16 changes: 9 additions & 7 deletions examples/split_placement/split_monkey_patch.py
@@ -31,6 +31,8 @@
compute_data_metrics,
compute_timing_metrics,
)
+from verl.trainer.config.algorithm_config import AlgorithmConfig
+from verl.utils.config import omega_conf_to_dataclass
from verl.utils.metric import reduce_metrics


@@ -88,7 +90,7 @@ def fit(self):
timing_raw.update(gen_batch_output.meta_info["timing"])
gen_batch_output.meta_info.pop("timing", None)

-if self.config.algorithm.adv_estimator == AdvantageEstimator.REMAX:
+if self.algorithm_config.adv_estimator == AdvantageEstimator.REMAX:
with _timer("gen_max", timing_raw):
gen_baseline_batch = deepcopy(gen_batch)
gen_baseline_batch.meta_info["do_sample"] = False
@@ -150,19 +152,19 @@ def fit(self):
batch.batch["token_level_scores"] = reward_tensor

# compute rewards. apply_kl_penalty if available
-if self.config.algorithm.use_kl_in_reward:
-batch, kl_metrics = apply_kl_penalty(batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.config.algorithm.kl_penalty)
+if self.algorithm_config.use_kl_in_reward:
+batch, kl_metrics = apply_kl_penalty(batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.algorithm_config.kl_penalty)
metrics.update(kl_metrics)
else:
batch.batch["token_level_rewards"] = batch.batch["token_level_scores"]

# compute advantages, executed on the driver process
-norm_adv_by_std_in_grpo = self.config.algorithm.get("norm_adv_by_std_in_grpo", True)
+norm_adv_by_std_in_grpo = self.algorithm_config.norm_adv_by_std_in_grpo
batch = compute_advantage(
batch,
-adv_estimator=self.config.algorithm.adv_estimator,
-gamma=self.config.algorithm.gamma,
-lam=self.config.algorithm.lam,
+adv_estimator=self.algorithm_config.adv_estimator,
+gamma=self.algorithm_config.gamma,
+lam=self.algorithm_config.lam,
num_repeat=self.config.actor_rollout_ref.rollout.n,
norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo,
)
22 changes: 12 additions & 10 deletions recipe/dapo/dapo_ray_trainer.py
@@ -34,6 +34,8 @@
reduce_metrics,
)
from verl.trainer.ppo.ray_trainer import AdvantageEstimator, RayPPOTrainer, apply_kl_penalty, compute_advantage, compute_response_mask
+from verl.trainer.config.algorithm_config import AlgorithmConfig
+from verl.utils.config import omega_conf_to_dataclass
from verl.utils.debug import marked_timer


@@ -123,7 +125,7 @@ def fit(self):
timing_raw.update(gen_batch_output.meta_info["timing"])
gen_batch_output.meta_info.pop("timing", None)

-if self.config.algorithm.adv_estimator == AdvantageEstimator.REMAX:
+if self.algorithm_config.adv_estimator == AdvantageEstimator.REMAX:
with marked_timer("gen_max", timing_raw, "red"):
gen_baseline_batch = deepcopy(gen_batch)
gen_baseline_batch.meta_info["do_sample"] = False
@@ -170,17 +172,17 @@ def fit(self):
new_batch.non_tensor_batch.update({k: np.array(v) for k, v in reward_extra_infos_dict.items()})

# compute rewards. apply_kl_penalty if available
-if self.config.algorithm.use_kl_in_reward:
-new_batch, kl_metrics = apply_kl_penalty(new_batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.config.algorithm.kl_penalty)
+if self.algorithm_config.use_kl_in_reward:
+new_batch, kl_metrics = apply_kl_penalty(new_batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.algorithm_config.kl_penalty)
metrics.update(kl_metrics) # TODO: This will be cleared if we use multiple genenration batches
else:
new_batch.batch["token_level_rewards"] = new_batch.batch["token_level_scores"]

-if not self.config.algorithm.filter_groups.enable:
+if not self.algorithm_config.filter_groups.enable:
batch = new_batch
else: # NOTE: When prompts after filtering is less than train batch size,
# we skip to the next generation batch
-metric_name = self.config.algorithm.filter_groups.metric
+metric_name = self.algorithm_config.filter_groups.metric
if metric_name == "seq_final_reward":
# Turn to numpy for easier filtering
new_batch.non_tensor_batch["seq_final_reward"] = new_batch.batch["token_level_rewards"].sum(dim=-1).numpy()
@@ -210,7 +212,7 @@ def fit(self):
prompt_bsz = self.config.data.train_batch_size
if num_prompt_in_batch < prompt_bsz:
print(f"{num_prompt_in_batch=} < {prompt_bsz=}")
-max_num_gen_batches = self.config.algorithm.filter_groups.max_num_gen_batches
+max_num_gen_batches = self.algorithm_config.filter_groups.max_num_gen_batches
if max_num_gen_batches <= 0 or num_gen_batches < max_num_gen_batches:
print(f"{num_gen_batches=}. Keep generating...")
progress_bar.update(1)
@@ -263,12 +265,12 @@ def fit(self):

with marked_timer("adv", timing_raw, "brown"):
# compute advantages, executed on the driver process
-norm_adv_by_std_in_grpo = self.config.algorithm.get("norm_adv_by_std_in_grpo", True)
+norm_adv_by_std_in_grpo = self.algorithm_config.norm_adv_by_std_in_grpo
batch = compute_advantage(
batch,
-adv_estimator=self.config.algorithm.adv_estimator,
-gamma=self.config.algorithm.gamma,
-lam=self.config.algorithm.lam,
+adv_estimator=self.algorithm_config.adv_estimator,
+gamma=self.algorithm_config.gamma,
+lam=self.algorithm_config.lam,
num_repeat=self.config.actor_rollout_ref.rollout.n,
norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo,
)
7 changes: 6 additions & 1 deletion recipe/dapo/main_dapo.py
@@ -23,6 +23,8 @@
from omegaconf import OmegaConf

from verl.trainer.ppo.reward import get_custom_reward_fn
+from verl.trainer.config.algorithm_config import AlgorithmConfig
+from verl.utils.config import omega_conf_to_dataclass

from .dapo_ray_trainer import RayDAPOTrainer

@@ -122,8 +124,11 @@ def run(self, config):
role_worker_mapping[Role.RewardModel] = ray.remote(RewardModelWorker)
mapping[Role.RewardModel] = global_pool_id

+# Convert algorithm config to dataclass
+algorithm_config = omega_conf_to_dataclass(config.algorithm, AlgorithmConfig)
+
# reference model
-if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
+if algorithm_config.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker)
mapping[Role.RefPolicy] = global_pool_id

22 changes: 12 additions & 10 deletions recipe/entropy/entropy_ray_trainer.py
@@ -33,6 +33,8 @@
reduce_metrics,
)
from verl.trainer.ppo.ray_trainer import AdvantageEstimator, RayPPOTrainer, apply_kl_penalty, compute_advantage, compute_response_mask
+from verl.trainer.config.algorithm_config import AlgorithmConfig
+from verl.utils.config import omega_conf_to_dataclass
from verl.utils.debug import simple_timer


@@ -117,7 +119,7 @@ def fit(self):
gen_batch_output = self.async_rollout_manager.generate_sequences(gen_batch)
self.async_rollout_manager.sleep()

-if self.config.algorithm.adv_estimator == AdvantageEstimator.REMAX:
+if self.algorithm_config.adv_estimator == AdvantageEstimator.REMAX:
with simple_timer("gen_max", timing_raw):
gen_baseline_batch = deepcopy(gen_batch)
gen_baseline_batch.meta_info["do_sample"] = False
@@ -165,17 +167,17 @@ def fit(self):
new_batch.non_tensor_batch.update({k: np.array(v) for k, v in reward_extra_infos_dict.items()})

# compute rewards. apply_kl_penalty if available
-if self.config.algorithm.use_kl_in_reward:
-new_batch, kl_metrics = apply_kl_penalty(new_batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.config.algorithm.kl_penalty)
+if self.algorithm_config.use_kl_in_reward:
+new_batch, kl_metrics = apply_kl_penalty(new_batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.algorithm_config.kl_penalty)
metrics.update(kl_metrics) # TODO: This will be cleared if we use multiple genenration batches
else:
new_batch.batch["token_level_rewards"] = new_batch.batch["token_level_scores"]

-if not self.config.algorithm.filter_groups.enable:
+if not self.algorithm_config.filter_groups.enable:
batch = new_batch
else: # NOTE: When prompts after filtering is less than train batch size,
# we skip to the next generation batch
-metric_name = self.config.algorithm.filter_groups.metric
+metric_name = self.algorithm_config.filter_groups.metric
if metric_name == "seq_final_reward":
# Turn to numpy for easier filtering
new_batch.non_tensor_batch["seq_final_reward"] = new_batch.batch["token_level_rewards"].sum(dim=-1).numpy()
@@ -205,7 +207,7 @@ def fit(self):
prompt_bsz = self.config.data.train_batch_size
if num_prompt_in_batch < prompt_bsz:
print(f"{num_prompt_in_batch=} < {prompt_bsz=}")
-max_num_gen_batches = self.config.algorithm.filter_groups.max_num_gen_batches
+max_num_gen_batches = self.algorithm_config.filter_groups.max_num_gen_batches
if max_num_gen_batches <= 0 or num_gen_batches < max_num_gen_batches:
print(f"{num_gen_batches=}. Keep generating...")
continue
@@ -249,12 +251,12 @@ def fit(self):

with simple_timer("adv", timing_raw):
# compute advantages, executed on the driver process
-norm_adv_by_std_in_grpo = self.config.algorithm.get("norm_adv_by_std_in_grpo", True)
+norm_adv_by_std_in_grpo = self.algorithm_config.norm_adv_by_std_in_grpo
batch = compute_advantage(
batch,
-adv_estimator=self.config.algorithm.adv_estimator,
-gamma=self.config.algorithm.gamma,
-lam=self.config.algorithm.lam,
+adv_estimator=self.algorithm_config.adv_estimator,
+gamma=self.algorithm_config.gamma,
+lam=self.algorithm_config.lam,
num_repeat=self.config.actor_rollout_ref.rollout.n,
norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo,
)
7 changes: 6 additions & 1 deletion recipe/entropy/main_entropy.py
@@ -20,6 +20,8 @@

from .entropy_ray_trainer import RayEntropyTrainer
from .reward import load_reward_manager
+from verl.trainer.config.algorithm_config import AlgorithmConfig
+from verl.utils.config import omega_conf_to_dataclass


@hydra.main(config_path="config", config_name="entropy_trainer", version_base=None)
@@ -129,8 +131,11 @@ def run(self, config):
role_worker_mapping[Role.RewardModel] = ray.remote(RewardModelWorker)
mapping[Role.RewardModel] = global_pool_id

+# Convert algorithm config to dataclass
+algorithm_config = omega_conf_to_dataclass(config.algorithm, AlgorithmConfig)
+
# use reference model
-if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
+if algorithm_config.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker)
mapping[Role.RefPolicy] = global_pool_id

7 changes: 6 additions & 1 deletion recipe/prime/main_prime.py
@@ -33,6 +33,8 @@
import ray

from .prime_ray_trainer import RayPRIMETrainer
+from verl.trainer.config.algorithm_config import AlgorithmConfig
+from verl.utils.config import omega_conf_to_dataclass


@hydra.main(config_path="config", config_name="prime_trainer", version_base=None)
@@ -103,8 +105,11 @@ def main_task(config, compute_score=None):
Role.ActorRollout: global_pool_id,
}

+# Convert algorithm config to dataclass
+algorithm_config = omega_conf_to_dataclass(config.algorithm, AlgorithmConfig)
+
# use reference model
-if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
+if algorithm_config.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker)
mapping[Role.RefPolicy] = global_pool_id

6 changes: 4 additions & 2 deletions recipe/prime/prime_ray_trainer.py
@@ -31,7 +31,9 @@
from verl.trainer.ppo.core_algos import agg_loss
from verl.trainer.ppo.metric_utils import _compute_response_info
from verl.trainer.ppo.ray_trainer import RayPPOTrainer, ResourcePoolManager, Role, WorkerType
+from verl.trainer.config.algorithm_config import AlgorithmConfig
from verl.utils.checkpoint.checkpoint_manager import find_latest_ckpt_path
+from verl.utils.config import omega_conf_to_dataclass
from verl.utils.dataset.rl_dataset import RLHFDataset, collate_fn
from verl.utils.debug.performance import simple_timer
from verl.utils.metric import reduce_metrics
@@ -357,7 +359,7 @@ def fit(self):
timing_raw.update(gen_batch_output.meta_info["timing"])
gen_batch_output.meta_info.pop("timing", None)

-if self.config.algorithm.adv_estimator == "remax":
+if self.algorithm_config.adv_estimator == "remax":
with simple_timer("gen_max", timing_raw):
gen_baseline_batch = deepcopy(gen_batch)
gen_baseline_batch.meta_info["do_sample"] = False
@@ -452,7 +454,7 @@ def fit(self):
metrics.update(reward_output_metrics)

# compute advantages, executed on the driver process
-batch = compute_advantage(batch, adv_estimator=self.config.algorithm.adv_estimator, config=self.config)
+batch = compute_advantage(batch, adv_estimator=self.algorithm_config.adv_estimator, config=self.config)

# update actor
with simple_timer("update_actor", timing_raw):
7 changes: 6 additions & 1 deletion recipe/sppo/main_sppo.py
@@ -23,6 +23,8 @@
import ray

from verl.trainer.ppo.reward import load_reward_manager
+from verl.trainer.config.algorithm_config import AlgorithmConfig
+from verl.utils.config import omega_conf_to_dataclass

from .sppo_ray_trainer import RaySPPOTrainer

@@ -122,8 +124,11 @@ def run(self, config):
role_worker_mapping[Role.RewardModel] = ray.remote(RewardModelWorker)
mapping[Role.RewardModel] = global_pool_id

+# Convert algorithm config to dataclass
+algorithm_config = omega_conf_to_dataclass(config.algorithm, AlgorithmConfig)
+
# use reference model
-if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
+if algorithm_config.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
role_worker_mapping[Role.RefPolicy] = ray.remote(SPPOActorRolloutRefWorker)
mapping[Role.RefPolicy] = global_pool_id
