Merged
Changes from 1 commit
36 commits
4e1c05e
[trainer, cfg] feat: Add AlgorithmConfig dataclass for type-safe algo…
openhands-agent Jun 20, 2025
9ed10fa
Complete algorithm config dataclass migration
openhands-agent Jun 21, 2025
646a1e7
Remove temporary test files
openhands-agent Jun 21, 2025
d7aa12b
Revert changes and rename algorithm config to algo config
openhands-agent Jun 21, 2025
109340d
Update compute_advantage type annotations and fix linting
openhands-agent Jun 21, 2025
89e4b34
Update all core_algos.py functions to use AlgoConfig type annotations
openhands-agent Jun 21, 2025
f0f406f
Fix compute_grpo_outcome_advantage function signature to include Algo…
openhands-agent Jun 21, 2025
637a358
Merge main into feat/algorithm-config-dataclass
openhands-agent Jun 22, 2025
9eeab2e
init frozen adaptor
eric-haibin-lin Jun 29, 2025
1b85290
move to profiler folder
eric-haibin-lin Jun 30, 2025
ba93223
backward compat namespace move
eric-haibin-lin Jun 30, 2025
da8d771
fix lint
eric-haibin-lin Jun 30, 2025
0b1cb62
remove omega_conf_to_dataclass type
eric-haibin-lin Jun 30, 2025
2c25c76
Refactor algorithm config classes to use frozen dataclasses and BaseC…
devin-ai-integration[bot] Jun 30, 2025
520b23d
Revert documentation changes and fix omega_conf_to_dataclass call
devin-ai-integration[bot] Jun 30, 2025
80685b4
Fix config.get() call in compute_advantage function
devin-ai-integration[bot] Jun 30, 2025
2df1773
Merge main branch and resolve conflicts
devin-ai-integration[bot] Jun 30, 2025
52c62b3
Fix lint issues after merge
devin-ai-integration[bot] Jun 30, 2025
562a111
Fix type annotation and docstring coverage issues
devin-ai-integration[bot] Jun 30, 2025
81d7edf
Add test_base_config_on_cpu.py to allow list and update omega_conf_to…
devin-ai-integration[bot] Jun 30, 2025
a6df414
fix test
eric-haibin-lin Jun 30, 2025
6e743a5
fix litn
eric-haibin-lin Jun 30, 2025
ffa8d77
convert to dataclass upfront
eric-haibin-lin Jun 30, 2025
12c22b8
Merge branch 'feat/algorithm-config-dataclass' of code.byted.org:data…
eric-haibin-lin Jun 30, 2025
e2fac2c
update import stmt
eric-haibin-lin Jun 30, 2025
969a734
merge with main
eric-haibin-lin Jun 30, 2025
69a1a17
fix lint
eric-haibin-lin Jun 30, 2025
f1f4047
add _target_ to megatron config
eric-haibin-lin Jun 30, 2025
7bcd0fe
fix ranks init
eric-haibin-lin Jun 30, 2025
0eacb9f
adjust line-len
eric-haibin-lin Jul 1, 2025
ac19891
adjust len=120
eric-haibin-lin Jul 1, 2025
c907607
merge with main
eric-haibin-lin Jul 1, 2025
e63bbb0
fix lint
eric-haibin-lin Jul 1, 2025
8bce67d
merge with master
eric-haibin-lin Jul 3, 2025
fb93f20
merge with main
eric-haibin-lin Jul 4, 2025
c195f00
Merge remote-tracking branch 'oss/main' into feat/algorithm-config-da…
eric-haibin-lin Jul 4, 2025
Revert changes and rename algorithm config to algo config
- Reverted changes in entropy_ray_trainer.py, prime_ray_trainer.py, sppo_ray_trainer.py, split_monkey_patch.py
- Reverted corresponding main files: main_entropy.py, main_prime.py, main_sppo.py, main_ppo_split.py
- Renamed algorithm_config.py → algo_config.py
- Renamed AlgorithmConfig class → AlgoConfig class
- Updated all remaining references to use algo_config namespace
- Updated imports from algorithm_config to algo_config
- Renamed test file: test_algorithm_config_on_cpu.py → test_algo_config_on_cpu.py
- Updated documentation files to use algo_config
- Maintained functionality in files that were kept: main_ppo.py, ray_trainer.py, dapo files
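
In practical terms, the commit leaves two access patterns in the tree. A minimal sketch of both, assuming only the import paths and attribute names that appear in the diffs below (the helper function itself is hypothetical, not code from this PR):

from verl.trainer.config.algo_config import AlgoConfig
from verl.utils.config import omega_conf_to_dataclass


def choose_ref_policy(config):
    # Kept entry points (main_ppo.py, main_dapo.py): convert the algorithm node to a typed
    # dataclass once, then use attribute access.
    algo_config = omega_conf_to_dataclass(config.algorithm, AlgoConfig)
    use_ref_from_dataclass = algo_config.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss

    # Reverted recipes (entropy, prime, sppo, split placement): read the raw OmegaConf node.
    use_ref_from_omegaconf = config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss

    # Both routes should agree; the dataclass route only adds type safety.
    assert use_ref_from_dataclass == use_ref_from_omegaconf
    return use_ref_from_dataclass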
openhands-agent committed Jun 21, 2025
commit d7aa12b8bc96125f6acb540f91dd476833d66a95
2 changes: 1 addition & 1 deletion docs/algo/dapo.md
@@ -90,7 +90,7 @@ prompt_bsz = self.config.data.train_batch_size
 if num_prompt_in_batch < prompt_bsz:
     print(f'{num_prompt_in_batch=} < {prompt_bsz=}')
     num_gen_batches += 1
-    max_num_gen_batches = self.algorithm_config.filter_groups.max_num_gen_batches
+    max_num_gen_batches = self.algo_config.filter_groups.max_num_gen_batches
     if max_num_gen_batches <= 0 or num_gen_batches < max_num_gen_batches:
         print(f'{num_gen_batches=} < {max_num_gen_batches=}. Keep generating...')
         continue
8 changes: 4 additions & 4 deletions docs/workers/ray_trainer.rst
@@ -177,14 +177,14 @@ To extend to other RLHF algorithms, such as DPO, GRPO, please refer to
 # compute rewards. apply_kl_penalty if available
 batch, kl_metrics = apply_kl_penalty(batch,
                                      kl_ctrl=self.kl_ctrl_in_reward,
-                                     kl_penalty=self.algorithm_config.kl_penalty)
+                                     kl_penalty=self.algo_config.kl_penalty)
 metrics.update(kl_metrics)

 # compute advantages, executed on the driver process
 batch = compute_advantage(batch,
-                          self.algorithm_config.gamma,
-                          self.algorithm_config.lam,
-                          adv_estimator=self.algorithm_config.adv_estimator)
+                          self.algo_config.gamma,
+                          self.algo_config.lam,
+                          adv_estimator=self.algo_config.adv_estimator)
 metrics['timing/adv'] = timer.last

 # update critic
7 changes: 1 addition & 6 deletions examples/split_placement/main_ppo_split.py
@@ -22,8 +22,6 @@

 from verl import DataProto
 from verl.trainer.ppo.ray_trainer import RayPPOTrainer
-from verl.trainer.config.algorithm_config import AlgorithmConfig
-from verl.utils.config import omega_conf_to_dataclass
 from verl.utils.reward_score import gsm8k, math


@@ -168,11 +166,8 @@ def main_task(config):
     Role.Critic: critic_pool_id,
 }

-# Convert algorithm config to dataclass
-algorithm_config = omega_conf_to_dataclass(config.algorithm, AlgorithmConfig)
-
 # use reference model
-if algorithm_config.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
+if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
     role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker)
     mapping[Role.RefPolicy] = actor_rollout_ref_pool_id

16 changes: 7 additions & 9 deletions examples/split_placement/split_monkey_patch.py
@@ -31,8 +31,6 @@
     compute_data_metrics,
     compute_timing_metrics,
 )
-from verl.trainer.config.algorithm_config import AlgorithmConfig
-from verl.utils.config import omega_conf_to_dataclass
 from verl.utils.metric import reduce_metrics


@@ -90,7 +88,7 @@ def fit(self):
 timing_raw.update(gen_batch_output.meta_info["timing"])
 gen_batch_output.meta_info.pop("timing", None)

-if self.algorithm_config.adv_estimator == AdvantageEstimator.REMAX:
+if self.config.algorithm.adv_estimator == AdvantageEstimator.REMAX:
     with _timer("gen_max", timing_raw):
         gen_baseline_batch = deepcopy(gen_batch)
         gen_baseline_batch.meta_info["do_sample"] = False
@@ -152,19 +150,19 @@ def fit(self):
 batch.batch["token_level_scores"] = reward_tensor

 # compute rewards. apply_kl_penalty if available
-if self.algorithm_config.use_kl_in_reward:
-    batch, kl_metrics = apply_kl_penalty(batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.algorithm_config.kl_penalty)
+if self.config.algorithm.use_kl_in_reward:
+    batch, kl_metrics = apply_kl_penalty(batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.config.algorithm.kl_penalty)
     metrics.update(kl_metrics)
 else:
     batch.batch["token_level_rewards"] = batch.batch["token_level_scores"]

 # compute advantages, executed on the driver process
-norm_adv_by_std_in_grpo = self.algorithm_config.norm_adv_by_std_in_grpo
+norm_adv_by_std_in_grpo = self.config.algorithm.get("norm_adv_by_std_in_grpo", True)
 batch = compute_advantage(
     batch,
-    adv_estimator=self.algorithm_config.adv_estimator,
-    gamma=self.algorithm_config.gamma,
-    lam=self.algorithm_config.lam,
+    adv_estimator=self.config.algorithm.adv_estimator,
+    gamma=self.config.algorithm.gamma,
+    lam=self.config.algorithm.lam,
     num_repeat=self.config.actor_rollout_ref.rollout.n,
     norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo,
 )
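Side note on the .get(...) fallback introduced in the reverted files: a raw OmegaConf node may simply not define norm_adv_by_std_in_grpo, so the code asks OmegaConf for a default instead of relying on a dataclass field. Illustrative snippet only; the config contents are made up:

from omegaconf import OmegaConf

# Hypothetical algorithm node that omits the optional key.
algorithm = OmegaConf.create({"adv_estimator": "grpo", "gamma": 1.0, "lam": 1.0})

# DictConfig.get returns the supplied default when the key is absent.
norm_adv_by_std_in_grpo = algorithm.get("norm_adv_by_std_in_grpo", True)
assert norm_adv_by_std_in_grpo is True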
22 changes: 11 additions & 11 deletions recipe/dapo/dapo_ray_trainer.py
@@ -34,7 +34,7 @@
     reduce_metrics,
 )
 from verl.trainer.ppo.ray_trainer import AdvantageEstimator, RayPPOTrainer, apply_kl_penalty, compute_advantage, compute_response_mask
-from verl.trainer.config.algorithm_config import AlgorithmConfig
+from verl.trainer.config.algo_config import AlgoConfig
 from verl.utils.config import omega_conf_to_dataclass
 from verl.utils.debug import marked_timer

@@ -125,7 +125,7 @@ def fit(self):
 timing_raw.update(gen_batch_output.meta_info["timing"])
 gen_batch_output.meta_info.pop("timing", None)

-if self.algorithm_config.adv_estimator == AdvantageEstimator.REMAX:
+if self.algo_config.adv_estimator == AdvantageEstimator.REMAX:
     with marked_timer("gen_max", timing_raw, "red"):
         gen_baseline_batch = deepcopy(gen_batch)
         gen_baseline_batch.meta_info["do_sample"] = False
@@ -172,17 +172,17 @@ def fit(self):
 new_batch.non_tensor_batch.update({k: np.array(v) for k, v in reward_extra_infos_dict.items()})

 # compute rewards. apply_kl_penalty if available
-if self.algorithm_config.use_kl_in_reward:
-    new_batch, kl_metrics = apply_kl_penalty(new_batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.algorithm_config.kl_penalty)
+if self.algo_config.use_kl_in_reward:
+    new_batch, kl_metrics = apply_kl_penalty(new_batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.algo_config.kl_penalty)
     metrics.update(kl_metrics)  # TODO: This will be cleared if we use multiple genenration batches
 else:
     new_batch.batch["token_level_rewards"] = new_batch.batch["token_level_scores"]

-if not self.algorithm_config.filter_groups.enable:
+if not self.algo_config.filter_groups.enable:
     batch = new_batch
 else:  # NOTE: When prompts after filtering is less than train batch size,
     # we skip to the next generation batch
-    metric_name = self.algorithm_config.filter_groups.metric
+    metric_name = self.algo_config.filter_groups.metric
     if metric_name == "seq_final_reward":
         # Turn to numpy for easier filtering
         new_batch.non_tensor_batch["seq_final_reward"] = new_batch.batch["token_level_rewards"].sum(dim=-1).numpy()
@@ -212,7 +212,7 @@ def fit(self):
 prompt_bsz = self.config.data.train_batch_size
 if num_prompt_in_batch < prompt_bsz:
     print(f"{num_prompt_in_batch=} < {prompt_bsz=}")
-    max_num_gen_batches = self.algorithm_config.filter_groups.max_num_gen_batches
+    max_num_gen_batches = self.algo_config.filter_groups.max_num_gen_batches
     if max_num_gen_batches <= 0 or num_gen_batches < max_num_gen_batches:
         print(f"{num_gen_batches=}. Keep generating...")
         progress_bar.update(1)
@@ -265,12 +265,12 @@ def fit(self):

 with marked_timer("adv", timing_raw, "brown"):
     # compute advantages, executed on the driver process
-    norm_adv_by_std_in_grpo = self.algorithm_config.norm_adv_by_std_in_grpo
+    norm_adv_by_std_in_grpo = self.algo_config.norm_adv_by_std_in_grpo
     batch = compute_advantage(
         batch,
-        adv_estimator=self.algorithm_config.adv_estimator,
-        gamma=self.algorithm_config.gamma,
-        lam=self.algorithm_config.lam,
+        adv_estimator=self.algo_config.adv_estimator,
+        gamma=self.algo_config.gamma,
+        lam=self.algo_config.lam,
         num_repeat=self.config.actor_rollout_ref.rollout.n,
         norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo,
     )
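For orientation, the filter-groups gating that these hunks touch, condensed into a standalone sketch. It relies only on the enable and max_num_gen_batches fields visible in this diff and is not the trainer's actual control flow:

def should_keep_generating(algo_config, num_prompt_in_batch, prompt_bsz, num_gen_batches):
    # Dynamic filtering disabled: train on whatever the current generation batch produced.
    if not algo_config.filter_groups.enable:
        return False
    # Enough prompts survived filtering to fill one train batch.
    if num_prompt_in_batch >= prompt_bsz:
        return False
    max_num_gen_batches = algo_config.filter_groups.max_num_gen_batches
    # A non-positive cap means "no limit"; otherwise keep generating until the cap is hit.
    return max_num_gen_batches <= 0 or num_gen_batches < max_num_gen_batches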
6 changes: 3 additions & 3 deletions recipe/dapo/main_dapo.py
@@ -23,7 +23,7 @@
 from omegaconf import OmegaConf

 from verl.trainer.ppo.reward import get_custom_reward_fn
-from verl.trainer.config.algorithm_config import AlgorithmConfig
+from verl.trainer.config.algo_config import AlgoConfig
 from verl.utils.config import omega_conf_to_dataclass

 from .dapo_ray_trainer import RayDAPOTrainer
@@ -125,10 +125,10 @@ def run(self, config):
 mapping[Role.RewardModel] = global_pool_id

 # Convert algorithm config to dataclass
-algorithm_config = omega_conf_to_dataclass(config.algorithm, AlgorithmConfig)
+algo_config = omega_conf_to_dataclass(config.algorithm, AlgoConfig)

 # reference model
-if algorithm_config.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
+if algo_config.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
     role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker)
     mapping[Role.RefPolicy] = global_pool_id

22 changes: 10 additions & 12 deletions recipe/entropy/entropy_ray_trainer.py
@@ -33,8 +33,6 @@
     reduce_metrics,
 )
 from verl.trainer.ppo.ray_trainer import AdvantageEstimator, RayPPOTrainer, apply_kl_penalty, compute_advantage, compute_response_mask
-from verl.trainer.config.algorithm_config import AlgorithmConfig
-from verl.utils.config import omega_conf_to_dataclass
 from verl.utils.debug import simple_timer


@@ -119,7 +117,7 @@ def fit(self):
 gen_batch_output = self.async_rollout_manager.generate_sequences(gen_batch)
 self.async_rollout_manager.sleep()

-if self.algorithm_config.adv_estimator == AdvantageEstimator.REMAX:
+if self.config.algorithm.adv_estimator == AdvantageEstimator.REMAX:
     with simple_timer("gen_max", timing_raw):
         gen_baseline_batch = deepcopy(gen_batch)
         gen_baseline_batch.meta_info["do_sample"] = False
@@ -167,17 +165,17 @@ def fit(self):
 new_batch.non_tensor_batch.update({k: np.array(v) for k, v in reward_extra_infos_dict.items()})

 # compute rewards. apply_kl_penalty if available
-if self.algorithm_config.use_kl_in_reward:
-    new_batch, kl_metrics = apply_kl_penalty(new_batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.algorithm_config.kl_penalty)
+if self.config.algorithm.use_kl_in_reward:
+    new_batch, kl_metrics = apply_kl_penalty(new_batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.config.algorithm.kl_penalty)
     metrics.update(kl_metrics)  # TODO: This will be cleared if we use multiple genenration batches
 else:
     new_batch.batch["token_level_rewards"] = new_batch.batch["token_level_scores"]

-if not self.algorithm_config.filter_groups.enable:
+if not self.config.algorithm.filter_groups.enable:
     batch = new_batch
 else:  # NOTE: When prompts after filtering is less than train batch size,
     # we skip to the next generation batch
-    metric_name = self.algorithm_config.filter_groups.metric
+    metric_name = self.config.algorithm.filter_groups.metric
     if metric_name == "seq_final_reward":
         # Turn to numpy for easier filtering
         new_batch.non_tensor_batch["seq_final_reward"] = new_batch.batch["token_level_rewards"].sum(dim=-1).numpy()
@@ -207,7 +205,7 @@ def fit(self):
 prompt_bsz = self.config.data.train_batch_size
 if num_prompt_in_batch < prompt_bsz:
     print(f"{num_prompt_in_batch=} < {prompt_bsz=}")
-    max_num_gen_batches = self.algorithm_config.filter_groups.max_num_gen_batches
+    max_num_gen_batches = self.config.algorithm.filter_groups.max_num_gen_batches
     if max_num_gen_batches <= 0 or num_gen_batches < max_num_gen_batches:
         print(f"{num_gen_batches=}. Keep generating...")
         continue
@@ -251,12 +249,12 @@ def fit(self):

 with simple_timer("adv", timing_raw):
     # compute advantages, executed on the driver process
-    norm_adv_by_std_in_grpo = self.algorithm_config.norm_adv_by_std_in_grpo
+    norm_adv_by_std_in_grpo = self.config.algorithm.get("norm_adv_by_std_in_grpo", True)
     batch = compute_advantage(
         batch,
-        adv_estimator=self.algorithm_config.adv_estimator,
-        gamma=self.algorithm_config.gamma,
-        lam=self.algorithm_config.lam,
+        adv_estimator=self.config.algorithm.adv_estimator,
+        gamma=self.config.algorithm.gamma,
+        lam=self.config.algorithm.lam,
         num_repeat=self.config.actor_rollout_ref.rollout.n,
         norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo,
     )
7 changes: 1 addition & 6 deletions recipe/entropy/main_entropy.py
@@ -20,8 +20,6 @@

 from .entropy_ray_trainer import RayEntropyTrainer
 from .reward import load_reward_manager
-from verl.trainer.config.algorithm_config import AlgorithmConfig
-from verl.utils.config import omega_conf_to_dataclass


 @hydra.main(config_path="config", config_name="entropy_trainer", version_base=None)
@@ -131,11 +129,8 @@ def run(self, config):
 role_worker_mapping[Role.RewardModel] = ray.remote(RewardModelWorker)
 mapping[Role.RewardModel] = global_pool_id

-# Convert algorithm config to dataclass
-algorithm_config = omega_conf_to_dataclass(config.algorithm, AlgorithmConfig)
-
 # use reference model
-if algorithm_config.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
+if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
     role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker)
     mapping[Role.RefPolicy] = global_pool_id

7 changes: 1 addition & 6 deletions recipe/prime/main_prime.py
@@ -33,8 +33,6 @@
 import ray

 from .prime_ray_trainer import RayPRIMETrainer
-from verl.trainer.config.algorithm_config import AlgorithmConfig
-from verl.utils.config import omega_conf_to_dataclass


 @hydra.main(config_path="config", config_name="prime_trainer", version_base=None)
@@ -105,11 +103,8 @@ def main_task(config, compute_score=None):
     Role.ActorRollout: global_pool_id,
 }

-# Convert algorithm config to dataclass
-algorithm_config = omega_conf_to_dataclass(config.algorithm, AlgorithmConfig)
-
 # use reference model
-if algorithm_config.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
+if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
     role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker)
     mapping[Role.RefPolicy] = global_pool_id

6 changes: 2 additions & 4 deletions recipe/prime/prime_ray_trainer.py
@@ -31,9 +31,7 @@
 from verl.trainer.ppo.core_algos import agg_loss
 from verl.trainer.ppo.metric_utils import _compute_response_info
 from verl.trainer.ppo.ray_trainer import RayPPOTrainer, ResourcePoolManager, Role, WorkerType
-from verl.trainer.config.algorithm_config import AlgorithmConfig
 from verl.utils.checkpoint.checkpoint_manager import find_latest_ckpt_path
-from verl.utils.config import omega_conf_to_dataclass
 from verl.utils.dataset.rl_dataset import RLHFDataset, collate_fn
 from verl.utils.debug.performance import simple_timer
 from verl.utils.metric import reduce_metrics
@@ -359,7 +357,7 @@ def fit(self):
 timing_raw.update(gen_batch_output.meta_info["timing"])
 gen_batch_output.meta_info.pop("timing", None)

-if self.algorithm_config.adv_estimator == "remax":
+if self.config.algorithm.adv_estimator == "remax":
     with simple_timer("gen_max", timing_raw):
         gen_baseline_batch = deepcopy(gen_batch)
         gen_baseline_batch.meta_info["do_sample"] = False
@@ -454,7 +452,7 @@ def fit(self):
 metrics.update(reward_output_metrics)

 # compute advantages, executed on the driver process
-batch = compute_advantage(batch, adv_estimator=self.algorithm_config.adv_estimator, config=self.config)
+batch = compute_advantage(batch, adv_estimator=self.config.algorithm.adv_estimator, config=self.config)

 # update actor
 with simple_timer("update_actor", timing_raw):
7 changes: 1 addition & 6 deletions recipe/sppo/main_sppo.py
@@ -23,8 +23,6 @@
 import ray

 from verl.trainer.ppo.reward import load_reward_manager
-from verl.trainer.config.algorithm_config import AlgorithmConfig
-from verl.utils.config import omega_conf_to_dataclass

 from .sppo_ray_trainer import RaySPPOTrainer

@@ -124,11 +122,8 @@ def run(self, config):
 role_worker_mapping[Role.RewardModel] = ray.remote(RewardModelWorker)
 mapping[Role.RewardModel] = global_pool_id

-# Convert algorithm config to dataclass
-algorithm_config = omega_conf_to_dataclass(config.algorithm, AlgorithmConfig)
-
 # use reference model
-if algorithm_config.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
+if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
     role_worker_mapping[Role.RefPolicy] = ray.remote(SPPOActorRolloutRefWorker)
     mapping[Role.RefPolicy] = global_pool_id
