Skip to content

Commit cccc2ef

Browse files
[cfg] refactor: make the rollout & ref configs more modular (#2410)
### What does this PR do? move rollout and ref configs to standalone files. cc @ETOgaosion for dp_ref/rollout, default values are added to the yaml if actor_rollout_ref.actor does not exist, so that the yaml can be loaded independently. ### Checklist Before Starting - [x] Search for similar PRs. Paste at least one query link here: ... - [x] Format the PR title as `[{modules}] {type}: {description}` (This will be checked by the CI) - `{modules}` include `fsdp`, `megatron`, `sglang`, `vllm`, `rollout`, `trainer`, `ci`, `training_utils`, `recipe`, `hardware`, `deployment`, `ray`, `worker`, `single_controller`, `misc`, `perf`, `model`, `algo`, `env`, `tool`, `ckpt`, `doc`, `data` - If this PR involves multiple modules, separate them with `,` like `[megatron, fsdp, doc]` - `{type}` is in `feat`, `fix`, `refactor`, `chore`, `test` - If this PR breaks any API (CLI arguments, config, function signature, etc.), add `[BREAKING]` to the beginning of the title. - Example: `[BREAKING][fsdp, megatron] feat: dynamic batching` ### Test Relying on existing tests. ### Checklist Before Submitting > [!IMPORTANT] > Please check all the following items before requesting a review, otherwise the reviewer might deprioritize this PR for review. - [ ] Read the [Contribute Guide](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md). - [ ] Apply [pre-commit checks](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md#code-linting-and-formatting): `pre-commit install && pre-commit run --all-files --show-diff-on-failure --color=always` - [ ] Add / Update [the documentation](https://github.com/volcengine/verl/tree/main/docs). - [ ] Add unit or end-to-end test(s) to [the CI workflow](https://github.com/volcengine/verl/tree/main/.github/workflows) to cover all the code. If not feasible, explain why: ... 
- [ ] Once your PR is ready for CI, send a message in [the `ci-request` channel](https://verl-project.slack.com/archives/C091TCESWB1) in [the `verl` Slack workspace](https://join.slack.com/t/verl-project/shared_invite/zt-3855yhg8g-CTkqXu~hKojPCmo7k_yXTQ).
1 parent ad33564 commit cccc2ef

File tree

12 files changed

+514
-544
lines changed

12 files changed

+514
-544
lines changed

tests/special_sanity/test_config_docs.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,9 @@ def test_trainer_config_doc():
6262
"verl/trainer/config/ppo_trainer.yaml",
6363
"verl/trainer/config/actor/actor.yaml",
6464
"verl/trainer/config/actor/dp_actor.yaml",
65+
"verl/trainer/config/ref/ref.yaml",
66+
"verl/trainer/config/ref/dp_ref.yaml",
67+
"verl/trainer/config/rollout/rollout.yaml",
6568
]
6669
success = True
6770
for yaml_to_inspect in yamls_to_inspect:

tests/trainer/config/test_legacy_config_on_cpu.py

Lines changed: 43 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -12,16 +12,24 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
import os
1516
import unittest
1617

18+
from hydra import compose, initialize_config_dir
19+
from hydra.core.global_hydra import GlobalHydra
1720
from omegaconf import OmegaConf
1821

1922

2023
class TestConfigComparison(unittest.TestCase):
2124
"""Test that current configs match their legacy counterparts exactly."""
2225

23-
def _compare_configs_recursively(self, current_config, legacy_config, path=""):
24-
"""Recursively compare two OmegaConf configs and assert they are identical."""
26+
def _compare_configs_recursively(self, current_config, legacy_config, path="", legacy_allow_missing=False):
27+
"""Recursively compare two OmegaConf configs and assert they are identical.
28+
29+
Args:
30+
legacy_allow_missing (bool): whether the legacy config is allowed to contain fewer keys than
31+
the current config (e.g. the legacy megatron config); missing keys are logged instead of failing
32+
"""
2533
if isinstance(current_config, dict) and isinstance(legacy_config, dict):
2634
current_keys = set(current_config.keys())
2735
legacy_keys = set(legacy_config.keys())
@@ -32,19 +40,29 @@ def _compare_configs_recursively(self, current_config, legacy_config, path=""):
3240
if missing_in_current:
3341
self.fail(f"Keys missing in current config at {path}: {missing_in_current}")
3442
if missing_in_legacy:
35-
self.fail(f"Keys missing in legacy config at {path}: {missing_in_legacy}")
43+
# if the legacy config is allowed to be missing keys, log instead of failing
44+
msg = f"Keys missing in legacy config at {path}: {missing_in_legacy}"
45+
if legacy_allow_missing:
46+
print(msg)
47+
else:
48+
self.fail(msg)
3649

3750
for key in current_keys:
3851
current_path = f"{path}.{key}" if path else key
39-
self._compare_configs_recursively(current_config[key], legacy_config[key], current_path)
52+
if key in legacy_config:
53+
self._compare_configs_recursively(
54+
current_config[key], legacy_config[key], current_path, legacy_allow_missing=legacy_allow_missing
55+
)
4056
elif isinstance(current_config, list) and isinstance(legacy_config, list):
4157
self.assertEqual(
4258
len(current_config),
4359
len(legacy_config),
4460
f"List lengths differ at {path}: current={len(current_config)}, legacy={len(legacy_config)}",
4561
)
4662
for i, (current_item, legacy_item) in enumerate(zip(current_config, legacy_config)):
47-
self._compare_configs_recursively(current_item, legacy_item, f"{path}[{i}]")
63+
self._compare_configs_recursively(
64+
current_item, legacy_item, f"{path}[{i}]", legacy_allow_missing=legacy_allow_missing
65+
)
4866
else:
4967
self.assertEqual(
5068
current_config,
@@ -66,7 +84,6 @@ def test_ppo_trainer_config_matches_legacy(self):
6684
current_config = compose(config_name="ppo_trainer")
6785

6886
legacy_config = OmegaConf.load("tests/trainer/config/legacy_ppo_trainer.yaml")
69-
7087
current_dict = OmegaConf.to_container(current_config, resolve=True)
7188
legacy_dict = OmegaConf.to_container(legacy_config, resolve=True)
7289

@@ -79,29 +96,42 @@ def test_ppo_trainer_config_matches_legacy(self):
7996

8097
def test_ppo_megatron_trainer_config_matches_legacy(self):
8198
"""Test that ppo_megatron_trainer.yaml matches legacy_ppo_megatron_trainer.yaml exactly."""
82-
import os
83-
84-
from hydra import compose, initialize_config_dir
85-
from hydra.core.global_hydra import GlobalHydra
8699

87100
GlobalHydra.instance().clear()
88101

89102
try:
90-
with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config"), version_base=None):
103+
with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config")):
91104
current_config = compose(config_name="ppo_megatron_trainer")
92105

93106
legacy_config = OmegaConf.load("tests/trainer/config/legacy_ppo_megatron_trainer.yaml")
94-
95107
current_dict = OmegaConf.to_container(current_config, resolve=True)
96108
legacy_dict = OmegaConf.to_container(legacy_config, resolve=True)
97109

98110
if "defaults" in current_dict:
99111
del current_dict["defaults"]
100112

101-
self._compare_configs_recursively(current_dict, legacy_dict)
113+
self._compare_configs_recursively(current_dict, legacy_dict, legacy_allow_missing=True)
102114
finally:
103115
GlobalHydra.instance().clear()
104116

117+
def test_load_component(self):
118+
"""Test that each component config (actor/ref/rollout) can be loaded standalone."""
119+
120+
GlobalHydra.instance().clear()
121+
configs_to_load = [
122+
("verl/trainer/config/actor", "dp_actor"),
123+
("verl/trainer/config/actor", "megatron_actor"),
124+
("verl/trainer/config/ref", "dp_ref"),
125+
("verl/trainer/config/ref", "megatron_ref"),
126+
("verl/trainer/config/rollout", "rollout"),
127+
]
128+
for config_dir, config_file in configs_to_load:
129+
try:
130+
with initialize_config_dir(config_dir=os.path.abspath(config_dir)):
131+
compose(config_name=config_file)
132+
finally:
133+
GlobalHydra.instance().clear()
134+
105135

106136
if __name__ == "__main__":
107137
unittest.main()

verl/trainer/config/actor/actor.yaml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,12 @@ ppo_micro_batch_size: null
1818
ppo_micro_batch_size_per_gpu: null
1919

2020
# Whether to automatically adjust batch size at runtime
21+
# oc.select: the default val for ref.log_prob_use_dynamic_bsz
2122
use_dynamic_bsz: false
2223

2324
# Max tokens per GPU in one PPO batch; affects gradient accumulation
2425
# Typically it should be: n * ${data.max_prompt_length} + ${data.max_response_length}
26+
# oc.select: the default val for ref.log_prob_max_token_len_per_gpu
2527
ppo_max_token_len_per_gpu: 16384
2628

2729
# PPO clip ratio
@@ -67,6 +69,7 @@ entropy_coeff: 0
6769
use_kl_loss: false
6870

6971
# Whether to use torch.compile()
72+
# oc.select: the default val for ref.use_torch_compile
7073
use_torch_compile: true
7174

7275
# KL loss coefficient when use_kl_loss is enabled. For GRPO
@@ -89,7 +92,8 @@ checkpoint:
8992
save_contents: ['model', 'optimizer', 'extra']
9093

9194
# For more flexibility, you can specify the contents to load from the checkpoint.
92-
load_contents: ${actor_rollout_ref.actor.checkpoint.save_contents}
95+
# .xxx refers to the local variable xxx from the same level of hierarchy similar to python pkg
96+
load_contents: ${.save_contents}
9397

9498
# optimizer configs
9599
optim:

verl/trainer/config/actor/dp_actor.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ strategy: fsdp
2020
grad_clip: 1.0
2121

2222
# Sequence parallelism size for Ulysses-style model parallelism
23+
# oc.select: the default val for ref.ulysses_sequence_parallel_size
2324
ulysses_sequence_parallel_size: 1
2425

2526
# calculate entropy with chunking to reduce memory peak

verl/trainer/config/actor/megatron_actor.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,11 +73,14 @@ megatron:
7373

7474
dist_checkpointing_path: null
7575

76+
# oc.select: default val for ref.megatron.seed
7677
seed: 42
7778

7879
# additional transformer config like: num_layers_in_first(/last)_pipeline_stage
80+
# oc.select: default val for ref.megatron.override_transformer_config
7981
override_transformer_config: {}
8082

83+
# oc.select: default val for ref.megatron.use_mbridge
8184
use_mbridge: False
8285

8386
# profile the actor model in `update_policy`
Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
# Tokenizer class or path. If null, it will be inferred from the model.
2+
tokenizer: null
3+
4+
# Whether to use shared memory for data loading.
5+
use_shm: False
6+
7+
# Training set parquet. Can be a list or a single file.
8+
# The program will read all files into memory, so it can't be too large (< 100GB).
9+
# The path can be either a local path or an HDFS path.
10+
# For HDFS path, we provide utils to download it to DRAM and convert it to a local path.
11+
train_files: ~/data/rlhf/gsm8k/train.parquet
12+
13+
# Validation parquet. Can be a list or a single file.
14+
val_files: ~/data/rlhf/gsm8k/test.parquet
15+
16+
# The field in the dataset where the prompt is located. Default is 'prompt'.
17+
prompt_key: prompt
18+
19+
# The field used to select the reward function (if using different ones per example).
20+
reward_fn_key: data_source
21+
22+
# Maximum prompt length. All prompts will be left-padded to this length.
23+
# An error will be reported if the length is too long.
24+
# oc.select: default val for rollout.prompt_length
25+
max_prompt_length: 512
26+
27+
# Maximum response length. Rollout in RL algorithms (e.g. PPO) generates up to this length.
28+
# oc.select: default val for rollout.response_length
29+
max_response_length: 512
30+
31+
# Batch size sampled for one training iteration of different RL algorithms.
32+
train_batch_size: 1024
33+
34+
# Batch size used during validation. Can be null.
35+
val_batch_size: null
36+
37+
# Whether to return the original input_ids without adding chat template.
38+
# This is used when the reward model's chat template differs from the policy.
39+
# If using a model-based RM with different templates, this should be True.
40+
return_raw_input_ids: False
41+
42+
# Whether to return the original chat (prompt) without applying chat template.
43+
return_raw_chat: False
44+
45+
# Whether to return the full prompt with chat template.
46+
return_full_prompt: False
47+
48+
# Whether to shuffle the data in the dataloader.
49+
shuffle: True
50+
51+
# num dataloader workers
52+
dataloader_num_workers: 8
53+
54+
# Whether to shuffle the validation set.
55+
validation_shuffle: False
56+
57+
# Whether to filter overlong prompts.
58+
filter_overlong_prompts: False
59+
60+
# Number of workers for filtering overlong prompts.
61+
# For large-scale datasets, filtering can be time-consuming.
62+
# Use multiprocessing to speed up. Default is 1.
63+
filter_overlong_prompts_workers: 1
64+
65+
# Truncate the input_ids or prompt if they exceed max_prompt_length.
66+
# Options: 'error', 'left', or 'right'. Default is 'error'.
67+
truncation: error
68+
69+
# The field in the multi-modal dataset where the image is located. Default is 'images'.
70+
image_key: images
71+
72+
# The field in the multi-modal dataset where the video is located.
73+
video_key: videos
74+
75+
# If the remote tokenizer has a Python file, this flag determines whether to allow using it.
76+
trust_remote_code: False
77+
78+
# Optional: specify a custom dataset class path and name if overriding default loading behavior.
79+
custom_cls:
80+
81+
# The path to the file containing your customized dataset class. If not specified, pre-implemented dataset will be used.
82+
path: null
83+
84+
# The name of the dataset class within the specified file.
85+
name: null
86+
87+
# Whether to return multi-modal inputs in the dataset. Set to False if rollout generates new multi-modal inputs.
88+
return_multi_modal_inputs: True
89+
90+
# settings related to data sampler
91+
sampler:
92+
93+
# the path to the module containing a curriculum class which implements the
94+
# AbstractSampler interface
95+
class_path: null
96+
97+
# the name of the curriculum class like `MySampler`
98+
class_name: null

0 commit comments

Comments
 (0)