Skip to content
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
4a4dbc0
add support for custom datagen class that allows for adding new data …
jwong8314 Jul 1, 2025
9d599ce
ruff
jwong8314 Jul 1, 2025
d91b626
ruff-format
jwong8314 Jul 1, 2025
84b3815
ruff-format
jwong8314 Jul 1, 2025
3c1cf80
Update license
jwong8314 Jul 1, 2025
9c65168
update license
jwong8314 Jul 1, 2025
8a04aca
fix: make sure if there's no data_generator it doesn't crash
jwong8314 Jul 1, 2025
87b89d0
ruff-format
jwong8314 Jul 1, 2025
732b184
Merge branch 'main' into main
zhaochenyang20 Jul 2, 2025
ffba50d
Merge branch 'main' into dynamic_dataset
jwong8314 Jul 4, 2025
c620bcb
undo change to import_utils
jwong8314 Jul 4, 2025
6b061f9
merging into dataset
jwong8314 Jul 4, 2025
13debde
Merge pull request #1 from jwong8314/dynamic_dataset
jwong8314 Jul 4, 2025
19201e2
rename variables
jwong8314 Jul 4, 2025
72b223e
is_train rename
jwong8314 Jul 4, 2025
383cf61
Merge pull request #2 from jwong8314/dynamic_dataset
jwong8314 Jul 4, 2025
5070088
rename
jwong8314 Jul 4, 2025
3250d1d
rename to Generator
jwong8314 Jul 4, 2025
8c48f09
Merge pull request #3 from jwong8314/dynamic_dataset
jwong8314 Jul 4, 2025
0a5cabf
Merge branch 'main' into main
zhaochenyang20 Jul 4, 2025
4aac878
add parameter for batch information
jwong8314 Jul 8, 2025
e3bbd57
add comments and placed files in experimental
jwong8314 Jul 8, 2025
122e817
move to experimental subdir
jwong8314 Jul 8, 2025
a74ff75
ruff
jwong8314 Jul 8, 2025
2e44ead
ruff
jwong8314 Jul 8, 2025
6126b96
Merge branch 'volcengine:main' into main
jwong8314 Jul 8, 2025
16647d4
patch CI
jwong8314 Jul 8, 2025
171c9be
Merge branch 'main' into main
jwong8314 Jul 9, 2025
314d350
resolve conflicts new yaml
jwong8314 Jul 9, 2025
4bd1452
typo
jwong8314 Jul 9, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions tests/special_sanity/check_license.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,15 @@
license_head_individual = "Copyright 2025 Individual Contributor:"
license_head_sglang = "Copyright 2023-2024 SGLang Team"
license_head_modelbest = "Copyright 2025 ModelBest Inc. and/or its affiliates"
license_head_amazon = "Copyright 2025 Amazon.com Inc and/or its affiliates"
license_headers = [
license_head_bytedance,
license_head_bytedance_25,
license_head_prime,
license_head_individual,
license_head_sglang,
license_head_modelbest,
license_head_amazon,
]


Expand Down
11 changes: 11 additions & 0 deletions verl/trainer/config/ppo_trainer.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,17 @@ data:
# The name of the dataset class within the specified file.
name: null

# Data generation configuration for augmenting the dataset.
datagen:

# The path to the file containing your customized data generation class.
# E.g. 'pkg://verl.utils.dataset.dynamicgen_dataset'
path: null

# The class name of the data generation class within the specified file.
# E.g. 'MockDataGenerator'
name: null

# config for actor, rollout and reference model
actor_rollout_ref:

Expand Down
13 changes: 10 additions & 3 deletions verl/trainer/main_ppo.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,8 +188,8 @@ def run(self, config):
from verl.utils.dataset.rl_dataset import collate_fn

# Create training and validation datasets.
train_dataset = create_rl_dataset(config.data.train_files, config.data, tokenizer, processor)
val_dataset = create_rl_dataset(config.data.val_files, config.data, tokenizer, processor)
train_dataset = create_rl_dataset(config.data.train_files, config.data, tokenizer, processor, is_train=True)
val_dataset = create_rl_dataset(config.data.val_files, config.data, tokenizer, processor, is_train=False)
train_sampler = create_rl_sampler(config.data, train_dataset)

# Initialize the PPO trainer.
Expand All @@ -214,7 +214,7 @@ def run(self, config):
trainer.fit()


def create_rl_dataset(data_paths, data_config, tokenizer, processor):
def create_rl_dataset(data_paths, data_config, tokenizer, processor, is_train=True):
"""Create a dataset.

Arguments:
Expand Down Expand Up @@ -243,6 +243,13 @@ def create_rl_dataset(data_paths, data_config, tokenizer, processor):
f"The custom dataset class '{data_config.custom_cls.name}' from "
f"'{data_config.custom_cls.path}' must inherit from torch.utils.data.Dataset"
)
elif "datagen" in data_config and data_config.datagen.get("path", None) is not None and is_train:
# If a data generation strategy is specified, use the DynamicGenDataset class
from verl.utils.dataset.dynamicgen_dataset import DynamicGenDataset

dataset_cls = DynamicGenDataset
print("Using DynamicGenDataset for data generation.")

else:
# Use the default RLHFDataset class if no custom class is specified
dataset_cls = RLHFDataset
Expand Down
4 changes: 4 additions & 0 deletions verl/trainer/ppo/ray_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1367,3 +1367,7 @@ def fit(self):
pprint(f"Final validation metrics: {last_val_metrics}")
progress_bar.close()
return

if hasattr(self.train_dataset, "on_batch_end"):
# The dataset may be changed after each training batch
self.train_dataset.on_batch_end()
108 changes: 108 additions & 0 deletions verl/utils/dataset/dynamicgen_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# Copyright 2025 Amazon.com Inc and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
FSDP PPO Trainer with Ray-based single controller.
This trainer supports model-agonistic model initialization with huggingface
"""

import logging
from abc import ABC, abstractmethod
from typing import List, Optional, Union

import datasets
from omegaconf import DictConfig
from torch.utils.data import Dataset
from transformers import PreTrainedTokenizer, ProcessorMixin

from verl.utils.dataset import RLHFDataset
from verl.utils.import_utils import load_extern_type

logger = logging.getLogger(__name__)


class AbstractDataGenerator(ABC):
    """Base interface for pluggable data-generation strategies.

    Concrete strategies implement :meth:`generate`, which receives the
    current dataset and returns new rows to be appended to it.
    """

    def __init__(self, config: "DictConfig"):
        # Keep the strategy's config section available to subclasses.
        self.config = config

    @abstractmethod
    def generate(self, dataset: "Dataset") -> "datasets.Dataset":
        """Produce new data derived from ``dataset``.

        Args:
            dataset: The dataset to generate from.

        Returns:
            Processed data or result as implemented by the subclass.
        """


class MockDataGenerator(AbstractDataGenerator):
    """No-op data generator used as a placeholder and for testing.

    Rather than synthesizing anything new, it simply re-selects the first
    datapoint of the incoming dataset.
    """

    def __init__(self, config: "DictConfig" = None):
        super().__init__(config)

    def generate(self, dataset: "Dataset") -> "datasets.Dataset":
        print("MockDataGenerator: No operation performed on the dataset.")
        # Re-append only the first row of the underlying dataframe.
        first_row = dataset.dataframe.select([0])
        return first_row


class DynamicGenDataset(RLHFDataset):
    """
    A dataset class that uses a data generation strategy to process data.
    This class extends RLHFDataset and uses an AbstractDataGenerator instance
    to append newly generated rows to the dataset after each training batch.
    """

    def __init__(
        self,
        data_files: Union[str, List[str]],
        tokenizer: PreTrainedTokenizer,
        config: DictConfig,
        processor: Optional[ProcessorMixin] = None,
    ):
        super().__init__(data_files, tokenizer, config, processor)
        # Validate the datagen section *before* dereferencing it, so a
        # missing/empty config fails with this message instead of an
        # attribute error.
        assert "datagen" in config and config.datagen.get("path", None) is not None, (
            f"datagen path is not set in config: {config}"
        )
        # The `datagen` config section (path/name of the generator class) —
        # the instantiated generator itself lives in `self.data_generator`.
        self.datagen: DictConfig = config.datagen

        # Dynamically load the custom datagen class
        datagen_cls = load_extern_type(config.datagen.path, config.datagen.name)

        # Verify that the custom datagen class inherits from AbstractDataGenerator
        abs_cls = AbstractDataGenerator
        if not issubclass(datagen_cls, abs_cls):
            raise TypeError(
                f"The custom datagen class '{config.datagen.name}' from '{config.datagen.path}'"
                f" must inherit from {abs_cls}"
            )

        self.data_generator = datagen_cls(config.datagen)
        # Seed the dataset with one round of generated data up front.
        self.on_batch_end()

    def append_dataframe(self, new_dataframe: datasets.Dataset):
        """Filter overlong prompts out of ``new_dataframe`` and append the rest."""
        new_dataframe = self.maybe_filter_out_long_prompts(new_dataframe)
        self.dataframe = datasets.concatenate_datasets([self.dataframe, new_dataframe])

        logger.info(f"new dataset len: {len(self.dataframe)}")

    def on_batch_end(self) -> None:
        """
        Generate data using the provided data generation strategy.
        Note: This method is intended to change the dataset after each training batch.
        """
        new_data = self.data_generator.generate(self)
        self.append_dataframe(new_data)
8 changes: 6 additions & 2 deletions verl/utils/dataset/rl_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,9 @@ def _read_files_and_tokenize(self):

print(f"dataset len: {len(self.dataframe)}")

self.dataframe = self.maybe_filter_out_long_prompts(self.dataframe)

def maybe_filter_out_long_prompts(self, dataframe: datasets.Dataset = None):
# filter out too long prompts
if self.filter_overlong_prompts:
tokenizer = self.tokenizer
Expand Down Expand Up @@ -165,13 +168,14 @@ def doc2len(doc) -> int:
def doc2len(doc) -> int:
return len(tokenizer.apply_chat_template(doc[prompt_key], add_generation_prompt=True))

self.dataframe = self.dataframe.filter(
dataframe = dataframe.filter(
lambda doc: doc2len(doc) <= self.max_prompt_length,
num_proc=self.num_workers,
desc=f"Filtering prompts longer than {self.max_prompt_length} tokens",
)

print(f"filter dataset len: {len(self.dataframe)}")
print(f"filter dataset len: {len(dataframe)}")
return dataframe

def resume_dataset_state(self):
self.serialize_dataset = not hasattr(self, "original_data_files")
Expand Down