Skip to content

Commit 8543afe

Browse files
Merge branch 'main' into lucas/fast_fp8_v2
2 parents b3599c0 + 1a17045 commit 8543afe

File tree

2 files changed

+60
-2
lines changed

2 files changed

+60
-2
lines changed

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,8 @@ Below are the combinations of the environments that we have tested on.
116116
|--|--|--|--|--|--|--|--|
117117
| `hiyouga/verl:ngc-th2.6.0-cu126-vllm0.8.3-flashinfer0.2.2-cxx11abi0` | 12.6 | 2.43.0 | 0.8.3 | [flash-rl](https://github.com/yaof20/verl/tree/flash-rl/recipe/flash_rl) | 1.0.1 | ✅ Tested | ✅ Tested |
118118
| `hiyouga/verl:ngc-th2.6.0-cu126-vllm0.8.4-flashinfer0.2.2-cxx11abi0` | 12.6 | 2.43.0 | 0.8.4 | [flash-rl](https://github.com/yaof20/verl/tree/flash-rl/recipe/flash_rl) | 1.0.1 | ✅ Tested | |
119-
| `hiyouga/verl:ngc-th2.7.0-cu12.6-vllm0.9.1` | 12.6 | 2.43.0 | 0.8.4 | [flash-rl-vllm0.9.1](https://github.com/yaof20/verl/tree/flash-rl-vllm0.9.1/recipe/flash_rl) | 1.0.2| ✅ Tested | |
119+
| `hiyouga/verl:ngc-th2.7.0-cu12.6-vllm0.9.1` | 12.6 | 2.43.0 | 0.9.1 | [flash-rl-vllm0.9.1](https://github.com/yaof20/verl/tree/flash-rl-vllm0.9.1/recipe/flash_rl) | 1.0.2| ✅ Tested | |
120+
| `hiyouga/verl:ngc-th2.7.1-cu12.6-vllm0.10.0` | 12.6 | 2.48.0 | 0.10.0 | [flash-rl-vllm0.9.1](https://github.com/yaof20/verl/tree/flash-rl-vllm0.9.1/recipe/flash_rl) | 1.0.2| ✅ Tested | |
120121

121122
## 🚧 Roadmap & Future Improvements
122123

flash_rl/vllm_patch.py

Lines changed: 58 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import os
33
import vllm
44
import torch
5+
from torch import nn
56
import types
67
import gc
78
from packaging.version import parse
@@ -35,6 +36,56 @@ def bond_method_to_cls(func, obj):
3536
'_assert_and_load',
3637
]
3738

39+
# fine-grained process-weights-after-loading implementation where we skip
# post-load processing for certain modules
def process_weights_after_loading_skippable(model: nn.Module, model_config,
                                            target_device: torch.device) -> None:
    """Run vLLM's post-load weight processing, skipping modules that don't need it.

    Mirrors vllm.model_executor.model_loader.utils.process_weights_after_loading,
    but skips the (potentially destructive/expensive) postprocess step for the
    default CUTLASS w8a8-int8 kernel when its config shows no repacking is
    required (channelwise, dynamic, symmetric input).

    Args:
        model: the loaded model whose submodules may carry quant methods.
        model_config: vLLM model config; only ``dtype`` is read (for MLA).
        target_device: device parameters must be on while quant methods
            process them (relevant when CPU offloading is enabled).
    """
    # Imported lazily so this module stays importable without a full vLLM stack.
    from vllm.attention import Attention
    from vllm.model_executor.layers.linear import QKVCrossParallelLinear
    from vllm.model_executor.layers.quantization.base_config import QuantizeMethodBase
    from vllm.model_executor.model_loader.utils import device_loading_context
    from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import CompressedTensorsLinearMethod, CompressedTensorsW8A8Int8
    from vllm.model_executor.layers.quantization.kernels.scaled_mm.cutlass import CutlassScaledMMLinearKernel

    for _, module in model.named_modules():
        if isinstance(module, QKVCrossParallelLinear):
            # NOTE(Isotr0py): special case for cross QKV layer because
            # q and kv proj aren't registered as submodules intentionally
            module.process_weights_after_loading()
            continue
        # NOTE(review): the original had this getattr duplicated on two
        # consecutive lines; one copy suffices.
        quant_method = getattr(module, "quant_method", None)

        # hack for skipping postprocess for default cutlass kernel
        scheme = getattr(module, "scheme", None)
        is_cutlass = (isinstance(quant_method, CompressedTensorsLinearMethod)
                      and isinstance(scheme, CompressedTensorsW8A8Int8)
                      and isinstance(scheme.kernel, CutlassScaledMMLinearKernel))
        can_skip_postprocess = False
        if is_cutlass:
            # is_cutlass already guarantees scheme is CompressedTensorsW8A8Int8,
            # so no extra assert is needed (asserts vanish under `python -O`).
            kernel = scheme.kernel
            can_skip_postprocess = (kernel.config.is_channelwise
                                    and not kernel.config.is_static_input_scheme
                                    and kernel.config.input_symmetric)

        if can_skip_postprocess:
            continue

        if isinstance(quant_method, QuantizeMethodBase):
            # When quant methods need to process weights after loading
            # (for repacking, quantizing, etc), they expect parameters
            # to be on the global target device. This scope is for the
            # case where cpu offloading is used, where we will move the
            # parameters onto device for processing and back off after.
            with device_loading_context(module, target_device):
                quant_method.process_weights_after_loading(module)

    # Currently only used by MLA.
    # NOTE: This intentionally happens after other modules so we can easily
    # decompress the weights for MLA.
    for _, module in model.named_modules():
        if isinstance(module, Attention) and \
                hasattr(module, "process_weights_after_loading"):
            # TODO(lucas): see if there is a way to unify the signatures
            # of process_weights_after_loading
            module.process_weights_after_loading(model_config.dtype)
88+
3889
def hacked_process_weights_after_loading(
3990
original_process_weights_after_loading,
4091
model,
@@ -184,6 +235,7 @@ def patch_vllm_process_weights_after_loading():
184235
loader.beforeflashrl_process_weights_after_loading = original_process_weights_after_loading
185236

186237
from functools import partial
238+
# don't skip for vllm < 0.9.1 for now
187239
loader._process_weights_after_loading = partial(hacked_process_weights_after_loading, original_process_weights_after_loading)
188240

189241
logger.debug("Successfully patched the _process_weights_after_loading function of vllm")
@@ -198,8 +250,13 @@ def patch_vllm_process_weights_after_loading():
198250
from vllm.model_executor.model_loader import utils
199251

200252
if not hasattr(utils, 'beforeflashrl_process_weights_after_loading'):
253+
254+
# use the skippable version for 0.9.1 and 0.9.2
255+
if parse(vllm.__version__) in [parse("0.9.1"), parse("0.9.2")]:
256+
original_process_weights_after_loading = process_weights_after_loading_skippable
257+
else:
258+
original_process_weights_after_loading = utils.process_weights_after_loading
201259

202-
original_process_weights_after_loading = utils.process_weights_after_loading
203260
utils.beforeflashrl_process_weights_after_loading = original_process_weights_after_loading
204261

205262
from functools import partial

0 commit comments

Comments
 (0)