import os
import vllm
import torch
+from torch import nn
import types
import gc
from packaging.version import parse
@@ -35,6 +36,56 @@ def bond_method_to_cls(func, obj):
    '_assert_and_load',
]

+# fine-grained reimplementation of process_weights_after_loading that skips
+# the post-load processing for modules where it is unnecessary
+def process_weights_after_loading_skippable(model: nn.Module, model_config,
+                                            target_device: torch.device) -> None:
+    from vllm.attention import Attention
+    from vllm.model_executor.layers.linear import QKVCrossParallelLinear
+    from vllm.model_executor.layers.quantization.base_config import QuantizeMethodBase
+    from vllm.model_executor.model_loader.utils import device_loading_context
+    from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (
+        CompressedTensorsLinearMethod, CompressedTensorsW8A8Int8)
+    from vllm.model_executor.layers.quantization.kernels.scaled_mm.cutlass import (
+        CutlassScaledMMLinearKernel)
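+    # NOTE: the imports are kept function-local, presumably so this module
+    # stays importable on vllm versions where these internals live elsewhere;
+    # this function is only installed for vllm 0.9.1/0.9.2 (see the version
+    # gate below).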
+
+    for _, module in model.named_modules():
+        if isinstance(module, QKVCrossParallelLinear):
+            # NOTE(Isotr0py): special case for cross QKV layer because
+            # q and kv proj aren't registered as submodules intentionally
+            module.process_weights_after_loading()
+            continue
+        quant_method = getattr(module, "quant_method", None)
+
+        # hack for skipping the postprocess for the default cutlass kernel
+        scheme = getattr(module, "scheme", None)
+        is_cutlass = (isinstance(quant_method, CompressedTensorsLinearMethod)
+                      and isinstance(scheme, CompressedTensorsW8A8Int8)
+                      and isinstance(scheme.kernel, CutlassScaledMMLinearKernel))
+        can_skip_postprocess = False
+        if is_cutlass:
+            kernel = scheme.kernel
+            can_skip_postprocess = (kernel.config.is_channelwise
+                                    and not kernel.config.is_static_input_scheme
+                                    and kernel.config.input_symmetric)
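+        # Rationale (an assumption mirroring the kernel's own config checks,
+        # not re-verified here): for a channelwise, dynamically quantized,
+        # symmetric int8 scheme the cutlass kernel's post-load processing adds
+        # nothing once the weights are already in the expected layout, so it
+        # can be skipped when weights are re-synced.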
+
+        if can_skip_postprocess:
+            continue
+
+        if isinstance(quant_method, QuantizeMethodBase):
+            # When quant methods need to process weights after loading
+            # (for repacking, quantizing, etc), they expect parameters
+            # to be on the global target device. This scope is for the
+            # case where cpu offloading is used, where we will move the
+            # parameters onto device for processing and back off after.
+            with device_loading_context(module, target_device):
+                quant_method.process_weights_after_loading(module)
+
+    # Currently only used by MLA.
+    # NOTE: This intentionally happens after other modules so we can easily
+    # decompress the weights for MLA.
+    for _, module in model.named_modules():
+        if isinstance(module, Attention) and \
+                hasattr(module, "process_weights_after_loading"):
+            # TODO(lucas): see if there is a way to unify the signatures
+            # of process_weights_after_loading
+            module.process_weights_after_loading(model_config.dtype)
+
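+# Usage sketch: process_weights_after_loading_skippable mirrors the signature
+# of vllm's model_loader.utils.process_weights_after_loading, so it can stand
+# in for it, e.g.:
+#     process_weights_after_loading_skippable(model, model_config,
+#                                             torch.device("cuda"))
+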
def hacked_process_weights_after_loading(
    original_process_weights_after_loading,
    model,
@@ -90,6 +141,7 @@ def patch_vllm_process_weights_after_loading():
        loader.beforeflashrl_process_weights_after_loading = original_process_weights_after_loading

        from functools import partial
+        # don't skip for vllm < 0.9.1 for now
        loader._process_weights_after_loading = partial(hacked_process_weights_after_loading, original_process_weights_after_loading)

        logger.debug("Successfully patched the _process_weights_after_loading function of vllm")
@@ -104,8 +156,13 @@ def patch_vllm_process_weights_after_loading():
    from vllm.model_executor.model_loader import utils

    if not hasattr(utils, 'beforeflashrl_process_weights_after_loading'):
+
+        # use the skippable version for vllm 0.9.1 and 0.9.2
+        if parse(vllm.__version__) in [parse("0.9.1"), parse("0.9.2")]:
+            original_process_weights_after_loading = process_weights_after_loading_skippable
+        else:
+            original_process_weights_after_loading = utils.process_weights_after_loading
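+        # NOTE: parse() equality is exact, so only the plain "0.9.1"/"0.9.2"
+        # releases take the skippable path; dev, post, and local builds fall
+        # through to the stock implementation.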

-        original_process_weights_after_loading = utils.process_weights_after_loading
        utils.beforeflashrl_process_weights_after_loading = original_process_weights_after_loading

        from functools import partial