Skip to content
Open
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Reduced the number of lines to correct the error
  • Loading branch information
JohnConnor123 committed Jan 31, 2026
commit 5e1ef063c6adc02f3ac4ae27c5b2330612810853
37 changes: 0 additions & 37 deletions verl/utils/vllm/vllm_fp8_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@

import logging
from dataclasses import dataclass, field
import importlib
from unittest.mock import patch

import torch
Expand Down Expand Up @@ -94,41 +93,6 @@ def _wrapped(self, layer) -> None:
return _wrapped


def _patch_vllm_qkvparallellinear_workspace_attr() -> None:
    """Patch vLLM's QKVParallelLinear to have a lazy `workspace` attr.

    vLLM FP8 (Marlin) may access `layer.workspace` during profiling/first runs
    and lazily allocate it when needed. If the attribute is missing, vLLM can
    crash with an AttributeError.

    This patch is intentionally narrow and idempotent:
    - targets vLLM's `QKVParallelLinear` class only
    - sets `workspace=None` when missing (no allocation)
    - guarded by a per-class sentinel to avoid patching multiple times
    - best-effort: when the vLLM module cannot be imported, or it does not
      expose `QKVParallelLinear`, nothing is patched and nothing is raised

    The replacement `__init__` is built with `functools.wraps`, so the
    original constructor's name, docstring and `__wrapped__` link stay
    available to introspection.
    """
    # Local import: keeps this helper self-contained without touching the
    # file-level import block.
    import functools

    try:
        mod = importlib.import_module("vllm.model_executor.layers.linear")
        cls = getattr(mod, "QKVParallelLinear", None)
    except Exception as exc:
        # Deliberately broad: the shim must never be the reason start-up
        # fails. Leave a trace so a skipped shim can still be diagnosed.
        logger.debug(
            "vLLM FP8 (Marlin): QKVParallelLinear workspace shim skipped: %s",
            exc,
        )
        return

    if cls is None:
        return
    if getattr(cls, "_verl_fp8_workspace_shim_applied", False):
        # Already patched; wrapping again would only stack wrappers.
        return

    orig_init = cls.__init__

    @functools.wraps(orig_init)
    def _wrapped_init(self, *args, **kwargs):
        orig_init(self, *args, **kwargs)
        # Only fill the gap; never overwrite a workspace the original
        # constructor already set.
        if not hasattr(self, "workspace"):
            self.workspace = None

    cls.__init__ = _wrapped_init
    cls._verl_fp8_workspace_shim_applied = True
    logger.info("vLLM FP8 (Marlin): applied QKVParallelLinear workspace shim")


def is_fp8_model(vllm_config):
from vllm.model_executor.layers.quantization.fp8 import Fp8Config

Expand Down Expand Up @@ -514,7 +478,6 @@ def apply_vllm_fp8_patches():
return

logger.info("Applying vllm fp8 patches for blockwise quantization")
_patch_vllm_qkvparallellinear_workspace_attr()

vllm_ver = version.parse(vllm.__version__)
is_vllm_12_or_later = vllm_ver >= version.parse("0.12.0")
Expand Down