We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
1 parent b10c17c · commit 1c53561 — Copy full SHA for 1c53561
flash_rl/vllm_patch.py
@@ -108,7 +108,12 @@ def hacked_process_weights_after_loading(
108
quant_method = getattr(module, "quant_method", None)
109
if isinstance(quant_method, QuantizeMethodBase):
110
111
- if isinstance(quant_method, Fp8LinearMethod) or isinstance(quant_method, CompressedTensorsW8A8Int8):
+ if isinstance(quant_method, Fp8LinearMethod):
112
+ # for fast processing, we will do manual processing later
113
+ assert not quant_method.use_marlin, 'marlin (w8a16) does not support fp8_fast processing'
114
+ continue
115
+
116
+ if isinstance(quant_method, CompressedTensorsW8A8Int8):
117
# for fast processing, we will do manual processing later
118
continue
119
0 commit comments