Changes from 1 commit

Commits (48)
- 5e5e9db (Oct 22, 2025): Fix: WAN 2.2 I2V boundary detection, AdamW8bit OOM crash, and add gra…
- 12e2b37 (Oct 28, 2025): Improve video training with better bucket allocation
- a1f70bc (Oct 28, 2025): Fix MoE training: per-expert LR logging and param group splitting
- a2749c5 (Oct 29, 2025): Add progressive alpha scheduling and comprehensive metrics tracking f…
- 86d107e (Oct 29, 2025): Merge remote-tracking branch 'upstream/main'
- c91628e (Oct 29, 2025): Update README with comprehensive fork documentation and alpha schedul…
- 61143d6 (Oct 29, 2025): Add comprehensive beginner-friendly documentation and UI improvements
- 96b1bda (Oct 29, 2025): Remove sponsors section from README - this is a fork without sponsors
- bce9866 (Oct 29, 2025): Fix confusing expert metrics display - add current training status
- bd45a9e (Oct 29, 2025): Fix UnboundLocalError: remove redundant local 'import os'
- abbe765 (Oct 29, 2025): Add metrics API endpoint and UI components for real-time training mon…
- edaf27d (Oct 29, 2025): Fix: Always show Loss Trend Analysis section with collection progress
- a551b65 (Oct 29, 2025): Fix: SVG charts now display correctly - add viewBox for proper coordi…
- 1682199 (Oct 30, 2025): Fix: Downsample metrics to 500 points and lower phase transition thre…
- 885bbd4 (Oct 30, 2025): Add comprehensive training recommendations based on research
- 705c5d3 (Oct 30, 2025): Fix TRAINING_RECOMMENDATIONS for motion training
- 54c059a (Oct 30, 2025): Fix metrics to use EMA instead of simple averages
- 20b3c12 (Oct 30, 2025): FIX CRITICAL BUG: Training loop re-doing checkpoint step on resume
- 226d19d (Oct 30, 2025): Remove useless checkpoint analyzer script
- 66978dd (Oct 30, 2025): Fix: Export EMA metrics to JSONL for UI visualization
- fa12a08 (Oct 30, 2025): Fix: Optimizer state loading counting wrong number of params for MoE
- 264c162 (Oct 30, 2025): Fix: Set current_expert_name for metrics tracking
- aecc467 (Oct 31, 2025): Fix alpha scheduler not loading for MoE models on resume
- b1ea60f (Nov 4, 2025): feat: Add SageAttention support for Wan models
- 20d689d (Nov 4, 2025): Fix CRITICAL metrics regression: boundary misalignment on resume + ad…
- 8b8506c (Nov 4, 2025): Merge feature/sageattention-wan-support into main
- 6a7ecac (Nov 4, 2025): docs: Update README with SageAttention and metrics fixes
- 850db0f (Nov 4, 2025): docs: Update installation instructions to use PyTorch nightly
- 26e9bdb (Nov 4, 2025): docs: Major README overhaul - Focus on Wan 2.2 I2V optimization
- 88785a9 (Nov 4, 2025): docs: Fix Blackwell CUDA requirements - CUDA 13.0 not 12.8
- 0cacab8 (Nov 4, 2025): Fix: torchao quantized tensors don't support copy argument in .to()
- 3ad8bfb (Nov 4, 2025): Fix critical FP16 hardcoding causing low-noise training instability
- 8589967 (Nov 4, 2025): Fix metrics UI cross-contamination in per-expert windows
- 47dff0d (Nov 4, 2025): Fix FP16 hardcoding in TrainSliderProcess mask processing
- eeeeb2e (Nov 4, 2025): Fix LR scheduler stepping to respect gradient accumulation
- f026f35 (Nov 5, 2025): CRITICAL: Fix VAE dtype mismatch in Wan encode_images
- c7c3459 (Nov 5, 2025): CRITICAL: Revert CFG-zero to be optional (match Ostris Nov 4 update)
- 728b46d (Nov 5, 2025): CRITICAL: Fix multiple SageAttention bugs causing training instability
- 7c9b205 (Nov 5, 2025): Additional SageAttention and VAE dtype refinements
- 1d9dc98 (Nov 5, 2025): Fix rotary embedding application to match Diffusers WAN reference
- 67445b9 (Nov 5, 2025): Add temporal_jitter parameter for video frame sampling
- ab59f00 (Nov 5, 2025): Document temporal_jitter feature in README
- 80ff3db (Nov 5, 2025): Fix VAE dtype handling for WAN 2.2 I2V training to prevent blurry sam…
- 384ce94 (Nov 6, 2025): Fix MoE UI metrics bugs and optimizer state restoration
- b7cf917 (Nov 7, 2025): Disable SageAttention for training (inference-only)
- 55b1dc2 (relaxis, Nov 7, 2025): Revise README for SageAttention and feature updates
- fd208dc (relaxis, Nov 7, 2025): Update README to reflect changes and optimizations
- e1570af (relaxis, Nov 7, 2025): Revise README for alpha scheduling and metrics updates
Fix: torchao quantized tensors don't support copy argument in .to()
Fixes RuntimeError when loading models with torchao quantization. The
_ensure_cpu_pinned function now checks if a tensor is quantized before
attempting to move it to CPU, avoiding the use of copy=True for quantized
tensors that don't support this argument (e.g., AffineQuantizedTensor).

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <[email protected]>
AI Toolkit Contributor and claude committed Nov 4, 2025
commit 0cacab851277d79bc7eeb09ecd3be542c24cd0ca
13 changes: 11 additions & 2 deletions toolkit/memory_management/manager_modules.py
```diff
@@ -98,10 +98,19 @@ def _is_quantized_tensor(t: Optional[torch.Tensor]) -> bool:
 def _ensure_cpu_pinned(t: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
     if t is None:
         return None
+    # Check if quantized BEFORE moving to CPU, as some quantized tensor types
+    # (e.g., torchao's AffineQuantizedTensor) don't support the copy argument
+    is_quantized = _is_quantized_tensor(t)
+
     if t.device.type != "cpu":
-        t = t.to("cpu", copy=True)
+        # Use copy=True for regular tensors, but not for quantized tensors
+        if is_quantized:
+            t = t.to("cpu")
+        else:
+            t = t.to("cpu", copy=True)
 
-    if _is_quantized_tensor(t):
+    # Don't attempt to pin quantized tensors; many backends don't support it
+    if is_quantized:
         return t
     if torch.cuda.is_available():
         try:
```
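The branch the commit introduces can be exercised in isolation. The sketch below is a minimal stand-in, not the toolkit's code: `FakeTensor`, `FakeQuantizedTensor`, and `ensure_cpu` are hypothetical names, and the quantized class simply omits the `copy` keyword from its `.to()` signature to mirror the behavior attributed to torchao's `AffineQuantizedTensor`.

```python
class FakeTensor:
    """Stand-in for a regular tensor: .to() accepts the copy keyword."""

    def __init__(self, device: str = "cuda"):
        self.device = device

    def to(self, device: str, copy: bool = False):
        # A real tensor would optionally copy its storage; the sketch only
        # needs to record the destination device.
        return type(self)(device)


class FakeQuantizedTensor(FakeTensor):
    """Stand-in for a quantized tensor whose .to() takes no copy keyword."""

    def to(self, device: str):
        # Passing copy=True here raises TypeError, mimicking the reported
        # failure when loading torchao-quantized models.
        return type(self)(device)


def ensure_cpu(t):
    """Move t to CPU, using copy=True only for non-quantized tensors."""
    if t is None:
        return None
    is_quantized = isinstance(t, FakeQuantizedTensor)
    if t.device != "cpu":
        # The guarded dispatch from the patch: quantized tensors get a plain
        # .to("cpu"), regular tensors keep the explicit copy.
        t = t.to("cpu") if is_quantized else t.to("cpu", copy=True)
    return t
```

Calling `FakeQuantizedTensor().to("cpu", copy=True)` directly raises `TypeError`, which is the crash the guard avoids; `ensure_cpu` moves both tensor kinds to CPU without tripping it.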