NVIDIA · lucaslie · Dec 2, 2025 · Nov 27, 2025
@@ -1,4 +1,9 @@
-"""Cached attention op for chunked delta rule using the fla kernel library."""
+"""Cached attention op for delta rule using the fla kernel library.
+
+Delta Rule is based on this paper: https://arxiv.org/abs/2406.06484
+
+Kernels are based on this repo: https://github.com/fla-org/flash-linear-attention
+"""
 
 from typing import List, Tuple
 

@@ -1,4 +1,9 @@
-"""Custom ops corresponding to fla's chunked delta rule."""
+"""Custom ops corresponding to fla's chunked delta rule.
+
+Delta Rule is based on this paper: https://arxiv.org/abs/2406.06484
+
+Kernels are based on this repo: https://github.com/fla-org/flash-linear-attention
+"""
 
 from typing import Optional
 

@@ -4,10 +4,6 @@
 
 from tensorrt_llm._torch.modules.fla.l2norm import l2norm_fwd
 
-# TODO: add a pattern matcher for this such that
-# 1. pattern match to torch_l2norm
-# 2. fuse transform to map to desired backend like fla
-
 
 @torch.library.custom_op("auto_deploy::torch_l2norm", mutates_args=())
 def _torch_l2norm(x: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:

@@ -70,11 +70,13 @@ def torch_rmsnorm(input: torch.Tensor, weight: torch.Tensor, eps: float) -> torc
         weight: Scaling weights for the normalized output.
         eps: Small constant for numerical stability.
     """
-    input_dtype = input.dtype
+    # pre-allocate output so it keeps the input's dtype and (for dense inputs) its strides
+    out = torch.empty_like(input)
     input = input.to(torch.float32)
     variance = input.pow(2).mean(-1, keepdim=True)
     input = input * torch.rsqrt(variance + eps)
-    return (weight * input.to(input_dtype)).contiguous()
+    out.copy_((weight * input.to(out.dtype)))
+    return out
 
 
 @torch_rmsnorm.register_fake