Merged
Changes from 1 commit
Commits (22)
578d06e - Refactor connection to autograd with new joint trace creation (beverlylytle, May 20, 2025)
b04654a - apply update_fusion_call_ctx (beverlylytle, May 21, 2025)
0c19438 - check bw for None (beverlylytle, May 21, 2025)
f0182ee - don't fuse get_grad (beverlylytle, May 23, 2025)
440bc96 - group get_grads together for torch compile fusions (beverlylytle, May 23, 2025)
6bb4293 - remove torchex impl of get_grad in favor of OpExProcessor exception (beverlylytle, May 23, 2025)
01dea9d - Merge branch 'main' into reautograd2 (beverlylytle, May 26, 2025)
f45c92d - hide behind flag and clean up (beverlylytle, May 26, 2025)
eb32063 - Xfail test_ddp_grad_bucketing (IvanYashchuk, May 27, 2025)
32b9675 - Xfail test_limit_in_flight_allgathers with bucketing (IvanYashchuk, May 27, 2025)
675ea02 - Xfail test_fsdp_with_no_sync_grad_accumulation (IvanYashchuk, May 27, 2025)
aafc899 - Xfail test_fsdp_grad_parity_with_without_bucketing (IvanYashchuk, May 27, 2025)
9b4b252 - Fix test_rematerialize_all_gather (IvanYashchuk, May 27, 2025)
a27f390 - Restore test_torch_compile_cat_rope_single_fusion (IvanYashchuk, May 27, 2025)
56625f6 - [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], May 27, 2025)
971c52e - remove extra rematerialization (beverlylytle, May 27, 2025)
d059353 - Merge branch 'main' into reautograd2 (beverlylytle, May 28, 2025)
9cc8c78 - remove outdated change (beverlylytle, May 28, 2025)
67cde6e - Merge branch 'main' into reautograd2 (beverlylytle, Jun 3, 2025)
e1308e1 - clean up after merge (beverlylytle, Jun 3, 2025)
b73702a - more clean up (beverlylytle, Jun 3, 2025)
d39f270 - Merge branch 'main' into reautograd2 (beverlylytle, Jun 3, 2025)
don't fuse get_grad
beverlylytle committed May 23, 2025
commit f0182eef0107df26ff4673c0b1abbead0311e96c
thunder/core/transform_common.py (3 additions, 0 deletions)
@@ -286,6 +286,9 @@ def cse_single_bsym(
        skip_output=True,
    )

    if bsym.sym.id == prims.PrimIDs.GET_GRAD:
        return new_bsym

    # Skip appending this bsym to the new bound symbols due to its rhs being a common subexpression.
    rhs = new_bsym.rhs
    if (prior_bsym := rhs_to_bsym_map.get(rhs)) is not None and bsym._executor is prior_bsym._executor:
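For context, a minimal standalone sketch of why CSE has to leave get_grad alone. This is a toy model with plain tuples standing in for bound symbols and invented names, not Thunder's cse_single_bsym: two get_grad calls on the same tensor stand for two distinct cotangent inputs, so deduplicating them by their right-hand side would silently drop a gradient.

# Toy sketch (assumed example, not Thunder code): bound symbols are modeled as
# (sym_id, args, output) triples and CSE keys on the right-hand side.
def toy_cse(bsyms, skip_ids=frozenset({"get_grad"})):
    seen = {}      # rhs -> output name of the first occurrence
    rename = {}    # duplicate output name -> canonical output name
    kept = []
    for sym_id, args, out in bsyms:
        args = tuple(rename.get(a, a) for a in args)
        rhs = (sym_id, args)
        if sym_id not in skip_ids and rhs in seen:
            rename[out] = seen[rhs]  # reuse the earlier result, drop this bsym
            continue
        seen.setdefault(rhs, out)
        kept.append((sym_id, args, out))
    return kept

trace = [
    ("mul", ("x", "x"), "y"),
    ("mul", ("x", "x"), "z"),    # genuine common subexpression, safe to drop
    ("get_grad", ("y",), "g0"),
    ("get_grad", ("y",), "g1"),  # must survive: a second, distinct cotangent slot
]
print(toy_cse(trace))
# Without the skip_ids exemption, g1 would be aliased to g0 and one gradient input lost.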
thunder/executors/torchex.py (12 additions, 0 deletions)
@@ -250,6 +250,7 @@ def _arange_transform(


def _get_grad_transform(a: TensorProxy) -> TensorProxy:
    # return a.grad
    return prims.get_grad(a)


@@ -259,6 +260,7 @@ def _put_grad_transform(a: TensorProxy, val: TensorProxy) -> None:
    # force that and recurse so that the symbol is not removed/no error is raised.
    # This is breaking the intent of transform_for_operator_executor_execution, in that
    # it is not executable. But there will be postprocessing that will make it executable.
    # a.grad = val
    prims.put_grad(a, val)
    return None

@@ -2357,6 +2359,16 @@ def _shape_impl(t):
shape = ex.register_operator("shape", meta=prims.shape_meta, fn=_shape_impl)
_register_implementation(prims.shape, shape, checker=_always_executable)


# def _grad_impl(t):
#     t.retain_grad()
#     return t.grad


# grad = ex.register_operator("get_grad", meta=prims.get_grad, fn=_grad_impl)
# _register_implementation(prims.get_grad, grad, checker=_always_executable)


shallow_copy = ex.register_operator("shallow_copy", meta=prims.shallow_copy, fn=lambda x: x)
_register_implementation(prims.shallow_copy, shallow_copy, checker=_always_executable)

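A plain PyTorch illustration of the design choice above (an analogy, not Thunder code): the value that get_grad(y) stands for is the cotangent supplied to the backward pass, much like grad_outputs below, so it cannot be read eagerly while the forward is being built. Leaving the torchex registration commented out keeps the prim symbolic until the joint trace is split into forward and backward.

# Plain PyTorch analogy (assumption: mirrors the role of get_grad, not Thunder code).
import torch

x = torch.randn(3, requires_grad=True)
y = x * 2.0
cotangent = torch.ones_like(y)                     # plays the role of get_grad(y)
(gx,) = torch.autograd.grad(y, x, grad_outputs=cotangent)
print(gx)                                          # tensor([2., 2., 2.])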
thunder/transforms/autodiff.py (7 additions, 3 deletions)
@@ -339,7 +339,11 @@ def split_into_forward_and_backward(joint_trace):
    assert isinstance(fw_output, tuple)

    grad_outs = [None for _ in fw_output]
-    output_pos = {o.name: i for i, o in enumerate(fw_output) if isinstance(o, thunder.TensorProxy)}
+    output_pos = {}
+    for i, o in enumerate(fw_output):
+        if isinstance(o, thunder.TensorProxy):
+            output_pos.setdefault(o.name, []).append(i)
+    # output_pos = {o.name: i for i, o in enumerate(fw_output) if isinstance(o, thunder.TensorProxy)}

    # the proxies we need to compute in the forward - we start with the outputs of the forward
    forward_proxy_names = {o.name for o in thunder.core.pytree.tree_iter(fw_output) if isinstance(o, thunder.Proxy)}
@@ -374,8 +378,8 @@
            continue

        # get grad is always part of the input, record the grad_out (will be part of the "cotangents" list)
-        if bsym.sym == prims.get_grad:
-            grad_outs[output_pos[bsym.args[0].name]] = bsym.output
+        if bsym.sym == prims.get_grad or bsym.sym.id == "get_grad":
+            grad_outs[output_pos[bsym.args[0].name].pop(0)] = bsym.output
            continue

        # copy_ updating a forward proxy is special regardless of the output
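A minimal sketch of the output_pos bookkeeping changed above, with plain strings standing in for tensor proxies (illustrative names only). When the same proxy appears at several positions in the forward output, each position needs its own cotangent slot, so output_pos now maps a name to a list of indices and each get_grad consumes the next one with pop(0).

# Toy sketch (assumed example): strings stand in for tensor proxies.
fw_output = ("t0", "t1", "t0")            # "t0" is returned twice
grad_outs = [None] * len(fw_output)

output_pos = {}
for i, name in enumerate(fw_output):
    output_pos.setdefault(name, []).append(i)
# output_pos == {"t0": [0, 2], "t1": [1]}

# one get_grad per forward output, in output order
for name, cotangent in [("t0", "g0"), ("t1", "g1"), ("t0", "g2")]:
    grad_outs[output_pos[name].pop(0)] = cotangent

print(grad_outs)    # ['g0', 'g1', 'g2']
# The previous {name: index} dict mapped both get_grad("t0") calls to the same slot,
# overwriting one cotangent and leaving the other position as None.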