Merged
Commits
22 commits
578d06e  Refactor connection to autograd with new joint trace creation (beverlylytle, May 20, 2025)
b04654a  apply update_fusion_call_ctx (beverlylytle, May 21, 2025)
0c19438  check bw for None (beverlylytle, May 21, 2025)
f0182ee  don't fuse get_grad (beverlylytle, May 23, 2025)
440bc96  group get_grads together for torch compile fusions (beverlylytle, May 23, 2025)
6bb4293  remove torchex impl of get_grad in favor of OpExProcessor exception (beverlylytle, May 23, 2025)
01dea9d  Merge branch 'main' into reautograd2 (beverlylytle, May 26, 2025)
f45c92d  hide behind flag and clean up (beverlylytle, May 26, 2025)
eb32063  Xfail test_ddp_grad_bucketing (IvanYashchuk, May 27, 2025)
32b9675  Xfail test_limit_in_flight_allgathers with bucketing (IvanYashchuk, May 27, 2025)
675ea02  Xfail test_fsdp_with_no_sync_grad_accumulation (IvanYashchuk, May 27, 2025)
aafc899  Xfail test_fsdp_grad_parity_with_without_bucketing (IvanYashchuk, May 27, 2025)
9b4b252  Fix test_rematerialize_all_gather (IvanYashchuk, May 27, 2025)
a27f390  Restore test_torch_compile_cat_rope_single_fusion (IvanYashchuk, May 27, 2025)
56625f6  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], May 27, 2025)
971c52e  remove extra rematerialization (beverlylytle, May 27, 2025)
d059353  Merge branch 'main' into reautograd2 (beverlylytle, May 28, 2025)
9cc8c78  remove outdated change (beverlylytle, May 28, 2025)
67cde6e  Merge branch 'main' into reautograd2 (beverlylytle, Jun 3, 2025)
e1308e1  clean up after merge (beverlylytle, Jun 3, 2025)
b73702a  more clean up (beverlylytle, Jun 3, 2025)
d39f270  Merge branch 'main' into reautograd2 (beverlylytle, Jun 3, 2025)
thunder/__init__.py (36 additions, 12 deletions)
@@ -551,16 +551,26 @@ def apply_transforms_and_build_cache_entry(cd, cs, cache_info, prologue_trc, com
else:
requires_grad = False

delay_trace_split = compile_options.get("delay_trace_split", True)

if requires_grad:
# Currently split_forward_backward also includes
# transform_for_execution and various sorting of symbols,
# applying transform_for_execution after this would be
# breaking the order of operations
computation_trc, backward_trc = split_forward_backward(computation_trc, cd, cs, *computation_trc.args)
# Note computation_trc and backward_trc have been appended to cs.last_(backward_)traces
# by split_forward_backward

if not requires_grad:
if delay_trace_split:

from thunder.transforms.autodiff import grad_transform_on_trace

computation_trc = grad_transform_on_trace(computation_trc)
else:
# Currently split_forward_backward also includes
# transform_for_execution and various sorting of symbols,
# applying transform_for_execution after this would be
# breaking the order of operations
computation_trc, backward_trc = split_forward_backward(
computation_trc, cd, cs, *computation_trc.args
)
# Note computation_trc and backward_trc have been appended to cs.last_(backward_)traces
# by split_forward_backward

if backward_trc is None:
from thunder.executors.passes import transform_for_execution as transform_for_execution_pass
from thunder.executors.passes import _transform_for_operator_executor_execution
from thunder.distributed.utils import maybe_sort_waits
@@ -576,9 +586,23 @@ def apply_transforms_and_build_cache_entry(cd, cs, cache_info, prologue_trc, com
executors_list=cd.executors_list,
use_del_last_used=False,
)
computation_traces.extend(extraces)
computation_trc = computation_traces[-1]
computation_trc = thunder.executors.passes.del_last_used(computation_trc)
computation_trc = extraces[-1]

if requires_grad and delay_trace_split:
from thunder.core.rematerialization import rematerialize
from thunder.executors.passes import update_fusion_call_ctx
from thunder.transforms.autodiff import split_into_forward_and_backward

computation_trc = rematerialize(computation_trc)
computation_trc = update_fusion_call_ctx(computation_trc)
computation_trc = dce(computation_trc)
computation_trc, backward_trc = split_into_forward_and_backward(computation_trc)

computation_trc = thunder.executors.passes.del_last_used(computation_trc)
computation_traces.append(computation_trc)
if backward_trc is not None:
backward_trc = thunder.executors.passes.del_last_used(backward_trc, clear_mutable_collections=True)
backward_traces.append(backward_trc)

if not compile_options.get("disable_inplace_copy_check", False):
thunder.core.transform_common._inplace_copy_sanity_check(computation_trc)
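
For orientation, here is a minimal sketch of how the new delay_trace_split path could be exercised from user code; it is not part of this diff and assumes that thunder.jit forwards extra keyword arguments into compile_options, the dictionary queried by compile_options.get("delay_trace_split", True) above.

import torch
import thunder

def fn(x):
    return (x * x).sum()

# Hedged sketch: the delay_trace_split flag is assumed to travel through thunder.jit
# keyword arguments into compile_options.
jfn = thunder.jit(fn, delay_trace_split=True)

x = torch.randn(4, requires_grad=True)
jfn(x).backward()

# With the delayed split, the joint trace is built by grad_transform_on_trace and
# only separated into forward and backward traces at the end of the pipeline;
# the resulting traces remain inspectable the usual way.
fw_trace = thunder.last_traces(jfn)[-1]
bw_trace = thunder.last_backward_traces(jfn)[-1]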
thunder/core/transform_common.py (3 additions, 0 deletions)
@@ -286,6 +286,9 @@ def cse_single_bsym(
skip_output=True,
)

if bsym.sym.id == prims.PrimIDs.GET_GRAD:
return new_bsym

# Skip appending this bsym to the new bound symbols due to its rhs being a common subexpression.
rhs = new_bsym.rhs
if (prior_bsym := rhs_to_bsym_map.get(rhs)) is not None and bsym._executor is prior_bsym._executor:
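
A toy sketch of the idea behind this exemption (plain Python with hypothetical op names, not Thunder's actual CSE code): deduplication keyed on the right-hand side would otherwise collapse repeated get_grad calls on the same forward output, while the forward/backward split later in this PR gives each occurrence its own cotangent slot, so every call must keep a distinct result.

rhs_to_result = {}

def cse(op, *args):
    key = (op, args)
    if op != "get_grad" and key in rhs_to_result:  # get_grad bypasses the lookup
        return rhs_to_result[key]
    result = f"{op}_{len(rhs_to_result)}"
    rhs_to_result[key] = result
    return result

print(cse("mul", "x", "x"), cse("mul", "x", "x"))      # mul_0 mul_0 (deduplicated)
print(cse("get_grad", "out"), cse("get_grad", "out"))  # get_grad_1 get_grad_2 (kept distinct)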
thunder/executors/passes.py (1 addition, 1 deletion)
@@ -43,7 +43,7 @@ def _transform_for_operator_executor_execution(trace: TraceCtx, executors_list:
# - if none of the above apply and we have a prim, raise an error
class OpExProcessor(TraceSubstitutionProcessor):
def process_bsym(self, bsym: BoundSymbol) -> None:
if bsym.sym.python_impl is not None:
if bsym.sym.python_impl is not None or bsym.sym.id == prims.PrimIDs.GET_GRAD:
# keep the bound symbol and use the python impl
self.add_processed_bsyms([bsym])
self.set_result(bsym.output)
thunder/executors/torch_autograd.py (3 additions, 0 deletions)
@@ -252,6 +252,9 @@ def split_forward_backward(computation_trc: TraceCtx, compile_data, compile_stat
# the forward trace and inputs of the backward trace.
fw_trace, bw_trace = forward_and_backward_from_trace(primal_trace, torch_autograd=True)

if bw_trace is None:
return fw_trace, None

fw_traces = [fw_trace]
bw_traces = [bw_trace]

thunder/executors/torch_compile.py (1 addition, 1 deletion)
@@ -256,4 +256,4 @@ def cuda_device_checker(*args, **kwargs):

torch_compile_ex = TorchCompileExecutor(name="torchcompile")
register_executor(torch_compile_ex)
torch_compile_ex._implmap = {op: ImplInfo() for op in pytorch_ex.implmap}
torch_compile_ex._implmap = {op: ImplInfo() for op in pytorch_ex.implmap if op != prims.PrimIDs.GET_GRAD}
thunder/executors/torchex.py (1 addition, 0 deletions)
@@ -2339,6 +2339,7 @@ def _shape_impl(t):
shape = ex.register_operator("shape", meta=prims.shape_meta, fn=_shape_impl)
_register_implementation(prims.shape, shape, checker=_always_executable)


shallow_copy = ex.register_operator("shallow_copy", meta=prims.shallow_copy, fn=lambda x: x)
_register_implementation(prims.shallow_copy, shallow_copy, checker=_always_executable)

thunder/tests/test_examine_memory.py (2 additions, 2 deletions)
@@ -115,5 +115,5 @@ def test_nanogpt_block():
# We are checking the estimated memory against a fixed value for consistency.
assert max_mem_fw[0] == 381754368
assert sum(max_mem_fw[1].values()) == 375462912
assert max_mem_bw[0] == 437292032
assert sum(max_mem_bw[1].values()) == 34642944
assert max_mem_bw[0] == 641097728
assert sum(max_mem_bw[1].values()) == 440474624
thunder/tests/test_torch_compile_executor.py (1 addition, 1 deletion)
@@ -85,7 +85,7 @@ def test_torch_compile_cat_rope_single_fusion():

backward_execution_trace = thunder.last_backward_traces(jfn)[-1]
assert len(get_fusions(backward_execution_trace)) == 1
assert len(backward_execution_trace.bound_symbols) == 14
assert len(backward_execution_trace.bound_symbols) == 17


@pytest.mark.skipif(not is_inductor_supported() or platform.system() == "Windows", reason="inductor unsupported")
thunder/transforms/autodiff.py (27 additions, 3 deletions)
@@ -339,6 +339,9 @@ def shallow_copy_if_input(p):
trace, _ = AugmentedForwardProcessor(trace)()
# run through DCE in case some of the gradients of intermediates are not needed.
trace = thunder.core.transform_common.dce(trace)
# group get_grad symbols together for torch compile fusions
# !!! is it preferable to do this here or in the torch compile fusion pass?
_group_get_grad_bsyms(trace)

end_time_ns = time.perf_counter_ns()
elapsed_time_ns = end_time_ns - start_time_ns
@@ -349,6 +352,19 @@
return trace


def _group_get_grad_bsyms(trace):
i = 0
n = len(trace.bound_symbols)
while i < n and trace.bound_symbols[i].sym != prims.get_grad:
i += 1
if i == n:
return
get_grad_bsyms = list(filter(lambda bsym: bsym.sym == prims.get_grad, trace.bound_symbols))
bsyms = list(filter(lambda bsym: bsym.sym != prims.get_grad, trace.bound_symbols))
bsyms = bsyms[:i] + list(get_grad_bsyms) + bsyms[i:]
trace.bound_symbols = bsyms
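
Illustratively, this grouping is a stable reorder: every get_grad bound symbol is hoisted to the position of the first one, while the relative order of all other symbols is preserved. A small stand-alone sketch with strings standing in for bound symbols (hypothetical names, not Thunder objects):

def group_get_grads(symbols, is_get_grad):
    # index of the first get_grad occurrence, or None if there is none
    first = next((i for i, s in enumerate(symbols) if is_get_grad(s)), None)
    if first is None:
        return symbols
    grads = [s for s in symbols if is_get_grad(s)]
    rest = [s for s in symbols if not is_get_grad(s)]
    # the prefix before the first get_grad is unchanged; all get_grads follow as one block
    return rest[:first] + grads + rest[first:]

syms = ["unpack", "mul", "get_grad(o1)", "add", "get_grad(o2)", "return"]
print(group_get_grads(syms, lambda s: s.startswith("get_grad")))
# ['unpack', 'mul', 'get_grad(o1)', 'get_grad(o2)', 'add', 'return']

Per the commit message, keeping the get_grad calls adjacent is meant to help the torch.compile executor form contiguous fusion regions.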


def split_into_forward_and_backward(joint_trace):
"""split a joint trace for forward and backward into separate ones, including recomputation (aka activation checkpointing)"""

@@ -376,7 +392,10 @@ def split_into_forward_and_backward(joint_trace):
assert isinstance(fw_output, tuple)

grad_outs = [None for _ in fw_output]
output_pos = {o.name: i for i, o in enumerate(fw_output) if isinstance(o, thunder.TensorProxy)}
output_pos = {}
for i, o in enumerate(fw_output):
if isinstance(o, thunder.TensorProxy):
output_pos.setdefault(o.name, []).append(i)

# the proxies we need to compute in the forward - we start with the outputs of the forward
forward_proxy_names = {o.name for o in thunder.core.pytree.tree_iter(fw_output) if isinstance(o, thunder.Proxy)}
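
The switch from a single index to a list of indices handles a forward output tuple that contains the same tensor proxy more than once; each matching get_grad then consumes the next recorded position via pop(0), as shown in the hunk below. A minimal stand-alone illustration with strings in place of proxies and grads (hypothetical values, assuming the get_grad symbols appear in the same order as the outputs):

fw_output_names = ["t0", "t1", "t0"]  # the same proxy returned twice
output_pos = {}
for i, name in enumerate(fw_output_names):
    output_pos.setdefault(name, []).append(i)   # {'t0': [0, 2], 't1': [1]}

grad_outs = [None] * len(fw_output_names)
# each get_grad pops the next position recorded for its output proxy
for name, grad in [("t0", "g0_first"), ("t1", "g1"), ("t0", "g0_second")]:
    grad_outs[output_pos[name].pop(0)] = grad
print(grad_outs)  # ['g0_first', 'g1', 'g0_second']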
@@ -412,11 +431,11 @@

# get grad is always part of the input, record the grad_out (will be part of the "cotangents" list)
if bsym.sym == prims.get_grad:
grad_outs[output_pos[bsym.args[0].name]] = bsym.output
grad_outs[output_pos[bsym.args[0].name].pop(0)] = bsym.output
continue

# copy_ updating a forward proxy is special regardless of the output
if bsym.sym == prims.copy_ and bsym.args[1].name in forward_proxy_names:
if (bsym.sym == prims.copy_ or bsym.sym.name == "copy_") and bsym.args[1].name in forward_proxy_names:
# todo: should we also handle ltorch.copy_ ?
forward_part_bsyms.insert(0, bsym.from_bsym())
forward_proxy_names.update(a.name for a in bsym.flat_proxy_args)
@@ -467,6 +486,11 @@
with thunder.core.trace.tracectx(forward_trace):
prims.python_return(fw_output_dict, (saved_for_backward_tensors, saved_for_backward_other))

if len(backward_part_bsyms) == 0 and not any(
[True if arg is not None else False for arg in return_bsym.args[0]["grad_flat_args"]]
):
return forward_trace, None

# then we construct the backward trace, unpacking saved_for_backward and cotangents lists
def backward_fn(saved_for_backward, cotangents):
pass