Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
62 commits
Select commit Hold shift + click to select a range
5873742
dtensor support
kshitij12345 Mar 21, 2025
377125a
add comment
kshitij12345 Mar 24, 2025
7ab82f6
add more comments
kshitij12345 Mar 24, 2025
e6aa8d3
update comment
kshitij12345 Mar 24, 2025
e76fc17
add test for expected failing cases
kshitij12345 Mar 24, 2025
eaac9f7
support for method
kshitij12345 Mar 24, 2025
94ef69d
update failing case test
kshitij12345 Mar 24, 2025
5d81851
remove generated traces
kshitij12345 Mar 26, 2025
7277753
undo pre-commit change
kshitij12345 Mar 26, 2025
a8c58e4
undo debug changes
kshitij12345 Mar 26, 2025
d87b103
update failing test to use thunder.jit
kshitij12345 Mar 26, 2025
b101161
update registration helper
kshitij12345 Mar 26, 2025
b551cb8
Apply suggestions from code review
kshitij12345 Mar 31, 2025
1c75a80
Merge branch 'main' of github.com:Lightning-AI/lightning-thunder into…
kshitij12345 Apr 1, 2025
5854c86
address review and update
kshitij12345 Apr 1, 2025
a778830
update dtensor proxy repr
kshitij12345 Apr 2, 2025
41990d0
Merge branch 'main' of github.com:Lightning-AI/lightning-thunder into…
kshitij12345 Apr 2, 2025
eda0277
Merge branch 'main' of github.com:Lightning-AI/lightning-thunder into…
kshitij12345 Apr 3, 2025
8abf040
Merge branch 'main' of github.com:Lightning-AI/lightning-thunder into…
kshitij12345 Apr 9, 2025
225f2e3
update jit_ext access to torchfn_to_thunder registry : test
kshitij12345 Apr 9, 2025
2b85b31
empty commit
kshitij12345 Apr 9, 2025
5d0296f
Revert "update jit_ext access to torchfn_to_thunder registry : test"
kshitij12345 Apr 10, 2025
dedab03
temp commit
kshitij12345 Apr 15, 2025
efaae1d
Merge branch 'main' of github.com:Lightning-AI/lightning-thunder into…
kshitij12345 Apr 24, 2025
ddcf208
update to manual decomp
kshitij12345 Apr 24, 2025
6a6bf11
add manual grad rule
kshitij12345 Apr 24, 2025
2a8ea02
update
kshitij12345 May 14, 2025
9490cac
update - clean-up
kshitij12345 May 14, 2025
bd1ecbb
update attrs on DTensorProxy
kshitij12345 May 14, 2025
5d1f20b
Merge branch 'main' of github.com:Lightning-AI/lightning-thunder into…
kshitij12345 May 14, 2025
255c82d
remove debug change
kshitij12345 May 15, 2025
dba02d2
remove unused imports
kshitij12345 May 15, 2025
e8f6d0b
remove unused import
kshitij12345 May 15, 2025
83ae80a
update function name
kshitij12345 May 15, 2025
4cd9bec
cotangent metadata check initial support
kshitij12345 May 16, 2025
b49df80
address review : p1
kshitij12345 May 16, 2025
b206632
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 16, 2025
b70238a
address review
kshitij12345 May 16, 2025
1ff76b3
Merge branch 'dtensor-init-support' of https://github.com/kshitij1234…
kshitij12345 May 16, 2025
3e295da
update and refactor
kshitij12345 May 16, 2025
8f4a029
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 16, 2025
a445053
update and add test
kshitij12345 Jun 6, 2025
530d29c
update test
kshitij12345 Jun 6, 2025
56dc94e
Merge branch 'main' of https://github.com/Lightning-AI/lightning-thun…
kshitij12345 Jun 6, 2025
e285e97
undo changes for thunderfx path, fix later
kshitij12345 Jun 6, 2025
284d109
Move import of is_dtensor_proxy to the top of the file
IvanYashchuk Jun 11, 2025
b26e8b7
Move import of check_dtensor_spec_repr to the top of the file
IvanYashchuk Jun 11, 2025
a7eed41
format dtensor imports
IvanYashchuk Jun 11, 2025
8bd1b99
Move import of handle_check_dtensor_spec_in_prologue to the top of th…
IvanYashchuk Jun 11, 2025
8cad766
Return only fake tensors from run_with_fake_tensor
IvanYashchuk Jun 11, 2025
f4dc375
Remove unused aot_function
IvanYashchuk Jun 11, 2025
c8459a2
Remove TracingContext, tracing usage
IvanYashchuk Jun 11, 2025
2cf4c39
Inline FakeTensorMode into _run_with_fake
IvanYashchuk Jun 11, 2025
1949011
Rename _run_with_fake->run_with_fake_tensor
IvanYashchuk Jun 11, 2025
fdfbb79
Merge branch 'main' of https://github.com/Lightning-AI/lightning-thun…
kshitij12345 Jun 12, 2025
031eba0
address review
kshitij12345 Jun 12, 2025
d600791
Merge branch 'dtensor-init-support' of https://github.com/kshitij1234…
kshitij12345 Jun 12, 2025
384b7c6
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 12, 2025
8bc5228
update
kshitij12345 Jun 12, 2025
f688c98
Merge branch 'dtensor-init-support' of https://github.com/kshitij1234…
kshitij12345 Jun 12, 2025
27a4c26
xfail constant_folding test
kshitij12345 Jun 13, 2025
9291923
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 13, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion thunder/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -405,7 +405,13 @@ def _alias_tensor_of_args_kwargs_dict(*args, **kwargs) -> dict[int, list[int]]:
data_ptr_to_tensor_group_index = {}
tensor_group_index_to_tensor_indices = defaultdict(list)
for idx, t in enumerate(flat_args):
if pytorch.is_tensor(t) and t.layout == pytorch.strided:
# Using type(t) is pytorch.Tensor as TensorSubclasses don't support calling
# data_ptr().
# Eg. RuntimeError: Attempted to access the data pointer on an invalid python storage. (data_ptr access on TensorSubclass)
#
# isinstance(t, pytorch.Tensor) or pytorch.is_tensor(t) will match all Tensor objects including
# subclasses.
if type(t) is pytorch.Tensor and t.layout is pytorch.strided:
data_ptr = t.untyped_storage().data_ptr()
if data_ptr not in data_ptr_to_tensor_group_index:
data_ptr_to_tensor_group_index[data_ptr] = len(data_ptr_to_tensor_group_index)
Expand Down
10 changes: 10 additions & 0 deletions thunder/core/codeutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,11 @@ def to_printable(
if isinstance(x, ProxyInterface):
return x

from thunder.torch.experimental.dtensor_codeutils import populate_object_ctx_for_dtensor_spec

if populate_object_ctx_for_dtensor_spec(x, object_ctx):
return x

if dataclasses.is_dataclass(x):
# Add `class` to the object_ctx so that we can reuse it during the trace execution.
if isinstance(x, type): # dataclass type
Expand Down Expand Up @@ -236,6 +241,11 @@ def prettyprint(
if isinstance(x, ContextObject):
return m(x.name)

from thunder.torch.experimental.dtensor_codeutils import prettyprint_dtensor_spec

if dtensor_repr := prettyprint_dtensor_spec(x):
return m(dtensor_repr)

if dataclasses.is_dataclass(x):
# For a dataclass instance of class
# class MyContainer:
Expand Down
16 changes: 16 additions & 0 deletions thunder/core/jit_ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,15 @@
from thunder.torch import _torch_to_thunder_function_map
from thunder.clang import _clang_fn_set
from thunder.core.pytree import tree_map, tree_iter
from thunder.torch.experimental.dtensor_proxy import is_dtensor_proxy
from thunder.torch.experimental.dtensor_torch_and_prims import (
check_dtensor_spec_repr,
handle_check_dtensor_spec_in_prologue,
register_dtensor_torch_and_prims,
)

# TODO: Find a better place to register these ops (mostly in thunder/torch/__init__.py but without cyclical dependency).
register_dtensor_torch_and_prims()

#
# jit_ext.py implements extensions of thunder's interpreter
Expand Down Expand Up @@ -273,9 +282,12 @@ def proxify(self, value: WrappedValue) -> Any:

if p is not uvalue:
value.register_proxy(p)

# TODO: other caching modes
co: CACHE_OPTIONS = get_cache_option()
if co is CACHE_OPTIONS.CONSTANT_VALUES:
if is_dtensor_proxy(p):
self.add_constraint((check_dtensor_spec_repr, p, uvalue._spec))
self.add_constraint((clang.check_tensor_shape_and_metadata, p))
elif co is CACHE_OPTIONS.SYMBOLIC_VALUES:
# TODO: establish guarding logic to allow non-broadcast shape change
Expand Down Expand Up @@ -1880,6 +1892,10 @@ def is_variableified_tensorproxy(v: Variable | Proxy) -> Proxy:
if isinstance(s, Proxy):
unpack(s)

# Add checks for local tensor, mesh and placement of a DTensor
if handle_check_dtensor_spec_in_prologue(prim, prologue_trace, args):
continue

prim(*args)

cache_info = thunder._get_cache_info()
Expand Down
3 changes: 2 additions & 1 deletion thunder/core/prims.py
Original file line number Diff line number Diff line change
Expand Up @@ -1833,7 +1833,8 @@ def _get_grad_meta(a: Number | NumberProxy | TensorProxy, /) -> Number | TensorP
utils.check_type(a, (Number, NumberProxy, TensorProxy))

if isinstance(a, TensorProxy):
return TensorProxy(like=a)
# NOTE: `a` could be a TensorProxy subclass and its type should be preserved.
return type(a)(like=a)

# NOTE a is a Number in this branch
return numberproxy(pytype(a), 0)
Expand Down
6 changes: 6 additions & 0 deletions thunder/core/proxies.py
Original file line number Diff line number Diff line change
Expand Up @@ -2071,6 +2071,12 @@ def proxy(x: Any, *, name: str | None = None, history: None | tuple = None) -> A
if x is ...:
return AnyProxy(x, name=name, history=history)

# Import here to avoid cyclical dependency.
from thunder.torch.experimental.dtensor_proxy import proxify_dtensor

if (dtensor_proxy := proxify_dtensor(x, name, history)) is not None:
return dtensor_proxy

if isinstance(x, torch.Tensor):
return tensorproxy(x, name=name, history=history)

Expand Down
1 change: 1 addition & 0 deletions thunder/core/pytree.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ def tree_flatten(args, namespace=OPTREE_NAMESPACE):
torch.autograd.function.FunctionCtx,
immutable_list,
*torch.types.py_sym_types,
*((torch.distributed._tensor.DTensor,) if torch.distributed.is_available() else ()),
}
and not isinstance(args, (ProxyInterface))
and not is_likely_from_collections_namedtuple(args)
Expand Down
5 changes: 3 additions & 2 deletions thunder/core/transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
from thunder.core.langctxs import langctx, Languages
from thunder.core.pytree import tree_flatten, tree_map, tree_unflatten, tree_flatten_with_dataclass
from thunder.core.symbol import BoundSymbol, BoundSymbolInterface, Symbol, has_tags
from thunder.core.trace import TraceCtx as Trace
from thunder.core.trace import TraceCtx as Trace, get_tracectx
from thunder.core.trace import VariableInterface as Variable
from thunder.core.trace import (
detached_trace,
Expand Down Expand Up @@ -3127,7 +3127,8 @@ def forward_and_backward_from_trace(trace: Trace, torch_autograd=False) -> Forwa

def ones_like(x):
if isinstance(x, TensorProxy):
return full_like(x, fill_value=1)
# NOTE: x could be a subclass of TensorProxy and that should be preserved.
return type(x)(like=x)
elif isinstance(x, NumberProxy):
return type(x.value)(1)
else:
Expand Down
9 changes: 8 additions & 1 deletion thunder/dynamo/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import inspect
import itertools
import copy
from types import NoneType
from collections import defaultdict
from collections import namedtuple

Expand All @@ -14,6 +15,11 @@
from torch._subclasses.fake_tensor import FakeTensor
from torch.utils.weak import TensorWeakRef

if torch.distributed.is_available():
from torch.distributed.tensor import DTensor
else:
DTensor = NoneType

from thunder.torch.default_torch_ops import torch_auto_registered_ops
from thunder.torch import _torch_to_thunder_function_map
from thunder.torch.langctx import torchctx
Expand Down Expand Up @@ -508,7 +514,8 @@ def _get_storage_shape(t: torch.Tensor):


def _get_min_and_val(t: torch.Tensor) -> tuple[Number | None, Number | None]:
if isinstance(t, FakeTensor) or t.device.type == "meta" or t.numel() == 0:
# We assume that for TensorSubclass, `aminmax` is not supported which is true for FakeTensor and DTensor.
if (isinstance(t, torch.Tensor) and type(t) is not torch.Tensor) or t.device.type == "meta" or t.numel() == 0:
return None, None
if t.dtype in (torch.float8_e4m3fn, torch.float8_e4m3fnuz, torch.float8_e5m2, torch.float8_e5m2fnuz):
t = t.to(torch.float32)
Expand Down
5 changes: 5 additions & 0 deletions thunder/executors/torch_autograd.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,11 @@ def split_forward_backward(computation_trc: TraceCtx, compile_data, compile_stat
# We only want the forward function to be called with `te.fp8_autocast` manager.
bw_extrace._include_te_fp8_autocast = False

# We only want to apply it on backward trace.
from thunder.torch.experimental.dtensor_utils import check_dtensor_cotangent_metadata_in_backward

bw_extrace = check_dtensor_cotangent_metadata_in_backward(bw_extrace)

if len(bw_extrace.bound_symbols) == 1:
# only return, no unpacking, so no gradient is calculated
bw_extrace = None
Expand Down
120 changes: 120 additions & 0 deletions thunder/tests/distributed/test_dtensor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
import unittest

import pytest
import torch

if not torch.distributed.is_available():
pytest.skip(allow_module_level=True)

from thunder.dynamo import thunderfx
import thunder

from thunder.tests.distributed.helper import DistributedParallelTestCase
from torch.distributed._tensor import DeviceMesh, Shard, distribute_tensor
from torch.distributed.tensor.placement_types import Placement, Shard, Replicate


@unittest.skipUnless(
    torch.cuda.is_available() and torch.distributed.is_nccl_available(),
    "DTensor test requires CUDA and NCCL `torch.distributed` backend",
)
class DTensorTest(DistributedParallelTestCase):
    """Multi-process tests for DTensor inputs through thunder.jit and thunderfx."""

    def test_dtensor_basic_op(self):
        # Checks that a simple binary op on sharded DTensors matches a
        # torch.compile reference in both forward and backward.
        num_devices = self.world_size
        mesh = DeviceMesh("cuda", list(range(num_devices)))

        dim_size = 16

        def _helper(fn, in_dtensor, w_dtensor, compile_fn):
            # Forward: compare against a torch.compile baseline.
            expected = torch.compile(fn)(in_dtensor, w_dtensor)
            tmodel = compile_fn(fn)
            actual = tmodel(in_dtensor, w_dtensor)

            torch.testing.assert_close(actual, expected)

            # Backward: compare gradients computed with a sharded cotangent.
            g_o = distribute_tensor(torch.ones(dim_size, dim_size), mesh, [Shard(0)])
            expected_g = torch.autograd.grad(
                expected,
                (in_dtensor, w_dtensor),
                g_o,
            )
            actual_g = torch.autograd.grad(actual, (in_dtensor, w_dtensor), g_o)

            torch.testing.assert_close(actual_g, expected_g)
            if compile_fn is thunderfx:
                # NOTE(review): the thunderfx path currently splits the graph for
                # DTensor inputs; this asserts that the known limitation persists.
                assert len(tmodel._backend.subgraph_infos[0].split_reasons) > 0, "TODO: Fix with thunderfx path"

        w_dtensor = distribute_tensor(torch.randn(dim_size, dim_size, requires_grad=True), mesh, [Shard(0)])
        in_dtensor = distribute_tensor(torch.randn(dim_size, dim_size, requires_grad=True), mesh, [Shard(0)])

        # Verify torch API works
        _helper(lambda x, w: torch.mul(x, w), in_dtensor, w_dtensor, compile_fn=thunder.jit)
        _helper(lambda x, w: torch.mul(x, w), in_dtensor, w_dtensor, compile_fn=thunderfx)

        # Verify calling method works
        _helper(lambda x, w: torch.Tensor.mul(x, w), in_dtensor, w_dtensor, compile_fn=thunder.jit)
        _helper(lambda x, w: torch.Tensor.mul(x, w), in_dtensor, w_dtensor, compile_fn=thunderfx)

        # Verify calling special method works
        _helper(lambda x, w: x * w, in_dtensor, w_dtensor, compile_fn=thunder.jit)
        _helper(lambda x, w: x * w, in_dtensor, w_dtensor, compile_fn=thunderfx)

    def test_dtensor_unsupported(self):
        # Ops without DTensor support in thunder are expected to fail at
        # trace time (both the torch API and the operator form).
        num_devices = self.world_size
        mesh = DeviceMesh("cuda", list(range(num_devices)))

        dim_size = 16

        w_dtensor = distribute_tensor(torch.randn(dim_size, dim_size, requires_grad=True), mesh, [Shard(0)])

        in_dtensor = distribute_tensor(torch.randn(dim_size, dim_size, requires_grad=True), mesh, [Shard(0)])

        def fn(x, w):
            return torch.div(x, w)

        tmodel = thunder.jit(fn)
        with pytest.raises(AssertionError):
            tmodel(in_dtensor, w_dtensor)

        def fn(x, w):
            return x / w

        tmodel = thunder.jit(fn)
        with pytest.raises(AssertionError):
            tmodel(in_dtensor, w_dtensor)

    def test_dtensor_unsupported_mixed_input(self):
        # Mixing a DTensor with a plain torch.Tensor input should also be rejected.
        num_devices = self.world_size
        mesh = DeviceMesh("cuda", list(range(num_devices)))

        dim_size = 16

        def fn(x, w):
            return torch.div(x, w)

        w = torch.randn(dim_size, dim_size, requires_grad=True)

        in_dtensor = distribute_tensor(torch.randn(dim_size, dim_size, requires_grad=True), mesh, [Shard(0)])

        tmodel = thunder.jit(fn)
        with pytest.raises(AssertionError):
            tmodel(in_dtensor, w)

    def test_dtensor_incorrect_cotangent(self):
        # A cotangent whose placement differs from the one seen at tracing
        # time (Shard(1) here vs. Shard(0) at trace) must raise at runtime.
        num_devices = self.world_size
        mesh = DeviceMesh("cuda", list(range(num_devices)))

        dim_size = 16

        w_dtensor = distribute_tensor(torch.randn(dim_size, dim_size, requires_grad=True), mesh, [Shard(0)])
        in_dtensor = distribute_tensor(torch.randn(dim_size, dim_size, requires_grad=True), mesh, [Shard(0)])

        def fn(x, w):
            return torch.mul(x, w)

        tmodel = thunder.jit(fn)
        actual = tmodel(in_dtensor, w_dtensor)
        g_o = distribute_tensor(torch.ones(dim_size, dim_size), mesh, [Shard(1)])

        with pytest.raises(RuntimeError, match="has changed for cotangent between tracing and runtime"):
            torch.autograd.grad(actual, (in_dtensor, w_dtensor), g_o)
1 change: 1 addition & 0 deletions thunder/tests/test_transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -413,6 +413,7 @@ def forward(self, x):
assert flat_arg_names == arg_names


@pytest.mark.xfail(strict=True)
def test_constant_folding():
# Helper to verify we see the expected constant tensors
# in exec_trace.
Expand Down
26 changes: 24 additions & 2 deletions thunder/torch/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ def __init__(
is_prim: bool = False,
tags: None | list[Any] = None,
out_of_place: Symbol | None = None,
allow_tensor_subclass_proxy: bool = False,
):
self.torchfns = torchfns
self.is_method = is_method or (method_name is not None)
Expand All @@ -151,9 +152,30 @@ def __init__(
self.tags = tags
self.out_of_place = out_of_place

# This flag is used to enable/disable a torchsymbol to accept
# TensorProxy subclass as input (eg. DTensorProxy).
# By default, this is `False` as we don't want general `torchsymbol`
# which are meant for TensorProxy to accept DTensorProxy.
self.allow_tensor_subclass_proxy = allow_tensor_subclass_proxy

def __call__(self, fn: Callable) -> Symbol:
_fn = langctx(Languages.TORCH)(fn)

if not self.allow_tensor_subclass_proxy:

@wraps(_fn)
def wrapper(*args, **kwargs):
filter_tensor_proxies = list(
filter(lambda t: isinstance(t, TensorProxy), tree_flatten((args, kwargs))[0])
)
assert all(map(lambda t: type(t) is TensorProxy, filter_tensor_proxies)), (
f"Expected all inputs to be TensorProxy but found {list(map(lambda t: type(t), filter_tensor_proxies))}"
)
return _fn(*args, **kwargs)

else:
wrapper = _fn

id: str
if self.id is None:
name = fn.__name__
Expand All @@ -178,10 +200,10 @@ def __call__(self, fn: Callable) -> Symbol:

if self.is_prim:
sym = Symbol(
name=fn.__name__, meta=langctx(Languages.PRIMS)(_fn), id=id, is_prim=self.is_prim, tags=self.tags
name=fn.__name__, meta=langctx(Languages.PRIMS)(wrapper), id=id, is_prim=self.is_prim, tags=self.tags
)
else:
sym = Symbol(name=fn.__name__, meta=_fn, id=id, is_prim=self.is_prim, tags=self.tags)
sym = Symbol(name=fn.__name__, meta=wrapper, id=id, is_prim=self.is_prim, tags=self.tags)

if self.is_method:
method_name: str = self.method_name if self.method_name is not None else fn.__name__
Expand Down
35 changes: 35 additions & 0 deletions thunder/torch/experimental/dtensor_codeutils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from typing import Any
from torch.distributed.tensor._dtensor_spec import DTensorSpec, DeviceMesh, TensorMeta
from torch.distributed.tensor import DeviceMesh, Partial, Placement, Replicate, Shard


def populate_object_ctx_for_dtensor_spec(x: Any, object_ctx: dict[str, Any]) -> bool:
"""
Populate object context for DTensorSpec.

..note::
This function will mutate the `object_ctx`

Returns:
bool: True if `x` is DTensorSpec (and also updates `object_ctx`) otherwise False.
"""
if isinstance(x, DTensorSpec):
object_ctx.update(
{
"DTensorSpec": DTensorSpec,
"DeviceMesh": DeviceMesh,
"Placement": Placement,
"Replicate": Replicate,
"Shard": Shard,
"Partial": Partial,
"TensorMeta": TensorMeta,
}
)
return True
return False


def prettyprint_dtensor_spec(x):
if isinstance(x, DTensorSpec):
return x.__repr__()
return ""
Loading
Loading