Skip to content

Commit 1db2672

Browse files
authored
[single_controller] feat: Support dispatch/collect nested tensors with 3 or more dimensions (verl-project#4940)
### What does this PR do? There are 2 errors that prevent dispatching and collecting nested tensors with 3 or more dimensions. #### Dispatch When chunking a `TensorDict` that contains more than 1 nested tensor with 3 or more dimensions, reusing the variable name `td` for both the function argument and the inner-loop variable results in a `KeyError`: https://github.com/volcengine/verl/blob/e204cd80bd0886c75606d4b82ba88eed2658d1c7/verl/utils/tensordict_utils.py#L276-L312 #### Collection When collecting returned `TensorDict`s that contain a nested tensor with 3 or more dimensions, an assertion requires that each nested tensor has exactly 2 dimensions, even though the function works for tensors with an arbitrary number of dimensions: https://github.com/volcengine/verl/blob/e204cd80bd0886c75606d4b82ba88eed2658d1c7/verl/utils/tensordict_utils.py#L159-L192 ### Tests The added tests demonstrate both errors when run on `main`: ```python FAILED tests/test_protocol_v2_on_cpu.py::test_concat_nested_tensor - AssertionError: nested tensor must have 2 dimensions. Got torch.Size([2, 4, j32]) FAILED tests/test_protocol_v2_on_cpu.py::test_chunk_tensordict - KeyError: 'key "position_ids" not found in TensorDict with keys [\'attention_mask\', \'input_ids\', \'multi_modal_inputs\']'
1 parent b7e5074 commit 1db2672

File tree

2 files changed

+57
-8
lines changed

2 files changed

+57
-8
lines changed

tests/test_protocol_v2_on_cpu.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -674,6 +674,7 @@ def test_dataproto_chunk_after_index():
674674

675675

676676
def test_concat_nested_tensor():
677+
# Test 2D nested tensors
677678
vocab_size = 128
678679
a = torch.randint(low=0, high=vocab_size, size=(11,))
679680
b = torch.randint(low=0, high=vocab_size, size=(13,))
@@ -690,6 +691,42 @@ def test_concat_nested_tensor():
690691

691692
assert torch.all(torch.eq(output_values, expected)).item()
692693

694+
# Test 3D nested tensors
695+
a_3d = torch.randint(low=0, high=vocab_size, size=(4, 4))
696+
b_3d = torch.randint(low=0, high=vocab_size, size=(4, 5))
697+
c_3d = torch.randint(low=0, high=vocab_size, size=(4, 6))
698+
d_3d = torch.randint(low=0, high=vocab_size, size=(4, 7))
699+
700+
nested_a_b_3d = torch.nested.as_nested_tensor([a_3d, b_3d], layout=torch.jagged)
701+
nested_c_d_3d = torch.nested.as_nested_tensor([c_3d, d_3d], layout=torch.jagged)
702+
703+
output_3d = tu.concat_nested_tensors([nested_a_b_3d, nested_c_d_3d])
704+
705+
assert output_3d.shape[0] == 4
706+
output_3d_unbind = output_3d.unbind(0)
707+
assert torch.all(torch.eq(output_3d_unbind[0], a_3d)).item()
708+
assert torch.all(torch.eq(output_3d_unbind[1], b_3d)).item()
709+
assert torch.all(torch.eq(output_3d_unbind[2], c_3d)).item()
710+
assert torch.all(torch.eq(output_3d_unbind[3], d_3d)).item()
711+
712+
# Test 4D nested tensors
713+
a_4d = torch.randint(low=0, high=vocab_size, size=(2, 3, 4))
714+
b_4d = torch.randint(low=0, high=vocab_size, size=(2, 3, 5))
715+
c_4d = torch.randint(low=0, high=vocab_size, size=(2, 3, 3))
716+
d_4d = torch.randint(low=0, high=vocab_size, size=(2, 3, 6))
717+
718+
nested_a_b_4d = torch.nested.as_nested_tensor([a_4d, b_4d], layout=torch.jagged)
719+
nested_c_d_4d = torch.nested.as_nested_tensor([c_4d, d_4d], layout=torch.jagged)
720+
721+
output_4d = tu.concat_nested_tensors([nested_a_b_4d, nested_c_d_4d])
722+
723+
assert output_4d.shape[0] == 4
724+
output_4d_unbind = output_4d.unbind(0)
725+
assert torch.all(torch.eq(output_4d_unbind[0], a_4d)).item()
726+
assert torch.all(torch.eq(output_4d_unbind[1], b_4d)).item()
727+
assert torch.all(torch.eq(output_4d_unbind[2], c_4d)).item()
728+
assert torch.all(torch.eq(output_4d_unbind[3], d_4d)).item()
729+
693730

694731
def test_concat_tensordict():
695732
vocab_size = 128
@@ -755,6 +792,15 @@ def test_chunk_tensordict():
755792
input_ids = torch.nested.as_nested_tensor(
756793
[torch.arange(4), torch.arange(5), torch.arange(6), torch.arange(7)], layout=torch.jagged
757794
)
795+
attention_mask = torch.nested.as_nested_tensor(
796+
[
797+
torch.randint(low=0, high=2, size=[3, 4]),
798+
torch.randint(low=0, high=2, size=[3, 5]),
799+
torch.randint(low=0, high=2, size=[3, 6]),
800+
torch.randint(low=0, high=2, size=[3, 7]),
801+
],
802+
layout=torch.jagged,
803+
)
758804

759805
multi_modal_inputs = torch.stack(
760806
[
@@ -768,6 +814,7 @@ def test_chunk_tensordict():
768814
{
769815
"input_ids": input_ids,
770816
"position_ids": position_ids,
817+
"attention_mask": attention_mask,
771818
"multi_modal_inputs": multi_modal_inputs,
772819
},
773820
)

verl/utils/tensordict_utils.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -157,22 +157,22 @@ def get_non_tensor_data(data: TensorDict, key: str, default):
157157

158158

159159
def concat_nested_tensors(tensors: list[torch.Tensor]) -> torch.Tensor:
160-
"""Concatenate multiple 2D nested tensors along the batch dimension.
160+
"""Concatenate multiple nested tensors along the batch dimension.
161161
162162
Takes a list of nested tensors with jagged layout and concatenates them
163-
into a single nested tensor. Each input tensor must be 2D and contiguous.
163+
into a single nested tensor. Each input tensor must have 2 or more dimensions and be contiguous.
164164
165165
Args:
166-
tensors: List of 2D nested tensors to concatenate. All tensors must
167-
be nested, contiguous, and have exactly 2 dimensions.
166+
tensors: List of nested tensors to concatenate. All tensors must
167+
be nested, contiguous, and have 2 or more dimensions.
168168
169169
Returns:
170170
A new nested tensor with jagged layout containing all rows from
171171
the input tensors concatenated along dimension 0.
172172
173173
Raises:
174174
AssertionError: If any tensor is not nested, not contiguous, or
175-
doesn't have exactly 2 dimensions.
175+
doesn't have 2 or more dimensions.
176176
177177
Example:
178178
>>> t1 = torch.nested.as_nested_tensor([torch.randn(3), torch.randn(5)], layout=torch.jagged)
@@ -184,7 +184,7 @@ def concat_nested_tensors(tensors: list[torch.Tensor]) -> torch.Tensor:
184184
assert tensor.is_nested and tensor.is_contiguous()
185185
unbind_tensors = []
186186
for tensor in tensors:
187-
assert len(tensor.shape) == 2, f"nested tensor must have 2 dimensions. Got {tensor.shape}"
187+
assert len(tensor.shape) >= 2, f"nested tensor must have 2 or more dimensions. Got {tensor.shape}"
188188
unbind_tensor = tensor.unbind(0)
189189
unbind_tensors.extend(list(unbind_tensor))
190190

@@ -306,8 +306,10 @@ def chunk_tensordict(td: TensorDict, chunks: int) -> list[TensorDict]:
306306
tds = new_td.chunk(chunks=chunks)
307307
for key in keys:
308308
tensors = td[key].unbind(dim=0)
309-
for i, td in enumerate(tds):
310-
td[key] = torch.nested.as_nested_tensor(tensors[i * chunk_size : (i + 1) * chunk_size], layout=torch.jagged)
309+
for i, chunk_td in enumerate(tds):
310+
chunk_td[key] = torch.nested.as_nested_tensor(
311+
tensors[i * chunk_size : (i + 1) * chunk_size], layout=torch.jagged
312+
)
311313

312314
return tds
313315

0 commit comments

Comments
 (0)