[pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

Signed-off-by: Jaime Cardenas <[email protected]>
NVIDIA · jaimec00 · Oct 27, 2025 · Oct 27, 2025 · Oct 27, 2025 · Oct 28, 2025
commit f6f034b743f0a36f09945dd04026460db81e1591
diff --git a/tests/pytorch/selective_layernorm_mlp/compare.py b/tests/pytorch/selective_layernorm_mlp/compare.py
@@ -1,4 +1,3 @@
-
 import time
 
 import torch
@@ -7,6 +6,7 @@
 torch.manual_seed(1234)
 device = torch.device("cuda")
 
+
 class _Sequential(torch.nn.Sequential):
     """Sequential model that forwards keyword arguments to modules"""
 
@@ -16,10 +16,11 @@ def forward(self, input_: torch.Tensor, **kwargs) -> torch.Tensor:
             x = module(x, **kwargs)
         return x
 
+
 class ModelConfig:
     def __init__(
-        self, 
-        hidden_size: int = 128, 
+        self,
+        hidden_size: int = 128,
         ffn_hidden_size: int = 512,
         layers: int = 1,
     ):
@@ -48,14 +49,16 @@ def build(self):
 
         return ln_model, sln_model
 
+
 config = {
     # "small": ModelConfig(128, 512, 12),
     # "medium": ModelConfig(512, 2048, 12),
     # "large": ModelConfig(1024, 4096, 12),
     "huge": ModelConfig(2048, 8192, 12),
 }
 
-data_sizes = [2**7, 2**10, 2**14, 2**16]#2**18]
+data_sizes = [2**7, 2**10, 2**14, 2**16]  # 2**18]
+
 
 class Profiler:
     def __init__(self):
@@ -68,7 +71,7 @@ def __init__(self):
                 "bwd_stats": {
                     "mem": [],
                     "time": [],
-                }
+                },
             },
             "sln_stats": {
                 "fwd_stats": {
@@ -78,7 +81,7 @@ def __init__(self):
                 "bwd_stats": {
                     "mem": [],
                     "time": [],
-                }
+                },
             },
             "diff": {
                 "out": [],
@@ -88,8 +91,7 @@ def __init__(self):
                 "fc1_bias": [],
                 "fc2_weight": [],
                 "fc2_bias": [],
-            }
-
+            },
         }
 
     def compare(self, ln_model, sln_model, data):
@@ -161,11 +163,19 @@ def _run_bwd(model, out):
         self.stats["sln_stats"]["bwd_stats"]["time"].append(sln_bwd_time)
         self.stats["sln_stats"]["bwd_stats"]["mem"].append(sln_bwd_mem)
 
-        for key in ["layer_norm_weight", "layer_norm_bias", "fc1_weight", "fc1_bias", "fc2_weight", "fc2_bias"]:
+        for key in [
+            "layer_norm_weight",
+            "layer_norm_bias",
+            "fc1_weight",
+            "fc1_bias",
+            "fc2_weight",
+            "fc2_bias",
+        ]:
             self.stats["diff"][key].append(self._max_diff(ln_grads[key], sln_grads[key]))
 
     def summarize(self):
         """Print a concise summary of collected statistics."""
+
         def _summarize(values):
             if not values:
                 return {"avg": 0.0, "min": 0.0, "max": 0.0}
@@ -202,7 +212,14 @@ def _summarize(values):
         print(f"Forward output max diff avg: {summary:.3e}")
 
         print("Gradient max diff averages:")
-        for key in ["layer_norm_weight", "layer_norm_bias", "fc1_weight", "fc1_bias", "fc2_weight", "fc2_bias"]:
+        for key in [
+            "layer_norm_weight",
+            "layer_norm_bias",
+            "fc1_weight",
+            "fc1_bias",
+            "fc2_weight",
+            "fc2_bias",
+        ]:
             summary = sum(diff_stats[key]) / len(diff_stats[key])
             print(f"  {key}: {summary:.3e}")
         print()
@@ -229,6 +246,7 @@ def _collect_param_grads(self, model):
     def _param_key(self, name):
         return name.split(".")[-1]
 
+
 def main():
 
     for size in config:
@@ -243,8 +261,12 @@ def main():
 
             profiler.compare(ln_model, sln_model, dummy_data)
 
-            print(f"summarizing comparison for seq={seq_len}, hidden={config[size]._hidden_size}, ffn_fidden={config[size]._ffn_hidden_size}, layers={config[size]._layers}\n")
+            print(
+                f"summarizing comparison for seq={seq_len}, hidden={config[size]._hidden_size},"
+                f" ffn_fidden={config[size]._ffn_hidden_size}, layers={config[size]._layers}\n"
+            )
             profiler.summarize()
 
+
 if __name__ == "__main__":
     main()
diff --git a/tests/pytorch/selective_layernorm_mlp/distributed/run_numerics.py b/tests/pytorch/selective_layernorm_mlp/distributed/run_numerics.py
@@ -31,6 +31,7 @@
 from transformer_engine.pytorch.constants import NVFP4_BLOCK_SCALING_SIZE
 from transformer_engine.pytorch.distributed import gather_along_first_dim
 
+
 def _compare_tensors(name, test, ref, rtol, atol):
     # Make sure tensors aren't zero and we don't pass trivially
     if test.count_nonzero() == 0:
@@ -380,7 +381,6 @@ def _alloc_main_grad(model_single_node, model_distributed):
             param.main_grad = torch.zeros_like(param, dtype=torch.float32)
 
 
-
 ############################################
 #               LayerNormMLP               #
 ############################################
@@ -488,4 +488,3 @@ def test_selective_layernorm_mlp():
         for set_parallel_mode in [True]:
             for sequence_parallel in [False, True]:
                 _test_selective_layernorm_mlp(set_parallel_mode, sequence_parallel, **kwargs)
-
diff --git a/tests/pytorch/selective_layernorm_mlp/test_cuda_graphs.py b/tests/pytorch/selective_layernorm_mlp/test_cuda_graphs.py
@@ -397,4 +397,3 @@ def test_make_graphed_callables_with_fp8_weight_caching(
         fp8_recipe=fp8_recipe,
         fp8_weight_caching=True,
     )
-
diff --git a/tests/pytorch/selective_layernorm_mlp/test_deferred_init.py b/tests/pytorch/selective_layernorm_mlp/test_deferred_init.py
@@ -11,8 +11,7 @@
     te.SelectiveLayerNormMLP,
 ]
 
-_composed_modules = [
-]
+_composed_modules = []
 
 batch_size = 32
 seq_length = 2048

diff --git a/tests/pytorch/selective_layernorm_mlp/test_numerics.py b/tests/pytorch/selective_layernorm_mlp/test_numerics.py
@@ -202,7 +202,6 @@ def forward(self, x):
         return (w * x_normed).to(x.dtype)
 
 
-
 class TorchQuickGELU(nn.Module):
     def forward(self, input: torch.Tensor) -> torch.Tensor:
         return input * torch.sigmoid(1.702 * input)
@@ -272,7 +271,6 @@ def forward(self, x):
         return self.fc2(t)
 
 
-
 def nvfp4_rht_and_2d_quantization():
     nvfp4_recipe = recipe.NVFP4BlockScaling()
     nvfp4_recipe.fp4_quant_fwd_inp = recipe.QParams(
@@ -405,7 +403,9 @@ def reset_global_fp8_state():
 @pytest.mark.parametrize("normalization", all_normalizations)
 @pytest.mark.parametrize("return_bias", all_boolean)
 @pytest.mark.parametrize("bias", all_boolean)
-def test_selective_layernorm_mlp_accuracy(dtype, bs, model, activation, normalization, return_bias, bias):
+def test_selective_layernorm_mlp_accuracy(
+    dtype, bs, model, activation, normalization, return_bias, bias
+):
     config = model_configs[model]
 
     te_ln_mlp = TestReturnBiasModule(
@@ -529,4 +529,3 @@ def test_selective_layernorm_mlp_accuracy_delay_wgrad_compute(
     # Shoule be bit-wise match
     for i, (o, o_ref) in enumerate(zip(te_outputs, te_outputs_ref)):
         torch.testing.assert_close(o, o_ref, rtol=0, atol=0)
-
diff --git a/tests/pytorch/selective_layernorm_mlp/test_recipe.py b/tests/pytorch/selective_layernorm_mlp/test_recipe.py
@@ -48,12 +48,9 @@ def setup_class(cls) -> None:
         torch.manual_seed(seed)
         torch.cuda.manual_seed(seed)
 
-
     @pytest.mark.parametrize(
         "module_class",
-        [
-            SelectiveLayerNormMLP
-        ],
+        [SelectiveLayerNormMLP],
     )
     def test_quantizer_update(self, module_class):
         in_features = 32

diff --git a/tests/pytorch/selective_layernorm_mlp/test_sanity.py b/tests/pytorch/selective_layernorm_mlp/test_sanity.py
@@ -130,6 +130,7 @@ def reset_global_fp8_state():
     yield
     FP8GlobalStateManager.reset()
 
+
 def _test_sanity_common(
     block, dtype, config, fp8_recipe, skip_wgrad, skip_dgrad, microbatching=True
 ):
@@ -160,7 +161,6 @@ def _test_sanity_common(
     torch.cuda.synchronize()
 
 
-
 @pytest.mark.parametrize("dtype", param_types)
 @pytest.mark.parametrize("fp8_recipe", fp8_recipes)
 @pytest.mark.parametrize("model", ["small", "weird"])

diff --git a/transformer_engine/pytorch/module/selective_layernorm_mlp.py b/transformer_engine/pytorch/module/selective_layernorm_mlp.py
@@ -275,7 +275,7 @@ def _forward(
                 module,
                 skip_fp8_weight_update,
                 symmetric_ar_type,
-                debug,                
+                debug,
             ]
 
         # Make sure input dimensions are compatible
@@ -297,7 +297,9 @@ def _forward(
             ln_bias = cast_if_needed(ln_bias, activation_dtype)
 
         tp_world_size = get_distributed_world_size(tp_group)
-        backwards_needs_fc1_input = is_grad_enabled and fc1_weight.requires_grad and recompute_for_bwd
+        backwards_needs_fc1_input = (
+            is_grad_enabled and fc1_weight.requires_grad and recompute_for_bwd
+        )
         device = inp.device
 
         # Configure Userbuffers communication (comm+GEMM overlap)
@@ -410,7 +412,9 @@ def _forward(
             # If weights are not quantized, we call get_weight_workspace,
             # which handles weight caching etc.
             # FP8 cast to workspace buffer
-            update_workspace = (is_first_microbatch is None or is_first_microbatch) and not recompute_for_bwd
+            update_workspace = (
+                is_first_microbatch is None or is_first_microbatch
+            ) and not recompute_for_bwd
             fc1_weight_quantizer.set_usage(rowwise=True, columnwise=is_grad_enabled)
             fc2_weight_quantizer.set_usage(rowwise=True, columnwise=is_grad_enabled)
             fc1_weight_final = module.get_weight_workspace(
@@ -576,7 +580,6 @@ def _forward(
                 clear_tensor_data(act_out)
                 act_out = None
 
-
             if fuse_wgrad_accumulation:
                 # This check is needed to ensure that main_grad is not created
                 # during the forward pass when using MCore FSDP as it creates
@@ -850,7 +853,7 @@ def forward(
             debug,
             recompute_for_bwd=False,
         )
-    
+
     @staticmethod
     def _recompute(ctx):
         # pylint: disable=missing-function-docstring
Original file line number	Diff line number	Diff line change
Expand Up		@@ -397,4 +397,3 @@ def test_make_graphed_callables_with_fp8_weight_caching(
		fp8_recipe=fp8_recipe,
		fp8_weight_caching=True,
		)