Add debugperf_large model configuration and profiling support
3outeille committed Dec 8, 2025
commit d78b0e69addb8abfbd155207eeb871bd0349f3f5
10 changes: 10 additions & 0 deletions torchtitan/experiments/transformers_modeling_backend/__init__.py
@@ -33,6 +33,16 @@
rope_theta=500000,
),
),
"debugperf_large": HFTransformerModelArgs(
titan_dense_args=TitanDenseModelArgs(
dim=1024,
n_layers=12,
n_heads=16,
n_kv_heads=16,
vocab_size=32000,
rope_theta=500000,
),
),
"debugmodel": HFTransformerModelArgs(
titan_dense_args=TitanDenseModelArgs(
dim=256,
@@ -20,38 +20,38 @@ model_names=(
for model_name in "${tt_model_names[@]}"; do
rm -rf debug_local_results/${model_name}

python ./tooling_dev/test_hf_integration.py create_configs --model_name "$model_name" --out_dir debug_local_results --flavor debugperf --model_type torchtitan
python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf/seed_checkpoint --qos high
while [ ! -f debug_local_results/${model_name}/debugperf/seed_checkpoint/status.txt ] || [ "$(cat debug_local_results/${model_name}/debugperf/seed_checkpoint/status.txt)" != "completed" ]; do
python ./tooling_dev/test_hf_integration.py create_configs --model_name "$model_name" --out_dir debug_local_results --flavor debugperf_large --model_type torchtitan --enable_profiling --profile_freq 5
python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf_large/seed_checkpoint --qos high
while [ ! -f debug_local_results/${model_name}/debugperf_large/seed_checkpoint/status.txt ] || [ "$(cat debug_local_results/${model_name}/debugperf_large/seed_checkpoint/status.txt)" != "completed" ]; do
echo "Waiting for seed checkpoint from ${model_name} to complete ..."
sleep 1
done
python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf --qos high
python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf_large --qos high
echo "================"
done

for model_name in "${model_names[@]}"; do
rm -rf debug_local_results/${model_name}

python ./tooling_dev/test_hf_integration.py create_configs --model_name "$model_name" --out_dir debug_local_results --flavor debugperf --model_type transformers_modeling_backend --hf_assets_path "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer"
python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf/seed_checkpoint --qos high
while [ ! -f debug_local_results/${model_name}/debugperf/seed_checkpoint/status.txt ] || [ "$(cat debug_local_results/${model_name}/debugperf/seed_checkpoint/status.txt)" != "completed" ]; do
python ./tooling_dev/test_hf_integration.py create_configs --model_name "$model_name" --out_dir debug_local_results --flavor debugperf_large --model_type transformers_modeling_backend --hf_assets_path "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer" --enable_profiling --profile_freq 5
python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf_large/seed_checkpoint --qos high
while [ ! -f debug_local_results/${model_name}/debugperf_large/seed_checkpoint/status.txt ] || [ "$(cat debug_local_results/${model_name}/debugperf_large/seed_checkpoint/status.txt)" != "completed" ]; do
echo "Waiting for seed checkpoint from ${model_name} to complete ..."
sleep 1
done
python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf --qos high
python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf_large --qos high
echo "================"
done

# for model_name in "${moe_model_names[@]}"; do
# rm -rf debug_local_results/${model_name}

# USE_MOE=1 python ./tooling_dev/test_hf_integration.py create_configs --model_name "$model_name" --out_dir debug_local_results --flavor debugperf
# USE_MOE=1 python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf/seed_checkpoint --qos high
# while [ ! -f debug_local_results/${model_name}/debugperf/seed_checkpoint/status.txt ] || [ "$(cat debug_local_results/${model_name}/debugperf/seed_checkpoint/status.txt)" != "completed" ]; do
# USE_MOE=1 python ./tooling_dev/test_hf_integration.py create_configs --model_name "$model_name" --out_dir debug_local_results --flavor debugperf_large
# USE_MOE=1 python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf_large/seed_checkpoint --qos high
# while [ ! -f debug_local_results/${model_name}/debugperf_large/seed_checkpoint/status.txt ] || [ "$(cat debug_local_results/${model_name}/debugperf_large/seed_checkpoint/status.txt)" != "completed" ]; do
# echo "Waiting for seed checkpoint from ${model_name} to complete ..."
# sleep 1
# done
# USE_MOE=1 python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf --qos high
# USE_MOE=1 python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf_large --qos high
# echo "================"
# done
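For callers that drive test_hf_integration.py from Python rather than bash, the seed-checkpoint wait used throughout the script above (poll seed_checkpoint/status.txt until it reads "completed") can be written as a small helper. This is an illustrative sketch, not part of the commit; only the status-file path and the "completed" sentinel come from the loops above, the helper itself is hypothetical.

```python
# Illustrative helper, not part of this commit: block until the seed checkpoint
# for a run directory has finished, using the same status.txt/"completed"
# convention as the bash loops above.
import time
from pathlib import Path

def wait_for_seed_checkpoint(run_dir: str, poll_seconds: float = 1.0) -> None:
    status_file = Path(run_dir) / "seed_checkpoint" / "status.txt"
    while not (status_file.exists() and status_file.read_text().strip() == "completed"):
        print(f"Waiting for seed checkpoint in {run_dir} to complete ...")
        time.sleep(poll_seconds)

# e.g. wait_for_seed_checkpoint("debug_local_results/llama3/debugperf_large")
```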
@@ -114,7 +114,7 @@ def _create_slurm_script(
print(f"Slurm script created at {script_path}")


def create_configs(model_name: str, out_dir: str, flavor: str, model_type: str = "transformers_modeling_backend", hf_assets_path: str = None):
def create_configs(model_name: str, out_dir: str, flavor: str, model_type: str = "transformers_modeling_backend", hf_assets_path: str = None, enable_profiling: bool = False, profile_freq: int = 5):
"""
results/
|_ meta-llama
@@ -183,6 +183,12 @@ def create_configs(model_name: str, out_dir: str, flavor: str, model_type: str =
# Set absolute path to dataset to avoid path resolution issues
config["training"]["dataset_path"] = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/c4_test"

# Configure profiling
if enable_profiling:
config["profiling"]["enable_profiling"] = True
config["profiling"]["profile_freq"] = profile_freq
config["profiling"]["save_traces_folder"] = "profile_trace"

# parallelism_configs = [
# BASELINE, # baseline
# "fsdp2_tp2_cp1_pp1",
@@ -1193,6 +1199,10 @@ def fmt_min_avg_max(min_v, avg_v, max_v, fmt="{:.2f}"):
help="Model type: 'transformers_modeling_backend' for HF models, 'torchtitan' for torchtitan native")
create_configs_parser.add_argument("--hf_assets_path", type=str, default=None,
help="Override hf_assets_path (tokenizer path). If not provided, uses default based on model_type.")
create_configs_parser.add_argument("--enable_profiling", action="store_true",
help="Enable PyTorch profiler for trace collection")
create_configs_parser.add_argument("--profile_freq", type=int, default=5,
help="Profiling frequency (steps between profiles)")

submit_jobs_parser = subparsers.add_parser("submit_jobs")
submit_jobs_parser.add_argument("--inp_dir", type=str, required=True)
@@ -1219,7 +1229,7 @@ def fmt_min_avg_max(min_v, avg_v, max_v, fmt="{:.2f}"):
args = parser.parse_args()

if args.action == "create_configs":
create_configs(args.model_name, args.out_dir, args.flavor, args.model_type, args.hf_assets_path)
create_configs(args.model_name, args.out_dir, args.flavor, args.model_type, args.hf_assets_path, args.enable_profiling, args.profile_freq)
elif args.action == "submit_jobs":
submit_jobs(args.inp_dir, args.qos, args.only)
elif args.action == "report":
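The profiling fields set above (enable_profiling, profile_freq, save_traces_folder) feed torchtitan's built-in profiler. As a rough illustration of what profile_freq controls — an assumed schedule, not torchtitan's actual implementation — a trace can be collected once every profile_freq training steps with torch.profiler and written under save_traces_folder:

```python
# Illustrative sketch only (assumed schedule, not torchtitan's profiler code):
# capture one warmup + one active step every `profile_freq` iterations and
# dump the trace under `save_traces_folder`.
import os
import torch

def make_profiler(save_traces_folder: str = "profile_trace", profile_freq: int = 5):
    os.makedirs(save_traces_folder, exist_ok=True)
    warmup, active = 1, 1
    wait = max(profile_freq - warmup - active, 0)
    return torch.profiler.profile(
        activities=[
            torch.profiler.ProfilerActivity.CPU,
            torch.profiler.ProfilerActivity.CUDA,
        ],
        schedule=torch.profiler.schedule(wait=wait, warmup=warmup, active=active),
        on_trace_ready=torch.profiler.tensorboard_trace_handler(save_traces_folder),
    )

# Usage: call prof.step() once per training iteration.
# with make_profiler(profile_freq=5) as prof:
#     for _ in range(num_steps):
#         train_step()
#         prof.step()
```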
8 changes: 8 additions & 0 deletions torchtitan/models/llama3/__init__.py
@@ -30,6 +30,14 @@
"debugperf": TransformerModelArgs(
dim=256, n_layers=6, n_heads=16, n_kv_heads=16, vocab_size=2048, rope_theta=500000
),
"debugperf_large": TransformerModelArgs(
dim=1024,
n_layers=12,
n_heads=16,
n_kv_heads=16,
vocab_size=32000,
rope_theta=500000,
),
"debugmodel": TransformerModelArgs(
dim=256, n_layers=6, n_heads=16, vocab_size=2048, rope_theta=500000
),
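For scale, a back-of-the-envelope parameter count for the two debug flavors above (the same debugperf_large shapes are registered for the transformers_modeling_backend earlier in this commit). The sketch assumes Llama-style FFN sizing with multiple_of=256, no ffn_dim_multiplier, and untied input/output embeddings, so treat the totals as rough estimates:

```python
# Rough parameter estimate for the debug flavors above (assumes Llama-style
# FFN sizing: hidden = round_up(int(2/3 * 4 * dim), multiple_of); untied
# embeddings; norm weights ignored as negligible).
def approx_params(dim, n_layers, vocab_size, multiple_of=256):
    hidden = int(2 * (4 * dim) / 3)
    hidden = multiple_of * ((hidden + multiple_of - 1) // multiple_of)
    attn = 4 * dim * dim    # wq, wk, wv, wo (n_kv_heads == n_heads here)
    ffn = 3 * dim * hidden  # w1, w2, w3
    return n_layers * (attn + ffn) + 2 * vocab_size * dim

print(approx_params(dim=256, n_layers=6, vocab_size=2048))     # debugperf:       ~6.2M
print(approx_params(dim=1024, n_layers=12, vocab_size=32000))  # debugperf_large: ~220M
```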