add debugperf model for fair comparison
3outeille committed Dec 8, 2025
commit 66998c2e7f0e77a89b835009f9e8eb11e1bb4da9
10 changes: 10 additions & 0 deletions torchtitan/experiments/transformers_modeling_backend/__init__.py
@@ -23,6 +23,16 @@


flavors = {
"debugperf": HFTransformerModelArgs(
titan_dense_args=TitanDenseModelArgs(
dim=256,
n_layers=6,
n_heads=16,
n_kv_heads=16,
vocab_size=2048,
rope_theta=500000,
),
),
"debugmodel": HFTransformerModelArgs(
titan_dense_args=TitanDenseModelArgs(
dim=256,
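For context, the new flavor is selected through the [model] section of the job configs that create_configs (further down in this diff) writes out. A minimal sketch, assuming torchtitan's TOML job-config format and the default tokenizer path used for the torchtitan backend; this excerpt is illustrative, not copied from a generated file:

# Hypothetical excerpt of a generated torchtitan-backend job config
[model]
name = "llama3"
flavor = "debugperf"
hf_assets_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer"

The same dimensions (dim=256, n_layers=6, n_heads=16, n_kv_heads=16, vocab_size=2048, rope_theta=500000) are registered under "debugperf" for both the transformers modeling backend above and the native llama3 model at the end of this diff, which is what makes the comparison fair.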
@@ -1,64 +1,57 @@
#!/usr/bin/bash

# create a list of model_name
# Shared model configuration for fair comparison
VOCAB_SIZE=2048
N_LAYERS=6
N_HEADS=16
N_KV_HEADS=16
DIM=256
ROPE_THETA=500000

tt_model_names=(
"llama3"
)

# model_names=(
# # "meta-llama/Llama-3.2-1B" # ✅
# # "microsoft/phi-2" # ✅
# # "Qwen/Qwen2.5-7B" # ✅
# # "mistralai/Mistral-7B-v0.1" # ✅
# # "google/gemma-3-270m" # ❌ new layers to handle
# # "ByteDance-Seed/Seed-Coder-8B-Instruct" # ✅
# # "Qwen/Qwen3-4B-Instruct-2507" # ✅
# )

# moe_model_names=(
# # "deepseek_v3"
# # "deepseek-ai/DeepSeek-V3"
# # "moonshotai/Moonlight-16B-A3B"
# # "openai/gpt-oss-20b"
# # "moonshotai/Kimi-K2-Instruct"
# )
model_names=(
"meta-llama/Llama-3.2-1B" # ✅
)

# TorchTitan models - pass same model args
for model_name in "${tt_model_names[@]}"; do
rm -rf debug_local_results/${model_name}

python ./tooling_dev/test_hf_integration.py create_configs --model_name "$model_name" --out_dir debug_local_results --flavor debugmodel --model_type torchtitan
python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugmodel/seed_checkpoint --qos high
while [ ! -f debug_local_results/${model_name}/debugmodel/seed_checkpoint/status.txt ] || [ "$(cat debug_local_results/${model_name}/debugmodel/seed_checkpoint/status.txt)" != "completed" ]; do
python ./tooling_dev/test_hf_integration.py create_configs --model_name "$model_name" --out_dir debug_local_results --flavor debugperf --model_type torchtitan
python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf/seed_checkpoint --qos high
while [ ! -f debug_local_results/${model_name}/debugperf/seed_checkpoint/status.txt ] || [ "$(cat debug_local_results/${model_name}/debugperf/seed_checkpoint/status.txt)" != "completed" ]; do
echo "Waiting for seed checkpoint from ${model_name} to complete ..."
sleep 1
done
python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugmodel --qos high
python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf --qos high
echo "================"
done

# for model_name in "${model_names[@]}"; do
# rm -rf debug_local_results/${model_name}
for model_name in "${model_names[@]}"; do
rm -rf debug_local_results/${model_name}

# python ./tooling_dev/test_hf_integration.py create_configs --model_name "$model_name" --out_dir debug_local_results --flavor debugmodel --model_type transformers_modeling_backend
# python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugmodel/seed_checkpoint --qos high
# while [ ! -f debug_local_results/${model_name}/debugmodel/seed_checkpoint/status.txt ] || [ "$(cat debug_local_results/${model_name}/debugmodel/seed_checkpoint/status.txt)" != "completed" ]; do
# echo "Waiting for seed checkpoint from ${model_name} to complete ..."
# sleep 1
# done
# python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugmodel --qos high
# echo "================"
# done
python ./tooling_dev/test_hf_integration.py create_configs --model_name "$model_name" --out_dir debug_local_results --flavor debugperf --model_type transformers_modeling_backend --hf_assets_path "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer"
python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf/seed_checkpoint --qos high
while [ ! -f debug_local_results/${model_name}/debugperf/seed_checkpoint/status.txt ] || [ "$(cat debug_local_results/${model_name}/debugperf/seed_checkpoint/status.txt)" != "completed" ]; do
echo "Waiting for seed checkpoint from ${model_name} to complete ..."
sleep 1
done
python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf --qos high
echo "================"
done

# for model_name in "${moe_model_names[@]}"; do
# rm -rf debug_local_results/${model_name}

# USE_MOE=1 python ./tooling_dev/test_hf_integration.py create_configs --model_name "$model_name" --out_dir debug_local_results --flavor debugmodel
# USE_MOE=1 python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugmodel/seed_checkpoint --qos high
# while [ ! -f debug_local_results/${model_name}/debugmodel/seed_checkpoint/status.txt ] || [ "$(cat debug_local_results/${model_name}/debugmodel/seed_checkpoint/status.txt)" != "completed" ]; do
# USE_MOE=1 python ./tooling_dev/test_hf_integration.py create_configs --model_name "$model_name" --out_dir debug_local_results --flavor debugperf
# USE_MOE=1 python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf/seed_checkpoint --qos high
# while [ ! -f debug_local_results/${model_name}/debugperf/seed_checkpoint/status.txt ] || [ "$(cat debug_local_results/${model_name}/debugperf/seed_checkpoint/status.txt)" != "completed" ]; do
# echo "Waiting for seed checkpoint from ${model_name} to complete ..."
# sleep 1
# done
# USE_MOE=1 python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugmodel --qos high
# USE_MOE=1 python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf --qos high
# echo "================"
# done
@@ -7,7 +7,8 @@
#SBATCH --ntasks-per-node=1
#SBATCH --qos={{ qos }}
#SBATCH --cpus-per-task=12
#SBATCH --partition=hopper-prod
#SBATCH --partition=hopper-extra
#SBATCH --time=00:30:00

# Misc initializations.
echo "========================"
@@ -19,8 +19,8 @@
from rich.table import Table
from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn, TimeElapsedColumn

BASELINE = "fsdp2_tp1_cp1_pp1"
# BASELINE = "fsdp1_tp1_cp1_pp1"
# BASELINE = "fsdp2_tp1_cp1_pp1"
BASELINE = "fsdp1_tp1_cp1_pp1"

console = Console()

@@ -114,7 +114,7 @@ def _create_slurm_script(
print(f"Slurm script created at {script_path}")


def create_configs(model_name: str, out_dir: str, flavor: str, model_type: str = "transformers_modeling_backend"):
def create_configs(model_name: str, out_dir: str, flavor: str, model_type: str = "transformers_modeling_backend", hf_assets_path: str = None):
"""
results/
|_ meta-llama
@@ -166,29 +166,33 @@ def create_configs(model_name: str, out_dir: str, flavor: str, model_type: str =
config["hf_transformers"]["model"] = model_name
config["model"]["flavor"] = flavor

# Extract just the model name from repo_id (e.g., "Llama-3.2-1B" from "meta-llama/Llama-3.2-1B")
model_name_only = model_name.split("/")[-1] if "/" in model_name else model_name
config["model"]["hf_assets_path"] = f"./{out_dir}/{model_name}/assets/hf/{model_name_only}"
# Use provided hf_assets_path or default
if hf_assets_path:
config["model"]["hf_assets_path"] = hf_assets_path
else:
# Extract just the model name from repo_id (e.g., "Llama-3.2-1B" from "meta-llama/Llama-3.2-1B")
model_name_only = model_name.split("/")[-1] if "/" in model_name else model_name
config["model"]["hf_assets_path"] = f"./{out_dir}/{model_name}/assets/hf/{model_name_only}"
elif model_type == "torchtitan":
config["model"]["name"] = model_name
config["model"]["flavor"] = flavor
config["model"]["hf_assets_path"] = f"/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer"
config["model"]["hf_assets_path"] = hf_assets_path or "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer"
else:
raise ValueError(f"Unknown model_type: {model_type}. Must be 'transformers_modeling_backend' or 'torchtitan'")

# Set absolute path to dataset to avoid path resolution issues
config["training"]["dataset_path"] = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/c4_test"

parallelism_configs = [
BASELINE, # baseline
"fsdp2_tp2_cp1_pp1",
"fsdp2_tp1_cp1_pp2",
"fsdp2_tp1_cp2_pp1",
"fsdp2_tp1_cp2_pp2",
"fsdp2_tp2_cp2_pp1",
"fsdp2_tp2_cp1_pp2",
"fsdp2_tp2_cp2_pp2",
]
# parallelism_configs = [
# BASELINE, # baseline
# "fsdp2_tp2_cp1_pp1",
# "fsdp2_tp1_cp1_pp2",
# "fsdp2_tp1_cp2_pp1",
# "fsdp2_tp1_cp2_pp2",
# "fsdp2_tp2_cp2_pp1",
# "fsdp2_tp2_cp1_pp2",
# "fsdp2_tp2_cp2_pp2",
# ]

# parallelism_configs = [
# BASELINE, # baseline
@@ -201,11 +205,11 @@ def create_configs(model_name: str, out_dir: str, flavor: str, model_type: str =
# # "fsdp1_tp2_cp2_pp2",
# ]

# parallelism_configs = [
# BASELINE, # baseline
# # "fsdp2_tp1_cp1_pp2",
# # "fsdp1_tp1_cp1_pp2",
# ]
parallelism_configs = [
BASELINE, # baseline
# "fsdp2_tp1_cp1_pp2",
# "fsdp1_tp1_cp1_pp2",
]

out_path = Path(out_dir) / model_name / flavor
out_path.mkdir(parents=True, exist_ok=True)
@@ -1187,6 +1191,8 @@ def fmt_min_avg_max(min_v, avg_v, max_v, fmt="{:.2f}"):
create_configs_parser.add_argument("--model_type", type=str, default="transformers_modeling_backend",
choices=["transformers_modeling_backend", "torchtitan"],
help="Model type: 'transformers_modeling_backend' for HF models, 'torchtitan' for torchtitan native")
create_configs_parser.add_argument("--hf_assets_path", type=str, default=None,
help="Override hf_assets_path (tokenizer path). If not provided, uses default based on model_type.")

submit_jobs_parser = subparsers.add_parser("submit_jobs")
submit_jobs_parser.add_argument("--inp_dir", type=str, required=True)
@@ -1213,7 +1219,7 @@ def fmt_min_avg_max(min_v, avg_v, max_v, fmt="{:.2f}"):
args = parser.parse_args()

if args.action == "create_configs":
create_configs(args.model_name, args.out_dir, args.flavor, args.model_type)
create_configs(args.model_name, args.out_dir, args.flavor, args.model_type, args.hf_assets_path)
elif args.action == "submit_jobs":
submit_jobs(args.inp_dir, args.qos, args.only)
elif args.action == "report":
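The new --hf_assets_path flag lets the caller override the tokenizer path written into each generated config instead of relying on the per-backend default. A minimal sketch of the resulting [model] section for the transformers modeling backend, assuming TOML job configs; the model name and path shown are simply the ones used in the run script above:

# Hypothetical excerpt of a generated transformers_modeling_backend job config
[hf_transformers]
model = "meta-llama/Llama-3.2-1B"

[model]
flavor = "debugperf"
hf_assets_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer"

When the flag is omitted, the HF backend falls back to "./{out_dir}/{model_name}/assets/hf/{model_name_only}" and the torchtitan backend to the shared test tokenizer path, exactly as in create_configs above.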
3 changes: 3 additions & 0 deletions torchtitan/models/llama3/__init__.py
@@ -27,6 +27,9 @@


llama3_args = {
"debugperf": TransformerModelArgs(
dim=256, n_layers=6, n_heads=16, n_kv_heads=16, vocab_size=2048, rope_theta=500000
),
"debugmodel": TransformerModelArgs(
dim=256, n_layers=6, n_heads=16, vocab_size=2048, rope_theta=500000
),