add debugperf model for fair comparison
3outeille committed Dec 8, 2025
commit 66998c2e7f0e77a89b835009f9e8eb11e1bb4da9
10 changes: 10 additions & 0 deletions torchtitan/experiments/transformers_modeling_backend/__init__.py
@@ -23,6 +23,16 @@


flavors = {
"debugperf": HFTransformerModelArgs(
titan_dense_args=TitanDenseModelArgs(
dim=256,
n_layers=6,
n_heads=16,
n_kv_heads=16,
vocab_size=2048,
rope_theta=500000,
),
),
"debugmodel": HFTransformerModelArgs(
titan_dense_args=TitanDenseModelArgs(
dim=256,
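For context, the new flavor is selected through the [model] section of the job configs that create_configs (further down in this diff) writes out. A minimal sketch, assuming torchtitan's TOML job-config format and the default tokenizer path used for the torchtitan backend; this excerpt is illustrative, not copied from a generated file:

# Hypothetical excerpt of a generated torchtitan-backend job config
[model]
name = "llama3"
flavor = "debugperf"
hf_assets_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer"

The same dimensions (dim=256, n_layers=6, n_heads=16, n_kv_heads=16, vocab_size=2048, rope_theta=500000) are registered under "debugperf" for both the transformers modeling backend above and the native llama3 model at the end of this diff, which is what makes the comparison fair.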
@@ -1,64 +1,57 @@
#!/usr/bin/bash

# create a list of model_name
# Shared model configuration for fair comparison
VOCAB_SIZE=2048
N_LAYERS=6
N_HEADS=16
N_KV_HEADS=16
DIM=256
ROPE_THETA=500000

tt_model_names=(
"llama3"
)

# model_names=(
# # "meta-llama/Llama-3.2-1B" # ✅
# # "microsoft/phi-2" # ✅
# # "Qwen/Qwen2.5-7B" # ✅
# # "mistralai/Mistral-7B-v0.1" # ✅
# # "google/gemma-3-270m" # ❌ new layers to handle
# # "ByteDance-Seed/Seed-Coder-8B-Instruct" # ✅
# # "Qwen/Qwen3-4B-Instruct-2507" # ✅
# )

# moe_model_names=(
# # "deepseek_v3"
# # "deepseek-ai/DeepSeek-V3"
# # "moonshotai/Moonlight-16B-A3B"
# # "openai/gpt-oss-20b"
# # "moonshotai/Kimi-K2-Instruct"
# )
model_names=(
"meta-llama/Llama-3.2-1B" # ✅
)

# TorchTitan models - pass same model args
for model_name in "${tt_model_names[@]}"; do
rm -rf debug_local_results/${model_name}

python ./tooling_dev/test_hf_integration.py create_configs --model_name "$model_name" --out_dir debug_local_results --flavor debugmodel --model_type torchtitan
python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugmodel/seed_checkpoint --qos high
while [ ! -f debug_local_results/${model_name}/debugmodel/seed_checkpoint/status.txt ] || [ "$(cat debug_local_results/${model_name}/debugmodel/seed_checkpoint/status.txt)" != "completed" ]; do
python ./tooling_dev/test_hf_integration.py create_configs --model_name "$model_name" --out_dir debug_local_results --flavor debugperf --model_type torchtitan
python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf/seed_checkpoint --qos high
while [ ! -f debug_local_results/${model_name}/debugperf/seed_checkpoint/status.txt ] || [ "$(cat debug_local_results/${model_name}/debugperf/seed_checkpoint/status.txt)" != "completed" ]; do
echo "Waiting for seed checkpoint from ${model_name} to complete ..."
sleep 1
done
python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugmodel --qos high
python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf --qos high
echo "================"
done

# for model_name in "${model_names[@]}"; do
# rm -rf debug_local_results/${model_name}
for model_name in "${model_names[@]}"; do
rm -rf debug_local_results/${model_name}

# python ./tooling_dev/test_hf_integration.py create_configs --model_name "$model_name" --out_dir debug_local_results --flavor debugmodel --model_type transformers_modeling_backend
# python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugmodel/seed_checkpoint --qos high
# while [ ! -f debug_local_results/${model_name}/debugmodel/seed_checkpoint/status.txt ] || [ "$(cat debug_local_results/${model_name}/debugmodel/seed_checkpoint/status.txt)" != "completed" ]; do
# echo "Waiting for seed checkpoint from ${model_name} to complete ..."
# sleep 1
# done
# python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugmodel --qos high
# echo "================"
# done
python ./tooling_dev/test_hf_integration.py create_configs --model_name "$model_name" --out_dir debug_local_results --flavor debugperf --model_type transformers_modeling_backend --hf_assets_path "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer"
python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf/seed_checkpoint --qos high
while [ ! -f debug_local_results/${model_name}/debugperf/seed_checkpoint/status.txt ] || [ "$(cat debug_local_results/${model_name}/debugperf/seed_checkpoint/status.txt)" != "completed" ]; do
echo "Waiting for seed checkpoint from ${model_name} to complete ..."
sleep 1
done
python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf --qos high
echo "================"
done

# for model_name in "${moe_model_names[@]}"; do
# rm -rf debug_local_results/${model_name}

# USE_MOE=1 python ./tooling_dev/test_hf_integration.py create_configs --model_name "$model_name" --out_dir debug_local_results --flavor debugmodel
# USE_MOE=1 python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugmodel/seed_checkpoint --qos high
# while [ ! -f debug_local_results/${model_name}/debugmodel/seed_checkpoint/status.txt ] || [ "$(cat debug_local_results/${model_name}/debugmodel/seed_checkpoint/status.txt)" != "completed" ]; do
# USE_MOE=1 python ./tooling_dev/test_hf_integration.py create_configs --model_name "$model_name" --out_dir debug_local_results --flavor debugperf
# USE_MOE=1 python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf/seed_checkpoint --qos high
# while [ ! -f debug_local_results/${model_name}/debugperf/seed_checkpoint/status.txt ] || [ "$(cat debug_local_results/${model_name}/debugperf/seed_checkpoint/status.txt)" != "completed" ]; do
# echo "Waiting for seed checkpoint from ${model_name} to complete ..."
# sleep 1
# done
# USE_MOE=1 python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugmodel --qos high
# USE_MOE=1 python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf --qos high
# echo "================"
# done
@@ -7,7 +7,8 @@
#SBATCH --ntasks-per-node=1
#SBATCH --qos={{ qos }}
#SBATCH --cpus-per-task=12
#SBATCH --partition=hopper-prod
#SBATCH --partition=hopper-extra
#SBATCH --time=00:30:00

# Misc initializations.
echo "========================"
@@ -19,8 +19,8 @@
from rich.table import Table
from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn, TimeElapsedColumn

BASELINE = "fsdp2_tp1_cp1_pp1"
# BASELINE = "fsdp1_tp1_cp1_pp1"
# BASELINE = "fsdp2_tp1_cp1_pp1"
BASELINE = "fsdp1_tp1_cp1_pp1"

console = Console()

@@ -114,7 +114,7 @@ def _create_slurm_script(
print(f"Slurm script created at {script_path}")


def create_configs(model_name: str, out_dir: str, flavor: str, model_type: str = "transformers_modeling_backend"):
def create_configs(model_name: str, out_dir: str, flavor: str, model_type: str = "transformers_modeling_backend", hf_assets_path: str = None):
"""
results/
|_ meta-llama
@@ -166,29 +166,33 @@ def create_configs(model_name: str, out_dir: str, flavor: str, model_type: str =
config["hf_transformers"]["model"] = model_name
config["model"]["flavor"] = flavor

# Extract just the model name from repo_id (e.g., "Llama-3.2-1B" from "meta-llama/Llama-3.2-1B")
model_name_only = model_name.split("/")[-1] if "/" in model_name else model_name
config["model"]["hf_assets_path"] = f"./{out_dir}/{model_name}/assets/hf/{model_name_only}"
# Use provided hf_assets_path or default
if hf_assets_path:
config["model"]["hf_assets_path"] = hf_assets_path
else:
# Extract just the model name from repo_id (e.g., "Llama-3.2-1B" from "meta-llama/Llama-3.2-1B")
model_name_only = model_name.split("/")[-1] if "/" in model_name else model_name
config["model"]["hf_assets_path"] = f"./{out_dir}/{model_name}/assets/hf/{model_name_only}"
elif model_type == "torchtitan":
config["model"]["name"] = model_name
config["model"]["flavor"] = flavor
config["model"]["hf_assets_path"] = f"/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer"
config["model"]["hf_assets_path"] = hf_assets_path or "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer"
else:
raise ValueError(f"Unknown model_type: {model_type}. Must be 'transformers_modeling_backend' or 'torchtitan'")

# Set absolute path to dataset to avoid path resolution issues
config["training"]["dataset_path"] = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/c4_test"

parallelism_configs = [
BASELINE, # baseline
"fsdp2_tp2_cp1_pp1",
"fsdp2_tp1_cp1_pp2",
"fsdp2_tp1_cp2_pp1",
"fsdp2_tp1_cp2_pp2",
"fsdp2_tp2_cp2_pp1",
"fsdp2_tp2_cp1_pp2",
"fsdp2_tp2_cp2_pp2",
]
# parallelism_configs = [
# BASELINE, # baseline
# "fsdp2_tp2_cp1_pp1",
# "fsdp2_tp1_cp1_pp2",
# "fsdp2_tp1_cp2_pp1",
# "fsdp2_tp1_cp2_pp2",
# "fsdp2_tp2_cp2_pp1",
# "fsdp2_tp2_cp1_pp2",
# "fsdp2_tp2_cp2_pp2",
# ]

# parallelism_configs = [
# BASELINE, # baseline
@@ -201,11 +205,11 @@ def create_configs(model_name: str, out_dir: str, flavor: str, model_type: str =
# # "fsdp1_tp2_cp2_pp2",
# ]

# parallelism_configs = [
# BASELINE, # baseline
# # "fsdp2_tp1_cp1_pp2",
# # "fsdp1_tp1_cp1_pp2",
# ]
parallelism_configs = [
BASELINE, # baseline
# "fsdp2_tp1_cp1_pp2",
# "fsdp1_tp1_cp1_pp2",
]

out_path = Path(out_dir) / model_name / flavor
out_path.mkdir(parents=True, exist_ok=True)
@@ -1187,6 +1191,8 @@ def fmt_min_avg_max(min_v, avg_v, max_v, fmt="{:.2f}"):
create_configs_parser.add_argument("--model_type", type=str, default="transformers_modeling_backend",
choices=["transformers_modeling_backend", "torchtitan"],
help="Model type: 'transformers_modeling_backend' for HF models, 'torchtitan' for torchtitan native")
create_configs_parser.add_argument("--hf_assets_path", type=str, default=None,
help="Override hf_assets_path (tokenizer path). If not provided, uses default based on model_type.")

submit_jobs_parser = subparsers.add_parser("submit_jobs")
submit_jobs_parser.add_argument("--inp_dir", type=str, required=True)
@@ -1213,7 +1219,7 @@ def fmt_min_avg_max(min_v, avg_v, max_v, fmt="{:.2f}"):
args = parser.parse_args()

if args.action == "create_configs":
create_configs(args.model_name, args.out_dir, args.flavor, args.model_type)
create_configs(args.model_name, args.out_dir, args.flavor, args.model_type, args.hf_assets_path)
elif args.action == "submit_jobs":
submit_jobs(args.inp_dir, args.qos, args.only)
elif args.action == "report":
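The new --hf_assets_path flag lets the caller override the tokenizer path written into each generated config instead of relying on the per-backend default. A minimal sketch of the resulting [model] section for the transformers modeling backend, assuming TOML job configs; the model name and path shown are simply the ones used in the run script above:

# Hypothetical excerpt of a generated transformers_modeling_backend job config
[hf_transformers]
model = "meta-llama/Llama-3.2-1B"

[model]
flavor = "debugperf"
hf_assets_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer"

When the flag is omitted, the HF backend falls back to "./{out_dir}/{model_name}/assets/hf/{model_name_only}" and the torchtitan backend to the shared test tokenizer path, exactly as in create_configs above.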
3 changes: 3 additions & 0 deletions torchtitan/models/llama3/__init__.py
@@ -27,6 +27,9 @@


llama3_args = {
"debugperf": TransformerModelArgs(
dim=256, n_layers=6, n_heads=16, n_kv_heads=16, vocab_size=2048, rope_theta=500000
),
"debugmodel": TransformerModelArgs(
dim=256, n_layers=6, n_heads=16, vocab_size=2048, rope_theta=500000
),