Add debugperf_large model configuration and profiling support
3outeille committed Dec 8, 2025
commit d78b0e69addb8abfbd155207eeb871bd0349f3f5
10 changes: 10 additions & 0 deletions torchtitan/experiments/transformers_modeling_backend/__init__.py
@@ -33,6 +33,16 @@
rope_theta=500000,
),
),
"debugperf_large": HFTransformerModelArgs(
titan_dense_args=TitanDenseModelArgs(
dim=1024,
n_layers=12,
n_heads=16,
n_kv_heads=16,
vocab_size=32000,
rope_theta=500000,
),
),
"debugmodel": HFTransformerModelArgs(
titan_dense_args=TitanDenseModelArgs(
dim=256,
@@ -20,38 +20,38 @@ model_names=(
for model_name in "${tt_model_names[@]}"; do
rm -rf debug_local_results/${model_name}

python ./tooling_dev/test_hf_integration.py create_configs --model_name "$model_name" --out_dir debug_local_results --flavor debugperf --model_type torchtitan
python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf/seed_checkpoint --qos high
while [ ! -f debug_local_results/${model_name}/debugperf/seed_checkpoint/status.txt ] || [ "$(cat debug_local_results/${model_name}/debugperf/seed_checkpoint/status.txt)" != "completed" ]; do
python ./tooling_dev/test_hf_integration.py create_configs --model_name "$model_name" --out_dir debug_local_results --flavor debugperf_large --model_type torchtitan --enable_profiling --profile_freq 5
python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf_large/seed_checkpoint --qos high
while [ ! -f debug_local_results/${model_name}/debugperf_large/seed_checkpoint/status.txt ] || [ "$(cat debug_local_results/${model_name}/debugperf_large/seed_checkpoint/status.txt)" != "completed" ]; do
echo "Waiting for seed checkpoint from ${model_name} to complete ..."
sleep 1
done
python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf --qos high
python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf_large --qos high
echo "================"
done

for model_name in "${model_names[@]}"; do
rm -rf debug_local_results/${model_name}

python ./tooling_dev/test_hf_integration.py create_configs --model_name "$model_name" --out_dir debug_local_results --flavor debugperf --model_type transformers_modeling_backend --hf_assets_path "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer"
python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf/seed_checkpoint --qos high
while [ ! -f debug_local_results/${model_name}/debugperf/seed_checkpoint/status.txt ] || [ "$(cat debug_local_results/${model_name}/debugperf/seed_checkpoint/status.txt)" != "completed" ]; do
python ./tooling_dev/test_hf_integration.py create_configs --model_name "$model_name" --out_dir debug_local_results --flavor debugperf_large --model_type transformers_modeling_backend --hf_assets_path "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer" --enable_profiling --profile_freq 5
python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf_large/seed_checkpoint --qos high
while [ ! -f debug_local_results/${model_name}/debugperf_large/seed_checkpoint/status.txt ] || [ "$(cat debug_local_results/${model_name}/debugperf_large/seed_checkpoint/status.txt)" != "completed" ]; do
echo "Waiting for seed checkpoint from ${model_name} to complete ..."
sleep 1
done
python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf --qos high
python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf_large --qos high
echo "================"
done

# for model_name in "${moe_model_names[@]}"; do
# rm -rf debug_local_results/${model_name}

# USE_MOE=1 python ./tooling_dev/test_hf_integration.py create_configs --model_name "$model_name" --out_dir debug_local_results --flavor debugperf
# USE_MOE=1 python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf/seed_checkpoint --qos high
# while [ ! -f debug_local_results/${model_name}/debugperf/seed_checkpoint/status.txt ] || [ "$(cat debug_local_results/${model_name}/debugperf/seed_checkpoint/status.txt)" != "completed" ]; do
# USE_MOE=1 python ./tooling_dev/test_hf_integration.py create_configs --model_name "$model_name" --out_dir debug_local_results --flavor debugperf_large
# USE_MOE=1 python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf_large/seed_checkpoint --qos high
# while [ ! -f debug_local_results/${model_name}/debugperf_large/seed_checkpoint/status.txt ] || [ "$(cat debug_local_results/${model_name}/debugperf_large/seed_checkpoint/status.txt)" != "completed" ]; do
# echo "Waiting for seed checkpoint from ${model_name} to complete ..."
# sleep 1
# done
# USE_MOE=1 python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf --qos high
# USE_MOE=1 python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf_large --qos high
# echo "================"
# done
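For callers that drive test_hf_integration.py from Python rather than bash, the seed-checkpoint wait used throughout the script above (poll seed_checkpoint/status.txt until it reads "completed") can be written as a small helper. This is an illustrative sketch, not part of the commit; only the status-file path and the "completed" sentinel come from the loops above, the helper itself is hypothetical.

```python
# Illustrative helper, not part of this commit: block until the seed checkpoint
# for a run directory has finished, using the same status.txt/"completed"
# convention as the bash loops above.
import time
from pathlib import Path

def wait_for_seed_checkpoint(run_dir: str, poll_seconds: float = 1.0) -> None:
    status_file = Path(run_dir) / "seed_checkpoint" / "status.txt"
    while not (status_file.exists() and status_file.read_text().strip() == "completed"):
        print(f"Waiting for seed checkpoint in {run_dir} to complete ...")
        time.sleep(poll_seconds)

# e.g. wait_for_seed_checkpoint("debug_local_results/llama3/debugperf_large")
```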
@@ -114,7 +114,7 @@ def _create_slurm_script(
print(f"Slurm script created at {script_path}")


def create_configs(model_name: str, out_dir: str, flavor: str, model_type: str = "transformers_modeling_backend", hf_assets_path: str = None):
def create_configs(model_name: str, out_dir: str, flavor: str, model_type: str = "transformers_modeling_backend", hf_assets_path: str = None, enable_profiling: bool = False, profile_freq: int = 5):
"""
results/
|_ meta-llama
@@ -183,6 +183,12 @@ def create_configs(model_name: str, out_dir: str, flavor: str, model_type: str =
# Set absolute path to dataset to avoid path resolution issues
config["training"]["dataset_path"] = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/c4_test"

# Configure profiling
if enable_profiling:
config["profiling"]["enable_profiling"] = True
config["profiling"]["profile_freq"] = profile_freq
config["profiling"]["save_traces_folder"] = "profile_trace"

# parallelism_configs = [
# BASELINE, # baseline
# "fsdp2_tp2_cp1_pp1",
@@ -1193,6 +1199,10 @@ def fmt_min_avg_max(min_v, avg_v, max_v, fmt="{:.2f}"):
help="Model type: 'transformers_modeling_backend' for HF models, 'torchtitan' for torchtitan native")
create_configs_parser.add_argument("--hf_assets_path", type=str, default=None,
help="Override hf_assets_path (tokenizer path). If not provided, uses default based on model_type.")
create_configs_parser.add_argument("--enable_profiling", action="store_true",
help="Enable PyTorch profiler for trace collection")
create_configs_parser.add_argument("--profile_freq", type=int, default=5,
help="Profiling frequency (steps between profiles)")

submit_jobs_parser = subparsers.add_parser("submit_jobs")
submit_jobs_parser.add_argument("--inp_dir", type=str, required=True)
@@ -1219,7 +1229,7 @@ def fmt_min_avg_max(min_v, avg_v, max_v, fmt="{:.2f}"):
args = parser.parse_args()

if args.action == "create_configs":
create_configs(args.model_name, args.out_dir, args.flavor, args.model_type, args.hf_assets_path)
create_configs(args.model_name, args.out_dir, args.flavor, args.model_type, args.hf_assets_path, args.enable_profiling, args.profile_freq)
elif args.action == "submit_jobs":
submit_jobs(args.inp_dir, args.qos, args.only)
elif args.action == "report":
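The profiling fields set above (enable_profiling, profile_freq, save_traces_folder) feed torchtitan's built-in profiler. As a rough illustration of what profile_freq controls — an assumed schedule, not torchtitan's actual implementation — a trace can be collected once every profile_freq training steps with torch.profiler and written under save_traces_folder:

```python
# Illustrative sketch only (assumed schedule, not torchtitan's profiler code):
# capture one warmup + one active step every `profile_freq` iterations and
# dump the trace under `save_traces_folder`.
import os
import torch

def make_profiler(save_traces_folder: str = "profile_trace", profile_freq: int = 5):
    os.makedirs(save_traces_folder, exist_ok=True)
    warmup, active = 1, 1
    wait = max(profile_freq - warmup - active, 0)
    return torch.profiler.profile(
        activities=[
            torch.profiler.ProfilerActivity.CPU,
            torch.profiler.ProfilerActivity.CUDA,
        ],
        schedule=torch.profiler.schedule(wait=wait, warmup=warmup, active=active),
        on_trace_ready=torch.profiler.tensorboard_trace_handler(save_traces_folder),
    )

# Usage: call prof.step() once per training iteration.
# with make_profiler(profile_freq=5) as prof:
#     for _ in range(num_steps):
#         train_step()
#         prof.step()
```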
8 changes: 8 additions & 0 deletions torchtitan/models/llama3/__init__.py
@@ -30,6 +30,14 @@
"debugperf": TransformerModelArgs(
dim=256, n_layers=6, n_heads=16, n_kv_heads=16, vocab_size=2048, rope_theta=500000
),
"debugperf_large": TransformerModelArgs(
dim=1024,
n_layers=12,
n_heads=16,
n_kv_heads=16,
vocab_size=32000,
rope_theta=500000,
),
"debugmodel": TransformerModelArgs(
dim=256, n_layers=6, n_heads=16, vocab_size=2048, rope_theta=500000
),
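For scale, a back-of-the-envelope parameter count for the two debug flavors above (the same debugperf_large shapes are registered for the transformers_modeling_backend earlier in this commit). The sketch assumes Llama-style FFN sizing with multiple_of=256, no ffn_dim_multiplier, and untied input/output embeddings, so treat the totals as rough estimates:

```python
# Rough parameter estimate for the debug flavors above (assumes Llama-style
# FFN sizing: hidden = round_up(int(2/3 * 4 * dim), multiple_of); untied
# embeddings; norm weights ignored as negligible).
def approx_params(dim, n_layers, vocab_size, multiple_of=256):
    hidden = int(2 * (4 * dim) / 3)
    hidden = multiple_of * ((hidden + multiple_of - 1) // multiple_of)
    attn = 4 * dim * dim    # wq, wk, wv, wo (n_kv_heads == n_heads here)
    ffn = 3 * dim * hidden  # w1, w2, w3
    return n_layers * (attn + ffn) + 2 * vocab_size * dim

print(approx_params(dim=256, n_layers=6, vocab_size=2048))     # debugperf:       ~6.2M
print(approx_params(dim=1024, n_layers=12, vocab_size=32000))  # debugperf_large: ~220M
```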