# diffusion-pipe/examples/main_example.toml (klinok64/diffusion-pipe, main branch)

# Output path for training runs. Each training run creates a new directory inside it.
output_dir = '/data/diffusion_pipe_training_runs/hunyuan_video_test'

# Path to the dataset config file.
dataset = 'examples/dataset.toml'
# Optionally, you can have separate eval datasets. Each entry takes a `name`, which is used to label
# its Tensorboard metrics, and the path to its own dataset `config` file.
# eval_datasets = [
#     {name = 'something', config = 'path/to/eval_dataset.toml'},
# ]

# Training settings

# I usually set this to a really high value because I don't know in advance how long I want to train.
epochs = 1000
# Batch size of a single forward/backward pass for one GPU.
micro_batch_size_per_gpu = 1
# For mixed video / image training, you can have a different batch size for images.
#image_micro_batch_size_per_gpu = 4
# Pipeline parallelism degree. A single instance of the model is divided across this many GPUs.
pipeline_stages = 1
# Number of micro-batches sent through the pipeline for each training step.
# If pipeline_stages > 1, a higher gradient_accumulation_steps (GAS) means better GPU utilization due to
# smaller pipeline bubbles (periods where GPUs aren't overlapping computation).
gradient_accumulation_steps = 1
# Gradient norm clipping.
gradient_clipping = 1.0
# Learning rate warmup.
warmup_steps = 100

# Block swapping is supported for Wan, HunyuanVideo, Flux, and Chroma. This value controls the number
# of blocks kept offloaded to RAM. Increasing it lowers VRAM use, but has a performance penalty. The
# exact performance penalty depends on the model and the type of training you are doing (e.g. images vs video).
# Block swapping only works for LoRA training, and requires pipeline_stages=1.
#blocks_to_swap = 20

# Eval settings

eval_every_n_epochs = 1
eval_before_first_step = true
# You might want to set these lower for eval so that fewer images get dropped.
# Each size bucket of images/videos is rounded down to the nearest multiple of the global batch size, so a
# higher global batch size means more dropped images. This usually doesn't matter for training, but the eval
# set is usually much smaller, so it can matter there.
eval_micro_batch_size_per_gpu = 1
# Batch size for images when doing mixed image / video training. Will be micro_batch_size_per_gpu if not set.
# NOTE(review): confirm whether the fallback is micro_batch_size_per_gpu or eval_micro_batch_size_per_gpu.
#image_eval_micro_batch_size_per_gpu = 4
eval_gradient_accumulation_steps = 1
# If using block swap, you can disable it for eval. Eval uses less memory, so depending on how many blocks
# you are swapping you may be able to get away with this, and then eval is much faster.
#disable_block_swap_for_eval = true

# Misc settings

# You probably want to set this a bit higher if you have a smaller dataset, so you don't end up with a million saved models.
save_every_n_epochs = 2
# The training state can be checkpointed every n epochs or every n minutes. Set only one of these.
# You can resume from checkpoints using the --resume_from_checkpoint flag.
#checkpoint_every_n_epochs = 1
checkpoint_every_n_minutes = 120
# Always set to true unless you have a huge amount of VRAM.
# This can also be 'unsloth' to reduce VRAM even more, with a slight performance hit.
activation_checkpointing = true

# Controls how Deepspeed decides how to divide layers across GPUs. Probably don't change this.
partition_method = 'parameters'
# Alternatively you can use 'manual' in combination with partition_split, which specifies the split points for dividing
# layers between GPUs. For example, with two GPUs, partition_split=[10] puts layers 0-9 on GPU 0, and the rest on GPU 1.
# With three GPUs, partition_split=[10, 20] puts layers 0-9 on GPU 0, layers 10-19 on GPU 1, and the rest on GPU 2.
# The length of partition_split must be pipeline_stages-1.
#partition_split = [N]

# Dtype for saving the LoRA or model, if different from the training dtype.
save_dtype = 'bfloat16'
# Batch size for caching latents and text embeddings. Increasing it can lead to higher GPU utilization during
# the caching phase, but uses more memory.
caching_batch_size = 1
# How often Deepspeed logs to the console.
steps_per_print = 1
# How to extract video clips for training from a single input video file.
# The video file is first assigned to one of the configured frame buckets, but then one or more clips of exactly
# the right number of frames for that bucket must be extracted from it. Options:
#   single_beginning: one clip starting at the beginning of the video
#   single_middle: one clip from the middle of the video (cutting off the start and end equally)
#   multiple_overlapping: extract the minimum number of clips to cover the full range of the video. They might overlap somewhat.
# The default is single_beginning.
video_clip_mode = 'single_beginning'

# This is how you configure HunyuanVideo. Other models will be different. See docs/supported_models.md for
# details on the configuration and options for each model.
[model]
type = 'hunyuan-video'
# Option 1: load HunyuanVideo entirely from the ckpt path set up for the official inference scripts.
#ckpt_path = '/home/anon/HunyuanVideo/ckpts'
# Option 2: load it by pointing to each of the ComfyUI files (transformer, VAE, LLM text encoder, CLIP).
transformer_path = '/data2/imagegen_models/hunyuan_video_comfyui/hunyuan_video_720_cfgdistill_fp8_e4m3fn.safetensors'
vae_path = '/data2/imagegen_models/hunyuan_video_comfyui/hunyuan_video_vae_bf16.safetensors'
llm_path = '/data2/imagegen_models/hunyuan_video_comfyui/llava-llama-3-8b-text-encoder-tokenizer'
clip_path = '/data2/imagegen_models/hunyuan_video_comfyui/clip-vit-large-patch14'
# Base dtype used for all models.
dtype = 'bfloat16'
# HunyuanVideo supports fp8 for the transformer when training LoRA.
transformer_dtype = 'float8'
# How to sample timesteps to train on. Can be 'logit_normal' or 'uniform'.
timestep_sample_method = 'logit_normal'

# For models that support full fine tuning (FFT), simply delete or comment out the whole [adapter] table
# to do a full fine tune instead of training an adapter.
[adapter]
type = 'lora'
rank = 32
# Dtype for the LoRA weights you are training.
dtype = 'bfloat16'
# You can initialize the LoRA weights from a previously trained LoRA.
#init_from_existing = '/data/diffusion_pipe_training_runs/something/epoch50'

[optimizer]
# AdamW from the optimi library is a good default since it automatically uses Kahan summation when training bfloat16 weights.
# Look at train.py for other options. You could also easily edit the file and add your own.
type = 'adamw_optimi'
lr = 2e-5
betas = [0.9, 0.99]
weight_decay = 0.01
eps = 1e-8

# The alternatives below are commented out on purpose: TOML does not allow the same table to be defined
# twice, so comment out the active [optimizer] table above before enabling one of them.

# Can use this optimizer for a bit less memory usage.
# [optimizer]
# type = 'AdamW8bitKahan'
# lr = 2e-5
# betas = [0.9, 0.99]
# weight_decay = 0.01
# stabilize = false

# Any optimizer not explicitly supported will be dynamically loaded from the pytorch-optimizer library.
# [optimizer]
# type = 'Prodigy'
# lr = 1
# betas = [0.9, 0.99]
# weight_decay = 0.01

[monitoring]
# To enable wandb, set enable_wandb to true and fill in the three wandb_* fields below.
# Avoid committing a real API key to version control.
enable_wandb = false
wandb_api_key = ''
wandb_tracker_name = ''
wandb_run_name = ''