@@ -95,8 +95,13 @@ def __init__(self, process_id: int, job, config: OrderedDict, **kwargs):
                 raise ValueError("diff_output_preservation requires a network to be set")
             if self.train_config.train_text_encoder:
                 raise ValueError("diff_output_preservation is not supported with train_text_encoder")
-
-            # always do a prior prediction when doing diff output preservation
+
+        if self.train_config.blank_prompt_preservation:
+            if self.network_config is None:
+                raise ValueError("blank_prompt_preservation requires a network to be set")
+
+        if self.train_config.blank_prompt_preservation or self.train_config.diff_output_preservation:
+            # always do a prior prediction when doing output preservation
             self.do_prior_prediction = True

         # store the loss target for a batch so we can use it in a loss
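Both preservation modes regularize a LoRA/adapter against the frozen base model, so each requires a network to be set and each forces a prior (base-model) prediction every step. A compact sketch of that gating, with the config field names taken from the diff and everything else illustrative:

# Sketch only: mirrors the validation/gating above, not the trainer itself.
def configure_preservation(train_config, network_config) -> bool:
    if train_config.diff_output_preservation or train_config.blank_prompt_preservation:
        if network_config is None:
            raise ValueError("output preservation requires a network to be set")
    if train_config.diff_output_preservation and train_config.train_text_encoder:
        raise ValueError("diff_output_preservation is not supported with train_text_encoder")
    # either mode needs the base model's prediction as a regularization target
    return train_config.diff_output_preservation or train_config.blank_prompt_preservation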
@@ -372,6 +377,13 @@ def hook_before_train_loop(self):
             self.sd.text_encoder_to("cpu")
             flush()

+        if self.train_config.blank_prompt_preservation and self.cached_blank_embeds is None:
+            # make sure we have this if not unloading
+            self.cached_blank_embeds = self.sd.encode_prompt("").to(
+                self.device_torch,
+                dtype=self.sd.torch_dtype
+            ).detach()
+
         if self.train_config.diffusion_feature_extractor_path is not None:
             vae = self.sd.vae
             # if not (self.model_config.arch in ["flux"]) or self.sd.vae.__class__.__name__ == "AutoencoderPixelMixer":
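Caching the empty-prompt embedding up front means the text encoder never has to run (or stay on the GPU) during the training loop. A minimal sketch of the pattern, assuming an `encode_prompt`-style helper as used in the diff; other names are illustrative:

import torch

# Sketch: compute the "" embedding once, move it to the training device,
# and detach it so it is treated as a constant condition later.
def cache_blank_embeds(sd, device: torch.device):
    with torch.no_grad():
        blank = sd.encode_prompt("")  # helper name taken from the diff
    return blank.to(device, dtype=sd.torch_dtype).detach()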
@@ -634,33 +646,28 @@ def calculate_loss(
                 stepped_latents = torch.cat(stepped_chunks, dim=0)

                 stepped_latents = stepped_latents.to(self.sd.vae.device, dtype=self.sd.vae.dtype)
-                # resize to half the size of the latents
-                stepped_latents_half = torch.nn.functional.interpolate(
-                    stepped_latents,
-                    size=(stepped_latents.shape[2] // 2, stepped_latents.shape[3] // 2),
-                    mode='bilinear',
-                    align_corners=False
-                )
-                pred_features = self.dfe(stepped_latents.float())
-                pred_features_half = self.dfe(stepped_latents_half.float())
+                sl = stepped_latents
+                if len(sl.shape) == 5:
+                    # video B,C,T,H,W
+                    sl = sl.permute(0, 2, 1, 3, 4)  # B,T,C,H,W
+                    b, t, c, h, w = sl.shape
+                    sl = sl.reshape(b * t, c, h, w)
+                pred_features = self.dfe(sl.float())
                 with torch.no_grad():
-                    target_features = self.dfe(batch.latents.to(self.device_torch, dtype=torch.float32))
-                    batch_latents_half = torch.nn.functional.interpolate(
-                        batch.latents.to(self.device_torch, dtype=torch.float32),
-                        size=(batch.latents.shape[2] // 2, batch.latents.shape[3] // 2),
-                        mode='bilinear',
-                        align_corners=False
-                    )
-                    target_features_half = self.dfe(batch_latents_half)
+                    bl = batch.latents
+                    bl = bl.to(self.sd.vae.device)
+                    if len(bl.shape) == 5:
+                        # video B,C,T,H,W
+                        bl = bl.permute(0, 2, 1, 3, 4)  # B,T,C,H,W
+                        b, t, c, h, w = bl.shape
+                        bl = bl.reshape(b * t, c, h, w)
+                    target_features = self.dfe(bl.float())
                 # scale dfe so it is weaker at higher noise levels
                 dfe_scaler = 1 - (timesteps.float() / 1000.0).view(-1, 1, 1, 1).to(self.device_torch)

                 dfe_loss = torch.nn.functional.mse_loss(pred_features, target_features, reduction="none") * \
                     self.train_config.diffusion_feature_extractor_weight * dfe_scaler
-
-                dfe_loss_half = torch.nn.functional.mse_loss(pred_features_half, target_features_half, reduction="none") * \
-                    self.train_config.diffusion_feature_extractor_weight * dfe_scaler
-                additional_loss += dfe_loss.mean() + dfe_loss_half.mean()
+                additional_loss += dfe_loss.mean()
             elif self.dfe.version == 2:
                 # version 2
                 # do diffusion feature extraction on target
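The version-1 branch now handles video latents by folding the time axis into the batch axis so the 2D feature extractor sees one frame per row; a self-contained illustration of that shape transform (the tensor sizes are made up for the example):

import torch

def fold_time_into_batch(latents: torch.Tensor) -> torch.Tensor:
    # Video latents arrive as B,C,T,H,W; a 2D extractor wants N,C,H,W,
    # so move T next to B and flatten: B,C,T,H,W -> B,T,C,H,W -> (B*T),C,H,W.
    if latents.dim() == 5:
        b, c, t, h, w = latents.shape
        latents = latents.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w)
    return latents

frames = fold_time_into_batch(torch.randn(2, 16, 8, 32, 32))
assert frames.shape == (16, 16, 32, 32)  # 2 clips x 8 frames become 16 images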
@@ -1798,6 +1805,14 @@ def get_adapter_multiplier():
             if self.train_config.diff_output_preservation:
                 prior_embeds_to_use = self.diff_output_preservation_embeds.expand_to_batch(noisy_latents.shape[0])

+            if self.train_config.blank_prompt_preservation:
+                blank_embeds = self.cached_blank_embeds.clone().detach().to(
+                    self.device_torch, dtype=dtype
+                )
+                prior_embeds_to_use = concat_prompt_embeds(
+                    [blank_embeds] * noisy_latents.shape[0]
+                )
+
             prior_pred = self.get_prior_prediction(
                 noisy_latents=noisy_latents,
                 conditional_embeds=prior_embeds_to_use,
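Under blank_prompt_preservation the prior prediction is conditioned on the cached empty-prompt embedding rather than the trigger-swapped prompt, so the frozen base model produces the target it would give for no prompt at all. A simplified sketch of the batch expansion; the real code wraps embeddings in a PromptEmbeds-style object combined with concat_prompt_embeds, while plain tensors and torch.cat are used here just to show the repetition:

import torch

def expand_blank_embeds(cached_blank_embeds: torch.Tensor, batch_size: int,
                        device: torch.device, dtype: torch.dtype) -> torch.Tensor:
    # Reuse one cached empty-prompt embedding for every sample in the batch.
    blank = cached_blank_embeds.clone().detach().to(device, dtype=dtype)
    return torch.cat([blank] * batch_size, dim=0)  # one copy per batch item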
@@ -1973,7 +1988,8 @@ def get_adapter_multiplier():
             prior_to_calculate_loss = prior_pred
             # if we are doing diff_output_preservation and not doing inverted masked prior
             # then we need to send none here so it will not target the prior
-            if self.train_config.diff_output_preservation and not do_inverted_masked_prior:
+            doing_preservation = self.train_config.diff_output_preservation or self.train_config.blank_prompt_preservation
+            if doing_preservation and not do_inverted_masked_prior:
                 prior_to_calculate_loss = None

             loss = self.calculate_loss(
@@ -1986,24 +2002,34 @@ def get_adapter_multiplier():
                 prior_pred=prior_to_calculate_loss,
             )

-            if self.train_config.diff_output_preservation:
+            if self.train_config.diff_output_preservation or self.train_config.blank_prompt_preservation:
                 # send the loss backwards otherwise checkpointing will fail
                 self.accelerator.backward(loss)
                 normal_loss = loss.detach()  # don't send backward again

-                dop_embeds = self.diff_output_preservation_embeds.expand_to_batch(noisy_latents.shape[0])
-                dop_pred = self.predict_noise(
+                with torch.no_grad():
+                    if self.train_config.diff_output_preservation:
+                        preservation_embeds = self.diff_output_preservation_embeds.expand_to_batch(noisy_latents.shape[0])
+                    elif self.train_config.blank_prompt_preservation:
+                        blank_embeds = self.cached_blank_embeds.clone().detach().to(
+                            self.device_torch, dtype=dtype
+                        )
+                        preservation_embeds = concat_prompt_embeds(
+                            [blank_embeds] * noisy_latents.shape[0]
+                        )
+                preservation_pred = self.predict_noise(
                     noisy_latents=noisy_latents.to(self.device_torch, dtype=dtype),
                     timesteps=timesteps,
-                    conditional_embeds=dop_embeds.to(self.device_torch, dtype=dtype),
+                    conditional_embeds=preservation_embeds.to(self.device_torch, dtype=dtype),
                     unconditional_embeds=unconditional_embeds,
                     batch=batch,
                     **pred_kwargs
                 )
-                dop_loss = torch.nn.functional.mse_loss(dop_pred, prior_pred) * self.train_config.diff_output_preservation_multiplier
-                self.accelerator.backward(dop_loss)
-
-                loss = normal_loss + dop_loss
+                multiplier = self.train_config.diff_output_preservation_multiplier if self.train_config.diff_output_preservation else self.train_config.blank_prompt_preservation_multiplier
+                preservation_loss = torch.nn.functional.mse_loss(preservation_pred, prior_pred) * multiplier
+                self.accelerator.backward(preservation_loss)
+
+                loss = normal_loss + preservation_loss
                 loss = loss.clone().detach()
                 # require grad again so the backward won't fail
                 loss.requires_grad_(True)
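The preservation term itself is a plain MSE between the adapted model's prediction for the preservation prompt and the prior prediction used as a fixed target, scaled by the mode's multiplier and backpropagated in its own pass, matching the diff's note that checkpointing needs a backward per prediction. A condensed sketch of that flow; function boundaries and any names not in the diff are illustrative:

import torch.nn.functional as F

# Sketch: pull the adapted prediction toward the prior prediction, run the
# two backward passes separately, then combine the losses for logging only.
def preservation_step(accelerator, loss, preservation_pred, prior_pred, multiplier):
    accelerator.backward(loss)                      # main loss first
    normal_loss = loss.detach()                     # don't send it backward again
    preservation_loss = F.mse_loss(preservation_pred, prior_pred) * multiplier
    accelerator.backward(preservation_loss)         # separate backward pass
    total = (normal_loss + preservation_loss).clone().detach()
    return total.requires_grad_(True)               # so a later backward won't fail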