Merge pull request #20 from shirleyzhu233/xzhu/sdxl

Update SDXL implementation

fmu2 authored Oct 21, 2024
2 parents a6406e9 + fa5ce80 commit da3f3d2
Showing 13 changed files with 396 additions and 1,752 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -25,6 +25,8 @@
```bash
conda env create -f environment.yml
conda activate freecontrol
pip install -U diffusers
pip install -U gradio
```
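These two `pip install -U` lines upgrade `diffusers` and `gradio` beyond what `environment.yml` pins, since SDXL support lives in newer `diffusers` releases. A quick post-install sanity check (a minimal sketch; the PR itself does not state minimum versions):

```python
# Verify the upgraded packages import and expose SDXL support.
import diffusers
import gradio

print("diffusers:", diffusers.__version__)
print("gradio:", gradio.__version__)

# StableDiffusionXLPipeline only exists in SDXL-capable diffusers releases.
from diffusers import StableDiffusionXLPipeline  # ImportError on old versions
```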

**Sample Semantic Bases**
85 changes: 42 additions & 43 deletions config/base.yaml
@@ -5,70 +5,69 @@ sd_config:
  steps: 200                    # Number of diffusion steps
  guidance_scale: 7.5           # Classifier-free guidance scale
  grad_guidance_scale: 1        # Gradient guidance scale; multiplied by the weight of each guidance type
- sd_version: '2.1'             # choice from ['2.1'|'2.0'|'1.4']
+ sd_version: "1.5"             # choice from ["1.5", "2.1_base"]
  dreambooth: null              # Path to dreambooth. Set to null to disable dreambooth
  safetensors: True             # whether to use safetensors; most checkpoints from civitai.com use the safetensors format
  same_latent: False            # whether to use the same latent from inversion
  appearnace_same_latent: False
  pca_paths:
    []
  seed: 922                     # Seed for the random number generator
  generated_sample: False       # Whether to use a generated sample as the reference pose
- prompt : ""                   # Prompt for generated sample
+ prompt: ""                    # Prompt for generated sample
  negative_prompt: "longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality" # Prompt for negative sample
- target_obj: ''                # Target object for inversion
+ target_obj: ""                # Target object for inversion
  obj_pairs: ""

data:
  inversion:
-   target_folder: 'dataset/latent'
+   target_folder: "dataset/latent"
    num_inference_steps: 999
-   method: 'DDIM'              # choice from ['DDIM'|'NTI'|'NPT']
-   fixed_size: [512,512]       # Set to null to disable fixed size, otherwise set to the fixed size (h, w) of the target image
+   method: "DDIM"              # choice from ['DDIM'|'NTI'|'NPT']
+   fixed_size: [512, 512]      # Set to null to disable fixed size, otherwise set to the fixed size (h, w) of the target image
    prompt: "A photo of a person, holding his hands, sketch"
    select_objects: "person"
    policy: "share"             # choice from ['share'|'separate']
-   sd_model: ''
+   sd_model: ""

# Guidance config
guidance:
  pca_guidance:
    start_step: 0               # Start step for PCA self-attention injection
    end_step: 80                # End step for PCA injection
    weight: 800                 # Weight for PCA self-attention injection
    select_feature: "key"
    # weight1: 0
    structure_guidance:         # Parameters for PCA injection
      apply: True               # Whether to apply PCA injection
      n_components: 64          # Number of leading components for PCA injection
      normalized: True          # Whether to normalize the PCA score
-     mask_type: 'cross_attn'   # Mask type for PCA injection, choice from ['cross_attn'|'tr']
-     penalty_type: 'max'       # Penalty type for PCA injection, choice from ['max'|'mean']
+     mask_type: "cross_attn"   # Mask type for PCA injection, choice from ["cross_attn"|"tr"]
+     penalty_type: "max"       # Penalty type for PCA injection, choice from ["max"|"mean"]
      mask_tr: 0.3              # Threshold for the PCA score, only applied when normalized is true
      penalty_factor: 10        # Penalty factor for the masked region, only applied when normalized is true
      warm_up:                  # Parameters for guidance weight warm-up
        apply: True             # Whether to apply warm-up
        end_step: 10            # End step for warm-up
      adaptive:                 # Parameters for adaptive self-attention injection
        apply: False            # Whether to apply adaptive self-attention injection
        adaptive_p: 1           # Power of the adaptive threshold
      blocks:                   # Blocks for self-attention injection
-       ['up_blocks.1']
+       ["up_blocks.1"]
    appearance_guidance:        # Parameters for texture regulation
      apply: True               # Whether to apply texture regulation
      reg_factor: 0.05          # Regularization factor
      tr: 0.5
      cross_attn_mask_tr: 0.3
      app_n_components: 2       # Number of leading components used to extract the mean appearance feature

  cross_attn:
    start_step: 0               # Start step for cross-attention guidance
    end_step: 80                # End step for cross-attention guidance
    weight: 0                   # Weight for cross-attention guidance
    obj_only: True              # Whether to apply object-only cross-attention guidance
    soft_guidance:              # Parameters for soft guidance
      apply: True               # Whether to apply soft guidance
      kernel_size: 5            # Kernel size for Gaussian blur
      sigma: 2                  # Sigma for Gaussian blur
    blocks:
-     ['up_blocks.1']
+     ["up_blocks.1"]
73 changes: 73 additions & 0 deletions config/sdxl_base.yaml
@@ -0,0 +1,73 @@
# SD config
sd_config:
  H: 1024                     # Generated image height
  W: 1024                     # Generated image width
  steps: 200                  # Number of diffusion steps
  guidance_scale: 7.5         # Classifier-free guidance scale
  grad_guidance_scale: 1      # Gradient guidance scale; multiplied by the weight of each guidance type
  sd_version: "XL-1.0"        # choice from ["XL-1.0"|"2.1"|"2.0"|"1.4"]
  dreambooth: null            # Path to dreambooth. Set to null to disable dreambooth
  safetensors: True           # whether to use safetensors; most checkpoints from civitai.com use the safetensors format
  same_latent: False          # whether to use the same latent from inversion
  appearnace_same_latent: False
  pca_paths:
    []
  seed: 922                   # Seed for the random number generator
  generated_sample: False     # Whether to use a generated sample as the reference pose
  prompt: ""                  # Prompt for generated sample
  negative_prompt: "longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality" # Prompt for negative sample
  target_obj: ""              # Target object for inversion
  obj_pairs: ""

data:
  inversion:
    target_folder: "dataset/latent"
    num_inference_steps: 999
    method: "DDIM"            # choice from ['DDIM'|'NTI'|'NPT']
    fixed_size: [1024, 1024]  # Set to null to disable fixed size, otherwise set to the fixed size (h, w) of the target image
    prompt: "A photo of a person, holding his hands, sketch"
    select_objects: "person"
    policy: "share"           # choice from ['share'|'separate']
    sd_model: ""

# Guidance config
guidance:
  pca_guidance:
    start_step: 0             # Start step for PCA self-attention injection
    end_step: 80              # End step for PCA injection
    weight: 800               # Weight for PCA self-attention injection
    select_feature: "key"
    structure_guidance:       # Parameters for PCA injection
      apply: True             # Whether to apply PCA injection
      n_components: 64        # Number of leading components for PCA injection
      normalized: True        # Whether to normalize the PCA score
      mask_type: "cross_attn" # Mask type for PCA injection, choice from ['cross_attn'|'tr']
      penalty_type: "max"     # Penalty type for PCA injection, choice from ['max'|'mean']
      mask_tr: 0.3            # Threshold for the PCA score, only applied when normalized is true
      penalty_factor: 10      # Penalty factor for the masked region, only applied when normalized is true
      warm_up:                # Parameters for guidance weight warm-up
        apply: True           # Whether to apply warm-up
        end_step: 10          # End step for warm-up
      adaptive:               # Parameters for adaptive self-attention injection
        apply: False          # Whether to apply adaptive self-attention injection
        adaptive_p: 1         # Power of the adaptive threshold
      blocks:                 # Blocks for self-attention injection
        ["up_blocks.0"]
    appearance_guidance:      # Parameters for texture regulation
      apply: True             # Whether to apply texture regulation
      reg_factor: 0.05        # Regularization factor
      tr: 0.5
      cross_attn_mask_tr: 0.3
      app_n_components: 2     # Number of leading components used to extract the mean appearance feature

  cross_attn:
    start_step: 0             # Start step for cross-attention guidance
    end_step: 80              # End step for cross-attention guidance
    weight: 0                 # Weight for cross-attention guidance
    obj_only: True            # Whether to apply object-only cross-attention guidance
    soft_guidance:            # Parameters for soft guidance
      apply: True             # Whether to apply soft guidance
      kernel_size: 5          # Kernel size for Gaussian blur
      sigma: 2                # Sigma for Gaussian blur
    blocks:
      ["up_blocks.0"]
13 changes: 10 additions & 3 deletions gradio_app.py
@@ -69,13 +69,20 @@ def freecontrol_generate(condition_image, prompt, scale, ddim_steps, sd_version,
      input_config = gradio_update_parameter

      # Load base config
-     base_config = yaml.load(open("config/base.yaml", "r"), Loader=yaml.FullLoader)
+     if 'XL' in sd_version:
+         config_path = 'config/sdxl_base.yaml'
+     else:
+         config_path = 'config/base.yaml'
+     base_config = yaml.load(open(config_path, "r"), Loader=yaml.FullLoader)
      # Update the default config with the gradio config
      config = merge_sweep_config(base_config=base_config, update=input_config)
      config = OmegaConf.create(config)

      # set the correct pipeline
-     pipeline_name = "SDPipeline"
+     if 'XL' in sd_version:
+         pipeline_name = "SDXLPipeline"
+     else:
+         pipeline_name = "SDPipeline"

      pipeline = make_pipeline(pipeline_name,
                               model_path,
@@ -277,7 +284,7 @@ def main():

run_button.click(fn=freecontrol_generate, inputs=ips, outputs=[result_gallery])

block.launch(server_name='0.0.0.0', share=False, server_port=9989)
block.launch(server_name='0.0.0.0', share=True, server_port=9989)


if __name__ == '__main__':
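The `'XL' in sd_version` test now gates both the config path and the pipeline name, so the two branches must stay in sync; a small helper could keep them together. A minimal sketch (the helper name `resolve_pipeline` is illustrative, not part of this PR). Note also that the `share=True` launch change additionally exposes the app through a public Gradio tunnel:

```python
def resolve_pipeline(sd_version: str) -> tuple[str, str]:
    """Pair the base config file with the matching pipeline class name."""
    if 'XL' in sd_version:
        return 'config/sdxl_base.yaml', 'SDXLPipeline'
    return 'config/base.yaml', 'SDPipeline'

config_path, pipeline_name = resolve_pipeline('XL-1.0')
assert (config_path, pipeline_name) == ('config/sdxl_base.yaml', 'SDXLPipeline')
```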
3 changes: 2 additions & 1 deletion libs/model/__init__.py
@@ -1,2 +1,3 @@
  from .pipelines import make_pipeline
  from . import sd_pipeline
+ from . import sdxl_pipeline
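Importing `sd_pipeline` and `sdxl_pipeline` for their side effects suggests each module registers its pipeline class with the factory on import. A minimal sketch of that registry pattern (the decorator and dictionary names here are assumptions, not the repository's actual code):

```python
# Hypothetical registry: importing a pipeline module runs its
# @register_pipeline decorator, making the class visible to make_pipeline.
_PIPELINES: dict[str, type] = {}

def register_pipeline(name: str):
    def wrap(cls: type) -> type:
        _PIPELINES[name] = cls
        return cls
    return wrap

def make_pipeline(name: str, model_path: str, **kwargs):
    # Look up "SDPipeline" or "SDXLPipeline" and build it from a model path.
    return _PIPELINES[name].from_pretrained(model_path, **kwargs)
```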
6 changes: 3 additions & 3 deletions libs/model/module/xformer_attention.py
@@ -54,15 +54,15 @@ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, a
          if attn.group_norm is not None:
              hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

-         query = attn.to_q(hidden_states, scale=scale)
+         query = attn.to_q(hidden_states)

          if encoder_hidden_states is None:
              encoder_hidden_states = hidden_states
          elif attn.norm_cross:
              encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

-         key = attn.to_k(encoder_hidden_states, scale=scale)
-         value = attn.to_v(encoder_hidden_states, scale=scale)
+         key = attn.to_k(encoder_hidden_states)
+         value = attn.to_v(encoder_hidden_states)

          # Record the Q, K, V for PCA guidance
          self.key = key
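Dropping the `scale=` keyword tracks newer `diffusers` releases, where the LoRA-era `scale` argument was removed from the `to_q`/`to_k`/`to_v` projection calls (they are plain linear layers again). If one needed to support both old and new `diffusers`, a version-tolerant call could look like this (purely illustrative; the PR simply drops the argument):

```python
import inspect

def project(layer, hidden_states, scale=1.0):
    # Older diffusers LoRA-compatible layers accepted a `scale` kwarg;
    # plain nn.Linear does not. Pass it only when the forward supports it.
    if "scale" in inspect.signature(layer.forward).parameters:
        return layer(hidden_states, scale=scale)
    return layer(hidden_states)
```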
4 changes: 2 additions & 2 deletions libs/model/sd_pipeline.py
@@ -304,8 +304,8 @@ def __call__(
              self.compute_cross_attn_mask(cond_control_ids, cond_example_ids, cond_appearance_ids)

          if _in_step(self.guidance_config.pca_guidance, i):
              # Compute the PCA structure and appearance guidance
              # Set the select feature to key by default
              try:
                  select_feature = self.guidance_config.pca_guidance.select_feature
              except:
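The bare `try/except` around `select_feature` is effectively a default lookup; with OmegaConf configs the same fallback can be written without swallowing unrelated errors. A minimal sketch (assuming `guidance_config` is an OmegaConf node, as the `OmegaConf.create` call in `gradio_app.py` suggests):

```python
from omegaconf import OmegaConf

# Fall back to "key" when select_feature is absent, without a bare
# except that would also hide real attribute errors and config typos.
select_feature = OmegaConf.select(
    guidance_config, "pca_guidance.select_feature", default="key"
)
```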
(Diffs for the remaining 6 changed files are not shown.)

0 comments on commit da3f3d2
