From ad93a825475ae83778dd9d68b39d3d4bc17aa269 Mon Sep 17 00:00:00 2001 From: ljleb Date: Wed, 12 Jun 2024 14:18:45 -0400 Subject: [PATCH] add t5xxl to sd3 arch --- sd_mecha/models/sd3_sgm.yaml | 16 ++- sd_mecha/models/sd3_sgm_keys.txt | 215 +++++++++++++++++++++++++++++++ 2 files changed, 229 insertions(+), 2 deletions(-) diff --git a/sd_mecha/models/sd3_sgm.yaml b/sd_mecha/models/sd3_sgm.yaml index 34d9c48..16242b2 100644 --- a/sd_mecha/models/sd3_sgm.yaml +++ b/sd_mecha/models/sd3_sgm.yaml @@ -34,7 +34,7 @@ merge: - text_model.embeddings in11: - text_model.encoder.layers.11 - - final_layer_norm + - text_model.final_layer_norm in*: text_model.encoder.layers.* txt2: @@ -45,8 +45,20 @@ merge: - text_model.embeddings in31: - text_model.encoder.layers.31 - - final_layer_norm + - text_model.final_layer_norm - text_projection in*: text_model.encoder.layers.* + t5xxl: + prefix: text_encoders.t5xxl.transformer + blocks: + in0: + - encoder.block.0.#.# + - encoder.embed_tokens + - shared + in23: + - encoder.block.23.#.# + - encoder.final_layer_norm + in*: encoder.block.*.#.# + keys: sd3_sgm_keys.txt diff --git a/sd_mecha/models/sd3_sgm_keys.txt b/sd_mecha/models/sd3_sgm_keys.txt index 1a13182..d1b6ed9 100644 --- a/sd_mecha/models/sd3_sgm_keys.txt +++ b/sd_mecha/models/sd3_sgm_keys.txt @@ -1446,3 +1446,218 @@ text_encoders.clip_l.transformer.text_model.encoder.layers.9.self_attn.v_proj.bi text_encoders.clip_l.transformer.text_model.encoder.layers.9.self_attn.v_proj.weight text_encoders.clip_l.transformer.text_model.final_layer_norm.bias text_encoders.clip_l.transformer.text_model.final_layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.0.layer.0.SelfAttention.k.weight +text_encoders.t5xxl.transformer.encoder.block.0.layer.0.SelfAttention.o.weight +text_encoders.t5xxl.transformer.encoder.block.0.layer.0.SelfAttention.q.weight +text_encoders.t5xxl.transformer.encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight +text_encoders.t5xxl.transformer.encoder.block.0.layer.0.SelfAttention.v.weight +text_encoders.t5xxl.transformer.encoder.block.0.layer.0.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.0.layer.1.DenseReluDense.wi_0.weight +text_encoders.t5xxl.transformer.encoder.block.0.layer.1.DenseReluDense.wi_1.weight +text_encoders.t5xxl.transformer.encoder.block.0.layer.1.DenseReluDense.wo.weight +text_encoders.t5xxl.transformer.encoder.block.0.layer.1.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.1.layer.0.SelfAttention.k.weight +text_encoders.t5xxl.transformer.encoder.block.1.layer.0.SelfAttention.o.weight +text_encoders.t5xxl.transformer.encoder.block.1.layer.0.SelfAttention.q.weight +text_encoders.t5xxl.transformer.encoder.block.1.layer.0.SelfAttention.v.weight +text_encoders.t5xxl.transformer.encoder.block.1.layer.0.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.1.layer.1.DenseReluDense.wi_0.weight +text_encoders.t5xxl.transformer.encoder.block.1.layer.1.DenseReluDense.wi_1.weight +text_encoders.t5xxl.transformer.encoder.block.1.layer.1.DenseReluDense.wo.weight +text_encoders.t5xxl.transformer.encoder.block.1.layer.1.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.10.layer.0.SelfAttention.k.weight +text_encoders.t5xxl.transformer.encoder.block.10.layer.0.SelfAttention.o.weight +text_encoders.t5xxl.transformer.encoder.block.10.layer.0.SelfAttention.q.weight +text_encoders.t5xxl.transformer.encoder.block.10.layer.0.SelfAttention.v.weight +text_encoders.t5xxl.transformer.encoder.block.10.layer.0.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.10.layer.1.DenseReluDense.wi_0.weight +text_encoders.t5xxl.transformer.encoder.block.10.layer.1.DenseReluDense.wi_1.weight +text_encoders.t5xxl.transformer.encoder.block.10.layer.1.DenseReluDense.wo.weight +text_encoders.t5xxl.transformer.encoder.block.10.layer.1.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.11.layer.0.SelfAttention.k.weight +text_encoders.t5xxl.transformer.encoder.block.11.layer.0.SelfAttention.o.weight +text_encoders.t5xxl.transformer.encoder.block.11.layer.0.SelfAttention.q.weight +text_encoders.t5xxl.transformer.encoder.block.11.layer.0.SelfAttention.v.weight +text_encoders.t5xxl.transformer.encoder.block.11.layer.0.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.11.layer.1.DenseReluDense.wi_0.weight +text_encoders.t5xxl.transformer.encoder.block.11.layer.1.DenseReluDense.wi_1.weight +text_encoders.t5xxl.transformer.encoder.block.11.layer.1.DenseReluDense.wo.weight +text_encoders.t5xxl.transformer.encoder.block.11.layer.1.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.12.layer.0.SelfAttention.k.weight +text_encoders.t5xxl.transformer.encoder.block.12.layer.0.SelfAttention.o.weight +text_encoders.t5xxl.transformer.encoder.block.12.layer.0.SelfAttention.q.weight +text_encoders.t5xxl.transformer.encoder.block.12.layer.0.SelfAttention.v.weight +text_encoders.t5xxl.transformer.encoder.block.12.layer.0.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.12.layer.1.DenseReluDense.wi_0.weight +text_encoders.t5xxl.transformer.encoder.block.12.layer.1.DenseReluDense.wi_1.weight +text_encoders.t5xxl.transformer.encoder.block.12.layer.1.DenseReluDense.wo.weight +text_encoders.t5xxl.transformer.encoder.block.12.layer.1.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.13.layer.0.SelfAttention.k.weight +text_encoders.t5xxl.transformer.encoder.block.13.layer.0.SelfAttention.o.weight +text_encoders.t5xxl.transformer.encoder.block.13.layer.0.SelfAttention.q.weight +text_encoders.t5xxl.transformer.encoder.block.13.layer.0.SelfAttention.v.weight +text_encoders.t5xxl.transformer.encoder.block.13.layer.0.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.13.layer.1.DenseReluDense.wi_0.weight +text_encoders.t5xxl.transformer.encoder.block.13.layer.1.DenseReluDense.wi_1.weight +text_encoders.t5xxl.transformer.encoder.block.13.layer.1.DenseReluDense.wo.weight +text_encoders.t5xxl.transformer.encoder.block.13.layer.1.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.14.layer.0.SelfAttention.k.weight +text_encoders.t5xxl.transformer.encoder.block.14.layer.0.SelfAttention.o.weight +text_encoders.t5xxl.transformer.encoder.block.14.layer.0.SelfAttention.q.weight +text_encoders.t5xxl.transformer.encoder.block.14.layer.0.SelfAttention.v.weight +text_encoders.t5xxl.transformer.encoder.block.14.layer.0.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.14.layer.1.DenseReluDense.wi_0.weight +text_encoders.t5xxl.transformer.encoder.block.14.layer.1.DenseReluDense.wi_1.weight +text_encoders.t5xxl.transformer.encoder.block.14.layer.1.DenseReluDense.wo.weight +text_encoders.t5xxl.transformer.encoder.block.14.layer.1.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.15.layer.0.SelfAttention.k.weight +text_encoders.t5xxl.transformer.encoder.block.15.layer.0.SelfAttention.o.weight +text_encoders.t5xxl.transformer.encoder.block.15.layer.0.SelfAttention.q.weight +text_encoders.t5xxl.transformer.encoder.block.15.layer.0.SelfAttention.v.weight +text_encoders.t5xxl.transformer.encoder.block.15.layer.0.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.15.layer.1.DenseReluDense.wi_0.weight +text_encoders.t5xxl.transformer.encoder.block.15.layer.1.DenseReluDense.wi_1.weight +text_encoders.t5xxl.transformer.encoder.block.15.layer.1.DenseReluDense.wo.weight +text_encoders.t5xxl.transformer.encoder.block.15.layer.1.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.16.layer.0.SelfAttention.k.weight +text_encoders.t5xxl.transformer.encoder.block.16.layer.0.SelfAttention.o.weight +text_encoders.t5xxl.transformer.encoder.block.16.layer.0.SelfAttention.q.weight +text_encoders.t5xxl.transformer.encoder.block.16.layer.0.SelfAttention.v.weight +text_encoders.t5xxl.transformer.encoder.block.16.layer.0.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.16.layer.1.DenseReluDense.wi_0.weight +text_encoders.t5xxl.transformer.encoder.block.16.layer.1.DenseReluDense.wi_1.weight +text_encoders.t5xxl.transformer.encoder.block.16.layer.1.DenseReluDense.wo.weight +text_encoders.t5xxl.transformer.encoder.block.16.layer.1.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.17.layer.0.SelfAttention.k.weight +text_encoders.t5xxl.transformer.encoder.block.17.layer.0.SelfAttention.o.weight +text_encoders.t5xxl.transformer.encoder.block.17.layer.0.SelfAttention.q.weight +text_encoders.t5xxl.transformer.encoder.block.17.layer.0.SelfAttention.v.weight +text_encoders.t5xxl.transformer.encoder.block.17.layer.0.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.17.layer.1.DenseReluDense.wi_0.weight +text_encoders.t5xxl.transformer.encoder.block.17.layer.1.DenseReluDense.wi_1.weight +text_encoders.t5xxl.transformer.encoder.block.17.layer.1.DenseReluDense.wo.weight +text_encoders.t5xxl.transformer.encoder.block.17.layer.1.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.18.layer.0.SelfAttention.k.weight +text_encoders.t5xxl.transformer.encoder.block.18.layer.0.SelfAttention.o.weight +text_encoders.t5xxl.transformer.encoder.block.18.layer.0.SelfAttention.q.weight +text_encoders.t5xxl.transformer.encoder.block.18.layer.0.SelfAttention.v.weight +text_encoders.t5xxl.transformer.encoder.block.18.layer.0.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.18.layer.1.DenseReluDense.wi_0.weight +text_encoders.t5xxl.transformer.encoder.block.18.layer.1.DenseReluDense.wi_1.weight +text_encoders.t5xxl.transformer.encoder.block.18.layer.1.DenseReluDense.wo.weight +text_encoders.t5xxl.transformer.encoder.block.18.layer.1.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.19.layer.0.SelfAttention.k.weight +text_encoders.t5xxl.transformer.encoder.block.19.layer.0.SelfAttention.o.weight +text_encoders.t5xxl.transformer.encoder.block.19.layer.0.SelfAttention.q.weight +text_encoders.t5xxl.transformer.encoder.block.19.layer.0.SelfAttention.v.weight +text_encoders.t5xxl.transformer.encoder.block.19.layer.0.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.19.layer.1.DenseReluDense.wi_0.weight +text_encoders.t5xxl.transformer.encoder.block.19.layer.1.DenseReluDense.wi_1.weight +text_encoders.t5xxl.transformer.encoder.block.19.layer.1.DenseReluDense.wo.weight +text_encoders.t5xxl.transformer.encoder.block.19.layer.1.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.2.layer.0.SelfAttention.k.weight +text_encoders.t5xxl.transformer.encoder.block.2.layer.0.SelfAttention.o.weight +text_encoders.t5xxl.transformer.encoder.block.2.layer.0.SelfAttention.q.weight +text_encoders.t5xxl.transformer.encoder.block.2.layer.0.SelfAttention.v.weight +text_encoders.t5xxl.transformer.encoder.block.2.layer.0.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.2.layer.1.DenseReluDense.wi_0.weight +text_encoders.t5xxl.transformer.encoder.block.2.layer.1.DenseReluDense.wi_1.weight +text_encoders.t5xxl.transformer.encoder.block.2.layer.1.DenseReluDense.wo.weight +text_encoders.t5xxl.transformer.encoder.block.2.layer.1.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.20.layer.0.SelfAttention.k.weight +text_encoders.t5xxl.transformer.encoder.block.20.layer.0.SelfAttention.o.weight +text_encoders.t5xxl.transformer.encoder.block.20.layer.0.SelfAttention.q.weight +text_encoders.t5xxl.transformer.encoder.block.20.layer.0.SelfAttention.v.weight +text_encoders.t5xxl.transformer.encoder.block.20.layer.0.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.20.layer.1.DenseReluDense.wi_0.weight +text_encoders.t5xxl.transformer.encoder.block.20.layer.1.DenseReluDense.wi_1.weight +text_encoders.t5xxl.transformer.encoder.block.20.layer.1.DenseReluDense.wo.weight +text_encoders.t5xxl.transformer.encoder.block.20.layer.1.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.21.layer.0.SelfAttention.k.weight +text_encoders.t5xxl.transformer.encoder.block.21.layer.0.SelfAttention.o.weight +text_encoders.t5xxl.transformer.encoder.block.21.layer.0.SelfAttention.q.weight +text_encoders.t5xxl.transformer.encoder.block.21.layer.0.SelfAttention.v.weight +text_encoders.t5xxl.transformer.encoder.block.21.layer.0.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.21.layer.1.DenseReluDense.wi_0.weight +text_encoders.t5xxl.transformer.encoder.block.21.layer.1.DenseReluDense.wi_1.weight +text_encoders.t5xxl.transformer.encoder.block.21.layer.1.DenseReluDense.wo.weight +text_encoders.t5xxl.transformer.encoder.block.21.layer.1.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.22.layer.0.SelfAttention.k.weight +text_encoders.t5xxl.transformer.encoder.block.22.layer.0.SelfAttention.o.weight +text_encoders.t5xxl.transformer.encoder.block.22.layer.0.SelfAttention.q.weight +text_encoders.t5xxl.transformer.encoder.block.22.layer.0.SelfAttention.v.weight +text_encoders.t5xxl.transformer.encoder.block.22.layer.0.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.22.layer.1.DenseReluDense.wi_0.weight +text_encoders.t5xxl.transformer.encoder.block.22.layer.1.DenseReluDense.wi_1.weight +text_encoders.t5xxl.transformer.encoder.block.22.layer.1.DenseReluDense.wo.weight +text_encoders.t5xxl.transformer.encoder.block.22.layer.1.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.23.layer.0.SelfAttention.k.weight +text_encoders.t5xxl.transformer.encoder.block.23.layer.0.SelfAttention.o.weight +text_encoders.t5xxl.transformer.encoder.block.23.layer.0.SelfAttention.q.weight +text_encoders.t5xxl.transformer.encoder.block.23.layer.0.SelfAttention.v.weight +text_encoders.t5xxl.transformer.encoder.block.23.layer.0.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.23.layer.1.DenseReluDense.wi_0.weight +text_encoders.t5xxl.transformer.encoder.block.23.layer.1.DenseReluDense.wi_1.weight +text_encoders.t5xxl.transformer.encoder.block.23.layer.1.DenseReluDense.wo.weight +text_encoders.t5xxl.transformer.encoder.block.23.layer.1.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.3.layer.0.SelfAttention.k.weight +text_encoders.t5xxl.transformer.encoder.block.3.layer.0.SelfAttention.o.weight +text_encoders.t5xxl.transformer.encoder.block.3.layer.0.SelfAttention.q.weight +text_encoders.t5xxl.transformer.encoder.block.3.layer.0.SelfAttention.v.weight +text_encoders.t5xxl.transformer.encoder.block.3.layer.0.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.3.layer.1.DenseReluDense.wi_0.weight +text_encoders.t5xxl.transformer.encoder.block.3.layer.1.DenseReluDense.wi_1.weight +text_encoders.t5xxl.transformer.encoder.block.3.layer.1.DenseReluDense.wo.weight +text_encoders.t5xxl.transformer.encoder.block.3.layer.1.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.4.layer.0.SelfAttention.k.weight +text_encoders.t5xxl.transformer.encoder.block.4.layer.0.SelfAttention.o.weight +text_encoders.t5xxl.transformer.encoder.block.4.layer.0.SelfAttention.q.weight +text_encoders.t5xxl.transformer.encoder.block.4.layer.0.SelfAttention.v.weight +text_encoders.t5xxl.transformer.encoder.block.4.layer.0.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.4.layer.1.DenseReluDense.wi_0.weight +text_encoders.t5xxl.transformer.encoder.block.4.layer.1.DenseReluDense.wi_1.weight +text_encoders.t5xxl.transformer.encoder.block.4.layer.1.DenseReluDense.wo.weight +text_encoders.t5xxl.transformer.encoder.block.4.layer.1.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.5.layer.0.SelfAttention.k.weight +text_encoders.t5xxl.transformer.encoder.block.5.layer.0.SelfAttention.o.weight +text_encoders.t5xxl.transformer.encoder.block.5.layer.0.SelfAttention.q.weight +text_encoders.t5xxl.transformer.encoder.block.5.layer.0.SelfAttention.v.weight +text_encoders.t5xxl.transformer.encoder.block.5.layer.0.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.5.layer.1.DenseReluDense.wi_0.weight +text_encoders.t5xxl.transformer.encoder.block.5.layer.1.DenseReluDense.wi_1.weight +text_encoders.t5xxl.transformer.encoder.block.5.layer.1.DenseReluDense.wo.weight +text_encoders.t5xxl.transformer.encoder.block.5.layer.1.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.6.layer.0.SelfAttention.k.weight +text_encoders.t5xxl.transformer.encoder.block.6.layer.0.SelfAttention.o.weight +text_encoders.t5xxl.transformer.encoder.block.6.layer.0.SelfAttention.q.weight +text_encoders.t5xxl.transformer.encoder.block.6.layer.0.SelfAttention.v.weight +text_encoders.t5xxl.transformer.encoder.block.6.layer.0.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.6.layer.1.DenseReluDense.wi_0.weight +text_encoders.t5xxl.transformer.encoder.block.6.layer.1.DenseReluDense.wi_1.weight +text_encoders.t5xxl.transformer.encoder.block.6.layer.1.DenseReluDense.wo.weight +text_encoders.t5xxl.transformer.encoder.block.6.layer.1.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.7.layer.0.SelfAttention.k.weight +text_encoders.t5xxl.transformer.encoder.block.7.layer.0.SelfAttention.o.weight +text_encoders.t5xxl.transformer.encoder.block.7.layer.0.SelfAttention.q.weight +text_encoders.t5xxl.transformer.encoder.block.7.layer.0.SelfAttention.v.weight +text_encoders.t5xxl.transformer.encoder.block.7.layer.0.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.7.layer.1.DenseReluDense.wi_0.weight +text_encoders.t5xxl.transformer.encoder.block.7.layer.1.DenseReluDense.wi_1.weight +text_encoders.t5xxl.transformer.encoder.block.7.layer.1.DenseReluDense.wo.weight +text_encoders.t5xxl.transformer.encoder.block.7.layer.1.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.8.layer.0.SelfAttention.k.weight +text_encoders.t5xxl.transformer.encoder.block.8.layer.0.SelfAttention.o.weight +text_encoders.t5xxl.transformer.encoder.block.8.layer.0.SelfAttention.q.weight +text_encoders.t5xxl.transformer.encoder.block.8.layer.0.SelfAttention.v.weight +text_encoders.t5xxl.transformer.encoder.block.8.layer.0.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.9.layer.0.SelfAttention.o.weight +text_encoders.t5xxl.transformer.encoder.block.9.layer.0.SelfAttention.q.weight +text_encoders.t5xxl.transformer.encoder.block.9.layer.0.SelfAttention.v.weight +text_encoders.t5xxl.transformer.encoder.block.9.layer.0.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.block.9.layer.1.DenseReluDense.wi_0.weight +text_encoders.t5xxl.transformer.encoder.block.9.layer.1.DenseReluDense.wi_1.weight +text_encoders.t5xxl.transformer.encoder.block.9.layer.1.DenseReluDense.wo.weight +text_encoders.t5xxl.transformer.encoder.block.9.layer.1.layer_norm.weight +text_encoders.t5xxl.transformer.encoder.embed_tokens.weight +text_encoders.t5xxl.transformer.encoder.final_layer_norm.weight +text_encoders.t5xxl.transformer.shared.weight \ No newline at end of file