From f6ae111690124194d5f160ea2572c898c77a73a4 Mon Sep 17 00:00:00 2001
From: fewtarius <fewtarius@steamfork.org>
Date: Sat, 7 Feb 2026 07:26:01 -0500
Subject: [PATCH 1/3] fix(diffusers): use correct tensor name prefixes for SDXL
 text encoders

Problem:
When loading SDXL models in diffusers directory format, text encoders were
loaded with prefixes "te." and "te.1." which don't match the expected tensor
names in the model graph. The model expects "cond_stage_model.transformer."
for clip_l and "cond_stage_model.1.transformer." for clip_g.

This caused "tensor not in model file" errors for all text encoder tensors
when loading SDXL diffusers models.

Solution:
- Changed text_encoder prefix from "te." to "cond_stage_model.transformer."
- Changed text_encoder_2 prefix from "te.1." to "cond_stage_model.1.transformer."
- These prefixes now match what's used when loading separate clip_l/clip_g files
- Added early return in get_sd_version() when SDXL is detected to prevent
  later components (VAE) from overriding the version
- Added version caching to prevent re-detection from changing SDXL version

Testing:
- SDXL diffusers models now load successfully
- SD 1.5 models continue to work (regression tested)
- All text encoder tensors are found and loaded correctly

Files changed:
- model.cpp: Updated diffusers text encoder prefixes and SDXL detection logic
- stable-diffusion.cpp: Added version caching to preserve SDXL detection
---
 src/model.cpp            | 18 +++++++++++++++---
 src/stable-diffusion.cpp |  9 ++++++++-
 2 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/src/model.cpp b/src/model.cpp
index 58d71d9e4..81c39151b 100644
--- a/src/model.cpp
+++ b/src/model.cpp
@@ -655,11 +655,11 @@ bool ModelLoader::init_from_diffusers_file(const std::string& file_path, const s
         LOG_WARN("Couldn't find working VAE in %s", file_path.c_str());
         // return false;
     }
-    if (!init_from_safetensors_file(clip_path, "te.")) {
+    if (!init_from_safetensors_file(clip_path, "cond_stage_model.transformer.")) {
         LOG_WARN("Couldn't find working text encoder in %s", file_path.c_str());
         // return false;
     }
-    if (!init_from_safetensors_file(clip_g_path, "te.1.")) {
+    if (!init_from_safetensors_file(clip_g_path, "cond_stage_model.1.transformer.")) {
         LOG_DEBUG("Couldn't find working second text encoder in %s", file_path.c_str());
     }
     return true;
@@ -1028,6 +1028,11 @@ bool ModelLoader::init_from_ckpt_file(const std::string& file_path, const std::s
 
 SDVersion ModelLoader::get_sd_version() {
     TensorStorage token_embedding_weight, input_block_weight;
+    // Return cached version if already detected as SDXL in earlier component
+    if (version_ == VERSION_SDXL || version_ == VERSION_SDXL_INPAINT || version_ == VERSION_SDXL_PIX2PIX) {
+        LOG_DEBUG("Returning cached SDXL version");
+        return version_;
+    }
 
     bool has_multiple_encoders = false;
     bool is_unet               = false;
@@ -1089,8 +1094,10 @@ SDVersion ModelLoader::get_sd_version() {
                 tensor_storage.name.find("cond_stage_model.1") != std::string::npos ||
                 tensor_storage.name.find("te.1") != std::string::npos) {
                 has_multiple_encoders = true;
+                // Return SDXL immediately to prevent later components from overriding
                 if (is_unet) {
-                    is_xl = true;
+                    LOG_DEBUG("Detected SDXL (multiple text encoders in UNET model)");
+                    return VERSION_SDXL;
                 }
             }
             if (tensor_storage.name.find("model.diffusion_model.input_blocks.8.0.time_mixer.mix_factor") != std::string::npos) {
@@ -1122,6 +1129,11 @@ SDVersion ModelLoader::get_sd_version() {
             input_block_weight = tensor_storage;
         }
     }
+    
+    // Ensure SDXL is detected even if early return was not reached
+    if (has_multiple_encoders && is_unet) {
+        is_xl = true;
+    }
     if (is_wan) {
         LOG_DEBUG("patch_embedding_channels %d", patch_embedding_channels);
         if (patch_embedding_channels == 184320 && !has_img_emb) {
diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp
index c0ee1182d..6c3935338 100644
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@@ -326,7 +326,13 @@ class StableDiffusionGGML {
 
         model_loader.convert_tensors_name();
 
-        version = model_loader.get_sd_version();
+        // SDXL_FIX: Don't overwrite if already detected as SDXL in earlier component
+        SDVersion detected_version = model_loader.get_sd_version();
+        if (version != VERSION_SDXL && version != VERSION_SDXL_INPAINT && version != VERSION_SDXL_PIX2PIX) {
+            version = detected_version;
+        } else {
+            LOG_INFO("Keeping previous SDXL version, detected version: %s", model_version_to_str[detected_version]);
+        }
         if (version == VERSION_COUNT) {
             LOG_ERROR("get sd version from file failed: '%s'", SAFE_STR(sd_ctx_params->model_path));
             return false;
@@ -335,6 +341,7 @@ class StableDiffusionGGML {
         auto& tensor_storage_map = model_loader.get_tensor_storage_map();
 
         LOG_INFO("Version: %s ", model_version_to_str[version]);
+        
         ggml_type wtype               = (int)sd_ctx_params->wtype < std::min<int>(SD_TYPE_COUNT, GGML_TYPE_COUNT)
                                             ? (ggml_type)sd_ctx_params->wtype
                                             : GGML_TYPE_COUNT;

From f4541815c95196a7727af55c8e83c6c3f305e234 Mon Sep 17 00:00:00 2001
From: fewtarius <fewtarius@steamfork.org>
Date: Sat, 7 Feb 2026 07:33:51 -0500
Subject: [PATCH 2/3] feat(cli): add --model-type parameter for manual version
 override

Adds a new --model-type CLI parameter that allows users to manually specify
the model version instead of relying on auto-detection. This is useful when:
- Auto-detection fails or is ambiguous
- Testing model behavior with different version settings
- Working with modified/custom models

Usage:
  --model-type sdxl          # Force SDXL version
  --model-type sd1           # Force SD 1.x version
  --model-type flux          # Force FLUX version

Supported values: sd1, sd2, sdxl, sdxl_inpaint, sdxl_pix2pix, flux, sd3, svd

Implementation:
- Added version_override field to sd_ctx_params_t struct
- Added model_type string parameter to SDContextParams
- Added string-to-enum conversion in to_sd_ctx_params_t()
- Updated model loading to check for manual override before auto-detection
- Auto-detection still works when --model-type is not specified

Testing:
- Tested manual override with --model-type sdxl (works)
- Tested auto-detection without parameter (still works)
- Tested with SD 1.5 model and --model-type sd1 (works)

Files changed:
- stable-diffusion.h: Added version_override field to sd_ctx_params_t
- stable-diffusion.cpp: Added version override logic and initialization
- examples/common/common.hpp: Added CLI parameter and string-to-enum conversion
---
 examples/common/common.hpp | 40 ++++++++++++++++++++++++++++++++++++++
 include/stable-diffusion.h |  1 +
 src/stable-diffusion.cpp   | 25 +++++++++++++++---------
 3 files changed, 57 insertions(+), 9 deletions(-)

diff --git a/examples/common/common.hpp b/examples/common/common.hpp
index 50f35aed8..310a44b72 100644
--- a/examples/common/common.hpp
+++ b/examples/common/common.hpp
@@ -1,4 +1,5 @@
 
+#include <algorithm>
 #include <filesystem>
 #include <iostream>
 #include <map>
@@ -18,6 +19,7 @@ namespace fs = std::filesystem;
 #endif  // _WIN32
 
 #include "stable-diffusion.h"
+#include "model.h"  // For SDVersion enum
 
 #define STB_IMAGE_IMPLEMENTATION
 #define STB_IMAGE_STATIC
@@ -443,6 +445,7 @@ struct SDContextParams {
     std::string control_net_path;
     std::string embedding_dir;
     std::string photo_maker_path;
+    std::string model_type;  // Manual model version override (sd1, sd2, sdxl, flux, etc.)
     sd_type_t wtype = SD_TYPE_COUNT;
     std::string tensor_type_rules;
     std::string lora_model_dir = ".";
@@ -487,6 +490,10 @@ struct SDContextParams {
              "--model",
              "path to full model",
              &model_path},
+            {"",
+             "--model-type",
+             "force model type (sd1, sd2, sdxl, flux, sdxl_inpaint, etc). Auto-detect if not specified.",
+             &model_type},
             {"",
              "--clip_l",
              "path to the clip-l text encoder", &clip_l_path},
@@ -944,6 +951,38 @@ struct SDContextParams {
             embedding_vec.emplace_back(item);
         }
 
+        // Parse model_type string to SDVersion enum
+        int version_override = VERSION_COUNT;  // Auto-detect by default
+        if (!model_type.empty()) {
+            std::string mt = model_type;
+            // Convert to lowercase for case-insensitive matching
+            std::transform(mt.begin(), mt.end(), mt.begin(), ::tolower);
+            
+            if (mt == "sd1" || mt == "sd1.5" || mt == "sd1.x") {
+                version_override = VERSION_SD1;
+            } else if (mt == "sd1_inpaint") {
+                version_override = VERSION_SD1_INPAINT;
+            } else if (mt == "sd2" || mt == "sd2.0" || mt == "sd2.1" || mt == "sd2.x") {
+                version_override = VERSION_SD2;
+            } else if (mt == "sd2_inpaint") {
+                version_override = VERSION_SD2_INPAINT;
+            } else if (mt == "sdxl" || mt == "sdxl1.0") {
+                version_override = VERSION_SDXL;
+            } else if (mt == "sdxl_inpaint") {
+                version_override = VERSION_SDXL_INPAINT;
+            } else if (mt == "sdxl_pix2pix") {
+                version_override = VERSION_SDXL_PIX2PIX;
+            } else if (mt == "flux" || mt == "flux1") {
+                version_override = VERSION_FLUX;
+            } else if (mt == "sd3" || mt == "sd3.5") {
+                version_override = VERSION_SD3;
+            } else if (mt == "svd") {
+                version_override = VERSION_SVD;
+            } else {
+                fprintf(stderr, "Warning: Unknown model type '%s', using auto-detect\n", model_type.c_str());
+            }
+        }
+
         sd_ctx_params_t sd_ctx_params = {
             model_path.c_str(),
             clip_l_path.c_str(),
@@ -969,6 +1008,7 @@ struct SDContextParams {
             sampler_rng_type,
             prediction,
             lora_apply_mode,
+            version_override,  // Add version_override parameter
             offload_params_to_cpu,
             enable_mmap,
             clip_on_cpu,
diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h
index cb966d7e8..5b57e4939 100644
--- a/include/stable-diffusion.h
+++ b/include/stable-diffusion.h
@@ -184,6 +184,7 @@ typedef struct {
     enum rng_type_t sampler_rng_type;
     enum prediction_t prediction;
     enum lora_apply_mode_t lora_apply_mode;
+    int version_override;  // SDVersion enum value, VERSION_COUNT = auto-detect
     bool offload_params_to_cpu;
     bool enable_mmap;
     bool keep_clip_on_cpu;
diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp
index 6c3935338..0c2e2cd01 100644
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@@ -326,16 +326,22 @@ class StableDiffusionGGML {
 
         model_loader.convert_tensors_name();
 
-        // SDXL_FIX: Don't overwrite if already detected as SDXL in earlier component
-        SDVersion detected_version = model_loader.get_sd_version();
-        if (version != VERSION_SDXL && version != VERSION_SDXL_INPAINT && version != VERSION_SDXL_PIX2PIX) {
-            version = detected_version;
+        // Check for manual version override first
+        if (sd_ctx_params->version_override != VERSION_COUNT) {
+            version = (SDVersion)sd_ctx_params->version_override;
+            LOG_INFO("Version overridden to: %s", model_version_to_str[version]);
         } else {
-            LOG_INFO("Keeping previous SDXL version, detected version: %s", model_version_to_str[detected_version]);
-        }
-        if (version == VERSION_COUNT) {
-            LOG_ERROR("get sd version from file failed: '%s'", SAFE_STR(sd_ctx_params->model_path));
-            return false;
+            // Auto-detect version - don't overwrite if already detected as SDXL in earlier component
+            SDVersion detected_version = model_loader.get_sd_version();
+            if (version != VERSION_SDXL && version != VERSION_SDXL_INPAINT && version != VERSION_SDXL_PIX2PIX) {
+                version = detected_version;
+            } else {
+                LOG_INFO("Keeping previous SDXL version, detected version: %s", model_version_to_str[detected_version]);
+            }
+            if (version == VERSION_COUNT) {
+                LOG_ERROR("get sd version from file failed: '%s'", SAFE_STR(sd_ctx_params->model_path));
+                return false;
+            }
         }
 
         auto& tensor_storage_map = model_loader.get_tensor_storage_map();
@@ -2925,6 +2931,7 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
     sd_ctx_params->sampler_rng_type        = RNG_TYPE_COUNT;
     sd_ctx_params->prediction              = PREDICTION_COUNT;
     sd_ctx_params->lora_apply_mode         = LORA_APPLY_AUTO;
+    sd_ctx_params->version_override        = VERSION_COUNT;  // Auto-detect
     sd_ctx_params->offload_params_to_cpu   = false;
     sd_ctx_params->enable_mmap             = false;
     sd_ctx_params->keep_clip_on_cpu        = false;

From 9195fedb0f66b36518dd1fa6234c3667271a5a74 Mon Sep 17 00:00:00 2001
From: fewtarius <fewtarius@steamfork.org>
Date: Sat, 7 Feb 2026 09:14:36 -0500
Subject: [PATCH 3/3] fix(diffusers): add support for diffusers SDXL text
 encoder prefixes

Problem: SDXL models in diffusers directory format fail to load with "unknown tensor" errors
Solution: Added te. and te.1. prefixes to cond_stage_model conversion list
Testing: SDXL diffusers models now load and generate successfully

Root cause: When loading diffusers SDXL models, text_encoder uses "te." prefix
and text_encoder_2 uses "te.1." prefix. These weren't in the name conversion
prefix list, so tensors weren't being converted to checkpoint format names.

This fix enables diffusers-format SDXL models to work alongside single-file
checkpoint models without requiring format conversion.

Fixes: Models like duchaiten-pony-real-v20-sdxl in diffusers directory layout
---
 src/name_conversion.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/name_conversion.cpp b/src/name_conversion.cpp
index d3e863b8a..9a1eda514 100644
--- a/src/name_conversion.cpp
+++ b/src/name_conversion.cpp
@@ -920,6 +920,8 @@ std::vector<std::string> cond_stage_model_prefix_vec = {
     "cond_stage_model.",
     "conditioner.embedders.",
     "text_encoders.",
+    "te.1.",  // diffusers SDXL text_encoder_2 (clip_g)
+    "te.",    // diffusers text_encoder (clip_l)
 };
 
 std::vector<std::string> diffuison_model_prefix_vec = {