From 29878b2944c858eb4df3eb595aa51daf85d38218 Mon Sep 17 00:00:00 2001
From: taylorchu
Date: Sat, 29 Nov 2025 09:35:33 -0800
Subject: [PATCH 1/4] read swa pattern for all models

---
 src/llama-arch.cpp        | 1 +
 src/llama-arch.h          | 1 +
 src/llama-model-saver.cpp | 1 +
 src/llama-model.cpp       | 8 ++++++++
 4 files changed, 11 insertions(+)

diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 8571a2e025a..9410c9e7e45 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -201,6 +201,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_GATE_LORA_RANK,            "%s.attention.gate_lora_rank"            },
     { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,    "%s.attention.relative_buckets_count"    },
     { LLM_KV_ATTENTION_SLIDING_WINDOW,            "%s.attention.sliding_window"            },
+    { LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN,    "%s.attention.sliding_window_pattern"    },
     { LLM_KV_ATTENTION_SCALE,                     "%s.attention.scale"                     },
     { LLM_KV_ATTENTION_OUTPUT_SCALE,              "%s.attention.output_scale"              },
     { LLM_KV_ATTENTION_TEMPERATURE_LENGTH,        "%s.attention.temperature_length"        },
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 150646478ae..811035bd392 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -205,6 +205,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_GATE_LORA_RANK,
     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
     LLM_KV_ATTENTION_SLIDING_WINDOW,
+    LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN,
     LLM_KV_ATTENTION_SCALE,
     LLM_KV_ATTENTION_OUTPUT_SCALE,
     LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
diff --git a/src/llama-model-saver.cpp b/src/llama-model-saver.cpp
index 563823dc35d..6b6262ca3d5 100644
--- a/src/llama-model-saver.cpp
+++ b/src/llama-model-saver.cpp
@@ -182,6 +182,7 @@ void llama_model_saver::add_kv_from_model() {
     add_kv(LLM_KV_ATTENTION_KV_LORA_RANK,            hparams.n_lora_kv);
     add_kv(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,  hparams.n_rel_attn_bkts);
     add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW,          hparams.n_swa);
+    add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN,  hparams.swa_layers, true);
     add_kv(LLM_KV_ATTENTION_SCALE,                   hparams.f_attention_scale);

     const float rope_scaling_factor = hparams.rope_freq_scale_train == 1.0f ? 0.0f : 1.0f/hparams.rope_freq_scale_train;
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index c2a545531a9..660261fb3cf 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -555,6 +555,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {

     ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer, false);

+    ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+    if (hparams.n_swa > 0) {
+        hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+        ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer, false);
+    } else {
+        hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+    }
+
     bool rope_finetuned = false;
     ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
     hparams.rope_finetuned = rope_finetuned;

From 5a0c228c7233d587402c95faeb56c04c7ac335a2 Mon Sep 17 00:00:00 2001
From: taylorchu
Date: Sat, 29 Nov 2025 09:55:29 -0800
Subject: [PATCH 2/4] add missing template

---
 src/llama-model-loader.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index aa3a65f87a5..9a03efdf89e 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -466,6 +466,7 @@ namespace GGUFMeta {
     template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
     template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
     template bool llama_model_loader::get_key_or_arr<std::array<float, 512>>(enum llm_kv kid, std::array<float, 512> & result, uint32_t n, bool required);
+    template bool llama_model_loader::get_key_or_arr<std::array<bool, 512>>(enum llm_kv kid, std::array<bool, 512> & result, uint32_t n, bool required);


 llama_model_loader::llama_model_loader(

From b355891cd8e4d67f4db8ea9f4c4782e2af471934 Mon Sep 17 00:00:00 2001
From: taylorchu
Date: Sat, 29 Nov 2025 15:03:32 -0800
Subject: [PATCH 3/4] bool is not supported

---
 src/llama-model-loader.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 9a03efdf89e..59d088db92b 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -308,8 +308,9 @@ namespace GGUFMeta {
                                                 (std::is_same<T, uint32_t>::value)); break;
             case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
             case GGUF_TYPE_STRING:  GGML_ASSERT((std::is_same<T, std::string>::value)); break;
+            case GGUF_TYPE_BOOL:    GGML_ASSERT((std::is_same<T, bool>::value)); break;
             default:
-                throw std::runtime_error(format("%s is not a string/float32/uint32/int32 array", key.c_str()));
+                throw std::runtime_error(format("%s is not a string/float32/uint32/int32/bool array", key.c_str()));
         }

         if constexpr (std::is_same<T, std::string>::value) {
@@ -349,8 +350,9 @@ namespace GGUFMeta {
                                                 (std::is_same<T, uint32_t>::value)); break;
             case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
             case GGUF_TYPE_STRING:  GGML_ASSERT((std::is_same<T, std::string>::value)); break;
+            case GGUF_TYPE_BOOL:    GGML_ASSERT((std::is_same<T, bool>::value)); break;
             default:
-                throw std::runtime_error(format("%s is not a string/float32/uint32/int32 array", key.c_str()));
+                throw std::runtime_error(format("%s is not a string/float32/uint32/int32/bool array", key.c_str()));
         }

         if (arr_info.length > N_MAX) {

From cbcd6f98273b88bbc0e80ec4960d8c86827a4ca5 Mon Sep 17 00:00:00 2001
From: taylorchu
Date: Sat, 29 Nov 2025 16:44:10 -0800
Subject: [PATCH 4/4] qwen3 only

---
 src/llama-model.cpp  | 23 ++++++++++++++---------
 src/models/models.h  |  1 +
 src/models/qwen3.cpp | 16 ++++++++++++++--
 3 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 660261fb3cf..45fd00517c8 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -555,14 +555,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {

     ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer, false);

-    ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
-    if (hparams.n_swa > 0) {
-        hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
-        ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer, false);
-    } else {
-        hparams.swa_type = LLAMA_SWA_TYPE_NONE;
-    }
-
     bool rope_finetuned = false;
     ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
     hparams.rope_finetuned = rope_finetuned;
@@ -1071,6 +1063,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             {
                 ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+                if (hparams.n_swa > 0) {
+                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+                    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer, false);
+                } else {
+                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+                }
+
                 switch (hparams.n_layer) {
                     case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
                     case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
@@ -7204,7 +7205,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             } break;
         case LLM_ARCH_QWEN3:
             {
-                llm = std::make_unique<llm_build_qwen3>(*this, params);
+                if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+                    llm = std::make_unique<llm_build_qwen3<true>> (*this, params);
+                } else {
+                    llm = std::make_unique<llm_build_qwen3<false>>(*this, params);
+                }
             } break;
         case LLM_ARCH_QWEN3MOE:
             {
diff --git a/src/models/models.h b/src/models/models.h
index 7ba225b4784..a085bf8ffb9 100644
--- a/src/models/models.h
+++ b/src/models/models.h
@@ -407,6 +407,7 @@ struct llm_build_qwen2vl : public llm_graph_context {
     llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params);
 };

+template <bool iswa>
 struct llm_build_qwen3 : public llm_graph_context {
     llm_build_qwen3(const llama_model & model, const llm_graph_params & params);
 };
diff --git a/src/models/qwen3.cpp b/src/models/qwen3.cpp
index a5cfffa5314..c06de75e6a6 100644
--- a/src/models/qwen3.cpp
+++ b/src/models/qwen3.cpp
@@ -1,6 +1,7 @@
 #include "models.h"

-llm_build_qwen3::llm_build_qwen3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+template <bool iswa>
+llm_build_qwen3<iswa>::llm_build_qwen3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v;

     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -14,7 +15,14 @@ llm_build_qwen3::llm_build_qwen3(const llama_model & model, const llm_graph_para
     // inp_pos - contains the positions
     ggml_tensor * inp_pos = build_inp_pos();

-    auto * inp_attn = build_attn_inp_kv();
+    using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
+    inp_attn_type * inp_attn = nullptr;
+
+    if constexpr (iswa) {
+        inp_attn = build_attn_inp_kv_iswa();
+    } else {
+        inp_attn = build_attn_inp_kv();
+    }

     ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -115,3 +123,7 @@ llm_build_qwen3::llm_build_qwen3(const llama_model & model, const llm_graph_para

     ggml_build_forward_expand(gf, cur);
 }
+
+// Explicit template instantiations
+template struct llm_build_qwen3<false>;
+template struct llm_build_qwen3<true>;
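
Note (not part of the patches above): the series reads "%s.attention.sliding_window_pattern" into hparams.swa_layers as a per-layer array of booleans and keeps the window size in hparams.n_swa. The standalone sketch below only illustrates how such a pattern could be interpreted per layer; demo_hparams, effective_window and MAX_LAYERS are hypothetical names for this demo and are not llama.cpp APIs, and the actual wiring of swa_layers into the KV cache is outside its scope.

    // swa_pattern_demo.cpp -- standalone sketch, not llama.cpp code.
    // Shows how a per-layer boolean pattern (as carried by the
    // "%s.attention.sliding_window_pattern" key) combined with n_swa could
    // decide which layers attend with a sliding window.
    #include <array>
    #include <cstdint>
    #include <cstdio>

    constexpr size_t MAX_LAYERS = 512; // mirrors the std::array<bool, 512> instantiation in patch 2

    struct demo_hparams {                          // hypothetical stand-in for llama_hparams
        uint32_t n_layer = 0;
        uint32_t n_swa   = 0;                      // window size; 0 means no SWA anywhere
        std::array<bool, MAX_LAYERS> swa_layers{}; // true -> layer uses sliding-window attention
    };

    // Effective window for layer il: n_swa if the pattern marks it, 0 (full attention) otherwise.
    static uint32_t effective_window(const demo_hparams & hp, uint32_t il) {
        if (hp.n_swa == 0 || il >= hp.n_layer) {
            return 0;
        }
        return hp.swa_layers[il] ? hp.n_swa : 0;
    }

    int main() {
        demo_hparams hp;
        hp.n_layer = 8;
        hp.n_swa   = 4096;

        // example 3:1 pattern: three sliding-window layers followed by one full-attention layer
        for (uint32_t il = 0; il < hp.n_layer; ++il) {
            hp.swa_layers[il] = (il % 4) != 3;
        }

        for (uint32_t il = 0; il < hp.n_layer; ++il) {
            std::printf("layer %u: window = %u\n", il, effective_window(hp, il));
        }
        return 0;
    }

Built with e.g. g++ -std=c++17 swa_pattern_demo.cpp, the demo prints a 4096-token window for three out of every four layers and 0 (full attention) for the rest, which is the kind of per-layer layout the sliding_window_pattern key is meant to describe.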