From 29878b2944c858eb4df3eb595aa51daf85d38218 Mon Sep 17 00:00:00 2001
From: taylorchu
Date: Sat, 29 Nov 2025 09:35:33 -0800
Subject: [PATCH 1/4] read swa pattern for all models

---
 src/llama-arch.cpp        | 1 +
 src/llama-arch.h          | 1 +
 src/llama-model-saver.cpp | 1 +
 src/llama-model.cpp       | 8 ++++++++
 4 files changed, 11 insertions(+)

diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 8571a2e025a..9410c9e7e45 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -201,6 +201,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_GATE_LORA_RANK,            "%s.attention.gate_lora_rank"            },
     { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,    "%s.attention.relative_buckets_count"    },
     { LLM_KV_ATTENTION_SLIDING_WINDOW,            "%s.attention.sliding_window"            },
+    { LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN,    "%s.attention.sliding_window_pattern"    },
     { LLM_KV_ATTENTION_SCALE,                     "%s.attention.scale"                     },
     { LLM_KV_ATTENTION_OUTPUT_SCALE,              "%s.attention.output_scale"              },
     { LLM_KV_ATTENTION_TEMPERATURE_LENGTH,        "%s.attention.temperature_length"        },
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 150646478ae..811035bd392 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -205,6 +205,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_GATE_LORA_RANK,
     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
     LLM_KV_ATTENTION_SLIDING_WINDOW,
+    LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN,
     LLM_KV_ATTENTION_SCALE,
     LLM_KV_ATTENTION_OUTPUT_SCALE,
     LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
diff --git a/src/llama-model-saver.cpp b/src/llama-model-saver.cpp
index 563823dc35d..6b6262ca3d5 100644
--- a/src/llama-model-saver.cpp
+++ b/src/llama-model-saver.cpp
@@ -182,6 +182,7 @@ void llama_model_saver::add_kv_from_model() {
     add_kv(LLM_KV_ATTENTION_KV_LORA_RANK,            hparams.n_lora_kv);
     add_kv(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,  hparams.n_rel_attn_bkts);
     add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW,          hparams.n_swa);
+    add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN,  hparams.swa_layers, true);
     add_kv(LLM_KV_ATTENTION_SCALE,                   hparams.f_attention_scale);

     const float rope_scaling_factor = hparams.rope_freq_scale_train == 1.0f ? 0.0f : 1.0f/hparams.rope_freq_scale_train;
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index c2a545531a9..660261fb3cf 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -555,6 +555,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {

     ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer, false);

+    ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+    if (hparams.n_swa > 0) {
+        hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+        ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer, false);
+    } else {
+        hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+    }
+
     bool rope_finetuned = false;
     ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
     hparams.rope_finetuned = rope_finetuned;

From 5a0c228c7233d587402c95faeb56c04c7ac335a2 Mon Sep 17 00:00:00 2001
From: taylorchu
Date: Sat, 29 Nov 2025 09:55:29 -0800
Subject: [PATCH 2/4] add missing template

---
 src/llama-model-loader.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index aa3a65f87a5..9a03efdf89e 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -466,6 +466,7 @@ namespace GGUFMeta {
     template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
     template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
     template bool llama_model_loader::get_key_or_arr<std::array<float, 512>>(enum llm_kv kid, std::array<float, 512> & result, uint32_t n, bool required);
+    template bool llama_model_loader::get_key_or_arr<std::array<bool, 512>>(enum llm_kv kid, std::array<bool, 512> & result, uint32_t n, bool required);


 llama_model_loader::llama_model_loader(

From b355891cd8e4d67f4db8ea9f4c4782e2af471934 Mon Sep 17 00:00:00 2001
From: taylorchu
Date: Sat, 29 Nov 2025 15:03:32 -0800
Subject: [PATCH 3/4] bool is not supported

---
 src/llama-model-loader.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 9a03efdf89e..59d088db92b 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -308,8 +308,9 @@ namespace GGUFMeta {
                                                 (std::is_same<T, uint32_t>::value)); break;
             case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
             case GGUF_TYPE_STRING:  GGML_ASSERT((std::is_same<T, std::string>::value)); break;
+            case GGUF_TYPE_BOOL:    GGML_ASSERT((std::is_same<T, bool>::value)); break;
             default:
-                throw std::runtime_error(format("%s is not a string/float32/uint32/int32 array", key.c_str()));
+                throw std::runtime_error(format("%s is not a string/float32/uint32/int32/bool array", key.c_str()));
         }

         if constexpr (std::is_same<T, std::string>::value) {
@@ -349,8 +350,9 @@ namespace GGUFMeta {
                                                 (std::is_same<T, uint32_t>::value)); break;
             case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
             case GGUF_TYPE_STRING:  GGML_ASSERT((std::is_same<T, std::string>::value)); break;
+            case GGUF_TYPE_BOOL:    GGML_ASSERT((std::is_same<T, bool>::value)); break;
             default:
-                throw std::runtime_error(format("%s is not a string/float32/uint32/int32 array", key.c_str()));
+                throw std::runtime_error(format("%s is not a string/float32/uint32/int32/bool array", key.c_str()));
         }

         if (arr_info.length > N_MAX) {

From cbcd6f98273b88bbc0e80ec4960d8c86827a4ca5 Mon Sep 17 00:00:00 2001
From: taylorchu
Date: Sat, 29 Nov 2025 16:44:10 -0800
Subject: [PATCH 4/4] qwen3 only

---
 src/llama-model.cpp  | 23 ++++++++++++++---------
 src/models/models.h  |  1 +
 src/models/qwen3.cpp | 16 ++++++++++++++--
 3 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 660261fb3cf..45fd00517c8 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -555,14 +555,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {

     ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer, false);

-    ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
-    if (hparams.n_swa > 0) {
-        hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
-        ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer, false);
-    } else {
-        hparams.swa_type = LLAMA_SWA_TYPE_NONE;
-    }
-
     bool rope_finetuned = false;
     ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
     hparams.rope_finetuned = rope_finetuned;
@@ -1071,6 +1063,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             {
                 ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+                if (hparams.n_swa > 0) {
+                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+                    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer, false);
+                } else {
+                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+                }
+
                 switch (hparams.n_layer) {
                     case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
                     case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
@@ -7204,7 +7205,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             } break;
         case LLM_ARCH_QWEN3:
             {
-                llm = std::make_unique<llm_build_qwen3>(*this, params);
+                if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+                    llm = std::make_unique<llm_build_qwen3<true>> (*this, params);
+                } else {
+                    llm = std::make_unique<llm_build_qwen3<false>>(*this, params);
+                }
             } break;
         case LLM_ARCH_QWEN3MOE:
             {
diff --git a/src/models/models.h b/src/models/models.h
index 7ba225b4784..a085bf8ffb9 100644
--- a/src/models/models.h
+++ b/src/models/models.h
@@ -407,6 +407,7 @@ struct llm_build_qwen2vl : public llm_graph_context {
     llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params);
 };

+template <bool iswa>
 struct llm_build_qwen3 : public llm_graph_context {
     llm_build_qwen3(const llama_model & model, const llm_graph_params & params);
 };
diff --git a/src/models/qwen3.cpp b/src/models/qwen3.cpp
index a5cfffa5314..c06de75e6a6 100644
--- a/src/models/qwen3.cpp
+++ b/src/models/qwen3.cpp
@@ -1,6 +1,7 @@
 #include "models.h"

-llm_build_qwen3::llm_build_qwen3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+template <bool iswa>
+llm_build_qwen3<iswa>::llm_build_qwen3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v;

     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -14,7 +15,14 @@ llm_build_qwen3::llm_build_qwen3(const llama_model & model, const llm_graph_para
     // inp_pos - contains the positions
     ggml_tensor * inp_pos = build_inp_pos();

-    auto * inp_attn = build_attn_inp_kv();
+    using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
+    inp_attn_type * inp_attn = nullptr;
+
+    if constexpr (iswa) {
+        inp_attn = build_attn_inp_kv_iswa();
+    } else {
+        inp_attn = build_attn_inp_kv();
+    }

     ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -115,3 +123,7 @@ llm_build_qwen3::llm_build_qwen3(const llama_model & model, const llm_graph_para

     ggml_build_forward_expand(gf, cur);
 }
+
+// Explicit template instantiations
+template struct llm_build_qwen3<false>;
+template struct llm_build_qwen3<true>;
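
Note (not part of the patches above): the series reads "%s.attention.sliding_window_pattern" into hparams.swa_layers as a per-layer array of booleans and keeps the window size in hparams.n_swa. The standalone sketch below only illustrates how such a pattern could be interpreted per layer; demo_hparams, effective_window and MAX_LAYERS are hypothetical names for this demo and are not llama.cpp APIs, and the actual wiring of swa_layers into the KV cache is outside its scope.

    // swa_pattern_demo.cpp -- standalone sketch, not llama.cpp code.
    // Shows how a per-layer boolean pattern (as carried by the
    // "%s.attention.sliding_window_pattern" key) combined with n_swa could
    // decide which layers attend with a sliding window.
    #include <array>
    #include <cstdint>
    #include <cstdio>

    constexpr size_t MAX_LAYERS = 512; // mirrors the std::array<bool, 512> instantiation in patch 2

    struct demo_hparams {                          // hypothetical stand-in for llama_hparams
        uint32_t n_layer = 0;
        uint32_t n_swa   = 0;                      // window size; 0 means no SWA anywhere
        std::array<bool, MAX_LAYERS> swa_layers{}; // true -> layer uses sliding-window attention
    };

    // Effective window for layer il: n_swa if the pattern marks it, 0 (full attention) otherwise.
    static uint32_t effective_window(const demo_hparams & hp, uint32_t il) {
        if (hp.n_swa == 0 || il >= hp.n_layer) {
            return 0;
        }
        return hp.swa_layers[il] ? hp.n_swa : 0;
    }

    int main() {
        demo_hparams hp;
        hp.n_layer = 8;
        hp.n_swa   = 4096;

        // example 3:1 pattern: three sliding-window layers followed by one full-attention layer
        for (uint32_t il = 0; il < hp.n_layer; ++il) {
            hp.swa_layers[il] = (il % 4) != 3;
        }

        for (uint32_t il = 0; il < hp.n_layer; ++il) {
            std::printf("layer %u: window = %u\n", il, effective_window(hp, il));
        }
        return 0;
    }

Built with e.g. g++ -std=c++17 swa_pattern_demo.cpp, the demo prints a 4096-token window for three out of every four layers and 0 (full attention) for the rest, which is the kind of per-layer layout the sliding_window_pattern key is meant to describe.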