diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 8571a2e025a..9410c9e7e45 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -201,6 +201,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_GATE_LORA_RANK,         "%s.attention.gate_lora_rank"         },
     { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
     { LLM_KV_ATTENTION_SLIDING_WINDOW,         "%s.attention.sliding_window"         },
+    { LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, "%s.attention.sliding_window_pattern" },
     { LLM_KV_ATTENTION_SCALE,                  "%s.attention.scale"                  },
     { LLM_KV_ATTENTION_OUTPUT_SCALE,           "%s.attention.output_scale"           },
     { LLM_KV_ATTENTION_TEMPERATURE_LENGTH,     "%s.attention.temperature_length"     },
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 150646478ae..811035bd392 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -205,6 +205,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_GATE_LORA_RANK,
     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
     LLM_KV_ATTENTION_SLIDING_WINDOW,
+    LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN,
     LLM_KV_ATTENTION_SCALE,
     LLM_KV_ATTENTION_OUTPUT_SCALE,
     LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index aa3a65f87a5..59d088db92b 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -308,8 +308,9 @@ namespace GGUFMeta {
                                                 (std::is_same<T, uint32_t>::value));    break;
             case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value));       break;
             case GGUF_TYPE_STRING:  GGML_ASSERT((std::is_same<T, std::string>::value)); break;
+            case GGUF_TYPE_BOOL:    GGML_ASSERT((std::is_same<T, bool>::value));        break;
             default:
-                throw std::runtime_error(format("%s is not a string/float32/uint32/int32 array", key.c_str()));
+                throw std::runtime_error(format("%s is not a string/float32/uint32/int32/bool array", key.c_str()));
         }

         if constexpr (std::is_same<T, std::string>::value) {
@@ -349,8 +350,9 @@ namespace GGUFMeta {
                                                 (std::is_same<T, uint32_t>::value));    break;
             case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value));       break;
             case GGUF_TYPE_STRING:  GGML_ASSERT((std::is_same<T, std::string>::value)); break;
+            case GGUF_TYPE_BOOL:    GGML_ASSERT((std::is_same<T, bool>::value));        break;
             default:
-                throw std::runtime_error(format("%s is not a string/float32/uint32/int32 array", key.c_str()));
+                throw std::runtime_error(format("%s is not a string/float32/uint32/int32/bool array", key.c_str()));
         }

         if (arr_info.length > N_MAX) {
@@ -466,6 +468,7 @@
     template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
     template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
     template bool llama_model_loader::get_key_or_arr<std::array<float, 512>>(enum llm_kv kid, std::array<float, 512> & result, uint32_t n, bool required);
+    template bool llama_model_loader::get_key_or_arr<std::array<bool, 512>>(enum llm_kv kid, std::array<bool, 512> & result, uint32_t n, bool required);

 llama_model_loader::llama_model_loader(
diff --git a/src/llama-model-saver.cpp b/src/llama-model-saver.cpp
index 563823dc35d..6b6262ca3d5 100644
--- a/src/llama-model-saver.cpp
+++ b/src/llama-model-saver.cpp
@@ -182,6 +182,7 @@ void llama_model_saver::add_kv_from_model() {
     add_kv(LLM_KV_ATTENTION_KV_LORA_RANK,           hparams.n_lora_kv);
     add_kv(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
     add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW,         hparams.n_swa);
+    add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, true);
     add_kv(LLM_KV_ATTENTION_SCALE,                  hparams.f_attention_scale);

     const float rope_scaling_factor = hparams.rope_freq_scale_train == 1.0f ? 0.0f : 1.0f/hparams.rope_freq_scale_train;
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index c2a545531a9..45fd00517c8 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1063,6 +1063,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             {
                 ml.get_key(LLM_KV_POOLING_TYPE,                hparams.pooling_type, false);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+                if (hparams.n_swa > 0) {
+                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+                    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer, false);
+                } else {
+                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+                }
+
                 switch (hparams.n_layer) {
                     case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
                     case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
@@ -7196,7 +7205,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             } break;
         case LLM_ARCH_QWEN3:
             {
-                llm = std::make_unique<llm_build_qwen3>(*this, params);
+                if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+                    llm = std::make_unique<llm_build_qwen3<true>> (*this, params);
+                } else {
+                    llm = std::make_unique<llm_build_qwen3<false>>(*this, params);
+                }
             } break;
         case LLM_ARCH_QWEN3MOE:
             {
diff --git a/src/models/models.h b/src/models/models.h
index 7ba225b4784..a085bf8ffb9 100644
--- a/src/models/models.h
+++ b/src/models/models.h
@@ -407,6 +407,7 @@ struct llm_build_qwen2vl : public llm_graph_context {
     llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params);
 };

+template <bool iswa>
 struct llm_build_qwen3 : public llm_graph_context {
     llm_build_qwen3(const llama_model & model, const llm_graph_params & params);
 };
diff --git a/src/models/qwen3.cpp b/src/models/qwen3.cpp
index a5cfffa5314..c06de75e6a6 100644
--- a/src/models/qwen3.cpp
+++ b/src/models/qwen3.cpp
@@ -1,6 +1,7 @@
 #include "models.h"

-llm_build_qwen3::llm_build_qwen3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+template <bool iswa>
+llm_build_qwen3<iswa>::llm_build_qwen3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v;

     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -14,7 +15,14 @@ llm_build_qwen3::llm_build_qwen3(const llama_model & model, const llm_graph_para
     // inp_pos - contains the positions
     ggml_tensor * inp_pos = build_inp_pos();

-    auto * inp_attn = build_attn_inp_kv();
+    using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
+    inp_attn_type * inp_attn = nullptr;
+
+    if constexpr (iswa) {
+        inp_attn = build_attn_inp_kv_iswa();
+    } else {
+        inp_attn = build_attn_inp_kv();
+    }

     ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -115,3 +123,7 @@ llm_build_qwen3::llm_build_qwen3(const llama_model & model, const llm_graph_para

     ggml_build_forward_expand(gf, cur);
 }
+
+// Explicit template instantiations
+template struct llm_build_qwen3<true>;
+template struct llm_build_qwen3<false>;
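
For readers of this patch: the new "%s.attention.sliding_window_pattern" metadata is loaded into a per-layer boolean array (hparams.swa_layers) that marks which layers attend within a window of n_swa tokens and which keep full attention. Note that the pattern key is read with required == false, and the SWA/non-SWA graph variant in build_graph is selected from hparams.swa_type alone. The sketch below is a standalone illustration of that per-layer semantics, not llama.cpp code; the struct name, the 64-layer cap, and the every-4th-layer pattern are invented for the example.

// swa_pattern_sketch.cpp -- standalone illustration, not part of the patch
#include <array>
#include <cstdint>
#include <cstdio>

struct swa_hparams_sketch {
    uint32_t n_swa = 0;                 // sliding window size; 0 means SWA disabled
    std::array<bool, 64> swa_layers{};  // per-layer flag; 64 is an arbitrary cap for this sketch

    // a layer uses sliding-window attention only if SWA is enabled and its flag is set
    bool is_swa(uint32_t il) const {
        return n_swa > 0 && swa_layers[il];
    }
};

int main() {
    swa_hparams_sketch hp;
    hp.n_swa = 4096;

    // hypothetical pattern: every 4th layer keeps full attention, the rest use SWA
    for (uint32_t il = 0; il < 8; ++il) {
        hp.swa_layers[il] = (il % 4) != 3;
    }

    for (uint32_t il = 0; il < 8; ++il) {
        std::printf("layer %u: %s\n", (unsigned) il, hp.is_swa(il) ? "sliding-window" : "full attention");
    }
    return 0;
}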