src/llama-arch.cpp (1 addition, 0 deletions)

@@ -201,6 +201,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_GATE_LORA_RANK,         "%s.attention.gate_lora_rank"         },
     { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
     { LLM_KV_ATTENTION_SLIDING_WINDOW,         "%s.attention.sliding_window"         },
+    { LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, "%s.attention.sliding_window_pattern" },
     { LLM_KV_ATTENTION_SCALE,                  "%s.attention.scale"                  },
     { LLM_KV_ATTENTION_OUTPUT_SCALE,           "%s.attention.output_scale"           },
     { LLM_KV_ATTENTION_TEMPERATURE_LENGTH,     "%s.attention.temperature_length"     },
src/llama-arch.h (1 addition, 0 deletions)

@@ -205,6 +205,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_GATE_LORA_RANK,
     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
     LLM_KV_ATTENTION_SLIDING_WINDOW,
+    LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN,
    LLM_KV_ATTENTION_SCALE,
    LLM_KV_ATTENTION_OUTPUT_SCALE,
    LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
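The "%s" in the name template expands to the architecture prefix, so for a Qwen3 conversion the new key sits next to the existing window size in the GGUF metadata. A hypothetical snippet for an 8-layer toy model, where the pattern is stored as one flag per layer (key names follow the templates above; the values are illustrative only, not taken from this PR):

    qwen3.attention.sliding_window         = 4096
    qwen3.attention.sliding_window_pattern = [true, true, true, false, true, true, true, false]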
src/llama-model-loader.cpp (5 additions, 2 deletions)

@@ -308,8 +308,9 @@ namespace GGUFMeta {
                              (std::is_same<T, uint32_t>::value)); break;
             case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
             case GGUF_TYPE_STRING:  GGML_ASSERT((std::is_same<T, std::string>::value)); break;
+            case GGUF_TYPE_BOOL:    GGML_ASSERT((std::is_same<T, bool>::value)); break;
             default:
-                throw std::runtime_error(format("%s is not a string/float32/uint32/int32 array", key.c_str()));
+                throw std::runtime_error(format("%s is not a string/float32/uint32/int32/bool array", key.c_str()));
         }
 
         if constexpr (std::is_same<T, std::string>::value) {
@@ -349,8 +350,9 @@ namespace GGUFMeta {
                              (std::is_same<T, uint32_t>::value)); break;
             case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
             case GGUF_TYPE_STRING:  GGML_ASSERT((std::is_same<T, std::string>::value)); break;
+            case GGUF_TYPE_BOOL:    GGML_ASSERT((std::is_same<T, bool>::value)); break;
             default:
-                throw std::runtime_error(format("%s is not a string/float32/uint32/int32 array", key.c_str()));
+                throw std::runtime_error(format("%s is not a string/float32/uint32/int32/bool array", key.c_str()));
         }
 
         if (arr_info.length > N_MAX) {
@@ -466,6 +468,7 @@ namespace GGUFMeta {
 template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
 template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
 template bool llama_model_loader::get_key_or_arr<std::array<float, 512>>(enum llm_kv kid, std::array<float, 512> & result, uint32_t n, bool required);
+template bool llama_model_loader::get_key_or_arr<std::array<bool, 512>>(enum llm_kv kid, std::array<bool, 512> & result, uint32_t n, bool required);
 
 
 llama_model_loader::llama_model_loader(
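With the bool case and the std::array<bool, 512> instantiation wired through, callers can read a bool-typed key or array through the existing get_key_or_arr path. A minimal usage sketch mirroring the Qwen3 hparams load further down; ml, n_layer, and swa_layers stand in for the loader, the layer count, and the destination array:

    // Sketch only: read the per-layer SWA pattern; a missing key is tolerated (required = false).
    std::array<bool, 512> swa_layers = {};
    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_layers, n_layer, /*required=*/false);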
src/llama-model-saver.cpp (1 addition, 0 deletions)

@@ -182,6 +182,7 @@ void llama_model_saver::add_kv_from_model() {
     add_kv(LLM_KV_ATTENTION_KV_LORA_RANK,           hparams.n_lora_kv);
     add_kv(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
     add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW,         hparams.n_swa);
+    add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, true);
     add_kv(LLM_KV_ATTENTION_SCALE,                  hparams.f_attention_scale);
 
     const float rope_scaling_factor = hparams.rope_freq_scale_train == 1.0f ? 0.0f : 1.0f/hparams.rope_freq_scale_train;
src/llama-model.cpp (14 additions, 1 deletion)

@@ -1063,6 +1063,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             {
                 ml.get_key(LLM_KV_POOLING_TYPE,                hparams.pooling_type, false);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+                if (hparams.n_swa > 0) {
+                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+                    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer, false);
+                } else {
+                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+                }
+
                 switch (hparams.n_layer) {
                     case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
                     case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
@@ -7196,7 +7205,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             } break;
         case LLM_ARCH_QWEN3:
             {
-                llm = std::make_unique<llm_build_qwen3>(*this, params);
+                if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+                    llm = std::make_unique<llm_build_qwen3<true>> (*this, params);
+                } else {
+                    llm = std::make_unique<llm_build_qwen3<false>>(*this, params);
+                }
             } break;
         case LLM_ARCH_QWEN3MOE:
             {
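The LLM_ARCH_QWEN3 case above picks between the two instantiations of the builder at runtime, while each instantiation's body branches with if constexpr (see qwen3.cpp below), so the SWA/non-SWA decision is baked in at compile time rather than re-checked while building the graph. A self-contained sketch of that pattern, using illustrative names that are not part of llama.cpp:

    #include <cstdio>
    #include <memory>

    struct builder_base {
        virtual ~builder_base() = default;
    };

    // One template, two compile-time variants: the bool parameter plays the role of iswa.
    template <bool iswa>
    struct toy_builder : builder_base {
        toy_builder() {
            if constexpr (iswa) {
                std::puts("wiring sliding-window attention inputs");
            } else {
                std::puts("wiring full-attention inputs");
            }
        }
    };

    // Runtime dispatch, mirroring the LLM_ARCH_QWEN3 case in build_graph().
    std::unique_ptr<builder_base> make_toy_builder(bool use_swa) {
        if (use_swa) {
            return std::make_unique<toy_builder<true>>();
        }
        return std::make_unique<toy_builder<false>>();
    }

    int main() {
        auto b = make_toy_builder(true);  // prints the sliding-window branch
    }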
src/models/models.h (1 addition, 0 deletions)

@@ -407,6 +407,7 @@ struct llm_build_qwen2vl : public llm_graph_context {
     llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params);
 };
 
+template <bool iswa>
 struct llm_build_qwen3 : public llm_graph_context {
     llm_build_qwen3(const llama_model & model, const llm_graph_params & params);
 };
src/models/qwen3.cpp (14 additions, 2 deletions)

@@ -1,6 +1,7 @@
 #include "models.h"
 
-llm_build_qwen3::llm_build_qwen3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+template <bool iswa>
+llm_build_qwen3<iswa>::llm_build_qwen3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v;
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -14,7 +15,14 @@ llm_build_qwen3::llm_build_qwen3(const llama_model & model, const llm_graph_params & params)
     // inp_pos - contains the positions
     ggml_tensor * inp_pos = build_inp_pos();
 
-    auto * inp_attn = build_attn_inp_kv();
+    using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
+    inp_attn_type * inp_attn = nullptr;
+
+    if constexpr (iswa) {
+        inp_attn = build_attn_inp_kv_iswa();
+    } else {
+        inp_attn = build_attn_inp_kv();
+    }
 
     ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -115,3 +123,7 @@ llm_build_qwen3::llm_build_qwen3(const llama_model & model, const llm_graph_params & params)
 
     ggml_build_forward_expand(gf, cur);
 }
+
+// Explicit template instantiations
+template struct llm_build_qwen3<false>;
+template struct llm_build_qwen3<true>;
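The explicit instantiations at the end are what let llama-model.cpp construct llm_build_qwen3<true> and llm_build_qwen3<false> even though the constructor definition lives only in this translation unit, while models.h carries just the declaration. A minimal standalone illustration of the same mechanism, using hypothetical file and type names that are not part of this PR:

    // widget.h: declaration only; the definition is deliberately kept out of the header.
    template <bool flag>
    struct widget {
        widget();
    };

    // widget.cpp: definition plus explicit instantiations.
    #include "widget.h"

    template <bool flag>
    widget<flag>::widget() {
        // construction work would go here
    }

    // Without these two lines, another .cpp file that writes `widget<true> w;`
    // would compile but fail to link with an undefined-reference error.
    template struct widget<false>;
    template struct widget<true>;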