Changes from all commits (2030 commits)
e148380
ggml : use svcntb() for SVE vector length detection (#17474)
angt Dec 2, 2025
c4357dc
Server: Change Invalid Schema from Server Error (500) to User Error (…
chadvoegele Dec 2, 2025
e251e5e
cmake : add utf8 compilation options for msvc (#17682)
xiaobing318 Dec 2, 2025
61bde8e
vulkan: Reduce temporary memory usage for TOP_K (#17623)
jeffbolznv Dec 2, 2025
4eba8d9
ci : RVV1.0 builds with tests (#16682)
alitariq4589 Dec 2, 2025
a96283a
mtmd: fix --no-warmup (#17695)
ngxson Dec 2, 2025
13628d8
server: add --media-path for local media files (#17697)
ngxson Dec 2, 2025
16cc3c6
build: document how to compile with Vulkan using Debian/Ubuntu packag…
socram8888 Dec 3, 2025
37adc9c
ggml, llama : use defaulted constructors/destructors (#17649)
GermanAizek Dec 3, 2025
b3e3060
ci : move release details to the top visible by default (#17719)
CISC Dec 3, 2025
7ca5991
ggml webgpu: add support for emscripten builds (#17184)
reeselevine Dec 3, 2025
5ceed62
server: fix duplicate HTTP headers in multiple models mode (#17698)
ServeurpersoCom Dec 3, 2025
0a8026e
common : introduce composable PEG parser combinators for chat parsing…
aldehir Dec 3, 2025
7feb0a1
ci : remove the build of openeuler-cann in release (#17724)
xuedinge233 Dec 3, 2025
3d94e96
metal : fix data race in pipeline library (#17731)
ggerganov Dec 3, 2025
083e18b
cmake: explicitly link against crypt32 on non-MSVC Windows builds (#1…
angt Dec 3, 2025
1257491
server : fix bad fmt, size() is a size_type (#17735)
angt Dec 3, 2025
e7c2cf1
server: add router multi-model tests (#17704) (#17722)
ServeurpersoCom Dec 3, 2025
190c483
chat : reserve memory in compute_diffs and improve naming (#17729)
ggerganov Dec 3, 2025
2e1c9cd
CUDA: generalized (mma) FA, add Volta support (#17505)
JohannesGaessler Dec 3, 2025
41c5e02
webui: Fix zero pasteLongTextToFileLen to disable conversion being ov…
awasisto Dec 3, 2025
e9f9483
Use OpenAI-compatible `/v1/models` endpoint by default (#17689)
allozaur Dec 3, 2025
424c579
convert : support latest mistral-common (fix conversion with --mistra…
SmartestWashingMachine Dec 3, 2025
c6d1a00
Add a couple of file types to the text section (#17670)
pwilkin Dec 3, 2025
dea9ba2
ggml-cpu: remove duplicate conditional check 'iid' (#17650)
GermanAizek Dec 3, 2025
d8b5cdc
build: enable parallel builds in msbuild using MTT (#17708)
jeffbolznv Dec 4, 2025
ef75a89
build : move _WIN32_WINNT definition to headers (#17736)
angt Dec 4, 2025
a67ef0f
llama : fix sanity checks during quantization (#17721)
ggerganov Dec 4, 2025
0d13248
metal : use params per pipeline instance (#17739)
ggerganov Dec 4, 2025
83c1171
common: use native MultiByteToWideChar (#17738)
angt Dec 4, 2025
7dba049
ci : disable ggml-ci-x64-amd-* (#17753)
CISC Dec 4, 2025
2a73f81
cmake : simplify build info detection using standard variables (#17423)
angt Dec 4, 2025
3659aa2
convert: use existing local chat_template if mistral-format model has…
SmartestWashingMachine Dec 4, 2025
87a2084
ggml-cpu : remove asserts always evaluating to false (#17728)
Alcpz Dec 4, 2025
bd4ef13
common : skip model validation when --help is requested (#17755)
danbev Dec 4, 2025
817d743
examples : add missing code block end marker [no ci] (#17756)
danbev Dec 4, 2025
c4c10bf
server: move msg diffs tracking to HTTP thread (#17740)
ngxson Dec 4, 2025
9d02299
server: strip content-length header on proxy (#17734)
ngxson Dec 4, 2025
bde188d
metal: TRI, FILL, EXPM1, SOFTPLUS (#16623)
gabe-l-hart Dec 4, 2025
96fe9ba
Add support for CUMSUM and TRI for CUDA. (#17584)
pwilkin Dec 4, 2025
3143a75
docs : update ops.md (Metal, BLAS) (#17768)
gabe-l-hart Dec 4, 2025
03d9a77
ci : transform release binary root dir in tar to llama-bXXXX (#17773)
CISC Dec 5, 2025
668ed76
HIP: enable WMMA-MMQ INT kernels for RDNA 3 (#17576)
jiachengjason Dec 5, 2025
e95d0bc
CUDA: fix FA VKQ accumulator overflow (#17746)
JohannesGaessler Dec 5, 2025
6648989
Add pwilkin to CODEOWNERS for chat files (#17789)
pwilkin Dec 5, 2025
3a0d105
Q4/Q8 Tiled Gemm Optimization. (#16999)
shalinib-ibm Dec 5, 2025
a6cfc21
ci : fix winget workflow (#17790)
angt Dec 5, 2025
1be9783
fix: prevent segfault in tokenizer on highly repetitive input (#17786)
ServeurpersoCom Dec 5, 2025
6016d0b
HIP : fix RDNA4 build (#17792)
JohannesGaessler Dec 5, 2025
c41bde6
metal : add residency sets keep-alive heartbeat (#17766)
ggerganov Dec 5, 2025
8160b38
rpc : fix alloc size logic (#17116)
ggerganov Dec 5, 2025
93bb926
vulkan: set all memory allocations to high priority (#17624)
jeffbolznv Dec 5, 2025
6ab0d64
vulkan: enable mmvq for q2_k on NVIDIA (#17675)
jeffbolznv Dec 5, 2025
fd57b24
ggml webgpu: unary op suppport, code refactoring, ops support (#17764)
reeselevine Dec 5, 2025
e15cd06
vulkan : support conv-2d with large output size (#17685)
Acly Dec 5, 2025
a0f3897
vulkan: fix top_k bug when there are ties in the input (#17659)
jeffbolznv Dec 5, 2025
933414c
vulkan: add more num_blocks instantiations in rms_norm (#17701)
jeffbolznv Dec 5, 2025
d8c0a7b
vulkan: Fix mismatch in TOPK_MOE unit test (#17541)
rillomas Dec 6, 2025
67788f6
vulkan: Replace deprecated VK_EXT_validation_features (#17637)
rillomas Dec 6, 2025
8ce774a
metal : fix build(#17799)
ggerganov Dec 6, 2025
8e5f498
contrib : stale PRs (#17803)
ggerganov Dec 6, 2025
c6c5e85
vulkan: support solve_tri with larger N/K values (#17781)
jeffbolznv Dec 6, 2025
dbc15a7
convert: support Mistral 3 Large MoE (#17730)
ngxson Dec 6, 2025
2960eb2
vulkan: Use one row per workgroup for f32 mmv (#17711)
jeffbolznv Dec 6, 2025
444f00b
llama : remove quantization sanity check (#17788)
danbev Dec 6, 2025
7b43f55
ggml : improve error handling for search path existence checks (#17653)
flyinskyin2013 Dec 6, 2025
21f24f2
webui: Per-conversation system message with UI displaying, edition & …
allozaur Dec 6, 2025
e31b5c5
webui: Fix context available value in Multi-model Router mode (#17804)
allozaur Dec 6, 2025
a28e3c7
webui: Stop generation from chat sidebar (#17806)
allozaur Dec 6, 2025
f334b79
HIP: fix RDNA3 FP16/BF16 matrix multiplication (#17817)
JohannesGaessler Dec 6, 2025
09c7c50
ggml : add circular tiling support to pad, for Vulkan, CUDA, and CPU …
Phylliida Dec 6, 2025
c42712b
server: support multiple generations from one prompt (OAI "n" option)…
ngxson Dec 6, 2025
017761d
ggml-zendnn : add ZenDNN backend for AMD CPUs (#17690)
z-vishal Dec 6, 2025
db97837
vulkan: perf_logger improvements (#17672)
jeffbolznv Dec 6, 2025
d9e03db
sycl: add missing BF16 conversion support for Intel oneAPI (#17780)
yingying0906 Dec 7, 2025
2257758
common : change --color to accept on/off/auto, default to auto (#17827)
CISC Dec 7, 2025
0a540f9
ci : add windows-cuda 13.1 release (#17839)
CISC Dec 7, 2025
08f9d3c
Vulkan: improve mul_mat_vec_iq1_m (#16907)
lovedheart Dec 7, 2025
4d37262
model: add llama 4 scaling for mistral-large (deepseek arch) (#17744)
ngxson Dec 7, 2025
79d6189
ggml-cpu: add ggml_thread_cpu_relax with Zihintpause support (#17784)
ixgbe Dec 8, 2025
5814b4d
cuda: optimize SOLVE_TRI using registers and FMAF (#17703)
wsbagnsv1 Dec 8, 2025
2bc9693
server : make cache_reuse configurable per request (#17858)
ggerganov Dec 8, 2025
37a4f63
server : add development documentation (#17760)
ngxson Dec 8, 2025
51e0c2d
cuda : add FILL op support (#17851)
JayZenith Dec 8, 2025
636fc17
Fix Kimi-K2 tool-call parsing issues (#17376)
hksdpc255 Dec 8, 2025
e4e9c43
Make graph_max_nodes vary by ubatch size (#17794)
pwilkin Dec 8, 2025
f896d2c
server: improve speed of speculative decoding (#17808)
ngxson Dec 8, 2025
68522c6
ci : support bfloat16 SYCL release package (#17855)
arthw Dec 8, 2025
951520d
server: delegate result_state creation to server_task (#17835)
ngxson Dec 8, 2025
2fa51c1
model-conversion : add token ids to prompt token output [no ci] (#17863)
danbev Dec 8, 2025
c8554b6
graph : use fill instead of scale_bias in grouped expert selection (#…
CISC Dec 8, 2025
1d2a1ab
model : support Rnj-1 (#17811)
philip-essential Dec 9, 2025
e39502e
llama : add token matching support to llama-grammar (#17816)
aldehir Dec 9, 2025
0cdce38
CUDA: fix FP16 overflow in tile FA kernel (#17875)
JohannesGaessler Dec 9, 2025
ca709e4
CANN: add support for partial RoPE and Vision mode (#17543)
noemotiovon Dec 9, 2025
4e842d5
console: allow using arrow left/right, home/end keys and history mode…
ngxson Dec 9, 2025
42b12b5
model : nit, DeepSeek V1 MoE is 16B and GigaChat is 20B (#12652)
CISC Dec 9, 2025
63908b6
cmake: fix Mach-O current version number (#17877)
Rhys-T Dec 9, 2025
86a3f0f
ggml : allow fill node alloc inplace (#17870)
CISC Dec 9, 2025
6b82eb7
metal : print node names for debugging (#17882)
ggerganov Dec 9, 2025
02e409a
ggml : Provide macos-specific backtrace printing to avoid terminal de…
gabe-l-hart Dec 9, 2025
48f4756
docs: clarify that CPU support should be first (#17886)
JohannesGaessler Dec 9, 2025
b635092
Add DIAG for CUDA (#17873)
pwilkin Dec 9, 2025
086a63e
metal: SSM kernel improvements (#17876)
gabe-l-hart Dec 9, 2025
6339185
docs : update cpu and cuda ops (#17890)
CISC Dec 9, 2025
2fbe3b7
common : add parser for ministral/mistral large 3/devstral 2 (#17713)
aldehir Dec 9, 2025
2e9eab8
fix softmax for iGPU (#17838)
NeoZhangJianyu Dec 10, 2025
9e79b01
convert: allow using quantized Mistral weight (#17889)
ngxson Dec 10, 2025
17f7f4b
CUDA: fix unpadded strides in MMA FA kernel (#17891)
JohannesGaessler Dec 10, 2025
2d2e103
docs : update opencl ops (#17904)
lhez Dec 10, 2025
b677721
model : Qwen3-Next-80B-A3B has 48 layers (#17898)
EZForever Dec 10, 2025
6c21317
cli: new CLI experience (#17824)
ngxson Dec 10, 2025
4df6e85
cuda : add missing support check for xielu (#17895)
CISC Dec 10, 2025
4dff236
ggml : remove GGML_KQ_MASK_PAD constant (#17910)
ggerganov Dec 10, 2025
e1f4921
Fix race conditions in threadpool when dealing with dynamic/frequent …
max-krasnyansky Dec 10, 2025
f32ca51
server: add presets (config) when using multiple models (#17859)
ServeurpersoCom Dec 10, 2025
34a6d86
cli: enable jinja by default (#17911)
ngxson Dec 10, 2025
c6b2c93
mtmd: some small clean up (#17909)
ngxson Dec 10, 2025
45e350e
ci: fix riscv64-native build (#17916)
CISC Dec 10, 2025
34ce48d
ggml-hexagon: fix `rope` failure at `test-backend-ops` (#17565)
chraac Dec 10, 2025
e4ae383
docs: use port 8080 in Docker examples (#17903)
utsumi-fj Dec 11, 2025
d9f8f60
batch : fix sequence id ownership (#17915)
ggerganov Dec 11, 2025
c6f6e4f
ggml-alloc : fix reuse-parent logic for misaligned sizes (#17884)
ggerganov Dec 11, 2025
53ecd4f
SOLVE_TRI extension to more dimensions (#17793)
pwilkin Dec 11, 2025
a81a569
Add a search field on model selector / improve mobile display (#17765)
ServeurpersoCom Dec 11, 2025
c33a58b
HIP: enable mmf for RDNA3 (#17879)
zhang-hui-yulo Dec 12, 2025
2eaa2c6
cmake: link ws2_32 for MinGW/w64devkit builds in cpp-httplib (#17949)
gustrd Dec 12, 2025
b8ee22c
common : add minimalist multi-thread progress bar (#17602)
angt Dec 12, 2025
dada4c8
model-conversion : remove max diff check in compare-logits [no ci] (#…
danbev Dec 12, 2025
54a0fee
arg: add -mm and -mmu as short form of --mmproj and --mmproj-url (#17…
ngxson Dec 12, 2025
12280ae
webui: Fix parsing non-LaTeX occurrencies of `\(` or `\)` (#17810)
allozaur Dec 12, 2025
1715896
mtmd: explicitly forbidden inclusion of private header and libcommon …
ngxson Dec 12, 2025
5160443
ggml-cpu : fix RISC-V Q4_0 repack select and RVV feature reporting (#…
ixgbe Dec 12, 2025
dcb7d17
cann : fix ops broken by circular padding guard (#17825)
CISC Dec 12, 2025
7bed317
models : fix the attn_factor for mistral3 graphs + improve consistenc…
ggerganov Dec 12, 2025
4822114
CUDA: fix overflow in MMA kernel without stream-k (#17939)
JohannesGaessler Dec 12, 2025
b7f5f46
docker : include legacy llama-completion binary (#17964)
CISC Dec 12, 2025
a8c7f33
ci : change the cann version and the container pull method (#17953)
xuedinge233 Dec 12, 2025
e39a2ce
clip: move model cgraphs into their own files (#17965)
ngxson Dec 12, 2025
380b4c9
common: support negated args (#17919)
ngxson Dec 12, 2025
fd1085f
model-conversion : use CONVERTED_MODEL value for converted model [no …
danbev Dec 13, 2025
2bc94e7
add llama-completion to completion-bash executables (#17976)
CISC Dec 13, 2025
07a10c1
vulkan: Allow non-pow2 n_experts in topk_moe (#17872)
jeffbolznv Dec 13, 2025
8e4d678
common : skip model validation when --completion-bash is requested (#…
CISC Dec 13, 2025
3c6391e
speculative-simple : free batch on exit (#17985)
ggerganov Dec 13, 2025
303f861
vulkan: Multi-pass softmax for large number of cols (#17892)
jeffbolznv Dec 13, 2025
3229a23
vulkan: support GGML_OP_DIAG (#17893)
jeffbolznv Dec 13, 2025
36255a2
vulkan: support get_rows for i32 (#17941)
jeffbolznv Dec 13, 2025
66ba512
cmake: correct scope - link ws2_32 for MinGW/w64devkit builds in cpp-…
gustrd Dec 13, 2025
4d5ae24
arg: fix common_params_parse not accepting negated arg (#17991)
ngxson Dec 13, 2025
5266379
llama_context: synchronize before reallocating output buffer (#17974)
jeffbolznv Dec 13, 2025
4ed2bae
server-models.cpp: add missing <filesystem> (#18000)
barracuda156 Dec 13, 2025
c00ff92
scripts: add script to compare logprobs of llama.cpp against other fr…
ngxson Dec 13, 2025
615655a
cmake : set `CMAKE_RUNTIME_OUTPUT_DIRECTORY` for non standalone build…
HerrCai0907 Dec 12, 2025
71fdcf0
ggml : arm repack fix build (whisper/0)
ggerganov Dec 13, 2025
0e59224
sync : ggml
ggerganov Dec 13, 2025
a63cbaf
ggml : arm repack fix build
ggerganov Dec 13, 2025
609a2d0
models : fix YaRN regression + consolidate logic (#18006)
ggerganov Dec 14, 2025
77ad854
model-conversion : cast logits to float32 (#18009)
ggerganov Dec 14, 2025
d15d177
vulkan: faster q6_k matmul (#17813)
netrunnereve Dec 14, 2025
4722671
vulkan: improve mul_mat_vec_iq1_s speed (#17874)
lovedheart Dec 14, 2025
3238b14
vulkan: Fix data race/hang in scalar/cm1 flash attention (#17887)
jeffbolznv Dec 14, 2025
254098a
common : refactor common_sampler + grammar logic changes (#17937)
ggerganov Dec 14, 2025
0759b09
graph: add f_attn_temp_offset (#18025)
ngxson Dec 14, 2025
9e6649e
vulkan: fix mul_mat_vec_iq1_s formatting (#18026)
0cc4m Dec 14, 2025
37f5a10
mtmd: enhance image resizing in llava_uhd (#18014)
bluebread Dec 14, 2025
5c8a717
convert : refactor rope scaling handling (#18013)
CISC Dec 14, 2025
5239229
preset: handle negated arg, reverse the meaning if needed (#18041)
ngxson Dec 14, 2025
745fa0e
model : add glm-asr support (#17901)
piDack Dec 15, 2025
4aced7a
[SYCL] Support gpt-oss by OPs add-id, mul_mat for mxfp4, swiglu_oai (…
NeoZhangJianyu Dec 15, 2025
b1f3a6e
llama: automatically set parameters not set by the user in such a way…
JohannesGaessler Dec 15, 2025
e73d548
webui: add "delete all conversations" button to import/export tab (#1…
thomasjfox Dec 15, 2025
4a4f7e6
cli: fixed dead links to tools/main for cli and completion, fixed cod…
andrew-aladev Dec 15, 2025
96a181a
mtmd: refactor audio preprocessing (#17978)
ngxson Dec 15, 2025
165caaf
metal: use shared buffers on eGPU (#17866)
jdemeule Dec 15, 2025
0f4f35e
Fix unreadable user markdown colors and truncate long texts in deleti…
ServeurpersoCom Dec 15, 2025
4529c66
kv-cache: Fix state restore fragmented cache (#17982)
ssweens Dec 15, 2025
9d52f17
model : add KORMo model (#18032)
HelloKS Dec 15, 2025
c45f89d
ggml-hexagon: mm for mtmd (#17894)
joeldushouyu Dec 15, 2025
d6a1e18
convert : move rope_parameters to TextModel class (#18061)
CISC Dec 15, 2025
40d9c39
Webui: Disable attachment button and model selector button when promp…
dariusjlukas Dec 16, 2025
2995341
llama : add support for NVIDIA Nemotron 3 Nano (#18058)
danbev Dec 16, 2025
a20979d
webui: Add setting to always show sidebar on Desktop (#17809)
allozaur Dec 16, 2025
3034836
webui: Improve copy to clipboard with text attachments (#17969)
allozaur Dec 16, 2025
d674212
ci : separate webui from server (#18072)
CISC Dec 16, 2025
c560316
graph : reuse SSM graphs (#16490)
ggerganov Dec 16, 2025
2aa45ef
llama: Include algorithm header needed for C++23 (#18078)
cpeterso Dec 16, 2025
5ba9575
security : add collaborator guidance (#18081)
ggerganov Dec 16, 2025
279cef2
added note for old Intel hardware pre sycl (#18017)
alosslessdev Dec 16, 2025
c05aa69
common : add nemotron 3 parsing (#18077)
aldehir Dec 16, 2025
db81d5e
model-conversion : use CONVERTED_EMBEDDING_MODEL for embedding_verify…
danbev Dec 16, 2025
9963b81
model-conversion : add note about verifying previous models (#18082)
danbev Dec 16, 2025
3d86c6c
model: support GLM4V vision encoder (#18042)
ngxson Dec 16, 2025
5f5f9b4
server: Update README.md incorrect argument (#18073)
2114L3 Dec 16, 2025
fb64424
CLI: fixed adding cli and completion into docker containers, improved…
andrew-aladev Dec 16, 2025
a5251ca
Optimization: Qwen3 next autoregressive pass (#17996)
pwilkin Dec 16, 2025
7b1db3d
arg: clarify auto kvu/np being set on server (#17997)
ngxson Dec 16, 2025
7f2b2f3
arch: refactor LLM_TENSOR_NAMES (#18051)
ngxson Dec 16, 2025
79dbae0
model-conversion : remove -fa option in model card template [no ci] (…
danbev Dec 16, 2025
59977eb
server: fix crash when batch > ubatch with embeddings (#17912)
yifant-code Dec 16, 2025
ec98e20
llama: fix early stop in params_fit if ctx is set (#18070)
JohannesGaessler Dec 16, 2025
ef83fb8
model: fix LFM2 missing tensors (#18105)
ngxson Dec 16, 2025
4164596
llama-fit-params: QoL impr. for prints/errors (#18089)
JohannesGaessler Dec 16, 2025
0e49a7b
llama-fit-params: fix underflow for dense models (#18095)
JohannesGaessler Dec 16, 2025
9dcac6c
llama-fit-params: lower ctx size for multi GPU (#18101)
JohannesGaessler Dec 16, 2025
d0794e8
llama-fit-params: force disable mlock (#18103)
JohannesGaessler Dec 16, 2025
2973a65
gguf-py : allow converting multi-tensor models from read-only locatio…
ykhrustalev Dec 17, 2025
5806286
ggml : use WARP_SIZE/2 for argmax reduction offset (#18092)
Aadeshveer Dec 17, 2025
4b2a477
arg: allow -kvu flag for llama-perplexity (#18117)
TrevorS Dec 17, 2025
5c0d188
llama.android : Rewrite Android binding (w/o cpu_features dep) (#17413)
naco-siren Dec 17, 2025
acec774
HIP: Refactor mma for RDNA and CDNA (#17990)
zhang-hui-yulo Dec 17, 2025
487674f
common: fix --override-kv to support comma-separated values (#18056)
ServeurpersoCom Dec 17, 2025
6853bee
ci : clean up webui jobs (#18116)
CISC Dec 17, 2025
982060f
model: fix LFM2_MOE missing tensors (#18132)
tdakhran Dec 17, 2025
669696e
ggml-cpu: ARM64: repack version of q8_0 (dotprod and i8mm) (#18096)
Alcpz Dec 17, 2025
6f1f6a9
Github: ask for -v logs for params_fit [no ci] (#18128)
JohannesGaessler Dec 17, 2025
8faa87d
Extend run-org-model.py, add (a) batching (b) loading prompt from fil…
pwilkin Dec 17, 2025
bde461d
server: (router) allow child process to report status via stdout (#18…
ngxson Dec 17, 2025
15dd67d
model: fix GLM-ASR-Nano-2512 load error (#18130) (#18142)
HonestQiao Dec 17, 2025
a2c199e
common: clarify instructions for bug reports (#18134)
JohannesGaessler Dec 17, 2025
4301e27
common : restore grammar-based rejection sampling (#18137)
ggerganov Dec 17, 2025
4470a07
ggml-hexagon: gelu operation (#17921)
joeldushouyu Dec 17, 2025
d37fc93
webui: fix chat header width when sidebar is closed (#17981)
polydecay Dec 17, 2025
8dcc366
llama-fit-params: fix memory print (#18136)
JohannesGaessler Dec 17, 2025
e85e9d7
server: (router) disable SSL on child process (#18141)
ngxson Dec 17, 2025
6ce3d85
server: (webui) add --webui-config (#18028)
ServeurpersoCom Dec 17, 2025
5166aaf
convert : force patch_merger tensors to f16/f32 (#18124)
CISC Dec 17, 2025
0a0bba0
ggml-hexagon: swiglu_oai operation (#18114)
joeldushouyu Dec 17, 2025
4d4f4ca
llama : Async DirectIO model loading on Linux (#18012)
JTischbein Dec 18, 2025
9cff4cc
convert : sort and use file parts from model index if present (#18043)
CISC Dec 18, 2025
57c1e05
llama: offload output layer to GPU first (#18148)
JohannesGaessler Dec 18, 2025
900316d
webui: fix chat screen shadow width (#18010)
polydecay Dec 18, 2025
9ce64ae
webui: Fix selecting generated output issues during active streaming …
allozaur Dec 18, 2025
54189c0
remove i_major_dual (#18157)
zhang-hui-yulo Dec 18, 2025
ec7b932
gguf-py : use copy-on-write mode for localtensor (#18162)
CISC Dec 18, 2025
4d1316c
arg: fix ASAN error on sampler_type_names empty (#18167)
ngxson Dec 18, 2025
f716588
ggml-cpu: extend support for RVV floating-point kernels (#17318)
taimur-10x Dec 18, 2025
f9ec885
webui: display prompt processing stats (#18146)
ServeurpersoCom Dec 18, 2025
8ea958d
model : add ASR support for LFM2-Audio-1.5B (conformer) (#18106)
ngxson Dec 18, 2025
cdbada8
vulkan: Add perf logger mode with concurrency (#17944)
jeffbolznv Dec 19, 2025
52fc7fe
android: fix missing screenshots for Android.md (#18156)
naco-siren Dec 19, 2025
0a271d8
model-conversion : add verbose flag in run-org-model.py (#18194)
danbev Dec 19, 2025
acb73d8
webui: Add editing attachments in user messages (#18147)
allozaur Dec 19, 2025
98c1c7a
presets: refactor, allow cascade presets from different sources, add …
ngxson Dec 19, 2025
cc0a043
server: friendlier error msg when ctx < input (#18174)
am17an Dec 19, 2025
f99ef53
llama : Changing off_t to size_t for Windows (#18204)
JTischbein Dec 19, 2025
14931a8
arg: fix order to use short form before long form (#18196)
ServeurpersoCom Dec 19, 2025
ce734a8
ggml-hexagon: Implement true Q8_0 quantization on Hexagon NPU for mor…
ngdxzy Dec 19, 2025
f74747d
ci : only save ccache on master (#18207)
CISC Dec 19, 2025
74e0513
ci : remove non-windows zip artifacts (#18201)
CISC Dec 19, 2025
18 changes: 14 additions & 4 deletions .clang-format
@@ -22,8 +22,15 @@ AllowShortIfStatementsOnASingleLine: Never
AllowShortLambdasOnASingleLine: Inline
AllowShortLoopsOnASingleLine: false
AlwaysBreakBeforeMultilineStrings: true
# Treat CUDA keywords/attributes as "attribute macros" and avoid breaking lines inside them
AttributeMacros:
- __host__
- __device__
- __global__
- __forceinline__
- __launch_bounds__
BinPackArguments: true
BinPackParameters: true # OnePerLine
BinPackParameters: false # OnePerLine
BitFieldColonSpacing: Both
BreakBeforeBraces: Custom # Attach
BraceWrapping:
@@ -70,15 +77,18 @@ ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
IncludeBlocks: Regroup
IncludeCategories:
- Regex: '^<.*\.h>'
- Regex: '".*"'
Priority: 1
SortPriority: 0
- Regex: '^<.*'
- Regex: '^<.*\.h>'
Priority: 2
SortPriority: 0
- Regex: '.*'
- Regex: '^<.*'
Priority: 3
SortPriority: 0
- Regex: '.*'
Priority: 4
SortPriority: 0
IncludeIsMainRegex: '([-_](test|unittest))?$'
IncludeIsMainSourceRegex: ''
IndentAccessModifiers: false
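For illustration, a quick local spot-check of the new rules — a sketch only: the sample file, kernel name, and repository path are made up, and the file:<path> style syntax assumes clang-format 14 or newer.

cat > /tmp/fmt-check.cu <<'EOF'
#include "example_local_header.h"
#include <cuda_runtime.h>
#include <vector>

__global__ void __launch_bounds__(256) example_kernel(const float * x, float * y, int n);
EOF
clang-format --style=file:/path/to/llama.cpp/.clang-format /tmp/fmt-check.cu
# Expected under the updated config: quoted includes group before <*.h> system
# headers, which group before other <...> headers, and __launch_bounds__(256)
# stays on the declaration line because it is now listed in AttributeMacros.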
1 change: 1 addition & 0 deletions .clang-tidy
@@ -17,6 +17,7 @@ Checks: >
clang-analyzer-*,
-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
performance-*,
-performance-enum-size,
portability-*,
-portability-simd-intrinsics,
misc-*,
129 changes: 129 additions & 0 deletions .devops/cann.Dockerfile
@@ -0,0 +1,129 @@
# ==============================================================================
# ARGUMENTS
# ==============================================================================

# Define the CANN base image for easier version updates later
ARG CHIP_TYPE=910b
ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc2-${CHIP_TYPE}-openeuler24.03-py3.11

# ==============================================================================
# BUILD STAGE
# Compile all binary files and libraries
# ==============================================================================
FROM ${CANN_BASE_IMAGE} AS build

# -- Install build dependencies --
RUN yum install -y gcc g++ cmake make git libcurl-devel python3 python3-pip && \
yum clean all && \
rm -rf /var/cache/yum

# -- Set the working directory --
WORKDIR /app

# -- Copy project files --
COPY . .

# -- Set CANN environment variables (required for compilation) --
# Using ENV instead of `source` allows environment variables to persist across the entire image layer
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${LD_LIBRARY_PATH}
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${PATH}
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
# ... You can add other environment variables from the original file as needed ...
# For brevity, only core variables are listed here. You can paste the original ENV list here.

# -- Build llama.cpp --
# Use the passed CHIP_TYPE argument and add general build options
ARG CHIP_TYPE
RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh --force \
&& \
cmake -B build \
-DGGML_CANN=ON \
-DCMAKE_BUILD_TYPE=Release \
-DSOC_TYPE=ascend${CHIP_TYPE} \
. && \
cmake --build build --config Release -j$(nproc)

# -- Organize build artifacts for copying in later stages --
# Create a lib directory to store all .so files
RUN mkdir -p /app/lib && \
find build -name "*.so*" -exec cp -P {} /app/lib \;

# Create a full directory to store all executables and Python scripts
RUN mkdir -p /app/full && \
cp build/bin/* /app/full/ && \
cp *.py /app/full/ && \
cp -r gguf-py /app/full/ && \
cp -r requirements /app/full/ && \
cp requirements.txt /app/full/
# If you have a tools.sh script, make sure it is copied here
# cp .devops/tools.sh /app/full/tools.sh

# ==============================================================================
# BASE STAGE
# Create a minimal base image with CANN runtime and common libraries
# ==============================================================================
FROM ${CANN_BASE_IMAGE} AS base

# -- Install runtime dependencies --
RUN yum install -y libgomp curl && \
yum clean all && \
rm -rf /var/cache/yum

# -- Set CANN environment variables (required for runtime) --
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LD_LIBRARY_PATH=/app:${ASCEND_TOOLKIT_HOME}/lib64:${LD_LIBRARY_PATH}
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${PATH}
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
# ... You can add other environment variables from the original file as needed ...

WORKDIR /app

# Copy compiled .so files from the build stage
COPY --from=build /app/lib/ /app

# ==============================================================================
# FINAL STAGES (TARGETS)
# ==============================================================================

### Target: full
# Complete image with all tools, Python bindings, and dependencies
# ==============================================================================
FROM base AS full

COPY --from=build /app/full /app

# Install Python dependencies
RUN yum install -y git python3 python3-pip && \
pip3 install --no-cache-dir --upgrade pip setuptools wheel && \
pip3 install --no-cache-dir -r requirements.txt && \
yum clean all && \
rm -rf /var/cache/yum

# You need to provide a tools.sh script as the entrypoint
ENTRYPOINT ["/app/tools.sh"]
# If there is no tools.sh, you can set the default to start the server
# ENTRYPOINT ["/app/llama-server"]

### Target: light
# Lightweight image containing only llama-cli and llama-completion
# ==============================================================================
FROM base AS light

COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

ENTRYPOINT [ "/app/llama-cli" ]

### Target: server
# Dedicated server image containing only llama-server
# ==============================================================================
FROM base AS server

ENV LLAMA_ARG_HOST=0.0.0.0

COPY --from=build /app/full/llama-server /app

HEALTHCHECK --interval=5m CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]
22 changes: 0 additions & 22 deletions .devops/cloud-v-pipeline

This file was deleted.

10 changes: 3 additions & 7 deletions .devops/cpu.Dockerfile
@@ -4,27 +4,23 @@ FROM ubuntu:$UBUNTU_VERSION AS build

ARG TARGETARCH

ARG GGML_CPU_ARM_ARCH=armv8-a

RUN apt-get update && \
apt-get install -y build-essential git cmake libcurl4-openssl-dev

WORKDIR /app

COPY . .

RUN if [ "$TARGETARCH" = "amd64" ]; then \
RUN if [ "$TARGETARCH" = "amd64" ] || [ "$TARGETARCH" = "arm64" ]; then \
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
elif [ "$TARGETARCH" = "arm64" ]; then \
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \
else \
echo "Unsupported architecture"; \
exit 1; \
fi && \
cmake --build build -j $(nproc)

RUN mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
find build -name "*.so*" -exec cp -P {} /app/lib \;

RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
@@ -72,7 +68,7 @@ ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light

COPY --from=build /app/full/llama-cli /app
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

WORKDIR /app
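
With arm64 now sharing the GGML_CPU_ALL_VARIANTS code path, a single multi-architecture build is a natural follow-on; a hedged sketch, assuming a configured buildx builder (the tag is illustrative):

docker buildx build --platform linux/amd64,linux/arm64 \
    -f .devops/cpu.Dockerfile --target server \
    -t example/llama-cpp:server-cpu --push .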

6 changes: 3 additions & 3 deletions .devops/cuda.Dockerfile
@@ -25,7 +25,7 @@ RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
cmake --build build --config Release -j$(nproc)

RUN mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
find build -name "*.so*" -exec cp -P {} /app/lib \;

RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
@@ -61,7 +61,7 @@ RUN apt-get update \
python3 \
python3-pip \
&& pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt \
&& pip install --break-system-packages -r requirements.txt \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
@@ -74,7 +74,7 @@ ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light

COPY --from=build /app/full/llama-cli /app
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

WORKDIR /app

40 changes: 22 additions & 18 deletions .devops/intel.Dockerfile
@@ -1,8 +1,8 @@
ARG ONEAPI_VERSION=2025.1.1-0-devel-ubuntu24.04
ARG ONEAPI_VERSION=2025.2.2-0-devel-ubuntu24.04

## Build Image

FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS build

ARG GGML_SYCL_F16=OFF
RUN apt-get update && \
@@ -21,7 +21,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
cmake --build build --config Release -j$(nproc)

RUN mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
find build -name "*.so*" -exec cp -P {} /app/lib \;

RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
@@ -31,7 +31,7 @@ RUN mkdir -p /app/full \
&& cp requirements.txt /app/full \
&& cp .devops/tools.sh /app/full/tools.sh

FROM intel/oneapi-basekit:$ONEAPI_VERSION AS base
FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base

RUN apt-get update \
&& apt-get install -y libgomp1 curl\
@@ -49,27 +49,31 @@ COPY --from=build /app/full /app

WORKDIR /app

RUN apt-get update \
&& apt-get install -y \
git \
python3 \
python3-pip \
&& pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete

RUN apt-get update && \
apt-get install -y \
git \
python3 \
python3-pip \
python3-venv && \
python3 -m venv /opt/venv && \
. /opt/venv/bin/activate && \
pip install --upgrade pip setuptools wheel && \
pip install -r requirements.txt && \
apt autoremove -y && \
apt clean -y && \
rm -rf /tmp/* /var/tmp/* && \
find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
find /var/cache -type f -delete

ENV PATH="/opt/venv/bin:$PATH"

ENTRYPOINT ["/app/tools.sh"]

### Light, CLI only
FROM base AS light

COPY --from=build /app/lib/ /app
COPY --from=build /app/full/llama-cli /app
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

WORKDIR /app
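
A hedged build sketch for the updated Intel image (the tag is illustrative); GGML_SYCL_F16 defaults to OFF per the ARG above, and in the full target python3/pip now resolve to the /opt/venv virtual environment via the PATH set earlier:

docker build -f .devops/intel.Dockerfile --target full \
    --build-arg GGML_SYCL_F16=ON -t example/llama-cpp:sycl-full .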

5 changes: 3 additions & 2 deletions .devops/llama-cli-cann.Dockerfile
@@ -23,11 +23,12 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
RUN echo "Building with static libs" && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_TESTS=OFF && \
cmake --build build --config Release --target llama-cli
cmake --build build --config Release --target llama-cli && \
cmake --build build --config Release --target llama-completion

# TODO: use image with NNRT
FROM ascendai/cann:$ASCEND_VERSION AS runtime
COPY --from=build /app/build/bin/llama-cli /llama-cli
COPY --from=build /app/build/bin/llama-cli /app/build/bin/llama-completion /

ENV LC_ALL=C.utf8

2 changes: 2 additions & 0 deletions .devops/llama-cpp-cuda.srpm.spec
@@ -37,6 +37,7 @@ make -j GGML_CUDA=1
%install
mkdir -p %{buildroot}%{_bindir}/
cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
cp -p llama-completion %{buildroot}%{_bindir}/llama-cuda-completion
cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple

@@ -68,6 +69,7 @@ rm -rf %{_builddir}/*

%files
%{_bindir}/llama-cuda-cli
%{_bindir}/llama-cuda-completion
%{_bindir}/llama-cuda-server
%{_bindir}/llama-cuda-simple
/usr/lib/systemd/system/llamacuda.service
2 changes: 2 additions & 0 deletions .devops/llama-cpp.srpm.spec
@@ -39,6 +39,7 @@ make -j
%install
mkdir -p %{buildroot}%{_bindir}/
cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
cp -p llama-completion %{buildroot}%{_bindir}/llama-completion
cp -p llama-server %{buildroot}%{_bindir}/llama-server
cp -p llama-simple %{buildroot}%{_bindir}/llama-simple

@@ -70,6 +71,7 @@ rm -rf %{_builddir}/*

%files
%{_bindir}/llama-cli
%{_bindir}/llama-completion
%{_bindir}/llama-server
%{_bindir}/llama-simple
/usr/lib/systemd/system/llama.service