Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
3295 commits
Select commit Hold shift + click to select a range
7b50f7c
graph : prepare for 4D mask (#14515)
ggerganov Jul 4, 2025
67d1ef2
batch : add optional for sequential equal split (#14511)
ggerganov Jul 4, 2025
ef797db
metal : disable fast math in all quantize kernels (#14528)
ggerganov Jul 4, 2025
b81510a
test-backend-ops: add support for specifying output format (#14368)
yeahdongcn Jul 5, 2025
bac8bed
eval-callback : check for empty input (#14539)
ggerganov Jul 5, 2025
6681688
opencl: add GELU_ERF (#14476)
CISC Jul 5, 2025
ddef995
server : fix assistant prefilling when content is an array (#14360)
CISC Jul 5, 2025
a0374a6
vulkan: Handle updated FA dim2/3 definition (#14518)
jeffbolznv Jul 5, 2025
e592be1
vulkan: fix rms_norm+mul fusion (#14545)
jeffbolznv Jul 6, 2025
6491d6e
vulkan: increase LOAD_VEC_A to 8 (IQ1/IQ2) or 4 (IQ3) (#14485)
netrunnereve Jul 6, 2025
b9c3eef
CUDA: add bf16 and i32 to getrows (#14529)
am17an Jul 7, 2025
12f55c3
llama : remove ggml_cont where possible (#14568)
CISC Jul 7, 2025
e1a7059
llama : fix incorrect minicpm3 v_states shape (#14571)
CISC Jul 7, 2025
68155c6
musa: fix build warnings (unused variable) (#14561)
yeahdongcn Jul 7, 2025
75c91de
CUDA: add bilinear interpolation for upscale (#14563)
am17an Jul 8, 2025
4d0dcd4
cuda : fix rope with partial rotation and non-cont src (#14580)
ggerganov Jul 8, 2025
53903ae
vulkan: increase timeout for CI (#14574)
jeffbolznv Jul 8, 2025
8f22dc0
model : add hunyuan moe (#14425)
ngxson Jul 8, 2025
17a1f0d
server: Add ability to mount server at prefix (#14544)
oluwandabira Jul 8, 2025
b8eeb87
vulkan : fix rope with partial rotation and non-cont src (#14582)
jeffbolznv Jul 8, 2025
bb4f7a9
memory : fix broken batch splits for recurrent cache (#14575)
compilade Jul 8, 2025
0838286
model : add SmolLM3 (#14581)
ngxson Jul 8, 2025
699f439
model : fix hunyuan moe chat template (#14584)
stevenkuang-tencent Jul 8, 2025
6efcd65
vulkan: optimize flash attention split_k_reduce (#14554)
jeffbolznv Jul 8, 2025
20b7bf8
convert : fix smollm3 jinja template (#14586)
ngxson Jul 9, 2025
0465506
model : add support for Falcon-H1 family (#14534)
ibrahimkhadraoui Jul 9, 2025
1055545
llama : remove unintended whitespace (#14592)
CISC Jul 9, 2025
ffd59e7
model : add skt/A.X-4.0 model vocabulary (#14589)
Bing-su Jul 9, 2025
26a48ad
ggml : prevent integer overflow in gguf tensor size calculation (#14595)
Yuuoniy Jul 9, 2025
98bab63
ggml : add ggml_scale_bias (#14417)
ngxson Jul 9, 2025
4a5686d
llama : support Jamba hybrid Transformer-Mamba models (#7531)
compilade Jul 9, 2025
cb9178f
llama : remove llm_graph_input_one (#14603)
ngxson Jul 9, 2025
a57d1bc
cuda : support Falcon-H1 state size for SSM_SCAN (#14602)
compilade Jul 10, 2025
ac44eb6
cmake : llguidance build parser library only (#14608)
EZForever Jul 10, 2025
f9a867f
cmake : bump llguidance version to v1.0.1 (#14609)
EZForever Jul 10, 2025
435a6d1
llama : minor coding style fix for smollm3 (#14605)
ngxson Jul 10, 2025
704bb7a
SYCL: Initial set_rows kernel implementation (#14562)
qnixsynapse Jul 10, 2025
a457551
cmake : do not search for curl libraries by ourselves (#14613)
EZForever Jul 10, 2025
11ee0fe
Docs: script to auto-generate ggml operations docs (#14598)
am17an Jul 10, 2025
4bb625b
Smoldocling support (#14597)
ryan-mangeno Jul 10, 2025
0b88557
opencl: add `set_rows` for `f16` and `f32` (#14547)
lhez Jul 10, 2025
6bdda13
opencl: add tiled mul_mat_f16_f32 (#14535)
rmatif Jul 10, 2025
0aedae0
model : Granite Four (#13550)
gabe-l-hart Jul 11, 2025
576c82e
vocab : add midm-2.0 model pre-tokenizer (#14626)
Bing-su Jul 11, 2025
0d5375d
llama : move enum llama_vocab_pre_type to implementation (#14631)
ggerganov Jul 11, 2025
aaa088d
readme : add hot PRs (#14636)
ggerganov Jul 11, 2025
756aa10
HIP : Add HIP 7.0+ compatibility for hipBLAS compute types (#14634)
slojosic-amd Jul 11, 2025
f5e96b3
model : support LiquidAI LFM2 hybrid family (#14620)
tdakhran Jul 11, 2025
98197e5
vulkan: optimizations for deepseek prompt processing (#14555)
jeffbolznv Jul 12, 2025
b3ad3a0
vulkan: support SET_ROWS (#14587)
jeffbolznv Jul 12, 2025
0c1df14
server : fix pooled embedding output (#14645)
iamlemec Jul 12, 2025
3e303b1
vulkan : implement ggml_roll (ggml/1290)
Acly Jul 12, 2025
74bb294
vulkan : implement bilinear interpolation (ggml/1291)
Acly Jul 12, 2025
2155357
sync : ggml
ggerganov Jul 12, 2025
3120413
vulkan : remove unused vars (#0)
ggerganov Jul 12, 2025
8eff955
sync : ggml
ggerganov Jul 12, 2025
7de5c7c
CUDA: add set rows for f32 and f16 (#14551)
am17an Jul 12, 2025
67eade1
docs : add LFM2 to models section (#14650)
tdakhran Jul 12, 2025
c31e606
tests : cover lfm2 cases in test_ssm_conv (#14651)
tdakhran Jul 12, 2025
84b396e
cmake : Add CMake presets for Linux and GCC (#14656)
YavorGIvanov Jul 13, 2025
dcf7f2e
metal : Add missing unary ops Metal support (#14660)
YavorGIvanov Jul 13, 2025
05fec5b
ggml : add build-time message to remind about ggml_set_rows (#14661)
ggerganov Jul 13, 2025
e743cdd
cuda : add ELU support (#14657)
YavorGIvanov Jul 13, 2025
923e3ea
cuda : add set rows for bf16 (#14664)
CISC Jul 13, 2025
982e347
quantize : fix minor logic flaw in --tensor-type (#14572)
EAddario Jul 13, 2025
0d92267
llama : add jinja template for rwkv-world (#14665)
MollySophia Jul 13, 2025
65a3ebb
sycl: Batched mulmat rework for oneDNN dispatch (#14617)
ShanoToni Jul 14, 2025
0f4c6ec
SYCL: use 1D kernel for set_rows (#14618)
qnixsynapse Jul 14, 2025
494c589
scripts: benchmark for HTTP server throughput (#14668)
JohannesGaessler Jul 14, 2025
9c9e4fc
llama-context: add ability to get logits (#14672)
am17an Jul 14, 2025
55c509d
ggml : refactor llamafile_sgemm PPC code (#14673)
shalinib-ibm Jul 14, 2025
bdca383
sycl: Hotfix for non dnnl codepath (#14677)
ShanoToni Jul 14, 2025
cbc68be
cuda: fix build warnings in set-rows.cu (unused variable) (#14687)
yeahdongcn Jul 15, 2025
68e37a6
model : add PLaMo-2 support (#14560)
mitmul Jul 15, 2025
10a0351
vulkan: add RTE variants for glu/add/sub/mul/div (#14653)
jeffbolznv Jul 15, 2025
ba1ceb3
vulkan: fix noncontig check for mat_mul_id splitting (#14683)
jeffbolznv Jul 15, 2025
4a4f426
model : add Kimi-K2 support (#14654)
gabriellarson Jul 15, 2025
c81f419
gguf-py : dump bpw per layer and model in markdown mode (#14703)
EAddario Jul 15, 2025
79e0b68
llama: add LLAMA_API to deprecated llama_kv_self_seq_div (#14708)
Min-Hua Jul 16, 2025
cf91f21
convert : add pre-computed hashes first to prevent order mishaps (#14…
CISC Jul 16, 2025
4b91d6f
convert : only check for tokenizer folder if we need it (#14704)
CISC Jul 16, 2025
5cae766
scripts: synthetic prompt mode for server-bench.py (#14695)
JohannesGaessler Jul 16, 2025
538cc77
server : fix handling of the ignore_eos flag (#14710)
ggerganov Jul 16, 2025
e4841d2
llama : fix parallel processing for plamo2 (#14716)
mitmul Jul 16, 2025
6ffd4e9
server : pre-calculate EOG logit biases (#14721)
ggerganov Jul 16, 2025
6497834
ggml : add asserts (#14720)
ggerganov Jul 16, 2025
ab14019
Support diffusion models: Add Dream 7B (#14644)
am17an Jul 16, 2025
225e7a1
llama : add high-throughput mode (#14363)
ggerganov Jul 16, 2025
b0f0ecc
model : support output bias for qwen2 (#14711)
tempstudio Jul 16, 2025
21c0217
ggml: Add initial WebGPU backend (#14521)
reeselevine Jul 16, 2025
496957e
llama : fix parameter order for hybrid memory initialization (#14725)
dinerburger Jul 16, 2025
19e5943
convert : make hf token optional (#14717)
CISC Jul 16, 2025
1ba45d4
ci : disable failing vulkan crossbuilds (#14723)
CISC Jul 16, 2025
ad57d3e
batch : fix uninitialized has_cpl flag (#14733)
ggerganov Jul 17, 2025
d9b6910
kv-cache : opt mask set input (#14600)
ggerganov Jul 17, 2025
086cf81
llama : fix parallel processing for lfm2 (#14705)
tdakhran Jul 17, 2025
01612b7
llama : reuse compute graphs (#14482)
ggerganov Jul 17, 2025
d6fb3f6
kv-cache : fix k-shift for multiple streams (#14742)
ggerganov Jul 17, 2025
cb887f1
model: add Ernie 4.5 MoE support (#14658)
pwilkin Jul 17, 2025
760b448
nix : use optionalAttrs for env mkDerivation attrset argument (#14726)
amozeo Jul 17, 2025
670e136
convert : fix Ernie4.5 MoE without shared experts (#14746)
pwilkin Jul 17, 2025
349ea79
use max work group size for device to replace the magic number (#14732)
NeoZhangJianyu Jul 18, 2025
09651d0
graph : Pass the graph placeholder message in debug mode (#14748)
Nexesenex Jul 18, 2025
8f974bc
graph : refactor context to not pass gf explicitly (#14629)
ggerganov Jul 18, 2025
f9a31ee
CUDA: set_rows + cpy.cu refactor (#14712)
am17an Jul 18, 2025
e0cb5c5
model : add EXAONE 4.0 support (#14630)
lgai-exaone Jul 18, 2025
eacdeb5
model : fix build after merge conflict (#14754)
ggerganov Jul 18, 2025
d498af3
graph : avoid huge warm-up graphs for MoE models (#14753)
ggerganov Jul 18, 2025
021cc28
cuda : Fix Gemma3n not executed as CUDA_GRAPH on NVGPUs (#14741)
ORippler Jul 18, 2025
2adf8d8
parallel : add option for different RNG seeds (#14757)
ggerganov Jul 18, 2025
9fb1042
graph : fix graph reuse reset of params (#14760)
ggerganov Jul 18, 2025
bf9087f
metal : fuse add, mul + add tests (#14596)
ggerganov Jul 18, 2025
b172309
sync : ggml
ggerganov Jul 19, 2025
f0d4d17
Documentation: Update build.md's Vulkan section (#14736)
rspOverflow Jul 19, 2025
83f5872
Vulkan: Fix fprintf format-security warning (#14770)
0cc4m Jul 19, 2025
d4b91ea
vulkan: Add logging for bf16 features to ggml_vk_print_gpu_info (#132…
Peter0x44 Jul 19, 2025
9008328
imatrix : use GGUF to store importance matrices (#9400)
compilade Jul 19, 2025
a979ca2
ggml: adds CONV_2D op and direct GEMM Vulkan implementation (#14316)
etasnadi Jul 19, 2025
36c1532
Contrib: add 0cc4m as codeowner for Vulkan backend (#14775)
0cc4m Jul 19, 2025
938b785
Clang-format: local files first + fix BinPacking (#14779)
am17an Jul 20, 2025
b526ad2
Documentation: Further revisions to the Vulkan section in build.md (#…
rspOverflow Jul 20, 2025
2be60cb
docs : fix link for tools/perplexity in README.md (#14780)
am17an Jul 20, 2025
b4efd77
server : add parse_special option to /tokenize endpoint (#14783)
IsaacDynamo Jul 21, 2025
c82d48e
llama : fix `--reverse-prompt` crashing issue (#14794)
MollySophia Jul 21, 2025
c2e058f
vulkan/cuda: Fix im2col when KW!=KH (#14789)
jeffbolznv Jul 21, 2025
2ba1333
docs : fix backends table in README.md (#14796)
rgerganov Jul 21, 2025
9220426
kleidiai: add support for get_rows (#14676)
chaxu01 Jul 21, 2025
cd465d8
sycl: Fix im2col (#14797)
Rbiessy Jul 21, 2025
6c9ee3b
opencl: add conv2d kernel (#14403)
rmatif Jul 21, 2025
38d3af1
opencl: fix `im2col` when `KW!=KH` (#14803)
CISC Jul 21, 2025
48b86c4
cuda: remove linking to cublasLt (#14790)
yeahdongcn Jul 21, 2025
adef817
server : allow setting `--reverse-prompt` arg (#14799)
MollySophia Jul 22, 2025
8e6f8bc
opencl: remove unreachable `return` (#14806)
lhez Jul 22, 2025
e28c0b8
cuda : implement bf16 cpy ops and enable bf16 cont (#14763)
CISC Jul 22, 2025
c8ade30
Mtmd: add a way to select device for vision encoder (#14236)
stduhpf Jul 22, 2025
d1aa0cc
imatrix: add option to display importance score statistics for a give…
EAddario Jul 22, 2025
d4d1522
llama : add model type detection for rwkv7 7B&14B (#14816)
MollySophia Jul 22, 2025
84712b6
vulkan: fix rms_norm_mul to handle broadcasting dim0 (#14817)
jeffbolznv Jul 22, 2025
acd6cb1
ggml : model card yaml tab->2xspace (#14819)
csabakecskemeti Jul 22, 2025
8c988fa
CUDA: add fused rms norm (#14800)
am17an Jul 23, 2025
14c28df
CANN: weight format to NZ for Ascend310P3 (#14407)
tqgy6 Jul 23, 2025
6c88b3b
ggml: fix loongarch quantize_row_q8_1 error (#14827)
lixing-star Jul 23, 2025
7233358
memory : handle saving/loading null layers in recurrent memory (#14675)
l3utterfly Jul 23, 2025
18f3b5f
tests : add non-cont K,V FA tests
ggerganov Jul 18, 2025
07a19e2
CUDA: fix quantized KV cache + multiple sequences (#14822)
JohannesGaessler Jul 23, 2025
221c0e0
ci : correct label refactor->refactoring (#14832)
CISC Jul 23, 2025
b284197
CUDA: fix compilation with GGML_CUDA_F16 (#14837)
JohannesGaessler Jul 23, 2025
a86f52b
CUDA: fix overflow in FA, tune performance (#14840)
JohannesGaessler Jul 23, 2025
a12363b
convert : text-only support for GLM-4.1V-9B-Thinking (#14823)
jacekpoplawski Jul 23, 2025
4ec6291
sycl: fix undefined variable in work group size check (#14843)
djeong20 Jul 24, 2025
065908c
metal : fix fusion across different encoders (#14849)
ggerganov Jul 24, 2025
39cffdf
docs: add libcurl-dev install hint for Linux distros (#14801)
PouyaGhahramanian Jul 24, 2025
86f5623
llama : fix MiniCPM inference after Granite Four changes (#14850)
jk3456a Jul 24, 2025
cb4a63a
sycl: fixed semantics of block offset calculation (#14814)
Jul 24, 2025
820de57
chat : fix kimi-k2 chat template (#14852)
ngxson Jul 24, 2025
e4868d1
context : perform output reorder lazily upon access after sync (#14853)
ggerganov Jul 24, 2025
5592f27
ggml-cpu : remove stdlib include from repack.cpp (ggml/1276)
danbev Jul 21, 2025
60f816a
cmake : fix usage issues (ggml/1257)
dg0yt Jul 22, 2025
2df255d
sync : ggml
ggerganov Jul 24, 2025
3f4fc97
musa: upgrade musa sdk to rc4.2.0 (#14498)
yeahdongcn Jul 24, 2025
c12bbde
sched : fix multiple evaluations of the same graph with pipeline para…
slaren Jul 25, 2025
64bf1c3
rpc : check for null buffers in get/set/copy tensor endpoints (#14868)
struct Jul 25, 2025
749e0d2
mtmd : fix 32-bit narrowing issue in export-lora and mtmd clip (#14503)
kiwi142857 Jul 25, 2025
c1dbea7
context : restore preemptive sched reset when LLAMA_SET_ROWS=0 (#14870)
ggerganov Jul 25, 2025
e2b7621
ggml : remove invalid portPos specifiers from dot files (#14838)
ORippler Jul 25, 2025
e7fecba
docs : update HOWTO‑add‑model.md for ModelBase and new model classes …
wooksong Jul 25, 2025
ce111d3
opencl: add fused `rms_norm_mul` (#14841)
lhez Jul 25, 2025
793c0d7
metal: SSM_SCAN performance (#14743)
gabe-l-hart Jul 25, 2025
c7f3169
ggml-cpu : disable GGML_NNPA by default due to instability (#14880)
taronaeo Jul 25, 2025
9b8f3c6
musa: fix build warnings (unused variable) (#14869)
yeahdongcn Jul 26, 2025
11dd5a4
CANN: Implement GLU ops (#14884)
hipudding Jul 26, 2025
66906cd
HIP: Enable Matrix cores for MMQ Kernels, Enable stream-K for CDNA 3 …
deepsek Jul 26, 2025
446595b
Docs: add instructions for adding backends (#14889)
am17an Jul 27, 2025
1dc9614
llama : fix kq_scale for the attention layers of PLaMo2 (#14892)
mitmul Jul 27, 2025
4762ad7
model : make rope_yarn_log_mul optional for deepseek2 (#14896)
gabriellarson Jul 27, 2025
f1a4e72
vulkan: skip empty set_rows to avoid invalid API usage (#14860)
jeffbolznv Jul 27, 2025
89d1029
vulkan : add fp16 support for the conv_2d kernel (#14872)
Green-Sky Jul 27, 2025
ca0ef2d
llama : clarify comment about pp and tg graphs [no ci] (#14895)
danbev Jul 27, 2025
bbfc849
SYCL: add ops doc (#14901)
qnixsynapse Jul 27, 2025
bf78f54
vulkan: add ops docs (#14900)
0cc4m Jul 27, 2025
7f97599
quantize : update README.md (#14905)
EAddario Jul 27, 2025
613c509
cmake : Indent ggml-config.cmake (ggml/1310)
dg0yt Jul 24, 2025
1f45f28
sync : ggml
ggerganov Jul 28, 2025
c35f9ea
ops : update Metal (#14912)
ggerganov Jul 28, 2025
a5771c9
ops : update BLAS (#14914)
ggerganov Jul 28, 2025
afc0e89
sycl: refactor quantization to q8_1 (#14815)
Jul 28, 2025
6c6e397
model : add support for SmallThinker series (#14898)
wdl339 Jul 28, 2025
946b1f6
CUDA: fix pointer incrementation in FA (#14916)
JohannesGaessler Jul 28, 2025
00fa15f
mtmd : add support for Voxtral (#14862)
ngxson Jul 28, 2025
cd1fce6
SYCL: Add set_rows support for quantized types (#14883)
qnixsynapse Jul 28, 2025
db16e28
ggml-cpu : deduplicate scalar implementations (#14897)
xctan Jul 28, 2025
c556418
llama-bench : use local GPUs along with RPC servers (#14917)
rgerganov Jul 28, 2025
bda6219
test-backend-ops : extend test case filtering (#14865)
tlemo Jul 28, 2025
8ad7b3e
opencl : add ops docs (#14910)
lhez Jul 28, 2025
0a5036b
CUDA: add roll (#14919)
am17an Jul 29, 2025
bbd0f91
server-bench: make seed choice configurable (#14929)
JohannesGaessler Jul 29, 2025
138b288
cuda : add softcap fusion (#14907)
CISC Jul 29, 2025
204f2cf
CANN: Add ggml_set_rows (#14943)
hipudding Jul 29, 2025
1a67fcc
common : avoid logging partial messages (which can contain broken UTF…
kallewoof Jul 29, 2025
c7aa136
HIP: Ignore unsupported unroll transformation in fattn-vec (#14931)
IMbackK Jul 29, 2025
b77d111
HIP: add GGML_HIP_MMQ_MFMA option to allow disableing the MFMA path. …
IMbackK Jul 29, 2025
aa79524
HIP: remove the use of __HIP_PLATFORM_AMD__, explicitly support only …
IMbackK Jul 29, 2025
61550f8
CANN: update ops docs (#14935)
bachelor-dou Jul 30, 2025
a118d80
embeddings: fix extraction of CLS pooling results (#14927)
iamlemec Jul 30, 2025
1e15bfd
graph : fix stack-use-after-return (#14960)
ggerganov Jul 30, 2025
00131d6
tests : update for LLAMA_SET_ROWS=1 (#14961)
ggerganov Jul 30, 2025
92b8810
CUDA: skip masked KV slices for all FA kernels (#14924)
JohannesGaessler Jul 30, 2025
73a8e5c
vulkan : fix 32-bit builds (ggml/1313)
dg0yt Jul 30, 2025
e228de9
cmake : Fix BLAS link interface (ggml/1316)
dg0yt Jul 30, 2025
e32a4ec
sync : ggml
ggerganov Jul 30, 2025
ad4a700
HIP: enable mfma mmq on gfx908 and gfx90a for select datatypes and sh…
IMbackK Jul 30, 2025
41e78c5
server : add support for `embd_normalize` parameter (#14964)
danbev Jul 30, 2025
e9192be
quantize : fix using combined imatrix GGUFs (multiple datasets) (#14973)
EAddario Jul 30, 2025
6e67254
opencl: add `mul_mat_f32_f32_l4_lm` and `mul_mat_f16_f32_l4_lm` (#14809)
lhez Jul 30, 2025
66625a5
graph : reduce splits for recurrent and hybrid models (#14825)
compilade Jul 31, 2025
11490b3
CANN: Improve loading efficiency after converting weights to NZ forma…
hipudding Jul 31, 2025
8a4a856
Add LLaDA 8b Diffusion model (#14771)
am17an Jul 31, 2025
a9f77a8
server : add openai-style logit_bias support (#14946)
lukasstraub2 Jul 31, 2025
c1dacaa
llama : merge build_moe_ffn_from_probs function into build_moe_ffn (#…
wdl339 Jul 31, 2025
94933c8
server : implement universal assisted decoding (#12635)
g2mt Jul 31, 2025
36e5fe7
MODEL_TENSOR.SSM_DT_NORM has defined twice (#14991)
csabakecskemeti Jul 31, 2025
952a47f
mtmd : support MiniCPM-V 4.0 (#14983)
tc-mb Jul 31, 2025
e08a988
Vulkan: Fix minor debug mode issues (#14899)
0cc4m Jul 31, 2025
d6818d0
llama : allow other bufts when overriding to CPU, add --no-repack opt…
slaren Jul 31, 2025
7845240
Fix params bug in diffusion example (#14993)
am17an Jul 31, 2025
a06ed5f
llama : add simple option to enable CPU for MoE weights (--cpu-moe) (…
slaren Jul 31, 2025
daf2dd7
quantize : skip tensor override when in fallback mode (#14995)
EAddario Jul 31, 2025
484b209
compare-commits.sh: support both llama-bench and test-backend-ops (#1…
yeahdongcn Aug 1, 2025
2860d47
docker : add cann build pipline (#14591)
diannaojiang Aug 1, 2025
ba42794
graph : fix equal_seq() check (#14986)
ggerganov Aug 1, 2025
baad948
ggml : Q2k interleaving implementation - x86/x64 SIMD (#14373)
Srihari-mcw Aug 1, 2025
1c872f7
opencl: add f16 for `add`, `sub`, `mul`, `div` (#14984)
lhez Aug 1, 2025
0f5ccd6
model : add hunyuan dense (#14878)
stevenkuang-tencent Aug 1, 2025
c76b420
vendor : update vendored copy of google/minja (#15011)
l-austenfeld Aug 1, 2025
9c35706
CUDA: fix MMQ nwarps for AMD with warp_size==32 (#15014)
JohannesGaessler Aug 1, 2025
a9f7541
vulkan: optimizations for direct convolution (#14933)
jeffbolznv Aug 2, 2025
f906275
server: enable token array inputs for OAI API (#15001)
JohannesGaessler Aug 2, 2025
339bd02
model : support Qwen3-Embedding (#15023)
iamlemec Aug 2, 2025
ec0b188
vulkan: Support ne[3]>1 in noncontig matrix-vector multiply (#15015)
jeffbolznv Aug 2, 2025
3025b62
llama-bench: rename DB table name from test to llama_bench (#15003)
yeahdongcn Aug 2, 2025
4cb208c
vulkan: coopmat2 mul_mat optimizations (#14934)
jeffbolznv Aug 2, 2025
f738989
chat : fix multiple tool_calls on hermes-2-pro (#14962)
jhen0409 Aug 2, 2025
711d5e6
convert : fix Qwen3-Embedding pre-tokenizer hash (#15030)
iamlemec Aug 2, 2025
2bf3fbf
ci : check that pre-tokenizer hashes are up-to-date (#15032)
CISC Aug 2, 2025
15e92fd
cuda, sycl : fix batched gemm when ne02 == 1 && ne03 > 1 (#15038)
ggerganov Aug 2, 2025
a4569c4
llama : enable LLAMA_SET_ROWS=1 by default (#14959)
ggerganov Aug 2, 2025
4fdea54
kv-cache : skip alignment of n_stream in kv-cache log msg [no ci] (#1…
danbev Aug 2, 2025
3303c19
cuda: make im2col a little faster (#15025)
leejet Aug 2, 2025
03d4698
CUDA: use mma FA kernel for gqa > 4 on RTX 4000 (#15035)
JohannesGaessler Aug 2, 2025
5c0eb5e
opencl: fix adreno compiler detection logic (#15029)
lhez Aug 2, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
164 changes: 164 additions & 0 deletions .clang-format
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
# clang-format style definition for the project's C/C++ sources.
# Keys that are commented out require a newer clang-format release than the
# project's minimum supported version; they are kept for future enablement.
---
Language: Cpp
AlignAfterOpenBracket: Align
AlignArrayOfStructures: Left
AlignConsecutiveAssignments: AcrossComments
AlignConsecutiveBitFields: AcrossComments
AlignConsecutiveDeclarations: AcrossComments
AlignConsecutiveMacros: AcrossComments
# AlignConsecutiveShortCaseStatements: AcrossComments
AlignEscapedNewlines: Left # LeftWithLastLine
AlignOperands: Align
AlignTrailingComments:
  Kind: Always
  OverEmptyLines: 1
AllowAllArgumentsOnNextLine: true
AllowAllParametersOfDeclarationOnNextLine: false
# AllowBreakBeforeNoexceptSpecifier: OnlyWithParen
AllowShortBlocksOnASingleLine: Never
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: Inline
AllowShortIfStatementsOnASingleLine: Never
AllowShortLambdasOnASingleLine: Inline
AllowShortLoopsOnASingleLine: false
AlwaysBreakBeforeMultilineStrings: true
BinPackArguments: false
BinPackParameters: false # OnePerLine
BitFieldColonSpacing: Both
# Custom brace wrapping: effectively "Attach" everywhere except after case labels.
BreakBeforeBraces: Custom # Attach
BraceWrapping:
  AfterCaseLabel: true
  AfterClass: false
  AfterControlStatement: false
  AfterEnum: false
  AfterFunction: false
  AfterNamespace: false
  AfterObjCDeclaration: false
  AfterStruct: false
  AfterUnion: false
  AfterExternBlock: false
  BeforeCatch: false
  BeforeElse: false
  BeforeLambdaBody: false
  BeforeWhile: false
  IndentBraces: false
  SplitEmptyFunction: false
  SplitEmptyRecord: false
  SplitEmptyNamespace: false
# BreakAdjacentStringLiterals: true
BreakAfterAttributes: Never
BreakBeforeBinaryOperators: None
BreakBeforeInlineASMColon: OnlyMultiline
BreakBeforeTernaryOperators: false
# BreakBinaryOperations: Never
BreakConstructorInitializers: AfterColon
# BreakFunctionDefinitionParameters: false
BreakInheritanceList: AfterComma
BreakStringLiterals: true
# BreakTemplateDeclarations: Yes
ColumnLimit: 120
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: false
DerivePointerAlignment: false
DisableFormat: false
EmptyLineBeforeAccessModifier: Leave
EmptyLineAfterAccessModifier: Never
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
# Include ordering: local "quoted" headers first, then <*.h>, then other
# angle-bracket includes, then everything else.
IncludeBlocks: Regroup
IncludeCategories:
  - Regex: '".*"'
    Priority: 1
    SortPriority: 0
  - Regex: '^<.*\.h>'
    Priority: 2
    SortPriority: 0
  - Regex: '^<.*'
    Priority: 3
    SortPriority: 0
  - Regex: '.*'
    Priority: 4
    SortPriority: 0
IncludeIsMainRegex: '([-_](test|unittest))?$'
IncludeIsMainSourceRegex: ''
IndentAccessModifiers: false
IndentCaseBlocks: true
IndentCaseLabels: true
IndentExternBlock: NoIndent
IndentGotoLabels: false
IndentPPDirectives: AfterHash
IndentWidth: 4
IndentWrappedFunctionNames: false
InsertBraces: true # NOTE: may lead to incorrect formatting
InsertNewlineAtEOF: true
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: false
LambdaBodyIndentation: Signature
LineEnding: LF
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBinPackProtocolList: Auto
ObjCBlockIndentWidth: 4
ObjCSpaceAfterProperty: true
ObjCSpaceBeforeProtocolList: true
PPIndentWidth: -1
PackConstructorInitializers: CurrentLine
PenaltyBreakAssignment: 2
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyBreakTemplateDeclaration: 10
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Middle
QualifierAlignment: Left
#QualifierOrder: ['static', 'inline', 'friend', 'constexpr', 'const', 'volatile', 'type', 'restrict']
RawStringFormats:
  - Language: Cpp
    Delimiters:
      - cc
      - CC
      - cpp
      - Cpp
      - CPP
      - 'c++'
      - 'C++'
    CanonicalDelimiter: ''
ReferenceAlignment: Middle
ReflowComments: false # IndentOnly
SeparateDefinitionBlocks: Always
SortIncludes: CaseInsensitive
SortUsingDeclarations: LexicographicNumeric
SpaceAfterCStyleCast: true
SpaceAfterLogicalNot: false
SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeCpp11BracedList: false
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpaceBeforeParens: ControlStatements
SpaceBeforeRangeBasedForLoopColon: true
SpaceInEmptyBlock: false
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 2
SpacesInAngles: Never
SpacesInContainerLiterals: true
SpacesInLineCommentPrefix:
  Minimum: 1
  Maximum: -1
SpacesInParentheses: false
SpacesInSquareBrackets: false
SpaceBeforeSquareBrackets: false
Standard: c++17
TabWidth: 4
UseTab: Never
WhitespaceSensitiveMacros: ['STRINGIZE']
...

3 changes: 3 additions & 0 deletions .clang-tidy
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,15 @@ Checks: >
-readability-magic-numbers,
-readability-uppercase-literal-suffix,
-readability-simplify-boolean-expr,
-readability-math-missing-parentheses,
clang-analyzer-*,
-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
performance-*,
portability-*,
-portability-simd-intrinsics,
misc-*,
-misc-const-correctness,
-misc-non-private-member-variables-in-classes,
-misc-no-recursion,
-misc-use-anonymous-namespace,
FormatStyle: none
130 changes: 130 additions & 0 deletions .devops/cann.Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
# ==============================================================================
# ARGUMENTS
# ==============================================================================

# Define the CANN base image for easier version updates later.
# The tag is pinned (8.1.rc1 / 910b / openEuler 22.03) for reproducible builds.
ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.1.rc1-910b-openeuler22.03-py3.10

# ==============================================================================
# BUILD STAGE
# Compile all binary files and libraries
# ==============================================================================
FROM ${CANN_BASE_IMAGE} AS build

# Define the Ascend chip model for compilation. Default is Ascend910B3.
# Override with: docker build --build-arg ASCEND_SOC_TYPE=<model> ...
ARG ASCEND_SOC_TYPE=Ascend910B3

# -- Install build dependencies --
# Install and clean up in the same layer so the yum cache never persists
# in the image.
# NOTE(review): on openEuler the C++ compiler package is normally `gcc-c++`;
# `g++` is kept as-is here since the CANN base image appears to resolve it —
# confirm against the actual image.
RUN yum install -y gcc g++ cmake make git libcurl-devel python3 python3-pip && \
    yum clean all && \
    rm -rf /var/cache/yum

# -- Set the working directory --
WORKDIR /app

# -- Copy project files --
COPY . .

# -- Set CANN environment variables (required for compilation) --
# Using ENV instead of `source` allows environment variables to persist across
# the entire image layer.
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${LD_LIBRARY_PATH}
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${PATH}
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
# ... You can add other environment variables from the original file as needed ...
# For brevity, only core variables are listed here.

# -- Build llama.cpp --
# `source set_env.sh` is still required at build time for toolkit variables not
# covered by the ENV lines above; it is a bashism, which works because the
# base image's /bin/sh is bash.
RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh --force \
    && \
    cmake -B build \
        -DGGML_CANN=ON \
        -DCMAKE_BUILD_TYPE=Release \
        -DSOC_TYPE=${ASCEND_SOC_TYPE} \
        . && \
    cmake --build build --config Release -j$(nproc)

# -- Organize build artifacts for copying in later stages --
# Create a lib directory to store all .so files
RUN mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;

# Create a full directory to store all executables and Python scripts.
# FIX: also copy .devops/tools.sh — the `full` target declares
# ENTRYPOINT ["/app/tools.sh"], which previously pointed at a file that was
# never copied into the image, so that container failed at startup.
RUN mkdir -p /app/full && \
    cp build/bin/* /app/full/ && \
    cp *.py /app/full/ && \
    cp -r gguf-py /app/full/ && \
    cp -r requirements /app/full/ && \
    cp requirements.txt /app/full/ && \
    cp .devops/tools.sh /app/full/tools.sh

# ==============================================================================
# BASE STAGE
# Create a minimal base image with CANN runtime and common libraries
# ==============================================================================
FROM ${CANN_BASE_IMAGE} AS base

# -- Install runtime dependencies --
# libgomp: OpenMP runtime needed by the compiled libraries (assumed — confirm
# against the link flags); curl: used by the server HEALTHCHECK below.
# Cleanup happens in the same layer so the yum cache is not baked in.
RUN yum install -y libgomp curl && \
    yum clean all && \
    rm -rf /var/cache/yum

# -- Set CANN environment variables (required for runtime) --
# /app is prepended to LD_LIBRARY_PATH because the build stage's .so files are
# copied directly into /app below.
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LD_LIBRARY_PATH=/app:${ASCEND_TOOLKIT_HOME}/lib64:${LD_LIBRARY_PATH}
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${PATH}
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
# ... You can add other environment variables from the original file as needed ...

WORKDIR /app

# Copy compiled .so files from the build stage.
# Trailing slash on the source: the *contents* of /app/lib land in /app.
# NOTE(review): no USER directive anywhere — all final targets run as root;
# Ascend NPU device access requirements may mandate this, confirm.
COPY --from=build /app/lib/ /app

# ==============================================================================
# FINAL STAGES (TARGETS)
# ==============================================================================

### Target: full
# Complete image with all tools, Python bindings, and dependencies
# ==============================================================================
FROM base AS full

# Binaries, conversion scripts, gguf-py and requirements land directly in /app.
COPY --from=build /app/full /app

# Install Python dependencies
# git is needed by some requirements installs; pip cache is disabled and the
# yum cache removed in the same layer to keep the image small.
RUN yum install -y git python3 python3-pip && \
    pip3 install --no-cache-dir --upgrade pip setuptools wheel && \
    pip3 install --no-cache-dir -r requirements.txt && \
    yum clean all && \
    rm -rf /var/cache/yum

# You need to provide a tools.sh script as the entrypoint
# NOTE(review): the build stage as written never copies tools.sh into
# /app/full, so this entrypoint fails at container start unless that copy
# is added to the build stage.
ENTRYPOINT ["/app/tools.sh"]
# If there is no tools.sh, you can set the default to start the server
# ENTRYPOINT ["/app/llama-server"]

### Target: light
# Lightweight image containing only llama-cli
# ==============================================================================
FROM base AS light

# Copies the single binary into the existing /app directory (WORKDIR in base).
COPY --from=build /app/full/llama-cli /app

ENTRYPOINT [ "/app/llama-cli" ]

### Target: server
# Dedicated server image containing only llama-server
# ==============================================================================
FROM base AS server

# Bind on all interfaces by default so the container port can be published.
ENV LLAMA_ARG_HOST=0.0.0.0

COPY --from=build /app/full/llama-server /app

# Exec-form health probe; curl is installed in the base stage.
# NOTE(review): no --start-period — large model loads may be reported
# unhealthy during startup; consider adding one.
HEALTHCHECK --interval=5m CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]
2 changes: 1 addition & 1 deletion .devops/cloud-v-pipeline
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ node('x86_runner1'){ // Running on x86 runner containing latest vecto
stage('Running llama.cpp'){
sh'''#!/bin/bash
module load gnu-bin2/0.1 # loading latest versions of vector qemu and vector gcc
qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./main -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./llama-cli -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
cat llama_log.txt # Printing results
'''
}
Expand Down
Loading