# 27B uncensored heretic v2 with MTP speculative decoding. ~128K context on a single 16 GB GPU
# with the KV cache quantized to Q8 (K) / Q5 (V).
# "Think" profile: reasoning on, MTP draft speculative decoding, 128K context.
# optimized for coding
# Max context: 116736 (KDE, n=2) at 31.15 t/s.
# 1. Set environment variables
# Directory of the Vulkan llama.cpp build. It holds the llama-server binary and is
# also placed on the dynamic loader path (presumably its shared libraries live
# there too -- confirm against the install).
llama_bin_dir="/home/eaman/llama/bin_vulkan"
# Prepend rather than overwrite, so loader paths already present in the
# environment are kept; the llama directory still takes priority.
export LD_LIBRARY_PATH="${llama_bin_dir}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

# Headless
# Context-size options kept for reference. They are not passed by default; give
# one as an extra argument to this script to use it:
#   --ctx-size 32768
#   -c 116000

# 2. Run the server
# Flag groups, in the order they appear below. Meanings follow `llama-server --help`;
# NOTE(review): confirm the newer flags against the installed build.
#   -m                        model file (GGUF)
#   --host 0.0.0.0 ...        listen on all interfaces (reachable from the network),
#                             one slot, flash attention on, no mmap, Jinja chat template
#   -ctk / -ctv               main-model KV cache types: K = q8_0, V = q5_0
#   --temp ... --min-p        sampling parameters
#   --presence-penalty ...    penalties left at neutral values (0.0 / 1.0)
#   -b / -ub                  batch and micro-batch sizes
#   --fit-target              memory-fit target (presumably headroom in MiB -- verify)
#   --spec-*                  MTP draft speculative decoding, at most 1 drafted token
#   --cache-type-*-draft      KV cache types used for the draft (q4_0)
#   --reasoning*              reasoning on, budget 6096, message emitted when exceeded
#   --cache-ram ... last row  cache RAM limit, GPU layers, log verbosity, skip warmup
# Arguments given to this script are appended last via "$@", so options can be
# added or overridden without editing the file. The command is terminated
# explicitly: no trailing line continuation is left dangling at the end.
"${llama_bin_dir}/llama-server" \
  -m /mnt/large/LM/models/mradermacher/Qwen3.6-27B-uncensored-heretic-v2-Native-MTP-Preserved.i1-IQ3_M.gguf \
  --host 0.0.0.0 -np 1 -fa on --no-mmap --jinja \
  -ctk q8_0 -ctv q5_0 \
  --temp 0.6 --top-k 30 --top-p 0.95 --min-p 0.0 \
  --presence-penalty 0.0 --repeat-penalty 1.0 \
  -b 1024 -ub 128 \
  --fit-target 60 \
  --spec-type draft-mtp --spec-draft-p-min 0.75 --spec-draft-n-max 1 \
  --cache-type-k-draft q4_0 --cache-type-v-draft q4_0 \
  --reasoning on --reasoning-budget 6096 --reasoning-budget-message " -- Reasoning budget exceeded, proceed to final answer." \
  --cache-ram 6000 -ngl 99 -lv 3 --no-warmup \
  "$@"