A 27B-parameter model with Multi-Token Prediction (MTP) for speculative decoding. It uses a Q8 nextn draft model for fast token generation. This is the best all-round configuration for coding.
Normal — optimized for coding, 32K context headless
# Normal profile: optimized for coding.
# Thinking is switched off through the chat template
# (--chat-template-kwargs '{"enable_thinking":false}').
# Max context when running headless: 32K (alternative flag: --ctx-size 32768).
# This preset passes --ctx-size 30000.

# 1. Set environment variables
# Library search path: the same directory that holds the llama-server binary.
export LD_LIBRARY_PATH="/home/eaman/llama/bin_vulkan"

# 2. Run the server
# The final line must NOT end with a backslash; a dangling continuation makes
# the shell append whatever comes next to this command's argument list.
# -ngl 99 is passed once (it used to be repeated with the same value).
/home/eaman/llama/bin_vulkan/llama-server \
  -m /home/eaman/lm/models/localweights/Qwen3.6-27B-MTP-IQ4_XS-Q8nextn-GGUF/Qwen3.6-27B-MTP-IQ4_XS-Q8nextn.gguf \
  --host 0.0.0.0 -np 1 -fa on \
  --fit-target 50 \
  -ctk q4_0 -ctv q4_0 \
  --temp 0.6 --top-k 30 --top-p 0.95 --min-p 0.0 \
  --repeat-penalty 1.0 --presence_penalty 0.0 \
  -b 128 \
  --jinja \
  --no-mmap \
  --spec-type draft-mtp --spec-draft-p-min 0.75 --spec-draft-n-max 3 \
  -ctkd q8_0 -ctvd q8_0 \
  --ctx-size 30000 \
  -ngl 99 \
  --reasoning-budget 1 --chat-template-kwargs '{"enable_thinking":false}' \
  --cache-ram 6000 -lv 4 --no-warmup
Think — reasoning mode with higher temperature
# Think profile: optimized for coding, with no flag disabling thinking
# and --temp 0.7.
# Max context when running headless: 32K (alternative flag: --ctx-size 32768).
# This preset passes --ctx-size 30000.

# 1. Set environment variables
# Library search path: the same directory that holds the llama-server binary.
export LD_LIBRARY_PATH="/home/eaman/llama/bin_vulkan"

# Model files that have been used with this preset (pass one of them to -m):
#   /home/eaman/lm/models/mradermacher/Qwen3.6-27B/Qwen3.6-27B.i1-IQ4_XS-attn_qkv-IQ4_XS.gguf
#   /home/eaman/lm/models/localweights/Qwen3.6-27B-MTP-IQ4_XS-Q8nextn-GGUF/Qwen3.6-27B-MTP-IQ4_XS-Q8nextn.gguf
# Alternative speculative-decoding setting: --spec-draft-n-max 2

# 2. Run the server
# The final line must NOT end with a backslash; a dangling continuation makes
# the shell append whatever comes next to this command's argument list.
# -ngl 99 is passed once (it used to be repeated with the same value).
/home/eaman/llama/bin_vulkan/llama-server \
  -m /home/eaman/lm/models/localweights/Qwen3.6-27B-MTP-IQ4_XS-Q8nextn-GGUF/Qwen3.6-27B-MTP-IQ4_XS-Q8nextn.gguf \
  --host 0.0.0.0 -np 1 \
  --fit-target 50 \
  -ctk q5_1 \
  -ctv q4_0 \
  -fa on \
  --temp 0.7 --top-k 30 --top-p 0.95 --min-p 0.0 \
  --repeat-penalty 1.0 --presence_penalty 0.0 \
  -b 128 \
  --jinja \
  --no-mmap \
  --spec-type draft-mtp --spec-draft-p-min 0.75 --spec-draft-n-max 3 \
  -ctkd q8_0 -ctvd q8_0 \
  --ctx-size 30000 \
  -ngl 99 \
  --cache-ram 6000 -lv 3 --no-warmup
Chat — 81K context, higher temp for conversational use
# Chat profile: conversational sampling (--temp 1.0, --top-k 20,
# --presence_penalty 1.5).
# Max context: 81K headless.
# NOTE(review): these notes mention 81K and also "max in headless:
# --ctx-size 24576", while the command below passes --ctx-size 32768.
# Confirm which value is intended.

# 1. Set environment variables
# Library search path: the same directory that holds the llama-server binary.
export LD_LIBRARY_PATH="/home/eaman/llama/bin_vulkan"

# 2. Run the server
# The final line must NOT end with a backslash; a dangling continuation makes
# the shell append whatever comes next to this command's argument list.
/home/eaman/llama/bin_vulkan/llama-server \
  -m /home/eaman/lm/models/localweights/Qwen3.6-27B-MTP-IQ4_XS-Q8nextn-GGUF/Qwen3.6-27B-MTP-IQ4_XS-Q8nextn.gguf \
  --host 0.0.0.0 \
  -np 1 \
  --fit-target 30 \
  -ctk q4_0 -ctv q4_0 \
  -fa on \
  --temp 1.0 \
  --top-p 0.95 --min-p 0.0 --top-k 20 \
  --repeat-penalty 1.0 --presence_penalty 1.5 \
  -b 128 \
  --jinja \
  --no-mmap \
  --spec-type draft-mtp --spec-draft-p-min 0.75 --spec-draft-n-max 2 \
  -ctkd q8_0 -ctvd q8_0 \
  --ctx-size 32768 \
  -ngl 99 \
  -lv 4 \
  --no-warmup
Optimized — 81K context, tuned batch size and cache
# Optimized profile: optimized for coding, with a larger batch (-b 512) and
# --cache-ram 4000. Thinking is switched off through the chat template
# (--chat-template-kwargs '{"enable_thinking":false}').
# Max context: 81K headless.
# NOTE(review): no --ctx-size flag is passed below, so the context size is
# whatever llama-server picks on its own. Confirm this yields the 81K above.
# 1. Set environment variables
# Library search path: the same directory that holds the llama-server binary.
export LD_LIBRARY_PATH="/home/eaman/llama/bin_vulkan"
# Small-context alternative: --ctx-size 4000
# 2. Run the server
# NOTE(review): the last line of this command ends with a continuation
# backslash, so the shell appends the following line to the argument list.
# Drop that backslash if nothing is meant to follow.
/home/eaman/llama/bin_vulkan/llama-server \
-m /home/eaman/lm/models/localweights/Qwen3.6-27B-MTP-IQ4_XS-Q8nextn-GGUF/Qwen3.6-27B-MTP-IQ4_XS-Q8nextn.gguf \
--host 0.0.0.0 -fa on -np 1 --no-mmap \
--fit-target 50 \
-ctk q4_0 -ctv q4_0 \
--temp 0.6 --min-p 0.1 \
--repeat-penalty 1.0 \
--presence_penalty 0.0 \
-b 512 \
--jinja \
--spec-type draft-mtp --spec-draft-p-min 0.75 --spec-draft-n-max 3 \
-ctkd q8_0 -ctvd q8_0 \
--reasoning-budget 1 --chat-template-kwargs '{"enable_thinking":false}' \
--cache-ram 4000 -ngl 99 -lv 4 \