IQ4_XS quantized 27B model with ngram-based speculative decoding. Lighter quantization for faster inference at the cost of some quality.
Normal — ngram-mod speculative decoding, coding-optimized
# "Normal" profile: optimized for coding.
# max context: 81K headless
#
# Launches llama-server on the IQ4_XS model with ngram-mod speculative
# decoding. Flag summary (see `llama-server --help` for the authoritative
# meaning in the installed build):
#   --host 0.0.0.0             listen on all network interfaces
#   -np 1 / -fa on / --no-mmap single slot, flash attention on, no mmap loading
#   --fit-target 50            NOTE(review): presumably the memory margin used
#                              when auto-fitting the model/context — confirm
#   -ctk q8_0 / -ctv q5_0      quantized cache types for K and V
#   --temp 0.5 ...             sampling: temperature 0.5, min-p 0.0,
#                              repeat penalty 1.0, presence penalty 0.0
#   -b 512                     batch size
#   --jinja + kwargs           Jinja chat template with enable_thinking=false
#   --spec-*                   speculative decoding type "ngram-mod" and its
#                              n-match / n-min / n-max settings
#   -lv 4                      log verbosity level 4

# 1. Set Environment Variables
# Library search path is the same directory that holds the llama-server binary.
export LD_LIBRARY_PATH="/home/eaman/llama/bin_vulkan"

# 2. Run the Server
# The final argument deliberately has NO trailing backslash: a dangling
# continuation would splice whatever line follows into this command.
/home/eaman/llama/bin_vulkan/llama-server \
  -m /home/eaman/lm/models/mradermacher/Qwen3.6-27B/Qwen3.6-27B.i1-IQ4_XS-attn_qkv-IQ4_XS.gguf \
  --host 0.0.0.0 -np 1 -fa on --no-mmap \
  --fit-target 50 \
  -ctk q8_0 \
  -ctv q5_0 \
  --temp 0.5 --min-p 0.0 \
  --repeat-penalty 1.0 --presence_penalty 0.0 \
  -b 512 \
  --jinja \
  --reasoning-budget 1 \
  --chat-template-kwargs '{"enable_thinking":false}' \
  --spec-type ngram-mod \
  --spec-ngram-mod-n-match 8 \
  --spec-ngram-mod-n-min 3 \
  --spec-ngram-mod-n-max 24 \
  -lv 4
Think — higher temperature for reasoning
# optimized for coding
# max context: 81K headless
# 1. Set Environment Variables
# LD_LIBRARY_PATH is set to the same directory that contains the llama-server
# binary launched below. Note this assignment replaces any value already set.
export LD_LIBRARY_PATH="/home/eaman/llama/bin_vulkan"
# 2. Run the Server
/home/eaman/llama/bin_vulkan/llama-server \
-m /home/eaman/lm/models/mradermacher/Qwen3.6-27B/Qwen3.6-27B.i1-IQ4_XS-attn_qkv-IQ4_XS.gguf \
--host 0.0.0.0 -np 1 -fa on --no-mmap \
--fit-target 40 \
-ctk q8_0 \
-ctv q5_0 \
--temp 0.8 \
--min-p 0.0 \
--repeat-penalty 1.0 --presence_penalty 0.0 \
-b 512 \
--jinja \