# Base configuration for the 35B-A3B UD model with Q4_K_S quantization. A good reference config with an 80K context.
# Base — Q4_K_S UD, ngram speculative decoding
# Optimized for coding.
# 1. Set Environment Variables
# Put the llama.cpp Vulkan build directory first on the dynamic-linker search
# path so the shared libraries shipped next to llama-server are found.
# Any LD_LIBRARY_PATH already set by the caller is kept after it instead of
# being overwritten; the ${VAR:+...} form avoids a stray ':' when it was
# unset or empty.
export LD_LIBRARY_PATH="/home/eaman/llama/bin_vulkan${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
# 2. Run the Server
# Starts llama-server (Vulkan build) with the Qwen3.6-35B-A3B UD Q4_K_S GGUF.
# Flag groups, in the order they appear below (`llama-server --help` is the
# authoritative reference for this build):
#   -m                       model file to load
#   --host 0.0.0.0           listen on all network interfaces
#                            NOTE(review): this makes the server reachable from
#                            other machines and no --api-key is passed here;
#                            confirm the host sits on a trusted network.
#   -np 1                    one parallel slot
#   -ctk q5_0 / -ctv q4_0    quantized K / V cache types
#   -fa on                   flash attention enabled
#   --temp/--top-p/--top-k/--min-p/--repeat-penalty
#                            sampling defaults for this config
#   -b 512                   batch size
#   --fit-target 50          presumably the memory margin used when fitting the
#                            model to the device — TODO confirm units
#   --ctx-size 80000         context window (the "80K context" of this config)
#   --jinja, --chat-template-kwargs
#                            Jinja chat template, with enable_thinking=false
#   --reasoning-budget 1     NOTE(review): verify which values this build accepts
#   --no-mmap                do not memory-map the model file
#   --spec-type ngram-mod, --spec-ngram-mod-n-*
#                            ngram speculative decoding and its match/min/max
#                            settings
#   -lv 4                    log verbosity level 4
# The last argument line deliberately has NO trailing backslash: a dangling
# line continuation would splice whatever line follows into this command.
/home/eaman/llama/bin_vulkan/llama-server \
  -m /home/eaman/lm/models/unsloth/Qwen3.6-35B-A3B-GGUF/Qwen3.6-35B-A3B-UD-Q4_K_S.gguf \
  --host 0.0.0.0 \
  -np 1 \
  -ctk q5_0 \
  -ctv q4_0 \
  -fa on \
  --temp 0.55 \
  --top-p 0.9 \
  --top-k 30 \
  --min-p 0.0 \
  --repeat-penalty 1.0 \
  -b 512 \
  --fit-target 50 \
  --ctx-size 80000 \
  --jinja \
  --chat-template-kwargs '{"enable_thinking":false}' \
  --reasoning-budget 1 \
  --no-mmap \
  --spec-type ngram-mod \
  --spec-ngram-mod-n-match 8 \
  --spec-ngram-mod-n-min 3 \
  --spec-ngram-mod-n-max 24 \
  -lv 4