# Qwen3.6-35B-A3B: Mixture-of-Experts model with 35B total / 3B active parameters. Multiple llama-server configurations follow, ranging from the fast IQ3_M quant to the larger Q4_K_XL.
# Normal — IQ3_M quant, ngram-mod speculative decoding, thinking disabled
# Optimized for coding.
#
# Launches llama-server on the mradermacher IQ3_M GGUF with:
#   * the chat template kwarg "enable_thinking" set to false and
#     --reasoning-budget 1,
#   * ngram-mod speculation (n-match 8, n-min 3, n-max 24),
#   * cache types q8_0 (-ctk) and q5_0 (-ctv),
#   * no explicit --ctx-size.
#
# NOTE: --host 0.0.0.0 listens on every interface and no --api-key is passed,
# so only run this on a trusted network.
# NOTE: the last argument line must NOT end in a backslash; a dangling
# continuation makes the shell append the next line of this file to the
# command line.

# 1. Set environment variables
# Points the dynamic loader at the directory that also holds the binary
# (presumably where the Vulkan build keeps its shared libraries — confirm).
export LD_LIBRARY_PATH="/home/eaman/llama/bin_vulkan"

# 2. Run the server
/home/eaman/llama/bin_vulkan/llama-server \
  -m /home/eaman/lm/models/mradermacher/Qwen3.6-35B-A3B.i1-IQ3_M.gguf \
  --host 0.0.0.0 -np 1 -fa on --no-mmap \
  -ctk q8_0 \
  -ctv q5_0 \
  --temp 0.6 --top-k 30 --top-p 0.95 --min-p 0.0 \
  --repeat-penalty 1.0 \
  -b 512 \
  --fit-target 60 \
  --jinja \
  --chat-template-kwargs '{"enable_thinking":false}' \
  --reasoning-budget 1 \
  --spec-type ngram-mod \
  --spec-ngram-mod-n-match 8 \
  --spec-ngram-mod-n-min 3 \
  --spec-ngram-mod-n-max 24 \
  --cache-ram 6000 -ngl 99 -lv 3 --no-warmup
# Think — no enable_thinking override, 121K context (--ctx-size 121000), fit-target 40 (lower than Normal's 60)
# Optimized for coding.
#
# Launches llama-server on the mradermacher IQ3_M GGUF with:
#   * no --chat-template-kwargs / --reasoning-budget override, so the chat
#     template's own default reasoning behaviour applies,
#   * --ctx-size 121000 and --fit-target 40,
#   * cache types q4_0 for both -ctk and -ctv,
#   * ngram-mod speculation (n-match 8, n-min 3, n-max 24).
#
# NOTE: --host 0.0.0.0 listens on every interface and no --api-key is passed,
# so only run this on a trusted network.
# NOTE: the last argument line must NOT end in a backslash; a dangling
# continuation makes the shell append the next line of this file to the
# command line.

# 1. Set environment variables
# Points the dynamic loader at the directory that also holds the binary
# (presumably where the Vulkan build keeps its shared libraries — confirm).
export LD_LIBRARY_PATH="/home/eaman/llama/bin_vulkan"

# 2. Run the server
/home/eaman/llama/bin_vulkan/llama-server \
  -m /home/eaman/lm/models/mradermacher/Qwen3.6-35B-A3B.i1-IQ3_M.gguf \
  --host 0.0.0.0 \
  -np 1 \
  -ctk q4_0 \
  -ctv q4_0 \
  -fa on \
  --temp 0.6 --top-k 30 --top-p 0.95 --min-p 0.0 \
  --repeat-penalty 1.0 \
  -b 512 \
  --fit-target 40 \
  --ctx-size 121000 \
  --jinja \
  --no-mmap \
  --spec-type ngram-mod \
  --spec-ngram-mod-n-match 8 \
  --spec-ngram-mod-n-min 3 \
  --spec-ngram-mod-n-max 24 \
  --cache-ram 6000 -ngl 99 -lv 3 --no-warmup
# Fastest MTD — UD-IQ3_S quant with MTP draft speculative decoding, 42K context
# Optimized for coding.
#
# Launches llama-server on the unsloth MTD-UD-IQ3_S GGUF with:
#   * the chat template kwarg "enable_thinking" set to false and
#     --reasoning-budget 1,
#   * --spec-type draft-mtp (p-min 0.75, n-max 2) with q8_0 for both -ctkd
#     and -ctvd,
#   * cache types q5_0 (-ctk) and q4_0 (-ctv),
#   * --ctx-size 42000 and --fit-target 60.
# Unlike the ngram-mod recipes, no --top-k, --cache-ram or -ngl is passed.
#
# NOTE: --host 0.0.0.0 listens on every interface and no --api-key is passed,
# so only run this on a trusted network.
# NOTE: the last argument line must NOT end in a backslash; a dangling
# continuation makes the shell append the next line of this file to the
# command line.

# 1. Set environment variables
# Points the dynamic loader at the directory that also holds the binary
# (presumably where the Vulkan build keeps its shared libraries — confirm).
export LD_LIBRARY_PATH="/home/eaman/llama/bin_vulkan"

# Disabled alternative, kept for reference (the command below passes
# --ctx-size 42000 instead):
#   --ctx-size 30000

# 2. Run the server
/home/eaman/llama/bin_vulkan/llama-server \
  -m /home/eaman/lm/models/unsloth/Qwen3.6-35B-A3B-GGUF/Qwen3.6-35B-A3B-MTD-UD-IQ3_S.gguf \
  --host 0.0.0.0 \
  -np 1 \
  -ctk q5_0 -ctv q4_0 \
  -fa on \
  --temp 0.55 \
  --top-p 0.9 \
  --min-p 0.0 \
  --repeat-penalty 1.0 \
  -b 512 \
  --fit-target 60 \
  --jinja \
  --no-mmap \
  --ctx-size 42000 \
  --chat-template-kwargs '{"enable_thinking":false}' \
  --reasoning-budget 1 \
  --spec-type draft-mtp --spec-draft-p-min 0.75 --spec-draft-n-max 2 \
  -ctkd q8_0 -ctvd q8_0 \
  -lv 4 --no-warmup
# MTD XL — UD-Q4_K_XL quant with MTP draft speculative decoding, --fit-ctx 80000
# Optimized for coding.
#
# Launches llama-server on the unsloth MTD-UD-Q4_K_XL GGUF with:
#   * the chat template kwarg "enable_thinking" set to false and
#     --reasoning-budget 1,
#   * --spec-type draft-mtp (p-min 0.75, n-max 2) with q8_0 for both -ctkd
#     and -ctvd,
#   * cache types q5_0 (-ctk) and q4_0 (-ctv),
#   * --fit-ctx 80000; neither --ctx-size nor --fit-target is passed.
# Unlike the ngram-mod recipes, no --top-k, --cache-ram or -ngl is passed.
#
# NOTE: --host 0.0.0.0 listens on every interface and no --api-key is passed,
# so only run this on a trusted network.
# NOTE: the last argument line must NOT end in a backslash; a dangling
# continuation makes the shell append whatever follows in this file to the
# command line.

# 1. Set environment variables
# Points the dynamic loader at the directory that also holds the binary
# (presumably where the Vulkan build keeps its shared libraries — confirm).
export LD_LIBRARY_PATH="/home/eaman/llama/bin_vulkan"

# Disabled alternative, kept for reference (the command below uses
# --fit-ctx 80000 and sets no explicit --ctx-size):
#   --ctx-size 30000

# 2. Run the server
/home/eaman/llama/bin_vulkan/llama-server \
  -m /home/eaman/lm/models/unsloth/Qwen3.6-35B-A3B-GGUF/Qwen3.6-35B-A3B-MTD-UD-Q4_K_XL.gguf \
  --host 0.0.0.0 -np 1 -fa on \
  -ctk q5_0 \
  -ctv q4_0 \
  --temp 0.55 --top-p 0.9 --min-p 0.0 \
  --repeat-penalty 1.0 \
  -b 512 \
  --fit-ctx 80000 \
  --jinja \
  --no-mmap \
  --chat-template-kwargs '{"enable_thinking":false}' \
  --reasoning-budget 1 \
  --spec-type draft-mtp --spec-draft-p-min 0.75 --spec-draft-n-max 2 \
  -ctkd q8_0 -ctvd q8_0 \
  -lv 4 --no-warmup