← Back to index

Qwen3.6-35B-A3B MoE

Mixture-of-Experts model with 35B total / 3B active parameters. Multiple configs ranging from fast IQ3_M quant to larger Q4_K_XL.

qwen_MoE.sh_ngram

Normal — IQ3_M quant, ngram speculative decoding

# optimized for coding
# Starts llama-server from the bin_vulkan build with the IQ3_M quant and
# ngram-mod speculative decoding. enable_thinking=false is passed to the chat
# template. No --ctx-size is given in this config.
#
# 1. Set Environment Variables
# Put the bin_vulkan directory first on the loader search path while keeping
# any LD_LIBRARY_PATH the caller already exported (previously it was discarded).
export LD_LIBRARY_PATH="/home/eaman/llama/bin_vulkan${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

# 2. Run the Server
# NOTE(review): --host 0.0.0.0 listens on every interface and no API key is
# passed here -- confirm the machine is only reachable from a trusted network.
# The last argument line intentionally has no trailing backslash, so text placed
# after this command can never be absorbed into its argument list.
/home/eaman/llama/bin_vulkan/llama-server \
    -m /home/eaman/lm/models/mradermacher/Qwen3.6-35B-A3B.i1-IQ3_M.gguf \
    --host 0.0.0.0 -np 1 -fa on --no-mmap \
    -ctk q8_0 \
    -ctv q5_0 \
    --temp 0.6 --top-k 30 --top-p 0.95 --min-p 0.0 \
    --repeat-penalty 1.0 \
    -b 512 \
    --fit-target 60 \
    --jinja \
    --chat-template-kwargs '{"enable_thinking":false}' \
    --reasoning-budget 1 \
    --spec-type ngram-mod \
    --spec-ngram-mod-n-match 8 \
    --spec-ngram-mod-n-min 3 \
    --spec-ngram-mod-n-max 24 \
    --cache-ram 6000 -ngl 99 -lv 3 --no-warmup

qwen_MoE.sh_ngram_think

Think — 121K context, lower fit-target (40 instead of 60), thinking left enabled

# optimized for coding
# Starts llama-server from the bin_vulkan build with the IQ3_M quant, a
# 121000-token context (--ctx-size) and ngram-mod speculative decoding.
# Unlike the non-thinking configs, no enable_thinking override or
# --reasoning-budget is passed here.
#
# 1. Set Environment Variables
# Put the bin_vulkan directory first on the loader search path while keeping
# any LD_LIBRARY_PATH the caller already exported (previously it was discarded).
export LD_LIBRARY_PATH="/home/eaman/llama/bin_vulkan${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

# 2. Run the Server
# NOTE(review): --host 0.0.0.0 listens on every interface and no API key is
# passed here -- confirm the machine is only reachable from a trusted network.
# The last argument line intentionally has no trailing backslash, so text placed
# after this command can never be absorbed into its argument list.
/home/eaman/llama/bin_vulkan/llama-server \
    -m /home/eaman/lm/models/mradermacher/Qwen3.6-35B-A3B.i1-IQ3_M.gguf \
    --host 0.0.0.0 \
    -np 1 \
    -ctk q4_0 \
    -ctv q4_0 \
    -fa on \
    --temp 0.6 --top-k 30 --top-p 0.95 --min-p 0.0 \
    --repeat-penalty 1.0 \
    -b 512 \
    --fit-target 40 \
    --ctx-size 121000 \
    --jinja \
    --no-mmap \
    --spec-type ngram-mod \
    --spec-ngram-mod-n-match 8 \
    --spec-ngram-mod-n-min 3 \
    --spec-ngram-mod-n-max 24 \
    --cache-ram 6000 -ngl 99 -lv 3 --no-warmup

fastest_MoE.sh_mtd

Fastest MTD — IQ3_S UD quant with MTP draft

# optimized for coding
# Starts llama-server from the bin_vulkan build with the MTD-UD-IQ3_S model
# file, a 42000-token context (--ctx-size) and --spec-type draft-mtp.
# enable_thinking=false is passed to the chat template.
#
# 1. Set Environment Variables
# Put the bin_vulkan directory first on the loader search path while keeping
# any LD_LIBRARY_PATH the caller already exported (previously it was discarded).
export LD_LIBRARY_PATH="/home/eaman/llama/bin_vulkan${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

# Disabled alternative (the command below uses --ctx-size 42000 instead):
#    --ctx-size 30000

# 2. Run the Server
# NOTE(review): --host 0.0.0.0 listens on every interface and no API key is
# passed here -- confirm the machine is only reachable from a trusted network.
# The last argument line intentionally has no trailing backslash, so text placed
# after this command can never be absorbed into its argument list.
/home/eaman/llama/bin_vulkan/llama-server \
    -m /home/eaman/lm/models/unsloth/Qwen3.6-35B-A3B-GGUF/Qwen3.6-35B-A3B-MTD-UD-IQ3_S.gguf \
    --host 0.0.0.0 \
    -np 1 \
    -ctk q5_0 -ctv q4_0 \
    -fa on \
    --temp 0.55 \
    --top-p 0.9 \
    --min-p 0.0 \
    --repeat-penalty 1.0 \
    -b 512 \
    --fit-target 60 \
    --jinja \
    --no-mmap \
    --ctx-size 42000 \
    --chat-template-kwargs '{"enable_thinking":false}' \
    --reasoning-budget 1 \
    --spec-type draft-mtp --spec-draft-p-min 0.75 --spec-draft-n-max 2 \
    -ctkd q8_0 -ctvd q8_0 \
    -lv 4 --no-warmup


moe_mtd_xl.sh

MTD XL — Q4_K_XL quant, 80K fit-ctx

# optimized for coding
# Starts llama-server from the bin_vulkan build with the MTD-UD-Q4_K_XL model
# file, --fit-ctx 80000 and --spec-type draft-mtp. enable_thinking=false is
# passed to the chat template. No explicit --ctx-size is given in this config.
#
# 1. Set Environment Variables
# Put the bin_vulkan directory first on the loader search path while keeping
# any LD_LIBRARY_PATH the caller already exported (previously it was discarded).
export LD_LIBRARY_PATH="/home/eaman/llama/bin_vulkan${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

# Disabled alternative (not passed to the command below):
#    --ctx-size 30000

# 2. Run the Server
# NOTE(review): --host 0.0.0.0 listens on every interface and no API key is
# passed here -- confirm the machine is only reachable from a trusted network.
# The last argument line intentionally has no trailing backslash, so text placed
# after this command can never be absorbed into its argument list.
/home/eaman/llama/bin_vulkan/llama-server \
    -m /home/eaman/lm/models/unsloth/Qwen3.6-35B-A3B-GGUF/Qwen3.6-35B-A3B-MTD-UD-Q4_K_XL.gguf \
    --host 0.0.0.0 -np 1 -fa on \
    -ctk q5_0 \
    -ctv q4_0 \
    --temp 0.55 --top-p 0.9 --min-p 0.0 \
    --repeat-penalty 1.0 \
    -b 512 \
    --fit-ctx 80000 \
    --jinja \
    --no-mmap \
    --chat-template-kwargs '{"enable_thinking":false}' \
    --reasoning-budget 1 \
    --spec-type draft-mtp --spec-draft-p-min 0.75 --spec-draft-n-max 2 \
    -ctkd q8_0 -ctvd q8_0 \
    -lv 4 --no-warmup