← Back to index

Qwen3.6-35B-A3B MoE

35B total / 3B active MoE. ByteShape IQ3_S 3.06bpw MTP variant (128K ctx, ~140 t/s) and IQ3_S 3.48bpw no-MTP variant (200K ctx), both running on a single 16GB AMD 6800.

fastest_moe.sh

MTP Think — IQ3_S, reasoning on, MTP, 128K ctx

# fastest_moe.sh: start llama-server (Vulkan build) with the ByteShape
# Qwen3.6-35B-A3B IQ3_S 3.06bpw model, MTP speculative decoding and
# reasoning enabled. Sampling settings are optimized for coding.
#
# Author's measurements (headless), per MTP draft setting:
#   ctx  : n2 = 128000, n1 = 140544, none = 15978
#   speed: n2 = 122.76 t/s, n1 = 115.20 t/s, none = 106.13 t/s
# NOTE(review): n2/n1 presumably mean --spec-draft-n-max 2/1 -- confirm.
# NOTE(review): the "none" ctx value (15978) is far below the MTP values and
# may be missing a digit -- confirm against the original measurement.

# 1. Set Environment Variables
# Point the dynamic loader at the directory that holds the llama-server binary
# (presumably where its shared libraries live -- confirm).
export LD_LIBRARY_PATH="/home/eaman/llama/bin_vulkan"

# 2. Run the Server
# Notes on the invocation below:
#   * --host 0.0.0.0 listens on every network interface, not just localhost.
#   * Context size is not set explicitly; to pin it, add a line such as
#         --ctx-size 30000 \
#   * The last option line must NOT end with a backslash: a trailing
#     continuation would splice whatever line follows into this command.
/home/eaman/llama/bin_vulkan/llama-server \
    -m /home/eaman/lm/models/byteshape/Qwen3.6-35B-A3B-GGUF/Qwen3.6-35B-A3B-IQ3_S-3.06bpw.gguf \
    --host 0.0.0.0 \
    -np 1 \
    -ctk q8_0 -ctv q5_0 \
    -fa on \
    --temp 0.7 --top-k 25 --top-p 0.95 --min-p 0.0 \
    --repeat-penalty 1.0 --presence-penalty 0.0 \
    -b 512 \
    --fit-target 60 \
    --jinja \
    --no-mmap \
    --spec-type draft-mtp --spec-draft-p-min 0.75 --spec-draft-n-max 2 \
    --cache-type-k-draft q4_0 --cache-type-v-draft q4_0 \
    --reasoning on --reasoning-budget 4096 \
    --reasoning-budget-message " -- Reasoning budget exceeded, proceed to final answer." \
    -lv 3 --no-warmup -ngl 99


moe.sh

No-MTP — IQ3_S 3.48bpw, reasoning on, 200K ctx

# moe.sh: start llama-server (Vulkan build) with the ByteShape
# Qwen3.6-35B-A3B IQ3_S 3.48bpw model, reasoning enabled, no MTP drafting.
#
# This is the older non-MTP quant (3.48bpw). With MTP heads added it would
# qualify as IQ4_xss, yet it is 600MB smaller: 14.7GB vs 15.3GB.
# https://huggingface.co/byteshape/Qwen3.6-35B-A3B-GGUF?show_file_info=Qwen3.6-35B-A3B-IQ3_S-3.48bpw.gguf
# Rationale: the MTP version would not leave enough room to actually use MTP
# with a worthwhile context size anyway.
# Measured headless: 197632 ctx fully in VRAM, 104 t/s.

# 1. Set Environment Variables
# Same loader path the sibling fastest_moe.sh exports for this binary; it is
# prepended so that any LD_LIBRARY_PATH inherited from the caller is kept.
# NOTE(review): added for consistency -- confirm this build needs it.
export LD_LIBRARY_PATH="/home/eaman/llama/bin_vulkan${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

# 2. Run the Server
# Notes on the invocation below:
#   * --host 0.0.0.0 listens on every network interface, not just localhost.
#   * The last option line must NOT end with a backslash: a trailing
#     continuation would splice whatever line follows into this command.
/home/eaman/llama/bin_vulkan/llama-server \
    -m /home/eaman/lm/models/byteshape/Qwen3.6-35B-A3B-GGUF/Qwen3.6-35B-A3B-IQ3_S-3.48bpw.gguf \
    --host 0.0.0.0 \
    -np 1 --kv-unified \
    -ctk q8_0 -ctv q5_0 \
    -fa on \
    --temp 0.7 --top-k 25 --top-p 0.95 --min-p 0.0 \
    --repeat-penalty 1.0 --presence-penalty 0.0 \
    -b 2048 -ub 512 \
    --fit-target 60 \
    --jinja \
    --no-mmap \
    --reasoning on --reasoning-budget 8096 \
    --reasoning-budget-message " -- Reasoning budget exceeded, proceed to final answer." \
    -lv 4 --no-warmup --timeout 900