← Back to index

Qwen3.6-27B MTP

A 27B-parameter model with Multi-Token Prediction (MTP) for speculative decoding. It uses a Q8 nextn draft model for fast token generation. This is the best all-round configuration for coding.

new_27B_mtp.sh

Normal — optimized for coding, 32K context headless

# Qwen3.6-27B MTP, "normal" profile: optimized for coding
# max context:    32K headless
# 1. Set Environment Variables
# Library search path: the same directory that holds the llama-server binary
# invoked below.
export LD_LIBRARY_PATH="/home/eaman/llama/bin_vulkan"

# Headless
#    --ctx-size 32768 \

# 2. Run the Server
# Comments cannot be interleaved with a backslash-continued command, so the
# flag groups are summarised here, in the order they appear:
#   -m                        model file to load
#   --host / -np / -fa        listen on 0.0.0.0, one slot, flash attention on
#   --fit-target 50           value passed to the server's fit logic
#   -ctk / -ctv               KV-cache types (q4_0 for both)
#   --temp .. --presence_penalty   sampling defaults for this profile
#   -b 128                    batch size
#   --spec-* / -ctkd / -ctvd  MTP speculative decoding and the draft KV-cache types
#   --ctx-size 30000          a little under the 32K headless maximum noted above
#   -ngl 99                   GPU layer count (given once; it used to be repeated)
#   --reasoning-budget / --chat-template-kwargs
#                             passes enable_thinking=false to the chat template
#   --cache-ram / -lv / --no-warmup   cache size, log verbosity 4, skip warm-up
# The last line has NO trailing backslash on purpose: a dangling continuation
# would make the shell treat whatever follows as more server arguments.
/home/eaman/llama/bin_vulkan/llama-server \
    -m /home/eaman/lm/models/localweights/Qwen3.6-27B-MTP-IQ4_XS-Q8nextn-GGUF/Qwen3.6-27B-MTP-IQ4_XS-Q8nextn.gguf \
    --host 0.0.0.0 -np 1 -fa on \
    --fit-target 50 \
    -ctk q4_0 -ctv q4_0 \
    --temp 0.6 --top-k 30 --top-p 0.95 --min-p 0.0 \
    --repeat-penalty 1.0 --presence_penalty 0.0 \
    -b 128 \
    --jinja \
    --no-mmap \
    --spec-type draft-mtp --spec-draft-p-min 0.75 --spec-draft-n-max 3 \
    -ctkd q8_0 -ctvd q8_0 \
    --ctx-size 30000 \
    -ngl 99 \
    --reasoning-budget 1 --chat-template-kwargs '{"enable_thinking":false}' \
    --cache-ram 6000 -lv 4 --no-warmup


new_27B_mtp.sh_think

Think — reasoning mode with higher temperature

# Qwen3.6-27B MTP, "think" profile: optimized for coding
# max context:    32k headless
# 1. Set Environment Variables
# Library search path: the same directory that holds the llama-server binary
# invoked below.
export LD_LIBRARY_PATH="/home/eaman/llama/bin_vulkan"

# Model paths kept for reference (the second one is what -m uses below):
# /home/eaman/lm/models/mradermacher/Qwen3.6-27B/Qwen3.6-27B.i1-IQ4_XS-attn_qkv-IQ4_XS.gguf
# /home/eaman/lm/models/localweights/Qwen3.6-27B-MTP-IQ4_XS-Q8nextn-GGUF/Qwen3.6-27B-MTP-IQ4_XS-Q8nextn.gguf
# Alternative draft length kept for reference:
#    --spec-draft-n-max 2 \
# Headless
#    --ctx-size 32768 \

# 2. Run the Server
# Comments cannot be interleaved with a backslash-continued command, so the
# flag groups are summarised here, in the order they appear:
#   -m                        model file to load
#   --host / -np              listen on 0.0.0.0, one slot
#   --fit-target 50           value passed to the server's fit logic
#   -ctk q5_1 / -ctv q4_0     KV-cache types (K and V differ in this profile)
#   -fa on                    flash attention on
#   --temp .. --presence_penalty   sampling defaults (temperature 0.7)
#   -b 128                    batch size
#   --spec-* / -ctkd / -ctvd  MTP speculative decoding and the draft KV-cache types
#   --ctx-size 30000          a little under the 32k headless maximum noted above
#   -ngl 99                   GPU layer count (given once; it used to be repeated)
#   --cache-ram / -lv / --no-warmup   cache size, log verbosity 3, skip warm-up
# No --reasoning-budget or --chat-template-kwargs override is passed here;
# presumably the chat template's own default then decides whether the model
# thinks (confirm against the template).
# The last line has NO trailing backslash on purpose: a dangling continuation
# would make the shell treat whatever follows as more server arguments.
/home/eaman/llama/bin_vulkan/llama-server \
    -m /home/eaman/lm/models/localweights/Qwen3.6-27B-MTP-IQ4_XS-Q8nextn-GGUF/Qwen3.6-27B-MTP-IQ4_XS-Q8nextn.gguf \
    --host 0.0.0.0 -np 1 \
    --fit-target 50 \
    -ctk q5_1 \
    -ctv q4_0 \
    -fa on \
    --temp 0.7 --top-k 30 --top-p 0.95 --min-p 0.0 \
    --repeat-penalty 1.0 --presence_penalty 0.0 \
    -b 128 \
    --jinja \
    --no-mmap \
    --spec-type draft-mtp --spec-draft-p-min 0.75 --spec-draft-n-max 3 \
    -ctkd q8_0 -ctvd q8_0 \
    --ctx-size 30000 \
    -ngl 99 \
    --cache-ram 6000 -lv 3 --no-warmup


new_27B_mtp.sh_chat

Chat — 81K context, higher temp for conversational use

# Qwen3.6-27B MTP, "chat" profile: optimized for conversational use
# max context:    81K headless
# NOTE(review): the 81K figure above matches neither the --ctx-size 32768
# passed below nor the 24576 recorded as "max in headless" — confirm which
# number is current.
# 1. Set Environment Variables
# Library search path: the same directory that holds the llama-server binary
# invoked below.
export LD_LIBRARY_PATH="/home/eaman/llama/bin_vulkan"

# max in headless
#    --ctx-size 24576 \

# 2. Run the Server
# Comments cannot be interleaved with a backslash-continued command, so the
# flag groups are summarised here, in the order they appear:
#   -m                        model file to load
#   --host / -np              listen on 0.0.0.0, one slot
#   --fit-target 30           value passed to the server's fit logic
#   -ctk / -ctv               KV-cache types (q4_0 for both)
#   -fa on                    flash attention on
#   --temp 1.0, --top-k 20, --presence_penalty 1.5
#                             sampling defaults for this profile
#   -b 128                    batch size
#   --spec-* / -ctkd / -ctvd  MTP speculative decoding (at most 2 draft tokens)
#                             and the draft KV-cache types
#   --ctx-size 32768          context size actually requested
#   -ngl / -lv / --no-warmup  GPU layer count, log verbosity 4, skip warm-up
# The last line has NO trailing backslash on purpose: a dangling continuation
# would make the shell treat whatever follows as more server arguments.
/home/eaman/llama/bin_vulkan/llama-server \
    -m /home/eaman/lm/models/localweights/Qwen3.6-27B-MTP-IQ4_XS-Q8nextn-GGUF/Qwen3.6-27B-MTP-IQ4_XS-Q8nextn.gguf \
    --host 0.0.0.0 \
    -np 1 \
    --fit-target 30 \
    -ctk q4_0 -ctv q4_0 \
    -fa on \
    --temp 1.0 \
    --top-p 0.95 --min-p 0.0 --top-k 20 \
    --repeat-penalty 1.0 --presence_penalty 1.5 \
    -b 128 \
    --jinja \
    --no-mmap \
    --spec-type draft-mtp --spec-draft-p-min 0.75 --spec-draft-n-max 2 \
    -ctkd q8_0 -ctvd q8_0 \
    --ctx-size 32768 \
    -ngl 99 \
    -lv 4 \
    --no-warmup


new_27B_mtp.sh_opti

Optimized — 81K context, tuned batch size and cache

# Qwen3.6-27B MTP, "opti" profile: optimized for coding
# max context:    81K headless
# 1. Set Environment Variables
# Library search path: the same directory that holds the llama-server binary
# invoked below.
export LD_LIBRARY_PATH="/home/eaman/llama/bin_vulkan" 

# Small context kept for reference; no --ctx-size is actually passed below.
#    --ctx-size 4000 \

# 2. Run the Server
# Compared with a plain launch this profile passes -b 512 and --cache-ram 4000,
# a min-p of 0.1 with temperature 0.6, and enable_thinking=false to the chat
# template via --chat-template-kwargs.
# NOTE(review): the final line ends with a continuation backslash. If nothing
# is meant to follow it, the backslash should be dropped so that later lines
# are not swallowed into this command — confirm against the full file.
/home/eaman/llama/bin_vulkan/llama-server \
 -m /home/eaman/lm/models/localweights/Qwen3.6-27B-MTP-IQ4_XS-Q8nextn-GGUF/Qwen3.6-27B-MTP-IQ4_XS-Q8nextn.gguf \
    --host 0.0.0.0     -fa on     -np 1 --no-mmap \
    --fit-target 50 \
    -ctk q4_0 -ctv q4_0 \
    --temp 0.6 --min-p 0.1 \
    --repeat-penalty 1.0 \
    --presence_penalty 0.0 \
    -b 512 \
    --jinja  \
    --spec-type draft-mtp  --spec-draft-p-min 0.75 --spec-draft-n-max 3 \
    -ctkd q8_0 -ctvd q8_0 \
    --reasoning-budget 1 --chat-template-kwargs '{"enable_thinking":false}' \
    --cache-ram 4000 -ngl 99 -lv 4 \