← Back to index

Qwopus3.5-9B Coder

9B coder model (Qwopus3.5) with MTP speculative decoding. Smaller and faster, good for quick coding tasks.

new_9b.sh

Normal — 81K context, Q6_K quantization

#!/usr/bin/env bash
# new_9b.sh: start llama-server (Vulkan build) with the Qwopus3.5 9B coder
# model (Q6_K) and MTP speculative decoding.
#
# optimized for coding
# max context:    81K headless
#
# Takes no arguments. The paths below are specific to this machine.
# The script's exit status is the exit status of llama-server.

# 1. Set Environment Variables
llama_dir="/home/eaman/llama/bin_vulkan"
model="/home/eaman/lm/models/nocxtrex/Qwopus3.5-9B-Coder-MTP-Q6_K.gguf"

# Search the bundled shared libraries first, but keep any entries the caller
# already had on LD_LIBRARY_PATH instead of overwriting them.
export LD_LIBRARY_PATH="${llama_dir}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

# 2. Run the Server
# The arguments live in an array so every flag can carry its own comment and
# no backslash continuation can silently break the command line.
server_args=(
  -m "${model}"
  --host 0.0.0.0              # bind address: reachable from other hosts
  -np 1                       # a single parallel slot
  --no-mmap
  -fa on                      # flash attention
  -ctk q8_0                   # K cache type
  -ctv q8_0                   # V cache type

  # Sampling
  --temp 0.4
  --top-k 30
  --min-p 0.0
  --repeat-penalty 1.0
  --presence-penalty 0.0      # dash spelling, consistent with --repeat-penalty

  --jinja
  -b 512                      # batch size

  # Speculative decoding using the model's MTP draft
  --spec-type draft-mtp
  --spec-draft-p-min 0.75
  --spec-draft-n-max 3

  # Headless value. The author also left a smaller alternative commented out:
  #   --ctx-size 32768
  --ctx-size 80000

  --reasoning off
  -ngl 99                     # GPU layers
  -lv 3                       # log verbosity
  --no-warmup

  # NOTE(review): a budget of 1 next to "--reasoning off" and
  # enable_thinking=false looks redundant; confirm it is intended.
  --reasoning-budget 1
  --chat-template-kwargs '{"enable_thinking":false}'

  --threads 10
  --threads-batch 10
)

"${llama_dir}/llama-server" "${server_args[@]}"



new_9b.sh_think

Think — 30K context, higher temperature (0.7 instead of 0.4) and top-k (35 instead of 30)

#!/usr/bin/env bash
# new_9b.sh_think: start llama-server (Vulkan build) with the Qwopus3.5 9B
# coder model (Q6_K) and MTP speculative decoding, using the "think" variant's
# sampling settings (temperature 0.7, top-k 35).
#
# optimized for coding
# context:        30000 tokens in this variant (--ctx-size below)
#
# Takes no arguments. The paths below are specific to this machine.
# The script's exit status is the exit status of llama-server.

# 1. Set Environment Variables
llama_dir="/home/eaman/llama/bin_vulkan"
model="/home/eaman/lm/models/nocxtrex/Qwopus3.5-9B-Coder-MTP-Q6_K.gguf"

# Search the bundled shared libraries first, but keep any entries the caller
# already had on LD_LIBRARY_PATH instead of overwriting them.
export LD_LIBRARY_PATH="${llama_dir}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

# 2. Run the Server
# The arguments live in an array so every flag can carry its own comment and
# no backslash continuation can silently break the command line.
server_args=(
  -m "${model}"
  --host 0.0.0.0              # bind address: reachable from other hosts
  -np 1                       # a single parallel slot
  -fa on                      # flash attention
  --no-mmap
  -ctk q8_0                   # K cache type
  -ctv q8_0                   # V cache type

  # Sampling
  --temp 0.7
  --top-k 35
  --min-p 0.0
  --repeat-penalty 1.0
  --presence-penalty 0.0      # dash spelling, consistent with --repeat-penalty

  --jinja
  -b 512                      # batch size

  # Speculative decoding using the model's MTP draft
  --spec-type draft-mtp
  --spec-draft-p-min 0.75
  --spec-draft-n-max 3

  # The author also left a slightly larger alternative commented out:
  #   --ctx-size 32768
  --ctx-size 30000

  # NOTE(review): this is the "think" variant, yet reasoning is switched off
  # here, the budget below is 1, and enable_thinking is false. The flags are
  # kept exactly as found; confirm whether thinking was meant to be enabled.
  --reasoning off
  -ngl 99                     # GPU layers
  -lv 3                       # log verbosity
  --no-warmup
  --reasoning-budget 1
  --chat-template-kwargs '{"enable_thinking":false}'

  --threads 10
  --threads-batch 10
)

"${llama_dir}/llama-server" "${server_args[@]}"