9B coder model (Qwopus3.5) with MTP speculative decoding. Smaller and faster, good for quick coding tasks.
Normal — 80K context (--ctx-size 80000; 81K is the headless maximum), Q6_K quantization
# Normal profile: optimized for coding.
# max context: 81K headless
# (bash: the arguments live in an array so every group can carry a comment;
#  they are passed to llama-server in exactly the order listed.)

# 1. Set Environment Variables
#    Point the dynamic loader at the bin_vulkan directory.
export LD_LIBRARY_PATH="/home/eaman/llama/bin_vulkan"

# 2. Collect the server arguments
server_args=(
  # Model file (Q6_K GGUF).
  -m /home/eaman/lm/models/nocxtrex/Qwopus3.5-9B-Coder-MTP-Q6_K.gguf

  # Network / slots / loading.
  # NOTE(review): 0.0.0.0 listens on every interface and no API key is
  # passed here -- confirm the host is only reachable from trusted networks.
  --host 0.0.0.0
  -np 1
  --no-mmap
  -fa on

  # KV cache types for K and V.
  -ctk q8_0
  -ctv q8_0

  # Sampling.
  --temp 0.4
  --top-k 30
  --min-p 0.0
  --repeat-penalty 1.0
  # NOTE(review): spelled with an underscore here; upstream documents
  # --presence-penalty -- confirm this build accepts both.
  --presence_penalty 0.0

  # Chat template handling and batch size.
  --jinja
  -b 512

  # MTP speculative decoding.
  --spec-type draft-mtp
  --spec-draft-p-min 0.75
  --spec-draft-n-max 3

  # Context window. Headless; disabled alternative kept for reference:
  # --ctx-size 32768
  --ctx-size 80000

  # Reasoning switched off (first of three places).
  --reasoning off

  # GPU layers, log verbosity, skip warmup.
  -ngl 99
  -lv 3
  --no-warmup

  # Reasoning switched off (remaining two places).
  --reasoning-budget 1
  --chat-template-kwargs '{"enable_thinking":false}'

  # CPU threads for generation and for batch processing.
  --threads 10
  --threads-batch 10
)

# 3. Run the Server
/home/eaman/llama/bin_vulkan/llama-server "${server_args[@]}"
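Think — 30K context, higher temperature (0.7) and top-k (35); note that the reasoning flags below are the same as in Normal (thinking disabled)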
# Think profile: optimized for coding, higher temperature / top-k, 30000-token context.
# max context: 81K headless
# (bash: the arguments live in an array so every group can carry a comment;
#  they are passed to llama-server in exactly the order listed.)
#
# NOTE(review): although this profile is labelled "Think", reasoning is
# disabled below in three places (--reasoning off, --reasoning-budget 1,
# enable_thinking:false). If thinking is actually wanted, the likely change is
# `--reasoning on`, dropping `--reasoning-budget 1`, and
# '{"enable_thinking":true}' -- confirm against `llama-server --help` for this
# build before changing.

# 1. Set Environment Variables
#    Point the dynamic loader at the bin_vulkan directory.
export LD_LIBRARY_PATH="/home/eaman/llama/bin_vulkan"

# 2. Collect the server arguments
server_args=(
  # Model file (Q6_K GGUF).
  -m /home/eaman/lm/models/nocxtrex/Qwopus3.5-9B-Coder-MTP-Q6_K.gguf

  # Network / slots / loading.
  # NOTE(review): 0.0.0.0 listens on every interface and no API key is
  # passed here -- confirm the host is only reachable from trusted networks.
  --host 0.0.0.0
  -np 1
  -fa on
  --no-mmap

  # KV cache types for K and V.
  -ctk q8_0
  -ctv q8_0

  # Sampling.
  --temp 0.7
  --top-k 35
  --min-p 0.0
  --repeat-penalty 1.0
  # NOTE(review): spelled with an underscore here; upstream documents
  # --presence-penalty -- confirm this build accepts both.
  --presence_penalty 0.0

  # Chat template handling and batch size.
  --jinja
  -b 512

  # MTP speculative decoding.
  --spec-type draft-mtp
  --spec-draft-p-min 0.75
  --spec-draft-n-max 3

  # Context window. Disabled alternative kept for reference:
  # --ctx-size 32768
  --ctx-size 30000

  # Reasoning switched off (first of three places; see NOTE at the top).
  --reasoning off

  # GPU layers, log verbosity, skip warmup.
  -ngl 99
  -lv 3
  --no-warmup

  # Reasoning switched off (remaining two places; see NOTE at the top).
  --reasoning-budget 1
  --chat-template-kwargs '{"enable_thinking":false}'

  # CPU threads for generation and for batch processing.
  --threads 10
  --threads-batch 10
)

# 3. Run the Server
/home/eaman/llama/bin_vulkan/llama-server "${server_args[@]}"