ubicloud/config/ai_models.yml
Junhao Li fb5a904261 Enable prefix caching on Llama 3.2 3B model
Prefix caching yields significant performance improvements when
processing inputs that share a common prefix, a pattern typical of
multi-turn conversations and multi-step tasks.

Here's the benchmark for Llama 3.2 3B with and without prefix
caching:
https://docs.google.com/spreadsheets/d/16gi4veItLB6UH9FppWjKIq10y-v-I8tp1vhlMsVuMVE/edit?usp=sharing

We can see that:
1. Enabling prefix caching has no noticeable performance impact
when prompts share no prefix, but achieves several times higher
throughput when they do (see the sketch below).
2. vLLM automatically manages both the normal KV cache and the
prefix cache within its preallocated GPU memory.
2025-01-27 13:39:53 -05:00
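
For context, here is a minimal sketch of how such a comparison could
be reproduced with vLLM's offline Python API. The model name and the
engine parameters mirror the config entry below; the prompt set, batch
size, script name, and timing loop are illustrative assumptions, not
the script behind the linked spreadsheet.

# Run each mode in its own process so the two vLLM instances do not
# contend for the same GPU memory:
#   python bench_prefix_cache.py off
#   python bench_prefix_cache.py on
import sys
import time

from vllm import LLM, SamplingParams

cached = sys.argv[1] == "on"

# Hypothetical workload: 64 prompts that share one long common prefix.
SHARED_PREFIX = "You are a meticulous assistant. Context: " + "lorem ipsum " * 400
prompts = [f"{SHARED_PREFIX}\nQuestion {i}: summarize item {i}." for i in range(64)]
params = SamplingParams(temperature=0.0, max_tokens=32)

llm = LLM(
    model="meta-llama/Llama-3.2-3B-Instruct",
    enable_prefix_caching=cached,    # the flag this commit adds (--enable-prefix-caching)
    gpu_memory_utilization=0.95,     # matches engine_params in the config below
    max_model_len=90000,
)

llm.generate(prompts[:1], params)    # warm-up; with caching on, this fills the prefix cache
start = time.perf_counter()
llm.generate(prompts, params)
elapsed = time.perf_counter() - start
print(f"enable_prefix_caching={cached}: {elapsed:.2f}s for {len(prompts)} prompts")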

- { id: 8b0b55b3-fb99-415f-8441-3abef2c2a200, model_name: test-model, enabled: true, locations: [hetzner-ai], vm_size: standard-gpu-6, gpu_count: 1, storage_volumes: [{encrypted: true, size_gib: 80}, {read_only: true, image: ai-model-test-model}], boot_image: ai-ubuntu-2404-nvidia, tags: {capability: "Text Generation"}, engine: vllm, engine_params: "" }
- { id: 04ba0d97-859b-46ba-a90b-36a7c7900d4b, model_name: gemma-2-2b-it, enabled: true, locations: [hetzner-ai], vm_size: standard-gpu-6, gpu_count: 1, storage_volumes: [{encrypted: true, size_gib: 80}, {read_only: true, image: ai-model-gemma-2-2b-it}], boot_image: ai-ubuntu-2404-nvidia, tags: {capability: "Text Generation", hf_model: "google/gemma-2-2b-it"}, engine: vllm, engine_params: "" }
- { id: d895faab-9e8d-4371-800b-128cd3416913, model_name: llama-3-2-3b-it, enabled: true, locations: [hetzner-ai], vm_size: standard-gpu-6, gpu_count: 1, storage_volumes: [{encrypted: true, size_gib: 80}, {read_only: true, image: ai-model-llama-3-2-3b-it}], boot_image: ai-ubuntu-2404-nvidia, tags: {capability: "Text Generation", hf_model: "meta-llama/Llama-3.2-3B-Instruct", context_length: "90k"}, engine: vllm, engine_params: "--gpu-memory-utilization 0.95 --max-model-len 90000 --enable-prefix-caching" }
- { id: acc50340-c036-44ff-85a2-c5b7c8823e2a, model_name: llama-3-2-3b-it, enabled: true, locations: [hetzner-ai], vm_size: standard-gpu-6, gpu_count: 1, storage_volumes: [{encrypted: true, size_gib: 80}, {read_only: true, image: ai-model-llama-3-2-3b-it}], boot_image: ai-ubuntu-2404-nvidia, tags: {capability: "Text Generation", hf_model: "meta-llama/Llama-3.2-3B-Instruct", context_length: "90k", tool_support: true}, engine: vllm, engine_params: "--gpu-memory-utilization 0.95 --max-model-len 90000 --enable-prefix-caching --enable-auto-tool-choice --tool-call-parser llama3_json --chat-template resources/tool_chat_template_llama3.2_json.jinja" }
- { id: 9f077493-dcd7-4067-8311-c98c4b48c4d4, model_name: e5-mistral-7b-it, enabled: true, locations: [hetzner-ai], vm_size: standard-gpu-6, gpu_count: 1, storage_volumes: [{encrypted: true, size_gib: 80}, {read_only: true, image: ai-model-e5-mistral-7b-it}], boot_image: ai-ubuntu-2404-nvidia, tags: {capability: "Embeddings", hf_model: "intfloat/e5-mistral-7b-instruct"}, engine: vllm, engine_params: "--gpu-memory-utilization 0.95" }
- { id: b034af76-b5c6-43ed-ac25-4f9ef8a25cf1, model_name: llama-guard-3-1b, enabled: true, locations: [hetzner-ai], vm_size: standard-gpu-6, gpu_count: 1, storage_volumes: [{encrypted: true, size_gib: 100}, {read_only: true, image: ai-model-llama-guard-3-1b}], boot_image: ai-ubuntu-2404-nvidia, tags: {capability: "Text Generation", hf_model: "meta-llama/Llama-Guard-3-1B"}, engine: vllm, engine_params: "--gpu-memory-utilization 0.95" }
- { id: 80755784-c83f-4b0f-a94c-aab614ab3992, model_name: llama-guard-3-8b, enabled: true, locations: [hetzner-ai], vm_size: standard-gpu-6, gpu_count: 1, storage_volumes: [{encrypted: true, size_gib: 100}, {read_only: true, image: ai-model-llama-guard-3-8b}], boot_image: ai-ubuntu-2404-nvidia, tags: {capability: "Text Generation", hf_model: "meta-llama/Llama-Guard-3-8B", context_length: "18k"}, engine: vllm, engine_params: "--gpu-memory-utilization 0.95 --max-model-len 18000" }
- { id: 1c5da179-ff26-4a16-92e1-d766432c96e2, model_name: llama-3-1-405b-it, enabled: true, locations: [latitude-ai], vm_size: standard-60, gpu_count: 8, storage_volumes: [{encrypted: true, size_gib: 250}, {read_only: true, image: ai-model-llama-3-1-405b-it}], boot_image: ai-ubuntu-2404-nvidia, tags: {capability: "Text Generation", hf_model: "meta-llama/Llama-3.1-405B-Instruct-FP8"}, engine: vllm, engine_params: "--tensor-parallel-size 8" }
- { id: 8af733c4-4ffc-4b1b-8363-fb3653d5ffd8, model_name: llama-3-1-405b-it, enabled: true, locations: [latitude-ai], vm_size: standard-60, gpu_count: 8, storage_volumes: [{encrypted: true, size_gib: 250}, {read_only: true, image: ai-model-llama-3-1-405b-it}], boot_image: ai-ubuntu-2404-nvidia, tags: {capability: "Text Generation", hf_model: "meta-llama/Llama-3.1-405B-Instruct-FP8", tool_support: true}, engine: vllm, engine_params: "--tensor-parallel-size 8 --enable-auto-tool-choice --tool-call-parser llama3_json --chat-template resources/tool_chat_template_llama3.1_json.jinja" }
- { id: bc806a5a-0b54-4722-92a3-a2b44d43fd27, model_name: llama-3-3-70b-it, enabled: true, locations: [latitude-ai], vm_size: standard-30, gpu_count: 4, storage_volumes: [{encrypted: true, size_gib: 200}, {read_only: true, image: ai-model-llama-3-3-70b-it}], boot_image: ai-ubuntu-2404-nvidia, tags: {capability: "Text Generation", hf_model: "meta-llama/Llama-3.3-70B-Instruct"}, engine: vllm, engine_params: "--tensor-parallel-size 4" }
- { id: 8d5a6b6b-d9a2-4228-b9b5-b8a7a239bf61, model_name: llama-3-1-8b-it, enabled: true, locations: [latitude-ai], vm_size: standard-16, gpu_count: 1, storage_volumes: [{encrypted: true, size_gib: 150}, {read_only: true, image: ai-model-llama-3-1-8b-it}], boot_image: ai-ubuntu-2404-nvidia, tags: {capability: "Text Generation", hf_model: "meta-llama/Llama-3.1-8B-Instruct"}, engine: vllm, engine_params: "" }
- { id: fcf5e784-630b-4bb0-8d17-5c0729f7bfe0, model_name: qwen-2-5-14b-it, enabled: true, locations: [latitude-ai], vm_size: standard-16, gpu_count: 1, storage_volumes: [{encrypted: true, size_gib: 150}, {read_only: true, image: ai-model-qwen-2-5-14b-it}], boot_image: ai-ubuntu-2404-nvidia, tags: {capability: "Text Generation", hf_model: "Qwen/Qwen2.5-14B-Instruct"}, engine: vllm, engine_params: "" }
- { id: 8c969e7c-df4f-41ad-a51b-ea02c305a4e1, model_name: qwq-32b-preview, enabled: true, locations: [latitude-ai], vm_size: standard-16, gpu_count: 1, storage_volumes: [{encrypted: true, size_gib: 150}, {read_only: true, image: ai-model-qwq-32b-preview}], boot_image: ai-ubuntu-2404-nvidia, tags: {capability: "Text Generation", hf_model: "Qwen/QwQ-32B-Preview"}, engine: vllm, engine_params: "--gpu-memory-utilization 0.97" }
- { id: 9cb1e48b-f9a6-4a33-84d5-ea55ada659f4, model_name: ds-r1-qwen-32b, enabled: true, locations: [latitude-ai], vm_size: standard-16, gpu_count: 1, storage_volumes: [{encrypted: true, size_gib: 150}, {read_only: true, image: ai-model-ds-r1-qwen-32b}], boot_image: ai-ubuntu-2404-nvidia, tags: {capability: "Text Generation", hf_model: "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", context_length: "32k"}, engine: vllm, engine_params: "--gpu-memory-utilization 0.97 --max-model-len 32768" }
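
For illustration, this is the request shape that benefits from the
--enable-prefix-caching flag on the two llama-3-2-3b-it entries above:
a multi-turn chat against vLLM's OpenAI-compatible API. The base URL,
API key, and questions are hypothetical placeholders; a minimal
sketch, not a production client.

from openai import OpenAI

client = OpenAI(base_url="https://ai.example.com/v1", api_key="YOUR_API_KEY")

# The system message plus all prior turns are resent on every request,
# so each turn's prompt extends the previous one: exactly the
# shared-prefix pattern that prefix caching accelerates.
history = [{"role": "system", "content": "You are a concise support agent."}]
for question in ["How do I rotate my credentials?", "And revoke the old ones?"]:
    history.append({"role": "user", "content": question})
    resp = client.chat.completions.create(model="llama-3-2-3b-it", messages=history)
    answer = resp.choices[0].message.content
    history.append({"role": "assistant", "content": answer})
    print(answer)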