Prefix caching yields significant performance improvements when processing inputs that share a common prefix, which is common in conversations and multi-step tasks. Here's our benchmark for Llama 3.2 3B with and without prefix caching: https://docs.google.com/spreadsheets/d/16gi4veItLB6UH9FppWjKIq10y-v-I8tp1vhlMsVuMVE/edit?usp=sharing

We can see that:

1. Enabling prefix caching has no noticeable performance impact when there are no shared prefixes, but achieves several times higher throughput when there are.
2. vLLM automatically manages both the regular KV cache and the prefix cache within preallocated GPU memory.
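To make this concrete, here is a minimal sketch of exercising prefix caching through vLLM's offline Python API. The model name and the shared prompt are illustrative; the `enable_prefix_caching` argument is the programmatic counterpart of the `--enable-prefix-caching` server flag that appears in the deployment configuration below.

```python
from vllm import LLM, SamplingParams

# Illustrative sketch: enable_prefix_caching mirrors the
# --enable-prefix-caching flag passed to the vLLM server.
llm = LLM(
    model="meta-llama/Llama-3.2-3B-Instruct",  # example model
    enable_prefix_caching=True,
)

# Both prompts share a long common prefix (think: a system prompt plus
# accumulated conversation history). vLLM caches the KV blocks for the
# shared prefix on the first request and reuses them for the second,
# so only the differing suffix needs a fresh prefill.
shared_prefix = "You are a helpful assistant. " * 100
prompts = [
    shared_prefix + "Summarize the following report.",
    shared_prefix + "Translate the following report into German.",
]

outputs = llm.generate(prompts, SamplingParams(max_tokens=64))
for out in outputs:
    print(out.outputs[0].text)
```

The deployment configuration below shows how these engine parameters (for example `--enable-prefix-caching`) are set per model: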
```yaml
- { id: 8b0b55b3-fb99-415f-8441-3abef2c2a200, model_name: test-model, enabled: true, locations: [hetzner-ai], vm_size: standard-gpu-6, gpu_count: 1, storage_volumes: [{encrypted: true, size_gib: 80}, {read_only: true, image: ai-model-test-model}], boot_image: ai-ubuntu-2404-nvidia, tags: {capability: "Text Generation"}, engine: vllm, engine_params: "" }
- { id: 04ba0d97-859b-46ba-a90b-36a7c7900d4b, model_name: gemma-2-2b-it, enabled: true, locations: [hetzner-ai], vm_size: standard-gpu-6, gpu_count: 1, storage_volumes: [{encrypted: true, size_gib: 80}, {read_only: true, image: ai-model-gemma-2-2b-it}], boot_image: ai-ubuntu-2404-nvidia, tags: {capability: "Text Generation", hf_model: "google/gemma-2-2b-it"}, engine: vllm, engine_params: "" }
- { id: d895faab-9e8d-4371-800b-128cd3416913, model_name: llama-3-2-3b-it, enabled: true, locations: [hetzner-ai], vm_size: standard-gpu-6, gpu_count: 1, storage_volumes: [{encrypted: true, size_gib: 80}, {read_only: true, image: ai-model-llama-3-2-3b-it}], boot_image: ai-ubuntu-2404-nvidia, tags: {capability: "Text Generation", hf_model: "meta-llama/Llama-3.2-3B-Instruct", context_length: "90k"}, engine: vllm, engine_params: "--gpu-memory-utilization 0.95 --max-model-len 90000 --enable-prefix-caching" }
- { id: acc50340-c036-44ff-85a2-c5b7c8823e2a, model_name: llama-3-2-3b-it, enabled: true, locations: [hetzner-ai], vm_size: standard-gpu-6, gpu_count: 1, storage_volumes: [{encrypted: true, size_gib: 80}, {read_only: true, image: ai-model-llama-3-2-3b-it}], boot_image: ai-ubuntu-2404-nvidia, tags: {capability: "Text Generation", hf_model: "meta-llama/Llama-3.2-3B-Instruct", context_length: "90k", tool_support: true}, engine: vllm, engine_params: "--gpu-memory-utilization 0.95 --max-model-len 90000 --enable-prefix-caching --enable-auto-tool-choice --tool-call-parser llama3_json --chat-template resources/tool_chat_template_llama3.2_json.jinja" }
- { id: 9f077493-dcd7-4067-8311-c98c4b48c4d4, model_name: e5-mistral-7b-it, enabled: true, locations: [hetzner-ai], vm_size: standard-gpu-6, gpu_count: 1, storage_volumes: [{encrypted: true, size_gib: 80}, {read_only: true, image: ai-model-e5-mistral-7b-it}], boot_image: ai-ubuntu-2404-nvidia, tags: {capability: "Embeddings", hf_model: "intfloat/e5-mistral-7b-instruct"}, engine: vllm, engine_params: "--gpu-memory-utilization 0.95" }
- { id: b034af76-b5c6-43ed-ac25-4f9ef8a25cf1, model_name: llama-guard-3-1b, enabled: true, locations: [hetzner-ai], vm_size: standard-gpu-6, gpu_count: 1, storage_volumes: [{encrypted: true, size_gib: 100}, {read_only: true, image: ai-model-llama-guard-3-1b}], boot_image: ai-ubuntu-2404-nvidia, tags: {capability: "Text Generation", hf_model: "meta-llama/Llama-Guard-3-1B"}, engine: vllm, engine_params: "--gpu-memory-utilization 0.95" }
- { id: 80755784-c83f-4b0f-a94c-aab614ab3992, model_name: llama-guard-3-8b, enabled: true, locations: [hetzner-ai], vm_size: standard-gpu-6, gpu_count: 1, storage_volumes: [{encrypted: true, size_gib: 100}, {read_only: true, image: ai-model-llama-guard-3-8b}], boot_image: ai-ubuntu-2404-nvidia, tags: {capability: "Text Generation", hf_model: "meta-llama/Llama-Guard-3-8B", context_length: "18k"}, engine: vllm, engine_params: "--gpu-memory-utilization 0.95 --max-model-len 18000" }
- { id: 1c5da179-ff26-4a16-92e1-d766432c96e2, model_name: llama-3-1-405b-it, enabled: true, locations: [latitude-ai], vm_size: standard-60, gpu_count: 8, storage_volumes: [{encrypted: true, size_gib: 250}, {read_only: true, image: ai-model-llama-3-1-405b-it}], boot_image: ai-ubuntu-2404-nvidia, tags: {capability: "Text Generation", hf_model: "meta-llama/Llama-3.1-405B-Instruct-FP8"}, engine: vllm, engine_params: "--tensor-parallel-size 8" }
- { id: 8af733c4-4ffc-4b1b-8363-fb3653d5ffd8, model_name: llama-3-1-405b-it, enabled: true, locations: [latitude-ai], vm_size: standard-60, gpu_count: 8, storage_volumes: [{encrypted: true, size_gib: 250}, {read_only: true, image: ai-model-llama-3-1-405b-it}], boot_image: ai-ubuntu-2404-nvidia, tags: {capability: "Text Generation", hf_model: "meta-llama/Llama-3.1-405B-Instruct-FP8", tool_support: true}, engine: vllm, engine_params: "--tensor-parallel-size 8 --enable-auto-tool-choice --tool-call-parser llama3_json --chat-template resources/tool_chat_template_llama3.1_json.jinja" }
- { id: bc806a5a-0b54-4722-92a3-a2b44d43fd27, model_name: llama-3-3-70b-it, enabled: true, locations: [latitude-ai], vm_size: standard-30, gpu_count: 4, storage_volumes: [{encrypted: true, size_gib: 200}, {read_only: true, image: ai-model-llama-3-3-70b-it}], boot_image: ai-ubuntu-2404-nvidia, tags: {capability: "Text Generation", hf_model: "meta-llama/Llama-3.3-70B-Instruct"}, engine: vllm, engine_params: "--tensor-parallel-size 4" }
- { id: 8d5a6b6b-d9a2-4228-b9b5-b8a7a239bf61, model_name: llama-3-1-8b-it, enabled: true, locations: [latitude-ai], vm_size: standard-16, gpu_count: 1, storage_volumes: [{encrypted: true, size_gib: 150}, {read_only: true, image: ai-model-llama-3-1-8b-it}], boot_image: ai-ubuntu-2404-nvidia, tags: {capability: "Text Generation", hf_model: "meta-llama/Llama-3.1-8B-Instruct"}, engine: vllm, engine_params: "" }
- { id: fcf5e784-630b-4bb0-8d17-5c0729f7bfe0, model_name: qwen-2-5-14b-it, enabled: true, locations: [latitude-ai], vm_size: standard-16, gpu_count: 1, storage_volumes: [{encrypted: true, size_gib: 150}, {read_only: true, image: ai-model-qwen-2-5-14b-it}], boot_image: ai-ubuntu-2404-nvidia, tags: {capability: "Text Generation", hf_model: "Qwen/Qwen2.5-14B-Instruct"}, engine: vllm, engine_params: "" }
- { id: 8c969e7c-df4f-41ad-a51b-ea02c305a4e1, model_name: qwq-32b-preview, enabled: true, locations: [latitude-ai], vm_size: standard-16, gpu_count: 1, storage_volumes: [{encrypted: true, size_gib: 150}, {read_only: true, image: ai-model-qwq-32b-preview}], boot_image: ai-ubuntu-2404-nvidia, tags: {capability: "Text Generation", hf_model: "Qwen/QwQ-32B-Preview"}, engine: vllm, engine_params: "--gpu-memory-utilization 0.97" }
- { id: 9cb1e48b-f9a6-4a33-84d5-ea55ada659f4, model_name: ds-r1-qwen-32b, enabled: true, locations: [latitude-ai], vm_size: standard-16, gpu_count: 1, storage_volumes: [{encrypted: true, size_gib: 150}, {read_only: true, image: ai-model-ds-r1-qwen-32b}], boot_image: ai-ubuntu-2404-nvidia, tags: {capability: "Text Generation", hf_model: "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", context_length: "32k"}, engine: vllm, engine_params: "--gpu-memory-utilization 0.97 --max-model-len 32768" }
```
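Each deployment runs vLLM's OpenAI-compatible server, so any OpenAI SDK can talk to it. A minimal sketch, assuming a hypothetical endpoint URL and API key; the model name matches the `model_name` field in the configuration above:

```python
from openai import OpenAI

# Hypothetical endpoint and key; substitute the real values for your
# deployment. The model name matches model_name in the config above.
client = OpenAI(
    base_url="https://llama-3-2-3b-it.example.com/v1",  # placeholder URL
    api_key="your-api-key",
)

response = client.chat.completions.create(
    model="llama-3-2-3b-it",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is prefix caching?"},
    ],
    max_tokens=128,
)
print(response.choices[0].message.content)
```

Note that repeated calls like this one, reusing the same system prompt and conversation history, are exactly the workload where the `--enable-prefix-caching` entries above pay off.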