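# Inference endpoint model catalog. Each entry describes one deployable model:
# the VM shape (vm_size, gpu_count), request limits (max_requests, plus
# max_project_rps and max_project_tps, presumably per-project requests/sec and
# tokens/sec caps), storage (an encrypted data volume plus a read-only volume
# built from the model image), the boot image, descriptive tags, and the flags
# handed to the vLLM engine via engine_params. Field readings in the comments
# below are inferred from the entries themselves.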
- { id: 8b0b55b3-fb99-415f-8441-3abef2c2a200, model_name: test-model, enabled: true, locations: [hetzner-ai], vm_size: standard-gpu-6, gpu_count: 1, max_requests: 500, max_project_rps: 100, max_project_tps: 10000, storage_volumes: [{encrypted: true, size_gib: 80}, {read_only: true, image: ai-model-test-model}], boot_image: ai-ubuntu-2404-nvidia, tags: {capability: "Text Generation"}, engine: vllm, engine_params: "" }
- { id: cbbb08b4-3f02-4979-a12d-3bb0e872a3a4, model_name: llama-3-2-1b-it, enabled: true, locations: [hetzner-ai], vm_size: standard-4, gpu_count: 0, max_requests: 500, max_project_rps: 100, max_project_tps: 10000, storage_volumes: [{encrypted: true, size_gib: 80}, {read_only: true, image: ai-model-llama-3-2-1b-it}], boot_image: ai-ubuntu-2404-nvidia, tags: {capability: "Text Generation", hf_model: "meta-llama/Llama-3.2-1B-Instruct", context_length: "8k", device: "cpu"}, engine: vllm, engine_params: "--max-model-len 8192" }
- { id: 3c8028f5-ec18-4e2a-82cb-3d79729c9244, model_name: ds-r1-qwen-1-5b, enabled: true, locations: [hetzner-ai], vm_size: standard-4, gpu_count: 0, max_requests: 500, max_project_rps: 100, max_project_tps: 10000, storage_volumes: [{encrypted: true, size_gib: 80}, {read_only: true, image: ai-model-ds-r1-qwen-1-5b}], boot_image: ai-ubuntu-2404-nvidia, tags: {capability: "Text Generation", hf_model: "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", context_length: "8k", device: "cpu"}, engine: vllm, engine_params: "--max-model-len 8192" }
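# The two entries above run without GPUs (gpu_count: 0, device: cpu);
# --max-model-len 8192 caps the context window and hence KV-cache memory,
# presumably to fit the smaller standard-4 VM.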
- { id: 04ba0d97-859b-46ba-a90b-36a7c7900d4b, model_name: gemma-2-2b-it, enabled: true, locations: [hetzner-ai], vm_size: standard-gpu-6, gpu_count: 1, max_requests: 500, max_project_rps: 100, max_project_tps: 10000, storage_volumes: [{encrypted: true, size_gib: 80}, {read_only: true, image: ai-model-gemma-2-2b-it}], boot_image: ai-ubuntu-2404-nvidia, tags: {capability: "Text Generation", hf_model: "google/gemma-2-2b-it"}, engine: vllm, engine_params: "" }
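# An empty engine_params (as above) starts vLLM with its defaults: the model's
# full native context length and 0.90 GPU memory utilization.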
- { id: acc50340-c036-44ff-85a2-c5b7c8823e2a, model_name: llama-3-2-3b-it, enabled: true, locations: [hetzner-ai], vm_size: standard-gpu-6, gpu_count: 1, max_requests: 1000, max_project_rps: 100, max_project_tps: 10000, storage_volumes: [{encrypted: true, size_gib: 80}, {read_only: true, image: ai-model-llama-3-2-3b-it}], boot_image: ai-ubuntu-2404-nvidia, tags: {capability: "Text Generation", hf_model: "meta-llama/Llama-3.2-3B-Instruct", context_length: "60k", tool_calling: true}, engine: vllm, engine_params: "--gpu-memory-utilization 0.95 --max-model-len 60000 --generation-config auto --enable-auto-tool-choice --tool-call-parser llama3_json" }
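# Tool calling for the entry above: --enable-auto-tool-choice lets vLLM emit
# OpenAI-style tool calls, --tool-call-parser llama3_json decodes Llama 3's
# JSON tool-call format, and --generation-config auto picks up sampling
# defaults from the model's generation_config.json.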
- { id: 9f077493-dcd7-4067-8311-c98c4b48c4d4, model_name: e5-mistral-7b-it, enabled: true, locations: [hetzner-ai], vm_size: standard-gpu-6, gpu_count: 1, max_requests: 500, max_project_rps: 100, max_project_tps: 10000, storage_volumes: [{encrypted: true, size_gib: 80}, {read_only: true, image: ai-model-e5-mistral-7b-it}], boot_image: ai-ubuntu-2404-nvidia, tags: {capability: "Embeddings", hf_model: "intfloat/e5-mistral-7b-instruct"}, engine: vllm, engine_params: "--gpu-memory-utilization 0.95" }
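# capability: "Embeddings" marks the entry above as an embedding model;
# vLLM serves e5-mistral-7b-instruct through its embeddings API rather than
# text generation.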
- { id: b034af76-b5c6-43ed-ac25-4f9ef8a25cf1, model_name: llama-guard-3-1b, enabled: true, locations: [hetzner-ai], vm_size: standard-gpu-6, gpu_count: 1, max_requests: 500, max_project_rps: 100, max_project_tps: 10000, storage_volumes: [{encrypted: true, size_gib: 100}, {read_only: true, image: ai-model-llama-guard-3-1b}], boot_image: ai-ubuntu-2404-nvidia, tags: {capability: "Text Generation", hf_model: "meta-llama/Llama-Guard-3-1B"}, engine: vllm, engine_params: "--gpu-memory-utilization 0.95" }
- { id: 80755784-c83f-4b0f-a94c-aab614ab3992, model_name: llama-guard-3-8b, enabled: true, locations: [hetzner-ai], vm_size: standard-gpu-6, gpu_count: 1, max_requests: 500, max_project_rps: 100, max_project_tps: 10000, storage_volumes: [{encrypted: true, size_gib: 100}, {read_only: true, image: ai-model-llama-guard-3-8b}], boot_image: ai-ubuntu-2404-nvidia, tags: {capability: "Text Generation", hf_model: "meta-llama/Llama-Guard-3-8B", context_length: "18k"}, engine: vllm, engine_params: "--gpu-memory-utilization 0.95 --max-model-len 18000" }
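# The two Llama Guard entries above are safety classifiers served like any
# other text-generation model; the 8B variant caps context at 18k tokens
# (--max-model-len 18000), presumably to fit its KV cache at 0.95 GPU memory
# utilization.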
- { id: bc806a5a-0b54-4722-92a3-a2b44d43fd27, model_name: llama-3-3-70b-it, enabled: true, locations: [latitude-ai], vm_size: standard-30, gpu_count: 4, max_requests: 500, max_project_rps: 100, max_project_tps: 10000, storage_volumes: [{encrypted: true, size_gib: 200}, {read_only: true, image: ai-model-llama-3-3-70b-it}], boot_image: ai-ubuntu-2404-nvidia, tags: {capability: "Text Generation", hf_model: "meta-llama/Llama-3.3-70B-Instruct"}, engine: vllm, engine_params: "--tensor-parallel-size 4" }
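# The 70B model's bf16 weights need roughly 140 GiB (70e9 params x 2 bytes),
# more than one GPU holds, so --tensor-parallel-size 4 shards the model across
# the four GPUs requested by gpu_count.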
- { id: 534b97b2-bcd7-47c4-8ba9-dbf47ba9df01, model_name: llama-3-3-70b-turbo, enabled: true, locations: [latitude-ai], vm_size: standard-16, gpu_count: 2, max_requests: 500, max_project_rps: 100, max_project_tps: 10000, storage_volumes: [{encrypted: true, size_gib: 150}, {read_only: true, image: ai-model-llama-3-3-70b-turbo}], boot_image: ai-ubuntu-2404-nvidia, tags: {capability: "Text Generation", hf_model: "neuralmagic/Llama-3.3-70B-Instruct-quantized.w8a8", tool_calling: true}, engine: vllm, engine_params: "--tensor-parallel-size 2 --gpu-memory-utilization 0.97 --generation-config auto --tool-call-parser llama3_json --enable-auto-tool-choice" }
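# The "turbo" variant above is neuralmagic's w8a8 build (8-bit weights and
# activations), roughly halving weight memory to ~70 GiB, which is why it fits
# on two GPUs with --tensor-parallel-size 2 where the bf16 build needs four.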
- { id: 8d5a6b6b-d9a2-4228-b9b5-b8a7a239bf61, model_name: llama-3-1-8b-it, enabled: true, locations: [latitude-ai], vm_size: standard-16, gpu_count: 1, max_requests: 500, max_project_rps: 100, max_project_tps: 10000, storage_volumes: [{encrypted: true, size_gib: 150}, {read_only: true, image: ai-model-llama-3-1-8b-it}], boot_image: ai-ubuntu-2404-nvidia, tags: {capability: "Text Generation", hf_model: "meta-llama/Llama-3.1-8B-Instruct"}, engine: vllm, engine_params: "" }
- { id: fcf5e784-630b-4bb0-8d17-5c0729f7bfe0, model_name: qwen-2-5-14b-it, enabled: true, locations: [latitude-ai], vm_size: standard-16, gpu_count: 1, max_requests: 500, max_project_rps: 100, max_project_tps: 10000, storage_volumes: [{encrypted: true, size_gib: 150}, {read_only: true, image: ai-model-qwen-2-5-14b-it}], boot_image: ai-ubuntu-2404-nvidia, tags: {capability: "Text Generation", hf_model: "Qwen/Qwen2.5-14B-Instruct"}, engine: vllm, engine_params: "" }
- { id: 8c969e7c-df4f-41ad-a51b-ea02c305a4e1, model_name: qwq-32b-preview, enabled: true, locations: [latitude-ai], vm_size: standard-16, gpu_count: 1, max_requests: 500, max_project_rps: 100, max_project_tps: 10000, storage_volumes: [{encrypted: true, size_gib: 150}, {read_only: true, image: ai-model-qwq-32b-preview}], boot_image: ai-ubuntu-2404-nvidia, tags: {capability: "Text Generation", hf_model: "Qwen/QwQ-32B-Preview"}, engine: vllm, engine_params: "--gpu-memory-utilization 0.97" }
- { id: 9cb1e48b-f9a6-4a33-84d5-ea55ada659f4, model_name: ds-r1-qwen-32b, enabled: true, locations: [latitude-ai], vm_size: standard-16, gpu_count: 1, max_requests: 500, max_project_rps: 100, max_project_tps: 10000, storage_volumes: [{encrypted: true, size_gib: 150}, {read_only: true, image: ai-model-ds-r1-qwen-32b}], boot_image: ai-ubuntu-2404-nvidia, tags: {capability: "Text Generation", hf_model: "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", context_length: "32k"}, engine: vllm, engine_params: "--gpu-memory-utilization 0.97 --max-model-len 32768 --speculative_model /ie/models/draft_model --num_speculative_tokens 2" }
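# The entry above enables speculative decoding: a draft model at
# /ie/models/draft_model proposes 2 tokens per step (--num_speculative_tokens)
# for the 32B target to verify. The underscore flag spellings appear to be
# accepted interchangeably with hyphens by vLLM's argument parser.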
- { id: 43a7016a-69b2-47f7-89b5-eb725a1ab540, model_name: ms-phi-4, enabled: true, locations: [latitude-ai], vm_size: standard-16, gpu_count: 1, max_requests: 1000, max_project_rps: 100, max_project_tps: 10000, storage_volumes: [{encrypted: true, size_gib: 150}, {read_only: true, image: ai-model-ms-phi-4}], boot_image: ai-ubuntu-2404-nvidia, tags: {capability: "Text Generation", hf_model: "microsoft/phi-4"}, engine: vllm, engine_params: "" }
- { id: 71e448aa-0308-45c8-a613-80401c9bc7ac, model_name: mistral-small-3, enabled: true, locations: [latitude-ai], vm_size: standard-16, gpu_count: 1, max_requests: 1000, max_project_rps: 100, max_project_tps: 10000, storage_volumes: [{encrypted: true, size_gib: 150}, {read_only: true, image: ai-model-mistral-small-3}], boot_image: ai-ubuntu-2404-nvidia, tags: {capability: "Text Generation", hf_model: "mistralai/Mistral-Small-24B-Instruct-2501", tool_calling: true}, engine: vllm, engine_params: "--gpu-memory-utilization 0.97 --generation-config auto --tokenizer-mode mistral --tool-call-parser mistral --enable-auto-tool-choice" }
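# Mistral tool calling above: --tokenizer-mode mistral switches to the
# mistral_common tokenizer that --tool-call-parser mistral expects, and
# 0.97 GPU memory utilization squeezes the 24B model onto a single GPU.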