Creates the Clover control plane that manages the lifecycle of inference router replicas, which will handle inference requests across all models and route them to an appropriate inference endpoint based on priority, capacity, and cache characteristics.
46 lines
772 B
JSON
46 lines
772 B
JSON
{
  "basic": {},
  "health_check": {
    "check_frequency": "10s",
    "consecutive_success": 2,
    "consecutive_failure": 2
  },
  "servers": [
    {
      "name": "main-server",
      "addr": "[::]:8443",
      "locations": [
        "inference",
        "up"
      ],
      "threads": 0,
      "metrics_path": "/metrics"
    },
    {
      "name": "admin-server",
      "addr": "127.0.0.1:8080,[::1]:8080",
      "locations": [
        "usage"
      ],
      "threads": 1
    }
  ],
  "locations": [
    {
      "name": "up",
      "path": "^/up$",
      "app": "up"
    },
    {
      "name": "inference",
      "path": "^/v1/(chat/)?completions$",
      "app": "inference"
    },
    {
      "name": "usage",
      "path": "^/usage$",
      "app": "usage"
    }
  ]
}