From f2a9b34896041704633d978fb87e60c029ae3634 Mon Sep 17 00:00:00 2001 From: BenjaminBraunDev Date: Thu, 10 Apr 2025 17:12:05 +0000 Subject: [PATCH] Rename resources to be model server generic instead of referencing vLLM --- .../gateway/gke/gcp-backend-policy.yaml | 2 +- config/manifests/gateway/gke/healthcheck.yaml | 2 +- config/manifests/gateway/gke/httproute.yaml | 2 +- config/manifests/inferencemodel.yaml | 6 +++--- config/manifests/inferencepool-resources.yaml | 20 +++++++++---------- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/config/manifests/gateway/gke/gcp-backend-policy.yaml b/config/manifests/gateway/gke/gcp-backend-policy.yaml index 7b294304e..f5cc0642d 100644 --- a/config/manifests/gateway/gke/gcp-backend-policy.yaml +++ b/config/manifests/gateway/gke/gcp-backend-policy.yaml @@ -6,7 +6,7 @@ spec: targetRef: group: "inference.networking.x-k8s.io" kind: InferencePool - name: vllm-llama3-8b-instruct + name: llama3-8b-instruct default: timeoutSec: 300 logging: diff --git a/config/manifests/gateway/gke/healthcheck.yaml b/config/manifests/gateway/gke/healthcheck.yaml index 93b6cd7fa..161e58dea 100644 --- a/config/manifests/gateway/gke/healthcheck.yaml +++ b/config/manifests/gateway/gke/healthcheck.yaml @@ -7,7 +7,7 @@ spec: targetRef: group: "inference.networking.x-k8s.io" kind: InferencePool - name: vllm-llama3-8b-instruct + name: llama3-8b-instruct default: config: type: HTTP diff --git a/config/manifests/gateway/gke/httproute.yaml b/config/manifests/gateway/gke/httproute.yaml index 6ea90891c..111f78eda 100644 --- a/config/manifests/gateway/gke/httproute.yaml +++ b/config/manifests/gateway/gke/httproute.yaml @@ -11,7 +11,7 @@ spec: - backendRefs: - group: inference.networking.x-k8s.io kind: InferencePool - name: vllm-llama3-8b-instruct + name: llama3-8b-instruct matches: - path: type: PathPrefix diff --git a/config/manifests/inferencemodel.yaml b/config/manifests/inferencemodel.yaml index 67c91d0e5..431105182 100644 --- a/config/manifests/inferencemodel.yaml +++ b/config/manifests/inferencemodel.yaml @@ -6,7 +6,7 @@ spec: modelName: food-review criticality: Standard poolRef: - name: vllm-llama3-8b-instruct + name: llama3-8b-instruct targetModels: - name: food-review-1 weight: 100 @@ -19,7 +19,7 @@ spec: modelName: meta-llama/Llama-3.1-8B-Instruct criticality: Critical poolRef: - name: vllm-llama3-8b-instruct + name: llama3-8b-instruct --- apiVersion: inference.networking.x-k8s.io/v1alpha2 kind: InferenceModel @@ -29,4 +29,4 @@ spec: modelName: Qwen/Qwen2.5-1.5B-Instruct criticality: Critical poolRef: - name: vllm-llama3-8b-instruct + name: llama3-8b-instruct diff --git a/config/manifests/inferencepool-resources.yaml b/config/manifests/inferencepool-resources.yaml index 993b7bf62..c16a0fc39 100644 --- a/config/manifests/inferencepool-resources.yaml +++ b/config/manifests/inferencepool-resources.yaml @@ -5,22 +5,22 @@ apiVersion: inference.networking.x-k8s.io/v1alpha2 kind: InferencePool metadata: labels: - name: vllm-llama3-8b-instruct + name: llama3-8b-instruct spec: targetPortNumber: 8000 selector: - app: vllm-llama3-8b-instruct + app: vllm-llama3-8b-instruct # Change this to target a different Model Server Deployment extensionRef: - name: vllm-llama3-8b-instruct-epp + name: llama3-8b-instruct-epp --- apiVersion: v1 kind: Service metadata: - name: vllm-llama3-8b-instruct-epp + name: llama3-8b-instruct-epp namespace: default spec: selector: - app: vllm-llama3-8b-instruct-epp + app: llama3-8b-instruct-epp ports: - protocol: TCP port: 9002 @@ -31,19 +31,19 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: vllm-llama3-8b-instruct-epp + name: llama3-8b-instruct-epp namespace: default labels: - app: vllm-llama3-8b-instruct-epp + app: llama3-8b-instruct-epp spec: replicas: 1 selector: matchLabels: - app: vllm-llama3-8b-instruct-epp + app: llama3-8b-instruct-epp template: metadata: labels: - app: vllm-llama3-8b-instruct-epp + app: llama3-8b-instruct-epp spec: # Conservatively, this timeout should mirror the longest grace period of the pods within the pool terminationGracePeriodSeconds: 130 @@ -53,7 +53,7 @@ spec: imagePullPolicy: Always args: - -poolName - - "vllm-llama3-8b-instruct" + - "llama3-8b-instruct" - -v - "4" - --zap-encoder