Add GCP support (#144)
This pull request adds GCP support along with related changes to the
`Makefile`, Go source files, Helm charts, and documentation: new
functionality, updated configuration defaults, and improved docs. The
most important changes are grouped by theme below:

### Makefile Enhancements:
* Added a new `image-clean` target to remove Docker images.
* Updated the `clean` target to remove additional binaries specified by
`CONTRIB_BINARIES`.

### Go Source File Updates:
* Introduced the `ApplyPriorityLevelConfiguration` function in
`cmd/kperf/commands/utils/helper.go`, which loads the kubeconfig and creates a
`custom-system` PriorityLevelConfiguration through the Kubernetes client-go
API (see the sketch below).
* Updated `cmd/kperf/commands/virtualcluster/nodepool.go` to call the
new `ApplyPriorityLevelConfiguration` function before adding a nodepool.
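
For context, a minimal usage sketch of the new helper (the `main` wrapper and module import path are illustrative assumptions; the real call site is in the `nodepool.go` diff further down):

```go
package main

import (
	"fmt"
	"log"

	// Assumed module path for illustration; the package itself is shown in the diff below.
	"github.com/Azure/kperf/cmd/kperf/commands/utils"
)

func main() {
	// Kubeconfig for the target cluster (kperf normally takes this from the global --kubeconfig flag).
	kubeCfgPath := "/path/to/kubeconfig"

	// Creates the "custom-system" PriorityLevelConfiguration before virtual nodes are added.
	if err := utils.ApplyPriorityLevelConfiguration(kubeCfgPath); err != nil {
		log.Fatal(fmt.Errorf("failed to apply priority level configuration: %w", err))
	}
	fmt.Println("applied PriorityLevelConfiguration custom-system")
}
```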

### Documentation Improvements:
* Added instructions for obtaining KubeConfig for Azure, AWS, and GCP in
`docs/getting-started.md`.
* Updated runner group specifications and example commands in
`docs/getting-started.md` to reflect current configurations and image
versions.
* Revised benchmark scenario descriptions and options in
`docs/runkperf.md` to reflect updated configurations and image versions.

### Helm Chart Updates:
* Added labels and annotations to Helm templates for `FlowSchema` in
`manifests/runnergroup/server/templates/flowcontrol.yaml` and
`manifests/virtualcluster/nodecontrollers/templates/flowcontrol.yaml`.
anson627 authored Dec 21, 2024
1 parent ee26b10 commit b839e02
Showing 9 changed files with 112 additions and 21 deletions.
5 changes: 5 additions & 0 deletions Makefile
@@ -44,6 +44,10 @@ image-push: image-build ## push image
@echo pushing ${IMAGE_NAME}
@docker push ${IMAGE_NAME}

image-clean: ## clean image
@echo cleaning ${IMAGE_NAME}
@docker rmi ${IMAGE_NAME}

test: ## run test
@go test -v ./...

@@ -53,6 +57,7 @@ lint: ## run lint
.PHONY: clean
clean: ## clean up binaries
@rm -f $(BINARIES)
@rm -f $(CONTRIB_BINARIES)

.PHONY: help
help: ## this help
55 changes: 55 additions & 0 deletions cmd/kperf/commands/utils/helper.go
@@ -4,11 +4,16 @@
package utils

import (
"context"
"fmt"
"os"
"path/filepath"
"strings"

flowcontrolv1beta3 "k8s.io/api/flowcontrol/v1beta3"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/clientcmd"
"k8s.io/client-go/util/homedir"
)

@@ -60,3 +65,53 @@ func inCluster() bool {
return os.Getenv("KUBERNETES_SERVICE_HOST") != "" &&
os.Getenv("KUBERNETES_SERVICE_PORT") != ""
}

// ApplyPriorityLevelConfiguration applies the custom-system PriorityLevelConfiguration using the Kubernetes client-go API.
func ApplyPriorityLevelConfiguration(kubeconfigPath string) error {
// Load the kubeconfig file
config, err := clientcmd.BuildConfigFromFlags("", kubeconfigPath)
if err != nil {
return fmt.Errorf("failed to load kubeconfig: %v", err)
}

// Create a Kubernetes client
clientset, err := kubernetes.NewForConfig(config)
if err != nil {
return fmt.Errorf("failed to create Kubernetes client: %v", err)
}

// Define the PriorityLevelConfiguration
lendablePercent := int32(30)
plc := &flowcontrolv1beta3.PriorityLevelConfiguration{
TypeMeta: metav1.TypeMeta{
APIVersion: "flowcontrol.apiserver.k8s.io/v1beta3",
Kind: "PriorityLevelConfiguration",
},
ObjectMeta: metav1.ObjectMeta{
Name: "custom-system",
},
Spec: flowcontrolv1beta3.PriorityLevelConfigurationSpec{
Type: flowcontrolv1beta3.PriorityLevelEnablementLimited,
Limited: &flowcontrolv1beta3.LimitedPriorityLevelConfiguration{
LendablePercent: &lendablePercent,
LimitResponse: flowcontrolv1beta3.LimitResponse{
Type: flowcontrolv1beta3.LimitResponseTypeQueue,
Queuing: &flowcontrolv1beta3.QueuingConfiguration{
Queues: 64,
HandSize: 6,
QueueLengthLimit: 50,
},
},
},
},
}

// Apply the PriorityLevelConfiguration
_, err = clientset.FlowcontrolV1beta3().PriorityLevelConfigurations().Create(context.TODO(), plc, metav1.CreateOptions{})
if err != nil {
return fmt.Errorf("failed to apply PriorityLevelConfiguration: %v", err)
}

fmt.Printf("Successfully applied PriorityLevelConfiguration: %s\n", plc.Name)
return nil
}
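
One behavioral note: `ApplyPriorityLevelConfiguration` uses `Create`, so a second invocation against the same cluster fails once `custom-system` already exists. A tolerant variant is sketched below; it is not part of this commit, and it assumes the same imports as `helper.go` plus `apierrors "k8s.io/apimachinery/pkg/api/errors"`:

```go
// createOrSkipPLC is a hypothetical variant that treats an already-existing
// PriorityLevelConfiguration as success instead of returning an error.
func createOrSkipPLC(clientset kubernetes.Interface, plc *flowcontrolv1beta3.PriorityLevelConfiguration) error {
	_, err := clientset.FlowcontrolV1beta3().PriorityLevelConfigurations().Create(context.TODO(), plc, metav1.CreateOptions{})
	if err != nil && !apierrors.IsAlreadyExists(err) {
		return fmt.Errorf("failed to apply PriorityLevelConfiguration: %w", err)
	}
	return nil
}
```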
5 changes: 5 additions & 0 deletions cmd/kperf/commands/virtualcluster/nodepool.go
@@ -84,6 +84,11 @@ var nodepoolAddCommand = cli.Command{

kubeCfgPath := cliCtx.GlobalString("kubeconfig")

err := utils.ApplyPriorityLevelConfiguration(kubeCfgPath)
if err != nil {
return fmt.Errorf("failed to apply priority level configuration: %w", err)
}

affinityLabels, err := utils.KeyValuesMap(cliCtx.StringSlice("affinity"))
if err != nil {
return fmt.Errorf("failed to parse affinity: %w", err)
4 changes: 2 additions & 2 deletions contrib/cmd/runkperf/commands/bench/root.go
@@ -37,12 +37,12 @@ var Command = cli.Command{
cli.StringFlag{
Name: "vc-affinity",
Usage: "Deploy virtualnode's controller with a specific labels (FORMAT: KEY=VALUE[,VALUE])",
Value: "node.kubernetes.io/instance-type=Standard_D8s_v3,m4.2xlarge",
Value: "node.kubernetes.io/instance-type=Standard_D8s_v3,m4.2xlarge,n1-standard-8",
},
cli.StringFlag{
Name: "rg-affinity",
Usage: "Deploy runner group with a specific labels (FORMAT: KEY=VALUE[,VALUE])",
Value: "node.kubernetes.io/instance-type=Standard_D16s_v3,m4.4xlarge",
Value: "node.kubernetes.io/instance-type=Standard_D16s_v3,m4.4xlarge,n1-standard-16",
},
cli.BoolFlag{
Name: "eks",
4 changes: 2 additions & 2 deletions contrib/cmd/runkperf/commands/warmup/command.go
@@ -59,12 +59,12 @@ var Command = cli.Command{
cli.StringFlag{
Name: "vc-affinity",
Usage: "Deploy virtualnode's controller with a specific labels (FORMAT: KEY=VALUE[,VALUE])",
Value: "node.kubernetes.io/instance-type=Standard_D8s_v3,m4.2xlarge",
Value: "node.kubernetes.io/instance-type=Standard_D8s_v3,m4.2xlarge,n1-standard-8",
},
cli.StringFlag{
Name: "rg-affinity",
Usage: "Deploy runner group with a specific labels (FORMAT: KEY=VALUE[,VALUE])",
Value: "node.kubernetes.io/instance-type=Standard_D16s_v3,m4.4xlarge",
Value: "node.kubernetes.io/instance-type=Standard_D16s_v3,m4.4xlarge,n1-standard-16",
},
cli.BoolFlag{
Name: "eks",
24 changes: 20 additions & 4 deletions docs/getting-started.md
@@ -44,6 +44,22 @@ sudo make install
By default, the binaries will be in `/usr/local/bin`. The install prefix can be
changed by passing the `PREFIX` variable (default: `/usr/local`).

## Getting KubeConfig
### Azure
```bash
az aks get-credentials --location <REGION> --resource-group <RESOURCE_GROUP> --name <CLUSTER_NAME> --overwrite-existing
```

### AWS
```bash
eksctl utils write-kubeconfig --cluster=<CLUSTER_NAME> --region=<REGION>
```

### GCP
```bash
gcloud container clusters get-credentials <CLUSTER_NAME> --region <REGION>
```

## Using kperf

### kperf-runner run
@@ -219,7 +235,7 @@ If you want to run benchmark in Kubernetes cluster, please use `kperf runnergroup`

### kperf-runnergroup

The `kperf runnergroup` command manages a group of runners within a target Kubernetes cluster.
The `kperf runnergroup` command manages a group of runners within a target Kubernetes cluster.
A runner group consists of multiple runners, with each runner deployed as an individual Pod for the `kperf runner` process.
These runners not only generate requests within the cluster but can also issue requests from multiple endpoints,
mitigating limitations such as network bandwidth constraints.
@@ -274,14 +290,14 @@ loadProfile:
# nodeAffinity defines how to deploy runners into dedicated nodes which have specific labels.
nodeAffinity:
node.kubernetes.io/instance-type:
- Standard_DS2_v2
- n1-standard-16
```

Let's say the local file `/tmp/example-runnergroup-spec.yaml`. You can run:

```bash
$ kperf rg run \
--runner-image=telescope.azurecr.io/oss/kperf:v0.1.5 \
--runner-image=ghcr.io/azure/kperf:0.1.8 \
--runnergroup="file:///tmp/example-runnergroup-spec.yaml"
```

@@ -446,7 +462,7 @@ You can use the following command to add nodepool named by `example` with 10 nodes
```bash
$ kperf vc nodepool add example \
--nodes=10 --cpu=32 --memory=96 --max-pods=50 \
--affinity="node.kubernetes.io/instance-type=Standard_DS2_v2"
--affinity="node.kubernetes.io/instance-type=n1-standard-16"
```

> NOTE: The `--affinity` is used to deploy node controller (kwok) to nodes with the specific labels.
24 changes: 12 additions & 12 deletions docs/runkperf.md
@@ -13,17 +13,17 @@ runkperf includes three benchmark scenarios, one of which focuses on measuring
performance and stability with 3,000 short-lifecycle pods distributed across 100 nodes.

```bash
$ runkperf bench --runner-image telescope.azurect.io/oss/kperf:v0.1.5 node100_job1_pod3k --help
$ runkperf bench --runner-image ghcr.io/azure/kperf:0.1.8 node10_job1_pod100 --help

NAME:
runkperf bench node100_job1_pod3k -
runkperf bench node10_job1_pod100 -

The test suite is to setup 100 virtual nodes and deploy one job with 3k pods on
The test suite is to setup 10 virtual nodes and deploy one job with 100 pods on
that nodes. It repeats to create and delete job. The load profile is fixed.


USAGE:
runkperf bench node100_job1_pod3k [command options] [arguments...]
runkperf bench node10_job1_pod100 [command options] [arguments...]

OPTIONS:
--total value Total requests per runner (There are 10 runners totally and runner's rate is 10) (default: 36000)
Expand All @@ -33,28 +33,28 @@ OPTIONS:
--content-type value Content type (json or protobuf) (default: "json")
```
This test eliminates the need to set up 100 physical nodes, as kperf leverages
This test eliminates the need to set up many physical nodes, as kperf leverages
[kwok](https://github.com/kubernetes-sigs/kwok) to simulate both nodes and pod
lifecycles. Only a few physical nodes are required to host **5** kperf runners
and **100** kwok controllers.
lifecycles. Only a few physical nodes are required to run large scale benchmark
with **5** kperf runners and **100** kwok controllers.
We **recommend** using two separate node pools in the target Kubernetes cluster
to host the kperf runners and Kwok controllers independently. By default, runkperf
schedules:
* Runners on nodes with instance type: **Standard_D16s_v3** on Azure or **m4.4xlarge** on AWS
* kwok controllers on nodes with instance type: **Standard_D8s_v3** on Azure or **m4.2xlarge** on AWS
* Runners on nodes with instance type: **Standard_D16s_v3** on Azure or **m4.4xlarge** on AWS or **n1-standard-16** on GCP
* kwok controllers on nodes with instance type: **Standard_D8s_v3** on Azure or **m4.2xlarge** on AWS or **n1-standard-8** on GCP
You can modify the scheduling affinity for runners and controllers using the
You can modify the scheduling affinity for runners and controllers using the
`--rg-affinity` and `--vc-affinity` options. Please check `runkperf bench --help` for more details.
When that target cluster is ready, you can run
```bash
$ sudo runkperf -v 3 bench \
--kubeconfig $HOME/.kube/config \
--runner-image telescope.azurecr.io/oss/kperf:v0.1.5 \
node100_job1_pod3k --total 1000
--runner-image ghcr.io/azure/kperf:0.1.8 \
node10_job1_pod100 --total 1000
```
> NOTE: The `sudo` allows that command to create [mount_namespaces(7)](https://man7.org/linux/man-pages/man7/mount_namespaces.7.html)
5 changes: 5 additions & 0 deletions manifests/runnergroup/server/templates/flowcontrol.yaml
@@ -3,6 +3,11 @@ kind: FlowSchema
metadata:
name: {{ .Values.name }}
namespace: {{ .Release.Namespace }}
labels:
app.kubernetes.io/managed-by: "Helm"
annotations:
meta.helm.sh/release-name: "{{ .Release.Name }}"
meta.helm.sh/release-namespace: "{{ .Release.Namespace }}"
spec:
distinguisherMethod:
type: ByUser
7 changes: 6 additions & 1 deletion manifests/virtualcluster/nodecontrollers/templates/flowcontrol.yaml
@@ -3,12 +3,17 @@ kind: FlowSchema
metadata:
name: {{ .Values.name }}
namespace: {{ .Release.Namespace }}
labels:
app.kubernetes.io/managed-by: "Helm"
annotations:
meta.helm.sh/release-name: "{{ .Release.Name }}"
meta.helm.sh/release-namespace: "{{ .Release.Namespace }}"
spec:
distinguisherMethod:
type: ByUser
matchingPrecedence: 500
priorityLevelConfiguration:
name: system
name: custom-system
rules:
- resourceRules:
- apiGroups:
