Add GCP support (#144)
This pull request adds GCP support along with related changes to the
`Makefile`, Go source files, Helm charts, and documentation: new
functionality, updated configuration defaults, and improved docs. The
most important changes are grouped by theme below:

### Makefile Enhancements:
* Added a new `image-clean` target to remove Docker images.
* Updated the `clean` target to remove additional binaries specified by
`CONTRIB_BINARIES`.

### Go Source File Updates:
* Introduced the `ApplyPriorityLevelConfiguration` function in
`cmd/kperf/commands/utils/helper.go`, which loads the kubeconfig and creates a
`custom-system` PriorityLevelConfiguration through the Kubernetes client-go
API (see the sketch below).
* Updated `cmd/kperf/commands/virtualcluster/nodepool.go` to call the
new `ApplyPriorityLevelConfiguration` function before adding a nodepool.
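
For context, a minimal usage sketch of the new helper (the `main` wrapper and module import path are illustrative assumptions; the real call site is in the `nodepool.go` diff further down):

```go
package main

import (
	"fmt"
	"log"

	// Assumed module path for illustration; the package itself is shown in the diff below.
	"github.com/Azure/kperf/cmd/kperf/commands/utils"
)

func main() {
	// Kubeconfig for the target cluster (kperf normally takes this from the global --kubeconfig flag).
	kubeCfgPath := "/path/to/kubeconfig"

	// Creates the "custom-system" PriorityLevelConfiguration before virtual nodes are added.
	if err := utils.ApplyPriorityLevelConfiguration(kubeCfgPath); err != nil {
		log.Fatal(fmt.Errorf("failed to apply priority level configuration: %w", err))
	}
	fmt.Println("applied PriorityLevelConfiguration custom-system")
}
```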

### Documentation Improvements:
* Added instructions for obtaining KubeConfig for Azure, AWS, and GCP in
`docs/getting-started.md`.
* Updated runner group specifications and example commands in
`docs/getting-started.md` to reflect current configurations and image
versions.
* Revised benchmark scenario descriptions and options in
`docs/runkperf.md` to reflect updated configurations and image versions.

### Helm Chart Updates:
* Added labels and annotations to Helm templates for `FlowSchema` in
`manifests/runnergroup/server/templates/flowcontrol.yaml` and
`manifests/virtualcluster/nodecontrollers/templates/flowcontrol.yaml`.
anson627 authored Dec 21, 2024
1 parent ee26b10 commit b839e02
Showing 9 changed files with 112 additions and 21 deletions.
5 changes: 5 additions & 0 deletions Makefile
@@ -44,6 +44,10 @@ image-push: image-build ## push image
@echo pushing ${IMAGE_NAME}
@docker push ${IMAGE_NAME}

image-clean: ## clean image
@echo cleaning ${IMAGE_NAME}
@docker rmi ${IMAGE_NAME}

test: ## run test
@go test -v ./...

@@ -53,6 +57,7 @@ lint: ## run lint
.PHONY: clean
clean: ## clean up binaries
@rm -f $(BINARIES)
@rm -f $(CONTRIB_BINARIES)

.PHONY: help
help: ## this help
55 changes: 55 additions & 0 deletions cmd/kperf/commands/utils/helper.go
@@ -4,11 +4,16 @@
package utils

import (
"context"
"fmt"
"os"
"path/filepath"
"strings"

flowcontrolv1beta3 "k8s.io/api/flowcontrol/v1beta3"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/clientcmd"
"k8s.io/client-go/util/homedir"
)

@@ -60,3 +65,53 @@ func inCluster() bool {
return os.Getenv("KUBERNETES_SERVICE_HOST") != "" &&
os.Getenv("KUBERNETES_SERVICE_PORT") != ""
}

// ApplyPriorityLevelConfiguration applies the custom-system PriorityLevelConfiguration using the Kubernetes client-go API.
func ApplyPriorityLevelConfiguration(kubeconfigPath string) error {
// Load the kubeconfig file
config, err := clientcmd.BuildConfigFromFlags("", kubeconfigPath)
if err != nil {
return fmt.Errorf("failed to load kubeconfig: %v", err)
}

// Create a Kubernetes client
clientset, err := kubernetes.NewForConfig(config)
if err != nil {
return fmt.Errorf("failed to create Kubernetes client: %v", err)
}

// Define the PriorityLevelConfiguration
lendablePercent := int32(30)
plc := &flowcontrolv1beta3.PriorityLevelConfiguration{
TypeMeta: metav1.TypeMeta{
APIVersion: "flowcontrol.apiserver.k8s.io/v1beta3",
Kind: "PriorityLevelConfiguration",
},
ObjectMeta: metav1.ObjectMeta{
Name: "custom-system",
},
Spec: flowcontrolv1beta3.PriorityLevelConfigurationSpec{
Type: flowcontrolv1beta3.PriorityLevelEnablementLimited,
Limited: &flowcontrolv1beta3.LimitedPriorityLevelConfiguration{
LendablePercent: &lendablePercent,
LimitResponse: flowcontrolv1beta3.LimitResponse{
Type: flowcontrolv1beta3.LimitResponseTypeQueue,
Queuing: &flowcontrolv1beta3.QueuingConfiguration{
Queues: 64,
HandSize: 6,
QueueLengthLimit: 50,
},
},
},
},
}

// Apply the PriorityLevelConfiguration
_, err = clientset.FlowcontrolV1beta3().PriorityLevelConfigurations().Create(context.TODO(), plc, metav1.CreateOptions{})
if err != nil {
return fmt.Errorf("failed to apply PriorityLevelConfiguration: %v", err)
}

fmt.Printf("Successfully applied PriorityLevelConfiguration: %s\n", plc.Name)
return nil
}
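
One behavioral note: `ApplyPriorityLevelConfiguration` uses `Create`, so a second invocation against the same cluster fails once `custom-system` already exists. A tolerant variant is sketched below; it is not part of this commit, and it assumes the same imports as `helper.go` plus `apierrors "k8s.io/apimachinery/pkg/api/errors"`:

```go
// createOrSkipPLC is a hypothetical variant that treats an already-existing
// PriorityLevelConfiguration as success instead of returning an error.
func createOrSkipPLC(clientset kubernetes.Interface, plc *flowcontrolv1beta3.PriorityLevelConfiguration) error {
	_, err := clientset.FlowcontrolV1beta3().PriorityLevelConfigurations().Create(context.TODO(), plc, metav1.CreateOptions{})
	if err != nil && !apierrors.IsAlreadyExists(err) {
		return fmt.Errorf("failed to apply PriorityLevelConfiguration: %w", err)
	}
	return nil
}
```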
5 changes: 5 additions & 0 deletions cmd/kperf/commands/virtualcluster/nodepool.go
@@ -84,6 +84,11 @@ var nodepoolAddCommand = cli.Command{

kubeCfgPath := cliCtx.GlobalString("kubeconfig")

err := utils.ApplyPriorityLevelConfiguration(kubeCfgPath)
if err != nil {
return fmt.Errorf("failed to apply priority level configuration: %w", err)
}

affinityLabels, err := utils.KeyValuesMap(cliCtx.StringSlice("affinity"))
if err != nil {
return fmt.Errorf("failed to parse affinity: %w", err)
4 changes: 2 additions & 2 deletions contrib/cmd/runkperf/commands/bench/root.go
@@ -37,12 +37,12 @@ var Command = cli.Command{
cli.StringFlag{
Name: "vc-affinity",
Usage: "Deploy virtualnode's controller with a specific labels (FORMAT: KEY=VALUE[,VALUE])",
Value: "node.kubernetes.io/instance-type=Standard_D8s_v3,m4.2xlarge",
Value: "node.kubernetes.io/instance-type=Standard_D8s_v3,m4.2xlarge,n1-standard-8",
},
cli.StringFlag{
Name: "rg-affinity",
Usage: "Deploy runner group with a specific labels (FORMAT: KEY=VALUE[,VALUE])",
Value: "node.kubernetes.io/instance-type=Standard_D16s_v3,m4.4xlarge",
Value: "node.kubernetes.io/instance-type=Standard_D16s_v3,m4.4xlarge,n1-standard-16",
},
cli.BoolFlag{
Name: "eks",
4 changes: 2 additions & 2 deletions contrib/cmd/runkperf/commands/warmup/command.go
@@ -59,12 +59,12 @@ var Command = cli.Command{
cli.StringFlag{
Name: "vc-affinity",
Usage: "Deploy virtualnode's controller with a specific labels (FORMAT: KEY=VALUE[,VALUE])",
Value: "node.kubernetes.io/instance-type=Standard_D8s_v3,m4.2xlarge",
Value: "node.kubernetes.io/instance-type=Standard_D8s_v3,m4.2xlarge,n1-standard-8",
},
cli.StringFlag{
Name: "rg-affinity",
Usage: "Deploy runner group with a specific labels (FORMAT: KEY=VALUE[,VALUE])",
Value: "node.kubernetes.io/instance-type=Standard_D16s_v3,m4.4xlarge",
Value: "node.kubernetes.io/instance-type=Standard_D16s_v3,m4.4xlarge,n1-standard-16",
},
cli.BoolFlag{
Name: "eks",
24 changes: 20 additions & 4 deletions docs/getting-started.md
@@ -44,6 +44,22 @@ sudo make install
By default, the binaries will be in `/usr/local/bin`. The install prefix can be
changed by passing the `PREFIX` variable (default: `/usr/local`).

## Getting KubeConfig
### Azure
```bash
az aks get-credentials --location <REGION> --resource-group <RESOURCE_GROUP> --name <CLUSTER_NAME> --overwrite-existing
```

### AWS
```bash
eksctl utils write-kubeconfig --cluster=<CLUSTER_NAME> --region=<REGION>
```

### GCP
```bash
gcloud container clusters get-credentials <CLUSTER_NAME> --region <REGION>
```

## Using kperf

### kperf-runner run
@@ -219,7 +235,7 @@ If you want to run benchmark in Kubernetes cluster, please use `kperf runnergroup`

### kperf-runnergroup

The `kperf runnergroup` command manages a group of runners within a target Kubernetes cluster.
The `kperf runnergroup` command manages a group of runners within a target Kubernetes cluster.
A runner group consists of multiple runners, with each runner deployed as an individual Pod for the `kperf runner` process.
These runners not only generate requests within the cluster but can also issue requests from multiple endpoints,
mitigating limitations such as network bandwidth constraints.
@@ -274,14 +290,14 @@ loadProfile:
# nodeAffinity defines how to deploy runners into dedicated nodes which have specific labels.
nodeAffinity:
node.kubernetes.io/instance-type:
- Standard_DS2_v2
- n1-standard-16
```

Let's say the local file `/tmp/example-runnergroup-spec.yaml`. You can run:

```bash
$ kperf rg run \
--runner-image=telescope.azurecr.io/oss/kperf:v0.1.5 \
--runner-image=ghcr.io/azure/kperf:0.1.8 \
--runnergroup="file:///tmp/example-runnergroup-spec.yaml"
```

@@ -446,7 +462,7 @@ You can use the following command to add nodepool named by `example` with 10 nodes
```bash
$ kperf vc nodepool add example \
--nodes=10 --cpu=32 --memory=96 --max-pods=50 \
--affinity="node.kubernetes.io/instance-type=Standard_DS2_v2"
--affinity="node.kubernetes.io/instance-type=n1-standard-16"
```

> NOTE: The `--affinity` is used to deploy node controller (kwok) to nodes with the specific labels.
24 changes: 12 additions & 12 deletions docs/runkperf.md
@@ -13,17 +13,17 @@ runkperf includes three benchmark scenarios, one of which focuses on measuring
performance and stability with 3,000 short-lifecycle pods distributed across 100 nodes.

```bash
$ runkperf bench --runner-image telescope.azurect.io/oss/kperf:v0.1.5 node100_job1_pod3k --help
$ runkperf bench --runner-image ghcr.io/azure/kperf:0.1.8 node10_job1_pod100 --help

NAME:
runkperf bench node100_job1_pod3k -
runkperf bench node10_job1_pod100 -

The test suite is to setup 100 virtual nodes and deploy one job with 3k pods on
The test suite is to setup 10 virtual nodes and deploy one job with 100 pods on
that nodes. It repeats to create and delete job. The load profile is fixed.


USAGE:
runkperf bench node100_job1_pod3k [command options] [arguments...]
runkperf bench node10_job1_pod100 [command options] [arguments...]

OPTIONS:
--total value Total requests per runner (There are 10 runners totally and runner's rate is 10) (default: 36000)
Expand All @@ -33,28 +33,28 @@ OPTIONS:
--content-type value Content type (json or protobuf) (default: "json")
```
This test eliminates the need to set up 100 physical nodes, as kperf leverages
This test eliminates the need to set up many physical nodes, as kperf leverages
[kwok](https://github.com/kubernetes-sigs/kwok) to simulate both nodes and pod
lifecycles. Only a few physical nodes are required to host **5** kperf runners
and **100** kwok controllers.
lifecycles. Only a few physical nodes are required to run large scale benchmark
with **5** kperf runners and **100** kwok controllers.
We **recommend** using two separate node pools in the target Kubernetes cluster
to host the kperf runners and Kwok controllers independently. By default, runkperf
schedules:
* Runners on nodes with instance type: **Standard_D16s_v3** on Azure or **m4.4xlarge** on AWS
* kwok controllers on nodes with instance type: **Standard_D8s_v3** on Azure or **m4.2xlarge** on AWS
* Runners on nodes with instance type: **Standard_D16s_v3** on Azure or **m4.4xlarge** on AWS or **n1-standard-16** on GCP
* kwok controllers on nodes with instance type: **Standard_D8s_v3** on Azure or **m4.2xlarge** on AWS or **n1-standard-8** on GCP
You can modify the scheduling affinity for runners and controllers using the
You can modify the scheduling affinity for runners and controllers using the
`--rg-affinity` and `--vc-affinity` options. Please check `runkperf bench --help` for more details.
When that target cluster is ready, you can run
```bash
$ sudo runkperf -v 3 bench \
--kubeconfig $HOME/.kube/config \
--runner-image telescope.azurecr.io/oss/kperf:v0.1.5 \
node100_job1_pod3k --total 1000
--runner-image ghcr.io/azure/kperf:0.1.8 \
node10_job1_pod100 --total 1000
```
> NOTE: The `sudo` allows that command to create [mount_namespaces(7)](https://man7.org/linux/man-pages/man7/mount_namespaces.7.html)
5 changes: 5 additions & 0 deletions manifests/runnergroup/server/templates/flowcontrol.yaml
@@ -3,6 +3,11 @@ kind: FlowSchema
metadata:
name: {{ .Values.name }}
namespace: {{ .Release.Namespace }}
labels:
app.kubernetes.io/managed-by: "Helm"
annotations:
meta.helm.sh/release-name: "{{ .Release.Name }}"
meta.helm.sh/release-namespace: "{{ .Release.Namespace }}"
spec:
distinguisherMethod:
type: ByUser
7 changes: 6 additions & 1 deletion manifests/virtualcluster/nodecontrollers/templates/flowcontrol.yaml
@@ -3,12 +3,17 @@ kind: FlowSchema
metadata:
name: {{ .Values.name }}
namespace: {{ .Release.Namespace }}
labels:
app.kubernetes.io/managed-by: "Helm"
annotations:
meta.helm.sh/release-name: "{{ .Release.Name }}"
meta.helm.sh/release-namespace: "{{ .Release.Namespace }}"
spec:
distinguisherMethod:
type: ByUser
matchingPrecedence: 500
priorityLevelConfiguration:
name: system
name: custom-system
rules:
- resourceRules:
- apiGroups:
