From 30e555d9e80d9c4b9824fed2029713359ce5607c Mon Sep 17 00:00:00 2001 From: Sam Stoelinga Date: Tue, 6 Feb 2024 20:34:08 -0800 Subject: [PATCH] add leader election retry (#66) * adds a test that ensures apiserver going away for 20 seconds would force a successfuly retry of leader election For reference, see Kong gateway that had to implement the same thing: https://github.com/Kong/kubernetes-ingress-controller/pull/578 Fixes #60 --- .github/workflows/tests.yml | 13 ++++++++- cmd/lingo/main.go | 6 +++- pkg/leader/election.go | 23 ++++++++++++--- tests/e2e/test.sh | 57 ++++++++++++++++--------------------- 4 files changed, 60 insertions(+), 39 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 9025d2cb..8cae2b74 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -27,8 +27,15 @@ jobs: run: make test-integration e2e: + strategy: + matrix: + replicas: ["1", "3"] + test_cases: + - { requests: 60, expected_replicas: 1 } + # remove broken test, put this back when scaling issues are solved + # - { requests: 300, expected_replicas: 2 } runs-on: ubuntu-latest - + name: E2E Lingo.replicas=${{ matrix.replicas }} requests=${{ matrix.test_cases.requests }} expected_replicas=${{ matrix.test_cases.expected_replicas }} steps: - name: Checkout code uses: actions/checkout@v2 @@ -49,4 +56,8 @@ jobs: sudo mv skaffold /usr/local/bin - name: Run e2e tests + env: + REPLICAS: ${{ matrix.replicas }} + REQUESTS: ${{ matrix.test_cases.requests }} + EXPECTED_REPLICAS: ${{ matrix.test_cases.expected_replicas }} run: make test-e2e diff --git a/cmd/lingo/main.go b/cmd/lingo/main.go index 7bfec8f0..2f9ae6f4 100644 --- a/cmd/lingo/main.go +++ b/cmd/lingo/main.go @@ -182,7 +182,11 @@ func run() error { }() go func() { setupLog.Info("Starting leader election") - le.Start(ctx) + err := le.Start(ctx) + if err != nil { + setupLog.Error(err, "starting leader election") + os.Exit(1) + } }() defer func() { setupLog.Info("waiting on manager to stop") diff --git a/pkg/leader/election.go b/pkg/leader/election.go index 1bdcaf55..d1e2e756 100644 --- a/pkg/leader/election.go +++ b/pkg/leader/election.go @@ -10,6 +10,7 @@ import ( "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/leaderelection" "k8s.io/client-go/tools/leaderelection/resourcelock" + "k8s.io/client-go/util/flowcontrol" ) func NewElection(clientset kubernetes.Interface, id, namespace string) *Election { @@ -35,11 +36,11 @@ func NewElection(clientset kubernetes.Interface, id, namespace string) *Election RetryPeriod: 2 * time.Second, Callbacks: leaderelection.LeaderCallbacks{ OnStartedLeading: func(ctx context.Context) { - log.Println("Started leading") + log.Printf("%q started leading", id) isLeader.Store(true) }, OnStoppedLeading: func() { - log.Println("Stopped leading") + log.Printf("%q stopped leading", id) isLeader.Store(false) }, OnNewLeader: func(identity string) { @@ -54,14 +55,28 @@ func NewElection(clientset kubernetes.Interface, id, namespace string) *Election return &Election{ IsLeader: isLeader, config: config, + ID: id, } } type Election struct { config leaderelection.LeaderElectionConfig IsLeader *atomic.Bool + ID string } -func (le *Election) Start(ctx context.Context) { - leaderelection.RunOrDie(ctx, le.config) +func (le *Election) Start(ctx context.Context) error { + backoff := flowcontrol.NewBackOff(1*time.Second, 15*time.Second) + const backoffID = "lingo-leader-election" + for { + leaderelection.RunOrDie(ctx, le.config) + backoff.Next(backoffID, backoff.Clock.Now()) + delay := backoff.Get(backoffID) + log.Printf("Leader election stopped on %q, retrying in %s", le.ID, delay) + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(delay): + } + } } diff --git a/tests/e2e/test.sh b/tests/e2e/test.sh index c64b2453..89924271 100755 --- a/tests/e2e/test.sh +++ b/tests/e2e/test.sh @@ -6,6 +6,9 @@ set -xe HOST=127.0.0.1 PORT=30080 BASE_URL="http://$HOST:$PORT/v1" +REPLICAS=${REPLICAS:-3} +REQUESTS=${REQUESTS:-60} +EXPECTED_REPLICAS=${EXPECTED_REPLICAS:-1} if kind get clusters | grep -q substratus-test; then @@ -42,6 +45,7 @@ if ! kubectl get deployment lingo; then skaffold run fi +kubectl patch deployment lingo --patch "{\"spec\": {\"replicas\": $REPLICAS}}" kubectl wait --for=condition=available --timeout=30s deployment/lingo @@ -77,22 +81,34 @@ pip3 install openai==1.2.3 # Send 60 requests in parallel to stapi backend using openai python client and threading python3 $SCRIPT_DIR/test_openai_embedding.py \ - --requests 60 --timeout 300 --base-url "${BASE_URL}" \ + --requests ${REQUESTS} --timeout 300 --base-url "${BASE_URL}" \ --model text-embedding-ada-002 # Ensure replicas has been scaled up to 1 after sending 60 requests replicas=$(kubectl get deployment stapi-minilm-l6-v2 -o jsonpath='{.spec.replicas}') -if [ "$replicas" -eq 1 ]; then - echo "Test passed: Expected 1 replica after sending requests 60 requests" +if [ "$replicas" -ge "${EXPECTED_REPLICAS}" ]; then + echo "Test passed: Expected ${EXPECTED_REPLICAS} or more replicas and got ${replicas} after sending requests ${REQUESTS} requests" else - echo "Test failed: Expected 1 replica after sending requests 60 requests, got $replicas" + echo "Test failed: Expected ${EXPECTED_REPLICAS} or more replicas after sending requests ${REQUESTS} requests, got ${replicas}" exit 1 fi -echo "Waiting for deployment to scale down back to 0 within 2 minutes" -for i in {1..15}; do - if [ "$i" -eq 15 ]; then +# Verify that leader election works by forcing a 120 second apiserver outage +KIND_NODE=$(kind get nodes --name=substratus-test) +docker exec ${KIND_NODE} iptables -I INPUT -p tcp --dport 6443 -j DROP +sleep 120 +docker exec ${KIND_NODE} iptables -D INPUT -p tcp --dport 6443 -j DROP +echo "Waiting for K8s to recover from apiserver outage" +sleep 30 +until kubectl get deployment stapi-minilm-l6-v2; do + sleep 1 +done + +echo "Waiting for deployment to scale down back to 0 within ~2 minutes" +for i in {1..30}; do + if [ "$i" -eq 30 ]; then echo "Test failed: Expected 0 replica after not having requests for more than 1 minute, got $replicas" + kubectl logs -l app=lingo --tail=-1 exit 1 fi replicas=$(kubectl get deployment stapi-minilm-l6-v2 -o jsonpath='{.spec.replicas}') @@ -100,30 +116,5 @@ for i in {1..15}; do echo "Test passed: Expected 0 replica after not having requests for more than 1 minute" break fi - sleep 8 + sleep 6 done - -echo "Patching stapi deployment to sleep on startup" -cat <