Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add retry logic to PRM watch thread #116

Merged
merged 5 commits into from
Sep 6, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions oper8/config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,10 @@ python_watch_manager:
heartbeat_file: null
heartbeat_period: 30s

# Amount of times to retry a watch before exiting
watch_retry_count: 5
watch_retry_delay: 5s

# watch filter used to reduce the amount of reconciles. The following
# filters are available by default, or you can supply a custom filter via
# the module.Filter notation.
Expand Down
6 changes: 6 additions & 0 deletions oper8/test_helpers/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,8 @@ def __init__(
disable_raise=False,
get_state_fail=False,
get_state_raise=False,
watch_fail=False,
watch_raise=False,
generate_resource_version=True,
set_status_fail=False,
set_status_raise=False,
Expand All @@ -235,6 +237,7 @@ def __init__(
resources, generate_resource_version=generate_resource_version, **kwargs
)

self.watch_fail = "assert" if watch_raise else watch_fail
self.deploy_fail = "assert" if deploy_raise else deploy_fail
self.disable_fail = "assert" if disable_raise else disable_fail
self.get_state_fail = "assert" if get_state_raise else get_state_fail
Expand Down Expand Up @@ -270,6 +273,9 @@ def enable_mocks(self):
self.set_status_fail, super().set_status, (False, False)
)
)
self.watch_objects = mock.Mock(
side_effect=get_failable_method(self.watch_fail, super().watch_objects, [])
)

def get_obj(self, kind, name, namespace=None, api_version=None):
return self.get_object_current_state(kind, name, namespace, api_version)[1]
Expand Down
7 changes: 7 additions & 0 deletions oper8/watch_manager/python_watch_manager/threads/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,3 +84,10 @@ def check_preconditions(self) -> bool:
self.leadership_manager.acquire()

return True

def wait_on_precondition(self, timeout: float) -> bool:
"""Helper function to allow threads to wait for a certain period of time
only being interrupted for preconditions"""
self.shutdown.wait(timeout)

return self.check_preconditions()
110 changes: 68 additions & 42 deletions oper8/watch_manager/python_watch_manager/threads/watch.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from typing import Dict, List, Optional, Set
import copy
import dataclasses
import sys

# Third Party
from kubernetes import watch
Expand All @@ -14,6 +15,7 @@
import alog

# Local
from .... import config
from ....deploy_manager import DeployManagerBase, KubeEventType, KubeWatchEvent
from ....managed_object import ManagedObject
from ..filters import FilterManager, get_configured_filter
Expand All @@ -24,6 +26,7 @@
ResourceId,
WatchedResource,
WatchRequest,
parse_time_delta,
)
from .base import ThreadBase

Expand Down Expand Up @@ -95,6 +98,12 @@ def __init__( # pylint: disable=too-many-arguments
# Lock for adding/gathering watch requests
self.watch_request_lock = Lock()

# Variables for tracking retries
self.attempts_left = config.python_watch_manager.watch_retry_count
self.retry_delay = parse_time_delta(
config.python_watch_manager.watch_retry_delay or ""
)

def run(self):
"""The WatchThread's control loop continuously watches the DeployManager for any new
events. For every event it gets it gathers all the WatchRequests whose `watched` value
Expand All @@ -110,56 +119,73 @@ def run(self):
log.debug("Checking preconditions failed. Shuting down")
return

for event in self.deploy_manager.watch_objects(
self.kind,
self.api_version,
namespace=self.namespace,
resource_version=list_resource_version,
watch_manager=self.kubernetes_watch,
):
# Validate leadership on each event
if not self.check_preconditions():
log.debug("Checking preconditions failed. Shuting down")
return
try:
for event in self.deploy_manager.watch_objects(
self.kind,
self.api_version,
namespace=self.namespace,
resource_version=list_resource_version,
watch_manager=self.kubernetes_watch,
):
# Validate leadership on each event
if not self.check_preconditions():
log.debug("Checking preconditions failed. Shuting down")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know this is an indentation change, but Shutting

return

resource = event.resource

# Gather all the watch requests which apply to this event
watch_requests = self._gather_resource_requests(resource)
if not watch_requests:
log.debug2("Skipping resource without requested watch")
self._clean_event(event)
continue

resource = event.resource
# Ensure a watched object exists for every resource
if resource.uid not in self.watched_resources:
self._create_watched_resource(resource, watch_requests)

# Gather all the watch requests which apply to this event
watch_requests = self._gather_resource_requests(resource)
if not watch_requests:
log.debug2("Skipping resource without requested watch")
self._clean_event(event)
continue
# Check both global and watch specific filters
watch_requests = self._check_filters(
watch_requests, resource, event.type
)
if not watch_requests:
log.debug2(
"Skipping event %s as all requests failed filters", event
)
self._clean_event(event)
continue

# Ensure a watched object exists for every resource
if resource.uid not in self.watched_resources:
self._create_watched_resource(resource, watch_requests)
# Push a reconcile request for each watch requested
for watch_request in watch_requests:
log.debug(
"Requesting reconcile for %s",
resource,
extra={"resource": watch_request.requester.get_resource()},
)
self._request_reconcile(event, watch_request)

# Check both global and watch specific filters
watch_requests = self._check_filters(
watch_requests, resource, event.type
)
if not watch_requests:
log.debug2(
"Skipping event %s as all requests failed filters", event
)
# Clean up any resources used for the event
self._clean_event(event)
continue

# Push a reconcile request for each watch requested
for watch_request in watch_requests:
log.debug(
"Requesting reconcile for %s",
resource,
extra={"resource": watch_request.requester.get_resource()},
# Update the resource version to only get new events
list_resource_version = self.kubernetes_watch.resource_version
except Exception as exc:
log.info(
"Exception raised when attempting to watch %s",
repr(exc),
exc_info=exc,
)
if self.attempts_left <= 0:
log.error(
"Unable to start watch within %d attempts",
config.python_watch_manager.watch_retry_count,
)
self._request_reconcile(event, watch_request)

# Clean up any resources used for the event
self._clean_event(event)
sys.exit(1)

# Update the resource version to only get new events
list_resource_version = self.kubernetes_watch.resource_version
self.wait_on_precondition(self.retry_delay.total_seconds())
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This returns a bool, but I'm not seeing it used anywhere. Can the preconditions fail?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not seeing it used anywhere.

good catch. It should be checked in the retry logic.

Can the preconditions fail?

Yep! If the thread should shutdown then it returns True

self.attempts_left = self.attempts_left - 1
log.info("Restarting watch with %d attempts left", self.attempts_left)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This may be because I'm reviewing on my phone and can't see indentation well, but I'm not seeing how this actually performs the retry. There isn't a loop above the try, so it's this being done outside this function?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There isn't a loop above the try, so it's this being done outside this function?

The try is inside the While True: loop

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is why I shouldn't review on a phone!


## Class Interface ###################################################

Expand Down
Loading
Loading