check_repeaters() task iterates repeaters #34946
Changes from 18 commits
@@ -65,10 +65,12 @@
"""
import inspect
import json
import random
import traceback
import uuid
from collections import defaultdict
from datetime import datetime, timedelta
from http import HTTPStatus
from typing import Any
from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse
@@ -92,6 +94,7 @@
from corehq import toggles
from corehq.apps.accounting.utils import domain_has_privilege
from corehq.apps.domain.models import Domain
from corehq.apps.locations.models import SQLLocation
from corehq.apps.users.models import CommCareUser
from corehq.form_processor.exceptions import XFormNotFound
@@ -122,8 +125,10 @@
from .const import (
    MAX_ATTEMPTS,
    MAX_BACKOFF_ATTEMPTS,
    MAX_REPEATER_WORKERS,
    MAX_RETRY_WAIT,
    MIN_RETRY_WAIT,
    RATE_LIMITER_DELAY_RANGE,
    State,
)
from .exceptions import RequestConnectionError, UnknownRepeater
@@ -220,6 +225,7 @@ def all_ready(self):
    """
    Return all Repeaters ready to be forwarded.
    """
    domains = get_domains_forwarding_enabled()
    not_paused = models.Q(is_paused=False)
    next_attempt_not_in_the_future = (
        models.Q(next_attempt_at__isnull=True)

@@ -228,10 +234,13 @@ def all_ready(self):
    repeat_records_ready_to_send = models.Q(
        repeat_records__state__in=(State.Pending, State.Fail)
    )
    return (self.get_queryset()
            .filter(not_paused)
            .filter(next_attempt_not_in_the_future)
            .filter(repeat_records_ready_to_send))
    return (
        self.get_queryset()
        .filter(domain__in=domains)
        .filter(not_paused)
        .filter(next_attempt_not_in_the_future)
        .filter(repeat_records_ready_to_send)
    )

def get_queryset(self):
    repeater_obj = self.model()
@@ -258,6 +267,7 @@ class Repeater(RepeaterSuperProxy):
    is_paused = models.BooleanField(default=False)
    next_attempt_at = models.DateTimeField(null=True, blank=True)
    last_attempt_at = models.DateTimeField(null=True, blank=True)
    # TODO: max_workers = models.IntegerField(default=1)
    options = JSONField(default=dict)
    connection_settings_id = models.IntegerField(db_index=True)
    is_deleted = models.BooleanField(default=False, db_index=True)
@@ -348,21 +358,19 @@ def _repeater_type(cls):

@property
def repeat_records_ready(self):
    return self.repeat_records.filter(state__in=(State.Pending, State.Fail))
    return (
        self.repeat_records
        .filter(state__in=(State.Pending, State.Fail))
        .order_by('registered_at')
    )

Review comment: I'm getting confused on […]

Review comment: If one payload fails repeatedly, but another succeeds, that sounds like a problem with the payload and not the remote endpoint. Trying to send it again isn't going to help. I think when a remote API responds with a 4XX error, we should immediately cancel the repeat record.

Review comment: What about a 429 (Too Many Requests) response with a […]?
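For the 429 case raised above, one option would be to honor the Retry-After header when scheduling the next attempt. A minimal sketch, not part of this changeset; the helper name and the fallback delay are assumptions:

```python
from datetime import datetime, timedelta, timezone
from email.utils import parsedate_to_datetime


def get_retry_after(response, default=timedelta(minutes=5)):
    """Return how long a 429 response asks us to wait.

    Retry-After may be a number of seconds or an HTTP-date (RFC 9110).
    Falls back to ``default`` if the header is missing or unparseable.
    """
    value = response.headers.get('Retry-After')
    if not value:
        return default
    try:
        # e.g. "Retry-After: 120"
        return timedelta(seconds=int(value))
    except ValueError:
        pass
    try:
        # e.g. "Retry-After: Wed, 21 Oct 2015 07:28:00 GMT"
        return parsedate_to_datetime(value) - datetime.now(timezone.utc)
    except (TypeError, ValueError):
        return default
```

The resulting timedelta could feed whichever backoff mechanism wins the rate-limiting discussion below, whether applied per record or per repeater.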
@property
def is_ready(self):
    """
    Returns True if there are repeat records to be sent.
    """
    if self.is_paused or toggles.PAUSE_DATA_FORWARDING.enabled(self.domain):
        return False
    if not (self.next_attempt_at is None
            or self.next_attempt_at < timezone.now()):
        return False
    return self.repeat_records_ready.exists()
def rate_limit(self):
    interval = random.uniform(*RATE_LIMITER_DELAY_RANGE)
    Repeater.objects.filter(id=self.repeater_id).update(
        next_attempt_at=datetime.utcnow() + interval,
    )

Review comment: The intention of […] This code no longer does that, but that seems fine given a repeater will try one record at a time now (unless max repeaters is > 1, but then it is the project's choice). So maybe the only thing to do is revert the […]

Review comment: On second thought, rate limiting at the repeater level is pretty different than the existing behavior. If a project has a high volume of records, they might be rate limited at the minute window level, but would be able to successfully send records in the following minute (in the current world). Whereas postponing the entire repeater would delay any records from being sent by up to 15 minutes, which I don't think is desirable. So maybe pushing rate limiting back down to a record by record basis is best?

Review comment: I think we should chat through this idea offline. I might not be understanding how you're thinking of implementing this, because I worry that if we rate limit at the repeat record level, then we will be iterating repeaters that don't actually have repeat records ready to send. That would result in churning through […]
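As context for the `rate_limit()` change above: `random.uniform()` happens to work on timedelta bounds, which appears to be how `RATE_LIMITER_DELAY_RANGE` is used here. A quick illustration; the real range lives in const.py and is not shown in this diff, so the values below are an assumption based on the "up to 15 minutes" mentioned in the discussion:

```python
import random
from datetime import timedelta

# Assumed values; the real RATE_LIMITER_DELAY_RANGE is defined in
# corehq/motech/repeaters/const.py and is not visible in this diff.
RATE_LIMITER_DELAY_RANGE = (timedelta(minutes=0), timedelta(minutes=15))

# random.uniform(a, b) computes a + (b - a) * random(), so with timedelta
# bounds it returns a timedelta somewhere inside the range.
interval = random.uniform(*RATE_LIMITER_DELAY_RANGE)
print(type(interval).__name__, interval)  # e.g. "timedelta 0:07:23.415027"
```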
def set_next_attempt(self):
def set_backoff(self):
    now = datetime.utcnow()
    interval = _get_retry_interval(self.last_attempt_at, now)
    self.last_attempt_at = now

@@ -375,7 +383,7 @@ def set_next_attempt(self):
        next_attempt_at=now + interval,
    )

def reset_next_attempt(self):
def reset_backoff(self):
    if self.last_attempt_at or self.next_attempt_at:
        self.last_attempt_at = None
        self.next_attempt_at = None
@@ -411,7 +419,8 @@ def register(self, payload, fire_synchronously=False):
    # Prime the cache to prevent unnecessary lookup. Only do this for synchronous repeaters
    # to prevent serializing the repeater in the celery task payload
    repeat_record.__dict__["repeater"] = self
    repeat_record.attempt_forward_now(fire_synchronously=fire_synchronously)
    # TODO: No, send the repeat record when it's its turn.
    # repeat_record.attempt_forward_now(fire_synchronously=fire_synchronously)
    return repeat_record

def allowed_to_forward(self, payload):
@@ -432,6 +441,11 @@ def retire(self):
    self.is_deleted = True
    Repeater.objects.filter(id=self.repeater_id).update(is_deleted=True)

@property
def num_workers(self):
    # TODO: return min(self.max_workers, MAX_REPEATER_WORKERS)
    return MAX_REPEATER_WORKERS

def fire_for_record(self, repeat_record):
    payload = self.get_payload(repeat_record)
    try:
@@ -473,10 +487,36 @@ def handle_response(self, result, repeat_record):

    result may be either a response object or an exception
    """
    never_gonna_work = (
        HTTPStatus.BAD_REQUEST,
        HTTPStatus.UNAUTHORIZED,
        HTTPStatus.PAYMENT_REQUIRED,
        HTTPStatus.FORBIDDEN,
        HTTPStatus.NOT_FOUND,
        HTTPStatus.METHOD_NOT_ALLOWED,
        HTTPStatus.NOT_ACCEPTABLE,
        HTTPStatus.PROXY_AUTHENTICATION_REQUIRED,
        HTTPStatus.GONE,
        HTTPStatus.LENGTH_REQUIRED,
        HTTPStatus.REQUEST_ENTITY_TOO_LARGE,
        HTTPStatus.REQUEST_URI_TOO_LONG,
        HTTPStatus.UNSUPPORTED_MEDIA_TYPE,
        HTTPStatus.REQUESTED_RANGE_NOT_SATISFIABLE,
        HTTPStatus.EXPECTATION_FAILED,
        HTTPStatus.IM_A_TEAPOT,  # For completeness :)
        HTTPStatus.MISDIRECTED_REQUEST,
        HTTPStatus.UNPROCESSABLE_ENTITY,
        HTTPStatus.REQUEST_HEADER_FIELDS_TOO_LARGE,
        HTTPStatus.UNAVAILABLE_FOR_LEGAL_REASONS,
    )

Review comment: The error codes (particularly which are 4XX vs 5XX) are not obvious here. Actually, upon spot-checking, it looks like these may all be 4XX errors. Does this mean that all 5XX and a few 4XX errors will be retried? Is there a way to make that, esp. about 5XX errors, more obvious in the code? Like maybe invert the condition: (pseudocode)

Reply: Yes. […] Oh, nice.
    if isinstance(result, Exception):
        repeat_record.handle_exception(result)
    elif is_response(result) and 200 <= result.status_code < 300 or result is True:
        repeat_record.handle_success(result)
    elif is_response(result) and result.status_code in never_gonna_work:
        message = format_response(result)
        repeat_record.handle_payload_error(message)
    else:
        repeat_record.handle_failure(result)
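The inverted condition suggested in the review thread above might look something like the sketch below. It is illustrative only, not code from this PR, and the choice of which 4XX statuses count as retryable is an assumption:

```python
from http import HTTPStatus

# 4XX statuses that are plausibly transient and therefore worth retrying;
# everything else in the 4XX range points at a problem with the request itself.
RETRYABLE_4XX = {
    HTTPStatus.REQUEST_TIMEOUT,    # 408
    HTTPStatus.CONFLICT,           # 409
    HTTPStatus.TOO_MANY_REQUESTS,  # 429
}


def should_retry(status_code):
    """Return True if a failed request with this status is worth retrying."""
    if 400 <= status_code < 500:
        return status_code in RETRYABLE_4XX
    # 5XX responses (and anything unexpected) are treated as transient
    # problems on the remote end and retried with backoff.
    return True
```

Whether a small retry allowlist reads more clearly than the `never_gonna_work` tuple is exactly the judgment call being discussed above.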
@@ -916,10 +956,21 @@ def count_by_repeater_and_state(self, domain):
    return result

def count_overdue(self, threshold=timedelta(minutes=10)):
    return self.filter(
        next_check__isnull=False,
        next_check__lt=datetime.utcnow() - threshold
    ).count()
    overdue = datetime.utcnow() - threshold
    domains = get_domains_forwarding_enabled()
    repeater_not_paused = models.Q(repeater__is_paused=False)
    repeater_next_attempt_overdue = models.Q(repeater__next_attempt_at__lt=overdue)
    ready_to_send = models.Q(
        state__in=(State.Pending, State.Fail)
    )
    return (
        self.get_queryset()
        .filter(domain__in=domains)
        .filter(repeater_not_paused)
        .filter(repeater_next_attempt_overdue)
        .filter(ready_to_send)
        .count()
    )

Review comment: Have you profiled this query? Will we need any new indexes to make it fast?

def iterate(self, domain, repeater_id=None, state=None, chunk_size=1000):
    db = router.db_for_read(self.model)
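One way to answer the profiling question above is to ask PostgreSQL for the query plan from a Django shell on an environment with representative data. The filter below mirrors the new `count_overdue()` logic; the `State` import path comes from this diff, but the model and helper import names are assumptions:

```python
from datetime import datetime, timedelta

from corehq.motech.repeaters.const import State
from corehq.motech.repeaters.models import (  # names assumed
    RepeatRecord,
    get_domains_forwarding_enabled,
)

overdue = datetime.utcnow() - timedelta(minutes=10)
queryset = (
    RepeatRecord.objects
    .filter(domain__in=get_domains_forwarding_enabled())
    .filter(repeater__is_paused=False)
    .filter(repeater__next_attempt_at__lt=overdue)
    .filter(state__in=(State.Pending, State.Fail))
)
# EXPLAIN ANALYZE on PostgreSQL; shows whether the join and the state
# filter hit indexes or fall back to sequential scans.
print(queryset.explain(analyze=True))
```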
@@ -1154,9 +1205,12 @@ def fire(self, force_send=False):
        except Exception as e:
            log_repeater_error_in_datadog(self.domain, status_code=None,
                                          repeater_type=self.repeater_type)
            self.handle_payload_exception(e)
            raise
            self.handle_payload_error(str(e))
        finally:
            return self.state
    return None

# TODO: Drop: `process_repeater` task will call `process_repeat_record` tasks directly
def attempt_forward_now(self, *, is_retry=False, fire_synchronously=False):
    from corehq.motech.repeaters.tasks import (
        process_repeat_record,
@@ -1206,8 +1260,8 @@ def handle_failure(self, response):
def handle_exception(self, exception):
    self.add_client_failure_attempt(str(exception))

def handle_payload_exception(self, exception):
    self.add_client_failure_attempt(str(exception), retry=False)
def handle_payload_error(self, message):
    self.add_client_failure_attempt(message, retry=False)

def cancel(self):
    self.state = State.Cancelled
@@ -1370,7 +1424,29 @@ def is_response(duck):

def domain_can_forward(domain):
    """
    Checks whether ``domain`` has the privilege to forward data. Ignores
    the status of the (temporary) ``PAUSE_DATA_FORWARDING`` toggle.

    Used for registering repeat records.
    """
    return domain and (
        domain_has_privilege(domain, ZAPIER_INTEGRATION)
        or domain_has_privilege(domain, DATA_FORWARDING)
    )
def get_domains_forwarding_enabled():
    """
    Returns a set of domains that are *currently* able to forward data.
    Considers the status of the (temporary) ``PAUSE_DATA_FORWARDING``
    toggle.

    Used for iterating repeaters and counting overdue repeat records.
    """
    domains_can_forward = {
        domain for domain in Domain.get_all_names()
        if domain_can_forward(domain)
    }
    domains_paused = set(toggles.PAUSE_DATA_FORWARDING.get_enabled_domains())
    return domains_can_forward - domains_paused

Review comment: This will pull thousands of domain names on production, right? Are most of them allowed to forward? If yes, seems like this will make some very big queries when using conditions like […] If not, would it make sense to add a Couch view to make it efficient to grab the few domains that can forward?

Reply: Pro Plan and higher. […] Not yet. An alternative is to filter out repeaters for domains that can't forward as we iterate the repeaters. That wouldn't be too bad, but it would make metrics like "overdue_repeat_records" less accurate. ("overdue_repeat_records" currently includes the repeat records of domains that have been paused. I'm undecided whether that's correct or not.)
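The "filter as we iterate" alternative mentioned in the reply could look roughly like this. The generator name is hypothetical, and it assumes an `all_ready()` manager method without the `domain__in` filter; `Repeater`, `domain_can_forward` and the toggle are the names introduced in this diff:

```python
from corehq import toggles
from corehq.motech.repeaters.models import Repeater, domain_can_forward


def iter_ready_repeaters():
    """Yield ready repeaters, skipping domains that can't currently forward."""
    paused_domains = set(toggles.PAUSE_DATA_FORWARDING.get_enabled_domains())
    for repeater in Repeater.objects.all_ready():
        if repeater.domain in paused_domains:
            continue
        if not domain_can_forward(repeater.domain):
            continue
        yield repeater
```

As the reply notes, this keeps the repeater query small but makes metrics such as "overdue_repeat_records" harder to keep accurate.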
Review comment: We have an index `next_check_not_null` and a check constraint `next_check_pending_or_null` that validates and keeps `next_check` in sync with `state`. If we're dropping `next_check` then we'll need to redefine that index to make this condition efficient. Looks like we'll want to include `repeater_id` in that new index.
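If `next_check` goes away, the replacement index described in that comment could be a partial index over the "ready" states that includes `repeater_id`. A rough sketch as a Django migration; the index name, field choices, and migration dependency are assumptions, not part of this PR:

```python
from django.db import migrations, models

from corehq.motech.repeaters.const import State


class Migration(migrations.Migration):

    dependencies = [
        ('repeaters', '00XX_previous_migration'),  # placeholder
    ]

    operations = [
        migrations.AddIndex(
            model_name='repeatrecord',
            index=models.Index(
                name='repeatrecord_ready_idx',
                fields=['repeater', 'registered_at'],
                # Partial index: only rows that may still need to be sent,
                # mirroring the state__in=(State.Pending, State.Fail) filters
                # used by all_ready() and count_overdue().
                condition=models.Q(state__in=(State.Pending, State.Fail)),
            ),
        ),
    ]
```

Ordering the index by `registered_at` also lines up with the new `.order_by('registered_at')` in `repeat_records_ready`, so the per-repeater "next record to send" lookup could be served from the same index.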