Merge pull request #27244 from bluca/uphold_retry

Uphold/StopWhenUnneeded/BindsTo: add retry timer on rate limit
This commit is contained in:
Luca Boccassi
2023-04-13 21:33:06 +01:00
committed by GitHub
10 changed files with 132 additions and 6 deletions

View File

@@ -38,3 +38,12 @@ unsigned ratelimit_num_dropped(RateLimit *r) {
return r->num > r->burst ? r->num - r->burst : 0;
}
/* Returns the timestamp at which the current rate limit window ends, computed with usec_add() from the
 * window's begin time and its interval. If no window has begun yet (begin is zero), returns 0. */
usec_t ratelimit_end(const RateLimit *rl) {
        assert(rl);

        return rl->begin != 0 ? usec_add(rl->begin, rl->interval) : 0;
}

View File

@@ -23,3 +23,5 @@ static inline bool ratelimit_configured(RateLimit *rl) {
bool ratelimit_below(RateLimit *r);
unsigned ratelimit_num_dropped(RateLimit *r);
usec_t ratelimit_end(const RateLimit *rl);

View File

@@ -1051,6 +1051,12 @@ finish:
job_add_to_gc_queue(other->job);
}
/* Ensure that when an upheld/unneeded/bound unit activation job fails we requeue it, if it is still
* necessary. If there are no state changes in the triggerer, it would not be retried otherwise. */
unit_submit_to_start_when_upheld_queue(u);
unit_submit_to_stop_when_bound_queue(u);
unit_submit_to_stop_when_unneeded_queue(u);
manager_check_finished(u->manager);
return 0;

View File

@@ -1409,6 +1409,48 @@ static unsigned manager_dispatch_gc_job_queue(Manager *m) {
return n;
}
/* Timer callback armed by manager_ratelimit_check_and_queue(). 'userdata' is the Unit the timer was
 * registered for; 'usec' is not used. Drops the unit's reference to the timer event source and puts the
 * unit back on the stop-when-unneeded, start-when-upheld and stop-when-bound queues. Always returns 0. */
static int manager_ratelimit_requeue(sd_event_source *s, uint64_t usec, void *userdata) {
        Unit *unit = userdata;

        assert(unit);
        assert(s == unit->auto_start_stop_event_source);

        /* Release the timer source and clear the field, so that a new timer can be installed the next
         * time the rate limit is hit. */
        unit->auto_start_stop_event_source = sd_event_source_unref(unit->auto_start_stop_event_source);

        /* We cannot tell which of the queues was throttled when the rate limit was hit, hence submit the
         * unit to every one of them. */
        unit_submit_to_stop_when_unneeded_queue(unit);
        unit_submit_to_start_when_upheld_queue(unit);
        unit_submit_to_stop_when_bound_queue(unit);

        return 0;
}
/* Checks the unit's automatic start/stop rate limit and, if it has been hit, arranges for a retry.
 *
 * Returns 1 if ratelimit_below() reports that we may proceed right away. Returns 0 if the limit was hit
 * and a retry timer is pending (either one that already existed, or one installed by this call). If
 * installing the timer fails, the error is logged and the result of log_unit_error_errno() is returned. */
static int manager_ratelimit_check_and_queue(Unit *u) {
        sd_event_source **slot;
        int q;

        assert(u);

        if (ratelimit_below(&u->auto_start_stop_ratelimit))
                return 1;

        slot = &u->auto_start_stop_event_source;

        /* A retry timer is pending already, nothing more to do. */
        if (*slot)
                return 0;

        /* Arm a CLOCK_MONOTONIC timer for the time reported by ratelimit_end(), with the unit as userdata
         * for manager_ratelimit_requeue(). */
        q = sd_event_add_time(
                        u->manager->event,
                        slot,
                        CLOCK_MONOTONIC,
                        ratelimit_end(&u->auto_start_stop_ratelimit),
                        0,
                        manager_ratelimit_requeue,
                        u);
        if (q >= 0)
                return 0;

        return log_unit_error_errno(u, q, "Failed to queue timer on event loop: %m");
}
static unsigned manager_dispatch_stop_when_unneeded_queue(Manager *m) {
unsigned n = 0;
Unit *u;
@@ -1433,8 +1475,11 @@ static unsigned manager_dispatch_stop_when_unneeded_queue(Manager *m) {
/* If stopping a unit fails continuously we might enter a stop loop here, hence stop acting on the
* service being unnecessary after a while. */
if (!ratelimit_below(&u->auto_start_stop_ratelimit)) {
log_unit_warning(u, "Unit not needed anymore, but not stopping since we tried this too often recently.");
r = manager_ratelimit_check_and_queue(u);
if (r <= 0) {
log_unit_warning(u,
"Unit not needed anymore, but not stopping since we tried this too often recently.%s",
r == 0 ? " Will retry later." : "");
continue;
}
@@ -1472,8 +1517,12 @@ static unsigned manager_dispatch_start_when_upheld_queue(Manager *m) {
/* If starting a unit fails continuously we might enter a start loop here, hence stop acting on the
* unit being upheld after a while. */
if (!ratelimit_below(&u->auto_start_stop_ratelimit)) {
log_unit_warning(u, "Unit needs to be started because active unit %s upholds it, but not starting since we tried this too often recently.", culprit->id);
r = manager_ratelimit_check_and_queue(u);
if (r <= 0) {
log_unit_warning(u,
"Unit needs to be started because active unit %s upholds it, but not starting since we tried this too often recently.%s",
culprit->id,
r == 0 ? " Will retry later." : "");
continue;
}
@@ -1510,8 +1559,12 @@ static unsigned manager_dispatch_stop_when_bound_queue(Manager *m) {
/* If stopping a unit fails continuously we might enter a stop loop here, hence stop acting on the
* service being unnecessary after a while. */
if (!ratelimit_below(&u->auto_start_stop_ratelimit)) {
log_unit_warning(u, "Unit needs to be stopped because it is bound to inactive unit %s it, but not stopping since we tried this too often recently.", culprit->id);
r = manager_ratelimit_check_and_queue(u);
if (r <= 0) {
log_unit_warning(u,
"Unit needs to be stopped because it is bound to inactive unit %s it, but not stopping since we tried this too often recently.%s",
culprit->id,
r == 0 ? " Will retry later." : "");
continue;
}

View File

@@ -732,6 +732,8 @@ Unit* unit_free(Unit *u) {
if (!u)
return NULL;
sd_event_source_disable_unref(u->auto_start_stop_event_source);
u->transient_file = safe_fclose(u->transient_file);
if (!MANAGER_IS_RELOADING(u->manager))

View File

@@ -350,6 +350,7 @@ typedef struct Unit {
/* Make sure we never enter endless loops with the StopWhenUnneeded=, BindsTo=, Uphold= logic */
RateLimit auto_start_stop_ratelimit;
sd_event_source *auto_start_stop_event_source;
/* Reference to a specific UID/GID */
uid_t ref_uid;

View File

@@ -0,0 +1,9 @@
# SPDX-License-Identifier: LGPL-2.1-or-later
[Unit]
Description=Failed Dependency Unit
[Service]
Type=oneshot
RemainAfterExit=yes
ExecStart=/bin/sh -c "if [ -f /tmp/testsuite-57-retry-fail ]; then exit 0; else exit 1; fi"
Restart=no

View File

@@ -0,0 +1,10 @@
# SPDX-License-Identifier: LGPL-2.1-or-later
[Unit]
Description=Upheld Unit
Requires=testsuite-57-retry-fail.service
After=testsuite-57-retry-fail.service
[Service]
Type=oneshot
RemainAfterExit=yes
ExecStart=/bin/echo ok

View File

@@ -0,0 +1,7 @@
# SPDX-License-Identifier: LGPL-2.1-or-later
[Unit]
Description=Upholding Unit
Upholds=testsuite-57-retry-upheld.service
[Service]
ExecStart=/bin/sleep infinity

View File

@@ -26,6 +26,33 @@ done
systemctl stop testsuite-57-uphold.service
# Idea is this:
# 1. we start testsuite-57-retry-uphold.service
# 2. which through Upholds= starts testsuite-57-retry-upheld.service
# 3. which through Requires= starts testsuite-57-retry-fail.service
# 4. which fails as /tmp/testsuite-57-retry-fail does not exist, so testsuite-57-retry-upheld.service
# is no longer restarted
# 5. we create /tmp/testsuite-57-retry-fail
# 6. now testsuite-57-retry-upheld.service will be restarted since upheld, and its dependency will
# be satisfied
# Make sure the flag file is absent, so that testsuite-57-retry-fail.service exits with failure
rm -f /tmp/testsuite-57-retry-fail
systemctl start testsuite-57-retry-uphold.service
# Poll until the dependency has been started and has entered the failed state
while ! systemctl is-failed testsuite-57-retry-fail.service ; do
sleep .5
done
# While its required dependency is failing, the upheld unit must not be active
systemctl is-active testsuite-57-retry-upheld.service && { echo 'unexpected success'; exit 1; }
# Create the flag file, so that the next start attempt of the dependency succeeds
touch /tmp/testsuite-57-retry-fail
# Poll until the upheld unit has become active; nothing is started manually here
while ! systemctl is-active testsuite-57-retry-upheld.service ; do
sleep .5
done
# Clean up: stop all three units used by this test case
systemctl stop testsuite-57-retry-uphold.service testsuite-57-retry-fail.service testsuite-57-retry-upheld.service
# Idea is this:
# 1. we start testsuite-57-prop-stop-one.service
# 2. which through Wants=/After= pulls in testsuite-57-prop-stop-two.service as well