From 7223d500ac548c69e7879931e3ad8c84838f925b Mon Sep 17 00:00:00 2001 From: Luca Boccassi Date: Wed, 12 Apr 2023 20:14:17 +0100 Subject: [PATCH 1/2] Uphold/StopWhenUnneeded/BindsTo: add retry timer on rate limit The Upholds= promise is that as long as unit A is up and Upholds=B, B will be activated if failed or inactive. But there is a hard-coded, non-configurable rate limit for this, so add a timed retry after the ratelimit has expired. Apply to BindsTo= and StopWhenUnneeded= as well. --- src/basic/ratelimit.c | 9 ++++++ src/basic/ratelimit.h | 2 ++ src/core/manager.c | 65 +++++++++++++++++++++++++++++++++++++++---- src/core/unit.c | 2 ++ src/core/unit.h | 1 + 5 files changed, 73 insertions(+), 6 deletions(-) diff --git a/src/basic/ratelimit.c b/src/basic/ratelimit.c index f90a63b1a9..a0260bfe1c 100644 --- a/src/basic/ratelimit.c +++ b/src/basic/ratelimit.c @@ -38,3 +38,12 @@ unsigned ratelimit_num_dropped(RateLimit *r) { return r->num > r->burst ? r->num - r->burst : 0; } + +usec_t ratelimit_end(const RateLimit *rl) { + assert(rl); + + if (rl->begin == 0) + return 0; + + return usec_add(rl->begin, rl->interval); +} diff --git a/src/basic/ratelimit.h b/src/basic/ratelimit.h index 2236189851..048084ece4 100644 --- a/src/basic/ratelimit.h +++ b/src/basic/ratelimit.h @@ -23,3 +23,5 @@ static inline bool ratelimit_configured(RateLimit *rl) { bool ratelimit_below(RateLimit *r); unsigned ratelimit_num_dropped(RateLimit *r); + +usec_t ratelimit_end(const RateLimit *rl); diff --git a/src/core/manager.c b/src/core/manager.c index 47a502df7f..fd576ef25b 100644 --- a/src/core/manager.c +++ b/src/core/manager.c @@ -1389,6 +1389,48 @@ static unsigned manager_dispatch_gc_job_queue(Manager *m) { return n; } +static int manager_ratelimit_requeue(sd_event_source *s, uint64_t usec, void *userdata) { + Unit *u = userdata; + + assert(u); + assert(s == u->auto_start_stop_event_source); + + u->auto_start_stop_event_source = sd_event_source_unref(u->auto_start_stop_event_source); + + /* Re-queue to all queues, if the rate limit hit we might have been throttled on any of them. */ + unit_submit_to_stop_when_unneeded_queue(u); + unit_submit_to_start_when_upheld_queue(u); + unit_submit_to_stop_when_bound_queue(u); + + return 0; +} + +static int manager_ratelimit_check_and_queue(Unit *u) { + int r; + + assert(u); + + if (ratelimit_below(&u->auto_start_stop_ratelimit)) + return 1; + + /* Already queued, no need to requeue */ + if (u->auto_start_stop_event_source) + return 0; + + r = sd_event_add_time( + u->manager->event, + &u->auto_start_stop_event_source, + CLOCK_MONOTONIC, + ratelimit_end(&u->auto_start_stop_ratelimit), + 0, + manager_ratelimit_requeue, + u); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to queue timer on event loop: %m"); + + return 0; +} + static unsigned manager_dispatch_stop_when_unneeded_queue(Manager *m) { unsigned n = 0; Unit *u; @@ -1413,8 +1455,11 @@ static unsigned manager_dispatch_stop_when_unneeded_queue(Manager *m) { /* If stopping a unit fails continuously we might enter a stop loop here, hence stop acting on the * service being unnecessary after a while. */ - if (!ratelimit_below(&u->auto_start_stop_ratelimit)) { - log_unit_warning(u, "Unit not needed anymore, but not stopping since we tried this too often recently."); + r = manager_ratelimit_check_and_queue(u); + if (r <= 0) { + log_unit_warning(u, + "Unit not needed anymore, but not stopping since we tried this too often recently.%s", + r == 0 ? " Will retry later." : ""); continue; } @@ -1452,8 +1497,12 @@ static unsigned manager_dispatch_start_when_upheld_queue(Manager *m) { /* If stopping a unit fails continuously we might enter a stop loop here, hence stop acting on the * service being unnecessary after a while. */ - if (!ratelimit_below(&u->auto_start_stop_ratelimit)) { - log_unit_warning(u, "Unit needs to be started because active unit %s upholds it, but not starting since we tried this too often recently.", culprit->id); + r = manager_ratelimit_check_and_queue(u); + if (r <= 0) { + log_unit_warning(u, + "Unit needs to be started because active unit %s upholds it, but not starting since we tried this too often recently.%s", + culprit->id, + r == 0 ? " Will retry later." : ""); continue; } @@ -1490,8 +1539,12 @@ static unsigned manager_dispatch_stop_when_bound_queue(Manager *m) { /* If stopping a unit fails continuously we might enter a stop loop here, hence stop acting on the * service being unnecessary after a while. */ - if (!ratelimit_below(&u->auto_start_stop_ratelimit)) { - log_unit_warning(u, "Unit needs to be stopped because it is bound to inactive unit %s it, but not stopping since we tried this too often recently.", culprit->id); + r = manager_ratelimit_check_and_queue(u); + if (r <= 0) { + log_unit_warning(u, + "Unit needs to be stopped because it is bound to inactive unit %s it, but not stopping since we tried this too often recently.%s", + culprit->id, + r == 0 ? " Will retry later." : ""); continue; } diff --git a/src/core/unit.c b/src/core/unit.c index 409801aed2..f3566a5672 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -676,6 +676,8 @@ Unit* unit_free(Unit *u) { if (!u) return NULL; + sd_event_source_disable_unref(u->auto_start_stop_event_source); + u->transient_file = safe_fclose(u->transient_file); if (!MANAGER_IS_RELOADING(u->manager)) diff --git a/src/core/unit.h b/src/core/unit.h index 420405b2b7..dd0d42dc90 100644 --- a/src/core/unit.h +++ b/src/core/unit.h @@ -347,6 +347,7 @@ typedef struct Unit { /* Make sure we never enter endless loops with the StopWhenUnneeded=, BindsTo=, Uphold= logic */ RateLimit auto_start_stop_ratelimit; + sd_event_source *auto_start_stop_event_source; /* Reference to a specific UID/GID */ uid_t ref_uid; From 4c7a0fc8d061b41fdd63eb19b6fc0a5c94668dde Mon Sep 17 00:00:00 2001 From: Luca Boccassi Date: Wed, 12 Apr 2023 21:37:45 +0100 Subject: [PATCH 2/2] Uphold/StopWhenUnneeded/BindsTo: requeue when job finishes When a unit is upheld and fails, and there are no state changes in the upholder, it will not be retried, which is against what the documentation suggests. Requeue when the job finishes. Same for the other two queues. --- src/core/job.c | 6 +++++ test/units/testsuite-57-retry-fail.service | 9 +++++++ test/units/testsuite-57-retry-upheld.service | 10 ++++++++ test/units/testsuite-57-retry-uphold.service | 7 +++++ test/units/testsuite-57.sh | 27 ++++++++++++++++++++ 5 files changed, 59 insertions(+) create mode 100644 test/units/testsuite-57-retry-fail.service create mode 100644 test/units/testsuite-57-retry-upheld.service create mode 100644 test/units/testsuite-57-retry-uphold.service diff --git a/src/core/job.c b/src/core/job.c index 334fbf770e..826ef6809c 100644 --- a/src/core/job.c +++ b/src/core/job.c @@ -1051,6 +1051,12 @@ finish: job_add_to_gc_queue(other->job); } + /* Ensure that when an upheld/unneeded/bound unit activation job fails we requeue it, if it still + * necessary. If there are no state changes in the triggerer, it would not be retried otherwise. */ + unit_submit_to_start_when_upheld_queue(u); + unit_submit_to_stop_when_bound_queue(u); + unit_submit_to_stop_when_unneeded_queue(u); + manager_check_finished(u->manager); return 0; diff --git a/test/units/testsuite-57-retry-fail.service b/test/units/testsuite-57-retry-fail.service new file mode 100644 index 0000000000..67f34079d5 --- /dev/null +++ b/test/units/testsuite-57-retry-fail.service @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later +[Unit] +Description=Failed Dependency Unit + +[Service] +Type=oneshot +RemainAfterExit=yes +ExecStart=/bin/sh -c "if [ -f /tmp/testsuite-57-retry-fail ]; then exit 0; else exit 1; fi" +Restart=no diff --git a/test/units/testsuite-57-retry-upheld.service b/test/units/testsuite-57-retry-upheld.service new file mode 100644 index 0000000000..2f718a61fa --- /dev/null +++ b/test/units/testsuite-57-retry-upheld.service @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later +[Unit] +Description=Upheld Unit +Requires=testsuite-57-retry-fail.service +After=testsuite-57-retry-fail.service + +[Service] +Type=oneshot +RemainAfterExit=yes +ExecStart=/bin/echo ok diff --git a/test/units/testsuite-57-retry-uphold.service b/test/units/testsuite-57-retry-uphold.service new file mode 100644 index 0000000000..a01b131ed5 --- /dev/null +++ b/test/units/testsuite-57-retry-uphold.service @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later +[Unit] +Description=Upholding Unit +Upholds=testsuite-57-retry-upheld.service + +[Service] +ExecStart=/bin/sleep infinity diff --git a/test/units/testsuite-57.sh b/test/units/testsuite-57.sh index 66d946bebc..24040c3189 100755 --- a/test/units/testsuite-57.sh +++ b/test/units/testsuite-57.sh @@ -26,6 +26,33 @@ done systemctl stop testsuite-57-uphold.service +# Idea is this: +# 1. we start testsuite-57-retry-uphold.service +# 2. which through Uphold= starts testsuite-57-retry-upheld.service +# 3. which through Requires= starts testsuite-57-retry-fail.service +# 4. which fails as /tmp/testsuite-57-retry-fail does not exist, so testsuite-57-retry-upheld.service +# is no longer restarted +# 5. we create /tmp/testsuite-57-retry-fail +# 6. now testsuite-57-retry-upheld.service will be restarted since upheld, and its dependency will +# be satisfied + +rm -f /tmp/testsuite-57-retry-fail +systemctl start testsuite-57-retry-uphold.service + +while ! systemctl is-failed testsuite-57-retry-fail.service ; do + sleep .5 +done + +systemctl is-active testsuite-57-retry-upheld.service && { echo 'unexpected success'; exit 1; } + +touch /tmp/testsuite-57-retry-fail + +while ! systemctl is-active testsuite-57-retry-upheld.service ; do + sleep .5 +done + +systemctl stop testsuite-57-retry-uphold.service testsuite-57-retry-fail.service testsuite-57-retry-upheld.service + # Idea is this: # 1. we start testsuite-57-prop-stop-one.service # 2. which through Wants=/After= pulls in testsuite-57-prop-stop-two.service as well