From a364ebd46d9fef0b16598aca8a723b6632e90a6a Mon Sep 17 00:00:00 2001 From: Mike Yuan Date: Fri, 8 Nov 2024 16:57:40 +0100 Subject: [PATCH 1/3] core/unit: disable unit debug invocation in generic unit_reset_failed() --- src/core/service.c | 2 -- src/core/unit.c | 3 ++- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/core/service.c b/src/core/service.c index 12b367ccc8..4bad026537 100644 --- a/src/core/service.c +++ b/src/core/service.c @@ -5131,8 +5131,6 @@ static void service_reset_failed(Unit *u) { s->live_mount_result = SERVICE_SUCCESS; s->clean_result = SERVICE_SUCCESS; s->n_restarts = 0; - - (void) unit_set_debug_invocation(u, /* enable= */ false); } static PidRef* service_main_pid(Unit *u, bool *ret_is_alien) { diff --git a/src/core/unit.c b/src/core/unit.c index b784342c08..781de8d594 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -3867,7 +3867,8 @@ void unit_reset_failed(Unit *u) { ratelimit_reset(&u->start_ratelimit); u->start_limit_hit = false; - u->debug_invocation = false; + + (void) unit_set_debug_invocation(u, /* enable= */ false); } Unit *unit_following(Unit *u) { From 406aeb5da691180d7f655564f6543576ec211909 Mon Sep 17 00:00:00 2001 From: Mike Yuan Date: Sat, 26 Oct 2024 01:51:04 +0200 Subject: [PATCH 2/3] core/service: introduce sd_notify() RESTART_RESET=1 for resetting restart counter We have RestartMaxDelaySec= + RestartSteps= to exponentially increase auto restart durations, but it currently cannot be reset by the service itself, which makes it sometimes awkward to use. A typical pattern in real life is that a service was once down (e.g. due to temporary network interruption) and multiple restarts were attempted. Then, future restarts would always wait for increated amount of time based on RestartMaxDelaySec=, even after the original problem got resolved. Such "persistence" could result in longer unavailablity than there should be for failures that come later. (C.f. https://utcc.utoronto.ca/~cks/space/blog/linux/SystemdResettingUnitBackoff) Let's introduce a new sd_notify() notification for resetting the restart counter. There were discussions about making this timer-based, but I think it's more flexible to leave the decision-making to the service. This enables them to do a combination of N successful requests + uptime check for instance. --- man/sd_notify.xml | 16 ++++++++++++++-- src/core/service.c | 11 +++++++++++ 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/man/sd_notify.xml b/man/sd_notify.xml index c017f48487..746789e955 100644 --- a/man/sd_notify.xml +++ b/man/sd_notify.xml @@ -333,7 +333,7 @@ systemd.service5 for information how to enable this functionality and sd_watchdog_enabled3 - for the details of how the service can check whether the watchdog is enabled. + for the details of how the service can check whether the watchdog is enabled. @@ -345,7 +345,7 @@ in time. Note that WatchdogSec= does not need to be enabled for WATCHDOG=trigger to trigger the watchdog action. See systemd.service5 - for information about the watchdog behavior. + for information about the watchdog behavior. @@ -376,6 +376,18 @@ + + RESTART_RESET=1 + + Reset the restart counter of the service, which has the effect of restoring + the restart duration to RestartSec= if RestartSteps= and + RestartMaxDelaySec= are in use. For more information, refer to + systemd.service5. + + + + + FDSTORE=1 diff --git a/src/core/service.c b/src/core/service.c index 4bad026537..ccfa439dd0 100644 --- a/src/core/service.c +++ b/src/core/service.c @@ -4861,6 +4861,17 @@ static void service_notify_message( service_override_watchdog_timeout(s, watchdog_override_usec); } + /* Interpret RESTART_RESET=1 */ + if (strv_contains(tags, "RESTART_RESET=1") && IN_SET(s->state, SERVICE_RUNNING, SERVICE_STOP)) { + log_unit_struct(u, LOG_NOTICE, + LOG_UNIT_MESSAGE(u, "Got RESTART_RESET=1, resetting restart counter from %u.", s->n_restarts), + "N_RESTARTS=0", + LOG_UNIT_INVOCATION_ID(u)); + + s->n_restarts = 0; + notify_dbus = true; + } + /* Process FD store messages. Either FDSTOREREMOVE=1 for removal, or FDSTORE=1 for addition. In both cases, * process FDNAME= for picking the file descriptor name to use. Note that FDNAME= is required when removing * fds, but optional when pushing in new fds, for compatibility reasons. */ From c139ac7f534edb9bc28943e83c7f99c177144c91 Mon Sep 17 00:00:00 2001 From: Mike Yuan Date: Sun, 27 Oct 2024 13:05:01 +0100 Subject: [PATCH 3/3] TODO: support RESTART_RESET=1 in journal-upload --- TODO | 3 +++ 1 file changed, 3 insertions(+) diff --git a/TODO b/TODO index 773edc5590..8d2cfd41ab 100644 --- a/TODO +++ b/TODO @@ -2836,3 +2836,6 @@ Features: * shared/wall: Once more programs are taught to prefer sd-login over utmp, switch the default wall implementation to wall_logind (https://github.com/systemd/systemd/pull/29051#issuecomment-1704917074) + +* Hook up systemd-journal-upload with RESTART_RESET=1 logic (should probably + be conditioned on the num of successfully uploaded entries?)