Two regression fixes for the timer and timer migration code:

1) Prevent endless timer requeuing which is caused by two CPUs racing out of idle. This happens when the last CPU goes idle and therefore has to ensure to expire the pending global timers and some other CPU come out of idle at the same time and the other CPU wins the race and expires the global queue. This causes the last CPU to chase ghost timers forever and reprogramming it's clockevent device endlessly. Cure this by re-evaluating the wakeup time unconditionally. 2) The split into local (pinned) and global timers in the timer wheel caused a regression for NOHZ full as it broke the idle tracking of global timers. On NOHZ full this prevents an self IPI being sent which in turn causes the timer to be not programmed and not being expired on time. Restore the idle tracking for the global timer base so that the self IPI condition for NOHZ full is working correctly again. -----BEGIN PGP SIGNATURE----- iQJHBAABCgAxFiEEQp8+kY+LLUocC4bMphj1TA10mKEFAmX/Mn8THHRnbHhAbGlu dXRyb25peC5kZQAKCRCmGPVMDXSYoYX0D/93fKa9qp+qBs72Vvctj4bJuk6auel7 GRWx3Vc//kk4VOJFCteQ2eykr1/fsLuibPb67iiKp41stvaWKeJPj0kD+RUuVf8E dPGPpPU+qY5ynhoiqJekZL7+5NSA48y4bDc00a4U31MPpEcJB5y94zFOCKMiCWtk 6Tf6I6168bsSFYvKqb2LImVoowu/bf7bXLVUk1HcdNnSC7bfx+yN8nkQ1zSy3K2M IyE1CQqMyDmdfKW9Vs68ooTIpuA0n7bxOuXbVaFdJyiJ035v3Z3+m2vQmrHHLDdz MfqHbFDEomDC+zfiugFvuxyxLIi2Gf/NXPibu6OxLkVe2Pu1KUJkhFbgZVUR2W6A EU6SmZr77zMPAMZyqG8OJqTqlCPiJfJX2KMWDF+ezbXBt+sbMe6LfazPqj9TopnN /ECMCt77xl1POCEnP81hPWizKsqf8HCTDDZEi9UlqbIxT3TgrZFlTnKfmmciwWiP uGoUXgZmi8qJ+lSGNTVUgbTmIbazvUz43sKgfndUy2yxeCb/SBlx1/8Ys2ntszOy XDOF8QroPH0zXlBaYo0QVZbOdB4O0/En1qZuGScBmoUY7bRr0NRD/C3ObhvQHI7C iguAsnB+zirwwZSTDzwhQDhXAtWgSaqBB7pb4aCDxC0AvnKtL5HKdDNeIafpPJeA 4Xh40iu44u/V9w== =lY5O -----END PGP SIGNATURE----- Merge tag 'timers-urgent-2024-03-23' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip Pull timer fixes from Thomas Gleixner: "Two regression fixes for the timer and timer migration code: - Prevent endless timer requeuing which is caused by two CPUs racing out of idle. This happens when the last CPU goes idle and therefore has to ensure to expire the pending global timers and some other CPU come out of idle at the same time and the other CPU wins the race and expires the global queue. This causes the last CPU to chase ghost timers forever and reprogramming it's clockevent device endlessly. Cure this by re-evaluating the wakeup time unconditionally. - The split into local (pinned) and global timers in the timer wheel caused a regression for NOHZ full as it broke the idle tracking of global timers. On NOHZ full this prevents an self IPI being sent which in turn causes the timer to be not programmed and not being expired on time. Restore the idle tracking for the global timer base so that the self IPI condition for NOHZ full is working correctly again" * tag 'timers-urgent-2024-03-23' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: timers: Fix removed self-IPI on global timer's enqueue in nohz_full timers/migration: Fix endless timer requeue after idle interrupts
2024-07-21 10:41:44 +00:00 · 2024-03-23 14:49:25 -07:00 · 2024-03-23 14:49:25 -07:00 · 70293240c5
parent 00164f477f 0387703986
commit 70293240c5
2 changed files with 20 additions and 3 deletions
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@ -642,7 +642,8 @@ trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
 	 * the base lock:
 	 */
 	if (base->is_idle) {
-		WARN_ON_ONCE(!(timer->flags & TIMER_PINNED));
+		WARN_ON_ONCE(!(timer->flags & TIMER_PINNED ||
+			       tick_nohz_full_cpu(base->cpu)));
 		wake_up_nohz_cpu(base->cpu);
 	}
 }
@ -2292,6 +2293,13 @@ static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem,
 		 */
 		if (!base_local->is_idle && time_after(nextevt, basej + 1)) {
 			base_local->is_idle = true;
+			/*
+			 * Global timers queued locally while running in a task
+			 * in nohz_full mode need a self-IPI to kick reprogramming
+			 * in IRQ tail.
+			 */
+			if (tick_nohz_full_cpu(base_local->cpu))
+				base_global->is_idle = true;
 			trace_timer_base_idle(true, base_local->cpu);
 		}
 		*idle = base_local->is_idle;
@ -2364,6 +2372,8 @@ void timer_clear_idle(void)
 	 * path. Required for BASE_LOCAL only.
 	 */
 	__this_cpu_write(timer_bases[BASE_LOCAL].is_idle, false);
+	if (tick_nohz_full_cpu(smp_processor_id()))
+		__this_cpu_write(timer_bases[BASE_GLOBAL].is_idle, false);
 	trace_timer_base_idle(false, smp_processor_id());

 	/* Activate without holding the timer_base->lock */
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@ -1038,8 +1038,15 @@ void tmigr_handle_remote(void)
 	 * in tmigr_handle_remote_up() anyway. Keep this check to speed up the
 	 * return when nothing has to be done.
 	 */
-	if (!tmigr_check_migrator(tmc->tmgroup, tmc->childmask))
-		return;
+	if (!tmigr_check_migrator(tmc->tmgroup, tmc->childmask)) {
+		/*
+		 * If this CPU was an idle migrator, make sure to clear its wakeup
+		 * value so it won't chase timers that have already expired elsewhere.
+		 * This avoids endless requeue from tmigr_new_timer().
+		 */
+		if (READ_ONCE(tmc->wakeup) == KTIME_MAX)
+			return;
+	}

 	data.now = get_jiffies_update(&data.basej);