1  // SPDX-License-Identifier: GPL-2.0
2  /*
3   *  Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
4   *  Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
5   *  Copyright(C) 2006-2007  Timesys Corp., Thomas Gleixner
6   *
7   *  NOHZ implementation for low and high resolution timers
8   *
9   *  Started by: Thomas Gleixner and Ingo Molnar
10   */
11  #include <linux/compiler.h>
12  #include <linux/cpu.h>
13  #include <linux/err.h>
14  #include <linux/hrtimer.h>
15  #include <linux/interrupt.h>
16  #include <linux/kernel_stat.h>
17  #include <linux/percpu.h>
18  #include <linux/nmi.h>
19  #include <linux/profile.h>
20  #include <linux/sched/signal.h>
21  #include <linux/sched/clock.h>
22  #include <linux/sched/stat.h>
23  #include <linux/sched/nohz.h>
24  #include <linux/sched/loadavg.h>
25  #include <linux/module.h>
26  #include <linux/irq_work.h>
27  #include <linux/posix-timers.h>
28  #include <linux/context_tracking.h>
29  #include <linux/mm.h>
30  
31  #include <asm/irq_regs.h>
32  
33  #include "tick-internal.h"
34  
35  #include <trace/events/timer.h>
36  
37  /*
38   * Per-CPU nohz control structure
39   */
40  static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
41  
42  struct tick_sched *tick_get_tick_sched(int cpu)
43  {
44  	return &per_cpu(tick_cpu_sched, cpu);
45  }
46  
47  /*
48   * The time when the last jiffy update happened. Write access must hold
49   * jiffies_lock and jiffies_seq. tick_nohz_next_event() needs to get a
50   * consistent view of jiffies and last_jiffies_update.
51   */
52  static ktime_t last_jiffies_update;
53  
54  /*
55   * Must be called with interrupts disabled !
56   */
57  static void tick_do_update_jiffies64(ktime_t now)
58  {
59  	unsigned long ticks = 1;
60  	ktime_t delta, nextp;
61  
62  	/*
63  	 * 64-bit can do a quick check without holding the jiffies lock and
64  	 * without looking at the sequence count. The smp_load_acquire()
65  	 * pairs with the update done later in this function.
66  	 *
67  	 * 32-bit cannot do that because the store of 'tick_next_period'
68  	 * consists of two 32-bit stores, and the first store could be
69  	 * moved by the CPU to a random point in the future.
70  	 */
71  	if (IS_ENABLED(CONFIG_64BIT)) {
72  		if (ktime_before(now, smp_load_acquire(&tick_next_period)))
73  			return;
74  	} else {
75  		unsigned int seq;
76  
77  		/*
78  		 * Avoid contention on 'jiffies_lock' and protect the quick
79  		 * check with the sequence count.
80  		 */
81  		do {
82  			seq = read_seqcount_begin(&jiffies_seq);
83  			nextp = tick_next_period;
84  		} while (read_seqcount_retry(&jiffies_seq, seq));
85  
86  		if (ktime_before(now, nextp))
87  			return;
88  	}
89  
90  	/* Quick check failed, i.e. update is required. */
91  	raw_spin_lock(&jiffies_lock);
92  	/*
93  	 * Re-evaluate with the lock held. Another CPU might have done the
94  	 * update already.
95  	 */
96  	if (ktime_before(now, tick_next_period)) {
97  		raw_spin_unlock(&jiffies_lock);
98  		return;
99  	}
100  
101  	write_seqcount_begin(&jiffies_seq);
102  
103  	delta = ktime_sub(now, tick_next_period);
104  	if (unlikely(delta >= TICK_NSEC)) {
105  		/* Slow path for long idle sleep times */
106  		s64 incr = TICK_NSEC;
107  
108  		ticks += ktime_divns(delta, incr);
109  
110  		last_jiffies_update = ktime_add_ns(last_jiffies_update,
111  						   incr * ticks);
112  	} else {
113  		last_jiffies_update = ktime_add_ns(last_jiffies_update,
114  						   TICK_NSEC);
115  	}
116  
117  	/* Advance jiffies to complete the 'jiffies_seq' protected job */
118  	jiffies_64 += ticks;
119  
120  	/* Keep the tick_next_period variable up to date */
121  	nextp = ktime_add_ns(last_jiffies_update, TICK_NSEC);
122  
123  	if (IS_ENABLED(CONFIG_64BIT)) {
124  		/*
125  		 * Pairs with smp_load_acquire() in the lockless quick
126  		 * check above, and ensures that the update to 'jiffies_64' is
127  		 * not reordered vs. the store to 'tick_next_period', neither
128  		 * by the compiler nor by the CPU.
129  		 */
130  		smp_store_release(&tick_next_period, nextp);
131  	} else {
132  		/*
133  		 * A plain store is good enough on 32-bit, as the quick check
134  		 * above is protected by the sequence count.
135  		 */
136  		tick_next_period = nextp;
137  	}
138  
139  	/*
140  	 * Release the sequence count. calc_global_load() below is not
141  	 * protected by it, but 'jiffies_lock' needs to be held to prevent
142  	 * concurrent invocations.
143  	 */
144  	write_seqcount_end(&jiffies_seq);
145  
146  	calc_global_load();
147  
148  	raw_spin_unlock(&jiffies_lock);
149  	update_wall_time();
150  }
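
/*
 * Worked example of the catch-up above (illustrative comment only): if a
 * CPU wakes up 3.5 tick periods after 'tick_next_period', then
 * delta = 3.5 * TICK_NSEC and the slow path adds ktime_divns(delta, incr) = 3
 * to the initial ticks = 1. Hence jiffies_64 advances by 4 and
 * last_jiffies_update moves forward by 4 * TICK_NSEC, i.e. onto the last
 * tick boundary at or before 'now'.
 */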
151  
152  /*
153   *  Initialize and return the jiffies update.
154   */
155  static ktime_t tick_init_jiffy_update(void)
156  {
157  	ktime_t period;
158  
159  	raw_spin_lock(&jiffies_lock);
160  	write_seqcount_begin(&jiffies_seq);
161  
162  	/* Have we started the jiffies update yet ? */
163  	if (last_jiffies_update == 0) {
164  		u32 rem;
165  
166  		/*
167  		 * Ensure that the tick is aligned to a multiple of
168  		 * TICK_NSEC.
169  		 */
170  		div_u64_rem(tick_next_period, TICK_NSEC, &rem);
171  		if (rem)
172  			tick_next_period += TICK_NSEC - rem;
173  
174  		last_jiffies_update = tick_next_period;
175  	}
176  	period = last_jiffies_update;
177  
178  	write_seqcount_end(&jiffies_seq);
179  	raw_spin_unlock(&jiffies_lock);
180  
181  	return period;
182  }
183  
184  static inline int tick_sched_flag_test(struct tick_sched *ts,
185  				       unsigned long flag)
186  {
187  	return !!(ts->flags & flag);
188  }
189  
190  static inline void tick_sched_flag_set(struct tick_sched *ts,
191  				       unsigned long flag)
192  {
193  	lockdep_assert_irqs_disabled();
194  	ts->flags |= flag;
195  }
196  
197  static inline void tick_sched_flag_clear(struct tick_sched *ts,
198  					 unsigned long flag)
199  {
200  	lockdep_assert_irqs_disabled();
201  	ts->flags &= ~flag;
202  }
203  
204  #define MAX_STALLED_JIFFIES 5
205  
206  static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
207  {
208  	int tick_cpu, cpu = smp_processor_id();
209  
210  	/*
211  	 * Check if the do_timer duty was dropped. We don't care about
212  	 * concurrency: This happens only when the CPU in charge went
213  	 * into a long sleep. If two CPUs happen to assign themselves to
214  	 * this duty, then the jiffies update is still serialized by
215  	 * 'jiffies_lock'.
216  	 *
217  	 * If nohz_full is enabled, this should not happen because the
218  	 * 'tick_do_timer_cpu' CPU never relinquishes this duty.
219  	 */
220  	tick_cpu = READ_ONCE(tick_do_timer_cpu);
221  
222  	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && unlikely(tick_cpu == TICK_DO_TIMER_NONE)) {
223  #ifdef CONFIG_NO_HZ_FULL
224  		WARN_ON_ONCE(tick_nohz_full_running);
225  #endif
226  		WRITE_ONCE(tick_do_timer_cpu, cpu);
227  		tick_cpu = cpu;
228  	}
229  
230  	/* Check if jiffies need an update */
231  	if (tick_cpu == cpu)
232  		tick_do_update_jiffies64(now);
233  
234  	/*
235  	 * If the jiffies update stalled for too long (timekeeper in stop_machine()
236  	 * or VMEXIT'ed for several msecs), force an update.
237  	 */
238  	if (ts->last_tick_jiffies != jiffies) {
239  		ts->stalled_jiffies = 0;
240  		ts->last_tick_jiffies = READ_ONCE(jiffies);
241  	} else {
242  		if (++ts->stalled_jiffies == MAX_STALLED_JIFFIES) {
243  			tick_do_update_jiffies64(now);
244  			ts->stalled_jiffies = 0;
245  			ts->last_tick_jiffies = READ_ONCE(jiffies);
246  		}
247  	}
248  
249  	if (tick_sched_flag_test(ts, TS_FLAG_INIDLE))
250  		ts->got_idle_tick = 1;
251  }
252  
253  static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
254  {
255  	/*
256  	 * When we are idle and the tick is stopped, we have to touch
257  	 * the watchdog as we might not schedule for a really long
258  	 * time. This happens on completely idle SMP systems while
259  	 * waiting on the login prompt. We also increment the "start of
260  	 * idle" jiffy stamp so the idle accounting adjustment we do
261  	 * when we go busy again does not account too many ticks.
262  	 */
263  	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) &&
264  	    tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
265  		touch_softlockup_watchdog_sched();
266  		if (is_idle_task(current))
267  			ts->idle_jiffies++;
268  		/*
269  		 * In case the current tick fired too early past its expected
270  		 * expiration, make sure we don't bypass the next clock reprogramming
271  		 * to the same deadline.
272  		 */
273  		ts->next_tick = 0;
274  	}
275  
276  	update_process_times(user_mode(regs));
277  	profile_tick(CPU_PROFILING);
278  }
279  
280  /*
281   * We rearm the timer until we get disabled by the idle code.
282   * Called with interrupts disabled.
283   */
284  static enum hrtimer_restart tick_nohz_handler(struct hrtimer *timer)
285  {
286  	struct tick_sched *ts =	container_of(timer, struct tick_sched, sched_timer);
287  	struct pt_regs *regs = get_irq_regs();
288  	ktime_t now = ktime_get();
289  
290  	tick_sched_do_timer(ts, now);
291  
292  	/*
293  	 * Do not call when we are not in IRQ context and have
294  	 * no valid 'regs' pointer
295  	 */
296  	if (regs)
297  		tick_sched_handle(ts, regs);
298  	else
299  		ts->next_tick = 0;
300  
301  	/*
302  	 * In dynticks mode, tick reprogram is deferred:
303  	 * - to the idle task if in dynticks-idle
304  	 * - to IRQ exit if in full-dynticks.
305  	 */
306  	if (unlikely(tick_sched_flag_test(ts, TS_FLAG_STOPPED)))
307  		return HRTIMER_NORESTART;
308  
309  	hrtimer_forward(timer, now, TICK_NSEC);
310  
311  	return HRTIMER_RESTART;
312  }
313  
314  static void tick_sched_timer_cancel(struct tick_sched *ts)
315  {
316  	if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES))
317  		hrtimer_cancel(&ts->sched_timer);
318  	else if (tick_sched_flag_test(ts, TS_FLAG_NOHZ))
319  		tick_program_event(KTIME_MAX, 1);
320  }
321  
322  #ifdef CONFIG_NO_HZ_FULL
323  cpumask_var_t tick_nohz_full_mask;
324  EXPORT_SYMBOL_GPL(tick_nohz_full_mask);
325  bool tick_nohz_full_running;
326  EXPORT_SYMBOL_GPL(tick_nohz_full_running);
327  static atomic_t tick_dep_mask;
328  
329  static bool check_tick_dependency(atomic_t *dep)
330  {
331  	int val = atomic_read(dep);
332  
333  	if (val & TICK_DEP_MASK_POSIX_TIMER) {
334  		trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER);
335  		return true;
336  	}
337  
338  	if (val & TICK_DEP_MASK_PERF_EVENTS) {
339  		trace_tick_stop(0, TICK_DEP_MASK_PERF_EVENTS);
340  		return true;
341  	}
342  
343  	if (val & TICK_DEP_MASK_SCHED) {
344  		trace_tick_stop(0, TICK_DEP_MASK_SCHED);
345  		return true;
346  	}
347  
348  	if (val & TICK_DEP_MASK_CLOCK_UNSTABLE) {
349  		trace_tick_stop(0, TICK_DEP_MASK_CLOCK_UNSTABLE);
350  		return true;
351  	}
352  
353  	if (val & TICK_DEP_MASK_RCU) {
354  		trace_tick_stop(0, TICK_DEP_MASK_RCU);
355  		return true;
356  	}
357  
358  	if (val & TICK_DEP_MASK_RCU_EXP) {
359  		trace_tick_stop(0, TICK_DEP_MASK_RCU_EXP);
360  		return true;
361  	}
362  
363  	return false;
364  }
365  
366  static bool can_stop_full_tick(int cpu, struct tick_sched *ts)
367  {
368  	lockdep_assert_irqs_disabled();
369  
370  	if (unlikely(!cpu_online(cpu)))
371  		return false;
372  
373  	if (check_tick_dependency(&tick_dep_mask))
374  		return false;
375  
376  	if (check_tick_dependency(&ts->tick_dep_mask))
377  		return false;
378  
379  	if (check_tick_dependency(&current->tick_dep_mask))
380  		return false;
381  
382  	if (check_tick_dependency(&current->signal->tick_dep_mask))
383  		return false;
384  
385  	return true;
386  }
387  
388  static void nohz_full_kick_func(struct irq_work *work)
389  {
390  	/* Empty, the tick restart happens on tick_nohz_irq_exit() */
391  }
392  
393  static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) =
394  	IRQ_WORK_INIT_HARD(nohz_full_kick_func);
395  
396  /*
397   * Kick this CPU if it's full dynticks in order to force it to
398   * re-evaluate its dependency on the tick and restart it if necessary.
399   * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(),
400   * is NMI safe.
401   */
402  static void tick_nohz_full_kick(void)
403  {
404  	if (!tick_nohz_full_cpu(smp_processor_id()))
405  		return;
406  
407  	irq_work_queue(this_cpu_ptr(&nohz_full_kick_work));
408  }
409  
410  /*
411   * Kick the CPU if it's full dynticks in order to force it to
412   * re-evaluate its dependency on the tick and restart it if necessary.
413   */
414  void tick_nohz_full_kick_cpu(int cpu)
415  {
416  	if (!tick_nohz_full_cpu(cpu))
417  		return;
418  
419  	irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu);
420  }
421  
422  static void tick_nohz_kick_task(struct task_struct *tsk)
423  {
424  	int cpu;
425  
426  	/*
427  	 * If the task is not running, run_posix_cpu_timers()
428  	 * has nothing to elapse, and an IPI can then be optimized out.
429  	 *
430  	 * activate_task()                      STORE p->tick_dep_mask
431  	 *   STORE p->on_rq
432  	 * __schedule() (switch to task 'p')    smp_mb() (atomic_fetch_or())
433  	 *   LOCK rq->lock                      LOAD p->on_rq
434  	 *   smp_mb__after_spin_lock()
435  	 *   tick_nohz_task_switch()
436  	 *     LOAD p->tick_dep_mask
437  	 *
438  	 * XXX given a task picks up the dependency on schedule(), should we
439  	 * only care about tasks that are currently on the CPU instead of all
440  	 * that are on the runqueue?
441  	 *
442  	 * That is, does this want to be: task_on_cpu() / task_curr()?
443  	 */
444  	if (!sched_task_on_rq(tsk))
445  		return;
446  
447  	/*
448  	 * If the task concurrently migrates to another CPU,
449  	 * we guarantee it sees the new tick dependency upon
450  	 * schedule.
451  	 *
452  	 * set_task_cpu(p, cpu);
453  	 *   STORE p->cpu = @cpu
454  	 * __schedule() (switch to task 'p')
455  	 *   LOCK rq->lock
456  	 *   smp_mb__after_spin_lock()          STORE p->tick_dep_mask
457  	 *   tick_nohz_task_switch()            smp_mb() (atomic_fetch_or())
458  	 *      LOAD p->tick_dep_mask           LOAD p->cpu
459  	 */
460  	cpu = task_cpu(tsk);
461  
462  	preempt_disable();
463  	if (cpu_online(cpu))
464  		tick_nohz_full_kick_cpu(cpu);
465  	preempt_enable();
466  }
467  
468  /*
469   * Kick all full dynticks CPUs in order to force these to re-evaluate
470   * their dependency on the tick and restart it if necessary.
471   */
472  static void tick_nohz_full_kick_all(void)
473  {
474  	int cpu;
475  
476  	if (!tick_nohz_full_running)
477  		return;
478  
479  	preempt_disable();
480  	for_each_cpu_and(cpu, tick_nohz_full_mask, cpu_online_mask)
481  		tick_nohz_full_kick_cpu(cpu);
482  	preempt_enable();
483  }
484  
485  static void tick_nohz_dep_set_all(atomic_t *dep,
486  				  enum tick_dep_bits bit)
487  {
488  	int prev;
489  
490  	prev = atomic_fetch_or(BIT(bit), dep);
491  	if (!prev)
492  		tick_nohz_full_kick_all();
493  }
494  
495  /*
496   * Set a global tick dependency. Used by perf events that rely on freq and
497   * unstable clocks.
498   */
499  void tick_nohz_dep_set(enum tick_dep_bits bit)
500  {
501  	tick_nohz_dep_set_all(&tick_dep_mask, bit);
502  }
503  
504  void tick_nohz_dep_clear(enum tick_dep_bits bit)
505  {
506  	atomic_andnot(BIT(bit), &tick_dep_mask);
507  }
508  
509  /*
510   * Set per-CPU tick dependency. Used by scheduler and perf events in order to
511   * manage event-throttling.
512   */
513  void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit)
514  {
515  	int prev;
516  	struct tick_sched *ts;
517  
518  	ts = per_cpu_ptr(&tick_cpu_sched, cpu);
519  
520  	prev = atomic_fetch_or(BIT(bit), &ts->tick_dep_mask);
521  	if (!prev) {
522  		preempt_disable();
523  		/* Perf needs local kick that is NMI safe */
524  		if (cpu == smp_processor_id()) {
525  			tick_nohz_full_kick();
526  		} else {
527  			/* Remote IRQ work not NMI-safe */
528  			if (!WARN_ON_ONCE(in_nmi()))
529  				tick_nohz_full_kick_cpu(cpu);
530  		}
531  		preempt_enable();
532  	}
533  }
534  EXPORT_SYMBOL_GPL(tick_nohz_dep_set_cpu);
535  
536  void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit)
537  {
538  	struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);
539  
540  	atomic_andnot(BIT(bit), &ts->tick_dep_mask);
541  }
542  EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu);
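
/*
 * Usage sketch (illustrative, hypothetical caller - not code from this file):
 * a subsystem that temporarily needs the periodic tick on a given CPU pairs
 * the set/clear calls around its critical window, using whichever
 * TICK_DEP_BIT_* it owns:
 *
 *	tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_PERF_EVENTS);
 *	...	// work that relies on the tick firing on 'cpu'
 *	tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_PERF_EVENTS);
 *
 * The first setter kicks the target CPU so a stopped tick is re-evaluated;
 * once the last dependency is cleared, the next idle or IRQ-exit path is
 * free to stop the tick again.
 */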
543  
544  /*
545   * Set a per-task tick dependency. RCU needs this. Posix CPU timers also
546   * need it in order to elapse per-task timers.
547   */
548  void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit)
549  {
550  	if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask))
551  		tick_nohz_kick_task(tsk);
552  }
553  EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task);
554  
555  void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit)
556  {
557  	atomic_andnot(BIT(bit), &tsk->tick_dep_mask);
558  }
559  EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_task);
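
/*
 * Usage sketch (illustrative, hypothetical caller): when the tick must stay
 * alive for a specific task no matter which CPU it migrates to, tag the task
 * itself rather than a CPU:
 *
 *	tick_nohz_dep_set_task(tsk, TICK_DEP_BIT_POSIX_TIMER);
 *	...	// the tick now elapses this task's timers wherever it runs
 *	tick_nohz_dep_clear_task(tsk, TICK_DEP_BIT_POSIX_TIMER);
 *
 * tick_nohz_kick_task() above handles kicking the CPU the task is currently
 * queued on, if any.
 */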
560  
561  /*
562   * Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse
563   * per process timers.
564   */
565  void tick_nohz_dep_set_signal(struct task_struct *tsk,
566  			      enum tick_dep_bits bit)
567  {
568  	int prev;
569  	struct signal_struct *sig = tsk->signal;
570  
571  	prev = atomic_fetch_or(BIT(bit), &sig->tick_dep_mask);
572  	if (!prev) {
573  		struct task_struct *t;
574  
575  		lockdep_assert_held(&tsk->sighand->siglock);
576  		__for_each_thread(sig, t)
577  			tick_nohz_kick_task(t);
578  	}
579  }
580  
581  void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit)
582  {
583  	atomic_andnot(BIT(bit), &sig->tick_dep_mask);
584  }
585  
586  /*
587   * Re-evaluate the need for the tick as we switch the current task.
588   * It might need the tick due to per task/process properties:
589   * perf events, posix CPU timers, ...
590   */
591  void __tick_nohz_task_switch(void)
592  {
593  	struct tick_sched *ts;
594  
595  	if (!tick_nohz_full_cpu(smp_processor_id()))
596  		return;
597  
598  	ts = this_cpu_ptr(&tick_cpu_sched);
599  
600  	if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
601  		if (atomic_read(&current->tick_dep_mask) ||
602  		    atomic_read(&current->signal->tick_dep_mask))
603  			tick_nohz_full_kick();
604  	}
605  }
606  
607  /* Get the boot-time nohz CPU list from the kernel parameters. */
608  void __init tick_nohz_full_setup(cpumask_var_t cpumask)
609  {
610  	alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
611  	cpumask_copy(tick_nohz_full_mask, cpumask);
612  	tick_nohz_full_running = true;
613  }
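
/*
 * For reference: 'tick_nohz_full_mask' is populated from the "nohz_full="
 * boot parameter, e.g. "nohz_full=1-7" runs CPUs 1-7 in full dynticks mode
 * while at least one housekeeping CPU (here CPU 0) keeps the regular tick
 * and the timekeeping duty.
 */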
614  
615  bool tick_nohz_cpu_hotpluggable(unsigned int cpu)
616  {
617  	/*
618  	 * The 'tick_do_timer_cpu' CPU handles housekeeping duty (unbound
619  	 * timers, workqueues, timekeeping, ...) on behalf of full dynticks
620  	 * CPUs. It must remain online when nohz full is enabled.
621  	 */
622  	if (tick_nohz_full_running && READ_ONCE(tick_do_timer_cpu) == cpu)
623  		return false;
624  	return true;
625  }
626  
627  static int tick_nohz_cpu_down(unsigned int cpu)
628  {
629  	return tick_nohz_cpu_hotpluggable(cpu) ? 0 : -EBUSY;
630  }
631  
632  void __init tick_nohz_init(void)
633  {
634  	int cpu, ret;
635  
636  	if (!tick_nohz_full_running)
637  		return;
638  
639  	/*
640  	 * Full dynticks uses IRQ work to drive the tick rescheduling on safe
641  	 * locking contexts. But then we need IRQ work to raise its own
642  	 * interrupts to avoid circular dependency on the tick.
643  	 */
644  	if (!arch_irq_work_has_interrupt()) {
645  		pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support IRQ work self-IPIs\n");
646  		cpumask_clear(tick_nohz_full_mask);
647  		tick_nohz_full_running = false;
648  		return;
649  	}
650  
651  	if (IS_ENABLED(CONFIG_PM_SLEEP_SMP) &&
652  			!IS_ENABLED(CONFIG_PM_SLEEP_SMP_NONZERO_CPU)) {
653  		cpu = smp_processor_id();
654  
655  		if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
656  			pr_warn("NO_HZ: Clearing %d from nohz_full range "
657  				"for timekeeping\n", cpu);
658  			cpumask_clear_cpu(cpu, tick_nohz_full_mask);
659  		}
660  	}
661  
662  	for_each_cpu(cpu, tick_nohz_full_mask)
663  		ct_cpu_track_user(cpu);
664  
665  	ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
666  					"kernel/nohz:predown", NULL,
667  					tick_nohz_cpu_down);
668  	WARN_ON(ret < 0);
669  	pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n",
670  		cpumask_pr_args(tick_nohz_full_mask));
671  }
672  #endif /* #ifdef CONFIG_NO_HZ_FULL */
673  
674  /*
675   * NOHZ - aka dynamic tick functionality
676   */
677  #ifdef CONFIG_NO_HZ_COMMON
678  /*
679   * NO HZ enabled ?
680   */
681  bool tick_nohz_enabled __read_mostly  = true;
682  unsigned long tick_nohz_active  __read_mostly;
683  /*
684   * Enable / Disable tickless mode
685   */
686  static int __init setup_tick_nohz(char *str)
687  {
688  	return (kstrtobool(str, &tick_nohz_enabled) == 0);
689  }
690  
691  __setup("nohz=", setup_tick_nohz);
692  
693  bool tick_nohz_tick_stopped(void)
694  {
695  	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
696  
697  	return tick_sched_flag_test(ts, TS_FLAG_STOPPED);
698  }
699  
700  bool tick_nohz_tick_stopped_cpu(int cpu)
701  {
702  	struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);
703  
704  	return tick_sched_flag_test(ts, TS_FLAG_STOPPED);
705  }
706  
707  /**
708   * tick_nohz_update_jiffies - update jiffies when idle was interrupted
709   * @now: current ktime_t
710   *
711   * Called from interrupt entry when the CPU was idle
712   *
713   * In case the sched_tick was stopped on this CPU, we have to check if jiffies
714   * must be updated. Otherwise an interrupt handler could use a stale jiffy
715   * value. We do this unconditionally on any CPU, as we don't know whether the
716   * CPU, which has the update task assigned, is in a long sleep.
717   */
718  static void tick_nohz_update_jiffies(ktime_t now)
719  {
720  	unsigned long flags;
721  
722  	__this_cpu_write(tick_cpu_sched.idle_waketime, now);
723  
724  	local_irq_save(flags);
725  	tick_do_update_jiffies64(now);
726  	local_irq_restore(flags);
727  
728  	touch_softlockup_watchdog_sched();
729  }
730  
731  static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
732  {
733  	ktime_t delta;
734  
735  	if (WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE)))
736  		return;
737  
738  	delta = ktime_sub(now, ts->idle_entrytime);
739  
740  	write_seqcount_begin(&ts->idle_sleeptime_seq);
741  	if (nr_iowait_cpu(smp_processor_id()) > 0)
742  		ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
743  	else
744  		ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
745  
746  	ts->idle_entrytime = now;
747  	tick_sched_flag_clear(ts, TS_FLAG_IDLE_ACTIVE);
748  	write_seqcount_end(&ts->idle_sleeptime_seq);
749  
750  	sched_clock_idle_wakeup_event();
751  }
752  
753  static void tick_nohz_start_idle(struct tick_sched *ts)
754  {
755  	write_seqcount_begin(&ts->idle_sleeptime_seq);
756  	ts->idle_entrytime = ktime_get();
757  	tick_sched_flag_set(ts, TS_FLAG_IDLE_ACTIVE);
758  	write_seqcount_end(&ts->idle_sleeptime_seq);
759  
760  	sched_clock_idle_sleep_event();
761  }
762  
763  static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime,
764  				 bool compute_delta, u64 *last_update_time)
765  {
766  	ktime_t now, idle;
767  	unsigned int seq;
768  
769  	if (!tick_nohz_active)
770  		return -1;
771  
772  	now = ktime_get();
773  	if (last_update_time)
774  		*last_update_time = ktime_to_us(now);
775  
776  	do {
777  		seq = read_seqcount_begin(&ts->idle_sleeptime_seq);
778  
779  		if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE) && compute_delta) {
780  			ktime_t delta = ktime_sub(now, ts->idle_entrytime);
781  
782  			idle = ktime_add(*sleeptime, delta);
783  		} else {
784  			idle = *sleeptime;
785  		}
786  	} while (read_seqcount_retry(&ts->idle_sleeptime_seq, seq));
787  
788  	return ktime_to_us(idle);
789  
790  }
791  
792  /**
793   * get_cpu_idle_time_us - get the total idle time of a CPU
794   * @cpu: CPU number to query
795   * @last_update_time: variable to store update time in. Do not update
796   * counters if NULL.
797   *
798   * Return the cumulative idle time (since boot) for a given
799   * CPU, in microseconds. Note that this is partially broken due to
800   * the counter of iowait tasks that can be remotely updated without
801   * any synchronization. Therefore it is possible to observe backward
802   * values within two consecutive reads.
803   *
804   * This time is measured via accounting rather than sampling,
805   * and is as accurate as ktime_get() is.
806   *
807   * Return: -1 if NOHZ is not enabled, else total idle time of the @cpu
808   */
809  u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
810  {
811  	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
812  
813  	return get_cpu_sleep_time_us(ts, &ts->idle_sleeptime,
814  				     !nr_iowait_cpu(cpu), last_update_time);
815  }
816  EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
817  
818  /**
819   * get_cpu_iowait_time_us - get the total iowait time of a CPU
820   * @cpu: CPU number to query
821   * @last_update_time: variable to store update time in. Do not update
822   * counters if NULL.
823   *
824   * Return the cumulative iowait time (since boot) for a given
825   * CPU, in microseconds. Note this is partially broken due to
826   * the counter of iowait tasks that can be remotely updated without
827   * any synchronization. Therefore it is possible to observe backward
828   * values within two consecutive reads.
829   *
830   * This time is measured via accounting rather than sampling,
831   * and is as accurate as ktime_get() is.
832   *
833   * Return: -1 if NOHZ is not enabled, else total iowait time of @cpu
834   */
835  u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
836  {
837  	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
838  
839  	return get_cpu_sleep_time_us(ts, &ts->iowait_sleeptime,
840  				     nr_iowait_cpu(cpu), last_update_time);
841  }
842  EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
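
/*
 * Usage sketch (illustrative, hypothetical caller such as a cpuidle/cpufreq
 * governor): sample the cumulative idle and iowait times of a CPU and derive
 * a load estimate over a measurement window. Per the kernel-doc above, both
 * counters may be observed going backwards, so deltas need clamping, and a
 * return value of -1 means NOHZ is inactive:
 *
 *	u64 wall_us, idle_us, iowait_us;
 *
 *	idle_us = get_cpu_idle_time_us(cpu, &wall_us);
 *	iowait_us = get_cpu_iowait_time_us(cpu, NULL);
 *	if (idle_us == (u64)-1)
 *		...	// fall back to tick-based (jiffies) accounting
 *
 * Busy time over [t0, t1] is then roughly
 * (wall_us(t1) - wall_us(t0)) - (idle_us(t1) - idle_us(t0)).
 */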
843  
844  static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
845  {
846  	hrtimer_cancel(&ts->sched_timer);
847  	hrtimer_set_expires(&ts->sched_timer, ts->last_tick);
848  
849  	/* Forward the time to expire in the future */
850  	hrtimer_forward(&ts->sched_timer, now, TICK_NSEC);
851  
852  	if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) {
853  		hrtimer_start_expires(&ts->sched_timer,
854  				      HRTIMER_MODE_ABS_PINNED_HARD);
855  	} else {
856  		tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
857  	}
858  
859  	/*
860  	 * Reset to make sure the next tick stop doesn't get fooled by past
861  	 * cached clock deadline.
862  	 */
863  	ts->next_tick = 0;
864  }
865  
866  static inline bool local_timer_softirq_pending(void)
867  {
868  	return local_softirq_pending() & BIT(TIMER_SOFTIRQ);
869  }
870  
871  /*
872   * Read jiffies and the time when jiffies were updated last
873   */
874  u64 get_jiffies_update(unsigned long *basej)
875  {
876  	unsigned long basejiff;
877  	unsigned int seq;
878  	u64 basemono;
879  
880  	do {
881  		seq = read_seqcount_begin(&jiffies_seq);
882  		basemono = last_jiffies_update;
883  		basejiff = jiffies;
884  	} while (read_seqcount_retry(&jiffies_seq, seq));
885  	*basej = basejiff;
886  	return basemono;
887  }
888  
889  /**
890   * tick_nohz_next_event() - return the clock monotonic based next event
891   * @ts:		pointer to tick_sched struct
892   * @cpu:	CPU number
893   *
894   * Return:
895   * *%0		- When the next event is a maximum of TICK_NSEC in the future
896   *		  and the tick is not stopped yet
897   * *%next_event	- Next event based on clock monotonic
898   */
899  static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
900  {
901  	u64 basemono, next_tick, delta, expires;
902  	unsigned long basejiff;
903  	int tick_cpu;
904  
905  	basemono = get_jiffies_update(&basejiff);
906  	ts->last_jiffies = basejiff;
907  	ts->timer_expires_base = basemono;
908  
909  	/*
910  	 * Keep the periodic tick, when RCU, architecture or irq_work
911  	 * requests it.
912  	 * Aside of that, check whether the local timer softirq is
913  	 * pending. If so, it's a bad idea to call get_next_timer_interrupt(),
914  	 * because there is an already expired timer, so it will request
915  	 * immediate expiry, which rearms the hardware timer with a
916  	 * minimal delta, which brings us back to this place
917  	 * immediately. Lather, rinse and repeat...
918  	 */
919  	if (rcu_needs_cpu() || arch_needs_cpu() ||
920  	    irq_work_needs_cpu() || local_timer_softirq_pending()) {
921  		next_tick = basemono + TICK_NSEC;
922  	} else {
923  		/*
924  		 * Get the next pending timer. If high resolution
925  		 * timers are enabled this only takes the timer wheel
926  		 * timers into account. If high resolution timers are
927  		 * disabled this also looks at the next expiring
928  		 * hrtimer.
929  		 */
930  		next_tick = get_next_timer_interrupt(basejiff, basemono);
931  		ts->next_timer = next_tick;
932  	}
933  
934  	/* Make sure next_tick is never before basemono! */
935  	if (WARN_ON_ONCE(basemono > next_tick))
936  		next_tick = basemono;
937  
938  	/*
939  	 * If the tick is due in the next period, keep it ticking or
940  	 * force prod the timer.
941  	 */
942  	delta = next_tick - basemono;
943  	if (delta <= (u64)TICK_NSEC) {
944  		/*
945  		 * We've not stopped the tick yet, and there's a timer in the
946  		 * next period, so no point in stopping it either, bail.
947  		 */
948  		if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
949  			ts->timer_expires = 0;
950  			goto out;
951  		}
952  	}
953  
954  	/*
955  	 * If this CPU is the one which had the do_timer() duty last, we limit
956  	 * the sleep time to the timekeeping 'max_deferment' value.
957  	 * Otherwise we can sleep as long as we want.
958  	 */
959  	delta = timekeeping_max_deferment();
960  	tick_cpu = READ_ONCE(tick_do_timer_cpu);
961  	if (tick_cpu != cpu &&
962  	    (tick_cpu != TICK_DO_TIMER_NONE || !tick_sched_flag_test(ts, TS_FLAG_DO_TIMER_LAST)))
963  		delta = KTIME_MAX;
964  
965  	/* Calculate the next expiry time */
966  	if (delta < (KTIME_MAX - basemono))
967  		expires = basemono + delta;
968  	else
969  		expires = KTIME_MAX;
970  
971  	ts->timer_expires = min_t(u64, expires, next_tick);
972  
973  out:
974  	return ts->timer_expires;
975  }
976  
977  static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
978  {
979  	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
980  	unsigned long basejiff = ts->last_jiffies;
981  	u64 basemono = ts->timer_expires_base;
982  	bool timer_idle = tick_sched_flag_test(ts, TS_FLAG_STOPPED);
983  	int tick_cpu;
984  	u64 expires;
985  
986  	/* Make sure we won't be trying to stop it twice in a row. */
987  	ts->timer_expires_base = 0;
988  
989  	/*
990  	 * Now the tick should be stopped definitely - so the timer base needs
991  	 * to be marked idle as well to not miss a newly queued timer.
992  	 */
993  	expires = timer_base_try_to_set_idle(basejiff, basemono, &timer_idle);
994  	if (expires > ts->timer_expires) {
995  		/*
996  		 * This path could only happen when the first timer was removed
997  		 * between calculating the possible sleep length and now (when
998  		 * high resolution mode is not active, timer could also be a
999  		 * hrtimer).
1000  		 *
1001  		 * We have to stick to the original calculated expiry value to
1002  		 * not stop the tick for too long with a shallow C-state (which
1003  		 * was programmed by cpuidle because of an early next expiration
1004  		 * value).
1005  		 */
1006  		expires = ts->timer_expires;
1007  	}
1008  
1009  	/* If the timer base is not idle, retain the not yet stopped tick. */
1010  	if (!timer_idle)
1011  		return;
1012  
1013  	/*
1014  	 * If this CPU is the one which updates jiffies, then give up
1015  	 * the assignment and let it be taken by the CPU which runs
1016  	 * the tick timer next, which might be this CPU as well. If we
1017  	 * don't drop this here, the jiffies might be stale and
1018  	 * do_timer() never gets invoked. Keep track of the fact that it
1019  	 * was the one which had the do_timer() duty last.
1020  	 */
1021  	tick_cpu = READ_ONCE(tick_do_timer_cpu);
1022  	if (tick_cpu == cpu) {
1023  		WRITE_ONCE(tick_do_timer_cpu, TICK_DO_TIMER_NONE);
1024  		tick_sched_flag_set(ts, TS_FLAG_DO_TIMER_LAST);
1025  	} else if (tick_cpu != TICK_DO_TIMER_NONE) {
1026  		tick_sched_flag_clear(ts, TS_FLAG_DO_TIMER_LAST);
1027  	}
1028  
1029  	/* Skip reprogram of event if it's not changed */
1030  	if (tick_sched_flag_test(ts, TS_FLAG_STOPPED) && (expires == ts->next_tick)) {
1031  		/* Sanity check: make sure clockevent is actually programmed */
1032  		if (expires == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer))
1033  			return;
1034  
1035  		WARN_ONCE(1, "basemono: %llu ts->next_tick: %llu dev->next_event: %llu "
1036  			  "timer->active: %d timer->expires: %llu\n", basemono, ts->next_tick,
1037  			  dev->next_event, hrtimer_active(&ts->sched_timer),
1038  			  hrtimer_get_expires(&ts->sched_timer));
1039  	}
1040  
1041  	/*
1042  	 * tick_nohz_stop_tick() can be called several times before
1043  	 * tick_nohz_restart_sched_tick() is called. This happens when
1044  	 * interrupts arrive which do not cause a reschedule. In the first
1045  	 * call we save the current tick time, so we can restart the
1046  	 * scheduler tick in tick_nohz_restart_sched_tick().
1047  	 */
1048  	if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
1049  		calc_load_nohz_start();
1050  		quiet_vmstat();
1051  
1052  		ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
1053  		tick_sched_flag_set(ts, TS_FLAG_STOPPED);
1054  		trace_tick_stop(1, TICK_DEP_MASK_NONE);
1055  	}
1056  
1057  	ts->next_tick = expires;
1058  
1059  	/*
1060  	 * If the expiration time == KTIME_MAX, then we simply stop
1061  	 * the tick timer.
1062  	 */
1063  	if (unlikely(expires == KTIME_MAX)) {
1064  		tick_sched_timer_cancel(ts);
1065  		return;
1066  	}
1067  
1068  	if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) {
1069  		hrtimer_start(&ts->sched_timer, expires,
1070  			      HRTIMER_MODE_ABS_PINNED_HARD);
1071  	} else {
1072  		hrtimer_set_expires(&ts->sched_timer, expires);
1073  		tick_program_event(expires, 1);
1074  	}
1075  }
1076  
1077  static void tick_nohz_retain_tick(struct tick_sched *ts)
1078  {
1079  	ts->timer_expires_base = 0;
1080  }
1081  
1082  #ifdef CONFIG_NO_HZ_FULL
1083  static void tick_nohz_full_stop_tick(struct tick_sched *ts, int cpu)
1084  {
1085  	if (tick_nohz_next_event(ts, cpu))
1086  		tick_nohz_stop_tick(ts, cpu);
1087  	else
1088  		tick_nohz_retain_tick(ts);
1089  }
1090  #endif /* CONFIG_NO_HZ_FULL */
1091  
1092  static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
1093  {
1094  	/* Update jiffies first */
1095  	tick_do_update_jiffies64(now);
1096  
1097  	/*
1098  	 * Clear the timer idle flag, so we avoid IPIs on remote queueing and
1099  	 * the clock forward checks in the enqueue path:
1100  	 */
1101  	timer_clear_idle();
1102  
1103  	calc_load_nohz_stop();
1104  	touch_softlockup_watchdog_sched();
1105  
1106  	/* Cancel the scheduled timer and restore the tick: */
1107  	tick_sched_flag_clear(ts, TS_FLAG_STOPPED);
1108  	tick_nohz_restart(ts, now);
1109  }
1110  
1111  static void __tick_nohz_full_update_tick(struct tick_sched *ts,
1112  					 ktime_t now)
1113  {
1114  #ifdef CONFIG_NO_HZ_FULL
1115  	int cpu = smp_processor_id();
1116  
1117  	if (can_stop_full_tick(cpu, ts))
1118  		tick_nohz_full_stop_tick(ts, cpu);
1119  	else if (tick_sched_flag_test(ts, TS_FLAG_STOPPED))
1120  		tick_nohz_restart_sched_tick(ts, now);
1121  #endif
1122  }
1123  
1124  static void tick_nohz_full_update_tick(struct tick_sched *ts)
1125  {
1126  	if (!tick_nohz_full_cpu(smp_processor_id()))
1127  		return;
1128  
1129  	if (!tick_sched_flag_test(ts, TS_FLAG_NOHZ))
1130  		return;
1131  
1132  	__tick_nohz_full_update_tick(ts, ktime_get());
1133  }
1134  
1135  /*
1136   * A pending softirq outside an IRQ (or softirq disabled section) context
1137   * should be waiting for ksoftirqd to handle it. Therefore we shouldn't
1138   * reach this code due to the need_resched() early check in can_stop_idle_tick().
1139   *
1140   * However, if we are between CPUHP_AP_SMPBOOT_THREADS and CPUHP_TEARDOWN_CPU
1141   * in the cpu_down() process, softirqs can still be raised while ksoftirqd is
1142   * parked, triggering the code below, since wakeup_softirqd() is ignored.
1143   *
1144   */
1145  static bool report_idle_softirq(void)
1146  {
1147  	static int ratelimit;
1148  	unsigned int pending = local_softirq_pending();
1149  
1150  	if (likely(!pending))
1151  		return false;
1152  
1153  	/* Some softirqs claim to be safe against hotplug and ksoftirqd parking */
1154  	if (!cpu_active(smp_processor_id())) {
1155  		pending &= ~SOFTIRQ_HOTPLUG_SAFE_MASK;
1156  		if (!pending)
1157  			return false;
1158  	}
1159  
1160  	if (ratelimit >= 10)
1161  		return false;
1162  
1163  	/* On RT, softirq handling may be waiting on some lock */
1164  	if (local_bh_blocked())
1165  		return false;
1166  
1167  	pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n",
1168  		pending);
1169  	ratelimit++;
1170  
1171  	return true;
1172  }
1173  
1174  static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
1175  {
1176  	WARN_ON_ONCE(cpu_is_offline(cpu));
1177  
1178  	if (unlikely(!tick_sched_flag_test(ts, TS_FLAG_NOHZ)))
1179  		return false;
1180  
1181  	if (need_resched())
1182  		return false;
1183  
1184  	if (unlikely(report_idle_softirq()))
1185  		return false;
1186  
1187  	if (tick_nohz_full_enabled()) {
1188  		int tick_cpu = READ_ONCE(tick_do_timer_cpu);
1189  
1190  		/*
1191  		 * Keep the tick alive to guarantee timekeeping progression
1192  		 * if there are full dynticks CPUs around
1193  		 */
1194  		if (tick_cpu == cpu)
1195  			return false;
1196  
1197  		/* Should not happen for nohz-full */
1198  		if (WARN_ON_ONCE(tick_cpu == TICK_DO_TIMER_NONE))
1199  			return false;
1200  	}
1201  
1202  	return true;
1203  }
1204  
1205  /**
1206   * tick_nohz_idle_stop_tick - stop the idle tick from the idle task
1207   *
1208   * When the next event is more than a tick into the future, stop the idle tick
1209   */
1210  void tick_nohz_idle_stop_tick(void)
1211  {
1212  	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1213  	int cpu = smp_processor_id();
1214  	ktime_t expires;
1215  
1216  	/*
1217  	 * If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the
1218  	 * tick timer expiration time is known already.
1219  	 */
1220  	if (ts->timer_expires_base)
1221  		expires = ts->timer_expires;
1222  	else if (can_stop_idle_tick(cpu, ts))
1223  		expires = tick_nohz_next_event(ts, cpu);
1224  	else
1225  		return;
1226  
1227  	ts->idle_calls++;
1228  
1229  	if (expires > 0LL) {
1230  		int was_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED);
1231  
1232  		tick_nohz_stop_tick(ts, cpu);
1233  
1234  		ts->idle_sleeps++;
1235  		ts->idle_expires = expires;
1236  
1237  		if (!was_stopped && tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
1238  			ts->idle_jiffies = ts->last_jiffies;
1239  			nohz_balance_enter_idle(cpu);
1240  		}
1241  	} else {
1242  		tick_nohz_retain_tick(ts);
1243  	}
1244  }
1245  
1246  void tick_nohz_idle_retain_tick(void)
1247  {
1248  	tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched));
1249  }
1250  
1251  /**
1252   * tick_nohz_idle_enter - prepare for entering idle on the current CPU
1253   *
1254   * Called when we start the idle loop.
1255   */
1256  void tick_nohz_idle_enter(void)
1257  {
1258  	struct tick_sched *ts;
1259  
1260  	lockdep_assert_irqs_enabled();
1261  
1262  	local_irq_disable();
1263  
1264  	ts = this_cpu_ptr(&tick_cpu_sched);
1265  
1266  	WARN_ON_ONCE(ts->timer_expires_base);
1267  
1268  	tick_sched_flag_set(ts, TS_FLAG_INIDLE);
1269  	tick_nohz_start_idle(ts);
1270  
1271  	local_irq_enable();
1272  }
1273  
1274  /**
1275   * tick_nohz_irq_exit - Notify the tick about IRQ exit
1276   *
1277   * A timer may have been added/modified/deleted either by the current IRQ,
1278   * or by another place using this IRQ as a notification. This IRQ may have
1279   * also updated the RCU callback list. These events may require a
1280   * re-evaluation of the next tick. Depending on the context:
1281   *
1282   * 1) If the CPU is idle and no resched is pending, just proceed with idle
1283   *    time accounting. The next tick will be re-evaluated on the next idle
1284   *    loop iteration.
1285   *
1286   * 2) If the CPU is nohz_full:
1287   *
1288   *    2.1) If there is any tick dependency, restart the tick if stopped.
1289   *
1290   *    2.2) If there is no tick dependency, (re-)evaluate the next tick and
1291   *         stop/update it accordingly.
1292   */
1293  void tick_nohz_irq_exit(void)
1294  {
1295  	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1296  
1297  	if (tick_sched_flag_test(ts, TS_FLAG_INIDLE))
1298  		tick_nohz_start_idle(ts);
1299  	else
1300  		tick_nohz_full_update_tick(ts);
1301  }
1302  
1303  /**
1304   * tick_nohz_idle_got_tick - Check whether or not the tick handler has run
1305   *
1306   * Return: %true if the tick handler has run, otherwise %false
1307   */
1308  bool tick_nohz_idle_got_tick(void)
1309  {
1310  	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1311  
1312  	if (ts->got_idle_tick) {
1313  		ts->got_idle_tick = 0;
1314  		return true;
1315  	}
1316  	return false;
1317  }
1318  
1319  /**
1320   * tick_nohz_get_next_hrtimer - return the next expiration time for the hrtimer
1321   * or the tick, whichever expires first. Note that, if the tick has been
1322   * stopped, it returns the next hrtimer.
1323   *
1324   * Called from power state control code with interrupts disabled
1325   *
1326   * Return: the next expiration time
1327   */
1328  ktime_t tick_nohz_get_next_hrtimer(void)
1329  {
1330  	return __this_cpu_read(tick_cpu_device.evtdev)->next_event;
1331  }
1332  
1333  /**
1334   * tick_nohz_get_sleep_length - return the expected length of the current sleep
1335   * @delta_next: duration until the next event if the tick cannot be stopped
1336   *
1337   * Called from power state control code with interrupts disabled.
1338   *
1339   * The return value of this function and/or the value returned by it through the
1340   * @delta_next pointer can be negative which must be taken into account by its
1341   * callers.
1342   *
1343   * Return: the expected length of the current sleep
1344   */
1345  ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
1346  {
1347  	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
1348  	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1349  	int cpu = smp_processor_id();
1350  	/*
1351  	 * The idle entry time is expected to be a sufficient approximation of
1352  	 * the current time at this point.
1353  	 */
1354  	ktime_t now = ts->idle_entrytime;
1355  	ktime_t next_event;
1356  
1357  	WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_INIDLE));
1358  
1359  	*delta_next = ktime_sub(dev->next_event, now);
1360  
1361  	if (!can_stop_idle_tick(cpu, ts))
1362  		return *delta_next;
1363  
1364  	next_event = tick_nohz_next_event(ts, cpu);
1365  	if (!next_event)
1366  		return *delta_next;
1367  
1368  	/*
1369  	 * If the next highres timer to expire is earlier than 'next_event', the
1370  	 * idle governor needs to know that.
1371  	 */
1372  	next_event = min_t(u64, next_event,
1373  			   hrtimer_next_event_without(&ts->sched_timer));
1374  
1375  	return ktime_sub(next_event, now);
1376  }
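
/*
 * Usage sketch (illustrative, variable names made up): a cpuidle governor
 * calls this with interrupts disabled from the idle path to bound the
 * expected sleep, then picks an idle state whose target residency fits:
 *
 *	ktime_t delta_tick;
 *	s64 sleep_ns = ktime_to_ns(tick_nohz_get_sleep_length(&delta_tick));
 *
 *	if (sleep_ns <= 0)	// may be negative, see the kernel-doc above
 *		sleep_ns = 0;
 *	// pick the deepest state with target residency <= sleep_ns; if the
 *	// tick will not be stopped, 'delta_tick' is the bound to use instead
 */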
1377  
1378  /**
1379   * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value
1380   * for a particular CPU.
1381   * @cpu: target CPU number
1382   *
1383   * Called from the schedutil frequency scaling governor in scheduler context.
1384   *
1385   * Return: the current idle calls counter value for @cpu
1386   */
1387  unsigned long tick_nohz_get_idle_calls_cpu(int cpu)
1388  {
1389  	struct tick_sched *ts = tick_get_tick_sched(cpu);
1390  
1391  	return ts->idle_calls;
1392  }
1393  
1394  static void tick_nohz_account_idle_time(struct tick_sched *ts,
1395  					ktime_t now)
1396  {
1397  	unsigned long ticks;
1398  
1399  	ts->idle_exittime = now;
1400  
1401  	if (vtime_accounting_enabled_this_cpu())
1402  		return;
1403  	/*
1404  	 * We stopped the tick in idle. update_process_times() would miss the
1405  	 * time we slept, as it only accounts a single tick per invocation.
1406  	 * Enforce that this is accounted to idle !
1407  	 */
1408  	ticks = jiffies - ts->idle_jiffies;
1409  	/*
1410  	 * We might be one off. Do not randomly account a huge number of ticks!
1411  	 */
1412  	if (ticks && ticks < LONG_MAX)
1413  		account_idle_ticks(ticks);
1414  }
1415  
1416  void tick_nohz_idle_restart_tick(void)
1417  {
1418  	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1419  
1420  	if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
1421  		ktime_t now = ktime_get();
1422  		tick_nohz_restart_sched_tick(ts, now);
1423  		tick_nohz_account_idle_time(ts, now);
1424  	}
1425  }
1426  
1427  static void tick_nohz_idle_update_tick(struct tick_sched *ts, ktime_t now)
1428  {
1429  	if (tick_nohz_full_cpu(smp_processor_id()))
1430  		__tick_nohz_full_update_tick(ts, now);
1431  	else
1432  		tick_nohz_restart_sched_tick(ts, now);
1433  
1434  	tick_nohz_account_idle_time(ts, now);
1435  }
1436  
1437  /**
1438   * tick_nohz_idle_exit - Update the tick upon idle task exit
1439   *
1440   * When the idle task exits, update the tick depending on the
1441   * following situations:
1442   *
1443   * 1) If the CPU is not in nohz_full mode (most cases), then
1444   *    restart the tick.
1445   *
1446   * 2) If the CPU is in nohz_full mode (corner case):
1447   *   2.1) If the tick can be kept stopped (no tick dependencies)
1448   *        then re-evaluate the next tick and try to keep it stopped
1449   *        as long as possible.
1450   *   2.2) If the tick has dependencies, restart the tick.
1451   *
1452   */
1453  void tick_nohz_idle_exit(void)
1454  {
1455  	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1456  	bool idle_active, tick_stopped;
1457  	ktime_t now;
1458  
1459  	local_irq_disable();
1460  
1461  	WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_INIDLE));
1462  	WARN_ON_ONCE(ts->timer_expires_base);
1463  
1464  	tick_sched_flag_clear(ts, TS_FLAG_INIDLE);
1465  	idle_active = tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE);
1466  	tick_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED);
1467  
1468  	if (idle_active || tick_stopped)
1469  		now = ktime_get();
1470  
1471  	if (idle_active)
1472  		tick_nohz_stop_idle(ts, now);
1473  
1474  	if (tick_stopped)
1475  		tick_nohz_idle_update_tick(ts, now);
1476  
1477  	local_irq_enable();
1478  }
1479  
1480  /*
1481   * In low-resolution mode, the tick handler must be implemented directly
1482   * at the clockevent level. hrtimer can't be used instead, because its
1483   * infrastructure actually relies on the tick itself as a backend in
1484   * low-resolution mode (see hrtimer_run_queues()).
1485   */
1486  static void tick_nohz_lowres_handler(struct clock_event_device *dev)
1487  {
1488  	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1489  
1490  	dev->next_event = KTIME_MAX;
1491  
1492  	if (likely(tick_nohz_handler(&ts->sched_timer) == HRTIMER_RESTART))
1493  		tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
1494  }
1495  
1496  static inline void tick_nohz_activate(struct tick_sched *ts)
1497  {
1498  	if (!tick_nohz_enabled)
1499  		return;
1500  	tick_sched_flag_set(ts, TS_FLAG_NOHZ);
1501  	/* One update is enough */
1502  	if (!test_and_set_bit(0, &tick_nohz_active))
1503  		timers_update_nohz();
1504  }
1505  
1506  /**
1507   * tick_nohz_switch_to_nohz - switch to NOHZ mode
1508   */
1509  static void tick_nohz_switch_to_nohz(void)
1510  {
1511  	if (!tick_nohz_enabled)
1512  		return;
1513  
1514  	if (tick_switch_to_oneshot(tick_nohz_lowres_handler))
1515  		return;
1516  
1517  	/*
1518  	 * Recycle the hrtimer in 'ts', so we can share the
1519  	 * highres code.
1520  	 */
1521  	tick_setup_sched_timer(false);
1522  }
1523  
1524  static inline void tick_nohz_irq_enter(void)
1525  {
1526  	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1527  	ktime_t now;
1528  
1529  	if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED | TS_FLAG_IDLE_ACTIVE))
1530  		return;
1531  	now = ktime_get();
1532  	if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE))
1533  		tick_nohz_stop_idle(ts, now);
1534  	/*
1535  	 * If all CPUs are idle we may need to update a stale jiffies value.
1536  	 * Note nohz_full is a special case: a timekeeper is guaranteed to stay
1537  	 * alive but it might be busy looping with interrupts disabled in some
1538  	 * rare case (typically stop machine). So we must make sure we have a
1539  	 * last resort.
1540  	 */
1541  	if (tick_sched_flag_test(ts, TS_FLAG_STOPPED))
1542  		tick_nohz_update_jiffies(now);
1543  }
1544  
1545  #else
1546  
1547  static inline void tick_nohz_switch_to_nohz(void) { }
1548  static inline void tick_nohz_irq_enter(void) { }
1549  static inline void tick_nohz_activate(struct tick_sched *ts) { }
1550  
1551  #endif /* CONFIG_NO_HZ_COMMON */
1552  
1553  /*
1554   * Called from irq_enter() to notify about the possible interruption of idle()
1555   */
1556  void tick_irq_enter(void)
1557  {
1558  	tick_check_oneshot_broadcast_this_cpu();
1559  	tick_nohz_irq_enter();
1560  }
1561  
1562  static int sched_skew_tick;
1563  
1564  static int __init skew_tick(char *str)
1565  {
1566  	get_option(&str, &sched_skew_tick);
1567  
1568  	return 0;
1569  }
1570  early_param("skew_tick", skew_tick);
1571  
1572  /**
1573   * tick_setup_sched_timer - setup the tick emulation timer
1574   * @hrtimer: whether to use the hrtimer or not
1575   */
1576  void tick_setup_sched_timer(bool hrtimer)
1577  {
1578  	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1579  
1580  	/* Emulate tick processing via per-CPU hrtimers: */
1581  	hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
1582  
1583  	if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer) {
1584  		tick_sched_flag_set(ts, TS_FLAG_HIGHRES);
1585  		ts->sched_timer.function = tick_nohz_handler;
1586  	}
1587  
1588  	/* Get the next period (per-CPU) */
1589  	hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
1590  
1591  	/* Offset the tick to avert 'jiffies_lock' contention. */
1592  	if (sched_skew_tick) {
1593  		u64 offset = TICK_NSEC >> 1;
1594  		do_div(offset, num_possible_cpus());
1595  		offset *= smp_processor_id();
1596  		hrtimer_add_expires_ns(&ts->sched_timer, offset);
1597  	}
1598  
1599  	hrtimer_forward_now(&ts->sched_timer, TICK_NSEC);
1600  	if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer)
1601  		hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD);
1602  	else
1603  		tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
1604  	tick_nohz_activate(ts);
1605  }
1606  
1607  /*
1608   * Shut down the tick and make sure the CPU won't try to retake the timekeeping
1609   * duty before disabling IRQs in idle for the last time.
1610   */
1611  void tick_sched_timer_dying(int cpu)
1612  {
1613  	struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
1614  	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
1615  	struct clock_event_device *dev = td->evtdev;
1616  	ktime_t idle_sleeptime, iowait_sleeptime;
1617  	unsigned long idle_calls, idle_sleeps;
1618  
1619  	/* This must happen before hrtimers are migrated! */
1620  	tick_sched_timer_cancel(ts);
1621  
1622  	/*
1623  	 * If the clockevents doesn't support CLOCK_EVT_STATE_ONESHOT_STOPPED,
1624  	 * make sure not to call low-res tick handler.
1625  	 */
1626  	if (tick_sched_flag_test(ts, TS_FLAG_NOHZ))
1627  		dev->event_handler = clockevents_handle_noop;
1628  
1629  	idle_sleeptime = ts->idle_sleeptime;
1630  	iowait_sleeptime = ts->iowait_sleeptime;
1631  	idle_calls = ts->idle_calls;
1632  	idle_sleeps = ts->idle_sleeps;
1633  	memset(ts, 0, sizeof(*ts));
1634  	ts->idle_sleeptime = idle_sleeptime;
1635  	ts->iowait_sleeptime = iowait_sleeptime;
1636  	ts->idle_calls = idle_calls;
1637  	ts->idle_sleeps = idle_sleeps;
1638  }
1639  
1640  /*
1641   * Async notification about clocksource changes
1642   */
1643  void tick_clock_notify(void)
1644  {
1645  	int cpu;
1646  
1647  	for_each_possible_cpu(cpu)
1648  		set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks);
1649  }
1650  
1651  /*
1652   * Async notification about clock event changes
1653   */
1654  void tick_oneshot_notify(void)
1655  {
1656  	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1657  
1658  	set_bit(0, &ts->check_clocks);
1659  }
1660  
1661  /*
1662   * Check if a change happened, which makes oneshot possible.
1663   *
1664   * Called cyclically from the hrtimer softirq (driven by the timer
1665   * softirq). 'allow_nohz' signals that we can switch into low-res NOHZ
1666   * mode, because high resolution timers are disabled (either compile
1667   * or runtime). Called with interrupts disabled.
1668   */
1669  int tick_check_oneshot_change(int allow_nohz)
1670  {
1671  	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1672  
1673  	if (!test_and_clear_bit(0, &ts->check_clocks))
1674  		return 0;
1675  
1676  	if (tick_sched_flag_test(ts, TS_FLAG_NOHZ))
1677  		return 0;
1678  
1679  	if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available())
1680  		return 0;
1681  
1682  	if (!allow_nohz)
1683  		return 1;
1684  
1685  	tick_nohz_switch_to_nohz();
1686  	return 0;
1687  }
1688