1  // SPDX-License-Identifier: GPL-2.0+
2  /*
3   * This file contains the functions which manage clocksource drivers.
4   *
5   * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com)
6   */
7  
8  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
9  
10  #include <linux/device.h>
11  #include <linux/clocksource.h>
12  #include <linux/init.h>
13  #include <linux/module.h>
14  #include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
15  #include <linux/tick.h>
16  #include <linux/kthread.h>
17  #include <linux/prandom.h>
18  #include <linux/cpu.h>
19  
20  #include "tick-internal.h"
21  #include "timekeeping_internal.h"
22  
23  static noinline u64 cycles_to_nsec_safe(struct clocksource *cs, u64 start, u64 end)
24  {
25  	u64 delta = clocksource_delta(end, start, cs->mask);
26  
27  	if (likely(delta < cs->max_cycles))
28  		return clocksource_cyc2ns(delta, cs->mult, cs->shift);
29  
30  	return mul_u64_u32_shr(delta, cs->mult, cs->shift);
31  }
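/*
 * A minimal sketch of the conversion above, for illustration only
 * (hypothetical numbers, not taken from any real clocksource): with
 * mult = 0x64000000 and shift = 24 (a 10 MHz counter scaled to nanoseconds),
 * a delta of 10,000,000 cycles converts as
 *
 *	(10000000 * 0x64000000ULL) >> 24 == 1000000000 ns
 *
 * i.e. exactly one second.  The slow path using mul_u64_u32_shr() computes
 * the same product for deltas large enough that the 64-bit intermediate
 * used by clocksource_cyc2ns() could overflow.
 */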
32  
33  /**
34   * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks
35   * @mult:	pointer to mult variable
36   * @shift:	pointer to shift variable
37   * @from:	frequency to convert from
38   * @to:		frequency to convert to
39   * @maxsec:	guaranteed runtime conversion range in seconds
40   *
41   * The function evaluates the shift/mult pair for the scaled math
42   * operations of clocksources and clockevents.
43   *
44   * @to and @from are frequency values in HZ. For clock sources @to is
45   * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock
46   * event @to is the counter frequency and @from is NSEC_PER_SEC.
47   *
48   * The @maxsec conversion range argument controls the time frame in
49   * seconds which must be covered by the runtime conversion with the
50   * calculated mult and shift factors. This guarantees that no 64bit
51   * overflow happens when the input value of the conversion is
52   * multiplied with the calculated mult factor. Larger ranges may
53   * reduce the conversion accuracy by choosing smaller mult and shift
54   * factors.
55   */
56  void
57  clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec)
58  {
59  	u64 tmp;
60  	u32 sft, sftacc = 32;
61  
62  	/*
63  	 * Calculate the shift factor which is limiting the conversion
64  	 * range:
65  	 */
66  	tmp = ((u64)maxsec * from) >> 32;
67  	while (tmp) {
68  		tmp >>= 1;
69  		sftacc--;
70  	}
71  
72  	/*
73  	 * Find the conversion shift/mult pair which has the best
74  	 * accuracy and fits the maxsec conversion range:
75  	 */
76  	for (sft = 32; sft > 0; sft--) {
77  		tmp = (u64) to << sft;
78  		tmp += from / 2;
79  		do_div(tmp, from);
80  		if ((tmp >> sftacc) == 0)
81  			break;
82  	}
83  	*mult = tmp;
84  	*shift = sft;
85  }
86  EXPORT_SYMBOL_GPL(clocks_calc_mult_shift);
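/*
 * Illustrative usage sketch (hypothetical driver code, not part of this
 * file): a clocksource driver with a 10 MHz counter that wants cycle ->
 * nanosecond factors valid for at least 600 seconds could do:
 *
 *	u32 mult, shift;
 *
 *	clocks_calc_mult_shift(&mult, &shift, 10000000, NSEC_PER_SEC, 600);
 *
 * For these inputs the function picks shift = 24 and mult = 0x64000000,
 * so cycles convert to nanoseconds as (cycles * mult) >> shift.  Most
 * drivers do not call this directly; clocksource_register_hz() ends up
 * doing the equivalent work via __clocksource_update_freq_scale().
 */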
87  
88  /*[Clocksource internal variables]---------
89   * curr_clocksource:
90   *	currently selected clocksource.
91   * suspend_clocksource:
92   *	used to calculate the suspend time.
93   * clocksource_list:
94   *	linked list with the registered clocksources
95   * clocksource_mutex:
96   *	protects manipulations to curr_clocksource and the clocksource_list
97   * override_name:
98   *	Name of the user-specified clocksource.
99   */
100  static struct clocksource *curr_clocksource;
101  static struct clocksource *suspend_clocksource;
102  static LIST_HEAD(clocksource_list);
103  static DEFINE_MUTEX(clocksource_mutex);
104  static char override_name[CS_NAME_LEN];
105  static int finished_booting;
106  static u64 suspend_start;
107  
108  /*
109   * Interval: 0.5sec.
110   */
111  #define WATCHDOG_INTERVAL (HZ >> 1)
112  #define WATCHDOG_INTERVAL_MAX_NS ((2 * WATCHDOG_INTERVAL) * (NSEC_PER_SEC / HZ))
113  
114  /*
115   * Threshold: 0.0312s, when doubled: 0.0625s.
116   */
117  #define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 5)
118  
119  /*
120   * Maximum permissible delay between two readouts of the watchdog
121   * clocksource surrounding a read of the clocksource being validated.
122   * This delay could be due to SMIs, NMIs, or to VCPU preemptions.  Used as
123   * a lower bound for cs->uncertainty_margin values when registering clocks.
124   *
125   * The default of 500 parts per million is based on NTP's limits.
126   * If a clocksource is good enough for NTP, it is good enough for us!
127   *
128   * In other words, by default, even if a clocksource is extremely
129   * precise (for example, with a sub-nanosecond period), the maximum
130   * permissible skew between the clocksource watchdog and the clocksource
131   * under test is not permitted to go below the 500ppm minimum defined
132   * by MAX_SKEW_USEC.  This 500ppm minimum may be overridden using the
133   * CLOCKSOURCE_WATCHDOG_MAX_SKEW_US Kconfig option.
134   */
135  #ifdef CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
136  #define MAX_SKEW_USEC	CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
137  #else
138  #define MAX_SKEW_USEC	(125 * WATCHDOG_INTERVAL / HZ)
139  #endif
140  
141  /*
142   * Default for maximum permissible skew when cs->uncertainty_margin is
143   * not specified, and the lower bound even when cs->uncertainty_margin
144   * is specified.  This is also the default that is used when registering
145   * clocks with unspecified cs->uncertainty_margin, so this macro is used
146   * even in CONFIG_CLOCKSOURCE_WATCHDOG=n kernels.
147   */
148  #define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC)
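/*
 * Worked example of the default bound above (no Kconfig override assumed):
 * MAX_SKEW_USEC == 125 * WATCHDOG_INTERVAL / HZ == 125 * (HZ / 2) / HZ,
 * which works out to 62 usec for all common HZ values, so WATCHDOG_MAX_SKEW
 * is 62,000 ns.  The registration code below clamps cs->uncertainty_margin
 * to at least 2 * WATCHDOG_MAX_SKEW (~124 usec), and the watchdog compares
 * against the *sum* of the two margins (~248 usec), which over the
 * 0.5 second WATCHDOG_INTERVAL is roughly the advertised 500 ppm.
 */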
149  
150  #ifdef CONFIG_CLOCKSOURCE_WATCHDOG
151  static void clocksource_watchdog_work(struct work_struct *work);
152  static void clocksource_select(void);
153  
154  static LIST_HEAD(watchdog_list);
155  static struct clocksource *watchdog;
156  static struct timer_list watchdog_timer;
157  static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
158  static DEFINE_SPINLOCK(watchdog_lock);
159  static int watchdog_running;
160  static atomic_t watchdog_reset_pending;
161  static int64_t watchdog_max_interval;
162  
163  static inline void clocksource_watchdog_lock(unsigned long *flags)
164  {
165  	spin_lock_irqsave(&watchdog_lock, *flags);
166  }
167  
168  static inline void clocksource_watchdog_unlock(unsigned long *flags)
169  {
170  	spin_unlock_irqrestore(&watchdog_lock, *flags);
171  }
172  
173  static int clocksource_watchdog_kthread(void *data);
174  static void __clocksource_change_rating(struct clocksource *cs, int rating);
175  
176  static void clocksource_watchdog_work(struct work_struct *work)
177  {
178  	/*
179  	 * We cannot directly run clocksource_watchdog_kthread() here, because
180  	 * clocksource_select() calls timekeeping_notify() which uses
181  	 * stop_machine(). One cannot use stop_machine() from a workqueue()
182  	 * due to lock inversions with respect to CPU hotplug.
183  	 *
184  	 * Also, we only ever run this work once or twice during the lifetime
185  	 * of the kernel, so there is no point in creating a more permanent
186  	 * kthread for this.
187  	 *
188  	 * If kthread_run fails the next watchdog scan over the
189  	 * watchdog_list will find the unstable clock again.
190  	 */
191  	kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog");
192  }
193  
194  static void __clocksource_unstable(struct clocksource *cs)
195  {
196  	cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
197  	cs->flags |= CLOCK_SOURCE_UNSTABLE;
198  
199  	/*
200  	 * If the clocksource is registered, clocksource_watchdog_kthread()
201  	 * will re-rate and re-select it.
202  	 */
203  	if (list_empty(&cs->list)) {
204  		cs->rating = 0;
205  		return;
206  	}
207  
208  	if (cs->mark_unstable)
209  		cs->mark_unstable(cs);
210  
211  	/* kick clocksource_watchdog_kthread() */
212  	if (finished_booting)
213  		schedule_work(&watchdog_work);
214  }
215  
216  /**
217   * clocksource_mark_unstable - mark clocksource unstable via watchdog
218   * @cs:		clocksource to be marked unstable
219   *
220   * This function is called by the x86 TSC code to mark clocksources as unstable;
221   * it defers demotion and re-selection to a kthread.
222   */
223  void clocksource_mark_unstable(struct clocksource *cs)
224  {
225  	unsigned long flags;
226  
227  	spin_lock_irqsave(&watchdog_lock, flags);
228  	if (!(cs->flags & CLOCK_SOURCE_UNSTABLE)) {
229  		if (!list_empty(&cs->list) && list_empty(&cs->wd_list))
230  			list_add(&cs->wd_list, &watchdog_list);
231  		__clocksource_unstable(cs);
232  	}
233  	spin_unlock_irqrestore(&watchdog_lock, flags);
234  }
235  
236  static int verify_n_cpus = 8;
237  module_param(verify_n_cpus, int, 0644);
238  
239  enum wd_read_status {
240  	WD_READ_SUCCESS,
241  	WD_READ_UNSTABLE,
242  	WD_READ_SKIP
243  };
244  
245  static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow)
246  {
247  	int64_t md = 2 * watchdog->uncertainty_margin;
248  	unsigned int nretries, max_retries;
249  	int64_t wd_delay, wd_seq_delay;
250  	u64 wd_end, wd_end2;
251  
252  	max_retries = clocksource_get_max_watchdog_retry();
253  	for (nretries = 0; nretries <= max_retries; nretries++) {
254  		local_irq_disable();
255  		*wdnow = watchdog->read(watchdog);
256  		*csnow = cs->read(cs);
257  		wd_end = watchdog->read(watchdog);
258  		wd_end2 = watchdog->read(watchdog);
259  		local_irq_enable();
260  
261  		wd_delay = cycles_to_nsec_safe(watchdog, *wdnow, wd_end);
262  		if (wd_delay <= md + cs->uncertainty_margin) {
263  			if (nretries > 1 && nretries >= max_retries) {
264  				pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n",
265  					smp_processor_id(), watchdog->name, nretries);
266  			}
267  			return WD_READ_SUCCESS;
268  		}
269  
270  		/*
271  		 * Now compute the delay between the consecutive watchdog reads
272  		 * to see whether there is too much external interference causing
273  		 * a significant delay in reading both the clocksource and the watchdog.
274  		 *
275  		 * If the consecutive WD read-back delay is > md, report the
276  		 * system as busy, reinit the watchdog and skip the current
277  		 * watchdog test.
278  		 */
279  		wd_seq_delay = cycles_to_nsec_safe(watchdog, wd_end, wd_end2);
280  		if (wd_seq_delay > md)
281  			goto skip_test;
282  	}
283  
284  	pr_warn("timekeeping watchdog on CPU%d: wd-%s-wd excessive read-back delay of %lldns vs. limit of %ldns, wd-wd read-back delay only %lldns, attempt %d, marking %s unstable\n",
285  		smp_processor_id(), cs->name, wd_delay, WATCHDOG_MAX_SKEW, wd_seq_delay, nretries, cs->name);
286  	return WD_READ_UNSTABLE;
287  
288  skip_test:
289  	pr_info("timekeeping watchdog on CPU%d: %s wd-wd read-back delay of %lldns\n",
290  		smp_processor_id(), watchdog->name, wd_seq_delay);
291  	pr_info("wd-%s-wd read-back delay of %lldns, clock-skew test skipped!\n",
292  		cs->name, wd_delay);
293  	return WD_READ_SKIP;
294  }
295  
296  static u64 csnow_mid;
297  static cpumask_t cpus_ahead;
298  static cpumask_t cpus_behind;
299  static cpumask_t cpus_chosen;
300  
301  static void clocksource_verify_choose_cpus(void)
302  {
303  	int cpu, i, n = verify_n_cpus;
304  
305  	if (n < 0) {
306  		/* Check all of the CPUs. */
307  		cpumask_copy(&cpus_chosen, cpu_online_mask);
308  		cpumask_clear_cpu(smp_processor_id(), &cpus_chosen);
309  		return;
310  	}
311  
312  	/* If no checking is desired, or there is no other CPU to check, leave. */
313  	cpumask_clear(&cpus_chosen);
314  	if (n == 0 || num_online_cpus() <= 1)
315  		return;
316  
317  	/* Make sure to select at least one CPU other than the current CPU. */
318  	cpu = cpumask_first(cpu_online_mask);
319  	if (cpu == smp_processor_id())
320  		cpu = cpumask_next(cpu, cpu_online_mask);
321  	if (WARN_ON_ONCE(cpu >= nr_cpu_ids))
322  		return;
323  	cpumask_set_cpu(cpu, &cpus_chosen);
324  
325  	/* Force a sane value for the boot parameter. */
326  	if (n > nr_cpu_ids)
327  		n = nr_cpu_ids;
328  
329  	/*
330  	 * Randomly select the specified number of CPUs.  If the same
331  	 * CPU is selected multiple times, that CPU is checked only once,
332  	 * and no replacement CPU is selected.  This gracefully handles
333  	 * situations where verify_n_cpus is greater than the number of
334  	 * CPUs that are currently online.
335  	 */
336  	for (i = 1; i < n; i++) {
337  		cpu = get_random_u32_below(nr_cpu_ids);
338  		cpu = cpumask_next(cpu - 1, cpu_online_mask);
339  		if (cpu >= nr_cpu_ids)
340  			cpu = cpumask_first(cpu_online_mask);
341  		if (!WARN_ON_ONCE(cpu >= nr_cpu_ids))
342  			cpumask_set_cpu(cpu, &cpus_chosen);
343  	}
344  
345  	/* Don't verify ourselves. */
346  	cpumask_clear_cpu(smp_processor_id(), &cpus_chosen);
347  }
348  
349  static void clocksource_verify_one_cpu(void *csin)
350  {
351  	struct clocksource *cs = (struct clocksource *)csin;
352  
353  	csnow_mid = cs->read(cs);
354  }
355  
356  void clocksource_verify_percpu(struct clocksource *cs)
357  {
358  	int64_t cs_nsec, cs_nsec_max = 0, cs_nsec_min = LLONG_MAX;
359  	u64 csnow_begin, csnow_end;
360  	int cpu, testcpu;
361  	s64 delta;
362  
363  	if (verify_n_cpus == 0)
364  		return;
365  	cpumask_clear(&cpus_ahead);
366  	cpumask_clear(&cpus_behind);
367  	cpus_read_lock();
368  	preempt_disable();
369  	clocksource_verify_choose_cpus();
370  	if (cpumask_empty(&cpus_chosen)) {
371  		preempt_enable();
372  		cpus_read_unlock();
373  		pr_warn("Not enough CPUs to check clocksource '%s'.\n", cs->name);
374  		return;
375  	}
376  	testcpu = smp_processor_id();
377  	pr_warn("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n", cs->name, testcpu, cpumask_pr_args(&cpus_chosen));
378  	for_each_cpu(cpu, &cpus_chosen) {
379  		if (cpu == testcpu)
380  			continue;
381  		csnow_begin = cs->read(cs);
382  		smp_call_function_single(cpu, clocksource_verify_one_cpu, cs, 1);
383  		csnow_end = cs->read(cs);
384  		delta = (s64)((csnow_mid - csnow_begin) & cs->mask);
385  		if (delta < 0)
386  			cpumask_set_cpu(cpu, &cpus_behind);
387  		delta = (csnow_end - csnow_mid) & cs->mask;
388  		if (delta < 0)
389  			cpumask_set_cpu(cpu, &cpus_ahead);
390  		cs_nsec = cycles_to_nsec_safe(cs, csnow_begin, csnow_end);
391  		if (cs_nsec > cs_nsec_max)
392  			cs_nsec_max = cs_nsec;
393  		if (cs_nsec < cs_nsec_min)
394  			cs_nsec_min = cs_nsec;
395  	}
396  	preempt_enable();
397  	cpus_read_unlock();
398  	if (!cpumask_empty(&cpus_ahead))
399  		pr_warn("        CPUs %*pbl ahead of CPU %d for clocksource %s.\n",
400  			cpumask_pr_args(&cpus_ahead), testcpu, cs->name);
401  	if (!cpumask_empty(&cpus_behind))
402  		pr_warn("        CPUs %*pbl behind CPU %d for clocksource %s.\n",
403  			cpumask_pr_args(&cpus_behind), testcpu, cs->name);
404  	if (!cpumask_empty(&cpus_ahead) || !cpumask_empty(&cpus_behind))
405  		pr_warn("        CPU %d check durations %lldns - %lldns for clocksource %s.\n",
406  			testcpu, cs_nsec_min, cs_nsec_max, cs->name);
407  }
408  EXPORT_SYMBOL_GPL(clocksource_verify_percpu);
409  
410  static inline void clocksource_reset_watchdog(void)
411  {
412  	struct clocksource *cs;
413  
414  	list_for_each_entry(cs, &watchdog_list, wd_list)
415  		cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
416  }
417  
418  
419  static void clocksource_watchdog(struct timer_list *unused)
420  {
421  	int64_t wd_nsec, cs_nsec, interval;
422  	u64 csnow, wdnow, cslast, wdlast;
423  	int next_cpu, reset_pending;
424  	struct clocksource *cs;
425  	enum wd_read_status read_ret;
426  	unsigned long extra_wait = 0;
427  	u32 md;
428  
429  	spin_lock(&watchdog_lock);
430  	if (!watchdog_running)
431  		goto out;
432  
433  	reset_pending = atomic_read(&watchdog_reset_pending);
434  
435  	list_for_each_entry(cs, &watchdog_list, wd_list) {
436  
437  		/* Clocksource already marked unstable? */
438  		if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
439  			if (finished_booting)
440  				schedule_work(&watchdog_work);
441  			continue;
442  		}
443  
444  		read_ret = cs_watchdog_read(cs, &csnow, &wdnow);
445  
446  		if (read_ret == WD_READ_UNSTABLE) {
447  			/* Clock readout unreliable, so give it up. */
448  			__clocksource_unstable(cs);
449  			continue;
450  		}
451  
452  		/*
453  		 * When WD_READ_SKIP is returned, it means the system is likely
454  		 * under very heavy load, where the latency of reading the
455  		 * watchdog/clocksource is very large and affects the accuracy
456  		 * of the watchdog check. So give the system some room and
457  		 * suspend the watchdog check for 5 minutes.
458  		 */
459  		if (read_ret == WD_READ_SKIP) {
460  			/*
461  			 * As the watchdog timer will be suspended and
462  			 * cs->last could remain unchanged for 5 minutes,
463  			 * reset the counters.
464  			 */
465  			clocksource_reset_watchdog();
466  			extra_wait = HZ * 300;
467  			break;
468  		}
469  
470  		/* Clocksource initialized? */
471  		if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||
472  		    atomic_read(&watchdog_reset_pending)) {
473  			cs->flags |= CLOCK_SOURCE_WATCHDOG;
474  			cs->wd_last = wdnow;
475  			cs->cs_last = csnow;
476  			continue;
477  		}
478  
479  		wd_nsec = cycles_to_nsec_safe(watchdog, cs->wd_last, wdnow);
480  		cs_nsec = cycles_to_nsec_safe(cs, cs->cs_last, csnow);
481  		wdlast = cs->wd_last; /* save these in case we print them */
482  		cslast = cs->cs_last;
483  		cs->cs_last = csnow;
484  		cs->wd_last = wdnow;
485  
486  		if (atomic_read(&watchdog_reset_pending))
487  			continue;
488  
489  		/*
490  		 * The processing of timer softirqs can get delayed (usually
491  		 * on account of ksoftirqd not getting to run in a timely
492  		 * manner), which causes the watchdog interval to stretch.
493  		 * Skew detection may fail for longer watchdog intervals
494  		 * on account of fixed margins being used.
495  		 * Some clocksources, e.g. acpi_pm, cannot tolerate
496  		 * watchdog intervals longer than a few seconds.
497  		 */
498  		interval = max(cs_nsec, wd_nsec);
499  		if (unlikely(interval > WATCHDOG_INTERVAL_MAX_NS)) {
500  			if (system_state > SYSTEM_SCHEDULING &&
501  			    interval > 2 * watchdog_max_interval) {
502  				watchdog_max_interval = interval;
503  				pr_warn("Long readout interval, skipping watchdog check: cs_nsec: %lld wd_nsec: %lld\n",
504  					cs_nsec, wd_nsec);
505  			}
506  			watchdog_timer.expires = jiffies;
507  			continue;
508  		}
509  
510  		/* Check the deviation from the watchdog clocksource. */
511  		md = cs->uncertainty_margin + watchdog->uncertainty_margin;
512  		if (abs(cs_nsec - wd_nsec) > md) {
513  			s64 cs_wd_msec;
514  			s64 wd_msec;
515  			u32 wd_rem;
516  
517  			pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n",
518  				smp_processor_id(), cs->name);
519  			pr_warn("                      '%s' wd_nsec: %lld wd_now: %llx wd_last: %llx mask: %llx\n",
520  				watchdog->name, wd_nsec, wdnow, wdlast, watchdog->mask);
521  			pr_warn("                      '%s' cs_nsec: %lld cs_now: %llx cs_last: %llx mask: %llx\n",
522  				cs->name, cs_nsec, csnow, cslast, cs->mask);
523  			cs_wd_msec = div_s64_rem(cs_nsec - wd_nsec, 1000 * 1000, &wd_rem);
524  			wd_msec = div_s64_rem(wd_nsec, 1000 * 1000, &wd_rem);
525  			pr_warn("                      Clocksource '%s' skewed %lld ns (%lld ms) over watchdog '%s' interval of %lld ns (%lld ms)\n",
526  				cs->name, cs_nsec - wd_nsec, cs_wd_msec, watchdog->name, wd_nsec, wd_msec);
527  			if (curr_clocksource == cs)
528  				pr_warn("                      '%s' is current clocksource.\n", cs->name);
529  			else if (curr_clocksource)
530  				pr_warn("                      '%s' (not '%s') is current clocksource.\n", curr_clocksource->name, cs->name);
531  			else
532  				pr_warn("                      No current clocksource.\n");
533  			__clocksource_unstable(cs);
534  			continue;
535  		}
536  
537  		if (cs == curr_clocksource && cs->tick_stable)
538  			cs->tick_stable(cs);
539  
540  		if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
541  		    (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
542  		    (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
543  			/* Mark it valid for high-res. */
544  			cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
545  
546  			/*
547  			 * clocksource_done_booting() will sort it if
548  			 * finished_booting is not set yet.
549  			 */
550  			if (!finished_booting)
551  				continue;
552  
553  			/*
554  			 * If this is not the current clocksource let
555  			 * the watchdog thread reselect it. Due to the
556  			 * change to high res this clocksource might
557  			 * be preferred now. If it is the current
558  			 * clocksource let the tick code know about
559  			 * that change.
560  			 */
561  			if (cs != curr_clocksource) {
562  				cs->flags |= CLOCK_SOURCE_RESELECT;
563  				schedule_work(&watchdog_work);
564  			} else {
565  				tick_clock_notify();
566  			}
567  		}
568  	}
569  
570  	/*
571  	 * Only clear watchdog_reset_pending after a full cycle
572  	 * through all clocksources.
573  	 */
574  	if (reset_pending)
575  		atomic_dec(&watchdog_reset_pending);
576  
577  	/*
578  	 * Cycle through CPUs to check if the CPUs stay synchronized
579  	 * to each other.
580  	 */
581  	next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
582  	if (next_cpu >= nr_cpu_ids)
583  		next_cpu = cpumask_first(cpu_online_mask);
584  
585  	/*
586  	 * Arm the timer if it is not already pending: this could race with a
587  	 * concurrent clocksource_stop_watchdog()/clocksource_start_watchdog() pair.
588  	 */
589  	if (!timer_pending(&watchdog_timer)) {
590  		watchdog_timer.expires += WATCHDOG_INTERVAL + extra_wait;
591  		add_timer_on(&watchdog_timer, next_cpu);
592  	}
593  out:
594  	spin_unlock(&watchdog_lock);
595  }
596  
597  static inline void clocksource_start_watchdog(void)
598  {
599  	if (watchdog_running || !watchdog || list_empty(&watchdog_list))
600  		return;
601  	timer_setup(&watchdog_timer, clocksource_watchdog, 0);
602  	watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
603  	add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
604  	watchdog_running = 1;
605  }
606  
607  static inline void clocksource_stop_watchdog(void)
608  {
609  	if (!watchdog_running || (watchdog && !list_empty(&watchdog_list)))
610  		return;
611  	del_timer(&watchdog_timer);
612  	watchdog_running = 0;
613  }
614  
615  static void clocksource_resume_watchdog(void)
616  {
617  	atomic_inc(&watchdog_reset_pending);
618  }
619  
620  static void clocksource_enqueue_watchdog(struct clocksource *cs)
621  {
622  	INIT_LIST_HEAD(&cs->wd_list);
623  
624  	if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
625  		/* cs is a clocksource to be watched. */
626  		list_add(&cs->wd_list, &watchdog_list);
627  		cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
628  	} else {
629  		/* cs is a watchdog. */
630  		if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
631  			cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
632  	}
633  }
634  
635  static void clocksource_select_watchdog(bool fallback)
636  {
637  	struct clocksource *cs, *old_wd;
638  	unsigned long flags;
639  
640  	spin_lock_irqsave(&watchdog_lock, flags);
641  	/* save current watchdog */
642  	old_wd = watchdog;
643  	if (fallback)
644  		watchdog = NULL;
645  
646  	list_for_each_entry(cs, &clocksource_list, list) {
647  		/* cs is a clocksource to be watched. */
648  		if (cs->flags & CLOCK_SOURCE_MUST_VERIFY)
649  			continue;
650  
651  		/* Skip the current watchdog if a fallback was requested. */
652  		if (fallback && cs == old_wd)
653  			continue;
654  
655  		/* Pick the best watchdog. */
656  		if (!watchdog || cs->rating > watchdog->rating)
657  			watchdog = cs;
658  	}
659  	/* If we failed to find a fallback, restore the old one. */
660  	if (!watchdog)
661  		watchdog = old_wd;
662  
663  	/* If we changed the watchdog we need to reset cycles. */
664  	if (watchdog != old_wd)
665  		clocksource_reset_watchdog();
666  
667  	/* Check if the watchdog timer needs to be started. */
668  	clocksource_start_watchdog();
669  	spin_unlock_irqrestore(&watchdog_lock, flags);
670  }
671  
672  static void clocksource_dequeue_watchdog(struct clocksource *cs)
673  {
674  	if (cs != watchdog) {
675  		if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
676  			/* cs is a watched clocksource. */
677  			list_del_init(&cs->wd_list);
678  			/* Check if the watchdog timer needs to be stopped. */
679  			clocksource_stop_watchdog();
680  		}
681  	}
682  }
683  
684  static int __clocksource_watchdog_kthread(void)
685  {
686  	struct clocksource *cs, *tmp;
687  	unsigned long flags;
688  	int select = 0;
689  
690  	/* Do any required per-CPU skew verification. */
691  	if (curr_clocksource &&
692  	    curr_clocksource->flags & CLOCK_SOURCE_UNSTABLE &&
693  	    curr_clocksource->flags & CLOCK_SOURCE_VERIFY_PERCPU)
694  		clocksource_verify_percpu(curr_clocksource);
695  
696  	spin_lock_irqsave(&watchdog_lock, flags);
697  	list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
698  		if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
699  			list_del_init(&cs->wd_list);
700  			__clocksource_change_rating(cs, 0);
701  			select = 1;
702  		}
703  		if (cs->flags & CLOCK_SOURCE_RESELECT) {
704  			cs->flags &= ~CLOCK_SOURCE_RESELECT;
705  			select = 1;
706  		}
707  	}
708  	/* Check if the watchdog timer needs to be stopped. */
709  	clocksource_stop_watchdog();
710  	spin_unlock_irqrestore(&watchdog_lock, flags);
711  
712  	return select;
713  }
714  
715  static int clocksource_watchdog_kthread(void *data)
716  {
717  	mutex_lock(&clocksource_mutex);
718  	if (__clocksource_watchdog_kthread())
719  		clocksource_select();
720  	mutex_unlock(&clocksource_mutex);
721  	return 0;
722  }
723  
724  static bool clocksource_is_watchdog(struct clocksource *cs)
725  {
726  	return cs == watchdog;
727  }
728  
729  #else /* CONFIG_CLOCKSOURCE_WATCHDOG */
730  
731  static void clocksource_enqueue_watchdog(struct clocksource *cs)
732  {
733  	if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
734  		cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
735  }
736  
737  static void clocksource_select_watchdog(bool fallback) { }
738  static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
739  static inline void clocksource_resume_watchdog(void) { }
740  static inline int __clocksource_watchdog_kthread(void) { return 0; }
741  static bool clocksource_is_watchdog(struct clocksource *cs) { return false; }
742  void clocksource_mark_unstable(struct clocksource *cs) { }
743  
744  static inline void clocksource_watchdog_lock(unsigned long *flags) { }
745  static inline void clocksource_watchdog_unlock(unsigned long *flags) { }
746  
747  #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
748  
749  static bool clocksource_is_suspend(struct clocksource *cs)
750  {
751  	return cs == suspend_clocksource;
752  }
753  
754  static void __clocksource_suspend_select(struct clocksource *cs)
755  {
756  	/*
757  	 * Skip clocksources which will be stopped when the system suspends.
758  	 */
759  	if (!(cs->flags & CLOCK_SOURCE_SUSPEND_NONSTOP))
760  		return;
761  
762  	/*
763  	 * The nonstop clocksource can be selected as the suspend clocksource to
764  	 * calculate the suspend time, so it should not supply suspend/resume
765  	 * interfaces that would stop it when the system suspends.
766  	 */
767  	if (cs->suspend || cs->resume) {
768  		pr_warn("Nonstop clocksource %s should not supply suspend/resume interfaces\n",
769  			cs->name);
770  	}
771  
772  	/* Pick the best rating. */
773  	if (!suspend_clocksource || cs->rating > suspend_clocksource->rating)
774  		suspend_clocksource = cs;
775  }
776  
777  /**
778   * clocksource_suspend_select - Select the best clocksource for suspend timing
779   * @fallback:	if true, select a fallback clocksource
780   */
781  static void clocksource_suspend_select(bool fallback)
782  {
783  	struct clocksource *cs, *old_suspend;
784  
785  	old_suspend = suspend_clocksource;
786  	if (fallback)
787  		suspend_clocksource = NULL;
788  
789  	list_for_each_entry(cs, &clocksource_list, list) {
790  		/* Skip the current suspend clocksource if a fallback was requested. */
791  		if (fallback && cs == old_suspend)
792  			continue;
793  
794  		__clocksource_suspend_select(cs);
795  	}
796  }
797  
798  /**
799   * clocksource_start_suspend_timing - Start measuring the suspend timing
800   * @cs:			current clocksource from timekeeping
801   * @start_cycles:	current cycles from timekeeping
802   *
803   * This function saves the start cycle value of the suspend timer, which is
804   * used to calculate the suspend time when the system resumes.
805   *
806   * This function is called late in the suspend process from timekeeping_suspend(),
807   * which means that processes are frozen and non-boot CPUs and interrupts are
808   * disabled. It is therefore possible to start the suspend timer without
809   * taking the clocksource mutex.
810   */
811  void clocksource_start_suspend_timing(struct clocksource *cs, u64 start_cycles)
812  {
813  	if (!suspend_clocksource)
814  		return;
815  
816  	/*
817  	 * If the current clocksource is the suspend timer, use the
818  	 * tkr_mono.cycle_last value as suspend_start to avoid a duplicate
819  	 * reading of the suspend timer.
820  	 */
821  	if (clocksource_is_suspend(cs)) {
822  		suspend_start = start_cycles;
823  		return;
824  	}
825  
826  	if (suspend_clocksource->enable &&
827  	    suspend_clocksource->enable(suspend_clocksource)) {
828  		pr_warn_once("Failed to enable the non-suspend-able clocksource.\n");
829  		return;
830  	}
831  
832  	suspend_start = suspend_clocksource->read(suspend_clocksource);
833  }
834  
835  /**
836   * clocksource_stop_suspend_timing - Stop measuring the suspend timing
837   * @cs:		current clocksource from timekeeping
838   * @cycle_now:	current cycles from timekeeping
839   *
840   * This function calculates the suspend time from the suspend timer.
841   *
842   * Returns nanoseconds since suspend started, or 0 if there is no usable suspend clocksource.
843   *
844   * This function is called early in the resume process from timekeeping_resume(),
845   * which means there is only one online CPU, no processes are running and
846   * interrupts are disabled. It is therefore possible to stop the suspend timer
847   * without taking the clocksource mutex.
848   */
849  u64 clocksource_stop_suspend_timing(struct clocksource *cs, u64 cycle_now)
850  {
851  	u64 now, nsec = 0;
852  
853  	if (!suspend_clocksource)
854  		return 0;
855  
856  	/*
857  	 * If the current clocksource is the suspend timer, use the
858  	 * tkr_mono.cycle_last value from timekeeping as the current cycle to
859  	 * avoid a duplicate reading of the suspend timer.
860  	 */
861  	if (clocksource_is_suspend(cs))
862  		now = cycle_now;
863  	else
864  		now = suspend_clocksource->read(suspend_clocksource);
865  
866  	if (now > suspend_start)
867  		nsec = cycles_to_nsec_safe(suspend_clocksource, suspend_start, now);
868  
869  	/*
870  	 * Disable the suspend timer to save power if current clocksource is
871  	 * not the suspend timer.
872  	 */
873  	if (!clocksource_is_suspend(cs) && suspend_clocksource->disable)
874  		suspend_clocksource->disable(suspend_clocksource);
875  
876  	return nsec;
877  }
878  
879  /**
880   * clocksource_suspend - suspend the clocksource(s)
881   */
882  void clocksource_suspend(void)
883  {
884  	struct clocksource *cs;
885  
886  	list_for_each_entry_reverse(cs, &clocksource_list, list)
887  		if (cs->suspend)
888  			cs->suspend(cs);
889  }
890  
891  /**
892   * clocksource_resume - resume the clocksource(s)
893   */
894  void clocksource_resume(void)
895  {
896  	struct clocksource *cs;
897  
898  	list_for_each_entry(cs, &clocksource_list, list)
899  		if (cs->resume)
900  			cs->resume(cs);
901  
902  	clocksource_resume_watchdog();
903  }
904  
905  /**
906   * clocksource_touch_watchdog - Update watchdog
907   *
908   * Update the watchdog after exception contexts such as kgdb so as not
909   * to incorrectly trip the watchdog. This might fail when the kernel
910   * was stopped in code which holds watchdog_lock.
911   */
912  void clocksource_touch_watchdog(void)
913  {
914  	clocksource_resume_watchdog();
915  }
916  
917  /**
918   * clocksource_max_adjustment - Returns the maximum adjustment amount
919   * @cs:         Pointer to clocksource
920   *
921   */
922  static u32 clocksource_max_adjustment(struct clocksource *cs)
923  {
924  	u64 ret;
925  	/*
926  	 * We won't try to correct for more than 11% adjustments (110,000 ppm).
927  	 */
928  	ret = (u64)cs->mult * 11;
929  	do_div(ret, 100);
930  	return (u32)ret;
931  }
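/*
 * Quick numeric sketch of the 11% cap above (example values only, not from
 * any real clocksource): for a clocksource with mult == 16,777,216, the
 * maximum correction allowed on top of mult is
 * 16777216 * 11 / 100 == 1,845,493, i.e. roughly 110,000 ppm in either
 * direction.
 */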
932  
933  /**
934   * clocks_calc_max_nsecs - Returns maximum nanoseconds that can be converted
935   * @mult:	cycle to nanosecond multiplier
936   * @shift:	cycle to nanosecond divisor (power of two)
937   * @maxadj:	maximum adjustment value to mult (~11%)
938   * @mask:	bitmask for two's complement subtraction of non-64-bit counters
939   * @max_cyc:	maximum cycle value before potential overflow (does not include
940   *		any safety margin)
941   *
942   * NOTE: This function includes a safety margin of 50%, in other words, we
943   * return half the number of nanoseconds the hardware counter can technically
944   * cover. This is done so that we can potentially detect problems caused by
945   * delayed timers or bad hardware, which might result in time intervals that
946   * are larger than what the math used can handle without overflows.
947   */
948  u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cyc)
949  {
950  	u64 max_nsecs, max_cycles;
951  
952  	/*
953  	 * Calculate the maximum number of cycles that we can pass to the
954  	 * cyc2ns() function without overflowing a 64-bit result.
955  	 */
956  	max_cycles = ULLONG_MAX;
957  	do_div(max_cycles, mult+maxadj);
958  
959  	/*
960  	 * The actual maximum number of cycles for which we can defer the
961  	 * clocksource is determined by the minimum of max_cycles and mask.
962  	 * Note: Here we subtract the maxadj to make sure we don't sleep for
963  	 * too long if there's a large negative adjustment.
964  	 */
965  	max_cycles = min(max_cycles, mask);
966  	max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift);
967  
968  	/* return the max_cycles value as well if requested */
969  	if (max_cyc)
970  		*max_cyc = max_cycles;
971  
972  	/* Return 50% of the actual maximum, so we can detect bad values */
973  	max_nsecs >>= 1;
974  
975  	return max_nsecs;
976  }
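/*
 * Illustrative sketch of the bound computed above (hypothetical values, not
 * from a real clocksource): with mult = 0x64000000, shift = 24,
 * maxadj = 11% of mult (184,549,376) and a 32-bit mask,
 * ULLONG_MAX / (mult + maxadj) allows about 9.9e9 cycles, so the 32-bit
 * mask (~4.29e9 cycles) is the limiting factor.  Converting mask cycles at
 * (mult - maxadj) gives roughly 382 seconds, and the returned value is half
 * of that, ~191 seconds of maximum deferment.
 */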
977  
978  /**
979   * clocksource_update_max_deferment - Updates the clocksource max_idle_ns & max_cycles
980   * @cs:         Pointer to clocksource to be updated
981   *
982   */
983  static inline void clocksource_update_max_deferment(struct clocksource *cs)
984  {
985  	cs->max_idle_ns = clocks_calc_max_nsecs(cs->mult, cs->shift,
986  						cs->maxadj, cs->mask,
987  						&cs->max_cycles);
988  }
989  
990  static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur)
991  {
992  	struct clocksource *cs;
993  
994  	if (!finished_booting || list_empty(&clocksource_list))
995  		return NULL;
996  
997  	/*
998  	 * We pick the clocksource with the highest rating. If oneshot
999  	 * mode is active, we pick the highres valid clocksource with
1000  	 * the best rating.
1001  	 */
1002  	list_for_each_entry(cs, &clocksource_list, list) {
1003  		if (skipcur && cs == curr_clocksource)
1004  			continue;
1005  		if (oneshot && !(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES))
1006  			continue;
1007  		return cs;
1008  	}
1009  	return NULL;
1010  }
1011  
1012  static void __clocksource_select(bool skipcur)
1013  {
1014  	bool oneshot = tick_oneshot_mode_active();
1015  	struct clocksource *best, *cs;
1016  
1017  	/* Find the best suitable clocksource */
1018  	best = clocksource_find_best(oneshot, skipcur);
1019  	if (!best)
1020  		return;
1021  
1022  	if (!strlen(override_name))
1023  		goto found;
1024  
1025  	/* Check for the override clocksource. */
1026  	list_for_each_entry(cs, &clocksource_list, list) {
1027  		if (skipcur && cs == curr_clocksource)
1028  			continue;
1029  		if (strcmp(cs->name, override_name) != 0)
1030  			continue;
1031  		/*
1032  		 * Check to make sure we don't switch to a non-highres
1033  		 * capable clocksource if the tick code is in oneshot
1034  		 * mode (highres or nohz)
1035  		 */
1036  		if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) {
1037  			/* Override clocksource cannot be used. */
1038  			if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
1039  				pr_warn("Override clocksource %s is unstable and not HRT compatible - cannot switch while in HRT/NOHZ mode\n",
1040  					cs->name);
1041  				override_name[0] = 0;
1042  			} else {
1043  				/*
1044  				 * The override cannot be currently verified.
1045  				 * Deferring to let the watchdog check.
1046  				 */
1047  				pr_info("Override clocksource %s is not currently HRT compatible - deferring\n",
1048  					cs->name);
1049  			}
1050  		} else
1051  			/* Override clocksource can be used. */
1052  			best = cs;
1053  		break;
1054  	}
1055  
1056  found:
1057  	if (curr_clocksource != best && !timekeeping_notify(best)) {
1058  		pr_info("Switched to clocksource %s\n", best->name);
1059  		curr_clocksource = best;
1060  	}
1061  }
1062  
1063  /**
1064   * clocksource_select - Select the best clocksource available
1065   *
1066   * Private function. Must hold clocksource_mutex when called.
1067   *
1068   * Select the clocksource with the best rating, or the clocksource
1069   * which was selected by the userspace override.
1070   */
1071  static void clocksource_select(void)
1072  {
1073  	__clocksource_select(false);
1074  }
1075  
1076  static void clocksource_select_fallback(void)
1077  {
1078  	__clocksource_select(true);
1079  }
1080  
1081  /*
1082   * clocksource_done_booting - Called near the end of core bootup
1083   *
1084   * Hack to avoid lots of clocksource churn at boot time.
1085   * We use fs_initcall because we want this to start before
1086   * device_initcall but after subsys_initcall.
1087   */
1088  static int __init clocksource_done_booting(void)
1089  {
1090  	mutex_lock(&clocksource_mutex);
1091  	curr_clocksource = clocksource_default_clock();
1092  	finished_booting = 1;
1093  	/*
1094  	 * Run the watchdog first to eliminate unstable clock sources
1095  	 */
1096  	__clocksource_watchdog_kthread();
1097  	clocksource_select();
1098  	mutex_unlock(&clocksource_mutex);
1099  	return 0;
1100  }
1101  fs_initcall(clocksource_done_booting);
1102  
1103  /*
1104   * Enqueue the clocksource sorted by rating
1105   */
1106  static void clocksource_enqueue(struct clocksource *cs)
1107  {
1108  	struct list_head *entry = &clocksource_list;
1109  	struct clocksource *tmp;
1110  
1111  	list_for_each_entry(tmp, &clocksource_list, list) {
1112  		/* Keep track of the place where to insert */
1113  		if (tmp->rating < cs->rating)
1114  			break;
1115  		entry = &tmp->list;
1116  	}
1117  	list_add(&cs->list, entry);
1118  }
1119  
1120  /**
1121   * __clocksource_update_freq_scale - Used to update the clocksource with a new freq
1122   * @cs:		clocksource to be registered
1123   * @scale:	Scale factor multiplied against freq to get clocksource hz
1124   * @freq:	clocksource frequency (cycles per second) divided by scale
1125   *
1126   * This should only be called from the clocksource->enable() method.
1127   *
1128   * This *SHOULD NOT* be called directly! Please use the
1129   * __clocksource_update_freq_hz() or __clocksource_update_freq_khz() helper
1130   * functions.
1131   */
1132  void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq)
1133  {
1134  	u64 sec;
1135  
1136  	/*
1137  	 * Default clocksources are *special* and self-define their mult/shift.
1138  	 * But, you're not special, so you should specify a freq value.
1139  	 */
1140  	if (freq) {
1141  		/*
1142  		 * Calc the maximum number of seconds which we can run before
1143  		 * wrapping around. For clocksources which have a mask > 32-bit
1144  		 * we need to limit the max sleep time to have a good
1145  		 * conversion precision. 10 minutes is still a reasonable
1146  		 * amount. That results in a shift value of 24 for a
1147  		 * clocksource with mask >= 40-bit and f >= 4GHz. That maps to
1148  		 * ~ 0.06ppm granularity for NTP.
1149  		 */
1150  		sec = cs->mask;
1151  		do_div(sec, freq);
1152  		do_div(sec, scale);
1153  		if (!sec)
1154  			sec = 1;
1155  		else if (sec > 600 && cs->mask > UINT_MAX)
1156  			sec = 600;
1157  
1158  		clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
1159  				       NSEC_PER_SEC / scale, sec * scale);
1160  	}
1161  
1162  	/*
1163  	 * If the uncertainty margin is not specified, calculate it.  If
1164  	 * both scale and freq are non-zero, calculate the clock period, but
1165  	 * bound below at 2*WATCHDOG_MAX_SKEW, that is, 500ppm by default.
1166  	 * However, if either of scale or freq is zero, be very conservative
1167  	 * and take the tens-of-milliseconds WATCHDOG_THRESHOLD value
1168  	 * for the uncertainty margin.  Allow stupidly small uncertainty
1169  	 * margins to be specified by the caller for testing purposes,
1170  	 * but warn to discourage production use of this capability.
1171  	 *
1172  	 * Bottom line:  The sum of the uncertainty margins of the
1173  	 * watchdog clocksource and the clocksource under test will be at
1174  	 * least 500ppm by default.  For more information, please see the
1175  	 * comment preceding CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US above.
1176  	 */
1177  	if (scale && freq && !cs->uncertainty_margin) {
1178  		cs->uncertainty_margin = NSEC_PER_SEC / (scale * freq);
1179  		if (cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW)
1180  			cs->uncertainty_margin = 2 * WATCHDOG_MAX_SKEW;
1181  	} else if (!cs->uncertainty_margin) {
1182  		cs->uncertainty_margin = WATCHDOG_THRESHOLD;
1183  	}
1184  	WARN_ON_ONCE(cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW);
1185  
1186  	/*
1187  	 * Ensure clocksources that have large 'mult' values don't overflow
1188  	 * when adjusted.
1189  	 */
1190  	cs->maxadj = clocksource_max_adjustment(cs);
1191  	while (freq && ((cs->mult + cs->maxadj < cs->mult)
1192  		|| (cs->mult - cs->maxadj > cs->mult))) {
1193  		cs->mult >>= 1;
1194  		cs->shift--;
1195  		cs->maxadj = clocksource_max_adjustment(cs);
1196  	}
1197  
1198  	/*
1199  	 * Only warn for *special* clocksources that self-define
1200  	 * their mult/shift values and don't specify a freq.
1201  	 */
1202  	WARN_ONCE(cs->mult + cs->maxadj < cs->mult,
1203  		"timekeeping: Clocksource %s might overflow on 11%% adjustment\n",
1204  		cs->name);
1205  
1206  	clocksource_update_max_deferment(cs);
1207  
1208  	pr_info("%s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n",
1209  		cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns);
1210  }
1211  EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale);
1212  
1213  /**
1214   * __clocksource_register_scale - Used to install new clocksources
1215   * @cs:		clocksource to be registered
1216   * @scale:	Scale factor multiplied against freq to get clocksource hz
1217   * @freq:	clocksource frequency (cycles per second) divided by scale
1218   *
1219   * Returns -EBUSY if registration fails, zero otherwise.
1220   *
1221   * This *SHOULD NOT* be called directly! Please use the
1222   * clocksource_register_hz() or clocksource_register_khz() helper functions.
1223   */
1224  int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
1225  {
1226  	unsigned long flags;
1227  
1228  	clocksource_arch_init(cs);
1229  
1230  	if (WARN_ON_ONCE((unsigned int)cs->id >= CSID_MAX))
1231  		cs->id = CSID_GENERIC;
1232  	if (cs->vdso_clock_mode < 0 ||
1233  	    cs->vdso_clock_mode >= VDSO_CLOCKMODE_MAX) {
1234  		pr_warn("clocksource %s registered with invalid VDSO mode %d. Disabling VDSO support.\n",
1235  			cs->name, cs->vdso_clock_mode);
1236  		cs->vdso_clock_mode = VDSO_CLOCKMODE_NONE;
1237  	}
1238  
1239  	/* Initialize mult/shift and max_idle_ns */
1240  	__clocksource_update_freq_scale(cs, scale, freq);
1241  
1242  	/* Add clocksource to the clocksource list */
1243  	mutex_lock(&clocksource_mutex);
1244  
1245  	clocksource_watchdog_lock(&flags);
1246  	clocksource_enqueue(cs);
1247  	clocksource_enqueue_watchdog(cs);
1248  	clocksource_watchdog_unlock(&flags);
1249  
1250  	clocksource_select();
1251  	clocksource_select_watchdog(false);
1252  	__clocksource_suspend_select(cs);
1253  	mutex_unlock(&clocksource_mutex);
1254  	return 0;
1255  }
1256  EXPORT_SYMBOL_GPL(__clocksource_register_scale);
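/*
 * Minimal registration sketch (hypothetical driver, for illustration only;
 * the names "foo_cs", "foo_read" and "foo_counter_base" are made up):
 *
 *	static u64 foo_read(struct clocksource *cs)
 *	{
 *		return readl_relaxed(foo_counter_base);	// free-running counter
 *	}
 *
 *	static struct clocksource foo_cs = {
 *		.name	= "foo",
 *		.rating	= 300,
 *		.read	= foo_read,
 *		.mask	= CLOCKSOURCE_MASK(32),
 *		.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
 *	};
 *
 *	// 10 MHz counter; equivalent to __clocksource_register_scale(&foo_cs, 1, 10000000)
 *	clocksource_register_hz(&foo_cs, 10000000);
 *
 * Registration computes mult/shift, the uncertainty margin and the maximum
 * deferment, inserts the clocksource sorted by rating, and may immediately
 * select it if it is the best one available.
 */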
1257  
1258  static void __clocksource_change_rating(struct clocksource *cs, int rating)
1259  {
1260  	list_del(&cs->list);
1261  	cs->rating = rating;
1262  	clocksource_enqueue(cs);
1263  }
1264  
1265  /**
1266   * clocksource_change_rating - Change the rating of a registered clocksource
1267   * @cs:		clocksource to be changed
1268   * @rating:	new rating
1269   */
1270  void clocksource_change_rating(struct clocksource *cs, int rating)
1271  {
1272  	unsigned long flags;
1273  
1274  	mutex_lock(&clocksource_mutex);
1275  	clocksource_watchdog_lock(&flags);
1276  	__clocksource_change_rating(cs, rating);
1277  	clocksource_watchdog_unlock(&flags);
1278  
1279  	clocksource_select();
1280  	clocksource_select_watchdog(false);
1281  	clocksource_suspend_select(false);
1282  	mutex_unlock(&clocksource_mutex);
1283  }
1284  EXPORT_SYMBOL(clocksource_change_rating);
1285  
1286  /*
1287   * Unbind clocksource @cs. Called with clocksource_mutex held
1288   */
1289  static int clocksource_unbind(struct clocksource *cs)
1290  {
1291  	unsigned long flags;
1292  
1293  	if (clocksource_is_watchdog(cs)) {
1294  		/* Select and try to install a replacement watchdog. */
1295  		clocksource_select_watchdog(true);
1296  		if (clocksource_is_watchdog(cs))
1297  			return -EBUSY;
1298  	}
1299  
1300  	if (cs == curr_clocksource) {
1301  		/* Select and try to install a replacement clock source */
1302  		clocksource_select_fallback();
1303  		if (curr_clocksource == cs)
1304  			return -EBUSY;
1305  	}
1306  
1307  	if (clocksource_is_suspend(cs)) {
1308  		/*
1309  		 * Select and try to install a replacement suspend clocksource.
1310  		 * If there is no replacement suspend clocksource, just let the
1311  		 * clocksource go and run without a suspend clocksource.
1312  		 */
1313  		clocksource_suspend_select(true);
1314  	}
1315  
1316  	clocksource_watchdog_lock(&flags);
1317  	clocksource_dequeue_watchdog(cs);
1318  	list_del_init(&cs->list);
1319  	clocksource_watchdog_unlock(&flags);
1320  
1321  	return 0;
1322  }
1323  
1324  /**
1325   * clocksource_unregister - remove a registered clocksource
1326   * @cs:	clocksource to be unregistered
1327   */
1328  int clocksource_unregister(struct clocksource *cs)
1329  {
1330  	int ret = 0;
1331  
1332  	mutex_lock(&clocksource_mutex);
1333  	if (!list_empty(&cs->list))
1334  		ret = clocksource_unbind(cs);
1335  	mutex_unlock(&clocksource_mutex);
1336  	return ret;
1337  }
1338  EXPORT_SYMBOL(clocksource_unregister);
1339  
1340  #ifdef CONFIG_SYSFS
1341  /**
1342   * current_clocksource_show - sysfs interface for current clocksource
1343   * @dev:	unused
1344   * @attr:	unused
1345   * @buf:	char buffer to be filled with the current clocksource name
1346   *
1347   * Provides sysfs interface for showing the current clocksource.
1348   */
1349  static ssize_t current_clocksource_show(struct device *dev,
1350  					struct device_attribute *attr,
1351  					char *buf)
1352  {
1353  	ssize_t count = 0;
1354  
1355  	mutex_lock(&clocksource_mutex);
1356  	count = sysfs_emit(buf, "%s\n", curr_clocksource->name);
1357  	mutex_unlock(&clocksource_mutex);
1358  
1359  	return count;
1360  }
1361  
1362  ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt)
1363  {
1364  	size_t ret = cnt;
1365  
1366  	/* strings from sysfs write are not 0 terminated! */
1367  	if (!cnt || cnt >= CS_NAME_LEN)
1368  		return -EINVAL;
1369  
1370  	/* strip off \n: */
1371  	if (buf[cnt-1] == '\n')
1372  		cnt--;
1373  	if (cnt > 0)
1374  		memcpy(dst, buf, cnt);
1375  	dst[cnt] = 0;
1376  	return ret;
1377  }
1378  
1379  /**
1380   * current_clocksource_store - interface for manually overriding clocksource
1381   * @dev:	unused
1382   * @attr:	unused
1383   * @buf:	name of override clocksource
1384   * @count:	length of buffer
1385   *
1386   * Takes input from sysfs interface for manually overriding the default
1387   * clocksource selection.
1388   */
1389  static ssize_t current_clocksource_store(struct device *dev,
1390  					 struct device_attribute *attr,
1391  					 const char *buf, size_t count)
1392  {
1393  	ssize_t ret;
1394  
1395  	mutex_lock(&clocksource_mutex);
1396  
1397  	ret = sysfs_get_uname(buf, override_name, count);
1398  	if (ret >= 0)
1399  		clocksource_select();
1400  
1401  	mutex_unlock(&clocksource_mutex);
1402  
1403  	return ret;
1404  }
1405  static DEVICE_ATTR_RW(current_clocksource);
1406  
1407  /**
1408   * unbind_clocksource_store - interface for manually unbinding clocksource
1409   * @dev:	unused
1410   * @attr:	unused
1411   * @buf:	unused
1412   * @count:	length of buffer
1413   *
1414   * Takes input from sysfs interface for manually unbinding a clocksource.
1415   */
1416  static ssize_t unbind_clocksource_store(struct device *dev,
1417  					struct device_attribute *attr,
1418  					const char *buf, size_t count)
1419  {
1420  	struct clocksource *cs;
1421  	char name[CS_NAME_LEN];
1422  	ssize_t ret;
1423  
1424  	ret = sysfs_get_uname(buf, name, count);
1425  	if (ret < 0)
1426  		return ret;
1427  
1428  	ret = -ENODEV;
1429  	mutex_lock(&clocksource_mutex);
1430  	list_for_each_entry(cs, &clocksource_list, list) {
1431  		if (strcmp(cs->name, name))
1432  			continue;
1433  		ret = clocksource_unbind(cs);
1434  		break;
1435  	}
1436  	mutex_unlock(&clocksource_mutex);
1437  
1438  	return ret ? ret : count;
1439  }
1440  static DEVICE_ATTR_WO(unbind_clocksource);
1441  
1442  /**
1443   * available_clocksource_show - sysfs interface for listing clocksources
1444   * @dev:	unused
1445   * @attr:	unused
1446   * @buf:	char buffer to be filled with clocksource list
1447   *
1448   * Provides sysfs interface for listing registered clocksources
1449   */
1450  static ssize_t available_clocksource_show(struct device *dev,
1451  					  struct device_attribute *attr,
1452  					  char *buf)
1453  {
1454  	struct clocksource *src;
1455  	ssize_t count = 0;
1456  
1457  	mutex_lock(&clocksource_mutex);
1458  	list_for_each_entry(src, &clocksource_list, list) {
1459  		/*
1460  		 * Don't show non-HRES clocksource if the tick code is
1461  		 * in one shot mode (highres=on or nohz=on)
1462  		 */
1463  		if (!tick_oneshot_mode_active() ||
1464  		    (src->flags & CLOCK_SOURCE_VALID_FOR_HRES))
1465  			count += snprintf(buf + count,
1466  				  max((ssize_t)PAGE_SIZE - count, (ssize_t)0),
1467  				  "%s ", src->name);
1468  	}
1469  	mutex_unlock(&clocksource_mutex);
1470  
1471  	count += snprintf(buf + count,
1472  			  max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n");
1473  
1474  	return count;
1475  }
1476  static DEVICE_ATTR_RO(available_clocksource);
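/*
 * The attributes above surface under /sys/devices/system/clocksource/
 * clocksource0/.  Typical usage from userspace (shell, for illustration):
 *
 *	# cat /sys/devices/system/clocksource/clocksource0/available_clocksource
 *	tsc hpet acpi_pm
 *	# cat /sys/devices/system/clocksource/clocksource0/current_clocksource
 *	tsc
 *	# echo hpet > /sys/devices/system/clocksource/clocksource0/current_clocksource
 *
 * The clocksource names shown are only what a typical x86 system might
 * report; the actual list depends on the hardware.
 */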
1477  
1478  static struct attribute *clocksource_attrs[] = {
1479  	&dev_attr_current_clocksource.attr,
1480  	&dev_attr_unbind_clocksource.attr,
1481  	&dev_attr_available_clocksource.attr,
1482  	NULL
1483  };
1484  ATTRIBUTE_GROUPS(clocksource);
1485  
1486  static const struct bus_type clocksource_subsys = {
1487  	.name = "clocksource",
1488  	.dev_name = "clocksource",
1489  };
1490  
1491  static struct device device_clocksource = {
1492  	.id	= 0,
1493  	.bus	= &clocksource_subsys,
1494  	.groups	= clocksource_groups,
1495  };
1496  
1497  static int __init init_clocksource_sysfs(void)
1498  {
1499  	int error = subsys_system_register(&clocksource_subsys, NULL);
1500  
1501  	if (!error)
1502  		error = device_register(&device_clocksource);
1503  
1504  	return error;
1505  }
1506  
1507  device_initcall(init_clocksource_sysfs);
1508  #endif /* CONFIG_SYSFS */
1509  
1510  /**
1511   * boot_override_clocksource - boot clock override
1512   * @str:	override name
1513   *
1514   * Takes a clocksource= boot argument and uses it
1515   * as the clocksource override name.
1516   */
boot_override_clocksource(char * str)1517  static int __init boot_override_clocksource(char* str)
1518  {
1519  	mutex_lock(&clocksource_mutex);
1520  	if (str)
1521  		strscpy(override_name, str, sizeof(override_name));
1522  	mutex_unlock(&clocksource_mutex);
1523  	return 1;
1524  }
1525  
1526  __setup("clocksource=", boot_override_clocksource);
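/*
 * Example (kernel command line): booting with "clocksource=hpet" makes
 * "hpet" the override name, so clocksource_select() will prefer it over a
 * higher-rated clocksource once it registers, provided it is usable in the
 * current tick mode.  The name must match a registered cs->name; "hpet" is
 * merely a common x86 example.
 */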
1527  
1528  /**
1529   * boot_override_clock - Compatibility layer for deprecated boot option
1530   * @str:	override name
1531   *
1532   * DEPRECATED! Takes a clock= boot argument and uses it
1533   * as the clocksource override name
1534   */
boot_override_clock(char * str)1535  static int __init boot_override_clock(char* str)
1536  {
1537  	if (!strcmp(str, "pmtmr")) {
1538  		pr_warn("clock=pmtmr is deprecated - use clocksource=acpi_pm\n");
1539  		return boot_override_clocksource("acpi_pm");
1540  	}
1541  	pr_warn("clock= boot option is deprecated - use clocksource=xyz\n");
1542  	return boot_override_clocksource(str);
1543  }
1544  
1545  __setup("clock=", boot_override_clock);
1546