// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/cpu.h>
#include <linux/prctl.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/idle.h>
#include <linux/sched/debug.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/pm.h>
#include <linux/tick.h>
#include <linux/random.h>
#include <linux/user-return-notifier.h>
#include <linux/dmi.h>
#include <linux/utsname.h>
#include <linux/stackprotector.h>
#include <linux/cpuidle.h>
#include <linux/acpi.h>
#include <linux/elf-randomize.h>
#include <linux/static_call.h>
#include <trace/events/power.h>
#include <linux/hw_breakpoint.h>
#include <linux/entry-common.h>
#include <asm/cpu.h>
#include <asm/apic.h>
#include <linux/uaccess.h>
#include <asm/mwait.h>
#include <asm/fpu/api.h>
#include <asm/fpu/sched.h>
#include <asm/fpu/xstate.h>
#include <asm/debugreg.h>
#include <asm/nmi.h>
#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/vm86.h>
#include <asm/switch_to.h>
#include <asm/desc.h>
#include <asm/prctl.h>
#include <asm/spec-ctrl.h>
#include <asm/io_bitmap.h>
#include <asm/proto.h>
#include <asm/frame.h>
#include <asm/unwind.h>
#include <asm/tdx.h>
#include <asm/mmu_context.h>
#include <asm/shstk.h>

#include "process.h"

/*
 * per-CPU TSS segments. Threads are completely 'soft' on Linux,
 * no more per-task TSS's. The TSS size is kept cacheline-aligned
 * so they are allowed to end up in the .data..cacheline_aligned
 * section. Since TSS's are completely CPU-local, we want them
 * on exact cacheline boundaries, to eliminate cacheline ping-pong.
 */
__visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = {
	.x86_tss = {
		/*
		 * .sp0 is only used when entering ring 0 from a lower
		 * privilege level.  Since the init task never runs anything
		 * but ring 0 code, there is no need for a valid value here.
		 * Poison it.
		 */
		.sp0 = (1UL << (BITS_PER_LONG-1)) + 1,

#ifdef CONFIG_X86_32
		.sp1 = TOP_OF_INIT_STACK,

		.ss0 = __KERNEL_DS,
		.ss1 = __KERNEL_CS,
#endif
		.io_bitmap_base	= IO_BITMAP_OFFSET_INVALID,
	 },
};
EXPORT_PER_CPU_SYMBOL(cpu_tss_rw);

DEFINE_PER_CPU(bool, __tss_limit_invalid);
EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid);

/*
 * this gets called so that we can store lazy state into memory and copy the
 * current task into the new thread.
 */
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
	memcpy(dst, src, arch_task_struct_size);
#ifdef CONFIG_VM86
	dst->thread.vm86 = NULL;
#endif
	/* Drop the copied pointer to current's fpstate */
	dst->thread.fpu.fpstate = NULL;

	return 0;
}

#ifdef CONFIG_X86_64
void arch_release_task_struct(struct task_struct *tsk)
{
	if (fpu_state_size_dynamic())
		fpstate_free(&tsk->thread.fpu);
}
#endif

/*
 * Free thread data structures etc..
 */
void exit_thread(struct task_struct *tsk)
{
	struct thread_struct *t = &tsk->thread;
	struct fpu *fpu = &t->fpu;

	if (test_thread_flag(TIF_IO_BITMAP))
		io_bitmap_exit(tsk);

	free_vm86(t);

	shstk_free(tsk);
	fpu__drop(fpu);
}

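/*
 * Install the child's TLS on clone(CLONE_SETTLS): 32-bit and ia32 compat
 * callers pass a struct user_desc pointer, while native 64-bit callers
 * pass the new FS base value directly.
 */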
static int set_new_tls(struct task_struct *p, unsigned long tls)
{
	struct user_desc __user *utls = (struct user_desc __user *)tls;

	if (in_ia32_syscall())
		return do_set_thread_area(p, -1, utls, 0);
	else
		return do_set_thread_area_64(p, ARCH_SET_FS, tls);
}

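/*
 * Common C tail of fork/clone, reached from ret_from_fork_asm: finish the
 * scheduler hand-over from @prev, run the kernel-thread/user-worker
 * function if one was supplied, and return to user space via the syscall
 * exit path.
 */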
__visible void ret_from_fork(struct task_struct *prev, struct pt_regs *regs,
				     int (*fn)(void *), void *fn_arg)
{
	schedule_tail(prev);

	/* Is this a kernel thread? */
	if (unlikely(fn)) {
		fn(fn_arg);
		/*
		 * A kernel thread is allowed to return here after successfully
		 * calling kernel_execve().  Exit to userspace to complete the
		 * execve() syscall.
		 */
		regs->ax = 0;
	}

	syscall_exit_to_user_mode(regs);
}

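/*
 * Set up the kernel stack frame and architecture specific thread state of
 * a newly cloned task so that it can be scheduled and later return to
 * user mode (or, for kernel threads and user workers, call args->fn).
 */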
int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
{
	unsigned long clone_flags = args->flags;
	unsigned long sp = args->stack;
	unsigned long tls = args->tls;
	struct inactive_task_frame *frame;
	struct fork_frame *fork_frame;
	struct pt_regs *childregs;
	unsigned long new_ssp;
	int ret = 0;

	childregs = task_pt_regs(p);
	fork_frame = container_of(childregs, struct fork_frame, regs);
	frame = &fork_frame->frame;

	frame->bp = encode_frame_pointer(childregs);
	frame->ret_addr = (unsigned long) ret_from_fork_asm;
	p->thread.sp = (unsigned long) fork_frame;
	p->thread.io_bitmap = NULL;
	p->thread.iopl_warn = 0;
	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

#ifdef CONFIG_X86_64
	current_save_fsgs();
	p->thread.fsindex = current->thread.fsindex;
	p->thread.fsbase = current->thread.fsbase;
	p->thread.gsindex = current->thread.gsindex;
	p->thread.gsbase = current->thread.gsbase;

	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);

	if (p->mm && (clone_flags & (CLONE_VM | CLONE_VFORK)) == CLONE_VM)
		set_bit(MM_CONTEXT_LOCK_LAM, &p->mm->context.flags);
#else
	p->thread.sp0 = (unsigned long) (childregs + 1);
	savesegment(gs, p->thread.gs);
	/*
	 * Clear all status flags including IF and set fixed bit. 64bit
	 * does not have this initialization as the frame does not contain
	 * flags. The flags consistency (especially vs. AC) is there
	 * ensured via objtool, which lacks 32bit support.
	 */
	frame->flags = X86_EFLAGS_FIXED;
#endif

	/*
	 * Allocate a new shadow stack for thread if needed. If shadow stack
	 * is disabled, new_ssp will remain 0, and fpu_clone() will know not to
	 * update it.
	 */
	new_ssp = shstk_alloc_thread_stack(p, clone_flags, args->stack_size);
	if (IS_ERR_VALUE(new_ssp))
		return PTR_ERR((void *)new_ssp);

	fpu_clone(p, clone_flags, args->fn, new_ssp);

	/* Kernel thread ? */
	if (unlikely(p->flags & PF_KTHREAD)) {
		p->thread.pkru = pkru_get_init_value();
		memset(childregs, 0, sizeof(struct pt_regs));
		kthread_frame_init(frame, args->fn, args->fn_arg);
		return 0;
	}

	/*
	 * Clone current's PKRU value from hardware. tsk->thread.pkru
	 * is only valid when scheduled out.
	 */
	p->thread.pkru = read_pkru();

	frame->bx = 0;
	*childregs = *current_pt_regs();
	childregs->ax = 0;
	if (sp)
		childregs->sp = sp;

	if (unlikely(args->fn)) {
		/*
		 * A user space thread, but it doesn't return to
		 * ret_from_fork().
		 *
		 * In order to indicate that to tools like gdb,
		 * we reset the stack and instruction pointers.
		 *
		 * It does the same kernel frame setup to return to a kernel
		 * function that a kernel thread does.
		 */
		childregs->sp = 0;
		childregs->ip = 0;
		kthread_frame_init(frame, args->fn, args->fn_arg);
		return 0;
	}

	/* Set a new TLS for the child thread? */
	if (clone_flags & CLONE_SETTLS)
		ret = set_new_tls(p, tls);

	if (!ret && unlikely(test_tsk_thread_flag(current, TIF_IO_BITMAP)))
		io_bitmap_share(p);

	return ret;
}

static void pkru_flush_thread(void)
{
	/*
	 * If PKRU is enabled the default PKRU value has to be loaded into
	 * the hardware right here (similar to context switch).
	 */
	pkru_write_default();
}

void flush_thread(void)
{
	struct task_struct *tsk = current;

	flush_ptrace_hw_breakpoint(tsk);
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));

	fpu_flush_thread();
	pkru_flush_thread();
}

void disable_TSC(void)
{
	preempt_disable();
	if (!test_and_set_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		cr4_set_bits(X86_CR4_TSD);
	preempt_enable();
}

static void enable_TSC(void)
{
	preempt_disable();
	if (test_and_clear_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		cr4_clear_bits(X86_CR4_TSD);
	preempt_enable();
}

int get_tsc_mode(unsigned long adr)
{
	unsigned int val;

	if (test_thread_flag(TIF_NOTSC))
		val = PR_TSC_SIGSEGV;
	else
		val = PR_TSC_ENABLE;

	return put_user(val, (unsigned int __user *)adr);
}

int set_tsc_mode(unsigned int val)
{
	if (val == PR_TSC_SIGSEGV)
		disable_TSC();
	else if (val == PR_TSC_ENABLE)
		enable_TSC();
	else
		return -EINVAL;

	return 0;
}

DEFINE_PER_CPU(u64, msr_misc_features_shadow);

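/*
 * Set or clear CPUID faulting in MSR_MISC_FEATURES_ENABLES and keep the
 * per-CPU shadow value in sync so context switches can update the MSR
 * without having to read it back first.
 */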
static void set_cpuid_faulting(bool on)
{
	u64 msrval;

	msrval = this_cpu_read(msr_misc_features_shadow);
	msrval &= ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT;
	msrval |= (on << MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT);
	this_cpu_write(msr_misc_features_shadow, msrval);
	wrmsrl(MSR_MISC_FEATURES_ENABLES, msrval);
}

static void disable_cpuid(void)
{
	preempt_disable();
	if (!test_and_set_thread_flag(TIF_NOCPUID)) {
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOCPUID in the current running context.
		 */
		set_cpuid_faulting(true);
	}
	preempt_enable();
}

static void enable_cpuid(void)
{
	preempt_disable();
	if (test_and_clear_thread_flag(TIF_NOCPUID)) {
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOCPUID in the current running context.
		 */
		set_cpuid_faulting(false);
	}
	preempt_enable();
}

static int get_cpuid_mode(void)
{
	return !test_thread_flag(TIF_NOCPUID);
}

static int set_cpuid_mode(unsigned long cpuid_enabled)
{
	if (!boot_cpu_has(X86_FEATURE_CPUID_FAULT))
		return -ENODEV;

	if (cpuid_enabled)
		enable_cpuid();
	else
		disable_cpuid();

	return 0;
}

/*
 * Called immediately after a successful exec.
 */
void arch_setup_new_exec(void)
{
	/* If cpuid was previously disabled for this task, re-enable it. */
	if (test_thread_flag(TIF_NOCPUID))
		enable_cpuid();

	/*
	 * Don't inherit TIF_SSBD across exec boundary when
	 * PR_SPEC_DISABLE_NOEXEC is used.
	 */
	if (test_thread_flag(TIF_SSBD) &&
	    task_spec_ssb_noexec(current)) {
		clear_thread_flag(TIF_SSBD);
		task_clear_spec_ssb_disable(current);
		task_clear_spec_ssb_noexec(current);
		speculation_ctrl_update(read_thread_flags());
	}

	mm_reset_untag_mask(current->mm);
}

#ifdef CONFIG_X86_IOPL_IOPERM
static inline void switch_to_bitmap(unsigned long tifp)
{
	/*
	 * Invalidate I/O bitmap if the previous task used it. This prevents
	 * any possible leakage of an active I/O bitmap.
	 *
	 * If the next task has an I/O bitmap it will handle it on exit to
	 * user mode.
	 */
	if (tifp & _TIF_IO_BITMAP)
		tss_invalidate_io_bitmap();
}

static void tss_copy_io_bitmap(struct tss_struct *tss, struct io_bitmap *iobm)
{
	/*
	 * Copy at least the byte range of the incoming task's bitmap which
	 * covers the permitted I/O ports.
	 *
	 * If the previous task which used an I/O bitmap had more bits
	 * permitted, then the copy needs to cover those as well so they
	 * get turned off.
	 */
	memcpy(tss->io_bitmap.bitmap, iobm->bitmap,
	       max(tss->io_bitmap.prev_max, iobm->max));

	/*
	 * Store the new max and the sequence number of this bitmap
	 * and a pointer to the bitmap itself.
	 */
	tss->io_bitmap.prev_max = iobm->max;
	tss->io_bitmap.prev_sequence = iobm->sequence;
}

/**
 * native_tss_update_io_bitmap - Update I/O bitmap before exiting to user mode
 */
void native_tss_update_io_bitmap(void)
{
	struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
	struct thread_struct *t = &current->thread;
	u16 *base = &tss->x86_tss.io_bitmap_base;

	if (!test_thread_flag(TIF_IO_BITMAP)) {
		native_tss_invalidate_io_bitmap();
		return;
	}

	if (IS_ENABLED(CONFIG_X86_IOPL_IOPERM) && t->iopl_emul == 3) {
		*base = IO_BITMAP_OFFSET_VALID_ALL;
	} else {
		struct io_bitmap *iobm = t->io_bitmap;

		/*
		 * Only copy bitmap data when the sequence number differs. The
		 * update time is accounted to the incoming task.
		 */
		if (tss->io_bitmap.prev_sequence != iobm->sequence)
			tss_copy_io_bitmap(tss, iobm);

		/* Enable the bitmap */
		*base = IO_BITMAP_OFFSET_VALID_MAP;
	}

	/*
	 * Make sure that the TSS limit is covering the IO bitmap. It might have
	 * been cut down by a VMEXIT to 0x67 which would cause a subsequent I/O
	 * access from user space to trigger a #GP because the bitmap is outside
	 * the TSS limit.
	 */
	refresh_tss_limit();
}
#else /* CONFIG_X86_IOPL_IOPERM */
static inline void switch_to_bitmap(unsigned long tifp) { }
#endif

#ifdef CONFIG_SMP

struct ssb_state {
	struct ssb_state	*shared_state;
	raw_spinlock_t		lock;
	unsigned int		disable_state;
	unsigned long		local_state;
};

#define LSTATE_SSB	0

static DEFINE_PER_CPU(struct ssb_state, ssb_state);

void speculative_store_bypass_ht_init(void)
{
	struct ssb_state *st = this_cpu_ptr(&ssb_state);
	unsigned int this_cpu = smp_processor_id();
	unsigned int cpu;

	st->local_state = 0;

	/*
	 * Shared state setup happens once on the first bringup
	 * of the CPU. It's not destroyed on CPU hotunplug.
	 */
	if (st->shared_state)
		return;

	raw_spin_lock_init(&st->lock);

	/*
	 * Go over HT siblings and check whether one of them has set up the
	 * shared state pointer already.
	 */
	for_each_cpu(cpu, topology_sibling_cpumask(this_cpu)) {
		if (cpu == this_cpu)
			continue;

		if (!per_cpu(ssb_state, cpu).shared_state)
			continue;

		/* Link it to the state of the sibling: */
		st->shared_state = per_cpu(ssb_state, cpu).shared_state;
		return;
	}

	/*
	 * First HT sibling to come up on the core.  Link shared state of
	 * the first HT sibling to itself. The siblings on the same core
	 * which come up later will see the shared state pointer and link
	 * themselves to the state of this CPU.
	 */
	st->shared_state = st;
}

/*
 * Logic is: The first HT sibling enables SSBD for both siblings in the core
 * and the last sibling to disable it disables it for the whole core. This is
 * how MSR_SPEC_CTRL works in "hardware":
 *
 *  CORE_SPEC_CTRL = THREAD0_SPEC_CTRL | THREAD1_SPEC_CTRL
 */
static __always_inline void amd_set_core_ssb_state(unsigned long tifn)
{
	struct ssb_state *st = this_cpu_ptr(&ssb_state);
	u64 msr = x86_amd_ls_cfg_base;

	if (!static_cpu_has(X86_FEATURE_ZEN)) {
		msr |= ssbd_tif_to_amd_ls_cfg(tifn);
		wrmsrl(MSR_AMD64_LS_CFG, msr);
		return;
	}

	if (tifn & _TIF_SSBD) {
		/*
		 * Since this can race with prctl(), block reentry on the
		 * same CPU.
		 */
		if (__test_and_set_bit(LSTATE_SSB, &st->local_state))
			return;

		msr |= x86_amd_ls_cfg_ssbd_mask;

		raw_spin_lock(&st->shared_state->lock);
		/* First sibling enables SSBD: */
		if (!st->shared_state->disable_state)
			wrmsrl(MSR_AMD64_LS_CFG, msr);
		st->shared_state->disable_state++;
		raw_spin_unlock(&st->shared_state->lock);
	} else {
		if (!__test_and_clear_bit(LSTATE_SSB, &st->local_state))
			return;

		raw_spin_lock(&st->shared_state->lock);
		st->shared_state->disable_state--;
		if (!st->shared_state->disable_state)
			wrmsrl(MSR_AMD64_LS_CFG, msr);
		raw_spin_unlock(&st->shared_state->lock);
	}
}
#else
static __always_inline void amd_set_core_ssb_state(unsigned long tifn)
{
	u64 msr = x86_amd_ls_cfg_base | ssbd_tif_to_amd_ls_cfg(tifn);

	wrmsrl(MSR_AMD64_LS_CFG, msr);
}
#endif

static __always_inline void amd_set_ssb_virt_state(unsigned long tifn)
{
	/*
	 * SSBD has the same definition in SPEC_CTRL and VIRT_SPEC_CTRL,
	 * so ssbd_tif_to_spec_ctrl() just works.
	 */
	wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, ssbd_tif_to_spec_ctrl(tifn));
}

/*
 * Update the MSRs managing speculation control, during context switch.
 *
 * tifp: Previous task's thread flags
 * tifn: Next task's thread flags
 */
static __always_inline void __speculation_ctrl_update(unsigned long tifp,
						      unsigned long tifn)
{
	unsigned long tif_diff = tifp ^ tifn;
	u64 msr = x86_spec_ctrl_base;
	bool updmsr = false;

	lockdep_assert_irqs_disabled();

	/* Handle change of TIF_SSBD depending on the mitigation method. */
	if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) {
		if (tif_diff & _TIF_SSBD)
			amd_set_ssb_virt_state(tifn);
	} else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) {
		if (tif_diff & _TIF_SSBD)
			amd_set_core_ssb_state(tifn);
	} else if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
		   static_cpu_has(X86_FEATURE_AMD_SSBD)) {
		updmsr |= !!(tif_diff & _TIF_SSBD);
		msr |= ssbd_tif_to_spec_ctrl(tifn);
	}

	/* Only evaluate TIF_SPEC_IB if conditional STIBP is enabled. */
	if (IS_ENABLED(CONFIG_SMP) &&
	    static_branch_unlikely(&switch_to_cond_stibp)) {
		updmsr |= !!(tif_diff & _TIF_SPEC_IB);
		msr |= stibp_tif_to_spec_ctrl(tifn);
	}

	if (updmsr)
		update_spec_ctrl_cond(msr);
}

static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk)
{
	if (test_and_clear_tsk_thread_flag(tsk, TIF_SPEC_FORCE_UPDATE)) {
		if (task_spec_ssb_disable(tsk))
			set_tsk_thread_flag(tsk, TIF_SSBD);
		else
			clear_tsk_thread_flag(tsk, TIF_SSBD);

		if (task_spec_ib_disable(tsk))
			set_tsk_thread_flag(tsk, TIF_SPEC_IB);
		else
			clear_tsk_thread_flag(tsk, TIF_SPEC_IB);
	}
	/* Return the updated threadinfo flags */
	return read_task_thread_flags(tsk);
}

void speculation_ctrl_update(unsigned long tif)
{
	unsigned long flags;

	/* Forced update. Make sure all relevant TIF flags are different */
	local_irq_save(flags);
	__speculation_ctrl_update(~tif, tif);
	local_irq_restore(flags);
}

/* Called from seccomp/prctl update */
void speculation_ctrl_update_current(void)
{
	preempt_disable();
	speculation_ctrl_update(speculation_ctrl_update_tif(current));
	preempt_enable();
}

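/*
 * Toggle the given CR4 bits on this CPU. Interrupts must be disabled by
 * the caller; the cached value in cpu_tlbstate is kept in sync with the
 * hardware register.
 */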
static inline void cr4_toggle_bits_irqsoff(unsigned long mask)
{
	unsigned long newval, cr4 = this_cpu_read(cpu_tlbstate.cr4);

	newval = cr4 ^ mask;
	if (newval != cr4) {
		this_cpu_write(cpu_tlbstate.cr4, newval);
		__write_cr4(newval);
	}
}

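/*
 * Slow path of the context switch, called when the previous and/or next
 * task has one of the rarely used TIF work bits set: I/O bitmap handling,
 * user-return notifiers, BTF block-step, TSC/CPUID faulting toggles and
 * speculation control MSR updates.
 */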
void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
{
	unsigned long tifp, tifn;

	tifn = read_task_thread_flags(next_p);
	tifp = read_task_thread_flags(prev_p);

	switch_to_bitmap(tifp);

	propagate_user_return_notify(prev_p, next_p);

	if ((tifp & _TIF_BLOCKSTEP || tifn & _TIF_BLOCKSTEP) &&
	    arch_has_block_step()) {
		unsigned long debugctl, msk;

		rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
		debugctl &= ~DEBUGCTLMSR_BTF;
		msk = tifn & _TIF_BLOCKSTEP;
		debugctl |= (msk >> TIF_BLOCKSTEP) << DEBUGCTLMSR_BTF_SHIFT;
		wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
	}

	if ((tifp ^ tifn) & _TIF_NOTSC)
		cr4_toggle_bits_irqsoff(X86_CR4_TSD);

	if ((tifp ^ tifn) & _TIF_NOCPUID)
		set_cpuid_faulting(!!(tifn & _TIF_NOCPUID));

	if (likely(!((tifp | tifn) & _TIF_SPEC_FORCE_UPDATE))) {
		__speculation_ctrl_update(tifp, tifn);
	} else {
		speculation_ctrl_update_tif(prev_p);
		tifn = speculation_ctrl_update_tif(next_p);

		/* Enforce MSR update to ensure consistent state */
		__speculation_ctrl_update(~tifn, tifn);
	}
}

/*
 * Idle related variables and functions
 */
unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
EXPORT_SYMBOL(boot_option_idle_override);

/*
 * We use this if we don't have any better idle routine..
 */
void __cpuidle default_idle(void)
{
	raw_safe_halt();
	raw_local_irq_disable();
}
#if defined(CONFIG_APM_MODULE) || defined(CONFIG_HALTPOLL_CPUIDLE_MODULE)
EXPORT_SYMBOL(default_idle);
#endif

DEFINE_STATIC_CALL_NULL(x86_idle, default_idle);

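/* True once an idle routine has been installed in the x86_idle static call. */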
static bool x86_idle_set(void)
{
	return !!static_call_query(x86_idle);
}

#ifndef CONFIG_SMP
static inline void __noreturn play_dead(void)
{
	BUG();
}
#endif

void arch_cpu_idle_enter(void)
{
	tsc_verify_tsc_adjust(false);
	local_touch_nmi();
}

void __noreturn arch_cpu_idle_dead(void)
{
	play_dead();
}

/*
 * Called from the generic idle code.
 */
void __cpuidle arch_cpu_idle(void)
{
	static_call(x86_idle)();
}
EXPORT_SYMBOL_GPL(arch_cpu_idle);

#ifdef CONFIG_XEN
bool xen_set_default_idle(void)
{
	bool ret = x86_idle_set();

	static_call_update(x86_idle, default_idle);

	return ret;
}
#endif

struct cpumask cpus_stop_mask;

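/*
 * Park the calling CPU: invoked via IPI or NMI when all other CPUs are
 * being stopped for reboot, kexec or panic. Never returns.
 */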
void __noreturn stop_this_cpu(void *dummy)
{
	struct cpuinfo_x86 *c = this_cpu_ptr(&cpu_info);
	unsigned int cpu = smp_processor_id();

	local_irq_disable();

	/*
	 * Remove this CPU from the online mask and disable it
	 * unconditionally. This might be redundant in case that the reboot
	 * vector was handled late and stop_other_cpus() sent an NMI.
	 *
	 * According to SDM and APM NMIs can be accepted even after soft
	 * disabling the local APIC.
	 */
	set_cpu_online(cpu, false);
	disable_local_APIC();
	mcheck_cpu_clear(c);

	/*
	 * Use wbinvd on processors that support SME. This provides support
	 * for performing a successful kexec when going from SME inactive
	 * to SME active (or vice-versa). The cache must be cleared so that
	 * if there are entries with the same physical address, both with and
	 * without the encryption bit, they don't race each other when flushed
	 * and potentially end up with the wrong entry being committed to
	 * memory.
	 *
	 * Test the CPUID bit directly because the machine might've cleared
	 * X86_FEATURE_SME due to cmdline options.
	 */
	if (c->extended_cpuid_level >= 0x8000001f && (cpuid_eax(0x8000001f) & BIT(0)))
		native_wbinvd();

	/*
	 * This brings a cache line back and dirties it, but
	 * native_stop_other_cpus() will overwrite cpus_stop_mask after it
	 * observed that all CPUs reported stop. This write will invalidate
	 * the related cache line on this CPU.
	 */
	cpumask_clear_cpu(cpu, &cpus_stop_mask);

#ifdef CONFIG_SMP
	if (smp_ops.stop_this_cpu) {
		smp_ops.stop_this_cpu();
		unreachable();
	}
#endif

	for (;;) {
		/*
		 * Use native_halt() so that memory contents don't change
		 * (stack usage and variables) after possibly issuing the
		 * native_wbinvd() above.
		 */
		native_halt();
	}
}

/*
 * Prefer MWAIT over HALT if MWAIT is supported, the MWAIT CPUID leaf
 * exists and, whenever MONITOR/MWAIT extensions are present, there is at
 * least one C1 substate.
 *
 * Do not prefer MWAIT if the MONITOR instruction has a bug or if
 * idle=nomwait is passed on the kernel command line.
 */
static __init bool prefer_mwait_c1_over_halt(void)
{
	const struct cpuinfo_x86 *c = &boot_cpu_data;
	u32 eax, ebx, ecx, edx;

	/* If override is enforced on the command line, fall back to HALT. */
	if (boot_option_idle_override != IDLE_NO_OVERRIDE)
		return false;

	/* MWAIT is not supported on this platform. Fall back to HALT. */
	if (!cpu_has(c, X86_FEATURE_MWAIT))
		return false;

	/* MONITOR has a bug or the APIC stops in C1E. Fall back to HALT. */
	if (boot_cpu_has_bug(X86_BUG_MONITOR) || boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E))
		return false;

	cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);

	/*
	 * If MWAIT extensions are not available, it is safe to use MWAIT
	 * with EAX=0, ECX=0.
	 */
	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED))
		return true;

	/*
	 * If MWAIT extensions are available, there should be at least one
	 * MWAIT C1 substate present.
	 */
	return !!(edx & MWAIT_C1_SUBSTATE_MASK);
}

/*
 * MONITOR/MWAIT with no hints, used for default C1 state. This invokes MWAIT
 * with interrupts enabled and no flags, which is backwards compatible with the
 * original MWAIT implementation.
 */
static __cpuidle void mwait_idle(void)
{
	if (!current_set_polling_and_test()) {
		if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) {
			mb(); /* quirk */
			clflush((void *)&current_thread_info()->flags);
			mb(); /* quirk */
		}

		__monitor((void *)&current_thread_info()->flags, 0, 0);
		if (!need_resched()) {
			__sti_mwait(0, 0);
			raw_local_irq_disable();
		}
	}
	__current_clr_polling();
}

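/*
 * Select the idle routine for all CPUs: honour "idle=" overrides first,
 * then prefer MWAIT C1 where it is usable, use the TDX-safe HLT variant
 * in TDX guests, and fall back to plain HLT otherwise.
 */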
void __init select_idle_routine(void)
{
	if (boot_option_idle_override == IDLE_POLL) {
		if (IS_ENABLED(CONFIG_SMP) && __max_threads_per_core > 1)
			pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n");
		return;
	}

	/* Required to guard against xen_set_default_idle() */
	if (x86_idle_set())
		return;

	if (prefer_mwait_c1_over_halt()) {
		pr_info("using mwait in idle threads\n");
		static_call_update(x86_idle, mwait_idle);
	} else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) {
		pr_info("using TDX aware idle routine\n");
		static_call_update(x86_idle, tdx_safe_halt);
	} else {
		static_call_update(x86_idle, default_idle);
	}
}

void amd_e400_c1e_apic_setup(void)
{
	if (boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) {
		pr_info("Switch to broadcast mode on CPU%d\n", smp_processor_id());
		local_irq_disable();
		tick_broadcast_force();
		local_irq_enable();
	}
}

void __init arch_post_acpi_subsys_init(void)
{
	u32 lo, hi;

	if (!boot_cpu_has_bug(X86_BUG_AMD_E400))
		return;

	/*
	 * AMD E400 detection needs to happen after ACPI has been enabled. If
	 * the machine is affected, the K8_INTP_C1E_ACTIVE_MASK bits are set in
	 * MSR_K8_INT_PENDING_MSG.
	 */
	rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
	if (!(lo & K8_INTP_C1E_ACTIVE_MASK))
		return;

	boot_cpu_set_bug(X86_BUG_AMD_APIC_C1E);

	if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
		mark_tsc_unstable("TSC halt in AMD C1E");

	if (IS_ENABLED(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE))
		static_branch_enable(&arch_needs_tick_broadcast);
	pr_info("System has AMD C1E erratum E400. Workaround enabled.\n");
}

static int __init idle_setup(char *str)
{
	if (!str)
		return -EINVAL;

	if (!strcmp(str, "poll")) {
		pr_info("using polling idle threads\n");
		boot_option_idle_override = IDLE_POLL;
		cpu_idle_poll_ctrl(true);
	} else if (!strcmp(str, "halt")) {
		/* 'idle=halt' HALT for idle. C-states are disabled. */
		boot_option_idle_override = IDLE_HALT;
	} else if (!strcmp(str, "nomwait")) {
		/* 'idle=nomwait' disables MWAIT for idle */
		boot_option_idle_override = IDLE_NOMWAIT;
	} else {
		return -EINVAL;
	}

	return 0;
}
early_param("idle", idle_setup);

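/*
 * Randomize the initial user stack pointer within an 8kB window and keep
 * it 16-byte aligned, unless randomization is disabled for this task.
 */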
unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_u32_below(8192);
	return sp & ~0xf;
}

unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	if (mmap_is_ia32())
		return randomize_page(mm->brk, SZ_32M);

	return randomize_page(mm->brk, SZ_1G);
}

/*
 * Called from fs/proc with a reference on @p to find the function
 * which called into schedule(). This needs to be done carefully
 * because the task might wake up and we might look at a stack
 * changing under us.
 */
unsigned long __get_wchan(struct task_struct *p)
{
	struct unwind_state state;
	unsigned long addr = 0;

	if (!try_get_task_stack(p))
		return 0;

	for (unwind_start(&state, p, NULL, NULL); !unwind_done(&state);
	     unwind_next_frame(&state)) {
		addr = unwind_get_return_address(&state);
		if (!addr)
			break;
		if (in_sched_functions(addr))
			continue;
		break;
	}

	put_task_stack(p);

	return addr;
}

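/*
 * arch_prctl() options common to 32-bit and 64-bit: CPUID faulting control
 * and the dynamic XSTATE permission interface.
 */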
long do_arch_prctl_common(int option, unsigned long arg2)
{
	switch (option) {
	case ARCH_GET_CPUID:
		return get_cpuid_mode();
	case ARCH_SET_CPUID:
		return set_cpuid_mode(arg2);
	case ARCH_GET_XCOMP_SUPP:
	case ARCH_GET_XCOMP_PERM:
	case ARCH_REQ_XCOMP_PERM:
	case ARCH_GET_XCOMP_GUEST_PERM:
	case ARCH_REQ_XCOMP_GUEST_PERM:
		return fpu_xstate_prctl(option, arg2);
	}

	return -EINVAL;
}