// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/export.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>
#include <linux/syscalls.h>
#include <linux/iommu.h>

#include <asm/processor.h>
#include <asm/pkru.h>
#include <asm/fpu/sched.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/debugreg.h>
#include <asm/switch_to.h>
#include <asm/xen/hypervisor.h>
#include <asm/vdso.h>
#include <asm/resctrl.h>
#include <asm/unistd.h>
#include <asm/fsgsbase.h>
#include <asm/fred.h>
#ifdef CONFIG_IA32_EMULATION
/* Not included via unistd.h */
#include <asm/unistd_32_ia32.h>
#endif

#include "process.h"

/* Also prints some state that isn't saved in pt_regs */
void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
		 const char *log_lvl)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, es;

	show_iret_regs(regs, log_lvl);

	if (regs->orig_ax != -1)
		pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
	else
		pr_cont("\n");

	printk("%sRAX: %016lx RBX: %016lx RCX: %016lx\n",
	       log_lvl, regs->ax, regs->bx, regs->cx);
	printk("%sRDX: %016lx RSI: %016lx RDI: %016lx\n",
	       log_lvl, regs->dx, regs->si, regs->di);
	printk("%sRBP: %016lx R08: %016lx R09: %016lx\n",
	       log_lvl, regs->bp, regs->r8, regs->r9);
	printk("%sR10: %016lx R11: %016lx R12: %016lx\n",
	       log_lvl, regs->r10, regs->r11, regs->r12);
	printk("%sR13: %016lx R14: %016lx R15: %016lx\n",
	       log_lvl, regs->r13, regs->r14, regs->r15);

	if (mode == SHOW_REGS_SHORT)
		return;

	if (mode == SHOW_REGS_USER) {
		rdmsrl(MSR_FS_BASE, fs);
		rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
		printk("%sFS:  %016lx GS:  %016lx\n",
		       log_lvl, fs, shadowgs);
		return;
	}

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = __read_cr3();
	cr4 = __read_cr4();

	printk("%sFS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       log_lvl, fs, fsindex, gs, gsindex, shadowgs);
	printk("%sCS:  %04x DS: %04x ES: %04x CR0: %016lx\n",
		log_lvl, regs->cs, ds, es, cr0);
	printk("%sCR2: %016lx CR3: %016lx CR4: %016lx\n",
		log_lvl, cr2, cr3, cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);

	/* Only print out debug registers if they are in their non-default state. */
	if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
	    (d6 == DR6_RESERVED) && (d7 == 0x400))) {
		printk("%sDR0: %016lx DR1: %016lx DR2: %016lx\n",
		       log_lvl, d0, d1, d2);
		printk("%sDR3: %016lx DR6: %016lx DR7: %016lx\n",
		       log_lvl, d3, d6, d7);
	}

	if (cr4 & X86_CR4_PKE)
		printk("%sPKRU: %08x\n", log_lvl, read_pkru());
}

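/*
 * No architecture-specific state is left to free for a dead task on
 * x86-64; just sanity-check that it no longer owns an mm.
 */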
void release_thread(struct task_struct *dead_task)
{
	WARN_ON(dead_task->mm);
}

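/* Which of the two user segment registers the helpers below operate on. */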
enum which_selector {
	FS,
	GS
};

/*
 * Out of line to be protected from kprobes and tracing. If this were
 * traced or probed, then any access to a per-CPU variable would happen
 * with the wrong GS.
 *
 * It is not used on Xen paravirt. When paravirt support is needed, it
 * needs to be renamed with a native_ prefix.
 */
static noinstr unsigned long __rdgsbase_inactive(void)
{
	unsigned long gsbase;

	lockdep_assert_irqs_disabled();

	/*
	 * SWAPGS is no longer needed, and thus NOT allowed, with FRED because
	 * FRED transitions ensure that an operating system can _always_
	 * operate with its own GS base address:
	 * - For events that occur in ring 3, FRED event delivery swaps
	 *   the GS base address with the IA32_KERNEL_GS_BASE MSR.
	 * - ERETU (the FRED transition that returns to ring 3) also swaps
	 *   the GS base address with the IA32_KERNEL_GS_BASE MSR.
	 *
	 * And the operating system can still set up the GS segment for a
	 * user thread without the need of loading a user thread GS with:
	 * - Using LKGS, available with FRED, to modify other attributes
	 *   of the GS segment without compromising its ability always to
	 *   operate with its own GS base address.
	 * - Accessing the GS segment base address for a user thread as
	 *   before using RDMSR or WRMSR on the IA32_KERNEL_GS_BASE MSR.
	 *
	 * Note, LKGS loads the GS base address into the IA32_KERNEL_GS_BASE
	 * MSR instead of the GS segment's descriptor cache. As such, the
	 * operating system never changes its runtime GS base address.
	 */
	if (!cpu_feature_enabled(X86_FEATURE_FRED) &&
	    !cpu_feature_enabled(X86_FEATURE_XENPV)) {
		native_swapgs();
		gsbase = rdgsbase();
		native_swapgs();
	} else {
		instrumentation_begin();
		rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
		instrumentation_end();
	}

	return gsbase;
}

/*
 * Out of line to be protected from kprobes and tracing. If this were
 * traced or probed, then any access to a per-CPU variable would happen
 * with the wrong GS.
 *
 * It is not used on Xen paravirt. When paravirt support is needed, it
 * needs to be renamed with a native_ prefix.
 */
static noinstr void __wrgsbase_inactive(unsigned long gsbase)
{
	lockdep_assert_irqs_disabled();

	if (!cpu_feature_enabled(X86_FEATURE_FRED) &&
	    !cpu_feature_enabled(X86_FEATURE_XENPV)) {
		native_swapgs();
		wrgsbase(gsbase);
		native_swapgs();
	} else {
		instrumentation_begin();
		wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
		instrumentation_end();
	}
}

/*
 * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
 * not available.  The goal is to be reasonably fast on non-FSGSBASE systems.
 * It's forcibly inlined because it'll generate better code and this function
 * is hot.
 */
static __always_inline void save_base_legacy(struct task_struct *prev_p,
					     unsigned short selector,
					     enum which_selector which)
{
	if (likely(selector == 0)) {
		/*
		 * On Intel (without X86_BUG_NULL_SEG), the segment base could
		 * be the pre-existing saved base or it could be zero.  On AMD
		 * (with X86_BUG_NULL_SEG), the segment base could be almost
		 * anything.
		 *
		 * This branch is very hot (it's hit twice on almost every
		 * context switch between 64-bit programs), and avoiding
		 * the RDMSR helps a lot, so we just assume that whatever
		 * value is already saved is correct.  This matches historical
		 * Linux behavior, so it won't break existing applications.
		 *
		 * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
		 * report that the base is zero, it needs to actually be zero:
		 * see the corresponding logic in load_seg_legacy.
		 */
	} else {
		/*
		 * If the selector is 1, 2, or 3, then the base is zero on
		 * !X86_BUG_NULL_SEG CPUs and could be anything on
		 * X86_BUG_NULL_SEG CPUs.  In the latter case, Linux
		 * has never attempted to preserve the base across context
		 * switches.
		 *
		 * If selector > 3, then it refers to a real segment, and
		 * saving the base isn't necessary.
		 */
		if (which == FS)
			prev_p->thread.fsbase = 0;
		else
			prev_p->thread.gsbase = 0;
	}
}

static __always_inline void save_fsgs(struct task_struct *task)
{
	savesegment(fs, task->thread.fsindex);
	savesegment(gs, task->thread.gsindex);
	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
		/*
		 * If FSGSBASE is enabled, we can't make any useful guesses
		 * about the base, and user code expects us to save the current
		 * value.  Fortunately, reading the base directly is efficient.
		 */
		task->thread.fsbase = rdfsbase();
		task->thread.gsbase = __rdgsbase_inactive();
	} else {
		save_base_legacy(task, task->thread.fsindex, FS);
		save_base_legacy(task, task->thread.gsindex, GS);
	}
}

/*
 * While a process is running, current->thread.fsbase and
 * current->thread.gsbase may not match the corresponding CPU registers
 * (see save_base_legacy()).
 */
void current_save_fsgs(void)
{
	unsigned long flags;

	/* Interrupts need to be off for FSGSBASE */
	local_irq_save(flags);
	save_fsgs(current);
	local_irq_restore(flags);
}
#if IS_ENABLED(CONFIG_KVM)
EXPORT_SYMBOL_GPL(current_save_fsgs);
#endif

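/*
 * Load a user FS or GS selector.  GS is loaded via load_gs_index() so the
 * kernel's current GS base is not clobbered by the selector write.
 */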
static __always_inline void loadseg(enum which_selector which,
				    unsigned short sel)
{
	if (which == FS)
		loadsegment(fs, sel);
	else
		load_gs_index(sel);
}

static __always_inline void load_seg_legacy(unsigned short prev_index,
					    unsigned long prev_base,
					    unsigned short next_index,
					    unsigned long next_base,
					    enum which_selector which)
{
	if (likely(next_index <= 3)) {
		/*
		 * The next task is using 64-bit TLS, is not using this
		 * segment at all, or is having fun with arcane CPU features.
		 */
		if (next_base == 0) {
			/*
			 * Nasty case: on AMD CPUs, we need to forcibly zero
			 * the base.
			 */
			if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
				loadseg(which, __USER_DS);
				loadseg(which, next_index);
			} else {
				/*
				 * We could try to exhaustively detect cases
				 * under which we can skip the segment load,
				 * but there's really only one case that matters
				 * for performance: if both the previous and
				 * next states are fully zeroed, we can skip
				 * the load.
				 *
				 * (This assumes that prev_base == 0 has no
				 * false positives.  This is the case on
				 * Intel-style CPUs.)
				 */
				if (likely(prev_index | next_index | prev_base))
					loadseg(which, next_index);
			}
		} else {
			if (prev_index != next_index)
				loadseg(which, next_index);
			wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
			       next_base);
		}
	} else {
		/*
		 * The next task is using a real segment.  Loading the selector
		 * is sufficient.
		 */
		loadseg(which, next_index);
	}
}

/*
 * Store prev's PKRU value and load next's PKRU value if they differ.  PKRU
 * is not XSTATE managed on context switch because that would require a
 * lookup in the task's FPU xsave buffer and would require keeping that
 * updated in various places.
 */
static __always_inline void x86_pkru_load(struct thread_struct *prev,
					  struct thread_struct *next)
{
	if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
		return;

	/* Stash the prev task's value: */
	prev->pkru = rdpkru();

	/*
	 * PKRU writes are slightly expensive.  Avoid them when not
	 * strictly necessary:
	 */
	if (prev->pkru != next->pkru)
		wrpkru(next->pkru);
}

static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
					      struct thread_struct *next)
{
	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
		/* Update the FS and GS selectors if they could have changed. */
		if (unlikely(prev->fsindex || next->fsindex))
			loadseg(FS, next->fsindex);
		if (unlikely(prev->gsindex || next->gsindex))
			loadseg(GS, next->gsindex);

		/* Update the bases. */
		wrfsbase(next->fsbase);
		__wrgsbase_inactive(next->gsbase);
	} else {
		load_seg_legacy(prev->fsindex, prev->fsbase,
				next->fsindex, next->fsbase, FS);
		load_seg_legacy(prev->gsindex, prev->gsbase,
				next->gsindex, next->gsbase, GS);
	}
}

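/*
 * Resolve the base address that a task's FS or GS selector refers to, by
 * looking it up in the GDT TLS slots or, for LDT selectors, in the task's
 * LDT.  Selectors that cannot carry a nonzero base report 0.
 */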
unsigned long x86_fsgsbase_read_task(struct task_struct *task,
				     unsigned short selector)
{
	unsigned short idx = selector >> 3;
	unsigned long base;

	if (likely((selector & SEGMENT_TI_MASK) == 0)) {
		if (unlikely(idx >= GDT_ENTRIES))
			return 0;

		/*
		 * There are no user segments in the GDT with nonzero bases
		 * other than the TLS segments.
		 */
		if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
			return 0;

		idx -= GDT_ENTRY_TLS_MIN;
		base = get_desc_base(&task->thread.tls_array[idx]);
	} else {
#ifdef CONFIG_MODIFY_LDT_SYSCALL
		struct ldt_struct *ldt;

		/*
		 * If performance here mattered, we could protect the LDT
		 * with RCU.  This is a slow path, though, so we can just
		 * take the mutex.
		 */
		mutex_lock(&task->mm->context.lock);
		ldt = task->mm->context.ldt;
		if (unlikely(!ldt || idx >= ldt->nr_entries))
			base = 0;
		else
			base = get_desc_base(ldt->entries + idx);
		mutex_unlock(&task->mm->context.lock);
#else
		base = 0;
#endif
	}

	return base;
}

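/*
 * The two helpers below read and write the inactive (user) GS base on this
 * CPU, i.e. the value that SWAPGS would install, using the FSGSBASE
 * instructions when available and the IA32_KERNEL_GS_BASE MSR otherwise.
 */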
unsigned long x86_gsbase_read_cpu_inactive(void)
{
	unsigned long gsbase;

	if (boot_cpu_has(X86_FEATURE_FSGSBASE)) {
		unsigned long flags;

		local_irq_save(flags);
		gsbase = __rdgsbase_inactive();
		local_irq_restore(flags);
	} else {
		rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
	}

	return gsbase;
}

void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
{
	if (boot_cpu_has(X86_FEATURE_FSGSBASE)) {
		unsigned long flags;

		local_irq_save(flags);
		__wrgsbase_inactive(gsbase);
		local_irq_restore(flags);
	} else {
		wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
	}
}

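/*
 * Read the FS/GS base of an arbitrary task.  For the current task the live
 * CPU state is authoritative; for other tasks the saved thread state (or,
 * on non-FSGSBASE systems with a nonzero selector, the descriptor tables)
 * is used.
 */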
unsigned long x86_fsbase_read_task(struct task_struct *task)
{
	unsigned long fsbase;

	if (task == current)
		fsbase = x86_fsbase_read_cpu();
	else if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
		 (task->thread.fsindex == 0))
		fsbase = task->thread.fsbase;
	else
		fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex);

	return fsbase;
}

unsigned long x86_gsbase_read_task(struct task_struct *task)
{
	unsigned long gsbase;

	if (task == current)
		gsbase = x86_gsbase_read_cpu_inactive();
	else if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
		 (task->thread.gsindex == 0))
		gsbase = task->thread.gsbase;
	else
		gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex);

	return gsbase;
}

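/*
 * Set the saved FS/GS base of a stopped task.  The caller is responsible
 * for getting the new value into the CPU if and when the task runs again;
 * hence the WARN_ON_ONCE() against being called on the current task.
 */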
void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase)
{
	WARN_ON_ONCE(task == current);

	task->thread.fsbase = fsbase;
}

void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
{
	WARN_ON_ONCE(task == current);

	task->thread.gsbase = gsbase;
}

static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
		    unsigned long new_sp,
		    u16 _cs, u16 _ss, u16 _ds)
{
	WARN_ON_ONCE(regs != current_pt_regs());

	if (static_cpu_has(X86_BUG_NULL_SEG)) {
		/* Loading zero below won't clear the base. */
		loadsegment(fs, __USER_DS);
		load_gs_index(__USER_DS);
	}

	reset_thread_features();

	loadsegment(fs, 0);
	loadsegment(es, _ds);
	loadsegment(ds, _ds);
	load_gs_index(0);

	regs->ip	= new_ip;
	regs->sp	= new_sp;
	regs->csx	= _cs;
	regs->ssx	= _ss;
	/*
	 * Allow single-step trap and NMI when starting a new task, thus
	 * once the new task enters user space, single-step trap and NMI
	 * are both enabled immediately.
	 *
	 * Entering a new task is logically speaking a return from a
	 * system call (exec, fork, clone, etc.). As such, if ptrace
	 * enables single stepping, a single-step exception should be
	 * allowed to trigger immediately upon entering user space.
	 * This is not optional.
	 *
	 * NMI should *never* be disabled in user space. As such, this
	 * is an optional, opportunistic way to catch errors.
	 *
	 * Paranoia: the high-order 48 bits above the lowest 16-bit SS are
	 * discarded by the legacy IRET instruction on all Intel, AMD,
	 * and Cyrix/Centaur/VIA CPUs, and thus can be set unconditionally,
	 * even when FRED is not enabled. But we choose the safer side
	 * and use these bits only when FRED is enabled.
	 */
	if (cpu_feature_enabled(X86_FEATURE_FRED)) {
		regs->fred_ss.swevent	= true;
		regs->fred_ss.nmi	= true;
	}

	regs->flags	= X86_EFLAGS_IF | X86_EFLAGS_FIXED;
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER_CS, __USER_DS, 0);
}
EXPORT_SYMBOL_GPL(start_thread);

#ifdef CONFIG_COMPAT
void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp, bool x32)
{
	start_thread_common(regs, new_ip, new_sp,
			    x32 ? __USER_CS : __USER32_CS,
			    __USER_DS, __USER_DS);
}
#endif

/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes are not supported here. Set the probe on schedule instead.
 * The function graph tracer is not supported either.
 */
__no_kmsan_checks
__visible __notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();

	WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
		     this_cpu_read(pcpu_hot.hardirq_stack_inuse));

	if (!test_tsk_thread_flag(prev_p, TIF_NEED_FPU_LOAD))
		switch_fpu_prepare(prev_p, cpu);

	/*
	 * We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	save_fsgs(prev_p);

	/*
	 * Load TLS before restoring any segments so that segment loads
	 * reference the correct GDT entries.
	 */
	load_TLS(next, cpu);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.  This
	 * must be done after loading TLS entries in the GDT but before
	 * loading segments that might reference them.
	 */
	arch_end_context_switch(next_p);

	/*
	 * Switch DS and ES.
	 *
	 * Reading them only returns the selectors, but writing them (if
	 * nonzero) loads the full descriptor from the GDT or LDT.  The
	 * LDT for next is loaded in switch_mm, and the GDT is loaded
	 * above.
	 *
	 * We therefore need to write new values to the segment
	 * registers on every context switch unless both the new and old
	 * values are zero.
	 *
	 * Note that we don't need to do anything for CS and SS, as
	 * those are saved and restored as part of pt_regs.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	x86_fsgsbase_load(prev, next);

	x86_pkru_load(prev, next);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	raw_cpu_write(pcpu_hot.current_task, next_p);
	raw_cpu_write(pcpu_hot.top_of_stack, task_top_of_stack(next_p));

	switch_fpu_finish(next_p);

	/* Reload sp0. */
	update_task_stack(next_p);

	switch_to_extra(prev_p, next_p);

	if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
		/*
		 * AMD CPUs have a misfeature: SYSRET sets the SS selector but
		 * does not update the cached descriptor.  As a result, if we
		 * do SYSRET while SS is NULL, we'll end up in user mode with
		 * SS apparently equal to __USER_DS but actually unusable.
		 *
		 * The straightforward workaround would be to fix it up just
		 * before SYSRET, but that would slow down the system call
		 * fast paths.  Instead, we ensure that SS is never NULL in
		 * system call context.  We do this by replacing NULL SS
		 * selectors at every context switch.  SYSCALL sets up a valid
		 * SS, so the only way to get NULL is to re-enter the kernel
		 * from CPL 3 through an interrupt.  Since that can't happen
		 * in the same task as a running syscall, we are guaranteed to
		 * context switch between every interrupt vector entry and a
		 * subsequent SYSRET.
		 *
		 * We read SS first because SS reads are much faster than
		 * writes.  Out of caution, we force SS to __KERNEL_DS even if
		 * it previously had a different non-NULL value.
		 */
		unsigned short ss_sel;
		savesegment(ss, ss_sel);
		if (ss_sel != __KERNEL_DS)
			loadsegment(ss, __KERNEL_DS);
	}

	/* Load the Intel cache allocation PQR MSR. */
	resctrl_sched_in(next_p);

	return prev_p;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_ADDR32);
	/* Pretend that this comes from a 64bit execve */
	task_pt_regs(current)->orig_ax = __NR_execve;
	current_thread_info()->status &= ~TS_COMPAT;
	if (current->mm)
		__set_bit(MM_CONTEXT_HAS_VSYSCALL, &current->mm->context.flags);

	/*
	 * TBD: overwrites user setup. Should have two bits.
	 * But 64bit processes have always behaved this way,
	 * so it's not too bad. The main problem is just that
	 * 32bit children are affected again.
	 */
	current->personality &= ~READ_IMPLIES_EXEC;
}

static void __set_personality_x32(void)
{
#ifdef CONFIG_X86_X32_ABI
	if (current->mm)
		current->mm->context.flags = 0;

	current->personality &= ~READ_IMPLIES_EXEC;
	/*
	 * in_32bit_syscall() uses the presence of the x32 syscall bit
	 * flag to determine compat status.  The x86 mmap() code relies on
	 * the syscall bitness, so set the x32 syscall bit right here to make
	 * in_32bit_syscall() work during exec().
	 *
	 * Pretend to come from an x32 execve.
	 */
	task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT;
	current_thread_info()->status &= ~TS_COMPAT;
#endif
}

static void __set_personality_ia32(void)
{
#ifdef CONFIG_IA32_EMULATION
	if (current->mm) {
		/*
		 * uprobes applied to this MM need to know this and
		 * cannot use user_64bit_mode() at that time.
		 */
		__set_bit(MM_CONTEXT_UPROBE_IA32, &current->mm->context.flags);
	}

	current->personality |= force_personality32;
	/* Prepare the first "return" to user space */
	task_pt_regs(current)->orig_ax = __NR_ia32_execve;
	current_thread_info()->status |= TS_COMPAT;
#endif
}

void set_personality_ia32(bool x32)
{
	/* Make sure to be in 32bit mode */
	set_thread_flag(TIF_ADDR32);

	if (x32)
		__set_personality_x32();
	else
		__set_personality_ia32();
}
EXPORT_SYMBOL_GPL(set_personality_ia32);

#ifdef CONFIG_CHECKPOINT_RESTORE
static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
{
	int ret;

	ret = map_vdso_once(image, addr);
	if (ret)
		return ret;

	return (long)image->size;
}
#endif

#ifdef CONFIG_ADDRESS_MASKING

#define LAM_U57_BITS 6

static void enable_lam_func(void *__mm)
{
	struct mm_struct *mm = __mm;
	unsigned long lam;

	if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm) {
		lam = mm_lam_cr3_mask(mm);
		write_cr3(__read_cr3() | lam);
		cpu_tlbstate_update_lam(lam, mm_untag_mask(mm));
	}
}

static void mm_enable_lam(struct mm_struct *mm)
{
	mm->context.lam_cr3_mask = X86_CR3_LAM_U57;
	mm->context.untag_mask = ~GENMASK(62, 57);

	/*
	 * Even though the process must still be single-threaded at this
	 * point, kernel threads may be using the mm.  IPI those kernel
	 * threads if they exist.
	 */
	on_each_cpu_mask(mm_cpumask(mm), enable_lam_func, mm, true);
	set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags);
}

static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits)
{
	if (!cpu_feature_enabled(X86_FEATURE_LAM))
		return -ENODEV;

	/* PTRACE_ARCH_PRCTL */
	if (current->mm != mm)
		return -EINVAL;

	if (mm_valid_pasid(mm) &&
	    !test_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &mm->context.flags))
		return -EINVAL;

	if (mmap_write_lock_killable(mm))
		return -EINTR;

	/*
	 * MM_CONTEXT_LOCK_LAM is set on clone.  Prevent LAM from
	 * being enabled unless the process is single threaded:
	 */
	if (test_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags)) {
		mmap_write_unlock(mm);
		return -EBUSY;
	}

	if (!nr_bits || nr_bits > LAM_U57_BITS) {
		mmap_write_unlock(mm);
		return -EINVAL;
	}

	mm_enable_lam(mm);

	mmap_write_unlock(mm);

	return 0;
}
#endif

long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
{
	int ret = 0;

	switch (option) {
	case ARCH_SET_GS: {
		if (unlikely(arg2 >= TASK_SIZE_MAX))
			return -EPERM;

		preempt_disable();
		/*
		 * ARCH_SET_GS has always overwritten the index
		 * and the base. Zero is the most sensible value
		 * to put in the index, and is the only value that
		 * makes any sense if FSGSBASE is unavailable.
		 */
		if (task == current) {
			loadseg(GS, 0);
			x86_gsbase_write_cpu_inactive(arg2);

			/*
			 * On non-FSGSBASE systems, save_base_legacy() expects
			 * that we also fill in thread.gsbase.
			 */
			task->thread.gsbase = arg2;

		} else {
			task->thread.gsindex = 0;
			x86_gsbase_write_task(task, arg2);
		}
		preempt_enable();
		break;
	}
	case ARCH_SET_FS: {
		/*
		 * Not strictly needed for %fs, but do it for symmetry
		 * with %gs
		 */
		if (unlikely(arg2 >= TASK_SIZE_MAX))
			return -EPERM;

		preempt_disable();
		/*
		 * Set the selector to 0 for the same reason
		 * as %gs above.
		 */
		if (task == current) {
			loadseg(FS, 0);
			x86_fsbase_write_cpu(arg2);

			/*
			 * On non-FSGSBASE systems, save_base_legacy() expects
			 * that we also fill in thread.fsbase.
			 */
			task->thread.fsbase = arg2;
		} else {
			task->thread.fsindex = 0;
			x86_fsbase_write_task(task, arg2);
		}
		preempt_enable();
		break;
	}
	case ARCH_GET_FS: {
		unsigned long base = x86_fsbase_read_task(task);

		ret = put_user(base, (unsigned long __user *)arg2);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base = x86_gsbase_read_task(task);

		ret = put_user(base, (unsigned long __user *)arg2);
		break;
	}

#ifdef CONFIG_CHECKPOINT_RESTORE
# ifdef CONFIG_X86_X32_ABI
	case ARCH_MAP_VDSO_X32:
		return prctl_map_vdso(&vdso_image_x32, arg2);
# endif
# if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
	case ARCH_MAP_VDSO_32:
		return prctl_map_vdso(&vdso_image_32, arg2);
# endif
	case ARCH_MAP_VDSO_64:
		return prctl_map_vdso(&vdso_image_64, arg2);
#endif
#ifdef CONFIG_ADDRESS_MASKING
	case ARCH_GET_UNTAG_MASK:
		return put_user(task->mm->context.untag_mask,
				(unsigned long __user *)arg2);
	case ARCH_ENABLE_TAGGED_ADDR:
		return prctl_enable_tagged_addr(task->mm, arg2);
	case ARCH_FORCE_TAGGED_SVA:
		if (current != task)
			return -EINVAL;
		set_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &task->mm->context.flags);
		return 0;
	case ARCH_GET_MAX_TAG_BITS:
		if (!cpu_feature_enabled(X86_FEATURE_LAM))
			return put_user(0, (unsigned long __user *)arg2);
		else
			return put_user(LAM_U57_BITS, (unsigned long __user *)arg2);
#endif
	case ARCH_SHSTK_ENABLE:
	case ARCH_SHSTK_DISABLE:
	case ARCH_SHSTK_LOCK:
	case ARCH_SHSTK_UNLOCK:
	case ARCH_SHSTK_STATUS:
		return shstk_prctl(task, option, arg2);
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
{
	long ret;

	ret = do_arch_prctl_64(current, option, arg2);
	if (ret == -EINVAL)
		ret = do_arch_prctl_common(option, arg2);

	return ret;
}

#ifdef CONFIG_IA32_EMULATION
COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
{
	return do_arch_prctl_common(option, arg2);
}
#endif

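/* Return a task's saved user stack pointer, as recorded in its pt_regs. */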
unsigned long KSTK_ESP(struct task_struct *task)
{
	return task_pt_regs(task)->sp;
}