1  // SPDX-License-Identifier: GPL-2.0-only
2  /*
3   * Debug helper to dump the current kernel pagetables of the system
4   * so that we can see what the various memory ranges are set to.
5   *
6   * (C) Copyright 2008 Intel Corporation
7   *
8   * Author: Arjan van de Ven <arjan@linux.intel.com>
9   */
10  
11  #include <linux/debugfs.h>
12  #include <linux/kasan.h>
13  #include <linux/mm.h>
14  #include <linux/init.h>
15  #include <linux/sched.h>
16  #include <linux/seq_file.h>
17  #include <linux/highmem.h>
18  #include <linux/pci.h>
19  #include <linux/ptdump.h>
20  
21  #include <asm/e820/types.h>
22  
23  /*
24   * The dumper groups pagetable entries of the same type into one, and for
25   * that it needs to keep some state when walking, and flush this state
26   * when a "break" in the continuity is found.
27   */
28  struct pg_state {
29  	struct ptdump_state ptdump;
30  	int level;
31  	pgprotval_t current_prot;
32  	pgprotval_t effective_prot;
33  	pgprotval_t prot_levels[5];
34  	unsigned long start_address;
35  	const struct addr_marker *marker;
36  	unsigned long lines;
37  	bool to_dmesg;
38  	bool check_wx;
39  	unsigned long wx_pages;
40  	struct seq_file *seq;
41  };
42  
/*
 * One labelled boundary in the virtual address space. The dumper prints
 * "---[ name ]---" when the walk reaches the marker's start address.
 */
struct addr_marker {
	/* First address belonging to this region. */
	unsigned long start_address;

	/* Label printed for the region; NULL in the end-of-table entry. */
	const char *name;

	/* Cap on lines printed for the region; 0 means no cap. */
	unsigned long max_lines;
};
48  
/* Address space marker hints */
50  
51  #ifdef CONFIG_X86_64
52  
/*
 * Slot numbers for address_markers[] (64-bit layout).
 *
 * Each enumerator is the array index of the matching marker, so the order
 * of this list is the order in which note_page() steps through the table.
 * Optional regions only get a slot when their config option is enabled.
 */
enum address_markers_idx {
	USER_SPACE_NR = 0,
	KERNEL_SPACE_NR,
#ifdef CONFIG_MODIFY_LDT_SYSCALL
	LDT_NR,
#endif
	LOW_KERNEL_NR,
	VMALLOC_START_NR,
	VMEMMAP_START_NR,
#ifdef CONFIG_KASAN
	KASAN_SHADOW_START_NR,
	KASAN_SHADOW_END_NR,
#endif
	CPU_ENTRY_AREA_NR,
#ifdef CONFIG_X86_ESPFIX64
	ESPFIX_START_NR,
#endif
#ifdef CONFIG_EFI
	EFI_END_NR,
#endif
	HIGH_KERNEL_NR,
	MODULES_VADDR_NR,
	MODULES_END_NR,
	FIXADDR_START_NR,
	END_OF_SPACE_NR,	/* must stay last: index of the end-of-table entry */
};
79  
80  static struct addr_marker address_markers[] = {
81  	[USER_SPACE_NR]		= { 0,			"User Space" },
82  	[KERNEL_SPACE_NR]	= { (1UL << 63),	"Kernel Space" },
83  	[LOW_KERNEL_NR]		= { 0UL,		"Low Kernel Mapping" },
84  	[VMALLOC_START_NR]	= { 0UL,		"vmalloc() Area" },
85  	[VMEMMAP_START_NR]	= { 0UL,		"Vmemmap" },
86  #ifdef CONFIG_KASAN
87  	/*
88  	 * These fields get initialized with the (dynamic)
89  	 * KASAN_SHADOW_{START,END} values in pt_dump_init().
90  	 */
91  	[KASAN_SHADOW_START_NR]	= { 0UL,		"KASAN shadow" },
92  	[KASAN_SHADOW_END_NR]	= { 0UL,		"KASAN shadow end" },
93  #endif
94  #ifdef CONFIG_MODIFY_LDT_SYSCALL
95  	[LDT_NR]		= { 0UL,		"LDT remap" },
96  #endif
97  	[CPU_ENTRY_AREA_NR]	= { CPU_ENTRY_AREA_BASE,"CPU entry Area" },
98  #ifdef CONFIG_X86_ESPFIX64
99  	[ESPFIX_START_NR]	= { ESPFIX_BASE_ADDR,	"ESPfix Area", 16 },
100  #endif
101  #ifdef CONFIG_EFI
102  	[EFI_END_NR]		= { EFI_VA_END,		"EFI Runtime Services" },
103  #endif
104  	[HIGH_KERNEL_NR]	= { __START_KERNEL_map,	"High Kernel Mapping" },
105  	[MODULES_VADDR_NR]	= { MODULES_VADDR,	"Modules" },
106  	[MODULES_END_NR]	= { MODULES_END,	"End Modules" },
107  	[FIXADDR_START_NR]	= { FIXADDR_START,	"Fixmap Area" },
108  	[END_OF_SPACE_NR]	= { -1,			NULL }
109  };
110  
111  #define INIT_PGD	((pgd_t *) &init_top_pgt)
112  
113  #else /* CONFIG_X86_64 */
114  
/*
 * Slot numbers for address_markers[] (32-bit layout).
 *
 * Each enumerator is the array index of the matching marker, so the order
 * of this list is the order in which note_page() steps through the table.
 * Optional regions only get a slot when their config option is enabled.
 */
enum address_markers_idx {
	USER_SPACE_NR = 0,
	KERNEL_SPACE_NR,
	VMALLOC_START_NR,
	VMALLOC_END_NR,
#ifdef CONFIG_HIGHMEM
	PKMAP_BASE_NR,
#endif
#ifdef CONFIG_MODIFY_LDT_SYSCALL
	LDT_NR,
#endif
	CPU_ENTRY_AREA_NR,
	FIXADDR_START_NR,
	END_OF_SPACE_NR,	/* must stay last: index of the end-of-table entry */
};
130  
131  static struct addr_marker address_markers[] = {
132  	[USER_SPACE_NR]		= { 0,			"User Space" },
133  	[KERNEL_SPACE_NR]	= { PAGE_OFFSET,	"Kernel Mapping" },
134  	[VMALLOC_START_NR]	= { 0UL,		"vmalloc() Area" },
135  	[VMALLOC_END_NR]	= { 0UL,		"vmalloc() End" },
136  #ifdef CONFIG_HIGHMEM
137  	[PKMAP_BASE_NR]		= { 0UL,		"Persistent kmap() Area" },
138  #endif
139  #ifdef CONFIG_MODIFY_LDT_SYSCALL
140  	[LDT_NR]		= { 0UL,		"LDT remap" },
141  #endif
142  	[CPU_ENTRY_AREA_NR]	= { 0UL,		"CPU entry area" },
143  	[FIXADDR_START_NR]	= { 0UL,		"Fixmap area" },
144  	[END_OF_SPACE_NR]	= { -1,			NULL }
145  };
146  
147  #define INIT_PGD	(swapper_pg_dir)
148  
149  #endif /* !CONFIG_X86_64 */
150  
/*
 * Address multipliers per page-table level. The PTE multiplier is
 * PAGE_SIZE; each higher level multiplies the level below it by the
 * number of entries in that lower-level table.
 */
#define PTE_LEVEL_MULT	(PAGE_SIZE)
#define PMD_LEVEL_MULT	(PTE_LEVEL_MULT * PTRS_PER_PTE)
#define PUD_LEVEL_MULT	(PMD_LEVEL_MULT * PTRS_PER_PMD)
#define P4D_LEVEL_MULT	(PUD_LEVEL_MULT * PTRS_PER_PUD)
#define PGD_LEVEL_MULT	(P4D_LEVEL_MULT * PTRS_PER_P4D)
157  
/*
 * Start a new output line.
 *
 * When @to_dmesg is true the text goes to printk() at KERN_INFO level.
 * Otherwise it goes to the seq_file @m, and is silently dropped if @m is
 * NULL. @fmt has to be a string literal because it is pasted directly
 * after the log-level prefix.
 */
#define pt_dump_seq_printf(m, to_dmesg, fmt, args...)		\
({								\
	if (to_dmesg) {						\
		printk(KERN_INFO fmt, ##args);			\
	} else if (m) {						\
		seq_printf(m, fmt, ##args);			\
	}							\
})
166  
/*
 * Continue the output line started by pt_dump_seq_printf().
 *
 * When @to_dmesg is true the text goes to printk() with KERN_CONT.
 * Otherwise it goes to the seq_file @m, and is silently dropped if @m is
 * NULL. @fmt has to be a string literal because it is pasted directly
 * after the log-level prefix.
 */
#define pt_dump_cont_printf(m, to_dmesg, fmt, args...)		\
({								\
	if (to_dmesg) {						\
		printk(KERN_CONT fmt, ##args);			\
	} else if (m) {						\
		seq_printf(m, fmt, ##args);			\
	}							\
})
175  
176  /*
177   * Print a readable form of a pgprot_t to the seq_file
178   */
printk_prot(struct seq_file * m,pgprotval_t pr,int level,bool dmsg)179  static void printk_prot(struct seq_file *m, pgprotval_t pr, int level, bool dmsg)
180  {
181  	static const char * const level_name[] =
182  		{ "pgd", "p4d", "pud", "pmd", "pte" };
183  
184  	if (!(pr & _PAGE_PRESENT)) {
185  		/* Not present */
186  		pt_dump_cont_printf(m, dmsg, "                              ");
187  	} else {
188  		if (pr & _PAGE_USER)
189  			pt_dump_cont_printf(m, dmsg, "USR ");
190  		else
191  			pt_dump_cont_printf(m, dmsg, "    ");
192  		if (pr & _PAGE_RW)
193  			pt_dump_cont_printf(m, dmsg, "RW ");
194  		else
195  			pt_dump_cont_printf(m, dmsg, "ro ");
196  		if (pr & _PAGE_PWT)
197  			pt_dump_cont_printf(m, dmsg, "PWT ");
198  		else
199  			pt_dump_cont_printf(m, dmsg, "    ");
200  		if (pr & _PAGE_PCD)
201  			pt_dump_cont_printf(m, dmsg, "PCD ");
202  		else
203  			pt_dump_cont_printf(m, dmsg, "    ");
204  
205  		/* Bit 7 has a different meaning on level 3 vs 4 */
206  		if (level <= 3 && pr & _PAGE_PSE)
207  			pt_dump_cont_printf(m, dmsg, "PSE ");
208  		else
209  			pt_dump_cont_printf(m, dmsg, "    ");
210  		if ((level == 4 && pr & _PAGE_PAT) ||
211  		    ((level == 3 || level == 2) && pr & _PAGE_PAT_LARGE))
212  			pt_dump_cont_printf(m, dmsg, "PAT ");
213  		else
214  			pt_dump_cont_printf(m, dmsg, "    ");
215  		if (pr & _PAGE_GLOBAL)
216  			pt_dump_cont_printf(m, dmsg, "GLB ");
217  		else
218  			pt_dump_cont_printf(m, dmsg, "    ");
219  		if (pr & _PAGE_NX)
220  			pt_dump_cont_printf(m, dmsg, "NX ");
221  		else
222  			pt_dump_cont_printf(m, dmsg, "x  ");
223  	}
224  	pt_dump_cont_printf(m, dmsg, "%s\n", level_name[level]);
225  }
226  
note_wx(struct pg_state * st,unsigned long addr)227  static void note_wx(struct pg_state *st, unsigned long addr)
228  {
229  	unsigned long npages;
230  
231  	npages = (addr - st->start_address) / PAGE_SIZE;
232  
233  #ifdef CONFIG_PCI_BIOS
234  	/*
235  	 * If PCI BIOS is enabled, the PCI BIOS area is forced to WX.
236  	 * Inform about it, but avoid the warning.
237  	 */
238  	if (pcibios_enabled && st->start_address >= PAGE_OFFSET + BIOS_BEGIN &&
239  	    addr <= PAGE_OFFSET + BIOS_END) {
240  		pr_warn_once("x86/mm: PCI BIOS W+X mapping %lu pages\n", npages);
241  		return;
242  	}
243  #endif
244  	/* Account the WX pages */
245  	st->wx_pages += npages;
246  	WARN_ONCE(__supported_pte_mask & _PAGE_NX,
247  		  "x86/mm: Found insecure W+X mapping at address %pS\n",
248  		  (void *)st->start_address);
249  }
250  
/*
 * ptdump callback: record the effective protection for an entry at @level.
 *
 * At the top level the entry's own flags are stored as-is. Below that,
 * USER and RW are kept only if both the parent level and the entry have
 * them, while NX is kept if either one has it. The result is stored in
 * st->prot_levels[@level] for use by deeper levels and by note_page().
 */
static void effective_prot(struct ptdump_state *pt_st, int level, u64 val)
{
	struct pg_state *st = container_of(pt_st, struct pg_state, ptdump);
	const pgprotval_t own = val & PTE_FLAGS_MASK;
	pgprotval_t parent;

	if (level <= 0) {
		/* Nothing above this level to combine with */
		st->prot_levels[level] = own;
		return;
	}

	parent = st->prot_levels[level - 1];
	st->prot_levels[level] =
		(parent & own & (_PAGE_USER | _PAGE_RW)) |
		((parent | own) & _PAGE_NX);
}
268  
269  /*
270   * This function gets called on a break in a continuous series
271   * of PTE entries; the next one is different so we need to
272   * print what we collected so far.
273   */
note_page(struct ptdump_state * pt_st,unsigned long addr,int level,u64 val)274  static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
275  		      u64 val)
276  {
277  	struct pg_state *st = container_of(pt_st, struct pg_state, ptdump);
278  	pgprotval_t new_prot, new_eff;
279  	pgprotval_t cur, eff;
280  	static const char units[] = "BKMGTPE";
281  	struct seq_file *m = st->seq;
282  
283  	new_prot = val & PTE_FLAGS_MASK;
284  	if (!val)
285  		new_eff = 0;
286  	else
287  		new_eff = st->prot_levels[level];
288  
289  	/*
290  	 * If we have a "break" in the series, we need to flush the state that
291  	 * we have now. "break" is either changing perms, levels or
292  	 * address space marker.
293  	 */
294  	cur = st->current_prot;
295  	eff = st->effective_prot;
296  
297  	if (st->level == -1) {
298  		/* First entry */
299  		st->current_prot = new_prot;
300  		st->effective_prot = new_eff;
301  		st->level = level;
302  		st->marker = address_markers;
303  		st->lines = 0;
304  		pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
305  				   st->marker->name);
306  	} else if (new_prot != cur || new_eff != eff || level != st->level ||
307  		   addr >= st->marker[1].start_address) {
308  		const char *unit = units;
309  		unsigned long delta;
310  		int width = sizeof(unsigned long) * 2;
311  
312  		if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX))
313  			note_wx(st, addr);
314  
315  		/*
316  		 * Now print the actual finished series
317  		 */
318  		if (!st->marker->max_lines ||
319  		    st->lines < st->marker->max_lines) {
320  			pt_dump_seq_printf(m, st->to_dmesg,
321  					   "0x%0*lx-0x%0*lx   ",
322  					   width, st->start_address,
323  					   width, addr);
324  
325  			delta = addr - st->start_address;
326  			while (!(delta & 1023) && unit[1]) {
327  				delta >>= 10;
328  				unit++;
329  			}
330  			pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ",
331  					    delta, *unit);
332  			printk_prot(m, st->current_prot, st->level,
333  				    st->to_dmesg);
334  		}
335  		st->lines++;
336  
337  		/*
338  		 * We print markers for special areas of address space,
339  		 * such as the start of vmalloc space etc.
340  		 * This helps in the interpretation.
341  		 */
342  		if (addr >= st->marker[1].start_address) {
343  			if (st->marker->max_lines &&
344  			    st->lines > st->marker->max_lines) {
345  				unsigned long nskip =
346  					st->lines - st->marker->max_lines;
347  				pt_dump_seq_printf(m, st->to_dmesg,
348  						   "... %lu entr%s skipped ... \n",
349  						   nskip,
350  						   nskip == 1 ? "y" : "ies");
351  			}
352  			st->marker++;
353  			st->lines = 0;
354  			pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
355  					   st->marker->name);
356  		}
357  
358  		st->start_address = addr;
359  		st->current_prot = new_prot;
360  		st->effective_prot = new_eff;
361  		st->level = level;
362  	}
363  }
364  
/*
 * Walk the page tables rooted at @pgd and dump and/or W+X-check them.
 *
 * @m:       seq_file for the dump, or NULL for no seq_file output
 * @mm:      mm passed through to ptdump_walk_pgd()
 * @pgd:     top-level page table to start from
 * @checkwx: account writable+executable mappings and log a verdict
 * @dmesg:   print the dump through printk() instead of @m
 *
 * Returns true when @checkwx is false or no W+X pages were found, and
 * false when at least one W+X page was accounted.
 */
bool ptdump_walk_pgd_level_core(struct seq_file *m,
				struct mm_struct *mm, pgd_t *pgd,
				bool checkwx, bool dmesg)
{
	/*
	 * On 64-bit the walk covers the lower half of the address space and
	 * then everything from GUARD_HOLE_END_ADDR upwards; on 32-bit it
	 * covers the whole space. The { 0, 0 } entry closes the list.
	 */
	const struct ptdump_range ranges[] = {
#ifdef CONFIG_X86_64
		{ 0, PTRS_PER_PGD * PGD_LEVEL_MULT / 2 },
		{ GUARD_HOLE_END_ADDR, ~0UL },
#else
		{ 0, ~0UL },
#endif
		{ 0, 0 }
	};
	struct pg_state state = {
		.ptdump = {
			.note_page	= note_page,
			.effective_prot	= effective_prot,
			.range		= ranges
		},
		.level		= -1,	/* no entry seen yet */
		.to_dmesg	= dmesg,
		.check_wx	= checkwx,
		.seq		= m
	};

	ptdump_walk_pgd(&state.ptdump, mm, pgd);

	if (!checkwx)
		return true;

	if (!state.wx_pages) {
		pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n");
		return true;
	}

	pr_info("x86/mm: Checked W+X mappings: FAILED, %lu W+X pages found.\n",
		state.wx_pages);
	return false;
}
406  
ptdump_walk_pgd_level(struct seq_file * m,struct mm_struct * mm)407  void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm)
408  {
409  	ptdump_walk_pgd_level_core(m, mm, mm->pgd, false, true);
410  }
411  
/*
 * Dump the page tables of @mm into the seq_file @m, without W+X checks.
 *
 * With page table isolation built in and active on the boot CPU, a true
 * @user switches the walk from mm->pgd to the pgd returned by
 * kernel_to_user_pgdp(); otherwise @user has no effect.
 */
void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm,
				   bool user)
{
	pgd_t *root = mm->pgd;

#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
	if (user && boot_cpu_has(X86_FEATURE_PTI))
		root = kernel_to_user_pgdp(root);
#endif

	ptdump_walk_pgd_level_core(m, mm, root, false, false);
}
EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs);
423  
/*
 * Check the user-space copy of the init page tables for W+X mappings.
 *
 * Does nothing unless page table isolation is built in, NX is supported
 * and PTI is active on the boot CPU. Nothing is dumped; only the W+X
 * verdict is logged.
 */
void ptdump_walk_user_pgd_level_checkwx(void)
{
#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
	pgd_t *user_pgd;

	/* Without NX there is no executable bit to check */
	if (!(__supported_pte_mask & _PAGE_NX))
		return;

	/* Without PTI there is no separate user page table */
	if (!boot_cpu_has(X86_FEATURE_PTI))
		return;

	pr_info("x86/mm: Checking user space page tables\n");
	user_pgd = kernel_to_user_pgdp(INIT_PGD);
	ptdump_walk_pgd_level_core(NULL, &init_mm, user_pgd, true, false);
#endif
}
438  
ptdump_walk_pgd_level_checkwx(void)439  bool ptdump_walk_pgd_level_checkwx(void)
440  {
441  	if (!(__supported_pte_mask & _PAGE_NX))
442  		return true;
443  
444  	return ptdump_walk_pgd_level_core(NULL, &init_mm, INIT_PGD, true, false);
445  }
446  
pt_dump_init(void)447  static int __init pt_dump_init(void)
448  {
449  	/*
450  	 * Various markers are not compile-time constants, so assign them
451  	 * here.
452  	 */
453  #ifdef CONFIG_X86_64
454  	address_markers[LOW_KERNEL_NR].start_address = PAGE_OFFSET;
455  	address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
456  	address_markers[VMEMMAP_START_NR].start_address = VMEMMAP_START;
457  #ifdef CONFIG_MODIFY_LDT_SYSCALL
458  	address_markers[LDT_NR].start_address = LDT_BASE_ADDR;
459  #endif
460  #ifdef CONFIG_KASAN
461  	address_markers[KASAN_SHADOW_START_NR].start_address = KASAN_SHADOW_START;
462  	address_markers[KASAN_SHADOW_END_NR].start_address = KASAN_SHADOW_END;
463  #endif
464  #endif
465  #ifdef CONFIG_X86_32
466  	address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
467  	address_markers[VMALLOC_END_NR].start_address = VMALLOC_END;
468  # ifdef CONFIG_HIGHMEM
469  	address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE;
470  # endif
471  	address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
472  	address_markers[CPU_ENTRY_AREA_NR].start_address = CPU_ENTRY_AREA_BASE;
473  # ifdef CONFIG_MODIFY_LDT_SYSCALL
474  	address_markers[LDT_NR].start_address = LDT_BASE_ADDR;
475  # endif
476  #endif
477  	return 0;
478  }
479  __initcall(pt_dump_init);
480