1  
2  // SPDX-License-Identifier: GPL-2.0-only
3  /*
4   *  linux/mm/memory.c
5   *
6   *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
7   */
8  
9  /*
10   * demand-loading started 01.12.91 - seems it is high on the list of
11   * things wanted, and it should be easy to implement. - Linus
12   */
13  
14  /*
15   * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
16   * pages started 02.12.91, seems to work. - Linus.
17   *
18   * Tested sharing by executing about 30 /bin/sh: under the old kernel it
19   * would have taken more than the 6M I have free, but it worked well as
20   * far as I could see.
21   *
22   * Also corrected some "invalidate()"s - I wasn't doing enough of them.
23   */
24  
25  /*
26   * Real VM (paging to/from disk) started 18.12.91. Much more work and
27   * thought has to go into this. Oh, well..
28   * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
29   *		Found it. Everything seems to work now.
30   * 20.12.91  -  Ok, making the swap-device changeable like the root.
31   */
32  
33  /*
34   * 05.04.94  -  Multi-page memory management added for v1.1.
35   *              Idea by Alex Bligh (alex@cconcepts.co.uk)
36   *
37   * 16.07.99  -  Support of BIGMEM added by Gerhard Wichert, Siemens AG
38   *		(Gerhard.Wichert@pdb.siemens.de)
39   *
40   * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
41   */
42  
43  #include <linux/kernel_stat.h>
44  #include <linux/mm.h>
45  #include <linux/mm_inline.h>
46  #include <linux/sched/mm.h>
47  #include <linux/sched/coredump.h>
48  #include <linux/sched/numa_balancing.h>
49  #include <linux/sched/task.h>
50  #include <linux/hugetlb.h>
51  #include <linux/mman.h>
52  #include <linux/swap.h>
53  #include <linux/highmem.h>
54  #include <linux/pagemap.h>
55  #include <linux/memremap.h>
56  #include <linux/kmsan.h>
57  #include <linux/ksm.h>
58  #include <linux/rmap.h>
59  #include <linux/export.h>
60  #include <linux/delayacct.h>
61  #include <linux/init.h>
62  #include <linux/pfn_t.h>
63  #include <linux/writeback.h>
64  #include <linux/memcontrol.h>
65  #include <linux/mmu_notifier.h>
66  #include <linux/swapops.h>
67  #include <linux/elf.h>
68  #include <linux/gfp.h>
69  #include <linux/migrate.h>
70  #include <linux/string.h>
71  #include <linux/memory-tiers.h>
72  #include <linux/debugfs.h>
73  #include <linux/userfaultfd_k.h>
74  #include <linux/dax.h>
75  #include <linux/oom.h>
76  #include <linux/numa.h>
77  #include <linux/perf_event.h>
78  #include <linux/ptrace.h>
79  #include <linux/vmalloc.h>
80  #include <linux/sched/sysctl.h>
81  
82  #include <trace/events/kmem.h>
83  
84  #include <asm/io.h>
85  #include <asm/mmu_context.h>
86  #include <asm/pgalloc.h>
87  #include <linux/uaccess.h>
88  #include <asm/tlb.h>
89  #include <asm/tlbflush.h>
90  
91  #include "pgalloc-track.h"
92  #include "internal.h"
93  #include "swap.h"
94  
95  #if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
96  #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
97  #endif
98  
99  #ifndef CONFIG_NUMA
100  unsigned long max_mapnr;
101  EXPORT_SYMBOL(max_mapnr);
102  
103  struct page *mem_map;
104  EXPORT_SYMBOL(mem_map);
105  #endif
106  
107  static vm_fault_t do_fault(struct vm_fault *vmf);
108  static vm_fault_t do_anonymous_page(struct vm_fault *vmf);
109  static bool vmf_pte_changed(struct vm_fault *vmf);
110  
111  /*
112   * Return true if the original pte was a uffd-wp pte marker (so the pte was
113   * wr-protected).
114   */
115  static __always_inline bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf)
116  {
117  	if (!userfaultfd_wp(vmf->vma))
118  		return false;
119  	if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
120  		return false;
121  
122  	return pte_marker_uffd_wp(vmf->orig_pte);
123  }
124  
125  /*
126   * A number of key systems in x86 including ioremap() rely on the assumption
127   * that high_memory defines the upper bound on direct map memory, the end
128   * of ZONE_NORMAL.
129   */
130  void *high_memory;
131  EXPORT_SYMBOL(high_memory);
132  
133  /*
134   * Randomize the address space (stacks, mmaps, brk, etc.).
135   *
136   * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
137   *   as ancient (libc5 based) binaries can segfault. )
138   */
139  int randomize_va_space __read_mostly =
140  #ifdef CONFIG_COMPAT_BRK
141  					1;
142  #else
143  					2;
144  #endif
145  
146  #ifndef arch_wants_old_prefaulted_pte
147  static inline bool arch_wants_old_prefaulted_pte(void)
148  {
149  	/*
150  	 * Transitioning a PTE from 'old' to 'young' can be expensive on
151  	 * some architectures, even if it's performed in hardware. By
152  	 * default, "false" means prefaulted entries will be 'young'.
153  	 */
154  	return false;
155  }
156  #endif
157  
158  static int __init disable_randmaps(char *s)
159  {
160  	randomize_va_space = 0;
161  	return 1;
162  }
163  __setup("norandmaps", disable_randmaps);
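/*
 * Note: the same policy is tunable at runtime via the
 * kernel.randomize_va_space sysctl (/proc/sys/kernel/randomize_va_space):
 * 0 disables randomization, 1 randomizes stack/mmap/vdso placement, and
 * 2 additionally randomizes the brk base, which is why CONFIG_COMPAT_BRK
 * defaults the variable above to 1.
 */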
164  
165  unsigned long zero_pfn __read_mostly;
166  EXPORT_SYMBOL(zero_pfn);
167  
168  unsigned long highest_memmap_pfn __read_mostly;
169  
170  /*
171   * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
172   */
173  static int __init init_zero_pfn(void)
174  {
175  	zero_pfn = page_to_pfn(ZERO_PAGE(0));
176  	return 0;
177  }
178  early_initcall(init_zero_pfn);
179  
180  void mm_trace_rss_stat(struct mm_struct *mm, int member)
181  {
182  	trace_rss_stat(mm, member);
183  }
184  
185  /*
186   * Note: this doesn't free the actual pages themselves. That
187   * has been handled earlier when unmapping all the memory regions.
188   */
189  static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
190  			   unsigned long addr)
191  {
192  	pgtable_t token = pmd_pgtable(*pmd);
193  	pmd_clear(pmd);
194  	pte_free_tlb(tlb, token, addr);
195  	mm_dec_nr_ptes(tlb->mm);
196  }
197  
198  static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
199  				unsigned long addr, unsigned long end,
200  				unsigned long floor, unsigned long ceiling)
201  {
202  	pmd_t *pmd;
203  	unsigned long next;
204  	unsigned long start;
205  
206  	start = addr;
207  	pmd = pmd_offset(pud, addr);
208  	do {
209  		next = pmd_addr_end(addr, end);
210  		if (pmd_none_or_clear_bad(pmd))
211  			continue;
212  		free_pte_range(tlb, pmd, addr);
213  	} while (pmd++, addr = next, addr != end);
214  
215  	start &= PUD_MASK;
216  	if (start < floor)
217  		return;
218  	if (ceiling) {
219  		ceiling &= PUD_MASK;
220  		if (!ceiling)
221  			return;
222  	}
223  	if (end - 1 > ceiling - 1)
224  		return;
225  
226  	pmd = pmd_offset(pud, start);
227  	pud_clear(pud);
228  	pmd_free_tlb(tlb, pmd, start);
229  	mm_dec_nr_pmds(tlb->mm);
230  }
231  
232  static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
233  				unsigned long addr, unsigned long end,
234  				unsigned long floor, unsigned long ceiling)
235  {
236  	pud_t *pud;
237  	unsigned long next;
238  	unsigned long start;
239  
240  	start = addr;
241  	pud = pud_offset(p4d, addr);
242  	do {
243  		next = pud_addr_end(addr, end);
244  		if (pud_none_or_clear_bad(pud))
245  			continue;
246  		free_pmd_range(tlb, pud, addr, next, floor, ceiling);
247  	} while (pud++, addr = next, addr != end);
248  
249  	start &= P4D_MASK;
250  	if (start < floor)
251  		return;
252  	if (ceiling) {
253  		ceiling &= P4D_MASK;
254  		if (!ceiling)
255  			return;
256  	}
257  	if (end - 1 > ceiling - 1)
258  		return;
259  
260  	pud = pud_offset(p4d, start);
261  	p4d_clear(p4d);
262  	pud_free_tlb(tlb, pud, start);
263  	mm_dec_nr_puds(tlb->mm);
264  }
265  
266  static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
267  				unsigned long addr, unsigned long end,
268  				unsigned long floor, unsigned long ceiling)
269  {
270  	p4d_t *p4d;
271  	unsigned long next;
272  	unsigned long start;
273  
274  	start = addr;
275  	p4d = p4d_offset(pgd, addr);
276  	do {
277  		next = p4d_addr_end(addr, end);
278  		if (p4d_none_or_clear_bad(p4d))
279  			continue;
280  		free_pud_range(tlb, p4d, addr, next, floor, ceiling);
281  	} while (p4d++, addr = next, addr != end);
282  
283  	start &= PGDIR_MASK;
284  	if (start < floor)
285  		return;
286  	if (ceiling) {
287  		ceiling &= PGDIR_MASK;
288  		if (!ceiling)
289  			return;
290  	}
291  	if (end - 1 > ceiling - 1)
292  		return;
293  
294  	p4d = p4d_offset(pgd, start);
295  	pgd_clear(pgd);
296  	p4d_free_tlb(tlb, p4d, start);
297  }
298  
299  /*
300   * This function frees user-level page tables of a process.
301   */
302  void free_pgd_range(struct mmu_gather *tlb,
303  			unsigned long addr, unsigned long end,
304  			unsigned long floor, unsigned long ceiling)
305  {
306  	pgd_t *pgd;
307  	unsigned long next;
308  
309  	/*
310  	 * The next few lines have given us lots of grief...
311  	 *
312  	 * Why are we testing PMD* at this top level?  Because often
313  	 * there will be no work to do at all, and we'd prefer not to
314  	 * go all the way down to the bottom just to discover that.
315  	 *
316  	 * Why all these "- 1"s?  Because 0 represents both the bottom
317  	 * of the address space and the top of it (using -1 for the
318  	 * top wouldn't help much: the masks would do the wrong thing).
319  	 * The rule is that addr 0 and floor 0 refer to the bottom of
320  	 * the address space, but end 0 and ceiling 0 refer to the top.
321  	 * Comparisons need to use "end - 1" and "ceiling - 1" (though
322  	 * that end 0 case should be mythical).
323  	 *
324  	 * Wherever addr is brought up or ceiling brought down, we must
325  	 * be careful to reject "the opposite 0" before it confuses the
326  	 * subsequent tests.  But what about where end is brought down
327  	 * by PMD_SIZE below? no, end can't go down to 0 there.
328  	 *
329  	 * Whereas we round start (addr) and ceiling down, by different
330  	 * masks at different levels, in order to test whether a table
331  	 * now has no other vmas using it, so can be freed, we don't
332  	 * bother to round floor or end up - the tests don't need that.
333  	 */
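	/*
	 * Illustrative example: with ceiling == 0 (meaning "top of the
	 * address space") the "if (ceiling)" block below is skipped, and
	 * "ceiling - 1" wraps to the highest address, so the
	 * "end - 1 > ceiling - 1" test never truncates the range and the
	 * tables may be freed all the way up.
	 */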
334  
335  	addr &= PMD_MASK;
336  	if (addr < floor) {
337  		addr += PMD_SIZE;
338  		if (!addr)
339  			return;
340  	}
341  	if (ceiling) {
342  		ceiling &= PMD_MASK;
343  		if (!ceiling)
344  			return;
345  	}
346  	if (end - 1 > ceiling - 1)
347  		end -= PMD_SIZE;
348  	if (addr > end - 1)
349  		return;
350  	/*
351  	 * We free page table pages in PAGE_SIZE units (see pte_free_tlb()),
352  	 * so flush the TLB now if the gather was batching a different page size.
353  	 */
354  	tlb_change_page_size(tlb, PAGE_SIZE);
355  	pgd = pgd_offset(tlb->mm, addr);
356  	do {
357  		next = pgd_addr_end(addr, end);
358  		if (pgd_none_or_clear_bad(pgd))
359  			continue;
360  		free_p4d_range(tlb, pgd, addr, next, floor, ceiling);
361  	} while (pgd++, addr = next, addr != end);
362  }
363  
364  void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
365  		   struct vm_area_struct *vma, unsigned long floor,
366  		   unsigned long ceiling, bool mm_wr_locked)
367  {
368  	struct unlink_vma_file_batch vb;
369  
370  	do {
371  		unsigned long addr = vma->vm_start;
372  		struct vm_area_struct *next;
373  
374  		/*
375  		 * Note: USER_PGTABLES_CEILING may be passed as ceiling and may
376  		 * be 0.  This will underflow and is okay.
377  		 */
378  		next = mas_find(mas, ceiling - 1);
379  		if (unlikely(xa_is_zero(next)))
380  			next = NULL;
381  
382  		/*
383  		 * Hide vma from rmap and truncate_pagecache before freeing
384  		 * pgtables
385  		 */
386  		if (mm_wr_locked)
387  			vma_start_write(vma);
388  		unlink_anon_vmas(vma);
389  
390  		if (is_vm_hugetlb_page(vma)) {
391  			unlink_file_vma(vma);
392  			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
393  				floor, next ? next->vm_start : ceiling);
394  		} else {
395  			unlink_file_vma_batch_init(&vb);
396  			unlink_file_vma_batch_add(&vb, vma);
397  
398  			/*
399  			 * Optimization: gather nearby vmas into one call down
400  			 */
401  			while (next && next->vm_start <= vma->vm_end + PMD_SIZE
402  			       && !is_vm_hugetlb_page(next)) {
403  				vma = next;
404  				next = mas_find(mas, ceiling - 1);
405  				if (unlikely(xa_is_zero(next)))
406  					next = NULL;
407  				if (mm_wr_locked)
408  					vma_start_write(vma);
409  				unlink_anon_vmas(vma);
410  				unlink_file_vma_batch_add(&vb, vma);
411  			}
412  			unlink_file_vma_batch_final(&vb);
413  			free_pgd_range(tlb, addr, vma->vm_end,
414  				floor, next ? next->vm_start : ceiling);
415  		}
416  		vma = next;
417  	} while (vma);
418  }
419  
420  void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
421  {
422  	spinlock_t *ptl = pmd_lock(mm, pmd);
423  
424  	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
425  		mm_inc_nr_ptes(mm);
426  		/*
427  		 * Ensure all pte setup (eg. pte page lock and page clearing) are
428  		 * visible before the pte is made visible to other CPUs by being
429  		 * put into page tables.
430  		 *
431  		 * The other side of the story is the pointer chasing in the page
432  		 * table walking code (when walking the page table without locking;
433  		 * ie. most of the time). Fortunately, these data accesses consist
434  		 * of a chain of data-dependent loads, meaning most CPUs (alpha
435  		 * being the notable exception) will already guarantee loads are
436  		 * seen in-order. See the alpha page table accessors for the
437  		 * smp_rmb() barriers in page table walking code.
438  		 */
439  		smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
440  		pmd_populate(mm, pmd, *pte);
441  		*pte = NULL;
442  	}
443  	spin_unlock(ptl);
444  }
445  
446  int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
447  {
448  	pgtable_t new = pte_alloc_one(mm);
449  	if (!new)
450  		return -ENOMEM;
451  
452  	pmd_install(mm, pmd, &new);
453  	if (new)
454  		pte_free(mm, new);
455  	return 0;
456  }
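/*
 * Note: callers normally reach __pte_alloc() through the pte_alloc() /
 * pte_alloc_map_lock() helpers in <linux/mm.h>, which boil down to
 * (simplified sketch):
 *
 *	if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, pmd))
 *		goto oom;
 *
 * so the allocation is only attempted while the PMD entry looks empty;
 * pmd_install() above handles the race where another thread populated it
 * first.
 */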
457  
458  int __pte_alloc_kernel(pmd_t *pmd)
459  {
460  	pte_t *new = pte_alloc_one_kernel(&init_mm);
461  	if (!new)
462  		return -ENOMEM;
463  
464  	spin_lock(&init_mm.page_table_lock);
465  	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
466  		smp_wmb(); /* See comment in pmd_install() */
467  		pmd_populate_kernel(&init_mm, pmd, new);
468  		new = NULL;
469  	}
470  	spin_unlock(&init_mm.page_table_lock);
471  	if (new)
472  		pte_free_kernel(&init_mm, new);
473  	return 0;
474  }
475  
476  static inline void init_rss_vec(int *rss)
477  {
478  	memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
479  }
480  
481  static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
482  {
483  	int i;
484  
485  	for (i = 0; i < NR_MM_COUNTERS; i++)
486  		if (rss[i])
487  			add_mm_counter(mm, i, rss[i]);
488  }
489  
490  /*
491   * This function is called to print an error when a bad pte
492   * is found. For example, we might have a PFN-mapped pte in
493   * a region that doesn't allow it.
494   *
495   * The calling function must still handle the error.
496   */
497  static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
498  			  pte_t pte, struct page *page)
499  {
500  	pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
501  	p4d_t *p4d = p4d_offset(pgd, addr);
502  	pud_t *pud = pud_offset(p4d, addr);
503  	pmd_t *pmd = pmd_offset(pud, addr);
504  	struct address_space *mapping;
505  	pgoff_t index;
506  	static unsigned long resume;
507  	static unsigned long nr_shown;
508  	static unsigned long nr_unshown;
509  
510  	/*
511  	 * Allow a burst of 60 reports, then keep quiet for that minute;
512  	 * or allow a steady drip of one report per second.
513  	 */
514  	if (nr_shown == 60) {
515  		if (time_before(jiffies, resume)) {
516  			nr_unshown++;
517  			return;
518  		}
519  		if (nr_unshown) {
520  			pr_alert("BUG: Bad page map: %lu messages suppressed\n",
521  				 nr_unshown);
522  			nr_unshown = 0;
523  		}
524  		nr_shown = 0;
525  	}
526  	if (nr_shown++ == 0)
527  		resume = jiffies + 60 * HZ;
528  
529  	mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
530  	index = linear_page_index(vma, addr);
531  
532  	pr_alert("BUG: Bad page map in process %s  pte:%08llx pmd:%08llx\n",
533  		 current->comm,
534  		 (long long)pte_val(pte), (long long)pmd_val(*pmd));
535  	if (page)
536  		dump_page(page, "bad pte");
537  	pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
538  		 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
539  	pr_alert("file:%pD fault:%ps mmap:%ps read_folio:%ps\n",
540  		 vma->vm_file,
541  		 vma->vm_ops ? vma->vm_ops->fault : NULL,
542  		 vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
543  		 mapping ? mapping->a_ops->read_folio : NULL);
544  	dump_stack();
545  	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
546  }
547  
548  /*
549   * vm_normal_page -- This function gets the "struct page" associated with a pte.
550   *
551   * "Special" mappings do not wish to be associated with a "struct page" (either
552   * it doesn't exist, or it exists but they don't want to touch it). In this
553   * case, NULL is returned here. "Normal" mappings do have a struct page.
554   *
555   * There are 2 broad cases. Firstly, an architecture may define a pte_special()
556   * pte bit, in which case this function is trivial. Secondly, an architecture
557   * may not have a spare pte bit, which requires a more complicated scheme,
558   * described below.
559   *
560   * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
561   * special mapping (even if there are underlying and valid "struct pages").
562   * COWed pages of a VM_PFNMAP are always normal.
563   *
564   * The way we recognize COWed pages within VM_PFNMAP mappings is through the
565   * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
566   * set, and the vm_pgoff will point to the first PFN mapped: thus every special
567   * mapping will always honor the rule
568   *
569   *	pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
570   *
571   * And for normal mappings this is false.
572   *
573   * This restricts such mappings to be a linear translation from virtual address
574   * to pfn. To get around this restriction, we allow arbitrary mappings so long
575   * as the vma is not a COW mapping; in that case, we know that all ptes are
576   * special (because none can have been COWed).
577   *
578   *
579   * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
580   *
581   * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
582   * page" backing, however the difference is that _all_ pages with a struct
583   * page (that is, those where pfn_valid is true) are refcounted and considered
584   * normal pages by the VM. The only exceptions are zeropages, which are
585   * *never* refcounted.
586   *
587   * The disadvantage is that pages are refcounted (which can be slower and
588   * simply not an option for some PFNMAP users). The advantage is that we
589   * don't have to follow the strict linearity rule of PFNMAP mappings in
590   * order to support COWable mappings.
591   *
592   */
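/*
 * Illustrative example of the linearity rule above: if remap_pfn_range()
 * set up a COW-able VM_PFNMAP vma whose first mapped pfn is 0x1000, then
 * vm_pgoff is 0x1000 and every original pte at page offset "off" maps pfn
 * 0x1000 + off, so it is recognized as special; a pte whose pfn breaks that
 * identity can only be a COWed replacement page and is treated as normal.
 */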
593  struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
594  			    pte_t pte)
595  {
596  	unsigned long pfn = pte_pfn(pte);
597  
598  	if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
599  		if (likely(!pte_special(pte)))
600  			goto check_pfn;
601  		if (vma->vm_ops && vma->vm_ops->find_special_page)
602  			return vma->vm_ops->find_special_page(vma, addr);
603  		if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
604  			return NULL;
605  		if (is_zero_pfn(pfn))
606  			return NULL;
607  		if (pte_devmap(pte))
608  		/*
609  		 * NOTE: New users of ZONE_DEVICE will not set pte_devmap()
610  		 * and will have refcounts incremented on their struct pages
611  		 * when they are inserted into PTEs, thus they are safe to
612  		 * return here. Legacy ZONE_DEVICE pages that set pte_devmap()
613  		 * do not have refcounts. Example of legacy ZONE_DEVICE is
614  		 * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers.
615  		 */
616  			return NULL;
617  
618  		print_bad_pte(vma, addr, pte, NULL);
619  		return NULL;
620  	}
621  
622  	/* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */
623  
624  	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
625  		if (vma->vm_flags & VM_MIXEDMAP) {
626  			if (!pfn_valid(pfn))
627  				return NULL;
628  			if (is_zero_pfn(pfn))
629  				return NULL;
630  			goto out;
631  		} else {
632  			unsigned long off;
633  			off = (addr - vma->vm_start) >> PAGE_SHIFT;
634  			if (pfn == vma->vm_pgoff + off)
635  				return NULL;
636  			if (!is_cow_mapping(vma->vm_flags))
637  				return NULL;
638  		}
639  	}
640  
641  	if (is_zero_pfn(pfn))
642  		return NULL;
643  
644  check_pfn:
645  	if (unlikely(pfn > highest_memmap_pfn)) {
646  		print_bad_pte(vma, addr, pte, NULL);
647  		return NULL;
648  	}
649  
650  	/*
651  	 * NOTE! We still have PageReserved() pages in the page tables.
652  	 * eg. VDSO mappings can cause them to exist.
653  	 */
654  out:
655  	VM_WARN_ON_ONCE(is_zero_pfn(pfn));
656  	return pfn_to_page(pfn);
657  }
658  
659  struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
660  			    pte_t pte)
661  {
662  	struct page *page = vm_normal_page(vma, addr, pte);
663  
664  	if (page)
665  		return page_folio(page);
666  	return NULL;
667  }
668  
669  #ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES
670  struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
671  				pmd_t pmd)
672  {
673  	unsigned long pfn = pmd_pfn(pmd);
674  
675  	/* Currently it's only used for huge pfnmaps */
676  	if (unlikely(pmd_special(pmd)))
677  		return NULL;
678  
679  	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
680  		if (vma->vm_flags & VM_MIXEDMAP) {
681  			if (!pfn_valid(pfn))
682  				return NULL;
683  			goto out;
684  		} else {
685  			unsigned long off;
686  			off = (addr - vma->vm_start) >> PAGE_SHIFT;
687  			if (pfn == vma->vm_pgoff + off)
688  				return NULL;
689  			if (!is_cow_mapping(vma->vm_flags))
690  				return NULL;
691  		}
692  	}
693  
694  	if (pmd_devmap(pmd))
695  		return NULL;
696  	if (is_huge_zero_pmd(pmd))
697  		return NULL;
698  	if (unlikely(pfn > highest_memmap_pfn))
699  		return NULL;
700  
701  	/*
702  	 * NOTE! We still have PageReserved() pages in the page tables.
703  	 * eg. VDSO mappings can cause them to exist.
704  	 */
705  out:
706  	return pfn_to_page(pfn);
707  }
708  
709  struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma,
710  				  unsigned long addr, pmd_t pmd)
711  {
712  	struct page *page = vm_normal_page_pmd(vma, addr, pmd);
713  
714  	if (page)
715  		return page_folio(page);
716  	return NULL;
717  }
718  #endif
719  
720  static void restore_exclusive_pte(struct vm_area_struct *vma,
721  				  struct page *page, unsigned long address,
722  				  pte_t *ptep)
723  {
724  	struct folio *folio = page_folio(page);
725  	pte_t orig_pte;
726  	pte_t pte;
727  	swp_entry_t entry;
728  
729  	orig_pte = ptep_get(ptep);
730  	pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
731  	if (pte_swp_soft_dirty(orig_pte))
732  		pte = pte_mksoft_dirty(pte);
733  
734  	entry = pte_to_swp_entry(orig_pte);
735  	if (pte_swp_uffd_wp(orig_pte))
736  		pte = pte_mkuffd_wp(pte);
737  	else if (is_writable_device_exclusive_entry(entry))
738  		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
739  
740  	VM_BUG_ON_FOLIO(pte_write(pte) && !(folio_test_anon(folio) &&
741  					    PageAnonExclusive(page)), folio);
742  
743  	/*
744  	 * No need to take a page reference as one was already
745  	 * created when the swap entry was made.
746  	 */
747  	if (folio_test_anon(folio))
748  		folio_add_anon_rmap_pte(folio, page, vma, address, RMAP_NONE);
749  	else
750  		/*
751  		 * Currently device exclusive access only supports anonymous
752  		 * memory so the entry shouldn't point to a filebacked page.
753  		 */
754  		WARN_ON_ONCE(1);
755  
756  	set_pte_at(vma->vm_mm, address, ptep, pte);
757  
758  	/*
759  	 * No need to invalidate - it was non-present before. However
760  	 * secondary CPUs may have mappings that need invalidating.
761  	 */
762  	update_mmu_cache(vma, address, ptep);
763  }
764  
765  /*
766   * Tries to restore an exclusive pte if the page lock can be acquired without
767   * sleeping.
768   */
769  static int
770  try_restore_exclusive_pte(pte_t *src_pte, struct vm_area_struct *vma,
771  			unsigned long addr)
772  {
773  	swp_entry_t entry = pte_to_swp_entry(ptep_get(src_pte));
774  	struct page *page = pfn_swap_entry_to_page(entry);
775  
776  	if (trylock_page(page)) {
777  		restore_exclusive_pte(vma, page, addr, src_pte);
778  		unlock_page(page);
779  		return 0;
780  	}
781  
782  	return -EBUSY;
783  }
784  
785  /*
786   * Copy one vm_area from one task to the other. Assumes that any page
787   * tables already present in the new task are clear over the whole range
788   * covered by this vma.
789   */
790  
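/*
 * Return value convention of copy_nonpresent_pte() (summarized for clarity):
 *   0       - the entry was copied (or intentionally skipped)
 *   -EIO    - swap_duplicate() needs a swap count continuation; the caller
 *             drops the locks and calls add_swap_count_continuation()
 *   -EBUSY  - a device exclusive entry could not be restored (page lock not
 *             immediately available); the caller bails out
 *   -ENOENT - a device exclusive entry was restored to a present pte; the
 *             caller re-reads it and copies it as a present pte
 */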
791  static unsigned long
792  copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
793  		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma,
794  		struct vm_area_struct *src_vma, unsigned long addr, int *rss)
795  {
796  	unsigned long vm_flags = dst_vma->vm_flags;
797  	pte_t orig_pte = ptep_get(src_pte);
798  	pte_t pte = orig_pte;
799  	struct folio *folio;
800  	struct page *page;
801  	swp_entry_t entry = pte_to_swp_entry(orig_pte);
802  
803  	if (likely(!non_swap_entry(entry))) {
804  		if (swap_duplicate(entry) < 0)
805  			return -EIO;
806  
807  		/* make sure dst_mm is on swapoff's mmlist. */
808  		if (unlikely(list_empty(&dst_mm->mmlist))) {
809  			spin_lock(&mmlist_lock);
810  			if (list_empty(&dst_mm->mmlist))
811  				list_add(&dst_mm->mmlist,
812  						&src_mm->mmlist);
813  			spin_unlock(&mmlist_lock);
814  		}
815  		/* Mark the swap entry as shared. */
816  		if (pte_swp_exclusive(orig_pte)) {
817  			pte = pte_swp_clear_exclusive(orig_pte);
818  			set_pte_at(src_mm, addr, src_pte, pte);
819  		}
820  		rss[MM_SWAPENTS]++;
821  	} else if (is_migration_entry(entry)) {
822  		folio = pfn_swap_entry_folio(entry);
823  
824  		rss[mm_counter(folio)]++;
825  
826  		if (!is_readable_migration_entry(entry) &&
827  				is_cow_mapping(vm_flags)) {
828  			/*
829  			 * COW mappings require pages in both parent and child
830  			 * to be set to read. A previously exclusive entry is
831  			 * now shared.
832  			 */
833  			entry = make_readable_migration_entry(
834  							swp_offset(entry));
835  			pte = swp_entry_to_pte(entry);
836  			if (pte_swp_soft_dirty(orig_pte))
837  				pte = pte_swp_mksoft_dirty(pte);
838  			if (pte_swp_uffd_wp(orig_pte))
839  				pte = pte_swp_mkuffd_wp(pte);
840  			set_pte_at(src_mm, addr, src_pte, pte);
841  		}
842  	} else if (is_device_private_entry(entry)) {
843  		page = pfn_swap_entry_to_page(entry);
844  		folio = page_folio(page);
845  
846  		/*
847  		 * Update rss count even for unaddressable pages, as
848  		 * they should be treated just like normal pages in this
849  		 * respect.
850  		 *
851  		 * We will likely want to have some new rss counters
852  		 * for unaddressable pages, at some point. But for now
853  		 * keep things as they are.
854  		 */
855  		folio_get(folio);
856  		rss[mm_counter(folio)]++;
857  		/* Cannot fail as these pages cannot get pinned. */
858  		folio_try_dup_anon_rmap_pte(folio, page, src_vma);
859  
860  		/*
861  		 * We do not preserve soft-dirty information, because so
862  		 * far, checkpoint/restore is the only feature that
863  		 * requires that. And checkpoint/restore does not work
864  		 * when a device driver is involved (you cannot easily
865  		 * save and restore device driver state).
866  		 */
867  		if (is_writable_device_private_entry(entry) &&
868  		    is_cow_mapping(vm_flags)) {
869  			entry = make_readable_device_private_entry(
870  							swp_offset(entry));
871  			pte = swp_entry_to_pte(entry);
872  			if (pte_swp_uffd_wp(orig_pte))
873  				pte = pte_swp_mkuffd_wp(pte);
874  			set_pte_at(src_mm, addr, src_pte, pte);
875  		}
876  	} else if (is_device_exclusive_entry(entry)) {
877  		/*
878  		 * Make device exclusive entries present by restoring the
879  		 * original entry then copying as for a present pte. Device
880  		 * exclusive entries currently only support private writable
881  		 * (ie. COW) mappings.
882  		 */
883  		VM_BUG_ON(!is_cow_mapping(src_vma->vm_flags));
884  		if (try_restore_exclusive_pte(src_pte, src_vma, addr))
885  			return -EBUSY;
886  		return -ENOENT;
887  	} else if (is_pte_marker_entry(entry)) {
888  		pte_marker marker = copy_pte_marker(entry, dst_vma);
889  
890  		if (marker)
891  			set_pte_at(dst_mm, addr, dst_pte,
892  				   make_pte_marker(marker));
893  		return 0;
894  	}
895  	if (!userfaultfd_wp(dst_vma))
896  		pte = pte_swp_clear_uffd_wp(pte);
897  	set_pte_at(dst_mm, addr, dst_pte, pte);
898  	return 0;
899  }
900  
901  /*
902   * Copy a present and normal page.
903   *
904   * NOTE! The usual case is that this isn't required;
905   * instead, the caller can just increase the page refcount
906   * and re-use the pte the traditional way.
907   *
908   * And if we need a pre-allocated page but don't yet have
909   * one, return a negative error to let the preallocation
910   * code know so that it can do so outside the page table
911   * lock.
912   */
913  static inline int
914  copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
915  		  pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
916  		  struct folio **prealloc, struct page *page)
917  {
918  	struct folio *new_folio;
919  	pte_t pte;
920  
921  	new_folio = *prealloc;
922  	if (!new_folio)
923  		return -EAGAIN;
924  
925  	/*
926  	 * We have a prealloc page, all good!  Take it
927  	 * over and copy the page & arm it.
928  	 */
929  
930  	if (copy_mc_user_highpage(&new_folio->page, page, addr, src_vma))
931  		return -EHWPOISON;
932  
933  	*prealloc = NULL;
934  	__folio_mark_uptodate(new_folio);
935  	folio_add_new_anon_rmap(new_folio, dst_vma, addr, RMAP_EXCLUSIVE);
936  	folio_add_lru_vma(new_folio, dst_vma);
937  	rss[MM_ANONPAGES]++;
938  
939  	/* All done, just insert the new page copy in the child */
940  	pte = mk_pte(&new_folio->page, dst_vma->vm_page_prot);
941  	pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma);
942  	if (userfaultfd_pte_wp(dst_vma, ptep_get(src_pte)))
943  		/* Uffd-wp needs to be delivered to dest pte as well */
944  		pte = pte_mkuffd_wp(pte);
945  	set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
946  	return 0;
947  }
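/*
 * Note: the -EAGAIN return above feeds the retry protocol in
 * copy_pte_range(): the caller drops both page table locks, preallocates a
 * folio with folio_prealloc() for the faulting address, and then retries the
 * same pte with *prealloc filled in.
 */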
948  
949  static __always_inline void __copy_present_ptes(struct vm_area_struct *dst_vma,
950  		struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte,
951  		pte_t pte, unsigned long addr, int nr)
952  {
953  	struct mm_struct *src_mm = src_vma->vm_mm;
954  
955  	/* If it's a COW mapping, write-protect it in both processes. */
956  	if (is_cow_mapping(src_vma->vm_flags) && pte_write(pte)) {
957  		wrprotect_ptes(src_mm, addr, src_pte, nr);
958  		pte = pte_wrprotect(pte);
959  	}
960  
961  	/* If it's a shared mapping, mark it clean in the child. */
962  	if (src_vma->vm_flags & VM_SHARED)
963  		pte = pte_mkclean(pte);
964  	pte = pte_mkold(pte);
965  
966  	if (!userfaultfd_wp(dst_vma))
967  		pte = pte_clear_uffd_wp(pte);
968  
969  	set_ptes(dst_vma->vm_mm, addr, dst_pte, pte, nr);
970  }
971  
972  /*
973   * Copy one present PTE, trying to batch-process subsequent PTEs that map
974   * consecutive pages of the same folio by copying them as well.
975   *
976   * Returns -EAGAIN if one preallocated page is required to copy the next PTE.
977   * Otherwise, returns the number of copied PTEs (at least 1).
978   */
979  static inline int
980  copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
981  		 pte_t *dst_pte, pte_t *src_pte, pte_t pte, unsigned long addr,
982  		 int max_nr, int *rss, struct folio **prealloc)
983  {
984  	struct page *page;
985  	struct folio *folio;
986  	bool any_writable;
987  	fpb_t flags = 0;
988  	int err, nr;
989  
990  	page = vm_normal_page(src_vma, addr, pte);
991  	if (unlikely(!page))
992  		goto copy_pte;
993  
994  	folio = page_folio(page);
995  
996  	/*
997  	 * If we likely have to copy, just don't bother with batching. Make
998  	 * sure that the common "small folio" case is as fast as possible
999  	 * by keeping the batching logic separate.
1000  	 */
1001  	if (unlikely(!*prealloc && folio_test_large(folio) && max_nr != 1)) {
1002  		if (src_vma->vm_flags & VM_SHARED)
1003  			flags |= FPB_IGNORE_DIRTY;
1004  		if (!vma_soft_dirty_enabled(src_vma))
1005  			flags |= FPB_IGNORE_SOFT_DIRTY;
1006  
1007  		nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags,
1008  				     &any_writable, NULL, NULL);
1009  		folio_ref_add(folio, nr);
1010  		if (folio_test_anon(folio)) {
1011  			if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page,
1012  								  nr, src_vma))) {
1013  				folio_ref_sub(folio, nr);
1014  				return -EAGAIN;
1015  			}
1016  			rss[MM_ANONPAGES] += nr;
1017  			VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio);
1018  		} else {
1019  			folio_dup_file_rmap_ptes(folio, page, nr);
1020  			rss[mm_counter_file(folio)] += nr;
1021  		}
1022  		if (any_writable)
1023  			pte = pte_mkwrite(pte, src_vma);
1024  		__copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte,
1025  				    addr, nr);
1026  		return nr;
1027  	}
1028  
1029  	folio_get(folio);
1030  	if (folio_test_anon(folio)) {
1031  		/*
1032  		 * If this page may have been pinned by the parent process,
1033  		 * copy the page immediately for the child so that we'll always
1034  		 * guarantee the pinned page won't be randomly replaced in the
1035  		 * future.
1036  		 */
1037  		if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, src_vma))) {
1038  			/* Page may be pinned, we have to copy. */
1039  			folio_put(folio);
1040  			err = copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
1041  						addr, rss, prealloc, page);
1042  			return err ? err : 1;
1043  		}
1044  		rss[MM_ANONPAGES]++;
1045  		VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio);
1046  	} else {
1047  		folio_dup_file_rmap_pte(folio, page);
1048  		rss[mm_counter_file(folio)]++;
1049  	}
1050  
1051  copy_pte:
1052  	__copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, addr, 1);
1053  	return 1;
1054  }
1055  
1056  static inline struct folio *folio_prealloc(struct mm_struct *src_mm,
1057  		struct vm_area_struct *vma, unsigned long addr, bool need_zero)
1058  {
1059  	struct folio *new_folio;
1060  
1061  	if (need_zero)
1062  		new_folio = vma_alloc_zeroed_movable_folio(vma, addr);
1063  	else
1064  		new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma,
1065  					    addr, false);
1066  
1067  	if (!new_folio)
1068  		return NULL;
1069  
1070  	if (mem_cgroup_charge(new_folio, src_mm, GFP_KERNEL)) {
1071  		folio_put(new_folio);
1072  		return NULL;
1073  	}
1074  	folio_throttle_swaprate(new_folio, GFP_KERNEL);
1075  
1076  	return new_folio;
1077  }
1078  
1079  static int
1080  copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
1081  	       pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
1082  	       unsigned long end)
1083  {
1084  	struct mm_struct *dst_mm = dst_vma->vm_mm;
1085  	struct mm_struct *src_mm = src_vma->vm_mm;
1086  	pte_t *orig_src_pte, *orig_dst_pte;
1087  	pte_t *src_pte, *dst_pte;
1088  	pte_t ptent;
1089  	spinlock_t *src_ptl, *dst_ptl;
1090  	int progress, max_nr, ret = 0;
1091  	int rss[NR_MM_COUNTERS];
1092  	swp_entry_t entry = (swp_entry_t){0};
1093  	struct folio *prealloc = NULL;
1094  	int nr;
1095  
1096  again:
1097  	progress = 0;
1098  	init_rss_vec(rss);
1099  
1100  	/*
1101  	 * copy_pmd_range()'s prior pmd_none_or_clear_bad(src_pmd), and the
1102  	 * error handling here, assume that exclusive mmap_lock on dst and src
1103  	 * protects anon from unexpected THP transitions; with shmem and file
1104  	 * protected by mmap_lock-less collapse skipping areas with anon_vma
1105  	 * (whereas vma_needs_copy() skips areas without anon_vma).  A rework
1106  	 * can remove such assumptions later, but this is good enough for now.
1107  	 */
1108  	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
1109  	if (!dst_pte) {
1110  		ret = -ENOMEM;
1111  		goto out;
1112  	}
1113  	src_pte = pte_offset_map_nolock(src_mm, src_pmd, addr, &src_ptl);
1114  	if (!src_pte) {
1115  		pte_unmap_unlock(dst_pte, dst_ptl);
1116  		/* ret == 0 */
1117  		goto out;
1118  	}
1119  	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
1120  	orig_src_pte = src_pte;
1121  	orig_dst_pte = dst_pte;
1122  	arch_enter_lazy_mmu_mode();
1123  
1124  	do {
1125  		nr = 1;
1126  
1127  		/*
1128  		 * We are holding two locks at this point - either of them
1129  		 * could generate latencies in another task on another CPU.
1130  		 */
1131  		if (progress >= 32) {
1132  			progress = 0;
1133  			if (need_resched() ||
1134  			    spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
1135  				break;
1136  		}
1137  		ptent = ptep_get(src_pte);
1138  		if (pte_none(ptent)) {
1139  			progress++;
1140  			continue;
1141  		}
1142  		if (unlikely(!pte_present(ptent))) {
1143  			ret = copy_nonpresent_pte(dst_mm, src_mm,
1144  						  dst_pte, src_pte,
1145  						  dst_vma, src_vma,
1146  						  addr, rss);
1147  			if (ret == -EIO) {
1148  				entry = pte_to_swp_entry(ptep_get(src_pte));
1149  				break;
1150  			} else if (ret == -EBUSY) {
1151  				break;
1152  			} else if (!ret) {
1153  				progress += 8;
1154  				continue;
1155  			}
1156  			ptent = ptep_get(src_pte);
1157  			VM_WARN_ON_ONCE(!pte_present(ptent));
1158  
1159  			/*
1160  			 * Device exclusive entry restored, continue by copying
1161  			 * the now present pte.
1162  			 */
1163  			WARN_ON_ONCE(ret != -ENOENT);
1164  		}
1165  		/* copy_present_ptes() will clear `*prealloc' if consumed */
1166  		max_nr = (end - addr) / PAGE_SIZE;
1167  		ret = copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte,
1168  					ptent, addr, max_nr, rss, &prealloc);
1169  		/*
1170  		 * If we need a pre-allocated page for this pte, drop the
1171  		 * locks, allocate, and try again.
1172  		 * If copy failed due to hwpoison in source page, break out.
1173  		 */
1174  		if (unlikely(ret == -EAGAIN || ret == -EHWPOISON))
1175  			break;
1176  		if (unlikely(prealloc)) {
1177  			/*
1178  			 * The preallocated folio cannot be reused for the next
1179  			 * address, since we must strictly follow the mempolicy
1180  			 * (e.g., alloc_page_vma() allocates according to the
1181  			 * address).  This can only happen if a pinned pte changed.
1182  			 */
1183  			folio_put(prealloc);
1184  			prealloc = NULL;
1185  		}
1186  		nr = ret;
1187  		progress += 8 * nr;
1188  	} while (dst_pte += nr, src_pte += nr, addr += PAGE_SIZE * nr,
1189  		 addr != end);
1190  
1191  	arch_leave_lazy_mmu_mode();
1192  	pte_unmap_unlock(orig_src_pte, src_ptl);
1193  	add_mm_rss_vec(dst_mm, rss);
1194  	pte_unmap_unlock(orig_dst_pte, dst_ptl);
1195  	cond_resched();
1196  
1197  	if (ret == -EIO) {
1198  		VM_WARN_ON_ONCE(!entry.val);
1199  		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) {
1200  			ret = -ENOMEM;
1201  			goto out;
1202  		}
1203  		entry.val = 0;
1204  	} else if (ret == -EBUSY || unlikely(ret == -EHWPOISON)) {
1205  		goto out;
1206  	} else if (ret ==  -EAGAIN) {
1207  		prealloc = folio_prealloc(src_mm, src_vma, addr, false);
1208  		if (!prealloc)
1209  			return -ENOMEM;
1210  	} else if (ret < 0) {
1211  		VM_WARN_ON_ONCE(1);
1212  	}
1213  
1214  	/* We've captured and resolved the error. Reset, try again. */
1215  	ret = 0;
1216  
1217  	if (addr != end)
1218  		goto again;
1219  out:
1220  	if (unlikely(prealloc))
1221  		folio_put(prealloc);
1222  	return ret;
1223  }
1224  
1225  static inline int
1226  copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
1227  	       pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
1228  	       unsigned long end)
1229  {
1230  	struct mm_struct *dst_mm = dst_vma->vm_mm;
1231  	struct mm_struct *src_mm = src_vma->vm_mm;
1232  	pmd_t *src_pmd, *dst_pmd;
1233  	unsigned long next;
1234  
1235  	dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
1236  	if (!dst_pmd)
1237  		return -ENOMEM;
1238  	src_pmd = pmd_offset(src_pud, addr);
1239  	do {
1240  		next = pmd_addr_end(addr, end);
1241  		if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)
1242  			|| pmd_devmap(*src_pmd)) {
1243  			int err;
1244  			VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma);
1245  			err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
1246  					    addr, dst_vma, src_vma);
1247  			if (err == -ENOMEM)
1248  				return -ENOMEM;
1249  			if (!err)
1250  				continue;
1251  			/* fall through */
1252  		}
1253  		if (pmd_none_or_clear_bad(src_pmd))
1254  			continue;
1255  		if (copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd,
1256  				   addr, next))
1257  			return -ENOMEM;
1258  	} while (dst_pmd++, src_pmd++, addr = next, addr != end);
1259  	return 0;
1260  }
1261  
1262  static inline int
1263  copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
1264  	       p4d_t *dst_p4d, p4d_t *src_p4d, unsigned long addr,
1265  	       unsigned long end)
1266  {
1267  	struct mm_struct *dst_mm = dst_vma->vm_mm;
1268  	struct mm_struct *src_mm = src_vma->vm_mm;
1269  	pud_t *src_pud, *dst_pud;
1270  	unsigned long next;
1271  
1272  	dst_pud = pud_alloc(dst_mm, dst_p4d, addr);
1273  	if (!dst_pud)
1274  		return -ENOMEM;
1275  	src_pud = pud_offset(src_p4d, addr);
1276  	do {
1277  		next = pud_addr_end(addr, end);
1278  		if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
1279  			int err;
1280  
1281  			VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma);
1282  			err = copy_huge_pud(dst_mm, src_mm,
1283  					    dst_pud, src_pud, addr, src_vma);
1284  			if (err == -ENOMEM)
1285  				return -ENOMEM;
1286  			if (!err)
1287  				continue;
1288  			/* fall through */
1289  		}
1290  		if (pud_none_or_clear_bad(src_pud))
1291  			continue;
1292  		if (copy_pmd_range(dst_vma, src_vma, dst_pud, src_pud,
1293  				   addr, next))
1294  			return -ENOMEM;
1295  	} while (dst_pud++, src_pud++, addr = next, addr != end);
1296  	return 0;
1297  }
1298  
1299  static inline int
1300  copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
1301  	       pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long addr,
1302  	       unsigned long end)
1303  {
1304  	struct mm_struct *dst_mm = dst_vma->vm_mm;
1305  	p4d_t *src_p4d, *dst_p4d;
1306  	unsigned long next;
1307  
1308  	dst_p4d = p4d_alloc(dst_mm, dst_pgd, addr);
1309  	if (!dst_p4d)
1310  		return -ENOMEM;
1311  	src_p4d = p4d_offset(src_pgd, addr);
1312  	do {
1313  		next = p4d_addr_end(addr, end);
1314  		if (p4d_none_or_clear_bad(src_p4d))
1315  			continue;
1316  		if (copy_pud_range(dst_vma, src_vma, dst_p4d, src_p4d,
1317  				   addr, next))
1318  			return -ENOMEM;
1319  	} while (dst_p4d++, src_p4d++, addr = next, addr != end);
1320  	return 0;
1321  }
1322  
1323  /*
1324   * Return true if the vma needs to copy the pgtable during this fork().  Return
1325   * false when we can speed up fork() by skipping the copy and letting the
1326   * child fault the pages in lazily when it accesses the memory range.
1327   */
1328  static bool
1329  vma_needs_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
1330  {
1331  	/*
1332  	 * Always copy pgtables when dst_vma has uffd-wp enabled even if it's
1333  	 * file-backed (e.g. shmem). When uffd-wp is enabled, the pgtable
1334  	 * carries uffd-wp protection information that cannot be recovered
1335  	 * from the page cache, so skipping the copy would lose it.
1336  	 */
1337  	if (userfaultfd_wp(dst_vma))
1338  		return true;
1339  
1340  	if (src_vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
1341  		return true;
1342  
1343  	if (src_vma->anon_vma)
1344  		return true;
1345  
1346  	/*
1347  	 * Don't copy ptes where a page fault will fill them correctly.  Fork
1348  	 * becomes much lighter when there are big shared or private readonly
1349  	 * mappings. The tradeoff is that copy_page_range is more efficient
1350  	 * than faulting.
1351  	 */
1352  	return false;
1353  }
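/*
 * Example: a large read-only private file mapping (such as a shared
 * library's text) normally has no anon_vma, is neither VM_PFNMAP nor
 * VM_MIXEDMAP and has no uffd-wp armed, so fork() skips copying its page
 * tables entirely and the child simply refaults the pages on first access.
 */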
1354  
1355  int
1356  copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
1357  {
1358  	pgd_t *src_pgd, *dst_pgd;
1359  	unsigned long next;
1360  	unsigned long addr = src_vma->vm_start;
1361  	unsigned long end = src_vma->vm_end;
1362  	struct mm_struct *dst_mm = dst_vma->vm_mm;
1363  	struct mm_struct *src_mm = src_vma->vm_mm;
1364  	struct mmu_notifier_range range;
1365  	bool is_cow;
1366  	int ret;
1367  
1368  	if (!vma_needs_copy(dst_vma, src_vma))
1369  		return 0;
1370  
1371  	if (is_vm_hugetlb_page(src_vma))
1372  		return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma);
1373  
1374  	if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
1375  		/*
1376  		 * We do not free on error cases below as remove_vma
1377  		 * gets called on error from higher level routine
1378  		 */
1379  		ret = track_pfn_copy(src_vma);
1380  		if (ret)
1381  			return ret;
1382  	}
1383  
1384  	/*
1385  	 * We need to invalidate the secondary MMU mappings only when
1386  	 * there could be a permission downgrade on the ptes of the
1387  	 * parent mm. And a permission downgrade will only happen if
1388  	 * is_cow_mapping() returns true.
1389  	 */
1390  	is_cow = is_cow_mapping(src_vma->vm_flags);
1391  
1392  	if (is_cow) {
1393  		mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
1394  					0, src_mm, addr, end);
1395  		mmu_notifier_invalidate_range_start(&range);
1396  		/*
1397  		 * Disabling preemption is not needed for the write side, as
1398  		 * the read side doesn't spin, but goes to the mmap_lock.
1399  		 *
1400  		 * Use the raw variant of the seqcount_t write API to avoid
1401  		 * lockdep complaining about preemptibility.
1402  		 */
1403  		vma_assert_write_locked(src_vma);
1404  		raw_write_seqcount_begin(&src_mm->write_protect_seq);
1405  	}
1406  
1407  	ret = 0;
1408  	dst_pgd = pgd_offset(dst_mm, addr);
1409  	src_pgd = pgd_offset(src_mm, addr);
1410  	do {
1411  		next = pgd_addr_end(addr, end);
1412  		if (pgd_none_or_clear_bad(src_pgd))
1413  			continue;
1414  		if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
1415  					    addr, next))) {
1416  			untrack_pfn_clear(dst_vma);
1417  			ret = -ENOMEM;
1418  			break;
1419  		}
1420  	} while (dst_pgd++, src_pgd++, addr = next, addr != end);
1421  
1422  	if (is_cow) {
1423  		raw_write_seqcount_end(&src_mm->write_protect_seq);
1424  		mmu_notifier_invalidate_range_end(&range);
1425  	}
1426  	return ret;
1427  }
1428  
1429  /* Whether we should zap all COWed (private) pages too */
1430  static inline bool should_zap_cows(struct zap_details *details)
1431  {
1432  	/* By default, zap all pages */
1433  	if (!details)
1434  		return true;
1435  
1436  	/* Or, we zap COWed pages only if the caller wants to */
1437  	return details->even_cows;
1438  }
1439  
1440  /* Decides whether we should zap this folio with the folio pointer specified */
1441  static inline bool should_zap_folio(struct zap_details *details,
1442  				    struct folio *folio)
1443  {
1444  	/* If we can make a decision without *folio.. */
1445  	if (should_zap_cows(details))
1446  		return true;
1447  
1448  	/* Otherwise we should only zap non-anon folios */
1449  	return !folio_test_anon(folio);
1450  }
1451  
1452  static inline bool zap_drop_file_uffd_wp(struct zap_details *details)
1453  {
1454  	if (!details)
1455  		return false;
1456  
1457  	return details->zap_flags & ZAP_FLAG_DROP_MARKER;
1458  }
1459  
1460  /*
1461   * This function makes sure that we'll replace the none pte with an uffd-wp
1462   * swap special pte marker when necessary. Must be called with the pgtable lock held.
1463   */
1464  static inline void
1465  zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
1466  			      unsigned long addr, pte_t *pte, int nr,
1467  			      struct zap_details *details, pte_t pteval)
1468  {
1469  	/* Zap on anonymous always means dropping everything */
1470  	if (vma_is_anonymous(vma))
1471  		return;
1472  
1473  	if (zap_drop_file_uffd_wp(details))
1474  		return;
1475  
1476  	for (;;) {
1477  		/* the PFN in the PTE is irrelevant. */
1478  		pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
1479  		if (--nr == 0)
1480  			break;
1481  		pte++;
1482  		addr += PAGE_SIZE;
1483  	}
1484  }
1485  
1486  static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb,
1487  		struct vm_area_struct *vma, struct folio *folio,
1488  		struct page *page, pte_t *pte, pte_t ptent, unsigned int nr,
1489  		unsigned long addr, struct zap_details *details, int *rss,
1490  		bool *force_flush, bool *force_break)
1491  {
1492  	struct mm_struct *mm = tlb->mm;
1493  	bool delay_rmap = false;
1494  
1495  	if (!folio_test_anon(folio)) {
1496  		ptent = get_and_clear_full_ptes(mm, addr, pte, nr, tlb->fullmm);
1497  		if (pte_dirty(ptent)) {
1498  			folio_mark_dirty(folio);
1499  			if (tlb_delay_rmap(tlb)) {
1500  				delay_rmap = true;
1501  				*force_flush = true;
1502  			}
1503  		}
1504  		if (pte_young(ptent) && likely(vma_has_recency(vma)))
1505  			folio_mark_accessed(folio);
1506  		rss[mm_counter(folio)] -= nr;
1507  	} else {
1508  		/* We don't need up-to-date accessed/dirty bits. */
1509  		clear_full_ptes(mm, addr, pte, nr, tlb->fullmm);
1510  		rss[MM_ANONPAGES] -= nr;
1511  	}
1512  	/* Checking a single PTE in a batch is sufficient. */
1513  	arch_check_zapped_pte(vma, ptent);
1514  	tlb_remove_tlb_entries(tlb, pte, nr, addr);
1515  	if (unlikely(userfaultfd_pte_wp(vma, ptent)))
1516  		zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details,
1517  					      ptent);
1518  
1519  	if (!delay_rmap) {
1520  		folio_remove_rmap_ptes(folio, page, nr, vma);
1521  
1522  		if (unlikely(folio_mapcount(folio) < 0))
1523  			print_bad_pte(vma, addr, ptent, page);
1524  	}
1525  	if (unlikely(__tlb_remove_folio_pages(tlb, page, nr, delay_rmap))) {
1526  		*force_flush = true;
1527  		*force_break = true;
1528  	}
1529  }
1530  
1531  /*
1532   * Zap or skip at least one present PTE, trying to batch-process subsequent
1533   * PTEs that map consecutive pages of the same folio.
1534   *
1535   * Returns the number of processed (skipped or zapped) PTEs (at least 1).
1536   */
1537  static inline int zap_present_ptes(struct mmu_gather *tlb,
1538  		struct vm_area_struct *vma, pte_t *pte, pte_t ptent,
1539  		unsigned int max_nr, unsigned long addr,
1540  		struct zap_details *details, int *rss, bool *force_flush,
1541  		bool *force_break)
1542  {
1543  	const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
1544  	struct mm_struct *mm = tlb->mm;
1545  	struct folio *folio;
1546  	struct page *page;
1547  	int nr;
1548  
1549  	page = vm_normal_page(vma, addr, ptent);
1550  	if (!page) {
1551  		/* We don't need up-to-date accessed/dirty bits. */
1552  		ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
1553  		arch_check_zapped_pte(vma, ptent);
1554  		tlb_remove_tlb_entry(tlb, pte, addr);
1555  		if (userfaultfd_pte_wp(vma, ptent))
1556  			zap_install_uffd_wp_if_needed(vma, addr, pte, 1,
1557  						      details, ptent);
1558  		ksm_might_unmap_zero_page(mm, ptent);
1559  		return 1;
1560  	}
1561  
1562  	folio = page_folio(page);
1563  	if (unlikely(!should_zap_folio(details, folio)))
1564  		return 1;
1565  
1566  	/*
1567  	 * Make sure that the common "small folio" case is as fast as possible
1568  	 * by keeping the batching logic separate.
1569  	 */
1570  	if (unlikely(folio_test_large(folio) && max_nr != 1)) {
1571  		nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags,
1572  				     NULL, NULL, NULL);
1573  
1574  		zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr,
1575  				       addr, details, rss, force_flush,
1576  				       force_break);
1577  		return nr;
1578  	}
1579  	zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, 1, addr,
1580  			       details, rss, force_flush, force_break);
1581  	return 1;
1582  }
1583  
1584  static unsigned long zap_pte_range(struct mmu_gather *tlb,
1585  				struct vm_area_struct *vma, pmd_t *pmd,
1586  				unsigned long addr, unsigned long end,
1587  				struct zap_details *details)
1588  {
1589  	bool force_flush = false, force_break = false;
1590  	struct mm_struct *mm = tlb->mm;
1591  	int rss[NR_MM_COUNTERS];
1592  	spinlock_t *ptl;
1593  	pte_t *start_pte;
1594  	pte_t *pte;
1595  	swp_entry_t entry;
1596  	int nr;
1597  
1598  	tlb_change_page_size(tlb, PAGE_SIZE);
1599  	init_rss_vec(rss);
1600  	start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
1601  	if (!pte)
1602  		return addr;
1603  
1604  	flush_tlb_batched_pending(mm);
1605  	arch_enter_lazy_mmu_mode();
1606  	do {
1607  		pte_t ptent = ptep_get(pte);
1608  		struct folio *folio;
1609  		struct page *page;
1610  		int max_nr;
1611  
1612  		nr = 1;
1613  		if (pte_none(ptent))
1614  			continue;
1615  
1616  		if (need_resched())
1617  			break;
1618  
1619  		if (pte_present(ptent)) {
1620  			max_nr = (end - addr) / PAGE_SIZE;
1621  			nr = zap_present_ptes(tlb, vma, pte, ptent, max_nr,
1622  					      addr, details, rss, &force_flush,
1623  					      &force_break);
1624  			if (unlikely(force_break)) {
1625  				addr += nr * PAGE_SIZE;
1626  				break;
1627  			}
1628  			continue;
1629  		}
1630  
1631  		entry = pte_to_swp_entry(ptent);
1632  		if (is_device_private_entry(entry) ||
1633  		    is_device_exclusive_entry(entry)) {
1634  			page = pfn_swap_entry_to_page(entry);
1635  			folio = page_folio(page);
1636  			if (unlikely(!should_zap_folio(details, folio)))
1637  				continue;
1638  			/*
1639  			 * Both device private/exclusive mappings should only
1640  			 * work with anonymous pages so far, so we don't need to
1641  			 * consider the uffd-wp bit when zapping. For more information,
1642  			 * see zap_install_uffd_wp_if_needed().
1643  			 */
1644  			WARN_ON_ONCE(!vma_is_anonymous(vma));
1645  			rss[mm_counter(folio)]--;
1646  			if (is_device_private_entry(entry))
1647  				folio_remove_rmap_pte(folio, page, vma);
1648  			folio_put(folio);
1649  		} else if (!non_swap_entry(entry)) {
1650  			max_nr = (end - addr) / PAGE_SIZE;
1651  			nr = swap_pte_batch(pte, max_nr, ptent);
1652  		/* Genuine swap entries, hence private anon pages */
1653  			if (!should_zap_cows(details))
1654  				continue;
1655  			rss[MM_SWAPENTS] -= nr;
1656  			free_swap_and_cache_nr(entry, nr);
1657  		} else if (is_migration_entry(entry)) {
1658  			folio = pfn_swap_entry_folio(entry);
1659  			if (!should_zap_folio(details, folio))
1660  				continue;
1661  			rss[mm_counter(folio)]--;
1662  		} else if (pte_marker_entry_uffd_wp(entry)) {
1663  			/*
1664  			 * For anon: always drop the marker; for file: only
1665  			 * drop the marker if explicitly requested.
1666  			 */
1667  			if (!vma_is_anonymous(vma) &&
1668  			    !zap_drop_file_uffd_wp(details))
1669  				continue;
1670  		} else if (is_hwpoison_entry(entry) ||
1671  			   is_poisoned_swp_entry(entry)) {
1672  			if (!should_zap_cows(details))
1673  				continue;
1674  		} else {
1675  			/* We should have covered all the swap entry types */
1676  			pr_alert("unrecognized swap entry 0x%lx\n", entry.val);
1677  			WARN_ON_ONCE(1);
1678  		}
1679  		clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
1680  		zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent);
1681  	} while (pte += nr, addr += PAGE_SIZE * nr, addr != end);
1682  
1683  	add_mm_rss_vec(mm, rss);
1684  	arch_leave_lazy_mmu_mode();
1685  
1686  	/* Do the actual TLB flush before dropping ptl */
1687  	if (force_flush) {
1688  		tlb_flush_mmu_tlbonly(tlb);
1689  		tlb_flush_rmaps(tlb, vma);
1690  	}
1691  	pte_unmap_unlock(start_pte, ptl);
1692  
1693  	/*
1694  	 * If we forced a TLB flush (either due to running out of
1695  	 * batch buffers or because we needed to flush dirty TLB
1696  	 * entries before releasing the ptl), free the batched
1697  	 * memory too. Come back again if we didn't do everything.
1698  	 */
1699  	if (force_flush)
1700  		tlb_flush_mmu(tlb);
1701  
1702  	return addr;
1703  }
1704  
1705  static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1706  				struct vm_area_struct *vma, pud_t *pud,
1707  				unsigned long addr, unsigned long end,
1708  				struct zap_details *details)
1709  {
1710  	pmd_t *pmd;
1711  	unsigned long next;
1712  
1713  	pmd = pmd_offset(pud, addr);
1714  	do {
1715  		next = pmd_addr_end(addr, end);
1716  		if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
1717  			if (next - addr != HPAGE_PMD_SIZE)
1718  				__split_huge_pmd(vma, pmd, addr, false, NULL);
1719  			else if (zap_huge_pmd(tlb, vma, pmd, addr)) {
1720  				addr = next;
1721  				continue;
1722  			}
1723  			/* fall through */
1724  		} else if (details && details->single_folio &&
1725  			   folio_test_pmd_mappable(details->single_folio) &&
1726  			   next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) {
1727  			spinlock_t *ptl = pmd_lock(tlb->mm, pmd);
1728  			/*
1729  			 * Take and drop THP pmd lock so that we cannot return
1730  			 * prematurely, while zap_huge_pmd() has cleared *pmd,
1731  			 * but not yet decremented compound_mapcount().
1732  			 */
1733  			spin_unlock(ptl);
1734  		}
1735  		if (pmd_none(*pmd)) {
1736  			addr = next;
1737  			continue;
1738  		}
1739  		addr = zap_pte_range(tlb, vma, pmd, addr, next, details);
1740  		if (addr != next)
1741  			pmd--;
1742  	} while (pmd++, cond_resched(), addr != end);
1743  
1744  	return addr;
1745  }
1746  
1747  static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1748  				struct vm_area_struct *vma, p4d_t *p4d,
1749  				unsigned long addr, unsigned long end,
1750  				struct zap_details *details)
1751  {
1752  	pud_t *pud;
1753  	unsigned long next;
1754  
1755  	pud = pud_offset(p4d, addr);
1756  	do {
1757  		next = pud_addr_end(addr, end);
1758  		if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
1759  			if (next - addr != HPAGE_PUD_SIZE) {
1760  				mmap_assert_locked(tlb->mm);
1761  				split_huge_pud(vma, pud, addr);
1762  			} else if (zap_huge_pud(tlb, vma, pud, addr))
1763  				goto next;
1764  			/* fall through */
1765  		}
1766  		if (pud_none_or_clear_bad(pud))
1767  			continue;
1768  		next = zap_pmd_range(tlb, vma, pud, addr, next, details);
1769  next:
1770  		cond_resched();
1771  	} while (pud++, addr = next, addr != end);
1772  
1773  	return addr;
1774  }
1775  
1776  static inline unsigned long zap_p4d_range(struct mmu_gather *tlb,
1777  				struct vm_area_struct *vma, pgd_t *pgd,
1778  				unsigned long addr, unsigned long end,
1779  				struct zap_details *details)
1780  {
1781  	p4d_t *p4d;
1782  	unsigned long next;
1783  
1784  	p4d = p4d_offset(pgd, addr);
1785  	do {
1786  		next = p4d_addr_end(addr, end);
1787  		if (p4d_none_or_clear_bad(p4d))
1788  			continue;
1789  		next = zap_pud_range(tlb, vma, p4d, addr, next, details);
1790  	} while (p4d++, addr = next, addr != end);
1791  
1792  	return addr;
1793  }
1794  
1795  void unmap_page_range(struct mmu_gather *tlb,
1796  			     struct vm_area_struct *vma,
1797  			     unsigned long addr, unsigned long end,
1798  			     struct zap_details *details)
1799  {
1800  	pgd_t *pgd;
1801  	unsigned long next;
1802  
1803  	BUG_ON(addr >= end);
1804  	tlb_start_vma(tlb, vma);
1805  	pgd = pgd_offset(vma->vm_mm, addr);
1806  	do {
1807  		next = pgd_addr_end(addr, end);
1808  		if (pgd_none_or_clear_bad(pgd))
1809  			continue;
1810  		next = zap_p4d_range(tlb, vma, pgd, addr, next, details);
1811  	} while (pgd++, addr = next, addr != end);
1812  	tlb_end_vma(tlb, vma);
1813  }
1814  
1815  
1816  static void unmap_single_vma(struct mmu_gather *tlb,
1817  		struct vm_area_struct *vma, unsigned long start_addr,
1818  		unsigned long end_addr,
1819  		struct zap_details *details, bool mm_wr_locked)
1820  {
1821  	unsigned long start = max(vma->vm_start, start_addr);
1822  	unsigned long end;
1823  
1824  	if (start >= vma->vm_end)
1825  		return;
1826  	end = min(vma->vm_end, end_addr);
1827  	if (end <= vma->vm_start)
1828  		return;
1829  
1830  	if (vma->vm_file)
1831  		uprobe_munmap(vma, start, end);
1832  
1833  	if (unlikely(vma->vm_flags & VM_PFNMAP))
1834  		untrack_pfn(vma, 0, 0, mm_wr_locked);
1835  
1836  	if (start != end) {
1837  		if (unlikely(is_vm_hugetlb_page(vma))) {
1838  			/*
1839  			 * It is undesirable to test vma->vm_file as it
1840  			 * should be non-NULL for a valid hugetlb area.
1841  			 * However, vm_file will be NULL in the error
1842  			 * cleanup path of mmap_region. When
1843  			 * hugetlbfs ->mmap method fails,
1844  			 * mmap_region() nullifies vma->vm_file
1845  			 * before calling this function to clean up.
1846  			 * Since no pte has actually been set up, it is
1847  			 * safe to do nothing in this case.
1848  			 */
1849  			if (vma->vm_file) {
1850  				zap_flags_t zap_flags = details ?
1851  				    details->zap_flags : 0;
1852  				__unmap_hugepage_range(tlb, vma, start, end,
1853  							     NULL, zap_flags);
1854  			}
1855  		} else
1856  			unmap_page_range(tlb, vma, start, end, details);
1857  	}
1858  }
1859  
1860  /**
1861   * unmap_vmas - unmap a range of memory covered by a list of vma's
1862   * @tlb: address of the caller's struct mmu_gather
1863   * @mas: the maple state
1864   * @vma: the starting vma
1865   * @start_addr: virtual address at which to start unmapping
1866   * @end_addr: virtual address at which to end unmapping
1867   * @tree_end: The maximum index to check
1868   * @mm_wr_locked: whether the mmap_lock is held for write
1869   *
1870   * Unmap all pages in the vma list.
1871   *
1872   * Only addresses between @start_addr and @end_addr will be unmapped.
1873   *
1874   * The VMA list must be sorted in ascending virtual address order.
1875   *
1876   * unmap_vmas() assumes that the caller will flush the whole unmapped address
1877   * range after unmap_vmas() returns.  So the only responsibility here is to
1878   * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
1879   * drops the lock and schedules.
1880   */
1881  void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
1882  		struct vm_area_struct *vma, unsigned long start_addr,
1883  		unsigned long end_addr, unsigned long tree_end,
1884  		bool mm_wr_locked)
1885  {
1886  	struct mmu_notifier_range range;
1887  	struct zap_details details = {
1888  		.zap_flags = ZAP_FLAG_DROP_MARKER | ZAP_FLAG_UNMAP,
1889  		/* Careful - we need to zap private pages too! */
1890  		.even_cows = true,
1891  	};
1892  
1893  	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
1894  				start_addr, end_addr);
1895  	mmu_notifier_invalidate_range_start(&range);
1896  	do {
1897  		unsigned long start = start_addr;
1898  		unsigned long end = end_addr;
1899  		hugetlb_zap_begin(vma, &start, &end);
1900  		unmap_single_vma(tlb, vma, start, end, &details,
1901  				 mm_wr_locked);
1902  		hugetlb_zap_end(vma, &details);
1903  		vma = mas_find(mas, tree_end - 1);
1904  	} while (vma && likely(!xa_is_zero(vma)));
1905  	mmu_notifier_invalidate_range_end(&range);
1906  }
1907  
1908  /**
1909   * zap_page_range_single - remove user pages in a given range
1910   * @vma: vm_area_struct holding the applicable pages
1911   * @address: starting address of pages to zap
1912   * @size: number of bytes to zap
1913   * @details: details of shared cache invalidation
1914   *
1915   * The range must fit into one VMA.
1916   */
1917  void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
1918  		unsigned long size, struct zap_details *details)
1919  {
1920  	const unsigned long end = address + size;
1921  	struct mmu_notifier_range range;
1922  	struct mmu_gather tlb;
1923  
1924  	lru_add_drain();
1925  	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
1926  				address, end);
1927  	hugetlb_zap_begin(vma, &range.start, &range.end);
1928  	tlb_gather_mmu(&tlb, vma->vm_mm);
1929  	update_hiwater_rss(vma->vm_mm);
1930  	mmu_notifier_invalidate_range_start(&range);
1931  	/*
1932  	 * unmap 'address-end' not 'range.start-range.end' as range
1933  	 * could have been expanded for hugetlb pmd sharing.
1934  	 */
1935  	unmap_single_vma(&tlb, vma, address, end, details, false);
1936  	mmu_notifier_invalidate_range_end(&range);
1937  	tlb_finish_mmu(&tlb);
1938  	hugetlb_zap_end(vma, details);
1939  }
1940  
1941  /**
1942   * zap_vma_ptes - remove ptes mapping the vma
1943   * @vma: vm_area_struct holding ptes to be zapped
1944   * @address: starting address of pages to zap
1945   * @size: number of bytes to zap
1946   *
1947   * This function only unmaps ptes assigned to VM_PFNMAP vmas.
1948   *
1949   * The entire address range must be fully contained within the vma.
1950   *
1951   */
1952  void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1953  		unsigned long size)
1954  {
1955  	if (!range_in_vma(vma, address, address + size) ||
1956  	    !(vma->vm_flags & VM_PFNMAP))
1957  		return;
1958  
1959  	zap_page_range_single(vma, address, size, NULL);
1960  }
1961  EXPORT_SYMBOL_GPL(zap_vma_ptes);
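
/*
 * Illustrative sketch, not part of memory.c: a driver that earlier
 * populated a VM_PFNMAP vma (for example via remap_pfn_range()) can
 * revoke the whole mapping again with zap_vma_ptes().  The helper name
 * below is hypothetical; it only uses APIs defined in this file.
 */
#if 0	/* example only, not compiled */
static void example_revoke_user_mapping(struct vm_area_struct *vma)
{
	/* Tear down every PTE in the vma; later accesses will fault. */
	zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
}
#endif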
1962  
1963  static pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr)
1964  {
1965  	pgd_t *pgd;
1966  	p4d_t *p4d;
1967  	pud_t *pud;
1968  	pmd_t *pmd;
1969  
1970  	pgd = pgd_offset(mm, addr);
1971  	p4d = p4d_alloc(mm, pgd, addr);
1972  	if (!p4d)
1973  		return NULL;
1974  	pud = pud_alloc(mm, p4d, addr);
1975  	if (!pud)
1976  		return NULL;
1977  	pmd = pmd_alloc(mm, pud, addr);
1978  	if (!pmd)
1979  		return NULL;
1980  
1981  	VM_BUG_ON(pmd_trans_huge(*pmd));
1982  	return pmd;
1983  }
1984  
1985  pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
1986  			spinlock_t **ptl)
1987  {
1988  	pmd_t *pmd = walk_to_pmd(mm, addr);
1989  
1990  	if (!pmd)
1991  		return NULL;
1992  	return pte_alloc_map_lock(mm, pmd, addr, ptl);
1993  }
1994  
1995  static bool vm_mixed_zeropage_allowed(struct vm_area_struct *vma)
1996  {
1997  	VM_WARN_ON_ONCE(vma->vm_flags & VM_PFNMAP);
1998  	/*
1999  	 * Whoever wants to forbid the zeropage after some zeropages
2000  	 * might already have been mapped has to scan the page tables and
2001  	 * bail out on any zeropages. Zeropages in COW mappings can
2002  	 * be unshared using FAULT_FLAG_UNSHARE faults.
2003  	 */
2004  	if (mm_forbids_zeropage(vma->vm_mm))
2005  		return false;
2006  	/* zeropages in COW mappings are common and unproblematic. */
2007  	if (is_cow_mapping(vma->vm_flags))
2008  		return true;
2009  	/* Mappings that do not allow for writable PTEs are unproblematic. */
2010  	if (!(vma->vm_flags & (VM_WRITE | VM_MAYWRITE)))
2011  		return true;
2012  	/*
2013  	 * Why not allow any VMA that has vm_ops->pfn_mkwrite? GUP could
2014  	 * find the shared zeropage and longterm-pin it, which would
2015  	 * be problematic as soon as the zeropage gets replaced by a different
2016  	 * page due to vma->vm_ops->pfn_mkwrite, because what's mapped would
2017  	 * now differ from what GUP looked up. FSDAX is incompatible with
2018  	 * FOLL_LONGTERM and VM_IO is incompatible with GUP completely (see
2019  	 * check_vma_flags).
2020  	 */
2021  	return vma->vm_ops && vma->vm_ops->pfn_mkwrite &&
2022  	       (vma_is_fsdax(vma) || vma->vm_flags & VM_IO);
2023  }
2024  
2025  static int validate_page_before_insert(struct vm_area_struct *vma,
2026  				       struct page *page)
2027  {
2028  	struct folio *folio = page_folio(page);
2029  
2030  	if (!folio_ref_count(folio))
2031  		return -EINVAL;
2032  	if (unlikely(is_zero_folio(folio))) {
2033  		if (!vm_mixed_zeropage_allowed(vma))
2034  			return -EINVAL;
2035  		return 0;
2036  	}
2037  	if (folio_test_anon(folio) || folio_test_slab(folio) ||
2038  	    page_has_type(page))
2039  		return -EINVAL;
2040  	flush_dcache_folio(folio);
2041  	return 0;
2042  }
2043  
2044  static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
2045  			unsigned long addr, struct page *page, pgprot_t prot)
2046  {
2047  	struct folio *folio = page_folio(page);
2048  	pte_t pteval;
2049  
2050  	if (!pte_none(ptep_get(pte)))
2051  		return -EBUSY;
2052  	/* Ok, finally just insert the thing.. */
2053  	pteval = mk_pte(page, prot);
2054  	if (unlikely(is_zero_folio(folio))) {
2055  		pteval = pte_mkspecial(pteval);
2056  	} else {
2057  		folio_get(folio);
2058  		inc_mm_counter(vma->vm_mm, mm_counter_file(folio));
2059  		folio_add_file_rmap_pte(folio, page, vma);
2060  	}
2061  	set_pte_at(vma->vm_mm, addr, pte, pteval);
2062  	return 0;
2063  }
2064  
2065  static int insert_page(struct vm_area_struct *vma, unsigned long addr,
2066  			struct page *page, pgprot_t prot)
2067  {
2068  	int retval;
2069  	pte_t *pte;
2070  	spinlock_t *ptl;
2071  
2072  	retval = validate_page_before_insert(vma, page);
2073  	if (retval)
2074  		goto out;
2075  	retval = -ENOMEM;
2076  	pte = get_locked_pte(vma->vm_mm, addr, &ptl);
2077  	if (!pte)
2078  		goto out;
2079  	retval = insert_page_into_pte_locked(vma, pte, addr, page, prot);
2080  	pte_unmap_unlock(pte, ptl);
2081  out:
2082  	return retval;
2083  }
2084  
2085  static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte,
2086  			unsigned long addr, struct page *page, pgprot_t prot)
2087  {
2088  	int err;
2089  
2090  	err = validate_page_before_insert(vma, page);
2091  	if (err)
2092  		return err;
2093  	return insert_page_into_pte_locked(vma, pte, addr, page, prot);
2094  }
2095  
2096  /* insert_pages() amortizes the cost of spinlock operations
2097   * when inserting pages in a loop.
2098   */
2099  static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
2100  			struct page **pages, unsigned long *num, pgprot_t prot)
2101  {
2102  	pmd_t *pmd = NULL;
2103  	pte_t *start_pte, *pte;
2104  	spinlock_t *pte_lock;
2105  	struct mm_struct *const mm = vma->vm_mm;
2106  	unsigned long curr_page_idx = 0;
2107  	unsigned long remaining_pages_total = *num;
2108  	unsigned long pages_to_write_in_pmd;
2109  	int ret;
2110  more:
2111  	ret = -EFAULT;
2112  	pmd = walk_to_pmd(mm, addr);
2113  	if (!pmd)
2114  		goto out;
2115  
2116  	pages_to_write_in_pmd = min_t(unsigned long,
2117  		remaining_pages_total, PTRS_PER_PTE - pte_index(addr));
2118  
2119  	/* Allocate the PTE if necessary; takes PMD lock once only. */
2120  	ret = -ENOMEM;
2121  	if (pte_alloc(mm, pmd))
2122  		goto out;
2123  
2124  	while (pages_to_write_in_pmd) {
2125  		int pte_idx = 0;
2126  		const int batch_size = min_t(int, pages_to_write_in_pmd, 8);
2127  
2128  		start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock);
2129  		if (!start_pte) {
2130  			ret = -EFAULT;
2131  			goto out;
2132  		}
2133  		for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) {
2134  			int err = insert_page_in_batch_locked(vma, pte,
2135  				addr, pages[curr_page_idx], prot);
2136  			if (unlikely(err)) {
2137  				pte_unmap_unlock(start_pte, pte_lock);
2138  				ret = err;
2139  				remaining_pages_total -= pte_idx;
2140  				goto out;
2141  			}
2142  			addr += PAGE_SIZE;
2143  			++curr_page_idx;
2144  		}
2145  		pte_unmap_unlock(start_pte, pte_lock);
2146  		pages_to_write_in_pmd -= batch_size;
2147  		remaining_pages_total -= batch_size;
2148  	}
2149  	if (remaining_pages_total)
2150  		goto more;
2151  	ret = 0;
2152  out:
2153  	*num = remaining_pages_total;
2154  	return ret;
2155  }
2156  
2157  /**
2158   * vm_insert_pages - insert multiple pages into user vma, batching the pmd lock.
2159   * @vma: user vma to map to
2160   * @addr: target start user address of these pages
2161   * @pages: source kernel pages
2162   * @num: in: number of pages to map. out: number of pages that were *not*
2163   * mapped. (0 means all pages were successfully mapped).
2164   *
2165   * Preferred over vm_insert_page() when inserting multiple pages.
2166   *
2167   * In case of error, we may have mapped a subset of the provided
2168   * pages. It is the caller's responsibility to account for this case.
2169   *
2170   * The same restrictions apply as in vm_insert_page().
2171   */
2172  int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
2173  			struct page **pages, unsigned long *num)
2174  {
2175  	const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1;
2176  
2177  	if (addr < vma->vm_start || end_addr >= vma->vm_end)
2178  		return -EFAULT;
2179  	if (!(vma->vm_flags & VM_MIXEDMAP)) {
2180  		BUG_ON(mmap_read_trylock(vma->vm_mm));
2181  		BUG_ON(vma->vm_flags & VM_PFNMAP);
2182  		vm_flags_set(vma, VM_MIXEDMAP);
2183  	}
2184  	/* Defer page refcount checking till we're about to map that page. */
2185  	return insert_pages(vma, addr, pages, num, vma->vm_page_prot);
2186  }
2187  EXPORT_SYMBOL(vm_insert_pages);
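
/*
 * Illustrative sketch, not part of memory.c: an ->mmap() handler that
 * maps a driver-owned page array with one vm_insert_pages() call, so the
 * PTE lock is taken once per batch instead of once per page.  The page
 * array and its length are assumed to come from the driver; the function
 * name is hypothetical.
 */
#if 0	/* example only, not compiled */
static int example_mmap_page_array(struct vm_area_struct *vma,
				   struct page **pages, unsigned long npages)
{
	unsigned long num = min(npages, vma_pages(vma));
	int err;

	err = vm_insert_pages(vma, vma->vm_start, pages, &num);
	/* On error, 'num' holds the count of pages that were not mapped. */
	return err;
}
#endif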
2188  
2189  /**
2190   * vm_insert_page - insert single page into user vma
2191   * @vma: user vma to map to
2192   * @addr: target user address of this page
2193   * @page: source kernel page
2194   *
2195   * This allows drivers to insert individual pages they've allocated
2196   * into a user vma. The zeropage is supported in some VMAs,
2197   * see vm_mixed_zeropage_allowed().
2198   *
2199   * The page has to be a nice clean _individual_ kernel allocation.
2200   * If you allocate a compound page, you need to have marked it as
2201   * such (__GFP_COMP), or manually just split the page up yourself
2202   * (see split_page()).
2203   *
2204   * NOTE! Traditionally this was done with "remap_pfn_range()" which
2205   * took an arbitrary page protection parameter. This doesn't allow
2206   * that. Your vma protection will have to be set up correctly, which
2207   * means that if you want a shared writable mapping, you'd better
2208   * ask for a shared writable mapping!
2209   *
2210   * The page does not need to be reserved.
2211   *
2212   * Usually this function is called from f_op->mmap() handler
2213   * under mm->mmap_lock write-lock, so it can change vma->vm_flags.
2214   * Caller must set VM_MIXEDMAP on vma if it wants to call this
2215   * function from other places, for example from page-fault handler.
2216   *
2217   * Return: %0 on success, negative error code otherwise.
2218   */
2219  int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
2220  			struct page *page)
2221  {
2222  	if (addr < vma->vm_start || addr >= vma->vm_end)
2223  		return -EFAULT;
2224  	if (!(vma->vm_flags & VM_MIXEDMAP)) {
2225  		BUG_ON(mmap_read_trylock(vma->vm_mm));
2226  		BUG_ON(vma->vm_flags & VM_PFNMAP);
2227  		vm_flags_set(vma, VM_MIXEDMAP);
2228  	}
2229  	return insert_page(vma, addr, page, vma->vm_page_prot);
2230  }
2231  EXPORT_SYMBOL(vm_insert_page);
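
/*
 * Illustrative sketch, not part of memory.c: the typical use of
 * vm_insert_page() from an ->mmap() handler, inserting a single,
 * non-compound kernel page at the start of the vma while mmap_lock is
 * held for write.  The function name is hypothetical.
 */
#if 0	/* example only, not compiled */
static int example_mmap_one_page(struct page *page,
				 struct vm_area_struct *vma)
{
	if (vma_pages(vma) != 1)
		return -EINVAL;
	/* vma->vm_page_prot is used as-is; no per-page pgprot override. */
	return vm_insert_page(vma, vma->vm_start, page);
}
#endif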
2232  
2233  /*
2234   * __vm_map_pages - maps range of kernel pages into user vma
2235   * @vma: user vma to map to
2236   * @pages: pointer to array of source kernel pages
2237   * @num: number of pages in page array
2238   * @offset: user's requested vm_pgoff
2239   *
2240   * This allows drivers to map range of kernel pages into a user vma.
2241   * The zeropage is supported in some VMAs, see
2242   * vm_mixed_zeropage_allowed().
2243   *
2244   * Return: 0 on success and error code otherwise.
2245   */
2246  static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages,
2247  				unsigned long num, unsigned long offset)
2248  {
2249  	unsigned long count = vma_pages(vma);
2250  	unsigned long uaddr = vma->vm_start;
2251  	int ret, i;
2252  
2253  	/* Fail if the user requested offset is beyond the end of the object */
2254  	if (offset >= num)
2255  		return -ENXIO;
2256  
2257  	/* Fail if the user requested size exceeds available object size */
2258  	if (count > num - offset)
2259  		return -ENXIO;
2260  
2261  	for (i = 0; i < count; i++) {
2262  		ret = vm_insert_page(vma, uaddr, pages[offset + i]);
2263  		if (ret < 0)
2264  			return ret;
2265  		uaddr += PAGE_SIZE;
2266  	}
2267  
2268  	return 0;
2269  }
2270  
2271  /**
2272   * vm_map_pages - map a range of kernel pages starting at a non-zero offset
2273   * @vma: user vma to map to
2274   * @pages: pointer to array of source kernel pages
2275   * @num: number of pages in page array
2276   *
2277   * Maps an object consisting of @num pages, catering for the user's
2278   * requested vm_pgoff
2279   *
2280   * If we fail to insert any page into the vma, the function will return
2281   * immediately leaving any previously inserted pages present.  Callers
2282   * from the mmap handler may immediately return the error as their caller
2283   * will destroy the vma, removing any successfully inserted pages. Other
2284   * callers should make their own arrangements for calling unmap_region().
2285   *
2286   * Context: Process context. Called by mmap handlers.
2287   * Return: 0 on success and error code otherwise.
2288   */
2289  int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
2290  				unsigned long num)
2291  {
2292  	return __vm_map_pages(vma, pages, num, vma->vm_pgoff);
2293  }
2294  EXPORT_SYMBOL(vm_map_pages);
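
/*
 * Illustrative sketch, not part of memory.c: mapping a multi-page buffer
 * that the driver tracks as a page array.  vm_map_pages() applies the
 * user's vm_pgoff and checks that the request fits the @num-page object.
 * The names below are hypothetical.
 */
#if 0	/* example only, not compiled */
static int example_mmap_buffer(struct vm_area_struct *vma,
			       struct page **pages, unsigned long npages)
{
	/* Fails with -ENXIO if vm_pgoff or the vma size exceed the object. */
	return vm_map_pages(vma, pages, npages);
}
#endif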
2295  
2296  /**
2297   * vm_map_pages_zero - map a range of kernel pages starting at offset zero
2298   * @vma: user vma to map to
2299   * @pages: pointer to array of source kernel pages
2300   * @num: number of pages in page array
2301   *
2302   * Similar to vm_map_pages(), except that it explicitly sets the offset
2303   * to 0. This function is intended for drivers that do not consider
2304   * vm_pgoff.
2305   *
2306   * Context: Process context. Called by mmap handlers.
2307   * Return: 0 on success and error code otherwise.
2308   */
2309  int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
2310  				unsigned long num)
2311  {
2312  	return __vm_map_pages(vma, pages, num, 0);
2313  }
2314  EXPORT_SYMBOL(vm_map_pages_zero);
2315  
2316  static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
2317  			pfn_t pfn, pgprot_t prot, bool mkwrite)
2318  {
2319  	struct mm_struct *mm = vma->vm_mm;
2320  	pte_t *pte, entry;
2321  	spinlock_t *ptl;
2322  
2323  	pte = get_locked_pte(mm, addr, &ptl);
2324  	if (!pte)
2325  		return VM_FAULT_OOM;
2326  	entry = ptep_get(pte);
2327  	if (!pte_none(entry)) {
2328  		if (mkwrite) {
2329  			/*
2330  			 * For read faults on private mappings the PFN passed
2331  			 * in may not match the PFN we have mapped if the
2332  			 * mapped PFN is a writeable COW page.  In the mkwrite
2333  			 * case we are creating a writable PTE for a shared
2334  			 * mapping and we expect the PFNs to match. If they
2335  			 * don't match, we are likely racing with block
2336  			 * allocation and mapping invalidation so just skip the
2337  			 * update.
2338  			 */
2339  			if (pte_pfn(entry) != pfn_t_to_pfn(pfn)) {
2340  				WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry)));
2341  				goto out_unlock;
2342  			}
2343  			entry = pte_mkyoung(entry);
2344  			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2345  			if (ptep_set_access_flags(vma, addr, pte, entry, 1))
2346  				update_mmu_cache(vma, addr, pte);
2347  		}
2348  		goto out_unlock;
2349  	}
2350  
2351  	/* Ok, finally just insert the thing.. */
2352  	if (pfn_t_devmap(pfn))
2353  		entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
2354  	else
2355  		entry = pte_mkspecial(pfn_t_pte(pfn, prot));
2356  
2357  	if (mkwrite) {
2358  		entry = pte_mkyoung(entry);
2359  		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2360  	}
2361  
2362  	set_pte_at(mm, addr, pte, entry);
2363  	update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
2364  
2365  out_unlock:
2366  	pte_unmap_unlock(pte, ptl);
2367  	return VM_FAULT_NOPAGE;
2368  }
2369  
2370  /**
2371   * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot
2372   * @vma: user vma to map to
2373   * @addr: target user address of this page
2374   * @pfn: source kernel pfn
2375   * @pgprot: pgprot flags for the inserted page
2376   *
2377   * This is exactly like vmf_insert_pfn(), except that it allows drivers
2378   * to override pgprot on a per-page basis.
2379   *
2380   * This only makes sense for IO mappings, and it makes no sense for
2381   * COW mappings.  In general, using multiple vmas is preferable;
2382   * vmf_insert_pfn_prot should only be used if using multiple VMAs is
2383   * impractical.
2384   *
2385   * pgprot typically only differs from @vma->vm_page_prot when drivers set
2386   * caching- and encryption bits different from those of @vma->vm_page_prot,
2387   * because the caching- or encryption mode may not be known at mmap() time.
2388   *
2389   * This is ok as long as @vma->vm_page_prot is not used by the core vm
2390   * to set caching and encryption bits for those vmas (except for COW pages).
2391   * This is ensured by core vm only modifying these page table entries using
2392   * functions that don't touch caching- or encryption bits, using pte_modify()
2393   * if needed. (See for example mprotect()).
2394   *
2395   * Also when new page-table entries are created, this is only done using the
2396   * fault() callback, and never using the value of vma->vm_page_prot,
2397   * except for page-table entries that point to anonymous pages as the result
2398   * of COW.
2399   *
2400   * Context: Process context.  May allocate using %GFP_KERNEL.
2401   * Return: vm_fault_t value.
2402   */
2403  vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
2404  			unsigned long pfn, pgprot_t pgprot)
2405  {
2406  	/*
2407  	 * Technically, architectures with pte_special can avoid all these
2408  	 * restrictions (same for remap_pfn_range).  However we would like
2409  	 * consistency in testing and feature parity among all, so we should
2410  	 * try to keep these invariants in place for everybody.
2411  	 */
2412  	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
2413  	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
2414  						(VM_PFNMAP|VM_MIXEDMAP));
2415  	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
2416  	BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
2417  
2418  	if (addr < vma->vm_start || addr >= vma->vm_end)
2419  		return VM_FAULT_SIGBUS;
2420  
2421  	if (!pfn_modify_allowed(pfn, pgprot))
2422  		return VM_FAULT_SIGBUS;
2423  
2424  	track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
2425  
2426  	return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
2427  			false);
2428  }
2429  EXPORT_SYMBOL(vmf_insert_pfn_prot);
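
/*
 * Illustrative sketch, not part of memory.c: a vm_ops->fault handler that
 * inserts a PFN with a write-combining pgprot, the typical reason for
 * using vmf_insert_pfn_prot() instead of vmf_insert_pfn().  It assumes
 * the driver's mmap handler stashed the base PFN in vma->vm_pgoff; the
 * function name is hypothetical.
 */
#if 0	/* example only, not compiled */
static vm_fault_t example_wc_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	unsigned long pfn = vma->vm_pgoff +
			    ((vmf->address - vma->vm_start) >> PAGE_SHIFT);

	return vmf_insert_pfn_prot(vma, vmf->address, pfn,
				   pgprot_writecombine(vma->vm_page_prot));
}
#endif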
2430  
2431  /**
2432   * vmf_insert_pfn - insert single pfn into user vma
2433   * @vma: user vma to map to
2434   * @addr: target user address of this page
2435   * @pfn: source kernel pfn
2436   *
2437   * Similar to vm_insert_page, this allows drivers to insert individual pages
2438   * they've allocated into a user vma. Same comments apply.
2439   *
2440   * This function should only be called from a vm_ops->fault handler, and
2441   * in that case the handler should return the result of this function.
2442   *
2443   * vma cannot be a COW mapping.
2444   *
2445   * As this is called only for pages that do not currently exist, we
2446   * do not need to flush old virtual caches or the TLB.
2447   *
2448   * Context: Process context.  May allocate using %GFP_KERNEL.
2449   * Return: vm_fault_t value.
2450   */
2451  vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
2452  			unsigned long pfn)
2453  {
2454  	return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
2455  }
2456  EXPORT_SYMBOL(vmf_insert_pfn);
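
/*
 * Illustrative sketch, not part of memory.c: the common pattern of a
 * vm_ops->fault handler for a VM_PFNMAP vma that simply returns the
 * result of vmf_insert_pfn(), as suggested by the kernel-doc above.
 * EXAMPLE_PHYS_BASE is a made-up placeholder for a device's physical
 * base address.
 */
#if 0	/* example only, not compiled */
#define EXAMPLE_PHYS_BASE	0x10000000UL	/* hypothetical */

static vm_fault_t example_pfn_fault(struct vm_fault *vmf)
{
	unsigned long pfn = (EXAMPLE_PHYS_BASE >> PAGE_SHIFT) + vmf->pgoff;

	return vmf_insert_pfn(vmf->vma, vmf->address, pfn);
}
#endif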
2457  
2458  static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn, bool mkwrite)
2459  {
2460  	if (unlikely(is_zero_pfn(pfn_t_to_pfn(pfn))) &&
2461  	    (mkwrite || !vm_mixed_zeropage_allowed(vma)))
2462  		return false;
2463  	/* these checks mirror the abort conditions in vm_normal_page */
2464  	if (vma->vm_flags & VM_MIXEDMAP)
2465  		return true;
2466  	if (pfn_t_devmap(pfn))
2467  		return true;
2468  	if (pfn_t_special(pfn))
2469  		return true;
2470  	if (is_zero_pfn(pfn_t_to_pfn(pfn)))
2471  		return true;
2472  	return false;
2473  }
2474  
2475  static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
2476  		unsigned long addr, pfn_t pfn, bool mkwrite)
2477  {
2478  	pgprot_t pgprot = vma->vm_page_prot;
2479  	int err;
2480  
2481  	if (!vm_mixed_ok(vma, pfn, mkwrite))
2482  		return VM_FAULT_SIGBUS;
2483  
2484  	if (addr < vma->vm_start || addr >= vma->vm_end)
2485  		return VM_FAULT_SIGBUS;
2486  
2487  	track_pfn_insert(vma, &pgprot, pfn);
2488  
2489  	if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
2490  		return VM_FAULT_SIGBUS;
2491  
2492  	/*
2493  	 * If we don't have pte special, then we have to use the pfn_valid()
2494  	 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
2495  	 * refcount the page if pfn_valid is true (hence insert_page rather
2496  	 * than insert_pfn).  If a zero_pfn were inserted into a VM_MIXEDMAP
2497  	 * without pte special, it would then be refcounted as a normal page.
2498  	 */
2499  	if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) &&
2500  	    !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
2501  		struct page *page;
2502  
2503  		/*
2504  		 * At this point we are committed to insert_page()
2505  		 * regardless of whether the caller specified flags that
2506  		 * result in pfn_t_has_page() == false.
2507  		 */
2508  		page = pfn_to_page(pfn_t_to_pfn(pfn));
2509  		err = insert_page(vma, addr, page, pgprot);
2510  	} else {
2511  		return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
2512  	}
2513  
2514  	if (err == -ENOMEM)
2515  		return VM_FAULT_OOM;
2516  	if (err < 0 && err != -EBUSY)
2517  		return VM_FAULT_SIGBUS;
2518  
2519  	return VM_FAULT_NOPAGE;
2520  }
2521  
2522  vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
2523  		pfn_t pfn)
2524  {
2525  	return __vm_insert_mixed(vma, addr, pfn, false);
2526  }
2527  EXPORT_SYMBOL(vmf_insert_mixed);
2528  
2529  /*
2530   *  If the insertion of the PTE failed because someone else already added a
2531   *  different entry in the meantime, we treat that as success, as we assume
2532   *  the same entry was actually inserted.
2533   */
2534  vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
2535  		unsigned long addr, pfn_t pfn)
2536  {
2537  	return __vm_insert_mixed(vma, addr, pfn, true);
2538  }
2539  
2540  /*
2541   * Maps a range of physical memory into the requested pages. The old
2542   * mappings are removed. Any references to nonexistent pages result
2543   * in null mappings (currently treated as "copy-on-access").
2544   */
2545  static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
2546  			unsigned long addr, unsigned long end,
2547  			unsigned long pfn, pgprot_t prot)
2548  {
2549  	pte_t *pte, *mapped_pte;
2550  	spinlock_t *ptl;
2551  	int err = 0;
2552  
2553  	mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
2554  	if (!pte)
2555  		return -ENOMEM;
2556  	arch_enter_lazy_mmu_mode();
2557  	do {
2558  		BUG_ON(!pte_none(ptep_get(pte)));
2559  		if (!pfn_modify_allowed(pfn, prot)) {
2560  			err = -EACCES;
2561  			break;
2562  		}
2563  		set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
2564  		pfn++;
2565  	} while (pte++, addr += PAGE_SIZE, addr != end);
2566  	arch_leave_lazy_mmu_mode();
2567  	pte_unmap_unlock(mapped_pte, ptl);
2568  	return err;
2569  }
2570  
2571  static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
2572  			unsigned long addr, unsigned long end,
2573  			unsigned long pfn, pgprot_t prot)
2574  {
2575  	pmd_t *pmd;
2576  	unsigned long next;
2577  	int err;
2578  
2579  	pfn -= addr >> PAGE_SHIFT;
2580  	pmd = pmd_alloc(mm, pud, addr);
2581  	if (!pmd)
2582  		return -ENOMEM;
2583  	VM_BUG_ON(pmd_trans_huge(*pmd));
2584  	do {
2585  		next = pmd_addr_end(addr, end);
2586  		err = remap_pte_range(mm, pmd, addr, next,
2587  				pfn + (addr >> PAGE_SHIFT), prot);
2588  		if (err)
2589  			return err;
2590  	} while (pmd++, addr = next, addr != end);
2591  	return 0;
2592  }
2593  
2594  static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
2595  			unsigned long addr, unsigned long end,
2596  			unsigned long pfn, pgprot_t prot)
2597  {
2598  	pud_t *pud;
2599  	unsigned long next;
2600  	int err;
2601  
2602  	pfn -= addr >> PAGE_SHIFT;
2603  	pud = pud_alloc(mm, p4d, addr);
2604  	if (!pud)
2605  		return -ENOMEM;
2606  	do {
2607  		next = pud_addr_end(addr, end);
2608  		err = remap_pmd_range(mm, pud, addr, next,
2609  				pfn + (addr >> PAGE_SHIFT), prot);
2610  		if (err)
2611  			return err;
2612  	} while (pud++, addr = next, addr != end);
2613  	return 0;
2614  }
2615  
2616  static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
2617  			unsigned long addr, unsigned long end,
2618  			unsigned long pfn, pgprot_t prot)
2619  {
2620  	p4d_t *p4d;
2621  	unsigned long next;
2622  	int err;
2623  
2624  	pfn -= addr >> PAGE_SHIFT;
2625  	p4d = p4d_alloc(mm, pgd, addr);
2626  	if (!p4d)
2627  		return -ENOMEM;
2628  	do {
2629  		next = p4d_addr_end(addr, end);
2630  		err = remap_pud_range(mm, p4d, addr, next,
2631  				pfn + (addr >> PAGE_SHIFT), prot);
2632  		if (err)
2633  			return err;
2634  	} while (p4d++, addr = next, addr != end);
2635  	return 0;
2636  }
2637  
2638  static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long addr,
2639  		unsigned long pfn, unsigned long size, pgprot_t prot)
2640  {
2641  	pgd_t *pgd;
2642  	unsigned long next;
2643  	unsigned long end = addr + PAGE_ALIGN(size);
2644  	struct mm_struct *mm = vma->vm_mm;
2645  	int err;
2646  
2647  	if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
2648  		return -EINVAL;
2649  
2650  	/*
2651  	 * Physically remapped pages are special. Tell the
2652  	 * rest of the world about it:
2653  	 *   VM_IO tells people not to look at these pages
2654  	 *	(accesses can have side effects).
2655  	 *   VM_PFNMAP tells the core MM that the base pages are just
2656  	 *	raw PFN mappings, and do not have a "struct page" associated
2657  	 *	with them.
2658  	 *   VM_DONTEXPAND
2659  	 *      Disable vma merging and expanding with mremap().
2660  	 *   VM_DONTDUMP
2661  	 *      Omit vma from core dump, even when VM_IO turned off.
2662  	 *
2663  	 * There's a horrible special case to handle copy-on-write
2664  	 * behaviour that some programs depend on. We mark the "original"
2665  	 * un-COW'ed pages by matching them up with "vma->vm_pgoff".
2666  	 * See vm_normal_page() for details.
2667  	 */
2668  	if (is_cow_mapping(vma->vm_flags)) {
2669  		if (addr != vma->vm_start || end != vma->vm_end)
2670  			return -EINVAL;
2671  		vma->vm_pgoff = pfn;
2672  	}
2673  
2674  	vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP);
2675  
2676  	BUG_ON(addr >= end);
2677  	pfn -= addr >> PAGE_SHIFT;
2678  	pgd = pgd_offset(mm, addr);
2679  	flush_cache_range(vma, addr, end);
2680  	do {
2681  		next = pgd_addr_end(addr, end);
2682  		err = remap_p4d_range(mm, pgd, addr, next,
2683  				pfn + (addr >> PAGE_SHIFT), prot);
2684  		if (err)
2685  			return err;
2686  	} while (pgd++, addr = next, addr != end);
2687  
2688  	return 0;
2689  }
2690  
2691  /*
2692   * Variant of remap_pfn_range that does not call track_pfn_remap.  The caller
2693   * must have pre-validated the caching bits of the pgprot_t.
2694   */
2695  int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
2696  		unsigned long pfn, unsigned long size, pgprot_t prot)
2697  {
2698  	int error = remap_pfn_range_internal(vma, addr, pfn, size, prot);
2699  
2700  	if (!error)
2701  		return 0;
2702  
2703  	/*
2704  	 * A partial pfn range mapping is dangerous: it does not
2705  	 * maintain page reference counts, and callers may free
2706  	 * pages due to the error. So zap it early.
2707  	 */
2708  	zap_page_range_single(vma, addr, size, NULL);
2709  	return error;
2710  }
2711  
2712  /**
2713   * remap_pfn_range - remap kernel memory to userspace
2714   * @vma: user vma to map to
2715   * @addr: target page aligned user address to start at
2716   * @pfn: page frame number of kernel physical memory address
2717   * @size: size of mapping area
2718   * @prot: page protection flags for this mapping
2719   *
2720   * Note: this is only safe if the mm semaphore is held when called.
2721   *
2722   * Return: %0 on success, negative error code otherwise.
2723   */
2724  int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
2725  		    unsigned long pfn, unsigned long size, pgprot_t prot)
2726  {
2727  	int err;
2728  
2729  	err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
2730  	if (err)
2731  		return -EINVAL;
2732  
2733  	err = remap_pfn_range_notrack(vma, addr, pfn, size, prot);
2734  	if (err)
2735  		untrack_pfn(vma, pfn, PAGE_ALIGN(size), true);
2736  	return err;
2737  }
2738  EXPORT_SYMBOL(remap_pfn_range);
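
/*
 * Illustrative sketch, not part of memory.c: the traditional way to map a
 * physical/MMIO region at mmap() time, covering the whole vma with one
 * remap_pfn_range() call.  'phys' stands for a device-provided physical
 * base address; the function name is hypothetical.
 */
#if 0	/* example only, not compiled */
static int example_mmap_mmio(struct vm_area_struct *vma, phys_addr_t phys)
{
	unsigned long pfn = (phys >> PAGE_SHIFT) + vma->vm_pgoff;

	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
	return remap_pfn_range(vma, vma->vm_start, pfn,
			       vma->vm_end - vma->vm_start,
			       vma->vm_page_prot);
}
#endif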
2739  
2740  /**
2741   * vm_iomap_memory - remap memory to userspace
2742   * @vma: user vma to map to
2743   * @start: start of the physical memory to be mapped
2744   * @len: size of area
2745   *
2746   * This is a simplified io_remap_pfn_range() for common driver use. The
2747   * driver just needs to give us the physical memory range to be mapped,
2748   * we'll figure out the rest from the vma information.
2749   *
2750   * NOTE! Some drivers might want to tweak vma->vm_page_prot first to set
2751   * up write-combining or similar.
2752   *
2753   * Return: %0 on success, negative error code otherwise.
2754   */
2755  int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
2756  {
2757  	unsigned long vm_len, pfn, pages;
2758  
2759  	/* Check that the physical memory area passed in looks valid */
2760  	if (start + len < start)
2761  		return -EINVAL;
2762  	/*
2763  	 * You *really* shouldn't map things that aren't page-aligned,
2764  	 * but we've historically allowed it because IO memory might
2765  	 * just have smaller alignment.
2766  	 */
2767  	len += start & ~PAGE_MASK;
2768  	pfn = start >> PAGE_SHIFT;
2769  	pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
2770  	if (pfn + pages < pfn)
2771  		return -EINVAL;
2772  
2773  	/* We start the mapping 'vm_pgoff' pages into the area */
2774  	if (vma->vm_pgoff > pages)
2775  		return -EINVAL;
2776  	pfn += vma->vm_pgoff;
2777  	pages -= vma->vm_pgoff;
2778  
2779  	/* Can we fit all of the mapping? */
2780  	vm_len = vma->vm_end - vma->vm_start;
2781  	if (vm_len >> PAGE_SHIFT > pages)
2782  		return -EINVAL;
2783  
2784  	/* Ok, let it rip */
2785  	return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
2786  }
2787  EXPORT_SYMBOL(vm_iomap_memory);
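
/*
 * Illustrative sketch, not part of memory.c: with vm_iomap_memory() an
 * mmap handler like the remap_pfn_range() sketch above collapses to a
 * single call, since the helper does the vm_pgoff and size checking
 * itself.  'phys' and 'len' stand for a device region's base and length;
 * the function name is hypothetical.
 */
#if 0	/* example only, not compiled */
static int example_mmap_region(struct vm_area_struct *vma,
			       phys_addr_t phys, unsigned long len)
{
	return vm_iomap_memory(vma, phys, len);
}
#endif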
2788  
2789  static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
2790  				     unsigned long addr, unsigned long end,
2791  				     pte_fn_t fn, void *data, bool create,
2792  				     pgtbl_mod_mask *mask)
2793  {
2794  	pte_t *pte, *mapped_pte;
2795  	int err = 0;
2796  	spinlock_t *ptl;
2797  
2798  	if (create) {
2799  		mapped_pte = pte = (mm == &init_mm) ?
2800  			pte_alloc_kernel_track(pmd, addr, mask) :
2801  			pte_alloc_map_lock(mm, pmd, addr, &ptl);
2802  		if (!pte)
2803  			return -ENOMEM;
2804  	} else {
2805  		mapped_pte = pte = (mm == &init_mm) ?
2806  			pte_offset_kernel(pmd, addr) :
2807  			pte_offset_map_lock(mm, pmd, addr, &ptl);
2808  		if (!pte)
2809  			return -EINVAL;
2810  	}
2811  
2812  	arch_enter_lazy_mmu_mode();
2813  
2814  	if (fn) {
2815  		do {
2816  			if (create || !pte_none(ptep_get(pte))) {
2817  				err = fn(pte++, addr, data);
2818  				if (err)
2819  					break;
2820  			}
2821  		} while (addr += PAGE_SIZE, addr != end);
2822  	}
2823  	*mask |= PGTBL_PTE_MODIFIED;
2824  
2825  	arch_leave_lazy_mmu_mode();
2826  
2827  	if (mm != &init_mm)
2828  		pte_unmap_unlock(mapped_pte, ptl);
2829  	return err;
2830  }
2831  
2832  static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
2833  				     unsigned long addr, unsigned long end,
2834  				     pte_fn_t fn, void *data, bool create,
2835  				     pgtbl_mod_mask *mask)
2836  {
2837  	pmd_t *pmd;
2838  	unsigned long next;
2839  	int err = 0;
2840  
2841  	BUG_ON(pud_leaf(*pud));
2842  
2843  	if (create) {
2844  		pmd = pmd_alloc_track(mm, pud, addr, mask);
2845  		if (!pmd)
2846  			return -ENOMEM;
2847  	} else {
2848  		pmd = pmd_offset(pud, addr);
2849  	}
2850  	do {
2851  		next = pmd_addr_end(addr, end);
2852  		if (pmd_none(*pmd) && !create)
2853  			continue;
2854  		if (WARN_ON_ONCE(pmd_leaf(*pmd)))
2855  			return -EINVAL;
2856  		if (!pmd_none(*pmd) && WARN_ON_ONCE(pmd_bad(*pmd))) {
2857  			if (!create)
2858  				continue;
2859  			pmd_clear_bad(pmd);
2860  		}
2861  		err = apply_to_pte_range(mm, pmd, addr, next,
2862  					 fn, data, create, mask);
2863  		if (err)
2864  			break;
2865  	} while (pmd++, addr = next, addr != end);
2866  
2867  	return err;
2868  }
2869  
2870  static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
2871  				     unsigned long addr, unsigned long end,
2872  				     pte_fn_t fn, void *data, bool create,
2873  				     pgtbl_mod_mask *mask)
2874  {
2875  	pud_t *pud;
2876  	unsigned long next;
2877  	int err = 0;
2878  
2879  	if (create) {
2880  		pud = pud_alloc_track(mm, p4d, addr, mask);
2881  		if (!pud)
2882  			return -ENOMEM;
2883  	} else {
2884  		pud = pud_offset(p4d, addr);
2885  	}
2886  	do {
2887  		next = pud_addr_end(addr, end);
2888  		if (pud_none(*pud) && !create)
2889  			continue;
2890  		if (WARN_ON_ONCE(pud_leaf(*pud)))
2891  			return -EINVAL;
2892  		if (!pud_none(*pud) && WARN_ON_ONCE(pud_bad(*pud))) {
2893  			if (!create)
2894  				continue;
2895  			pud_clear_bad(pud);
2896  		}
2897  		err = apply_to_pmd_range(mm, pud, addr, next,
2898  					 fn, data, create, mask);
2899  		if (err)
2900  			break;
2901  	} while (pud++, addr = next, addr != end);
2902  
2903  	return err;
2904  }
2905  
2906  static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
2907  				     unsigned long addr, unsigned long end,
2908  				     pte_fn_t fn, void *data, bool create,
2909  				     pgtbl_mod_mask *mask)
2910  {
2911  	p4d_t *p4d;
2912  	unsigned long next;
2913  	int err = 0;
2914  
2915  	if (create) {
2916  		p4d = p4d_alloc_track(mm, pgd, addr, mask);
2917  		if (!p4d)
2918  			return -ENOMEM;
2919  	} else {
2920  		p4d = p4d_offset(pgd, addr);
2921  	}
2922  	do {
2923  		next = p4d_addr_end(addr, end);
2924  		if (p4d_none(*p4d) && !create)
2925  			continue;
2926  		if (WARN_ON_ONCE(p4d_leaf(*p4d)))
2927  			return -EINVAL;
2928  		if (!p4d_none(*p4d) && WARN_ON_ONCE(p4d_bad(*p4d))) {
2929  			if (!create)
2930  				continue;
2931  			p4d_clear_bad(p4d);
2932  		}
2933  		err = apply_to_pud_range(mm, p4d, addr, next,
2934  					 fn, data, create, mask);
2935  		if (err)
2936  			break;
2937  	} while (p4d++, addr = next, addr != end);
2938  
2939  	return err;
2940  }
2941  
2942  static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
2943  				 unsigned long size, pte_fn_t fn,
2944  				 void *data, bool create)
2945  {
2946  	pgd_t *pgd;
2947  	unsigned long start = addr, next;
2948  	unsigned long end = addr + size;
2949  	pgtbl_mod_mask mask = 0;
2950  	int err = 0;
2951  
2952  	if (WARN_ON(addr >= end))
2953  		return -EINVAL;
2954  
2955  	pgd = pgd_offset(mm, addr);
2956  	do {
2957  		next = pgd_addr_end(addr, end);
2958  		if (pgd_none(*pgd) && !create)
2959  			continue;
2960  		if (WARN_ON_ONCE(pgd_leaf(*pgd)))
2961  			return -EINVAL;
2962  		if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) {
2963  			if (!create)
2964  				continue;
2965  			pgd_clear_bad(pgd);
2966  		}
2967  		err = apply_to_p4d_range(mm, pgd, addr, next,
2968  					 fn, data, create, &mask);
2969  		if (err)
2970  			break;
2971  	} while (pgd++, addr = next, addr != end);
2972  
2973  	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
2974  		arch_sync_kernel_mappings(start, start + size);
2975  
2976  	return err;
2977  }
2978  
2979  /*
2980   * Scan a region of virtual memory, filling in page tables as necessary
2981   * and calling a provided function on each leaf page table.
2982   */
2983  int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
2984  			unsigned long size, pte_fn_t fn, void *data)
2985  {
2986  	return __apply_to_page_range(mm, addr, size, fn, data, true);
2987  }
2988  EXPORT_SYMBOL_GPL(apply_to_page_range);
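
/*
 * Illustrative sketch, not part of memory.c: apply_to_page_range() takes
 * a pte_fn_t callback that is invoked for each leaf PTE with the PTE
 * pointer, the virtual address and an opaque cookie.  The callback below
 * merely counts present entries into a caller-supplied counter; the
 * names are hypothetical.
 */
#if 0	/* example only, not compiled */
static int example_count_present_pte(pte_t *pte, unsigned long addr,
				     void *data)
{
	unsigned long *count = data;

	if (pte_present(ptep_get(pte)))
		(*count)++;
	return 0;
}

static int example_count_present(struct mm_struct *mm, unsigned long addr,
				 unsigned long size, unsigned long *count)
{
	/*
	 * Fills in missing page tables; apply_to_existing_page_range()
	 * below is the non-allocating variant.
	 */
	return apply_to_page_range(mm, addr, size,
				   example_count_present_pte, count);
}
#endif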
2989  
2990  /*
2991   * Scan a region of virtual memory, calling a provided function on
2992   * each leaf page table where it exists.
2993   *
2994   * Unlike apply_to_page_range, this does _not_ fill in page tables
2995   * where they are absent.
2996   */
2997  int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr,
2998  				 unsigned long size, pte_fn_t fn, void *data)
2999  {
3000  	return __apply_to_page_range(mm, addr, size, fn, data, false);
3001  }
3002  EXPORT_SYMBOL_GPL(apply_to_existing_page_range);
3003  
3004  /*
3005   * handle_pte_fault chooses page fault handler according to an entry which was
3006   * read non-atomically.  Before making any commitment, on those architectures
3007   * or configurations (e.g. i386 with PAE) which might give a mix of unmatched
3008   * parts, do_swap_page must check under lock before unmapping the pte and
3009   * proceeding (but do_wp_page is only called after already making such a check;
3010   * and do_anonymous_page can safely check later on).
3011   */
3012  static inline int pte_unmap_same(struct vm_fault *vmf)
3013  {
3014  	int same = 1;
3015  #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION)
3016  	if (sizeof(pte_t) > sizeof(unsigned long)) {
3017  		spin_lock(vmf->ptl);
3018  		same = pte_same(ptep_get(vmf->pte), vmf->orig_pte);
3019  		spin_unlock(vmf->ptl);
3020  	}
3021  #endif
3022  	pte_unmap(vmf->pte);
3023  	vmf->pte = NULL;
3024  	return same;
3025  }
3026  
3027  /*
3028   * Return:
3029   *	0:		copy succeeded
3030   *	-EHWPOISON:	copy failed due to hwpoison in source page
3031   *	-EAGAIN:	copy failed (some other reason)
3032   */
3033  static inline int __wp_page_copy_user(struct page *dst, struct page *src,
3034  				      struct vm_fault *vmf)
3035  {
3036  	int ret;
3037  	void *kaddr;
3038  	void __user *uaddr;
3039  	struct vm_area_struct *vma = vmf->vma;
3040  	struct mm_struct *mm = vma->vm_mm;
3041  	unsigned long addr = vmf->address;
3042  
3043  	if (likely(src)) {
3044  		if (copy_mc_user_highpage(dst, src, addr, vma))
3045  			return -EHWPOISON;
3046  		return 0;
3047  	}
3048  
3049  	/*
3050  	 * If the source page was a PFN mapping, we don't have
3051  	 * a "struct page" for it. We do a best-effort copy by
3052  	 * just copying from the original user address. If that
3053  	 * fails, we just zero-fill it. Live with it.
3054  	 */
3055  	kaddr = kmap_local_page(dst);
3056  	pagefault_disable();
3057  	uaddr = (void __user *)(addr & PAGE_MASK);
3058  
3059  	/*
3060  	 * On architectures with software "accessed" bits, we would
3061  	 * take a double page fault, so mark it accessed here.
3062  	 */
3063  	vmf->pte = NULL;
3064  	if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) {
3065  		pte_t entry;
3066  
3067  		vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
3068  		if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
3069  			/*
3070  			 * The other thread has already handled the fault;
3071  			 * just update the local TLB here.
3072  			 */
3073  			if (vmf->pte)
3074  				update_mmu_tlb(vma, addr, vmf->pte);
3075  			ret = -EAGAIN;
3076  			goto pte_unlock;
3077  		}
3078  
3079  		entry = pte_mkyoung(vmf->orig_pte);
3080  		if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0))
3081  			update_mmu_cache_range(vmf, vma, addr, vmf->pte, 1);
3082  	}
3083  
3084  	/*
3085  	 * This really shouldn't fail, because the page is there
3086  	 * in the page tables. But it might just be unreadable,
3087  	 * in which case we just give up and fill the result with
3088  	 * zeroes.
3089  	 */
3090  	if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
3091  		if (vmf->pte)
3092  			goto warn;
3093  
3094  		/* Re-validate under PTL if the page is still mapped */
3095  		vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
3096  		if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
3097  			/* The PTE changed under us, update local tlb */
3098  			if (vmf->pte)
3099  				update_mmu_tlb(vma, addr, vmf->pte);
3100  			ret = -EAGAIN;
3101  			goto pte_unlock;
3102  		}
3103  
3104  		/*
3105  		 * The same page can have been mapped back since the last copy
3106  		 * attempt. Try to copy again under the PTL.
3107  		 */
3108  		if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
3109  			/*
3110  			 * Warn in case some obscure use-case ever hits
3111  			 * this path.
3112  			 */
3113  warn:
3114  			WARN_ON_ONCE(1);
3115  			clear_page(kaddr);
3116  		}
3117  	}
3118  
3119  	ret = 0;
3120  
3121  pte_unlock:
3122  	if (vmf->pte)
3123  		pte_unmap_unlock(vmf->pte, vmf->ptl);
3124  	pagefault_enable();
3125  	kunmap_local(kaddr);
3126  	flush_dcache_page(dst);
3127  
3128  	return ret;
3129  }
3130  
3131  static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
3132  {
3133  	struct file *vm_file = vma->vm_file;
3134  
3135  	if (vm_file)
3136  		return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO;
3137  
3138  	/*
3139  	 * Special mappings (e.g. VDSO) do not have any file so fake
3140  	 * a default GFP_KERNEL for them.
3141  	 */
3142  	return GFP_KERNEL;
3143  }
3144  
3145  /*
3146   * Notify the address space that the page is about to become writable so that
3147   * it can prohibit this or wait for the page to get into an appropriate state.
3148   *
3149   * We do this without the lock held, so that it can sleep if it needs to.
3150   */
3151  static vm_fault_t do_page_mkwrite(struct vm_fault *vmf, struct folio *folio)
3152  {
3153  	vm_fault_t ret;
3154  	unsigned int old_flags = vmf->flags;
3155  
3156  	vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
3157  
3158  	if (vmf->vma->vm_file &&
3159  	    IS_SWAPFILE(vmf->vma->vm_file->f_mapping->host))
3160  		return VM_FAULT_SIGBUS;
3161  
3162  	ret = vmf->vma->vm_ops->page_mkwrite(vmf);
3163  	/* Restore original flags so that caller is not surprised */
3164  	vmf->flags = old_flags;
3165  	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
3166  		return ret;
3167  	if (unlikely(!(ret & VM_FAULT_LOCKED))) {
3168  		folio_lock(folio);
3169  		if (!folio->mapping) {
3170  			folio_unlock(folio);
3171  			return 0; /* retry */
3172  		}
3173  		ret |= VM_FAULT_LOCKED;
3174  	} else
3175  		VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
3176  	return ret;
3177  }
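
/*
 * Editorial note: a hedged sketch, not from the original file, of what a
 * minimal ->page_mkwrite implementation typically looks like, so that the
 * VM_FAULT_LOCKED handling above has a concrete counterpart:
 *
 *	static vm_fault_t example_page_mkwrite(struct vm_fault *vmf)
 *	{
 *		struct folio *folio = page_folio(vmf->page);
 *
 *		folio_lock(folio);
 *		if (!folio->mapping) {		// raced with truncate
 *			folio_unlock(folio);
 *			return VM_FAULT_NOPAGE;
 *		}
 *		folio_mark_dirty(folio);
 *		return VM_FAULT_LOCKED;		// folio stays locked
 *	}
 *
 * "example_page_mkwrite" is a made-up name; real implementations usually
 * also wait for writeback and check filesystem-specific state.
 */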
3178  
3179  /*
3180   * Handle dirtying of a page in shared file mapping on a write fault.
3181   *
3182   * The function expects the page to be locked and unlocks it.
3183   */
3184  static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
3185  {
3186  	struct vm_area_struct *vma = vmf->vma;
3187  	struct address_space *mapping;
3188  	struct folio *folio = page_folio(vmf->page);
3189  	bool dirtied;
3190  	bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
3191  
3192  	dirtied = folio_mark_dirty(folio);
3193  	VM_BUG_ON_FOLIO(folio_test_anon(folio), folio);
3194  	/*
3195  	 * Take a local copy of the address_space - folio.mapping may be zeroed
3196  	 * by truncate after folio_unlock().   The address_space itself remains
3197  	 * pinned by vma->vm_file's reference.  We rely on folio_unlock()'s
3198  	 * release semantics to prevent the compiler from undoing this copying.
3199  	 */
3200  	mapping = folio_raw_mapping(folio);
3201  	folio_unlock(folio);
3202  
3203  	if (!page_mkwrite)
3204  		file_update_time(vma->vm_file);
3205  
3206  	/*
3207  	 * Throttle page dirtying rate down to writeback speed.
3208  	 *
3209  	 * mapping may be NULL here because some device drivers do not
3210  	 * set page.mapping but still dirty their pages
3211  	 *
3212  	 * Drop the mmap_lock before waiting on IO, if we can. The file
3213  	 * is pinning the mapping, as per above.
3214  	 */
3215  	if ((dirtied || page_mkwrite) && mapping) {
3216  		struct file *fpin;
3217  
3218  		fpin = maybe_unlock_mmap_for_io(vmf, NULL);
3219  		balance_dirty_pages_ratelimited(mapping);
3220  		if (fpin) {
3221  			fput(fpin);
3222  			return VM_FAULT_COMPLETED;
3223  		}
3224  	}
3225  
3226  	return 0;
3227  }
3228  
3229  /*
3230   * Handle write page faults for pages that can be reused in the current vma
3231   *
3232   * This can happen either due to the mapping having the VM_SHARED flag,
3233   * or due to us being the last reference standing to the page. In either
3234   * case, all we need to do here is to mark the page as writable and update
3235   * any related book-keeping.
3236   */
3237  static inline void wp_page_reuse(struct vm_fault *vmf, struct folio *folio)
3238  	__releases(vmf->ptl)
3239  {
3240  	struct vm_area_struct *vma = vmf->vma;
3241  	pte_t entry;
3242  
3243  	VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE));
3244  	VM_WARN_ON(is_zero_pfn(pte_pfn(vmf->orig_pte)));
3245  
3246  	if (folio) {
3247  		VM_BUG_ON(folio_test_anon(folio) &&
3248  			  !PageAnonExclusive(vmf->page));
3249  		/*
3250  		 * Clear the folio's cpupid information as the existing
3251  		 * information potentially belongs to a now completely
3252  		 * unrelated process.
3253  		 */
3254  		folio_xchg_last_cpupid(folio, (1 << LAST_CPUPID_SHIFT) - 1);
3255  	}
3256  
3257  	flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
3258  	entry = pte_mkyoung(vmf->orig_pte);
3259  	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
3260  	if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
3261  		update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
3262  	pte_unmap_unlock(vmf->pte, vmf->ptl);
3263  	count_vm_event(PGREUSE);
3264  }
3265  
3266  /*
3267   * We could add a bitflag somewhere, but for now, we know that all
3268   * vm_ops that have a ->map_pages have been audited and don't need
3269   * the mmap_lock to be held.
3270   */
3271  static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf)
3272  {
3273  	struct vm_area_struct *vma = vmf->vma;
3274  
3275  	if (vma->vm_ops->map_pages || !(vmf->flags & FAULT_FLAG_VMA_LOCK))
3276  		return 0;
3277  	vma_end_read(vma);
3278  	return VM_FAULT_RETRY;
3279  }
3280  
3281  /**
3282   * __vmf_anon_prepare - Prepare to handle an anonymous fault.
3283   * @vmf: The vm_fault descriptor passed from the fault handler.
3284   *
3285   * When preparing to insert an anonymous page into a VMA from a
3286   * fault handler, call this function rather than anon_vma_prepare().
3287   * If this vma does not already have an associated anon_vma and we are
3288   * only protected by the per-VMA lock, the caller must retry with the
3289   * mmap_lock held.  __anon_vma_prepare() will look at adjacent VMAs to
3290   * determine if this VMA can share its anon_vma, and that's not safe to
3291   * do with only the per-VMA lock held for this VMA.
3292   *
3293   * Return: 0 if fault handling can proceed.  Any other value should be
3294   * returned to the caller.
3295   */
3296  vm_fault_t __vmf_anon_prepare(struct vm_fault *vmf)
3297  {
3298  	struct vm_area_struct *vma = vmf->vma;
3299  	vm_fault_t ret = 0;
3300  
3301  	if (likely(vma->anon_vma))
3302  		return 0;
3303  	if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
3304  		if (!mmap_read_trylock(vma->vm_mm))
3305  			return VM_FAULT_RETRY;
3306  	}
3307  	if (__anon_vma_prepare(vma))
3308  		ret = VM_FAULT_OOM;
3309  	if (vmf->flags & FAULT_FLAG_VMA_LOCK)
3310  		mmap_read_unlock(vma->vm_mm);
3311  	return ret;
3312  }
3313  
3314  /*
3315   * Handle the case of a page which we actually need to copy to a new page,
3316   * either due to COW or unsharing.
3317   *
3318   * Called with mmap_lock locked and the old page referenced, but
3319   * without the ptl held.
3320   *
3321   * High level logic flow:
3322   *
3323   * - Allocate a page, copy the content of the old page to the new one.
3324   * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc.
3325   * - Take the PTL. If the pte changed, bail out and release the allocated page
3326   * - If the pte is still the way we remember it, update the page table and all
3327   *   relevant references. This includes dropping the reference the page-table
3328   *   held to the old page, as well as updating the rmap.
3329   * - In any case, unlock the PTL and drop the reference we took to the old page.
3330   */
3331  static vm_fault_t wp_page_copy(struct vm_fault *vmf)
3332  {
3333  	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
3334  	struct vm_area_struct *vma = vmf->vma;
3335  	struct mm_struct *mm = vma->vm_mm;
3336  	struct folio *old_folio = NULL;
3337  	struct folio *new_folio = NULL;
3338  	pte_t entry;
3339  	int page_copied = 0;
3340  	struct mmu_notifier_range range;
3341  	vm_fault_t ret;
3342  	bool pfn_is_zero;
3343  
3344  	delayacct_wpcopy_start();
3345  
3346  	if (vmf->page)
3347  		old_folio = page_folio(vmf->page);
3348  	ret = vmf_anon_prepare(vmf);
3349  	if (unlikely(ret))
3350  		goto out;
3351  
3352  	pfn_is_zero = is_zero_pfn(pte_pfn(vmf->orig_pte));
3353  	new_folio = folio_prealloc(mm, vma, vmf->address, pfn_is_zero);
3354  	if (!new_folio)
3355  		goto oom;
3356  
3357  	if (!pfn_is_zero) {
3358  		int err;
3359  
3360  		err = __wp_page_copy_user(&new_folio->page, vmf->page, vmf);
3361  		if (err) {
3362  			/*
3363  			 * COW failed: if the fault was resolved by another
3364  			 * thread, that's fine. If not, userspace will
3365  			 * re-fault at the same address and we will handle
3366  			 * the fault on the second attempt.
3367  			 * The -EHWPOISON case will not be retried.
3368  			 */
3369  			folio_put(new_folio);
3370  			if (old_folio)
3371  				folio_put(old_folio);
3372  
3373  			delayacct_wpcopy_end();
3374  			return err == -EHWPOISON ? VM_FAULT_HWPOISON : 0;
3375  		}
3376  		kmsan_copy_page_meta(&new_folio->page, vmf->page);
3377  	}
3378  
3379  	__folio_mark_uptodate(new_folio);
3380  
3381  	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
3382  				vmf->address & PAGE_MASK,
3383  				(vmf->address & PAGE_MASK) + PAGE_SIZE);
3384  	mmu_notifier_invalidate_range_start(&range);
3385  
3386  	/*
3387  	 * Re-check the pte - we dropped the lock
3388  	 */
3389  	vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
3390  	if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
3391  		if (old_folio) {
3392  			if (!folio_test_anon(old_folio)) {
3393  				dec_mm_counter(mm, mm_counter_file(old_folio));
3394  				inc_mm_counter(mm, MM_ANONPAGES);
3395  			}
3396  		} else {
3397  			ksm_might_unmap_zero_page(mm, vmf->orig_pte);
3398  			inc_mm_counter(mm, MM_ANONPAGES);
3399  		}
3400  		flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
3401  		entry = mk_pte(&new_folio->page, vma->vm_page_prot);
3402  		entry = pte_sw_mkyoung(entry);
3403  		if (unlikely(unshare)) {
3404  			if (pte_soft_dirty(vmf->orig_pte))
3405  				entry = pte_mksoft_dirty(entry);
3406  			if (pte_uffd_wp(vmf->orig_pte))
3407  				entry = pte_mkuffd_wp(entry);
3408  		} else {
3409  			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
3410  		}
3411  
3412  		/*
3413  		 * Clear the pte entry and flush it first, before updating the
3414  		 * pte with the new entry, to keep TLBs on different CPUs in
3415  		 * sync. This code used to set the new PTE then flush TLBs, but
3416  		 * that left a window where the new PTE could be loaded into
3417  		 * some TLBs while the old PTE remains in others.
3418  		 */
3419  		ptep_clear_flush(vma, vmf->address, vmf->pte);
3420  		folio_add_new_anon_rmap(new_folio, vma, vmf->address, RMAP_EXCLUSIVE);
3421  		folio_add_lru_vma(new_folio, vma);
3422  		BUG_ON(unshare && pte_write(entry));
3423  		set_pte_at(mm, vmf->address, vmf->pte, entry);
3424  		update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
3425  		if (old_folio) {
3426  			/*
3427  			 * Only after switching the pte to the new page may
3428  			 * we remove the mapcount here. Otherwise another
3429  			 * process may come and find the rmap count decremented
3430  			 * before the pte is switched to the new page, and
3431  			 * "reuse" the old page writing into it while our pte
3432  			 * here still points into it and can be read by other
3433  			 * threads.
3434  			 *
3435  			 * The critical issue is to order this
3436  			 * folio_remove_rmap_pte() with the ptep_clear_flush
3437  			 * above. Those stores are ordered by (if nothing else,)
3438  			 * the barrier present in the atomic_add_negative
3439  			 * in folio_remove_rmap_pte();
3440  			 *
3441  			 * Then the TLB flush in ptep_clear_flush ensures that
3442  			 * no process can access the old page before the
3443  			 * decremented mapcount is visible. And the old page
3444  			 * cannot be reused until after the decremented
3445  			 * mapcount is visible. So transitively, TLBs to
3446  			 * old page will be flushed before it can be reused.
3447  			 */
3448  			folio_remove_rmap_pte(old_folio, vmf->page, vma);
3449  		}
3450  
3451  		/* Free the old page.. */
3452  		new_folio = old_folio;
3453  		page_copied = 1;
3454  		pte_unmap_unlock(vmf->pte, vmf->ptl);
3455  	} else if (vmf->pte) {
3456  		update_mmu_tlb(vma, vmf->address, vmf->pte);
3457  		pte_unmap_unlock(vmf->pte, vmf->ptl);
3458  	}
3459  
3460  	mmu_notifier_invalidate_range_end(&range);
3461  
3462  	if (new_folio)
3463  		folio_put(new_folio);
3464  	if (old_folio) {
3465  		if (page_copied)
3466  			free_swap_cache(old_folio);
3467  		folio_put(old_folio);
3468  	}
3469  
3470  	delayacct_wpcopy_end();
3471  	return 0;
3472  oom:
3473  	ret = VM_FAULT_OOM;
3474  out:
3475  	if (old_folio)
3476  		folio_put(old_folio);
3477  
3478  	delayacct_wpcopy_end();
3479  	return ret;
3480  }
3481  
3482  /**
3483   * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE
3484   *			  writeable once the page is prepared
3485   *
3486   * @vmf: structure describing the fault
3487   * @folio: the folio of vmf->page
3488   *
3489   * This function handles all that is needed to finish a write page fault in a
3490   * shared mapping due to PTE being read-only once the mapped page is prepared.
3491   * It handles locking of PTE and modifying it.
3492   *
3493   * The function expects the page to be locked or other protection against
3494   * concurrent faults / writeback (such as DAX radix tree locks).
3495   *
3496   * Return: %0 on success, %VM_FAULT_NOPAGE when PTE got changed before
3497   * we acquired PTE lock.
3498   */
3499  static vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf, struct folio *folio)
3500  {
3501  	WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
3502  	vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
3503  				       &vmf->ptl);
3504  	if (!vmf->pte)
3505  		return VM_FAULT_NOPAGE;
3506  	/*
3507  	 * We might have raced with another page fault while we released the
3508  	 * pte_offset_map_lock.
3509  	 */
3510  	if (!pte_same(ptep_get(vmf->pte), vmf->orig_pte)) {
3511  		update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
3512  		pte_unmap_unlock(vmf->pte, vmf->ptl);
3513  		return VM_FAULT_NOPAGE;
3514  	}
3515  	wp_page_reuse(vmf, folio);
3516  	return 0;
3517  }
3518  
3519  /*
3520   * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
3521   * mapping
3522   */
3523  static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
3524  {
3525  	struct vm_area_struct *vma = vmf->vma;
3526  
3527  	if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
3528  		vm_fault_t ret;
3529  
3530  		pte_unmap_unlock(vmf->pte, vmf->ptl);
3531  		ret = vmf_can_call_fault(vmf);
3532  		if (ret)
3533  			return ret;
3534  
3535  		vmf->flags |= FAULT_FLAG_MKWRITE;
3536  		ret = vma->vm_ops->pfn_mkwrite(vmf);
3537  		if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
3538  			return ret;
3539  		return finish_mkwrite_fault(vmf, NULL);
3540  	}
3541  	wp_page_reuse(vmf, NULL);
3542  	return 0;
3543  }
3544  
3545  static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio)
3546  	__releases(vmf->ptl)
3547  {
3548  	struct vm_area_struct *vma = vmf->vma;
3549  	vm_fault_t ret = 0;
3550  
3551  	folio_get(folio);
3552  
3553  	if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
3554  		vm_fault_t tmp;
3555  
3556  		pte_unmap_unlock(vmf->pte, vmf->ptl);
3557  		tmp = vmf_can_call_fault(vmf);
3558  		if (tmp) {
3559  			folio_put(folio);
3560  			return tmp;
3561  		}
3562  
3563  		tmp = do_page_mkwrite(vmf, folio);
3564  		if (unlikely(!tmp || (tmp &
3565  				      (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
3566  			folio_put(folio);
3567  			return tmp;
3568  		}
3569  		tmp = finish_mkwrite_fault(vmf, folio);
3570  		if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
3571  			folio_unlock(folio);
3572  			folio_put(folio);
3573  			return tmp;
3574  		}
3575  	} else {
3576  		wp_page_reuse(vmf, folio);
3577  		folio_lock(folio);
3578  	}
3579  	ret |= fault_dirty_shared_page(vmf);
3580  	folio_put(folio);
3581  
3582  	return ret;
3583  }
3584  
3585  static bool wp_can_reuse_anon_folio(struct folio *folio,
3586  				    struct vm_area_struct *vma)
3587  {
3588  	/*
3589  	 * We could currently only reuse a subpage of a large folio if no
3590   * other subpages of the large folio are still mapped. However,
3591  	 * let's just consistently not reuse subpages even if we could
3592  	 * reuse in that scenario, and give back a large folio a bit
3593  	 * sooner.
3594  	 */
3595  	if (folio_test_large(folio))
3596  		return false;
3597  
3598  	/*
3599  	 * We have to verify under folio lock: these early checks are
3600  	 * just an optimization to avoid locking the folio and freeing
3601  	 * the swapcache if there is little hope that we can reuse.
3602  	 *
3603  	 * KSM doesn't necessarily raise the folio refcount.
3604  	 */
3605  	if (folio_test_ksm(folio) || folio_ref_count(folio) > 3)
3606  		return false;
3607  	if (!folio_test_lru(folio))
3608  		/*
3609  		 * We cannot easily detect+handle references from
3610  		 * remote LRU caches or references to LRU folios.
3611  		 */
3612  		lru_add_drain();
3613  	if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio))
3614  		return false;
3615  	if (!folio_trylock(folio))
3616  		return false;
3617  	if (folio_test_swapcache(folio))
3618  		folio_free_swap(folio);
3619  	if (folio_test_ksm(folio) || folio_ref_count(folio) != 1) {
3620  		folio_unlock(folio);
3621  		return false;
3622  	}
3623  	/*
3624  	 * Ok, we've got the only folio reference from our mapping
3625  	 * and the folio is locked, it's dark out, and we're wearing
3626  	 * sunglasses. Hit it.
3627  	 */
3628  	folio_move_anon_rmap(folio, vma);
3629  	folio_unlock(folio);
3630  	return true;
3631  }
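
/*
 * Editorial note (illustrative, not part of the original source): the
 * reference counting above can be read as a small ledger for an order-0
 * anonymous folio:
 *
 *	1				our page-table mapping
 *	+ folio_test_swapcache()	one more if it sits in the swapcache
 *
 * so "folio_ref_count(folio) > 1 + folio_test_swapcache(folio)" means
 * somebody else (another mapping, a GUP pin, a remote LRU cache, ...)
 * still holds a reference and reuse is not attempted.  After the
 * swapcache reference is dropped under the folio lock, the final check
 * insists on exactly one remaining reference before handing the folio
 * back to this VMA via folio_move_anon_rmap().
 */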
3632  
3633  /*
3634   * This routine handles present pages, when
3635   * * users try to write to a shared page (FAULT_FLAG_WRITE)
3636   * * GUP wants to take a R/O pin on a possibly shared anonymous page
3637   *   (FAULT_FLAG_UNSHARE)
3638   *
3639   * It is done by copying the page to a new address and decrementing the
3640   * shared-page counter for the old page.
3641   *
3642   * Note that this routine assumes that the protection checks have been
3643   * done by the caller (the low-level page fault routine in most cases).
3644   * Thus, with FAULT_FLAG_WRITE, we can safely just mark it writable once we've
3645   * done any necessary COW.
3646   *
3647   * In case of FAULT_FLAG_WRITE, we also mark the page dirty at this point even
3648   * though the page will change only once the write actually happens. This
3649   * avoids a few races, and potentially makes it more efficient.
3650   *
3651   * We enter with non-exclusive mmap_lock (to exclude vma changes,
3652   * but allow concurrent faults), with pte both mapped and locked.
3653   * We return with mmap_lock still held, but pte unmapped and unlocked.
3654   */
3655  static vm_fault_t do_wp_page(struct vm_fault *vmf)
3656  	__releases(vmf->ptl)
3657  {
3658  	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
3659  	struct vm_area_struct *vma = vmf->vma;
3660  	struct folio *folio = NULL;
3661  	pte_t pte;
3662  
3663  	if (likely(!unshare)) {
3664  		if (userfaultfd_pte_wp(vma, ptep_get(vmf->pte))) {
3665  			if (!userfaultfd_wp_async(vma)) {
3666  				pte_unmap_unlock(vmf->pte, vmf->ptl);
3667  				return handle_userfault(vmf, VM_UFFD_WP);
3668  			}
3669  
3670  			/*
3671  			 * Nothing needed (cache flush, TLB invalidations,
3672  			 * etc.) because we're only removing the uffd-wp bit,
3673  			 * which is completely invisible to the user.
3674  			 */
3675  			pte = pte_clear_uffd_wp(ptep_get(vmf->pte));
3676  
3677  			set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
3678  			/*
3679  			 * Update this to be prepared for following up CoW
3680  			 * handling
3681  			 */
3682  			vmf->orig_pte = pte;
3683  		}
3684  
3685  		/*
3686  		 * Userfaultfd write-protect can defer flushes. Ensure the TLB
3687  		 * is flushed in this case before copying.
3688  		 */
3689  		if (unlikely(userfaultfd_wp(vmf->vma) &&
3690  			     mm_tlb_flush_pending(vmf->vma->vm_mm)))
3691  			flush_tlb_page(vmf->vma, vmf->address);
3692  	}
3693  
3694  	vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
3695  
3696  	if (vmf->page)
3697  		folio = page_folio(vmf->page);
3698  
3699  	/*
3700  	 * Shared mapping: we are guaranteed to have VM_WRITE and
3701  	 * FAULT_FLAG_WRITE set at this point.
3702  	 */
3703  	if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
3704  		/*
3705  		 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
3706  		 * VM_PFNMAP VMA.
3707  		 *
3708  		 * We should not cow pages in a shared writeable mapping.
3709  		 * Just mark the pages writable and/or call ops->pfn_mkwrite.
3710  		 */
3711  		if (!vmf->page)
3712  			return wp_pfn_shared(vmf);
3713  		return wp_page_shared(vmf, folio);
3714  	}
3715  
3716  	/*
3717  	 * Private mapping: create an exclusive anonymous page copy if reuse
3718  	 * is impossible. We might miss VM_WRITE for FOLL_FORCE handling.
3719  	 *
3720  	 * If we encounter a page that is marked exclusive, we must reuse
3721  	 * the page without further checks.
3722  	 */
3723  	if (folio && folio_test_anon(folio) &&
3724  	    (PageAnonExclusive(vmf->page) || wp_can_reuse_anon_folio(folio, vma))) {
3725  		if (!PageAnonExclusive(vmf->page))
3726  			SetPageAnonExclusive(vmf->page);
3727  		if (unlikely(unshare)) {
3728  			pte_unmap_unlock(vmf->pte, vmf->ptl);
3729  			return 0;
3730  		}
3731  		wp_page_reuse(vmf, folio);
3732  		return 0;
3733  	}
3734  	/*
3735  	 * Ok, we need to copy. Oh, well..
3736  	 */
3737  	if (folio)
3738  		folio_get(folio);
3739  
3740  	pte_unmap_unlock(vmf->pte, vmf->ptl);
3741  #ifdef CONFIG_KSM
3742  	if (folio && folio_test_ksm(folio))
3743  		count_vm_event(COW_KSM);
3744  #endif
3745  	return wp_page_copy(vmf);
3746  }
3747  
3748  static void unmap_mapping_range_vma(struct vm_area_struct *vma,
3749  		unsigned long start_addr, unsigned long end_addr,
3750  		struct zap_details *details)
3751  {
3752  	zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
3753  }
3754  
3755  static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
3756  					    pgoff_t first_index,
3757  					    pgoff_t last_index,
3758  					    struct zap_details *details)
3759  {
3760  	struct vm_area_struct *vma;
3761  	pgoff_t vba, vea, zba, zea;
3762  
3763  	vma_interval_tree_foreach(vma, root, first_index, last_index) {
3764  		vba = vma->vm_pgoff;
3765  		vea = vba + vma_pages(vma) - 1;
3766  		zba = max(first_index, vba);
3767  		zea = min(last_index, vea);
3768  
3769  		unmap_mapping_range_vma(vma,
3770  			((zba - vba) << PAGE_SHIFT) + vma->vm_start,
3771  			((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
3772  				details);
3773  	}
3774  }
3775  
3776  /**
3777   * unmap_mapping_folio() - Unmap single folio from processes.
3778   * @folio: The locked folio to be unmapped.
3779   *
3780   * Unmap this folio from any userspace process which still has it mmaped.
3781   * Typically, for efficiency, the range of nearby pages has already been
3782   * unmapped by unmap_mapping_pages() or unmap_mapping_range().  But once
3783   * truncation or invalidation holds the lock on a folio, it may find that
3784   * the page has been remapped again: and then uses unmap_mapping_folio()
3785   * to unmap it finally.
3786   */
3787  void unmap_mapping_folio(struct folio *folio)
3788  {
3789  	struct address_space *mapping = folio->mapping;
3790  	struct zap_details details = { };
3791  	pgoff_t	first_index;
3792  	pgoff_t	last_index;
3793  
3794  	VM_BUG_ON(!folio_test_locked(folio));
3795  
3796  	first_index = folio->index;
3797  	last_index = folio_next_index(folio) - 1;
3798  
3799  	details.even_cows = false;
3800  	details.single_folio = folio;
3801  	details.zap_flags = ZAP_FLAG_DROP_MARKER;
3802  
3803  	i_mmap_lock_read(mapping);
3804  	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
3805  		unmap_mapping_range_tree(&mapping->i_mmap, first_index,
3806  					 last_index, &details);
3807  	i_mmap_unlock_read(mapping);
3808  }
3809  
3810  /**
3811   * unmap_mapping_pages() - Unmap pages from processes.
3812   * @mapping: The address space containing pages to be unmapped.
3813   * @start: Index of first page to be unmapped.
3814   * @nr: Number of pages to be unmapped.  0 to unmap to end of file.
3815   * @even_cows: Whether to unmap even private COWed pages.
3816   *
3817   * Unmap the pages in this address space from any userspace process which
3818   * has them mmaped.  Generally, you want to remove COWed pages as well when
3819   * a file is being truncated, but not when invalidating pages from the page
3820   * cache.
3821   */
3822  void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
3823  		pgoff_t nr, bool even_cows)
3824  {
3825  	struct zap_details details = { };
3826  	pgoff_t	first_index = start;
3827  	pgoff_t	last_index = start + nr - 1;
3828  
3829  	details.even_cows = even_cows;
3830  	if (last_index < first_index)
3831  		last_index = ULONG_MAX;
3832  
3833  	i_mmap_lock_read(mapping);
3834  	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
3835  		unmap_mapping_range_tree(&mapping->i_mmap, first_index,
3836  					 last_index, &details);
3837  	i_mmap_unlock_read(mapping);
3838  }
3839  EXPORT_SYMBOL_GPL(unmap_mapping_pages);
3840  
3841  /**
3842   * unmap_mapping_range - unmap the portion of all mmaps in the specified
3843   * address_space corresponding to the specified byte range in the underlying
3844   * file.
3845   *
3846   * @mapping: the address space containing mmaps to be unmapped.
3847   * @holebegin: byte in first page to unmap, relative to the start of
3848   * the underlying file.  This will be rounded down to a PAGE_SIZE
3849   * boundary.  Note that this is different from truncate_pagecache(), which
3850   * must keep the partial page.  In contrast, we must get rid of
3851   * partial pages.
3852   * @holelen: size of prospective hole in bytes.  This will be rounded
3853   * up to a PAGE_SIZE boundary.  A holelen of zero truncates to the
3854   * end of the file.
3855   * @even_cows: 1 when truncating a file, unmap even private COWed pages;
3856   * but 0 when invalidating pagecache, don't throw away private data.
3857   */
3858  void unmap_mapping_range(struct address_space *mapping,
3859  		loff_t const holebegin, loff_t const holelen, int even_cows)
3860  {
3861  	pgoff_t hba = (pgoff_t)(holebegin) >> PAGE_SHIFT;
3862  	pgoff_t hlen = ((pgoff_t)(holelen) + PAGE_SIZE - 1) >> PAGE_SHIFT;
3863  
3864  	/* Check for overflow. */
3865  	if (sizeof(holelen) > sizeof(hlen)) {
3866  		long long holeend =
3867  			(holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
3868  		if (holeend & ~(long long)ULONG_MAX)
3869  			hlen = ULONG_MAX - hba + 1;
3870  	}
3871  
3872  	unmap_mapping_pages(mapping, hba, hlen, even_cows);
3873  }
3874  EXPORT_SYMBOL(unmap_mapping_range);
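
/*
 * Editorial note: a hedged usage sketch, not part of the original file.
 * A filesystem truncating an inode down to "newsize" would typically
 * unmap everything past the new end-of-file before dropping the page
 * cache, roughly:
 *
 *	loff_t holebegin = round_up(newsize, PAGE_SIZE);
 *
 *	unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
 *
 * holelen == 0 means "to the end of the file", and even_cows == 1 also
 * discards private COWed copies, which is what truncation wants (see
 * unmap_mapping_pages() above).  "newsize" and "inode" are illustrative
 * names only.
 */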
3875  
3876  /*
3877   * Restore a potential device exclusive pte to a working pte entry
3878   */
3879  static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
3880  {
3881  	struct folio *folio = page_folio(vmf->page);
3882  	struct vm_area_struct *vma = vmf->vma;
3883  	struct mmu_notifier_range range;
3884  	vm_fault_t ret;
3885  
3886  	/*
3887  	 * We need a reference to lock the folio because we don't hold
3888  	 * the PTL so a racing thread can remove the device-exclusive
3889  	 * entry and unmap it. If the folio is free the entry must
3890  	 * have been removed already. If it happens to have already
3891  	 * been re-allocated after being freed all we do is lock and
3892  	 * unlock it.
3893  	 */
3894  	if (!folio_try_get(folio))
3895  		return 0;
3896  
3897  	ret = folio_lock_or_retry(folio, vmf);
3898  	if (ret) {
3899  		folio_put(folio);
3900  		return ret;
3901  	}
3902  	mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
3903  				vma->vm_mm, vmf->address & PAGE_MASK,
3904  				(vmf->address & PAGE_MASK) + PAGE_SIZE, NULL);
3905  	mmu_notifier_invalidate_range_start(&range);
3906  
3907  	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
3908  				&vmf->ptl);
3909  	if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
3910  		restore_exclusive_pte(vma, vmf->page, vmf->address, vmf->pte);
3911  
3912  	if (vmf->pte)
3913  		pte_unmap_unlock(vmf->pte, vmf->ptl);
3914  	folio_unlock(folio);
3915  	folio_put(folio);
3916  
3917  	mmu_notifier_invalidate_range_end(&range);
3918  	return 0;
3919  }
3920  
3921  static inline bool should_try_to_free_swap(struct folio *folio,
3922  					   struct vm_area_struct *vma,
3923  					   unsigned int fault_flags)
3924  {
3925  	if (!folio_test_swapcache(folio))
3926  		return false;
3927  	if (mem_cgroup_swap_full(folio) || (vma->vm_flags & VM_LOCKED) ||
3928  	    folio_test_mlocked(folio))
3929  		return true;
3930  	/*
3931  	 * If we want to map a page that's in the swapcache writable, we
3932  	 * have to detect via the refcount if we're really the exclusive
3933  	 * user. Try freeing the swapcache to get rid of the swapcache
3934  	 * reference only in case it's likely that we'll be the exclusive user.
3935  	 */
3936  	return (fault_flags & FAULT_FLAG_WRITE) && !folio_test_ksm(folio) &&
3937  		folio_ref_count(folio) == (1 + folio_nr_pages(folio));
3938  }
3939  
3940  static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
3941  {
3942  	vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
3943  				       vmf->address, &vmf->ptl);
3944  	if (!vmf->pte)
3945  		return 0;
3946  	/*
3947  	 * Be careful so that we will only recover a special uffd-wp pte into a
3948  	 * none pte.  Otherwise it means the pte could have changed, so retry.
3949  	 *
3950  	 * This should also cover the case where e.g. the pte changed
3951  	 * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_POISONED.
3952  	 * So is_pte_marker() check is not enough to safely drop the pte.
3953  	 */
3954  	if (pte_same(vmf->orig_pte, ptep_get(vmf->pte)))
3955  		pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte);
3956  	pte_unmap_unlock(vmf->pte, vmf->ptl);
3957  	return 0;
3958  }
3959  
3960  static vm_fault_t do_pte_missing(struct vm_fault *vmf)
3961  {
3962  	if (vma_is_anonymous(vmf->vma))
3963  		return do_anonymous_page(vmf);
3964  	else
3965  		return do_fault(vmf);
3966  }
3967  
3968  /*
3969   * This is actually a page-missing access, but with uffd-wp special pte
3970   * installed.  It means this pte was wr-protected before being unmapped.
3971   */
3972  static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf)
3973  {
3974  	/*
3975  	 * Just in case there're leftover special ptes even after the region
3976  	 * got unregistered - we can simply clear them.
3977  	 */
3978  	if (unlikely(!userfaultfd_wp(vmf->vma)))
3979  		return pte_marker_clear(vmf);
3980  
3981  	return do_pte_missing(vmf);
3982  }
3983  
3984  static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
3985  {
3986  	swp_entry_t entry = pte_to_swp_entry(vmf->orig_pte);
3987  	unsigned long marker = pte_marker_get(entry);
3988  
3989  	/*
3990  	 * PTE markers should never be empty.  If anything weird happened,
3991  	 * the best thing to do is to kill the process along with its mm.
3992  	 */
3993  	if (WARN_ON_ONCE(!marker))
3994  		return VM_FAULT_SIGBUS;
3995  
3996  	/* Higher priority than uffd-wp when data corrupted */
3997  	if (marker & PTE_MARKER_POISONED)
3998  		return VM_FAULT_HWPOISON;
3999  
4000  	if (pte_marker_entry_uffd_wp(entry))
4001  		return pte_marker_handle_uffd_wp(vmf);
4002  
4003  	/* This is an unknown pte marker */
4004  	return VM_FAULT_SIGBUS;
4005  }
4006  
4007  static struct folio *__alloc_swap_folio(struct vm_fault *vmf)
4008  {
4009  	struct vm_area_struct *vma = vmf->vma;
4010  	struct folio *folio;
4011  	swp_entry_t entry;
4012  
4013  	folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma,
4014  				vmf->address, false);
4015  	if (!folio)
4016  		return NULL;
4017  
4018  	entry = pte_to_swp_entry(vmf->orig_pte);
4019  	if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
4020  					   GFP_KERNEL, entry)) {
4021  		folio_put(folio);
4022  		return NULL;
4023  	}
4024  
4025  	return folio;
4026  }
4027  
4028  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
4029  static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
4030  {
4031  	struct swap_info_struct *si = swp_swap_info(entry);
4032  	pgoff_t offset = swp_offset(entry);
4033  	int i;
4034  
4035  	/*
4036  	 * When allocating a large folio and doing swap_read_folio() directly,
4037  	 * i.e. when the faulting pte has no swapcache, we need to ensure that
4038  	 * none of the other PTEs have swapcache either; otherwise, we might
4039  	 * read from the swap device while the content is in the swapcache.
4040  	 */
4041  	for (i = 0; i < max_nr; i++) {
4042  		if ((si->swap_map[offset + i] & SWAP_HAS_CACHE))
4043  			return i;
4044  	}
4045  
4046  	return i;
4047  }
4048  
4049  /*
4050   * Check if the PTEs within a range are contiguous swap entries
4051   * and have consistent swapcache, zeromap.
4052   */
4053  static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
4054  {
4055  	unsigned long addr;
4056  	swp_entry_t entry;
4057  	int idx;
4058  	pte_t pte;
4059  
4060  	addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
4061  	idx = (vmf->address - addr) / PAGE_SIZE;
4062  	pte = ptep_get(ptep);
4063  
4064  	if (!pte_same(pte, pte_move_swp_offset(vmf->orig_pte, -idx)))
4065  		return false;
4066  	entry = pte_to_swp_entry(pte);
4067  	if (swap_pte_batch(ptep, nr_pages, pte) != nr_pages)
4068  		return false;
4069  
4070  	/*
4071  	 * swap_read_folio() can't handle the case where a large folio is
4072  	 * backed by a mix of different backends, and those are likely corner
4073  	 * cases. Similar checks might be added once zswap supports large folios.
4074  	 */
4075  	if (unlikely(swap_zeromap_batch(entry, nr_pages, NULL) != nr_pages))
4076  		return false;
4077  	if (unlikely(non_swapcache_batch(entry, nr_pages) != nr_pages))
4078  		return false;
4079  
4080  	return true;
4081  }
4082  
4083  static inline unsigned long thp_swap_suitable_orders(pgoff_t swp_offset,
4084  						     unsigned long addr,
4085  						     unsigned long orders)
4086  {
4087  	int order, nr;
4088  
4089  	order = highest_order(orders);
4090  
4091  	/*
4092  	 * To swap in a THP with nr pages, we require that its first swap_offset
4093  	 * is aligned with that number, as it was when the THP was swapped out.
4094  	 * This helps filter out most invalid entries.
4095  	 */
4096  	while (orders) {
4097  		nr = 1 << order;
4098  		if ((addr >> PAGE_SHIFT) % nr == swp_offset % nr)
4099  			break;
4100  		order = next_order(&orders, order);
4101  	}
4102  
4103  	return orders;
4104  }
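
/*
 * Editorial note (illustrative, not part of the original source): a
 * worked example of the alignment filter above.  Suppose the faulting
 * address has (addr >> PAGE_SHIFT) == 1030, the swap entry has
 * swp_offset == 14, and orders {4, 2} are enabled:
 *
 *	order 4: nr = 16, 1030 % 16 == 6, 14 % 16 == 14  -> rejected
 *	order 2: nr = 4,  1030 % 4  == 2, 14 % 4  == 2   -> accepted
 *
 * The returned mask therefore keeps order 2 (plus any lower orders that
 * were already set): only folios whose first swap offset was naturally
 * aligned at swap-out time can be swapped in again as one large folio.
 */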
4105  
4106  static struct folio *alloc_swap_folio(struct vm_fault *vmf)
4107  {
4108  	struct vm_area_struct *vma = vmf->vma;
4109  	unsigned long orders;
4110  	struct folio *folio;
4111  	unsigned long addr;
4112  	swp_entry_t entry;
4113  	spinlock_t *ptl;
4114  	pte_t *pte;
4115  	gfp_t gfp;
4116  	int order;
4117  
4118  	/*
4119  	 * If uffd is active for the vma we need per-page fault fidelity to
4120  	 * maintain the uffd semantics.
4121  	 */
4122  	if (unlikely(userfaultfd_armed(vma)))
4123  		goto fallback;
4124  
4125  	/*
4126  	 * A large swapped out folio could be partially or fully in zswap. We
4127  	 * lack handling for such cases, so fallback to swapping in order-0
4128  	 * folio.
4129  	 */
4130  	if (!zswap_never_enabled())
4131  		goto fallback;
4132  
4133  	entry = pte_to_swp_entry(vmf->orig_pte);
4134  	/*
4135  	 * Get a list of all the (large) orders below PMD_ORDER that are enabled
4136  	 * and suitable for swapping THP.
4137  	 */
4138  	orders = thp_vma_allowable_orders(vma, vma->vm_flags,
4139  			TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1);
4140  	orders = thp_vma_suitable_orders(vma, vmf->address, orders);
4141  	orders = thp_swap_suitable_orders(swp_offset(entry),
4142  					  vmf->address, orders);
4143  
4144  	if (!orders)
4145  		goto fallback;
4146  
4147  	pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
4148  				  vmf->address & PMD_MASK, &ptl);
4149  	if (unlikely(!pte))
4150  		goto fallback;
4151  
4152  	/*
4153  	 * For do_swap_page, find the highest order where the aligned range is
4154  	 * completely swap entries with contiguous swap offsets.
4155  	 */
4156  	order = highest_order(orders);
4157  	while (orders) {
4158  		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
4159  		if (can_swapin_thp(vmf, pte + pte_index(addr), 1 << order))
4160  			break;
4161  		order = next_order(&orders, order);
4162  	}
4163  
4164  	pte_unmap_unlock(pte, ptl);
4165  
4166  	/* Try allocating the highest of the remaining orders. */
4167  	gfp = vma_thp_gfp_mask(vma);
4168  	while (orders) {
4169  		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
4170  		folio = vma_alloc_folio(gfp, order, vma, addr, true);
4171  		if (folio) {
4172  			if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
4173  							    gfp, entry))
4174  				return folio;
4175  			folio_put(folio);
4176  		}
4177  		order = next_order(&orders, order);
4178  	}
4179  
4180  fallback:
4181  	return __alloc_swap_folio(vmf);
4182  }
4183  #else /* !CONFIG_TRANSPARENT_HUGEPAGE */
4184  static struct folio *alloc_swap_folio(struct vm_fault *vmf)
4185  {
4186  	return __alloc_swap_folio(vmf);
4187  }
4188  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
4189  
4190  static DECLARE_WAIT_QUEUE_HEAD(swapcache_wq);
4191  
4192  /*
4193   * We enter with non-exclusive mmap_lock (to exclude vma changes,
4194   * but allow concurrent faults), and pte mapped but not yet locked.
4195   * We return with pte unmapped and unlocked.
4196   *
4197   * We return with the mmap_lock locked or unlocked in the same cases
4198   * as does filemap_fault().
4199   */
4200  vm_fault_t do_swap_page(struct vm_fault *vmf)
4201  {
4202  	struct vm_area_struct *vma = vmf->vma;
4203  	struct folio *swapcache, *folio = NULL;
4204  	DECLARE_WAITQUEUE(wait, current);
4205  	struct page *page;
4206  	struct swap_info_struct *si = NULL;
4207  	rmap_t rmap_flags = RMAP_NONE;
4208  	bool need_clear_cache = false;
4209  	bool exclusive = false;
4210  	swp_entry_t entry;
4211  	pte_t pte;
4212  	vm_fault_t ret = 0;
4213  	void *shadow = NULL;
4214  	int nr_pages;
4215  	unsigned long page_idx;
4216  	unsigned long address;
4217  	pte_t *ptep;
4218  
4219  	if (!pte_unmap_same(vmf))
4220  		goto out;
4221  
4222  	entry = pte_to_swp_entry(vmf->orig_pte);
4223  	if (unlikely(non_swap_entry(entry))) {
4224  		if (is_migration_entry(entry)) {
4225  			migration_entry_wait(vma->vm_mm, vmf->pmd,
4226  					     vmf->address);
4227  		} else if (is_device_exclusive_entry(entry)) {
4228  			vmf->page = pfn_swap_entry_to_page(entry);
4229  			ret = remove_device_exclusive_entry(vmf);
4230  		} else if (is_device_private_entry(entry)) {
4231  			if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
4232  				/*
4233  				 * migrate_to_ram is not yet ready to operate
4234  				 * under VMA lock.
4235  				 */
4236  				vma_end_read(vma);
4237  				ret = VM_FAULT_RETRY;
4238  				goto out;
4239  			}
4240  
4241  			vmf->page = pfn_swap_entry_to_page(entry);
4242  			vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
4243  					vmf->address, &vmf->ptl);
4244  			if (unlikely(!vmf->pte ||
4245  				     !pte_same(ptep_get(vmf->pte),
4246  							vmf->orig_pte)))
4247  				goto unlock;
4248  
4249  			/*
4250  			 * Get a page reference while we know the page can't be
4251  			 * freed.
4252  			 */
4253  			get_page(vmf->page);
4254  			pte_unmap_unlock(vmf->pte, vmf->ptl);
4255  			ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
4256  			put_page(vmf->page);
4257  		} else if (is_hwpoison_entry(entry)) {
4258  			ret = VM_FAULT_HWPOISON;
4259  		} else if (is_pte_marker_entry(entry)) {
4260  			ret = handle_pte_marker(vmf);
4261  		} else {
4262  			print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
4263  			ret = VM_FAULT_SIGBUS;
4264  		}
4265  		goto out;
4266  	}
4267  
4268  	/* Prevent swapoff from happening to us. */
4269  	si = get_swap_device(entry);
4270  	if (unlikely(!si))
4271  		goto out;
4272  
4273  	folio = swap_cache_get_folio(entry, vma, vmf->address);
4274  	if (folio)
4275  		page = folio_file_page(folio, swp_offset(entry));
4276  	swapcache = folio;
4277  
4278  	if (!folio) {
4279  		if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
4280  		    __swap_count(entry) == 1) {
4281  			/* skip swapcache */
4282  			folio = alloc_swap_folio(vmf);
4283  			if (folio) {
4284  				__folio_set_locked(folio);
4285  				__folio_set_swapbacked(folio);
4286  
4287  				nr_pages = folio_nr_pages(folio);
4288  				if (folio_test_large(folio))
4289  					entry.val = ALIGN_DOWN(entry.val, nr_pages);
4290  				/*
4291  				 * Prevent parallel swapin from proceeding with
4292  				 * the cache flag. Otherwise, another thread
4293  				 * may finish swapin first, free the entry, and
4294  				 * swapout reusing the same entry. It's
4295  				 * undetectable as pte_same() returns true due
4296  				 * to entry reuse.
4297  				 */
4298  				if (swapcache_prepare(entry, nr_pages)) {
4299  					/*
4300  					 * Relax a bit to prevent rapid
4301  					 * repeated page faults.
4302  					 */
4303  					add_wait_queue(&swapcache_wq, &wait);
4304  					schedule_timeout_uninterruptible(1);
4305  					remove_wait_queue(&swapcache_wq, &wait);
4306  					goto out_page;
4307  				}
4308  				need_clear_cache = true;
4309  
4310  				mem_cgroup_swapin_uncharge_swap(entry, nr_pages);
4311  
4312  				shadow = get_shadow_from_swap_cache(entry);
4313  				if (shadow)
4314  					workingset_refault(folio, shadow);
4315  
4316  				folio_add_lru(folio);
4317  
4318  				/* To provide entry to swap_read_folio() */
4319  				folio->swap = entry;
4320  				swap_read_folio(folio, NULL);
4321  				folio->private = NULL;
4322  			}
4323  		} else {
4324  			folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
4325  						vmf);
4326  			swapcache = folio;
4327  		}
4328  
4329  		if (!folio) {
4330  			/*
4331  			 * Back out if somebody else faulted in this pte
4332  			 * while we released the pte lock.
4333  			 */
4334  			vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
4335  					vmf->address, &vmf->ptl);
4336  			if (likely(vmf->pte &&
4337  				   pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
4338  				ret = VM_FAULT_OOM;
4339  			goto unlock;
4340  		}
4341  
4342  		/* Had to read the page from swap area: Major fault */
4343  		ret = VM_FAULT_MAJOR;
4344  		count_vm_event(PGMAJFAULT);
4345  		count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
4346  		page = folio_file_page(folio, swp_offset(entry));
4347  	} else if (PageHWPoison(page)) {
4348  		/*
4349  		 * hwpoisoned dirty swapcache pages are kept for killing
4350  		 * owner processes (which may be unknown at hwpoison time)
4351  		 */
4352  		ret = VM_FAULT_HWPOISON;
4353  		goto out_release;
4354  	}
4355  
4356  	ret |= folio_lock_or_retry(folio, vmf);
4357  	if (ret & VM_FAULT_RETRY)
4358  		goto out_release;
4359  
4360  	if (swapcache) {
4361  		/*
4362  		 * Make sure folio_free_swap() or swapoff did not release the
4363  		 * swapcache from under us.  The page pin, and pte_same test
4364  		 * below, are not enough to exclude that.  Even if it is still
4365  		 * swapcache, we need to check that the page's swap has not
4366  		 * changed.
4367  		 */
4368  		if (unlikely(!folio_test_swapcache(folio) ||
4369  			     page_swap_entry(page).val != entry.val))
4370  			goto out_page;
4371  
4372  		/*
4373  		 * KSM sometimes has to copy on read faults, for example, if
4374  		 * page->index of !PageKSM() pages would be nonlinear inside the
4375  		 * anon VMA -- PageKSM() is lost on actual swapout.
4376  		 */
4377  		folio = ksm_might_need_to_copy(folio, vma, vmf->address);
4378  		if (unlikely(!folio)) {
4379  			ret = VM_FAULT_OOM;
4380  			folio = swapcache;
4381  			goto out_page;
4382  		} else if (unlikely(folio == ERR_PTR(-EHWPOISON))) {
4383  			ret = VM_FAULT_HWPOISON;
4384  			folio = swapcache;
4385  			goto out_page;
4386  		}
4387  		if (folio != swapcache)
4388  			page = folio_page(folio, 0);
4389  
4390  		/*
4391  		 * If we want to map a page that's in the swapcache writable, we
4392  		 * have to detect via the refcount if we're really the exclusive
4393  		 * owner. Try removing the extra reference from the local LRU
4394  		 * caches if required.
4395  		 */
4396  		if ((vmf->flags & FAULT_FLAG_WRITE) && folio == swapcache &&
4397  		    !folio_test_ksm(folio) && !folio_test_lru(folio))
4398  			lru_add_drain();
4399  	}
4400  
4401  	folio_throttle_swaprate(folio, GFP_KERNEL);
4402  
4403  	/*
4404  	 * Back out if somebody else already faulted in this pte.
4405  	 */
4406  	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
4407  			&vmf->ptl);
4408  	if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
4409  		goto out_nomap;
4410  
4411  	if (unlikely(!folio_test_uptodate(folio))) {
4412  		ret = VM_FAULT_SIGBUS;
4413  		goto out_nomap;
4414  	}
4415  
4416  	/* allocated large folios for SWP_SYNCHRONOUS_IO */
4417  	if (folio_test_large(folio) && !folio_test_swapcache(folio)) {
4418  		unsigned long nr = folio_nr_pages(folio);
4419  		unsigned long folio_start = ALIGN_DOWN(vmf->address, nr * PAGE_SIZE);
4420  		unsigned long idx = (vmf->address - folio_start) / PAGE_SIZE;
4421  		pte_t *folio_ptep = vmf->pte - idx;
4422  		pte_t folio_pte = ptep_get(folio_ptep);
4423  
4424  		if (!pte_same(folio_pte, pte_move_swp_offset(vmf->orig_pte, -idx)) ||
4425  		    swap_pte_batch(folio_ptep, nr, folio_pte) != nr)
4426  			goto out_nomap;
4427  
4428  		page_idx = idx;
4429  		address = folio_start;
4430  		ptep = folio_ptep;
4431  		goto check_folio;
4432  	}
4433  
4434  	nr_pages = 1;
4435  	page_idx = 0;
4436  	address = vmf->address;
4437  	ptep = vmf->pte;
4438  	if (folio_test_large(folio) && folio_test_swapcache(folio)) {
4439  		int nr = folio_nr_pages(folio);
4440  		unsigned long idx = folio_page_idx(folio, page);
4441  		unsigned long folio_start = address - idx * PAGE_SIZE;
4442  		unsigned long folio_end = folio_start + nr * PAGE_SIZE;
4443  		pte_t *folio_ptep;
4444  		pte_t folio_pte;
4445  
4446  		if (unlikely(folio_start < max(address & PMD_MASK, vma->vm_start)))
4447  			goto check_folio;
4448  		if (unlikely(folio_end > pmd_addr_end(address, vma->vm_end)))
4449  			goto check_folio;
4450  
4451  		folio_ptep = vmf->pte - idx;
4452  		folio_pte = ptep_get(folio_ptep);
4453  		if (!pte_same(folio_pte, pte_move_swp_offset(vmf->orig_pte, -idx)) ||
4454  		    swap_pte_batch(folio_ptep, nr, folio_pte) != nr)
4455  			goto check_folio;
4456  
4457  		page_idx = idx;
4458  		address = folio_start;
4459  		ptep = folio_ptep;
4460  		nr_pages = nr;
4461  		entry = folio->swap;
4462  		page = &folio->page;
4463  	}
4464  
4465  check_folio:
4466  	/*
4467  	 * PG_anon_exclusive reuses PG_mappedtodisk for anon pages. A swap pte
4468  	 * must never point at an anonymous page in the swapcache that is
4469  	 * PG_anon_exclusive. Sanity check that this holds and especially, that
4470  	 * no filesystem set PG_mappedtodisk on a page in the swapcache. Sanity
4471  	 * check after taking the PT lock and making sure that nobody
4472  	 * concurrently faulted in this page and set PG_anon_exclusive.
4473  	 */
4474  	BUG_ON(!folio_test_anon(folio) && folio_test_mappedtodisk(folio));
4475  	BUG_ON(folio_test_anon(folio) && PageAnonExclusive(page));
4476  
4477  	/*
4478  	 * Check under PT lock (to protect against concurrent fork() sharing
4479  	 * the swap entry concurrently) for certainly exclusive pages.
4480  	 */
4481  	if (!folio_test_ksm(folio)) {
4482  		exclusive = pte_swp_exclusive(vmf->orig_pte);
4483  		if (folio != swapcache) {
4484  			/*
4485  			 * We have a fresh page that is not exposed to the
4486  			 * swapcache -> certainly exclusive.
4487  			 */
4488  			exclusive = true;
4489  		} else if (exclusive && folio_test_writeback(folio) &&
4490  			  data_race(si->flags & SWP_STABLE_WRITES)) {
4491  			/*
4492  			 * This is tricky: not all swap backends support
4493  			 * concurrent page modifications while under writeback.
4494  			 *
4495  			 * So if we stumble over such a page in the swapcache
4496  			 * we must not set the page exclusive, otherwise we can
4497  			 * map it writable without further checks and modify it
4498  			 * while still under writeback.
4499  			 *
4500  			 * For these problematic swap backends, simply drop the
4501  			 * exclusive marker: this is perfectly fine as we start
4502  			 * writeback only if we fully unmapped the page and
4503  			 * there are no unexpected references on the page after
4504  			 * unmapping succeeded. After fully unmapped, no
4505  			 * further GUP references (FOLL_GET and FOLL_PIN) can
4506  			 * appear, so dropping the exclusive marker and mapping
4507  			 * it only R/O is fine.
4508  			 */
4509  			exclusive = false;
4510  		}
4511  	}
4512  
4513  	/*
4514  	 * Some architectures may have to restore extra metadata to the page
4515  	 * when reading from swap. This metadata may be indexed by swap entry
4516  	 * so this must be called before swap_free().
4517  	 */
4518  	arch_swap_restore(folio_swap(entry, folio), folio);
4519  
4520  	/*
4521  	 * Remove the swap entry and conditionally try to free up the swapcache.
4522  	 * We're already holding a reference on the page but haven't mapped it
4523  	 * yet.
4524  	 */
4525  	swap_free_nr(entry, nr_pages);
4526  	if (should_try_to_free_swap(folio, vma, vmf->flags))
4527  		folio_free_swap(folio);
4528  
4529  	add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
4530  	add_mm_counter(vma->vm_mm, MM_SWAPENTS, -nr_pages);
4531  	pte = mk_pte(page, vma->vm_page_prot);
4532  	if (pte_swp_soft_dirty(vmf->orig_pte))
4533  		pte = pte_mksoft_dirty(pte);
4534  	if (pte_swp_uffd_wp(vmf->orig_pte))
4535  		pte = pte_mkuffd_wp(pte);
4536  
4537  	/*
4538  	 * Same logic as in do_wp_page(); however, optimize for pages that are
4539  	 * certainly not shared either because we just allocated them without
4540  	 * exposing them to the swapcache or because the swap entry indicates
4541  	 * exclusivity.
4542  	 */
4543  	if (!folio_test_ksm(folio) &&
4544  	    (exclusive || folio_ref_count(folio) == 1)) {
4545  		if ((vma->vm_flags & VM_WRITE) && !userfaultfd_pte_wp(vma, pte) &&
4546  		    !pte_needs_soft_dirty_wp(vma, pte)) {
4547  			pte = pte_mkwrite(pte, vma);
4548  			if (vmf->flags & FAULT_FLAG_WRITE) {
4549  				pte = pte_mkdirty(pte);
4550  				vmf->flags &= ~FAULT_FLAG_WRITE;
4551  			}
4552  		}
4553  		rmap_flags |= RMAP_EXCLUSIVE;
4554  	}
4555  	folio_ref_add(folio, nr_pages - 1);
4556  	flush_icache_pages(vma, page, nr_pages);
4557  	vmf->orig_pte = pte_advance_pfn(pte, page_idx);
4558  
4559  	/* ksm created a completely new copy */
4560  	if (unlikely(folio != swapcache && swapcache)) {
4561  		folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE);
4562  		folio_add_lru_vma(folio, vma);
4563  	} else if (!folio_test_anon(folio)) {
4564  		/*
4565  		 * We currently only expect small !anon folios which are either
4566  		 * fully exclusive or fully shared, or newly allocated large
4567  		 * folios which are fully exclusive. If we ever get large
4568  		 * folios within swapcache here, we have to be careful.
4569  		 */
4570  		VM_WARN_ON_ONCE(folio_test_large(folio) && folio_test_swapcache(folio));
4571  		VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
4572  		folio_add_new_anon_rmap(folio, vma, address, rmap_flags);
4573  	} else {
4574  		folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, address,
4575  					rmap_flags);
4576  	}
4577  
4578  	VM_BUG_ON(!folio_test_anon(folio) ||
4579  			(pte_write(pte) && !PageAnonExclusive(page)));
4580  	set_ptes(vma->vm_mm, address, ptep, pte, nr_pages);
4581  	arch_do_swap_page_nr(vma->vm_mm, vma, address,
4582  			pte, pte, nr_pages);
4583  
4584  	folio_unlock(folio);
4585  	if (folio != swapcache && swapcache) {
4586  		/*
4587  		 * Hold the lock to avoid the swap entry being reused
4588  		 * until we take the PT lock for the pte_same() check
4589  		 * (to avoid false positives from pte_same). For
4590  		 * further safety release the lock after the swap_free
4591  		 * so that the swap count won't change under a
4592  		 * parallel locked swapcache.
4593  		 */
4594  		folio_unlock(swapcache);
4595  		folio_put(swapcache);
4596  	}
4597  
4598  	if (vmf->flags & FAULT_FLAG_WRITE) {
4599  		ret |= do_wp_page(vmf);
4600  		if (ret & VM_FAULT_ERROR)
4601  			ret &= VM_FAULT_ERROR;
4602  		goto out;
4603  	}
4604  
4605  	/* No need to invalidate - it was non-present before */
4606  	update_mmu_cache_range(vmf, vma, address, ptep, nr_pages);
4607  unlock:
4608  	if (vmf->pte)
4609  		pte_unmap_unlock(vmf->pte, vmf->ptl);
4610  out:
4611  	/* Clear the swap cache pin for direct swapin after PTL unlock */
4612  	if (need_clear_cache) {
4613  		swapcache_clear(si, entry, nr_pages);
4614  		if (waitqueue_active(&swapcache_wq))
4615  			wake_up(&swapcache_wq);
4616  	}
4617  	if (si)
4618  		put_swap_device(si);
4619  	return ret;
4620  out_nomap:
4621  	if (vmf->pte)
4622  		pte_unmap_unlock(vmf->pte, vmf->ptl);
4623  out_page:
4624  	folio_unlock(folio);
4625  out_release:
4626  	folio_put(folio);
4627  	if (folio != swapcache && swapcache) {
4628  		folio_unlock(swapcache);
4629  		folio_put(swapcache);
4630  	}
4631  	if (need_clear_cache) {
4632  		swapcache_clear(si, entry, nr_pages);
4633  		if (waitqueue_active(&swapcache_wq))
4634  			wake_up(&swapcache_wq);
4635  	}
4636  	if (si)
4637  		put_swap_device(si);
4638  	return ret;
4639  }
4640  
4641  static bool pte_range_none(pte_t *pte, int nr_pages)
4642  {
4643  	int i;
4644  
4645  	for (i = 0; i < nr_pages; i++) {
4646  		if (!pte_none(ptep_get_lockless(pte + i)))
4647  			return false;
4648  	}
4649  
4650  	return true;
4651  }
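
/*
 * Illustrative sketch (an assumption, not kernel code): a caller that wants
 * to install a batch of 1 << order PTEs aligns the faulting address down and
 * verifies that the whole aligned range is still empty, roughly like so;
 * "order" and "vmf" stand in for whatever the surrounding fault handler
 * provides:
 *
 *	unsigned long aligned = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
 *	pte_t *first = vmf->pte - ((vmf->address - aligned) >> PAGE_SHIFT);
 *
 *	if (pte_range_none(first, 1 << order)) {
 *		// safe to set_ptes() over the whole range under the PT lock
 *	}
 *
 * This mirrors how alloc_anon_folio() and do_anonymous_page() below probe
 * candidate ranges before mapping a large anonymous folio.
 */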
4652  
4653  static struct folio *alloc_anon_folio(struct vm_fault *vmf)
4654  {
4655  	struct vm_area_struct *vma = vmf->vma;
4656  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
4657  	unsigned long orders;
4658  	struct folio *folio;
4659  	unsigned long addr;
4660  	pte_t *pte;
4661  	gfp_t gfp;
4662  	int order;
4663  
4664  	/*
4665  	 * If uffd is active for the vma we need per-page fault fidelity to
4666  	 * maintain the uffd semantics.
4667  	 */
4668  	if (unlikely(userfaultfd_armed(vma)))
4669  		goto fallback;
4670  
4671  	/*
4672  	 * Get a list of all the (large) orders below PMD_ORDER that are enabled
4673  	 * for this vma. Then filter out the orders that can't be allocated over
4674  	 * the faulting address and still be fully contained in the vma.
4675  	 */
4676  	orders = thp_vma_allowable_orders(vma, vma->vm_flags,
4677  			TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1);
4678  	orders = thp_vma_suitable_orders(vma, vmf->address, orders);
4679  
4680  	if (!orders)
4681  		goto fallback;
4682  
4683  	pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK);
4684  	if (!pte)
4685  		return ERR_PTR(-EAGAIN);
4686  
4687  	/*
4688  	 * Find the highest order where the aligned range is completely
4689  	 * pte_none(). Note that all remaining orders will be completely
4690  	 * pte_none().
4691  	 */
4692  	order = highest_order(orders);
4693  	while (orders) {
4694  		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
4695  		if (pte_range_none(pte + pte_index(addr), 1 << order))
4696  			break;
4697  		order = next_order(&orders, order);
4698  	}
4699  
4700  	pte_unmap(pte);
4701  
4702  	if (!orders)
4703  		goto fallback;
4704  
4705  	/* Try allocating the highest of the remaining orders. */
4706  	gfp = vma_thp_gfp_mask(vma);
4707  	while (orders) {
4708  		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
4709  		folio = vma_alloc_folio(gfp, order, vma, addr, true);
4710  		if (folio) {
4711  			if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
4712  				count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
4713  				folio_put(folio);
4714  				goto next;
4715  			}
4716  			folio_throttle_swaprate(folio, gfp);
4717  			folio_zero_user(folio, vmf->address);
4718  			return folio;
4719  		}
4720  next:
4721  		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
4722  		order = next_order(&orders, order);
4723  	}
4724  
4725  fallback:
4726  #endif
4727  	return folio_prealloc(vma->vm_mm, vma, vmf->address, true);
4728  }
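
/*
 * Worked example (illustrative assumption: 4K pages, PMD_ORDER == 9): if
 * sysfs enables 64K and 2M anonymous THP for this VMA, the BIT(PMD_ORDER) - 1
 * mask strips the 2M order, leaving orders == BIT(4).  The scan above then
 * aligns the fault address down to 64K and checks the 16 covering PTEs:
 *
 *	addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << 4);
 *	if (pte_range_none(pte + pte_index(addr), 1 << 4))
 *		// try to allocate a 64K folio
 *
 * If any of those PTEs is populated, or the allocation/charge fails,
 * next_order() clears bit 4 and we fall back to a single order-0 folio.
 */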
4729  
4730  /*
4731   * We enter with non-exclusive mmap_lock (to exclude vma changes,
4732   * but allow concurrent faults), and pte mapped but not yet locked.
4733   * We return with mmap_lock still held, but pte unmapped and unlocked.
4734   */
4735  static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
4736  {
4737  	struct vm_area_struct *vma = vmf->vma;
4738  	unsigned long addr = vmf->address;
4739  	struct folio *folio;
4740  	vm_fault_t ret = 0;
4741  	int nr_pages = 1;
4742  	pte_t entry;
4743  
4744  	/* File mapping without ->vm_ops ? */
4745  	if (vma->vm_flags & VM_SHARED)
4746  		return VM_FAULT_SIGBUS;
4747  
4748  	/*
4749  	 * Use pte_alloc() instead of pte_alloc_map(), so that OOM can
4750  	 * be distinguished from a transient failure of pte_offset_map().
4751  	 */
4752  	if (pte_alloc(vma->vm_mm, vmf->pmd))
4753  		return VM_FAULT_OOM;
4754  
4755  	/* Use the zero-page for reads */
4756  	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
4757  			!mm_forbids_zeropage(vma->vm_mm)) {
4758  		entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
4759  						vma->vm_page_prot));
4760  		vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
4761  				vmf->address, &vmf->ptl);
4762  		if (!vmf->pte)
4763  			goto unlock;
4764  		if (vmf_pte_changed(vmf)) {
4765  			update_mmu_tlb(vma, vmf->address, vmf->pte);
4766  			goto unlock;
4767  		}
4768  		ret = check_stable_address_space(vma->vm_mm);
4769  		if (ret)
4770  			goto unlock;
4771  		/* Deliver the page fault to userland, check inside PT lock */
4772  		if (userfaultfd_missing(vma)) {
4773  			pte_unmap_unlock(vmf->pte, vmf->ptl);
4774  			return handle_userfault(vmf, VM_UFFD_MISSING);
4775  		}
4776  		goto setpte;
4777  	}
4778  
4779  	/* Allocate our own private page. */
4780  	ret = vmf_anon_prepare(vmf);
4781  	if (ret)
4782  		return ret;
4783  	/* Returns NULL on OOM or ERR_PTR(-EAGAIN) if we must retry the fault */
4784  	folio = alloc_anon_folio(vmf);
4785  	if (IS_ERR(folio))
4786  		return 0;
4787  	if (!folio)
4788  		goto oom;
4789  
4790  	nr_pages = folio_nr_pages(folio);
4791  	addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
4792  
4793  	/*
4794  	 * The memory barrier inside __folio_mark_uptodate makes sure that
4795  	 * preceding stores to the page contents become visible before
4796  	 * the set_pte_at() write.
4797  	 */
4798  	__folio_mark_uptodate(folio);
4799  
4800  	entry = mk_pte(&folio->page, vma->vm_page_prot);
4801  	entry = pte_sw_mkyoung(entry);
4802  	if (vma->vm_flags & VM_WRITE)
4803  		entry = pte_mkwrite(pte_mkdirty(entry), vma);
4804  
4805  	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
4806  	if (!vmf->pte)
4807  		goto release;
4808  	if (nr_pages == 1 && vmf_pte_changed(vmf)) {
4809  		update_mmu_tlb(vma, addr, vmf->pte);
4810  		goto release;
4811  	} else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
4812  		update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages);
4813  		goto release;
4814  	}
4815  
4816  	ret = check_stable_address_space(vma->vm_mm);
4817  	if (ret)
4818  		goto release;
4819  
4820  	/* Deliver the page fault to userland, check inside PT lock */
4821  	if (userfaultfd_missing(vma)) {
4822  		pte_unmap_unlock(vmf->pte, vmf->ptl);
4823  		folio_put(folio);
4824  		return handle_userfault(vmf, VM_UFFD_MISSING);
4825  	}
4826  
4827  	folio_ref_add(folio, nr_pages - 1);
4828  	add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
4829  	count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC);
4830  	folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE);
4831  	folio_add_lru_vma(folio, vma);
4832  setpte:
4833  	if (vmf_orig_pte_uffd_wp(vmf))
4834  		entry = pte_mkuffd_wp(entry);
4835  	set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr_pages);
4836  
4837  	/* No need to invalidate - it was non-present before */
4838  	update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr_pages);
4839  unlock:
4840  	if (vmf->pte)
4841  		pte_unmap_unlock(vmf->pte, vmf->ptl);
4842  	return ret;
4843  release:
4844  	folio_put(folio);
4845  	goto unlock;
4846  oom:
4847  	return VM_FAULT_OOM;
4848  }
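
/*
 * Illustrative userspace view of the paths above (assumption, not kernel
 * code): a private anonymous mapping consumes no real memory until it is
 * written to, because reads are satisfied by the shared zero page:
 *
 *	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	char c = p[0];	// read fault: zero-page branch above
 *	p[0] = 1;	// write fault: a fresh zeroed private page replaces it
 */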
4849  
4850  /*
4851   * The mmap_lock must have been held on entry, and may have been
4852   * released depending on flags and vma->vm_ops->fault() return value.
4853   * See filemap_fault() and __lock_page_retry().
4854   */
4855  static vm_fault_t __do_fault(struct vm_fault *vmf)
4856  {
4857  	struct vm_area_struct *vma = vmf->vma;
4858  	struct folio *folio;
4859  	vm_fault_t ret;
4860  
4861  	/*
4862  	 * Preallocate pte before we take page_lock because this might lead to
4863  	 * deadlocks for memcg reclaim which waits for pages under writeback:
4864  	 *				lock_page(A)
4865  	 *				SetPageWriteback(A)
4866  	 *				unlock_page(A)
4867  	 * lock_page(B)
4868  	 *				lock_page(B)
4869  	 * pte_alloc_one
4870  	 *   shrink_folio_list
4871  	 *     wait_on_page_writeback(A)
4872  	 *				SetPageWriteback(B)
4873  	 *				unlock_page(B)
4874  	 *				# flush A, B to clear the writeback
4875  	 */
4876  	if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
4877  		vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
4878  		if (!vmf->prealloc_pte)
4879  			return VM_FAULT_OOM;
4880  	}
4881  
4882  	ret = vma->vm_ops->fault(vmf);
4883  	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
4884  			    VM_FAULT_DONE_COW)))
4885  		return ret;
4886  
4887  	folio = page_folio(vmf->page);
4888  	if (unlikely(PageHWPoison(vmf->page))) {
4889  		vm_fault_t poisonret = VM_FAULT_HWPOISON;
4890  		if (ret & VM_FAULT_LOCKED) {
4891  			if (page_mapped(vmf->page))
4892  				unmap_mapping_folio(folio);
4893  			/* Retry if a clean folio was removed from the cache. */
4894  			if (mapping_evict_folio(folio->mapping, folio))
4895  				poisonret = VM_FAULT_NOPAGE;
4896  			folio_unlock(folio);
4897  		}
4898  		folio_put(folio);
4899  		vmf->page = NULL;
4900  		return poisonret;
4901  	}
4902  
4903  	if (unlikely(!(ret & VM_FAULT_LOCKED)))
4904  		folio_lock(folio);
4905  	else
4906  		VM_BUG_ON_PAGE(!folio_test_locked(folio), vmf->page);
4907  
4908  	return ret;
4909  }
4910  
4911  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
4912  static void deposit_prealloc_pte(struct vm_fault *vmf)
4913  {
4914  	struct vm_area_struct *vma = vmf->vma;
4915  
4916  	pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
4917  	/*
4918  	 * We are going to consume the prealloc table,
4919  	 * count that as nr_ptes.
4920  	 */
4921  	mm_inc_nr_ptes(vma->vm_mm);
4922  	vmf->prealloc_pte = NULL;
4923  }
4924  
4925  vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
4926  {
4927  	struct folio *folio = page_folio(page);
4928  	struct vm_area_struct *vma = vmf->vma;
4929  	bool write = vmf->flags & FAULT_FLAG_WRITE;
4930  	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
4931  	pmd_t entry;
4932  	vm_fault_t ret = VM_FAULT_FALLBACK;
4933  
4934  	/*
4935  	 * It is too late to allocate a small folio: we already have a large
4936  	 * folio in the pagecache: especially s390 KVM cannot tolerate any
4937  	 * PMD mappings, but PTE-mapped THP are fine. So let's simply refuse any
4938  	 * PMD mappings if THPs are disabled.
4939  	 */
4940  	if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags))
4941  		return ret;
4942  
4943  	if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
4944  		return ret;
4945  
4946  	if (folio_order(folio) != HPAGE_PMD_ORDER)
4947  		return ret;
4948  	page = &folio->page;
4949  
4950  	/*
4951  	 * Just back off if any subpage of a THP is corrupted; otherwise
4952  	 * the corrupted page may be mapped by a PMD silently and escape the
4953  	 * check.  This kind of THP can only be PTE-mapped.  Access to
4954  	 * the corrupted subpage should trigger SIGBUS as expected.
4955  	 */
4956  	if (unlikely(folio_test_has_hwpoisoned(folio)))
4957  		return ret;
4958  
4959  	/*
4960  	 * Archs like ppc64 need additional space to store information
4961  	 * related to pte entry. Use the preallocated table for that.
4962  	 */
4963  	if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
4964  		vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
4965  		if (!vmf->prealloc_pte)
4966  			return VM_FAULT_OOM;
4967  	}
4968  
4969  	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
4970  	if (unlikely(!pmd_none(*vmf->pmd)))
4971  		goto out;
4972  
4973  	flush_icache_pages(vma, page, HPAGE_PMD_NR);
4974  
4975  	entry = mk_huge_pmd(page, vma->vm_page_prot);
4976  	if (write)
4977  		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
4978  
4979  	add_mm_counter(vma->vm_mm, mm_counter_file(folio), HPAGE_PMD_NR);
4980  	folio_add_file_rmap_pmd(folio, page, vma);
4981  
4982  	/*
4983  	 * deposit and withdraw with pmd lock held
4984  	 */
4985  	if (arch_needs_pgtable_deposit())
4986  		deposit_prealloc_pte(vmf);
4987  
4988  	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
4989  
4990  	update_mmu_cache_pmd(vma, haddr, vmf->pmd);
4991  
4992  	/* fault is handled */
4993  	ret = 0;
4994  	count_vm_event(THP_FILE_MAPPED);
4995  out:
4996  	spin_unlock(vmf->ptl);
4997  	return ret;
4998  }
4999  #else
5000  vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
5001  {
5002  	return VM_FAULT_FALLBACK;
5003  }
5004  #endif
5005  
5006  /**
5007   * set_pte_range - Set a range of PTEs to point to pages in a folio.
5008   * @vmf: Fault description.
5009   * @folio: The folio that contains @page.
5010   * @page: The first page to create a PTE for.
5011   * @nr: The number of PTEs to create.
5012   * @addr: The first address to create a PTE for.
5013   */
5014  void set_pte_range(struct vm_fault *vmf, struct folio *folio,
5015  		struct page *page, unsigned int nr, unsigned long addr)
5016  {
5017  	struct vm_area_struct *vma = vmf->vma;
5018  	bool write = vmf->flags & FAULT_FLAG_WRITE;
5019  	bool prefault = !in_range(vmf->address, addr, nr * PAGE_SIZE);
5020  	pte_t entry;
5021  
5022  	flush_icache_pages(vma, page, nr);
5023  	entry = mk_pte(page, vma->vm_page_prot);
5024  
5025  	if (prefault && arch_wants_old_prefaulted_pte())
5026  		entry = pte_mkold(entry);
5027  	else
5028  		entry = pte_sw_mkyoung(entry);
5029  
5030  	if (write)
5031  		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
5032  	if (unlikely(vmf_orig_pte_uffd_wp(vmf)))
5033  		entry = pte_mkuffd_wp(entry);
5034  	/* copy-on-write page */
5035  	if (write && !(vma->vm_flags & VM_SHARED)) {
5036  		VM_BUG_ON_FOLIO(nr != 1, folio);
5037  		folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE);
5038  		folio_add_lru_vma(folio, vma);
5039  	} else {
5040  		folio_add_file_rmap_ptes(folio, page, nr, vma);
5041  	}
5042  	set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr);
5043  
5044  	/* no need to invalidate: a not-present page won't be cached */
5045  	update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr);
5046  }
5047  
5048  static bool vmf_pte_changed(struct vm_fault *vmf)
5049  {
5050  	if (vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)
5051  		return !pte_same(ptep_get(vmf->pte), vmf->orig_pte);
5052  
5053  	return !pte_none(ptep_get(vmf->pte));
5054  }
5055  
5056  /**
5057   * finish_fault - finish page fault once we have prepared the page to fault
5058   *
5059   * @vmf: structure describing the fault
5060   *
5061   * This function handles all that is needed to finish a page fault once the
5062   * page to fault in is prepared. It handles locking of PTEs, inserts PTE for
5063   * given page, adds reverse page mapping, handles memcg charges and LRU
5064   * addition.
5065   *
5066   * The function expects the page to be locked and on success it consumes a
5067   * reference of a page being mapped (for the PTE which maps it).
5068   *
5069   * Return: %0 on success, %VM_FAULT_ code in case of error.
5070   */
5071  vm_fault_t finish_fault(struct vm_fault *vmf)
5072  {
5073  	struct vm_area_struct *vma = vmf->vma;
5074  	struct page *page;
5075  	struct folio *folio;
5076  	vm_fault_t ret;
5077  	bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) &&
5078  		      !(vma->vm_flags & VM_SHARED);
5079  	int type, nr_pages;
5080  	unsigned long addr = vmf->address;
5081  
5082  	/* Did we COW the page? */
5083  	if (is_cow)
5084  		page = vmf->cow_page;
5085  	else
5086  		page = vmf->page;
5087  
5088  	/*
5089  	 * Check even for read faults, because we might have lost our
5090  	 * CoWed page.
5091  	 */
5092  	if (!(vma->vm_flags & VM_SHARED)) {
5093  		ret = check_stable_address_space(vma->vm_mm);
5094  		if (ret)
5095  			return ret;
5096  	}
5097  
5098  	if (pmd_none(*vmf->pmd)) {
5099  		if (PageTransCompound(page)) {
5100  			ret = do_set_pmd(vmf, page);
5101  			if (ret != VM_FAULT_FALLBACK)
5102  				return ret;
5103  		}
5104  
5105  		if (vmf->prealloc_pte)
5106  			pmd_install(vma->vm_mm, vmf->pmd, &vmf->prealloc_pte);
5107  		else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd)))
5108  			return VM_FAULT_OOM;
5109  	}
5110  
5111  	folio = page_folio(page);
5112  	nr_pages = folio_nr_pages(folio);
5113  
5114  	/*
5115  	 * Use a per-page fault to maintain the uffd semantics; the same
5116  	 * approach also applies to non-anonymous-shmem faults to avoid
5117  	 * inflating the RSS of the process.
5118  	 */
5119  	if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma))) {
5120  		nr_pages = 1;
5121  	} else if (nr_pages > 1) {
5122  		pgoff_t idx = folio_page_idx(folio, page);
5123  		/* The page offset of vmf->address within the VMA. */
5124  		pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff;
5125  		/* The index of the entry in the pagetable for the fault page. */
5126  		pgoff_t pte_off = pte_index(vmf->address);
5127  
5128  		/*
5129  		 * Fall back to a per-page fault if the folio in the page
5130  		 * cache extends beyond the VMA limits or the PMD pagetable limits.
5131  		 */
5132  		if (unlikely(vma_off < idx ||
5133  			    vma_off + (nr_pages - idx) > vma_pages(vma) ||
5134  			    pte_off < idx ||
5135  			    pte_off + (nr_pages - idx)  > PTRS_PER_PTE)) {
5136  			nr_pages = 1;
5137  		} else {
5138  			/* Now we can set mappings for the whole large folio. */
5139  			addr = vmf->address - idx * PAGE_SIZE;
5140  			page = &folio->page;
5141  		}
5142  	}
5143  
5144  	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
5145  				       addr, &vmf->ptl);
5146  	if (!vmf->pte)
5147  		return VM_FAULT_NOPAGE;
5148  
5149  	/* Re-check under ptl */
5150  	if (nr_pages == 1 && unlikely(vmf_pte_changed(vmf))) {
5151  		update_mmu_tlb(vma, addr, vmf->pte);
5152  		ret = VM_FAULT_NOPAGE;
5153  		goto unlock;
5154  	} else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
5155  		update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages);
5156  		ret = VM_FAULT_NOPAGE;
5157  		goto unlock;
5158  	}
5159  
5160  	folio_ref_add(folio, nr_pages - 1);
5161  	set_pte_range(vmf, folio, page, nr_pages, addr);
5162  	type = is_cow ? MM_ANONPAGES : mm_counter_file(folio);
5163  	add_mm_counter(vma->vm_mm, type, nr_pages);
5164  	ret = 0;
5165  
5166  unlock:
5167  	pte_unmap_unlock(vmf->pte, vmf->ptl);
5168  	return ret;
5169  }
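
/*
 * Worked example for the large-folio clamping above (illustrative numbers):
 * suppose a 64K folio (nr_pages == 16) backs the fault and the faulting page
 * is the folio's third page, so idx == 2.  If vma_off == 1 the batch would
 * start one page before the VMA, so we drop back to nr_pages == 1.  When the
 * folio fits both the VMA and the PTE table, addr is moved back by idx pages,
 * page is set to the folio's first page, and a single set_pte_range() call
 * populates all 16 PTEs under one PT lock acquisition.
 */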
5170  
5171  static unsigned long fault_around_pages __read_mostly =
5172  	65536 >> PAGE_SHIFT;
5173  
5174  #ifdef CONFIG_DEBUG_FS
5175  static int fault_around_bytes_get(void *data, u64 *val)
5176  {
5177  	*val = fault_around_pages << PAGE_SHIFT;
5178  	return 0;
5179  }
5180  
5181  /*
5182   * fault_around_bytes must be rounded down to the nearest page order as it's
5183   * what do_fault_around() expects to see.
5184   */
5185  static int fault_around_bytes_set(void *data, u64 val)
5186  {
5187  	if (val / PAGE_SIZE > PTRS_PER_PTE)
5188  		return -EINVAL;
5189  
5190  	/*
5191  	 * The minimum value is 1 page, however this results in no fault-around
5192  	 * at all. See should_fault_around().
5193  	 */
5194  	val = max(val, PAGE_SIZE);
5195  	fault_around_pages = rounddown_pow_of_two(val) >> PAGE_SHIFT;
5196  
5197  	return 0;
5198  }
5199  DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
5200  		fault_around_bytes_get, fault_around_bytes_set, "%llu\n");
5201  
5202  static int __init fault_around_debugfs(void)
5203  {
5204  	debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
5205  				   &fault_around_bytes_fops);
5206  	return 0;
5207  }
5208  late_initcall(fault_around_debugfs);
5209  #endif
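
/*
 * Illustrative example (assumption: 4K pages): writing 60000 to
 * /sys/kernel/debug/fault_around_bytes is clamped to at least PAGE_SIZE and
 * rounded down to a power of two, so
 *
 *	fault_around_pages = rounddown_pow_of_two(60000) >> PAGE_SHIFT
 *			   = 32768 >> 12 = 8;
 *
 * and reading the file back reports 32768.  Requests spanning more than
 * PTRS_PER_PTE pages are rejected with -EINVAL by fault_around_bytes_set().
 */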
5210  
5211  /*
5212   * do_fault_around() tries to map a few pages around the fault address. The hope
5213   * is that the pages will be needed soon and this will lower the number of
5214   * faults to handle.
5215   *
5216   * It uses vm_ops->map_pages() to map the pages, which skips the page if it's
5217   * not ready to be mapped: not up-to-date, locked, etc.
5218   *
5219   * This function doesn't cross VMA or page table boundaries, in order to call
5220   * map_pages() and acquire a PTE lock only once.
5221   *
5222   * fault_around_pages defines how many pages we'll try to map.
5223   * do_fault_around() expects it to be set to a power of two less than or equal
5224   * to PTRS_PER_PTE.
5225   *
5226   * The virtual address of the area that we map is naturally aligned to
5227   * fault_around_pages * PAGE_SIZE rounded down to the machine page size
5228   * (and therefore to page order).  This way it's easier to guarantee
5229   * that we don't cross page table boundaries.
5230   */
5231  static vm_fault_t do_fault_around(struct vm_fault *vmf)
5232  {
5233  	pgoff_t nr_pages = READ_ONCE(fault_around_pages);
5234  	pgoff_t pte_off = pte_index(vmf->address);
5235  	/* The page offset of vmf->address within the VMA. */
5236  	pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff;
5237  	pgoff_t from_pte, to_pte;
5238  	vm_fault_t ret;
5239  
5240  	/* The PTE offset of the start address, clamped to the VMA. */
5241  	from_pte = max(ALIGN_DOWN(pte_off, nr_pages),
5242  		       pte_off - min(pte_off, vma_off));
5243  
5244  	/* The PTE offset of the end address, clamped to the VMA and PTE. */
5245  	to_pte = min3(from_pte + nr_pages, (pgoff_t)PTRS_PER_PTE,
5246  		      pte_off + vma_pages(vmf->vma) - vma_off) - 1;
5247  
5248  	if (pmd_none(*vmf->pmd)) {
5249  		vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
5250  		if (!vmf->prealloc_pte)
5251  			return VM_FAULT_OOM;
5252  	}
5253  
5254  	rcu_read_lock();
5255  	ret = vmf->vma->vm_ops->map_pages(vmf,
5256  			vmf->pgoff + from_pte - pte_off,
5257  			vmf->pgoff + to_pte - pte_off);
5258  	rcu_read_unlock();
5259  
5260  	return ret;
5261  }
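
/*
 * Worked example (illustrative: 4K pages, fault_around_pages == 16): for a
 * fault whose pte_index() is 100, deep inside a large VMA,
 *
 *	from_pte = max(ALIGN_DOWN(100, 16), 100 - min(100, vma_off)) = 96
 *	to_pte   = min3(96 + 16, 512, ...) - 1                       = 111
 *
 * so ->map_pages() is asked for the 16 file pages surrounding the faulting
 * one.  If the VMA started only two pages before the fault (vma_off == 2),
 * from_pte would be clamped up to 98 so fault-around never maps in front of
 * the VMA.
 */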
5262  
5263  /* Return true if we should do read fault-around, false otherwise */
5264  static inline bool should_fault_around(struct vm_fault *vmf)
5265  {
5266  	/* No ->map_pages?  No way to fault around... */
5267  	if (!vmf->vma->vm_ops->map_pages)
5268  		return false;
5269  
5270  	if (uffd_disable_fault_around(vmf->vma))
5271  		return false;
5272  
5273  	/* A single page implies no faulting 'around' at all. */
5274  	return fault_around_pages > 1;
5275  }
5276  
5277  static vm_fault_t do_read_fault(struct vm_fault *vmf)
5278  {
5279  	vm_fault_t ret = 0;
5280  	struct folio *folio;
5281  
5282  	/*
5283  	 * Let's call ->map_pages() first and use ->fault() as fallback
5284  	 * if page by the offset is not ready to be mapped (cold cache or
5285  	 * something).
5286  	 */
5287  	if (should_fault_around(vmf)) {
5288  		ret = do_fault_around(vmf);
5289  		if (ret)
5290  			return ret;
5291  	}
5292  
5293  	ret = vmf_can_call_fault(vmf);
5294  	if (ret)
5295  		return ret;
5296  
5297  	ret = __do_fault(vmf);
5298  	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
5299  		return ret;
5300  
5301  	ret |= finish_fault(vmf);
5302  	folio = page_folio(vmf->page);
5303  	folio_unlock(folio);
5304  	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
5305  		folio_put(folio);
5306  	return ret;
5307  }
5308  
5309  static vm_fault_t do_cow_fault(struct vm_fault *vmf)
5310  {
5311  	struct vm_area_struct *vma = vmf->vma;
5312  	struct folio *folio;
5313  	vm_fault_t ret;
5314  
5315  	ret = vmf_can_call_fault(vmf);
5316  	if (!ret)
5317  		ret = vmf_anon_prepare(vmf);
5318  	if (ret)
5319  		return ret;
5320  
5321  	folio = folio_prealloc(vma->vm_mm, vma, vmf->address, false);
5322  	if (!folio)
5323  		return VM_FAULT_OOM;
5324  
5325  	vmf->cow_page = &folio->page;
5326  
5327  	ret = __do_fault(vmf);
5328  	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
5329  		goto uncharge_out;
5330  	if (ret & VM_FAULT_DONE_COW)
5331  		return ret;
5332  
5333  	if (copy_mc_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma)) {
5334  		ret = VM_FAULT_HWPOISON;
5335  		goto unlock;
5336  	}
5337  	__folio_mark_uptodate(folio);
5338  
5339  	ret |= finish_fault(vmf);
5340  unlock:
5341  	unlock_page(vmf->page);
5342  	put_page(vmf->page);
5343  	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
5344  		goto uncharge_out;
5345  	return ret;
5346  uncharge_out:
5347  	folio_put(folio);
5348  	return ret;
5349  }
5350  
5351  static vm_fault_t do_shared_fault(struct vm_fault *vmf)
5352  {
5353  	struct vm_area_struct *vma = vmf->vma;
5354  	vm_fault_t ret, tmp;
5355  	struct folio *folio;
5356  
5357  	ret = vmf_can_call_fault(vmf);
5358  	if (ret)
5359  		return ret;
5360  
5361  	ret = __do_fault(vmf);
5362  	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
5363  		return ret;
5364  
5365  	folio = page_folio(vmf->page);
5366  
5367  	/*
5368  	 * Check if the backing address space wants to know that the page is
5369  	 * about to become writable
5370  	 */
5371  	if (vma->vm_ops->page_mkwrite) {
5372  		folio_unlock(folio);
5373  		tmp = do_page_mkwrite(vmf, folio);
5374  		if (unlikely(!tmp ||
5375  				(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
5376  			folio_put(folio);
5377  			return tmp;
5378  		}
5379  	}
5380  
5381  	ret |= finish_fault(vmf);
5382  	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
5383  					VM_FAULT_RETRY))) {
5384  		folio_unlock(folio);
5385  		folio_put(folio);
5386  		return ret;
5387  	}
5388  
5389  	ret |= fault_dirty_shared_page(vmf);
5390  	return ret;
5391  }
5392  
5393  /*
5394   * We enter with non-exclusive mmap_lock (to exclude vma changes,
5395   * but allow concurrent faults).
5396   * The mmap_lock may have been released depending on flags and our
5397   * return value.  See filemap_fault() and __folio_lock_or_retry().
5398   * If mmap_lock is released, vma may become invalid (for example
5399   * by other thread calling munmap()).
5400   */
5401  static vm_fault_t do_fault(struct vm_fault *vmf)
5402  {
5403  	struct vm_area_struct *vma = vmf->vma;
5404  	struct mm_struct *vm_mm = vma->vm_mm;
5405  	vm_fault_t ret;
5406  
5407  	/*
5408  	 * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND
5409  	 */
5410  	if (!vma->vm_ops->fault) {
5411  		vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
5412  					       vmf->address, &vmf->ptl);
5413  		if (unlikely(!vmf->pte))
5414  			ret = VM_FAULT_SIGBUS;
5415  		else {
5416  			/*
5417  			 * Make sure this is not a temporary clearing of pte
5418  			 * by holding ptl and checking again. A R/M/W update
5419  			 * of the pte involves taking the ptl, clearing the pte
5420  			 * so that we don't have concurrent modification by
5421  			 * hardware, and then updating it.
5422  			 */
5423  			if (unlikely(pte_none(ptep_get(vmf->pte))))
5424  				ret = VM_FAULT_SIGBUS;
5425  			else
5426  				ret = VM_FAULT_NOPAGE;
5427  
5428  			pte_unmap_unlock(vmf->pte, vmf->ptl);
5429  		}
5430  	} else if (!(vmf->flags & FAULT_FLAG_WRITE))
5431  		ret = do_read_fault(vmf);
5432  	else if (!(vma->vm_flags & VM_SHARED))
5433  		ret = do_cow_fault(vmf);
5434  	else
5435  		ret = do_shared_fault(vmf);
5436  
5437  	/* preallocated pagetable is unused: free it */
5438  	if (vmf->prealloc_pte) {
5439  		pte_free(vm_mm, vmf->prealloc_pte);
5440  		vmf->prealloc_pte = NULL;
5441  	}
5442  	return ret;
5443  }
5444  
5445  int numa_migrate_check(struct folio *folio, struct vm_fault *vmf,
5446  		      unsigned long addr, int *flags,
5447  		      bool writable, int *last_cpupid)
5448  {
5449  	struct vm_area_struct *vma = vmf->vma;
5450  
5451  	/*
5452  	 * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
5453  	 * much anyway since they can be in shared cache state. This misses
5454  	 * the case where a mapping is writable but the process never writes
5455  	 * to it, yet pte_write gets cleared during protection updates and
5456  	 * pte_dirty has unpredictable behaviour between PTE scan updates,
5457  	 * background writeback, dirty balancing and application behaviour.
5458  	 */
5459  	if (!writable)
5460  		*flags |= TNF_NO_GROUP;
5461  
5462  	/*
5463  	 * Flag if the folio is shared between multiple address spaces. This
5464  	 * is later used when determining whether to group tasks together
5465  	 */
5466  	if (folio_likely_mapped_shared(folio) && (vma->vm_flags & VM_SHARED))
5467  		*flags |= TNF_SHARED;
5468  	/*
5469  	 * In memory tiering mode, the cpupid of a slow-memory page is used
5470  	 * to record the page access time, so use the default value.
5471  	 */
5472  	if (folio_use_access_time(folio))
5473  		*last_cpupid = (-1 & LAST_CPUPID_MASK);
5474  	else
5475  		*last_cpupid = folio_last_cpupid(folio);
5476  
5477  	/* Record the current PID accessing the VMA */
5478  	vma_set_access_pid_bit(vma);
5479  
5480  	count_vm_numa_event(NUMA_HINT_FAULTS);
5481  #ifdef CONFIG_NUMA_BALANCING
5482  	count_memcg_folio_events(folio, NUMA_HINT_FAULTS, 1);
5483  #endif
5484  	if (folio_nid(folio) == numa_node_id()) {
5485  		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
5486  		*flags |= TNF_FAULT_LOCAL;
5487  	}
5488  
5489  	return mpol_misplaced(folio, vmf, addr);
5490  }
5491  
5492  static void numa_rebuild_single_mapping(struct vm_fault *vmf, struct vm_area_struct *vma,
5493  					unsigned long fault_addr, pte_t *fault_pte,
5494  					bool writable)
5495  {
5496  	pte_t pte, old_pte;
5497  
5498  	old_pte = ptep_modify_prot_start(vma, fault_addr, fault_pte);
5499  	pte = pte_modify(old_pte, vma->vm_page_prot);
5500  	pte = pte_mkyoung(pte);
5501  	if (writable)
5502  		pte = pte_mkwrite(pte, vma);
5503  	ptep_modify_prot_commit(vma, fault_addr, fault_pte, old_pte, pte);
5504  	update_mmu_cache_range(vmf, vma, fault_addr, fault_pte, 1);
5505  }
5506  
5507  static void numa_rebuild_large_mapping(struct vm_fault *vmf, struct vm_area_struct *vma,
5508  				       struct folio *folio, pte_t fault_pte,
5509  				       bool ignore_writable, bool pte_write_upgrade)
5510  {
5511  	int nr = pte_pfn(fault_pte) - folio_pfn(folio);
5512  	unsigned long start, end, addr = vmf->address;
5513  	unsigned long addr_start = addr - (nr << PAGE_SHIFT);
5514  	unsigned long pt_start = ALIGN_DOWN(addr, PMD_SIZE);
5515  	pte_t *start_ptep;
5516  
5517  	/* Stay within the VMA and within the page table. */
5518  	start = max3(addr_start, pt_start, vma->vm_start);
5519  	end = min3(addr_start + folio_size(folio), pt_start + PMD_SIZE,
5520  		   vma->vm_end);
5521  	start_ptep = vmf->pte - ((addr - start) >> PAGE_SHIFT);
5522  
5523  	/* Restore all PTEs' mapping of the large folio */
5524  	for (addr = start; addr != end; start_ptep++, addr += PAGE_SIZE) {
5525  		pte_t ptent = ptep_get(start_ptep);
5526  		bool writable = false;
5527  
5528  		if (!pte_present(ptent) || !pte_protnone(ptent))
5529  			continue;
5530  
5531  		if (pfn_folio(pte_pfn(ptent)) != folio)
5532  			continue;
5533  
5534  		if (!ignore_writable) {
5535  			ptent = pte_modify(ptent, vma->vm_page_prot);
5536  			writable = pte_write(ptent);
5537  			if (!writable && pte_write_upgrade &&
5538  			    can_change_pte_writable(vma, addr, ptent))
5539  				writable = true;
5540  		}
5541  
5542  		numa_rebuild_single_mapping(vmf, vma, addr, start_ptep, writable);
5543  	}
5544  }
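
/*
 * Worked example (illustrative numbers): a fault at the 5th page of a 2M
 * folio gives nr == 4, so addr_start = vmf->address - 4 * PAGE_SIZE.  If
 * that start lies before vma->vm_start or before the PMD-aligned pt_start,
 * the max3()/min3() clamping above trims the range so that start_ptep never
 * points outside the PTE page that vmf->pte belongs to, and only PTEs that
 * are protnone and still map this folio are rebuilt.
 */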
5545  
5546  static vm_fault_t do_numa_page(struct vm_fault *vmf)
5547  {
5548  	struct vm_area_struct *vma = vmf->vma;
5549  	struct folio *folio = NULL;
5550  	int nid = NUMA_NO_NODE;
5551  	bool writable = false, ignore_writable = false;
5552  	bool pte_write_upgrade = vma_wants_manual_pte_write_upgrade(vma);
5553  	int last_cpupid;
5554  	int target_nid;
5555  	pte_t pte, old_pte;
5556  	int flags = 0, nr_pages;
5557  
5558  	/*
5559  	 * The pte cannot be used safely until we verify, while holding the page
5560  	 * table lock, that its contents have not changed during fault handling.
5561  	 */
5562  	spin_lock(vmf->ptl);
5563  	/* Read the live PTE from the page tables: */
5564  	old_pte = ptep_get(vmf->pte);
5565  
5566  	if (unlikely(!pte_same(old_pte, vmf->orig_pte))) {
5567  		pte_unmap_unlock(vmf->pte, vmf->ptl);
5568  		return 0;
5569  	}
5570  
5571  	pte = pte_modify(old_pte, vma->vm_page_prot);
5572  
5573  	/*
5574  	 * Detect now whether the PTE could be writable; this information
5575  	 * is only valid while holding the PT lock.
5576  	 */
5577  	writable = pte_write(pte);
5578  	if (!writable && pte_write_upgrade &&
5579  	    can_change_pte_writable(vma, vmf->address, pte))
5580  		writable = true;
5581  
5582  	folio = vm_normal_folio(vma, vmf->address, pte);
5583  	if (!folio || folio_is_zone_device(folio))
5584  		goto out_map;
5585  
5586  	nid = folio_nid(folio);
5587  	nr_pages = folio_nr_pages(folio);
5588  
5589  	target_nid = numa_migrate_check(folio, vmf, vmf->address, &flags,
5590  					writable, &last_cpupid);
5591  	if (target_nid == NUMA_NO_NODE)
5592  		goto out_map;
5593  	if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) {
5594  		flags |= TNF_MIGRATE_FAIL;
5595  		goto out_map;
5596  	}
5597  	/* The folio is isolated and isolation code holds a folio reference. */
5598  	pte_unmap_unlock(vmf->pte, vmf->ptl);
5599  	writable = false;
5600  	ignore_writable = true;
5601  
5602  	/* Migrate to the requested node */
5603  	if (!migrate_misplaced_folio(folio, vma, target_nid)) {
5604  		nid = target_nid;
5605  		flags |= TNF_MIGRATED;
5606  		task_numa_fault(last_cpupid, nid, nr_pages, flags);
5607  		return 0;
5608  	}
5609  
5610  	flags |= TNF_MIGRATE_FAIL;
5611  	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
5612  				       vmf->address, &vmf->ptl);
5613  	if (unlikely(!vmf->pte))
5614  		return 0;
5615  	if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
5616  		pte_unmap_unlock(vmf->pte, vmf->ptl);
5617  		return 0;
5618  	}
5619  out_map:
5620  	/*
5621  	 * Make it present again. Depending on how the arch implements
5622  	 * non-accessible ptes, some may still allow access from kernel mode.
5623  	 */
5624  	if (folio && folio_test_large(folio))
5625  		numa_rebuild_large_mapping(vmf, vma, folio, pte, ignore_writable,
5626  					   pte_write_upgrade);
5627  	else
5628  		numa_rebuild_single_mapping(vmf, vma, vmf->address, vmf->pte,
5629  					    writable);
5630  	pte_unmap_unlock(vmf->pte, vmf->ptl);
5631  
5632  	if (nid != NUMA_NO_NODE)
5633  		task_numa_fault(last_cpupid, nid, nr_pages, flags);
5634  	return 0;
5635  }
5636  
5637  static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
5638  {
5639  	struct vm_area_struct *vma = vmf->vma;
5640  	if (vma_is_anonymous(vma))
5641  		return do_huge_pmd_anonymous_page(vmf);
5642  	if (vma->vm_ops->huge_fault)
5643  		return vma->vm_ops->huge_fault(vmf, PMD_ORDER);
5644  	return VM_FAULT_FALLBACK;
5645  }
5646  
5647  /* `inline' is required to avoid gcc 4.1.2 build error */
5648  static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
5649  {
5650  	struct vm_area_struct *vma = vmf->vma;
5651  	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
5652  	vm_fault_t ret;
5653  
5654  	if (vma_is_anonymous(vma)) {
5655  		if (likely(!unshare) &&
5656  		    userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) {
5657  			if (userfaultfd_wp_async(vmf->vma))
5658  				goto split;
5659  			return handle_userfault(vmf, VM_UFFD_WP);
5660  		}
5661  		return do_huge_pmd_wp_page(vmf);
5662  	}
5663  
5664  	if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
5665  		if (vma->vm_ops->huge_fault) {
5666  			ret = vma->vm_ops->huge_fault(vmf, PMD_ORDER);
5667  			if (!(ret & VM_FAULT_FALLBACK))
5668  				return ret;
5669  		}
5670  	}
5671  
5672  split:
5673  	/* COW or write-notify handled on pte level: split pmd. */
5674  	__split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
5675  
5676  	return VM_FAULT_FALLBACK;
5677  }
5678  
5679  static vm_fault_t create_huge_pud(struct vm_fault *vmf)
5680  {
5681  #if defined(CONFIG_TRANSPARENT_HUGEPAGE) &&			\
5682  	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
5683  	struct vm_area_struct *vma = vmf->vma;
5684  	/* No support for anonymous transparent PUD pages yet */
5685  	if (vma_is_anonymous(vma))
5686  		return VM_FAULT_FALLBACK;
5687  	if (vma->vm_ops->huge_fault)
5688  		return vma->vm_ops->huge_fault(vmf, PUD_ORDER);
5689  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
5690  	return VM_FAULT_FALLBACK;
5691  }
5692  
5693  static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
5694  {
5695  #if defined(CONFIG_TRANSPARENT_HUGEPAGE) &&			\
5696  	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
5697  	struct vm_area_struct *vma = vmf->vma;
5698  	vm_fault_t ret;
5699  
5700  	/* No support for anonymous transparent PUD pages yet */
5701  	if (vma_is_anonymous(vma))
5702  		goto split;
5703  	if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
5704  		if (vma->vm_ops->huge_fault) {
5705  			ret = vma->vm_ops->huge_fault(vmf, PUD_ORDER);
5706  			if (!(ret & VM_FAULT_FALLBACK))
5707  				return ret;
5708  		}
5709  	}
5710  split:
5711  	/* COW or write-notify not handled on PUD level: split pud. */
5712  	__split_huge_pud(vma, vmf->pud, vmf->address);
5713  #endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
5714  	return VM_FAULT_FALLBACK;
5715  }
5716  
5717  /*
5718   * These routines also need to handle stuff like marking pages dirty
5719   * and/or accessed for architectures that don't do it in hardware (most
5720   * RISC architectures).  The early dirtying is also good on the i386.
5721   *
5722   * There is also a hook called "update_mmu_cache()" that architectures
5723   * with external mmu caches can use to update those (ie the Sparc or
5724   * PowerPC hashed page tables that act as extended TLBs).
5725   *
5726   * We enter with non-exclusive mmap_lock (to exclude vma changes, but allow
5727   * concurrent faults).
5728   *
5729   * The mmap_lock may have been released depending on flags and our return value.
5730   * See filemap_fault() and __folio_lock_or_retry().
5731   */
5732  static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
5733  {
5734  	pte_t entry;
5735  
5736  	if (unlikely(pmd_none(*vmf->pmd))) {
5737  		/*
5738  		 * Leave __pte_alloc() until later: because vm_ops->fault may
5739  		 * want to allocate huge page, and if we expose page table
5740  		 * for an instant, it will be difficult to retract from
5741  		 * concurrent faults and from rmap lookups.
5742  		 */
5743  		vmf->pte = NULL;
5744  		vmf->flags &= ~FAULT_FLAG_ORIG_PTE_VALID;
5745  	} else {
5746  		/*
5747  		 * A regular pmd is established and it can't morph into a huge
5748  		 * pmd by anon khugepaged, since that takes mmap_lock in write
5749  		 * mode; but shmem or file collapse to THP could still morph
5750  		 * it into a huge pmd: just retry later if so.
5751  		 */
5752  		vmf->pte = pte_offset_map_nolock(vmf->vma->vm_mm, vmf->pmd,
5753  						 vmf->address, &vmf->ptl);
5754  		if (unlikely(!vmf->pte))
5755  			return 0;
5756  		vmf->orig_pte = ptep_get_lockless(vmf->pte);
5757  		vmf->flags |= FAULT_FLAG_ORIG_PTE_VALID;
5758  
5759  		if (pte_none(vmf->orig_pte)) {
5760  			pte_unmap(vmf->pte);
5761  			vmf->pte = NULL;
5762  		}
5763  	}
5764  
5765  	if (!vmf->pte)
5766  		return do_pte_missing(vmf);
5767  
5768  	if (!pte_present(vmf->orig_pte))
5769  		return do_swap_page(vmf);
5770  
5771  	if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
5772  		return do_numa_page(vmf);
5773  
5774  	spin_lock(vmf->ptl);
5775  	entry = vmf->orig_pte;
5776  	if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) {
5777  		update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
5778  		goto unlock;
5779  	}
5780  	if (vmf->flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
5781  		if (!pte_write(entry))
5782  			return do_wp_page(vmf);
5783  		else if (likely(vmf->flags & FAULT_FLAG_WRITE))
5784  			entry = pte_mkdirty(entry);
5785  	}
5786  	entry = pte_mkyoung(entry);
5787  	if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
5788  				vmf->flags & FAULT_FLAG_WRITE)) {
5789  		update_mmu_cache_range(vmf, vmf->vma, vmf->address,
5790  				vmf->pte, 1);
5791  	} else {
5792  		/* Skip spurious TLB flush for retried page fault */
5793  		if (vmf->flags & FAULT_FLAG_TRIED)
5794  			goto unlock;
5795  		/*
5796  		 * This is needed only for protection faults but the arch code
5797  		 * is not yet telling us if this is a protection fault or not.
5798  		 * This still avoids useless tlb flushes for .text page faults
5799  		 * with threads.
5800  		 */
5801  		if (vmf->flags & FAULT_FLAG_WRITE)
5802  			flush_tlb_fix_spurious_fault(vmf->vma, vmf->address,
5803  						     vmf->pte);
5804  	}
5805  unlock:
5806  	pte_unmap_unlock(vmf->pte, vmf->ptl);
5807  	return 0;
5808  }
5809  
5810  /*
5811   * On entry, we hold either the VMA lock or the mmap_lock
5812   * (FAULT_FLAG_VMA_LOCK tells you which).  If VM_FAULT_RETRY is set in
5813   * the result, the mmap_lock is not held on exit.  See filemap_fault()
5814   * and __folio_lock_or_retry().
5815   */
5816  static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
5817  		unsigned long address, unsigned int flags)
5818  {
5819  	struct vm_fault vmf = {
5820  		.vma = vma,
5821  		.address = address & PAGE_MASK,
5822  		.real_address = address,
5823  		.flags = flags,
5824  		.pgoff = linear_page_index(vma, address),
5825  		.gfp_mask = __get_fault_gfp_mask(vma),
5826  	};
5827  	struct mm_struct *mm = vma->vm_mm;
5828  	unsigned long vm_flags = vma->vm_flags;
5829  	pgd_t *pgd;
5830  	p4d_t *p4d;
5831  	vm_fault_t ret;
5832  
5833  	pgd = pgd_offset(mm, address);
5834  	p4d = p4d_alloc(mm, pgd, address);
5835  	if (!p4d)
5836  		return VM_FAULT_OOM;
5837  
5838  	vmf.pud = pud_alloc(mm, p4d, address);
5839  	if (!vmf.pud)
5840  		return VM_FAULT_OOM;
5841  retry_pud:
5842  	if (pud_none(*vmf.pud) &&
5843  	    thp_vma_allowable_order(vma, vm_flags,
5844  				TVA_IN_PF | TVA_ENFORCE_SYSFS, PUD_ORDER)) {
5845  		ret = create_huge_pud(&vmf);
5846  		if (!(ret & VM_FAULT_FALLBACK))
5847  			return ret;
5848  	} else {
5849  		pud_t orig_pud = *vmf.pud;
5850  
5851  		barrier();
5852  		if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
5853  
5854  			/*
5855  			 * TODO once we support anonymous PUDs: NUMA case and
5856  			 * FAULT_FLAG_UNSHARE handling.
5857  			 */
5858  			if ((flags & FAULT_FLAG_WRITE) && !pud_write(orig_pud)) {
5859  				ret = wp_huge_pud(&vmf, orig_pud);
5860  				if (!(ret & VM_FAULT_FALLBACK))
5861  					return ret;
5862  			} else {
5863  				huge_pud_set_accessed(&vmf, orig_pud);
5864  				return 0;
5865  			}
5866  		}
5867  	}
5868  
5869  	vmf.pmd = pmd_alloc(mm, vmf.pud, address);
5870  	if (!vmf.pmd)
5871  		return VM_FAULT_OOM;
5872  
5873  	/* Huge pud page fault raced with pmd_alloc? */
5874  	if (pud_trans_unstable(vmf.pud))
5875  		goto retry_pud;
5876  
5877  	if (pmd_none(*vmf.pmd) &&
5878  	    thp_vma_allowable_order(vma, vm_flags,
5879  				TVA_IN_PF | TVA_ENFORCE_SYSFS, PMD_ORDER)) {
5880  		ret = create_huge_pmd(&vmf);
5881  		if (!(ret & VM_FAULT_FALLBACK))
5882  			return ret;
5883  	} else {
5884  		vmf.orig_pmd = pmdp_get_lockless(vmf.pmd);
5885  
5886  		if (unlikely(is_swap_pmd(vmf.orig_pmd))) {
5887  			VM_BUG_ON(thp_migration_supported() &&
5888  					  !is_pmd_migration_entry(vmf.orig_pmd));
5889  			if (is_pmd_migration_entry(vmf.orig_pmd))
5890  				pmd_migration_entry_wait(mm, vmf.pmd);
5891  			return 0;
5892  		}
5893  		if (pmd_trans_huge(vmf.orig_pmd) || pmd_devmap(vmf.orig_pmd)) {
5894  			if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma))
5895  				return do_huge_pmd_numa_page(&vmf);
5896  
5897  			if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
5898  			    !pmd_write(vmf.orig_pmd)) {
5899  				ret = wp_huge_pmd(&vmf);
5900  				if (!(ret & VM_FAULT_FALLBACK))
5901  					return ret;
5902  			} else {
5903  				huge_pmd_set_accessed(&vmf);
5904  				return 0;
5905  			}
5906  		}
5907  	}
5908  
5909  	return handle_pte_fault(&vmf);
5910  }
5911  
5912  /**
5913   * mm_account_fault - Do page fault accounting
5914   * @mm: mm from which memcg should be extracted. It can be NULL.
5915   * @regs: the pt_regs struct pointer.  When set to NULL, will skip accounting
5916   *        of perf event counters, but we'll still do the per-task accounting to
5917   *        the task who triggered this page fault.
5918   * @address: the faulted address.
5919   * @flags: the fault flags.
5920   * @ret: the fault retcode.
5921   *
5922   * This will take care of most of the page fault accounting.  Meanwhile, it
5923   * will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter
5924   * updates.  However, note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
5925   * still be in per-arch page fault handlers at the entry of page fault.
5926   */
5927  static inline void mm_account_fault(struct mm_struct *mm, struct pt_regs *regs,
5928  				    unsigned long address, unsigned int flags,
5929  				    vm_fault_t ret)
5930  {
5931  	bool major;
5932  
5933  	/* Incomplete faults will be accounted upon completion. */
5934  	if (ret & VM_FAULT_RETRY)
5935  		return;
5936  
5937  	/*
5938  	 * To preserve the behavior of older kernels, PGFAULT counters record
5939  	 * both successful and failed faults, as opposed to perf counters,
5940  	 * which ignore failed cases.
5941  	 */
5942  	count_vm_event(PGFAULT);
5943  	count_memcg_event_mm(mm, PGFAULT);
5944  
5945  	/*
5946  	 * Do not account for unsuccessful faults (e.g. when the address wasn't
5947  	 * valid).  That includes arch_vma_access_permitted() failing before
5948  	 * reaching here. So this is not a "this many hardware page faults"
5949  	 * counter.  We should use the hw profiling for that.
5950  	 */
5951  	if (ret & VM_FAULT_ERROR)
5952  		return;
5953  
5954  	/*
5955  	 * We define the fault as a major fault when the final successful fault
5956  	 * is VM_FAULT_MAJOR, or if it retried (which implies that we couldn't
5957  	 * handle it immediately previously).
5958  	 */
5959  	major = (ret & VM_FAULT_MAJOR) || (flags & FAULT_FLAG_TRIED);
5960  
5961  	if (major)
5962  		current->maj_flt++;
5963  	else
5964  		current->min_flt++;
5965  
5966  	/*
5967  	 * If the fault is done for GUP, regs will be NULL.  We only do the
5968  	 * accounting for the per thread fault counters who triggered the
5969  	 * fault, and we skip the perf event updates.
5970  	 */
5971  	if (!regs)
5972  		return;
5973  
5974  	if (major)
5975  		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
5976  	else
5977  		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
5978  }
5979  
5980  #ifdef CONFIG_LRU_GEN
5981  static void lru_gen_enter_fault(struct vm_area_struct *vma)
5982  {
5983  	/* the LRU algorithm only applies to accesses with recency */
5984  	current->in_lru_fault = vma_has_recency(vma);
5985  }
5986  
5987  static void lru_gen_exit_fault(void)
5988  {
5989  	current->in_lru_fault = false;
5990  }
5991  #else
5992  static void lru_gen_enter_fault(struct vm_area_struct *vma)
5993  {
5994  }
5995  
5996  static void lru_gen_exit_fault(void)
5997  {
5998  }
5999  #endif /* CONFIG_LRU_GEN */
6000  
6001  static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma,
6002  				       unsigned int *flags)
6003  {
6004  	if (unlikely(*flags & FAULT_FLAG_UNSHARE)) {
6005  		if (WARN_ON_ONCE(*flags & FAULT_FLAG_WRITE))
6006  			return VM_FAULT_SIGSEGV;
6007  		/*
6008  		 * FAULT_FLAG_UNSHARE only applies to COW mappings. Let's
6009  		 * just treat it like an ordinary read-fault otherwise.
6010  		 */
6011  		if (!is_cow_mapping(vma->vm_flags))
6012  			*flags &= ~FAULT_FLAG_UNSHARE;
6013  	} else if (*flags & FAULT_FLAG_WRITE) {
6014  		/* Write faults on read-only mappings are impossible ... */
6015  		if (WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE)))
6016  			return VM_FAULT_SIGSEGV;
6017  		/* ... and FOLL_FORCE only applies to COW mappings. */
6018  		if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE) &&
6019  				 !is_cow_mapping(vma->vm_flags)))
6020  			return VM_FAULT_SIGSEGV;
6021  	}
6022  #ifdef CONFIG_PER_VMA_LOCK
6023  	/*
6024  	 * Per-VMA locks can't be used with FAULT_FLAG_RETRY_NOWAIT because of
6025  	 * the assumption that lock is dropped on VM_FAULT_RETRY.
6026  	 */
6027  	if (WARN_ON_ONCE((*flags &
6028  			(FAULT_FLAG_VMA_LOCK | FAULT_FLAG_RETRY_NOWAIT)) ==
6029  			(FAULT_FLAG_VMA_LOCK | FAULT_FLAG_RETRY_NOWAIT)))
6030  		return VM_FAULT_SIGSEGV;
6031  #endif
6032  
6033  	return 0;
6034  }
6035  
6036  /*
6037   * By the time we get here, we already hold the mm semaphore
6038   *
6039   * The mmap_lock may have been released depending on flags and our
6040   * return value.  See filemap_fault() and __folio_lock_or_retry().
6041   */
6042  vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
6043  			   unsigned int flags, struct pt_regs *regs)
6044  {
6045  	/* If the fault handler drops the mmap_lock, vma may be freed */
6046  	struct mm_struct *mm = vma->vm_mm;
6047  	vm_fault_t ret;
6048  	bool is_droppable;
6049  
6050  	__set_current_state(TASK_RUNNING);
6051  
6052  	ret = sanitize_fault_flags(vma, &flags);
6053  	if (ret)
6054  		goto out;
6055  
6056  	if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
6057  					    flags & FAULT_FLAG_INSTRUCTION,
6058  					    flags & FAULT_FLAG_REMOTE)) {
6059  		ret = VM_FAULT_SIGSEGV;
6060  		goto out;
6061  	}
6062  
6063  	is_droppable = !!(vma->vm_flags & VM_DROPPABLE);
6064  
6065  	/*
6066  	 * Enable the memcg OOM handling for faults triggered in user
6067  	 * space.  Kernel faults are handled more gracefully.
6068  	 */
6069  	if (flags & FAULT_FLAG_USER)
6070  		mem_cgroup_enter_user_fault();
6071  
6072  	lru_gen_enter_fault(vma);
6073  
6074  	if (unlikely(is_vm_hugetlb_page(vma)))
6075  		ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
6076  	else
6077  		ret = __handle_mm_fault(vma, address, flags);
6078  
6079  	/*
6080  	 * Warning: It is no longer safe to dereference vma-> after this point,
6081  	 * because mmap_lock might have been dropped by __handle_mm_fault(), so
6082  	 * vma might be destroyed from underneath us.
6083  	 */
6084  
6085  	lru_gen_exit_fault();
6086  
6087  	/* If the mapping is droppable, then errors due to OOM aren't fatal. */
6088  	if (is_droppable)
6089  		ret &= ~VM_FAULT_OOM;
6090  
6091  	if (flags & FAULT_FLAG_USER) {
6092  		mem_cgroup_exit_user_fault();
6093  		/*
6094  		 * The task may have entered a memcg OOM situation but
6095  		 * if the allocation error was handled gracefully (no
6096  		 * VM_FAULT_OOM), there is no need to kill anything.
6097  		 * Just clean up the OOM state peacefully.
6098  		 */
6099  		if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
6100  			mem_cgroup_oom_synchronize(false);
6101  	}
6102  out:
6103  	mm_account_fault(mm, regs, address, flags, ret);
6104  
6105  	return ret;
6106  }
6107  EXPORT_SYMBOL_GPL(handle_mm_fault);
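
/*
 * Illustrative caller sketch (an assumption about a typical arch fault
 * handler, not code from this file): VM_FAULT_RETRY means the mmap_lock was
 * dropped for us and the fault should be retried once with FAULT_FLAG_TRIED:
 *
 *	fault = handle_mm_fault(vma, address, flags, regs);
 *	if (fault & VM_FAULT_RETRY) {
 *		flags |= FAULT_FLAG_TRIED;
 *		goto retry;
 *	}
 *	mmap_read_unlock(mm);
 *	if (unlikely(fault & VM_FAULT_ERROR))
 *		// deliver SIGSEGV/SIGBUS or invoke the OOM path
 */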
6108  
6109  #ifdef CONFIG_LOCK_MM_AND_FIND_VMA
6110  #include <linux/extable.h>
6111  
6112  static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
6113  {
6114  	if (likely(mmap_read_trylock(mm)))
6115  		return true;
6116  
6117  	if (regs && !user_mode(regs)) {
6118  		unsigned long ip = exception_ip(regs);
6119  		if (!search_exception_tables(ip))
6120  			return false;
6121  	}
6122  
6123  	return !mmap_read_lock_killable(mm);
6124  }
6125  
6126  static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
6127  {
6128  	/*
6129  	 * We don't have this operation yet.
6130  	 *
6131  	 * It should be easy enough to do: it's basically a
6132  	 *    atomic_long_try_cmpxchg_acquire()
6133  	 * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
6134  	 * it also needs the proper lockdep magic etc.
6135  	 */
6136  	return false;
6137  }
6138  
6139  static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
6140  {
6141  	mmap_read_unlock(mm);
6142  	if (regs && !user_mode(regs)) {
6143  		unsigned long ip = exception_ip(regs);
6144  		if (!search_exception_tables(ip))
6145  			return false;
6146  	}
6147  	return !mmap_write_lock_killable(mm);
6148  }
6149  
6150  /*
6151   * Helper for page fault handling.
6152   *
6153   * This is kind of equivalent to "mmap_read_lock()" followed
6154   * by "find_extend_vma()", except it's a lot more careful about
6155   * the locking (and will drop the lock on failure).
6156   *
6157   * For example, if we have a kernel bug that causes a page
6158   * fault, we don't want to just use mmap_read_lock() to get
6159   * the mm lock, because that would deadlock if the bug were
6160   * to happen while we're holding the mm lock for writing.
6161   *
6162   * So this checks the exception tables on kernel faults in
6163   * order to only do this all for instructions that are actually
6164   * expected to fault.
6165   *
6166   * We can also actually take the mm lock for writing if we
6167   * need to extend the vma, which helps the VM layer a lot.
6168   */
6169  struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
6170  			unsigned long addr, struct pt_regs *regs)
6171  {
6172  	struct vm_area_struct *vma;
6173  
6174  	if (!get_mmap_lock_carefully(mm, regs))
6175  		return NULL;
6176  
6177  	vma = find_vma(mm, addr);
6178  	if (likely(vma && (vma->vm_start <= addr)))
6179  		return vma;
6180  
6181  	/*
6182  	 * Well, dang. We might still be successful, but only
6183  	 * if we can extend a vma to do so.
6184  	 */
6185  	if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
6186  		mmap_read_unlock(mm);
6187  		return NULL;
6188  	}
6189  
6190  	/*
6191  	 * We can try to upgrade the mmap lock atomically,
6192  	 * in which case we can continue to use the vma
6193  	 * we already looked up.
6194  	 *
6195  	 * Otherwise we'll have to drop the mmap lock and
6196  	 * re-take it, and also look up the vma again,
6197  	 * re-checking it.
6198  	 */
6199  	if (!mmap_upgrade_trylock(mm)) {
6200  		if (!upgrade_mmap_lock_carefully(mm, regs))
6201  			return NULL;
6202  
6203  		vma = find_vma(mm, addr);
6204  		if (!vma)
6205  			goto fail;
6206  		if (vma->vm_start <= addr)
6207  			goto success;
6208  		if (!(vma->vm_flags & VM_GROWSDOWN))
6209  			goto fail;
6210  	}
6211  
6212  	if (expand_stack_locked(vma, addr))
6213  		goto fail;
6214  
6215  success:
6216  	mmap_write_downgrade(mm);
6217  	return vma;
6218  
6219  fail:
6220  	mmap_write_unlock(mm);
6221  	return NULL;
6222  }
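
/*
 * Illustrative usage (assumption, modelled on arch handlers that select
 * CONFIG_LOCK_MM_AND_FIND_VMA): on success the VMA covers addr and the
 * mmap_lock is held for reading; on failure the lock has already been
 * dropped, so the caller only has to report the bad access:
 *
 *	vma = lock_mm_and_find_vma(mm, address, regs);
 *	if (!vma)
 *		// no lock held here: report SIGSEGV / kernel oops
 *	fault = handle_mm_fault(vma, address, flags, regs);
 */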
6223  #endif
6224  
6225  #ifdef CONFIG_PER_VMA_LOCK
6226  /*
6227   * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
6228   * stable and not isolated. If the VMA is not found or is being modified,
6229   * the function returns NULL.
6230   */
6231  struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
6232  					  unsigned long address)
6233  {
6234  	MA_STATE(mas, &mm->mm_mt, address, address);
6235  	struct vm_area_struct *vma;
6236  
6237  	rcu_read_lock();
6238  retry:
6239  	vma = mas_walk(&mas);
6240  	if (!vma)
6241  		goto inval;
6242  
6243  	if (!vma_start_read(vma))
6244  		goto inval;
6245  
6246  	/* Check if the VMA got isolated after we found it */
6247  	if (vma->detached) {
6248  		vma_end_read(vma);
6249  		count_vm_vma_lock_event(VMA_LOCK_MISS);
6250  		/* The area was replaced with another one */
6251  		goto retry;
6252  	}
6253  	/*
6254  	 * At this point, we have a stable reference to a VMA: The VMA is
6255  	 * locked and we know it hasn't already been isolated.
6256  	 * From here on, we can access the VMA without worrying about which
6257  	 * fields are accessible for RCU readers.
6258  	 */
6259  
6260  	/* Check since vm_start/vm_end might change before we lock the VMA */
6261  	if (unlikely(address < vma->vm_start || address >= vma->vm_end))
6262  		goto inval_end_read;
6263  
6264  	rcu_read_unlock();
6265  	return vma;
6266  
6267  inval_end_read:
6268  	vma_end_read(vma);
6269  inval:
6270  	rcu_read_unlock();
6271  	count_vm_vma_lock_event(VMA_LOCK_ABORT);
6272  	return NULL;
6273  }
6274  #endif /* CONFIG_PER_VMA_LOCK */
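/*
 * Illustrative sketch (not from this file): with CONFIG_PER_VMA_LOCK an
 * arch fault handler typically tries the per-VMA lock before falling back
 * to the mmap lock, roughly:
 *
 *	vma = lock_vma_under_rcu(mm, address);
 *	if (!vma)
 *		goto fall_back_to_mmap_lock;
 *	fault = handle_mm_fault(vma, address,
 *				flags | FAULT_FLAG_VMA_LOCK, regs);
 *	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
 *		vma_end_read(vma);
 *
 * Retry handling and the fallback label are arch-specific details.
 */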
6275  
6276  #ifndef __PAGETABLE_P4D_FOLDED
6277  /*
6278   * Allocate p4d page table.
6279   * We've already handled the fast-path in-line.
6280   */
6281  int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
6282  {
6283  	p4d_t *new = p4d_alloc_one(mm, address);
6284  	if (!new)
6285  		return -ENOMEM;
6286  
6287  	spin_lock(&mm->page_table_lock);
6288  	if (pgd_present(*pgd)) {	/* Another has populated it */
6289  		p4d_free(mm, new);
6290  	} else {
6291  		smp_wmb(); /* See comment in pmd_install() */
6292  		pgd_populate(mm, pgd, new);
6293  	}
6294  	spin_unlock(&mm->page_table_lock);
6295  	return 0;
6296  }
6297  #endif /* __PAGETABLE_P4D_FOLDED */
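/*
 * For reference, the inline fast path that pairs with __p4d_alloc() (and,
 * analogously, with __pud_alloc()/__pmd_alloc() below) lives in
 * <linux/mm.h> and looks roughly like:
 *
 *	static inline p4d_t *p4d_alloc(struct mm_struct *mm, pgd_t *pgd,
 *				       unsigned long address)
 *	{
 *		return (unlikely(pgd_none(*pgd)) &&
 *			__p4d_alloc(mm, pgd, address)) ?
 *				NULL : p4d_offset(pgd, address);
 *	}
 *
 * i.e. the slow path here only runs when the entry is still empty.
 */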
6298  
6299  #ifndef __PAGETABLE_PUD_FOLDED
6300  /*
6301   * Allocate page upper directory.
6302   * We've already handled the fast-path in-line.
6303   */
6304  int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
6305  {
6306  	pud_t *new = pud_alloc_one(mm, address);
6307  	if (!new)
6308  		return -ENOMEM;
6309  
6310  	spin_lock(&mm->page_table_lock);
6311  	if (!p4d_present(*p4d)) {
6312  		mm_inc_nr_puds(mm);
6313  		smp_wmb(); /* See comment in pmd_install() */
6314  		p4d_populate(mm, p4d, new);
6315  	} else	/* Another has populated it */
6316  		pud_free(mm, new);
6317  	spin_unlock(&mm->page_table_lock);
6318  	return 0;
6319  }
6320  #endif /* __PAGETABLE_PUD_FOLDED */
6321  
6322  #ifndef __PAGETABLE_PMD_FOLDED
6323  /*
6324   * Allocate page middle directory.
6325   * We've already handled the fast-path in-line.
6326   */
6327  int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
6328  {
6329  	spinlock_t *ptl;
6330  	pmd_t *new = pmd_alloc_one(mm, address);
6331  	if (!new)
6332  		return -ENOMEM;
6333  
6334  	ptl = pud_lock(mm, pud);
6335  	if (!pud_present(*pud)) {
6336  		mm_inc_nr_pmds(mm);
6337  		smp_wmb(); /* See comment in pmd_install() */
6338  		pud_populate(mm, pud, new);
6339  	} else {	/* Another has populated it */
6340  		pmd_free(mm, new);
6341  	}
6342  	spin_unlock(ptl);
6343  	return 0;
6344  }
6345  #endif /* __PAGETABLE_PMD_FOLDED */
6346  
6347  static inline void pfnmap_args_setup(struct follow_pfnmap_args *args,
6348  				     spinlock_t *lock, pte_t *ptep,
6349  				     pgprot_t pgprot, unsigned long pfn_base,
6350  				     unsigned long addr_mask, bool writable,
6351  				     bool special)
6352  {
6353  	args->lock = lock;
6354  	args->ptep = ptep;
6355  	args->pfn = pfn_base + ((args->address & ~addr_mask) >> PAGE_SHIFT);
6356  	args->pgprot = pgprot;
6357  	args->writable = writable;
6358  	args->special = special;
6359  }
6360  
6361  static inline void pfnmap_lockdep_assert(struct vm_area_struct *vma)
6362  {
6363  #ifdef CONFIG_LOCKDEP
6364  	struct file *file = vma->vm_file;
6365  	struct address_space *mapping = file ? file->f_mapping : NULL;
6366  
6367  	if (mapping)
6368  		lockdep_assert(lockdep_is_held(&vma->vm_file->f_mapping->i_mmap_rwsem) ||
6369  			       lockdep_is_held(&vma->vm_mm->mmap_lock));
6370  	else
6371  		lockdep_assert(lockdep_is_held(&vma->vm_mm->mmap_lock));
6372  #endif
6373  }
6374  
6375  /**
6376   * follow_pfnmap_start() - Look up a pfn mapping at a user virtual address
6377   * @args: Pointer to struct @follow_pfnmap_args
6378   *
6379   * The caller needs to set up args->vma and args->address to point to the
6380   * virtual address as the target of such lookup.  On a successful return,
6381   * the results will be put into other output fields.
6382   *
6383   * After the caller has finished using the fields, it must invoke
6384   * follow_pfnmap_end() to properly release the locks and resources of
6385   * such a lookup request.
6386   *
6387   * Between the start() and end() calls, the results in @args are valid
6388   * as the proper locks are held.  After end() is called, none of the
6389   * fields in @follow_pfnmap_args may be accessed any further.  Any use
6390   * of such information after end() requires the caller to synchronize
6391   * properly with page table updates, otherwise it can create a
6392   * security bug.
6393   *
6394   * If the PTE maps a refcounted page, callers are responsible for protecting
6395   * against invalidation with MMU notifiers; otherwise access to the PFN at
6396   * a later point in time can trigger use-after-free.
6397   *
6398   * Only IO mappings and raw PFN mappings are allowed.  The mmap semaphore
6399   * should be taken for read, and it must not be released before
6400   * end() is invoked.
6401   *
6402   * This function must not be used to modify PTE content.
6403   *
6404   * Return: zero on success, negative otherwise.
6405   */
6406  int follow_pfnmap_start(struct follow_pfnmap_args *args)
6407  {
6408  	struct vm_area_struct *vma = args->vma;
6409  	unsigned long address = args->address;
6410  	struct mm_struct *mm = vma->vm_mm;
6411  	spinlock_t *lock;
6412  	pgd_t *pgdp;
6413  	p4d_t *p4dp, p4d;
6414  	pud_t *pudp, pud;
6415  	pmd_t *pmdp, pmd;
6416  	pte_t *ptep, pte;
6417  
6418  	pfnmap_lockdep_assert(vma);
6419  
6420  	if (unlikely(address < vma->vm_start || address >= vma->vm_end))
6421  		goto out;
6422  
6423  	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
6424  		goto out;
6425  retry:
6426  	pgdp = pgd_offset(mm, address);
6427  	if (pgd_none(*pgdp) || unlikely(pgd_bad(*pgdp)))
6428  		goto out;
6429  
6430  	p4dp = p4d_offset(pgdp, address);
6431  	p4d = READ_ONCE(*p4dp);
6432  	if (p4d_none(p4d) || unlikely(p4d_bad(p4d)))
6433  		goto out;
6434  
6435  	pudp = pud_offset(p4dp, address);
6436  	pud = READ_ONCE(*pudp);
6437  	if (pud_none(pud))
6438  		goto out;
6439  	if (pud_leaf(pud)) {
6440  		lock = pud_lock(mm, pudp);
6441  		if (!unlikely(pud_leaf(pud))) {
6442  			spin_unlock(lock);
6443  			goto retry;
6444  		}
6445  		pfnmap_args_setup(args, lock, NULL, pud_pgprot(pud),
6446  				  pud_pfn(pud), PUD_MASK, pud_write(pud),
6447  				  pud_special(pud));
6448  		return 0;
6449  	}
6450  
6451  	pmdp = pmd_offset(pudp, address);
6452  	pmd = pmdp_get_lockless(pmdp);
6453  	if (pmd_leaf(pmd)) {
6454  		lock = pmd_lock(mm, pmdp);
6455  		if (!unlikely(pmd_leaf(pmd))) {
6456  			spin_unlock(lock);
6457  			goto retry;
6458  		}
6459  		pfnmap_args_setup(args, lock, NULL, pmd_pgprot(pmd),
6460  				  pmd_pfn(pmd), PMD_MASK, pmd_write(pmd),
6461  				  pmd_special(pmd));
6462  		return 0;
6463  	}
6464  
6465  	ptep = pte_offset_map_lock(mm, pmdp, address, &lock);
6466  	if (!ptep)
6467  		goto out;
6468  	pte = ptep_get(ptep);
6469  	if (!pte_present(pte))
6470  		goto unlock;
6471  	pfnmap_args_setup(args, lock, ptep, pte_pgprot(pte),
6472  			  pte_pfn(pte), PAGE_MASK, pte_write(pte),
6473  			  pte_special(pte));
6474  	return 0;
6475  unlock:
6476  	pte_unmap_unlock(ptep, lock);
6477  out:
6478  	return -EINVAL;
6479  }
6480  EXPORT_SYMBOL_GPL(follow_pfnmap_start);
6481  
6482  /**
6483   * follow_pfnmap_end(): End a follow_pfnmap_start() process
6484   * @args: Pointer to struct @follow_pfnmap_args
6485   *
6486   * Must be used in pair with follow_pfnmap_start().  See the start() function
6487   * above for more information.
6488   */
6489  void follow_pfnmap_end(struct follow_pfnmap_args *args)
6490  {
6491  	if (args->lock)
6492  		spin_unlock(args->lock);
6493  	if (args->ptep)
6494  		pte_unmap(args->ptep);
6495  }
6496  EXPORT_SYMBOL_GPL(follow_pfnmap_end);
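/*
 * Typical usage of the start()/end() pair above, sketched (see
 * generic_access_phys() below for an in-tree example):
 *
 *	struct follow_pfnmap_args args = { .vma = vma, .address = addr };
 *
 *	if (follow_pfnmap_start(&args))
 *		return -EINVAL;
 *	... use args.pfn, args.pgprot, args.writable ...
 *	follow_pfnmap_end(&args);
 *
 * None of the fields of @args may be used once follow_pfnmap_end() returns.
 */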
6497  
6498  #ifdef CONFIG_HAVE_IOREMAP_PROT
6499  /**
6500   * generic_access_phys - generic implementation for iomem mmap access
6501   * @vma: the vma to access
6502   * @addr: userspace address, not relative offset within @vma
6503   * @buf: buffer to read/write
6504   * @len: length of transfer
6505   * @write: set to FOLL_WRITE when writing, otherwise reading
6506   *
6507   * This is a generic implementation for &vm_operations_struct.access for an
6508   * iomem mapping. This callback is used by access_process_vm() when the @vma is
6509   * not page based.
6510   */
6511  int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
6512  			void *buf, int len, int write)
6513  {
6514  	resource_size_t phys_addr;
6515  	unsigned long prot = 0;
6516  	void __iomem *maddr;
6517  	int offset = offset_in_page(addr);
6518  	int ret = -EINVAL;
6519  	bool writable;
6520  	struct follow_pfnmap_args args = { .vma = vma, .address = addr };
6521  
6522  retry:
6523  	if (follow_pfnmap_start(&args))
6524  		return -EINVAL;
6525  	prot = pgprot_val(args.pgprot);
6526  	phys_addr = (resource_size_t)args.pfn << PAGE_SHIFT;
6527  	writable = args.writable;
6528  	follow_pfnmap_end(&args);
6529  
6530  	if ((write & FOLL_WRITE) && !writable)
6531  		return -EINVAL;
6532  
6533  	maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
6534  	if (!maddr)
6535  		return -ENOMEM;
6536  
6537  	if (follow_pfnmap_start(&args))
6538  		goto out_unmap;
6539  
6540  	if ((prot != pgprot_val(args.pgprot)) ||
6541  	    (phys_addr != (args.pfn << PAGE_SHIFT)) ||
6542  	    (writable != args.writable)) {
6543  		follow_pfnmap_end(&args);
6544  		iounmap(maddr);
6545  		goto retry;
6546  	}
6547  
6548  	if (write)
6549  		memcpy_toio(maddr + offset, buf, len);
6550  	else
6551  		memcpy_fromio(buf, maddr + offset, len);
6552  	ret = len;
6553  	follow_pfnmap_end(&args);
6554  out_unmap:
6555  	iounmap(maddr);
6556  
6557  	return ret;
6558  }
6559  EXPORT_SYMBOL_GPL(generic_access_phys);
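/*
 * A driver that mmaps MMIO or raw PFN ranges can wire the helper above up
 * as its ->access() callback so access_process_vm()/ptrace() keep working
 * on such mappings; a minimal sketch (the name is hypothetical):
 *
 *	static const struct vm_operations_struct foo_phys_vm_ops = {
 *		.access = generic_access_phys,
 *	};
 *
 * with vma->vm_ops pointed at it from the driver's mmap handler.
 */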
6560  #endif
6561  
6562  /*
6563   * Access another process' address space as given in mm.
6564   */
6565  static int __access_remote_vm(struct mm_struct *mm, unsigned long addr,
6566  			      void *buf, int len, unsigned int gup_flags)
6567  {
6568  	void *old_buf = buf;
6569  	int write = gup_flags & FOLL_WRITE;
6570  
6571  	if (mmap_read_lock_killable(mm))
6572  		return 0;
6573  
6574  	/* Untag the address before looking up the VMA */
6575  	addr = untagged_addr_remote(mm, addr);
6576  
6577  	/* Avoid triggering the temporary warning in __get_user_pages */
6578  	if (!vma_lookup(mm, addr) && !expand_stack(mm, addr))
6579  		return 0;
6580  
6581  	/* ignore errors, just check how much was successfully transferred */
6582  	while (len) {
6583  		int bytes, offset;
6584  		void *maddr;
6585  		struct vm_area_struct *vma = NULL;
6586  		struct page *page = get_user_page_vma_remote(mm, addr,
6587  							     gup_flags, &vma);
6588  
6589  		if (IS_ERR(page)) {
6590  			/* We might need to expand the stack to access it */
6591  			vma = vma_lookup(mm, addr);
6592  			if (!vma) {
6593  				vma = expand_stack(mm, addr);
6594  
6595  				/* mmap_lock was dropped on failure */
6596  				if (!vma)
6597  					return buf - old_buf;
6598  
6599  				/* Try again if stack expansion worked */
6600  				continue;
6601  			}
6602  
6603  			/*
6604  			 * Check if this is a VM_IO | VM_PFNMAP VMA, which
6605  			 * we can access using slightly different code.
6606  			 */
6607  			bytes = 0;
6608  #ifdef CONFIG_HAVE_IOREMAP_PROT
6609  			if (vma->vm_ops && vma->vm_ops->access)
6610  				bytes = vma->vm_ops->access(vma, addr, buf,
6611  							    len, write);
6612  #endif
6613  			if (bytes <= 0)
6614  				break;
6615  		} else {
6616  			bytes = len;
6617  			offset = addr & (PAGE_SIZE-1);
6618  			if (bytes > PAGE_SIZE-offset)
6619  				bytes = PAGE_SIZE-offset;
6620  
6621  			maddr = kmap_local_page(page);
6622  			if (write) {
6623  				copy_to_user_page(vma, page, addr,
6624  						  maddr + offset, buf, bytes);
6625  				set_page_dirty_lock(page);
6626  			} else {
6627  				copy_from_user_page(vma, page, addr,
6628  						    buf, maddr + offset, bytes);
6629  			}
6630  			unmap_and_put_page(page, maddr);
6631  		}
6632  		len -= bytes;
6633  		buf += bytes;
6634  		addr += bytes;
6635  	}
6636  	mmap_read_unlock(mm);
6637  
6638  	return buf - old_buf;
6639  }
6640  
6641  /**
6642   * access_remote_vm - access another process' address space
6643   * @mm:		the mm_struct of the target address space
6644   * @addr:	start address to access
6645   * @buf:	source or destination buffer
6646   * @len:	number of bytes to transfer
6647   * @gup_flags:	flags modifying lookup behaviour
6648   *
6649   * The caller must hold a reference on @mm.
6650   *
6651   * Return: number of bytes copied from source to destination.
6652   */
6653  int access_remote_vm(struct mm_struct *mm, unsigned long addr,
6654  		void *buf, int len, unsigned int gup_flags)
6655  {
6656  	return __access_remote_vm(mm, addr, buf, len, gup_flags);
6657  }
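/*
 * For example, a debugger-like caller holding a reference on @mm could read
 * one word from the target address space roughly as follows (sketch only):
 *
 *	unsigned long word;
 *
 *	if (access_remote_vm(mm, addr, &word, sizeof(word), 0) != sizeof(word))
 *		return -EIO;
 *
 * Passing FOLL_WRITE in @gup_flags turns the transfer into a write instead.
 */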
6658  
6659  /*
6660   * Access another process' address space.
6661   * Source/target buffer must be in kernel space.
6662   * Do not walk the page table directly; use get_user_pages().
6663   */
6664  int access_process_vm(struct task_struct *tsk, unsigned long addr,
6665  		void *buf, int len, unsigned int gup_flags)
6666  {
6667  	struct mm_struct *mm;
6668  	int ret;
6669  
6670  	mm = get_task_mm(tsk);
6671  	if (!mm)
6672  		return 0;
6673  
6674  	ret = __access_remote_vm(mm, addr, buf, len, gup_flags);
6675  
6676  	mmput(mm);
6677  
6678  	return ret;
6679  }
6680  EXPORT_SYMBOL_GPL(access_process_vm);
6681  
6682  /*
6683   * Print the name of a VMA.
6684   */
6685  void print_vma_addr(char *prefix, unsigned long ip)
6686  {
6687  	struct mm_struct *mm = current->mm;
6688  	struct vm_area_struct *vma;
6689  
6690  	/*
6691  	 * we might be running from an atomic context so we cannot sleep
6692  	 */
6693  	if (!mmap_read_trylock(mm))
6694  		return;
6695  
6696  	vma = vma_lookup(mm, ip);
6697  	if (vma && vma->vm_file) {
6698  		struct file *f = vma->vm_file;
6699  		ip -= vma->vm_start;
6700  		ip += vma->vm_pgoff << PAGE_SHIFT;
6701  		printk("%s%pD[%lx,%lx+%lx]", prefix, f, ip,
6702  				vma->vm_start,
6703  				vma->vm_end - vma->vm_start);
6704  	}
6705  	mmap_read_unlock(mm);
6706  }
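/*
 * Arch code typically calls this from its unhandled-signal reporting, e.g.
 * something like print_vma_addr(KERN_CONT " in ", instruction_pointer), and
 * the output then reads roughly " in libfoo.so[1af2,7f1234560000+21000]":
 * the offset within the mapped file, the VMA start and the VMA size.
 */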
6707  
6708  #if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
6709  void __might_fault(const char *file, int line)
6710  {
6711  	if (pagefault_disabled())
6712  		return;
6713  	__might_sleep(file, line);
6714  #if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
6715  	if (current->mm)
6716  		might_lock_read(&current->mm->mmap_lock);
6717  #endif
6718  }
6719  EXPORT_SYMBOL(__might_fault);
6720  #endif
6721  
6722  #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
6723  /*
6724   * Process all subpages of the specified huge page with the specified
6725   * operation.  The target subpage will be processed last to keep its
6726   * cache lines hot.
6727   */
6728  static inline int process_huge_page(
6729  	unsigned long addr_hint, unsigned int nr_pages,
6730  	int (*process_subpage)(unsigned long addr, int idx, void *arg),
6731  	void *arg)
6732  {
6733  	int i, n, base, l, ret;
6734  	unsigned long addr = addr_hint &
6735  		~(((unsigned long)nr_pages << PAGE_SHIFT) - 1);
6736  
6737  	/* Process target subpage last to keep its cache lines hot */
6738  	might_sleep();
6739  	n = (addr_hint - addr) / PAGE_SIZE;
6740  	if (2 * n <= nr_pages) {
6741  		/* If target subpage in first half of huge page */
6742  		base = 0;
6743  		l = n;
6744  		/* Process subpages at the end of huge page */
6745  		for (i = nr_pages - 1; i >= 2 * n; i--) {
6746  			cond_resched();
6747  			ret = process_subpage(addr + i * PAGE_SIZE, i, arg);
6748  			if (ret)
6749  				return ret;
6750  		}
6751  	} else {
6752  		/* If target subpage in second half of huge page */
6753  		base = nr_pages - 2 * (nr_pages - n);
6754  		l = nr_pages - n;
6755  		/* Process subpages at the beginning of the huge page */
6756  		for (i = 0; i < base; i++) {
6757  			cond_resched();
6758  			ret = process_subpage(addr + i * PAGE_SIZE, i, arg);
6759  			if (ret)
6760  				return ret;
6761  		}
6762  	}
6763  	/*
6764  	 * Process remaining subpages in left-right-left-right pattern
6765  	 * towards the target subpage
6766  	 */
6767  	for (i = 0; i < l; i++) {
6768  		int left_idx = base + i;
6769  		int right_idx = base + 2 * l - 1 - i;
6770  
6771  		cond_resched();
6772  		ret = process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
6773  		if (ret)
6774  			return ret;
6775  		cond_resched();
6776  		ret = process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
6777  		if (ret)
6778  			return ret;
6779  	}
6780  	return 0;
6781  }
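/*
 * Worked example for the ordering above: with nr_pages == 8 and the target
 * subpage at index 2 (first half, so base == 0 and l == 2), the subpages
 * are processed in the order 7, 6, 5, 4, 0, 3, 1, 2: farthest from the
 * target first, then alternating inwards, with the target itself last so
 * its cache lines stay hot for the caller.
 */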
6782  
6783  static void clear_gigantic_page(struct folio *folio, unsigned long addr,
6784  				unsigned int nr_pages)
6785  {
6786  	int i;
6787  
6788  	might_sleep();
6789  	for (i = 0; i < nr_pages; i++) {
6790  		cond_resched();
6791  		clear_user_highpage(folio_page(folio, i), addr + i * PAGE_SIZE);
6792  	}
6793  }
6794  
6795  static int clear_subpage(unsigned long addr, int idx, void *arg)
6796  {
6797  	struct folio *folio = arg;
6798  
6799  	clear_user_highpage(folio_page(folio, idx), addr);
6800  	return 0;
6801  }
6802  
6803  /**
6804   * folio_zero_user - Zero a folio which will be mapped to userspace.
6805   * @folio: The folio to zero.
6806   * @addr_hint: The address that will be accessed, or the base address if unclear.
6807   */
6808  void folio_zero_user(struct folio *folio, unsigned long addr_hint)
6809  {
6810  	unsigned int nr_pages = folio_nr_pages(folio);
6811  
6812  	if (unlikely(nr_pages > MAX_ORDER_NR_PAGES))
6813  		clear_gigantic_page(folio, addr_hint, nr_pages);
6814  	else
6815  		process_huge_page(addr_hint, nr_pages, clear_subpage, folio);
6816  }
6817  
6818  static int copy_user_gigantic_page(struct folio *dst, struct folio *src,
6819  				   unsigned long addr,
6820  				   struct vm_area_struct *vma,
6821  				   unsigned int nr_pages)
6822  {
6823  	int i;
6824  	struct page *dst_page;
6825  	struct page *src_page;
6826  
6827  	for (i = 0; i < nr_pages; i++) {
6828  		dst_page = folio_page(dst, i);
6829  		src_page = folio_page(src, i);
6830  
6831  		cond_resched();
6832  		if (copy_mc_user_highpage(dst_page, src_page,
6833  					  addr + i*PAGE_SIZE, vma))
6834  			return -EHWPOISON;
6835  	}
6836  	return 0;
6837  }
6838  
6839  struct copy_subpage_arg {
6840  	struct folio *dst;
6841  	struct folio *src;
6842  	struct vm_area_struct *vma;
6843  };
6844  
6845  static int copy_subpage(unsigned long addr, int idx, void *arg)
6846  {
6847  	struct copy_subpage_arg *copy_arg = arg;
6848  	struct page *dst = folio_page(copy_arg->dst, idx);
6849  	struct page *src = folio_page(copy_arg->src, idx);
6850  
6851  	if (copy_mc_user_highpage(dst, src, addr, copy_arg->vma))
6852  		return -EHWPOISON;
6853  	return 0;
6854  }
6855  
6856  int copy_user_large_folio(struct folio *dst, struct folio *src,
6857  			  unsigned long addr_hint, struct vm_area_struct *vma)
6858  {
6859  	unsigned int nr_pages = folio_nr_pages(dst);
6860  	struct copy_subpage_arg arg = {
6861  		.dst = dst,
6862  		.src = src,
6863  		.vma = vma,
6864  	};
6865  
6866  	if (unlikely(nr_pages > MAX_ORDER_NR_PAGES))
6867  		return copy_user_gigantic_page(dst, src, addr_hint, vma, nr_pages);
6868  
6869  	return process_huge_page(addr_hint, nr_pages, copy_subpage, &arg);
6870  }
6871  
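/*
 * Copy folio_size(dst_folio) bytes of user data at @usr_src into @dst_folio,
 * one subpage at a time.  When @allow_pagefault is false the copy runs with
 * page faults disabled.  Returns the number of bytes left uncopied (0 on
 * complete success), mirroring copy_from_user() semantics.
 */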
6872  long copy_folio_from_user(struct folio *dst_folio,
6873  			   const void __user *usr_src,
6874  			   bool allow_pagefault)
6875  {
6876  	void *kaddr;
6877  	unsigned long i, rc = 0;
6878  	unsigned int nr_pages = folio_nr_pages(dst_folio);
6879  	unsigned long ret_val = nr_pages * PAGE_SIZE;
6880  	struct page *subpage;
6881  
6882  	for (i = 0; i < nr_pages; i++) {
6883  		subpage = folio_page(dst_folio, i);
6884  		kaddr = kmap_local_page(subpage);
6885  		if (!allow_pagefault)
6886  			pagefault_disable();
6887  		rc = copy_from_user(kaddr, usr_src + i * PAGE_SIZE, PAGE_SIZE);
6888  		if (!allow_pagefault)
6889  			pagefault_enable();
6890  		kunmap_local(kaddr);
6891  
6892  		ret_val -= (PAGE_SIZE - rc);
6893  		if (rc)
6894  			break;
6895  
6896  		flush_dcache_page(subpage);
6897  
6898  		cond_resched();
6899  	}
6900  	return ret_val;
6901  }
6902  #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
6903  
6904  #if defined(CONFIG_SPLIT_PTE_PTLOCKS) && ALLOC_SPLIT_PTLOCKS
6905  
6906  static struct kmem_cache *page_ptl_cachep;
6907  
6908  void __init ptlock_cache_init(void)
6909  {
6910  	page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
6911  			SLAB_PANIC, NULL);
6912  }
6913  
6914  bool ptlock_alloc(struct ptdesc *ptdesc)
6915  {
6916  	spinlock_t *ptl;
6917  
6918  	ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
6919  	if (!ptl)
6920  		return false;
6921  	ptdesc->ptl = ptl;
6922  	return true;
6923  }
6924  
6925  void ptlock_free(struct ptdesc *ptdesc)
6926  {
6927  	kmem_cache_free(page_ptl_cachep, ptdesc->ptl);
6928  }
6929  #endif
6930  
6931  void vma_pgtable_walk_begin(struct vm_area_struct *vma)
6932  {
6933  	if (is_vm_hugetlb_page(vma))
6934  		hugetlb_vma_lock_read(vma);
6935  }
6936  
6937  void vma_pgtable_walk_end(struct vm_area_struct *vma)
6938  {
6939  	if (is_vm_hugetlb_page(vma))
6940  		hugetlb_vma_unlock_read(vma);
6941  }
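/*
 * The two helpers above are meant to bracket a page table walk, e.g.:
 *
 *	vma_pgtable_walk_begin(vma);
 *	... walk or inspect the page tables backing @vma ...
 *	vma_pgtable_walk_end(vma);
 *
 * For hugetlb VMAs this takes the hugetlb VMA lock for read; for all other
 * VMAs it is currently a no-op.
 */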
6942