1  // SPDX-License-Identifier: GPL-2.0-or-later
2  
3  /*
4   * VMA-specific functions.
5   */
6  
7  #include "vma_internal.h"
8  #include "vma.h"
9  
10  static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next)
11  {
12  	struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev;
13  
14  	if (!mpol_equal(vmg->policy, vma_policy(vma)))
15  		return false;
16  	/*
17  	 * VM_SOFTDIRTY should not prevent VMA merging if the flags match
18  	 * in everything but the dirty bit -- the caller should mark the
19  	 * merged VMA as dirty. If the dirty bit were not excluded from the
20  	 * comparison, we would increase pressure on the memory system by
21  	 * forcing the kernel to generate new VMAs where old ones could be
22  	 * extended instead.
23  	 */
24  	if ((vma->vm_flags ^ vmg->flags) & ~VM_SOFTDIRTY)
25  		return false;
26  	if (vma->vm_file != vmg->file)
27  		return false;
28  	if (!is_mergeable_vm_userfaultfd_ctx(vma, vmg->uffd_ctx))
29  		return false;
30  	if (!anon_vma_name_eq(anon_vma_name(vma), vmg->anon_name))
31  		return false;
32  	return true;
33  }
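/*
 * A worked illustration of the flags test above (values hypothetical): a
 * VM_READ|VM_WRITE|VM_SOFTDIRTY VMA and a proposed VM_READ|VM_WRITE range
 * still pass, because the XOR leaves only VM_SOFTDIRTY set and that bit is
 * masked out:
 *
 *   (VM_READ|VM_WRITE|VM_SOFTDIRTY) ^ (VM_READ|VM_WRITE) == VM_SOFTDIRTY
 *   VM_SOFTDIRTY & ~VM_SOFTDIRTY == 0      -> flags are considered equal
 *
 * Any other differing bit (e.g. VM_EXEC) survives the mask and rejects the
 * merge.
 */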
34  
35  static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1,
36  		 struct anon_vma *anon_vma2, struct vm_area_struct *vma)
37  {
38  	/*
39  	 * The list_is_singular() test is to avoid merging a VMA cloned from
40  	 * parents. This improves scalability by reducing anon_vma lock contention.
41  	 */
42  	if ((!anon_vma1 || !anon_vma2) && (!vma ||
43  		list_is_singular(&vma->anon_vma_chain)))
44  		return true;
45  	return anon_vma1 == anon_vma2;
46  }
47  
48  /* Are the anon_vma's belonging to each VMA compatible with one another? */
49  static inline bool are_anon_vmas_compatible(struct vm_area_struct *vma1,
50  					    struct vm_area_struct *vma2)
51  {
52  	return is_mergeable_anon_vma(vma1->anon_vma, vma2->anon_vma, NULL);
53  }
54  
55  /*
56   * init_multi_vma_prep() - Initializer for struct vma_prepare
57   * @vp: The vma_prepare struct
58   * @vma: The vma that will be altered once locked
59   * @next: The next vma if it is to be adjusted
60   * @remove: The first vma to be removed
61   * @remove2: The second vma to be removed
62   */
63  static void init_multi_vma_prep(struct vma_prepare *vp,
64  				struct vm_area_struct *vma,
65  				struct vm_area_struct *next,
66  				struct vm_area_struct *remove,
67  				struct vm_area_struct *remove2)
68  {
69  	memset(vp, 0, sizeof(struct vma_prepare));
70  	vp->vma = vma;
71  	vp->anon_vma = vma->anon_vma;
72  	vp->remove = remove;
73  	vp->remove2 = remove2;
74  	vp->adj_next = next;
75  	if (!vp->anon_vma && next)
76  		vp->anon_vma = next->anon_vma;
77  
78  	vp->file = vma->vm_file;
79  	if (vp->file)
80  		vp->mapping = vma->vm_file->f_mapping;
81  
82  }
83  
84  /*
85   * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
86   * in front of (at a lower virtual address and file offset than) the vma.
87   *
88   * We cannot merge two vmas if they have differently assigned (non-NULL)
89   * anon_vmas, nor if the same anon_vma is assigned but the offsets are incompatible.
90   *
91   * We don't check here for the merged mmap wrapping around the end of pagecache
92   * indices (16TB on ia32) because do_mmap() does not permit mmaps which
93   * wrap, nor mmaps which cover the final page at index -1UL.
94   *
95   * We assume the vma may be removed as part of the merge.
96   */
97  static bool can_vma_merge_before(struct vma_merge_struct *vmg)
98  {
99  	pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
100  
101  	if (is_mergeable_vma(vmg, /* merge_next = */ true) &&
102  	    is_mergeable_anon_vma(vmg->anon_vma, vmg->next->anon_vma, vmg->next)) {
103  		if (vmg->next->vm_pgoff == vmg->pgoff + pglen)
104  			return true;
105  	}
106  
107  	return false;
108  }
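/*
 * A minimal worked example of the offset check, assuming 4KiB pages and
 * hypothetical values: a proposed file-backed range of 0x2000 bytes with
 * vmg->pgoff == 10 spans pglen == 2 pages, so it lines up with a following
 * VMA only if that VMA's vm_pgoff is 10 + 2 == 12. can_vma_merge_after()
 * below performs the mirror-image check against the preceding VMA:
 * prev->vm_pgoff + vma_pages(prev) must equal vmg->pgoff.
 */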
109  
110  /*
111   * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
112   * beyond (at a higher virtual address and file offset than) the vma.
113   *
114   * We cannot merge two vmas if they have differently assigned (non-NULL)
115   * anon_vmas, nor if the same anon_vma is assigned but the offsets are incompatible.
116   *
117   * We assume that vma is not removed as part of the merge.
118   */
119  static bool can_vma_merge_after(struct vma_merge_struct *vmg)
120  {
121  	if (is_mergeable_vma(vmg, /* merge_next = */ false) &&
122  	    is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) {
123  		if (vmg->prev->vm_pgoff + vma_pages(vmg->prev) == vmg->pgoff)
124  			return true;
125  	}
126  	return false;
127  }
128  
129  static void __vma_link_file(struct vm_area_struct *vma,
130  			    struct address_space *mapping)
131  {
132  	if (vma_is_shared_maywrite(vma))
133  		mapping_allow_writable(mapping);
134  
135  	flush_dcache_mmap_lock(mapping);
136  	vma_interval_tree_insert(vma, &mapping->i_mmap);
137  	flush_dcache_mmap_unlock(mapping);
138  }
139  
140  /*
141   * Requires inode->i_mapping->i_mmap_rwsem
142   */
143  static void __remove_shared_vm_struct(struct vm_area_struct *vma,
144  				      struct address_space *mapping)
145  {
146  	if (vma_is_shared_maywrite(vma))
147  		mapping_unmap_writable(mapping);
148  
149  	flush_dcache_mmap_lock(mapping);
150  	vma_interval_tree_remove(vma, &mapping->i_mmap);
151  	flush_dcache_mmap_unlock(mapping);
152  }
153  
154  /*
155   * vma_prepare() - Helper function for locking VMAs prior to altering them
156   * @vp: The initialized vma_prepare struct
157   */
158  static void vma_prepare(struct vma_prepare *vp)
159  {
160  	if (vp->file) {
161  		uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);
162  
163  		if (vp->adj_next)
164  			uprobe_munmap(vp->adj_next, vp->adj_next->vm_start,
165  				      vp->adj_next->vm_end);
166  
167  		i_mmap_lock_write(vp->mapping);
168  		if (vp->insert && vp->insert->vm_file) {
169  			/*
170  			 * Put into interval tree now, so instantiated pages
171  			 * are visible to arm/parisc __flush_dcache_page
172  			 * throughout; but we cannot insert into address
173  			 * space until vma start or end is updated.
174  			 */
175  			__vma_link_file(vp->insert,
176  					vp->insert->vm_file->f_mapping);
177  		}
178  	}
179  
180  	if (vp->anon_vma) {
181  		anon_vma_lock_write(vp->anon_vma);
182  		anon_vma_interval_tree_pre_update_vma(vp->vma);
183  		if (vp->adj_next)
184  			anon_vma_interval_tree_pre_update_vma(vp->adj_next);
185  	}
186  
187  	if (vp->file) {
188  		flush_dcache_mmap_lock(vp->mapping);
189  		vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap);
190  		if (vp->adj_next)
191  			vma_interval_tree_remove(vp->adj_next,
192  						 &vp->mapping->i_mmap);
193  	}
194  
195  }
196  
197  /*
198   * vma_complete() - Helper function for unlocking after altering VMAs,
199   * or for inserting a VMA.
200   *
201   * @vp: The vma_prepare struct
202   * @vmi: The vma iterator
203   * @mm: The mm_struct
204   */
205  static void vma_complete(struct vma_prepare *vp, struct vma_iterator *vmi,
206  			 struct mm_struct *mm)
207  {
208  	if (vp->file) {
209  		if (vp->adj_next)
210  			vma_interval_tree_insert(vp->adj_next,
211  						 &vp->mapping->i_mmap);
212  		vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap);
213  		flush_dcache_mmap_unlock(vp->mapping);
214  	}
215  
216  	if (vp->remove && vp->file) {
217  		__remove_shared_vm_struct(vp->remove, vp->mapping);
218  		if (vp->remove2)
219  			__remove_shared_vm_struct(vp->remove2, vp->mapping);
220  	} else if (vp->insert) {
221  		/*
222  		 * split_vma has split insert from vma, and needs
223  		 * us to insert it before dropping the locks
224  		 * (it may either follow vma or precede it).
225  		 */
226  		vma_iter_store(vmi, vp->insert);
227  		mm->map_count++;
228  	}
229  
230  	if (vp->anon_vma) {
231  		anon_vma_interval_tree_post_update_vma(vp->vma);
232  		if (vp->adj_next)
233  			anon_vma_interval_tree_post_update_vma(vp->adj_next);
234  		anon_vma_unlock_write(vp->anon_vma);
235  	}
236  
237  	if (vp->file) {
238  		i_mmap_unlock_write(vp->mapping);
239  		uprobe_mmap(vp->vma);
240  
241  		if (vp->adj_next)
242  			uprobe_mmap(vp->adj_next);
243  	}
244  
245  	if (vp->remove) {
246  again:
247  		vma_mark_detached(vp->remove, true);
248  		if (vp->file) {
249  			uprobe_munmap(vp->remove, vp->remove->vm_start,
250  				      vp->remove->vm_end);
251  			fput(vp->file);
252  		}
253  		if (vp->remove->anon_vma)
254  			anon_vma_merge(vp->vma, vp->remove);
255  		mm->map_count--;
256  		mpol_put(vma_policy(vp->remove));
257  		if (!vp->remove2)
258  			WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end);
259  		vm_area_free(vp->remove);
260  
261  		/*
262  		 * In mprotect's case 6 (see comments on vma_merge),
263  		 * we are removing both mid and next vmas
264  		 */
265  		if (vp->remove2) {
266  			vp->remove = vp->remove2;
267  			vp->remove2 = NULL;
268  			goto again;
269  		}
270  	}
271  	if (vp->insert && vp->file)
272  		uprobe_mmap(vp->insert);
273  }
274  
275  /*
276   * init_vma_prep() - Initializer wrapper for vma_prepare struct
277   * @vp: The vma_prepare struct
278   * @vma: The vma that will be altered once locked
279   */
280  static void init_vma_prep(struct vma_prepare *vp, struct vm_area_struct *vma)
281  {
282  	init_multi_vma_prep(vp, vma, NULL, NULL, NULL);
283  }
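/*
 * The prepare/complete helpers above are used in a fixed sequence by the
 * functions below (see __split_vma() and vma_shrink() for concrete callers).
 * A rough sketch of that sequence:
 *
 *	init_vma_prep(&vp, vma);     -- or init_multi_vma_prep() when merging
 *	vma_start_write(vma);        -- exclude per-VMA lock readers
 *	vma_prepare(&vp);            -- take i_mmap / anon_vma locks
 *	... update vm_start / vm_end / vm_pgoff and the maple tree ...
 *	vma_complete(&vp, vmi, mm);  -- reinsert, unlock, free removed VMAs
 */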
284  
285  /*
286   * Can the proposed VMA be merged with the left (previous) VMA taking into
287   * account the start position of the proposed range.
288   */
289  static bool can_vma_merge_left(struct vma_merge_struct *vmg)
291  {
292  	return vmg->prev && vmg->prev->vm_end == vmg->start &&
293  		can_vma_merge_after(vmg);
294  }
295  
296  /*
297   * Can the proposed VMA be merged with the right (next) VMA taking into
298   * account the end position of the proposed range.
299   *
300   * In addition, if we can merge with the left VMA, ensure that left and right
301   * anon_vma's are also compatible.
302   */
303  static bool can_vma_merge_right(struct vma_merge_struct *vmg,
304  				bool can_merge_left)
305  {
306  	if (!vmg->next || vmg->end != vmg->next->vm_start ||
307  	    !can_vma_merge_before(vmg))
308  		return false;
309  
310  	if (!can_merge_left)
311  		return true;
312  
313  	/*
314  	 * If we can merge with prev (left) and next (right), indicating that
315  	 * each VMA's anon_vma is compatible with the proposed anon_vma, this
316  	 * does not mean prev and next are compatible with EACH OTHER.
317  	 *
318  	 * We therefore check this in addition to mergeability to either side.
319  	 */
320  	return are_anon_vmas_compatible(vmg->prev, vmg->next);
321  }
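/*
 * Example of why the extra anon_vma check matters, with hypothetical VMAs:
 * if prev has anon_vma A, next has anon_vma B and the proposed range has no
 * anon_vma of its own, then can_vma_merge_left() and can_vma_merge_right()
 * can each pass individually (a NULL anon_vma is compatible with either
 * neighbour), yet merging all three would have to collapse A and B into a
 * single VMA, which is not permitted. are_anon_vmas_compatible() catches
 * exactly this case.
 */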
322  
323  /*
324   * Close a vm structure and free it.
325   */
326  void remove_vma(struct vm_area_struct *vma, bool unreachable)
327  {
328  	might_sleep();
329  	vma_close(vma);
330  	if (vma->vm_file)
331  		fput(vma->vm_file);
332  	mpol_put(vma_policy(vma));
333  	if (unreachable)
334  		__vm_area_free(vma);
335  	else
336  		vm_area_free(vma);
337  }
338  
339  /*
340   * Get rid of page table information in the indicated region.
341   *
342   * Called with the mm semaphore held.
343   */
344  void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
345  		struct vm_area_struct *prev, struct vm_area_struct *next)
346  {
347  	struct mm_struct *mm = vma->vm_mm;
348  	struct mmu_gather tlb;
349  
350  	lru_add_drain();
351  	tlb_gather_mmu(&tlb, mm);
352  	update_hiwater_rss(mm);
353  	unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end,
354  		   /* mm_wr_locked = */ true);
355  	mas_set(mas, vma->vm_end);
356  	free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
357  		      next ? next->vm_start : USER_PGTABLES_CEILING,
358  		      /* mm_wr_locked = */ true);
359  	tlb_finish_mmu(&tlb);
360  }
361  
362  /*
363   * __split_vma() bypasses sysctl_max_map_count checking.  We use this where it
364   * has already been checked or doesn't make sense to fail.
365   * VMA Iterator will point to the original VMA.
366   */
367  static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
368  		       unsigned long addr, int new_below)
369  {
370  	struct vma_prepare vp;
371  	struct vm_area_struct *new;
372  	int err;
373  
374  	WARN_ON(vma->vm_start >= addr);
375  	WARN_ON(vma->vm_end <= addr);
376  
377  	if (vma->vm_ops && vma->vm_ops->may_split) {
378  		err = vma->vm_ops->may_split(vma, addr);
379  		if (err)
380  			return err;
381  	}
382  
383  	new = vm_area_dup(vma);
384  	if (!new)
385  		return -ENOMEM;
386  
387  	if (new_below) {
388  		new->vm_end = addr;
389  	} else {
390  		new->vm_start = addr;
391  		new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
392  	}
393  
394  	err = -ENOMEM;
395  	vma_iter_config(vmi, new->vm_start, new->vm_end);
396  	if (vma_iter_prealloc(vmi, new))
397  		goto out_free_vma;
398  
399  	err = vma_dup_policy(vma, new);
400  	if (err)
401  		goto out_free_vmi;
402  
403  	err = anon_vma_clone(new, vma);
404  	if (err)
405  		goto out_free_mpol;
406  
407  	if (new->vm_file)
408  		get_file(new->vm_file);
409  
410  	if (new->vm_ops && new->vm_ops->open)
411  		new->vm_ops->open(new);
412  
413  	vma_start_write(vma);
414  	vma_start_write(new);
415  
416  	init_vma_prep(&vp, vma);
417  	vp.insert = new;
418  	vma_prepare(&vp);
419  	vma_adjust_trans_huge(vma, vma->vm_start, addr, 0);
420  
421  	if (new_below) {
422  		vma->vm_start = addr;
423  		vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT;
424  	} else {
425  		vma->vm_end = addr;
426  	}
427  
428  	/* vma_complete stores the new vma */
429  	vma_complete(&vp, vmi, vma->vm_mm);
430  	validate_mm(vma->vm_mm);
431  
432  	/* Success. */
433  	if (new_below)
434  		vma_next(vmi);
435  	else
436  		vma_prev(vmi);
437  
438  	return 0;
439  
440  out_free_mpol:
441  	mpol_put(vma_policy(new));
442  out_free_vmi:
443  	vma_iter_free(vmi);
444  out_free_vma:
445  	vm_area_free(new);
446  	return err;
447  }
448  
449  /*
450   * Split a vma into two pieces at address 'addr'; a new vma is allocated
451   * either for the first part or the tail.
452   */
453  static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
454  		     unsigned long addr, int new_below)
455  {
456  	if (vma->vm_mm->map_count >= sysctl_max_map_count)
457  		return -ENOMEM;
458  
459  	return __split_vma(vmi, vma, addr, new_below);
460  }
461  
462  /*
463   * vma has some anon_vma assigned, and is already inserted on that
464   * anon_vma's interval trees.
465   *
466   * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
467   * vma must be removed from the anon_vma's interval trees using
468   * anon_vma_interval_tree_pre_update_vma().
469   *
470   * After the update, the vma will be reinserted using
471   * anon_vma_interval_tree_post_update_vma().
472   *
473   * The entire update must be protected by exclusive mmap_lock and by
474   * the root anon_vma's mutex.
475   */
476  void
477  anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
478  {
479  	struct anon_vma_chain *avc;
480  
481  	list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
482  		anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
483  }
484  
485  void
486  anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
487  {
488  	struct anon_vma_chain *avc;
489  
490  	list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
491  		anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
492  }
493  
494  /*
495   * dup_anon_vma() - Helper function to duplicate anon_vma
496   * @dst: The destination VMA
497   * @src: The source VMA
498   * @dup: Set to the destination VMA if an anon_vma is duplicated.
499   *
500   * Returns: 0 on success.
501   */
502  static int dup_anon_vma(struct vm_area_struct *dst,
503  			struct vm_area_struct *src, struct vm_area_struct **dup)
504  {
505  	/*
506  	 * Easily overlooked: when mprotect shifts the boundary, make sure the
507  	 * expanding vma has anon_vma set if the shrinking vma had, to cover any
508  	 * anon pages imported.
509  	 */
510  	if (src->anon_vma && !dst->anon_vma) {
511  		int ret;
512  
513  		vma_assert_write_locked(dst);
514  		dst->anon_vma = src->anon_vma;
515  		ret = anon_vma_clone(dst, src);
516  		if (ret)
517  			return ret;
518  
519  		*dup = dst;
520  	}
521  
522  	return 0;
523  }
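/*
 * For example (hypothetical layout): when mprotect() lets prev expand over
 * the first pages of vma, prev is the expanding side and vma the shrinking
 * one. If vma already has anon pages but prev has no anon_vma, a caller does
 * dup_anon_vma(prev, vma, &anon_dup) so that prev takes over vma's anon_vma
 * and rmap can still find those pages once the boundary moves; anon_dup lets
 * the caller unlink the clone again should the merge fail later on.
 */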
524  
525  #ifdef CONFIG_DEBUG_VM_MAPLE_TREE
526  void validate_mm(struct mm_struct *mm)
527  {
528  	int bug = 0;
529  	int i = 0;
530  	struct vm_area_struct *vma;
531  	VMA_ITERATOR(vmi, mm, 0);
532  
533  	mt_validate(&mm->mm_mt);
534  	for_each_vma(vmi, vma) {
535  #ifdef CONFIG_DEBUG_VM_RB
536  		struct anon_vma *anon_vma = vma->anon_vma;
537  		struct anon_vma_chain *avc;
538  #endif
539  		unsigned long vmi_start, vmi_end;
540  		bool warn = 0;
541  
542  		vmi_start = vma_iter_addr(&vmi);
543  		vmi_end = vma_iter_end(&vmi);
544  		if (VM_WARN_ON_ONCE_MM(vma->vm_end != vmi_end, mm))
545  			warn = 1;
546  
547  		if (VM_WARN_ON_ONCE_MM(vma->vm_start != vmi_start, mm))
548  			warn = 1;
549  
550  		if (warn) {
551  			pr_emerg("issue in %s\n", current->comm);
552  			dump_stack();
553  			dump_vma(vma);
554  			pr_emerg("tree range: %px start %lx end %lx\n", vma,
555  				 vmi_start, vmi_end - 1);
556  			vma_iter_dump_tree(&vmi);
557  		}
558  
559  #ifdef CONFIG_DEBUG_VM_RB
560  		if (anon_vma) {
561  			anon_vma_lock_read(anon_vma);
562  			list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
563  				anon_vma_interval_tree_verify(avc);
564  			anon_vma_unlock_read(anon_vma);
565  		}
566  #endif
567  		i++;
568  	}
569  	if (i != mm->map_count) {
570  		pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i);
571  		bug = 1;
572  	}
573  	VM_BUG_ON_MM(bug, mm);
574  }
575  #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
576  
577  /* Actually perform the VMA merge operation. */
578  static int commit_merge(struct vma_merge_struct *vmg,
579  			struct vm_area_struct *adjust,
580  			struct vm_area_struct *remove,
581  			struct vm_area_struct *remove2,
582  			long adj_start,
583  			bool expanded)
584  {
585  	struct vma_prepare vp;
586  
587  	init_multi_vma_prep(&vp, vmg->vma, adjust, remove, remove2);
588  
589  	VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma &&
590  		   vp.anon_vma != adjust->anon_vma);
591  
592  	if (expanded) {
593  		/* Note: vma iterator must be pointing to 'start'. */
594  		vma_iter_config(vmg->vmi, vmg->start, vmg->end);
595  	} else {
596  		vma_iter_config(vmg->vmi, adjust->vm_start + adj_start,
597  				adjust->vm_end);
598  	}
599  
600  	if (vma_iter_prealloc(vmg->vmi, vmg->vma))
601  		return -ENOMEM;
602  
603  	vma_prepare(&vp);
604  	vma_adjust_trans_huge(vmg->vma, vmg->start, vmg->end, adj_start);
605  	vma_set_range(vmg->vma, vmg->start, vmg->end, vmg->pgoff);
606  
607  	if (expanded)
608  		vma_iter_store(vmg->vmi, vmg->vma);
609  
610  	if (adj_start) {
611  		adjust->vm_start += adj_start;
612  		adjust->vm_pgoff += PHYS_PFN(adj_start);
613  		if (adj_start < 0) {
614  			WARN_ON(expanded);
615  			vma_iter_store(vmg->vmi, adjust);
616  		}
617  	}
618  
619  	vma_complete(&vp, vmg->vmi, vmg->vma->vm_mm);
620  
621  	return 0;
622  }
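/*
 * A sketch of the adj_start convention used above, with hypothetical
 * addresses: a positive adj_start means 'adjust' loses that many bytes from
 * its front, e.g. prev absorbing the first 0x1000 bytes of a vma passes
 * adj_start == 0x1000 and vma->vm_start moves up by 0x1000. A negative
 * adj_start means 'adjust' (vmg->next in that case) grows downwards by
 * -adj_start bytes, so its new, lower vm_start must be re-stored in the
 * maple tree -- hence the extra vma_iter_store() in that branch only.
 */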
623  
624  /* We can only remove VMAs when merging if they do not have a close hook. */
625  static bool can_merge_remove_vma(struct vm_area_struct *vma)
626  {
627  	return !vma->vm_ops || !vma->vm_ops->close;
628  }
629  
630  /*
631   * vma_merge_existing_range - Attempt to merge VMAs based on a VMA having its
632   * attributes modified.
633   *
634   * @vmg: Describes the modifications being made to a VMA and associated
635   *       metadata.
636   *
637   * When the attributes of a range within a VMA change, it might be possible
638   * for immediately adjacent VMAs to be merged into that VMA due to having
639   * identical properties.
640   *
641   * This function checks for the existence of any such mergeable VMAs and updates
642   * the maple tree describing the @vmg->vma->vm_mm address space to account for
643   * this, as well as any VMAs shrunk/expanded/deleted as a result of this merge.
644   *
645   * As part of this operation, if a merge occurs, the @vmg object will have its
646   * vma, start, end, and pgoff fields modified to execute the merge. Subsequent
647   * calls to this function should reset these fields.
648   *
649   * Returns: The merged VMA if merge succeeds, or NULL otherwise.
650   *
651   * ASSUMPTIONS:
652   * - The caller must assign the VMA to be modified to @vmg->vma.
653   * - The caller must have set @vmg->prev to the previous VMA, if there is one.
654   * - The caller must not set @vmg->next, as we determine this.
655   * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
656   * - vmi must be positioned within [@vmg->vma->vm_start, @vmg->vma->vm_end).
657   */
658  static struct vm_area_struct *vma_merge_existing_range(struct vma_merge_struct *vmg)
659  {
660  	struct vm_area_struct *vma = vmg->vma;
661  	struct vm_area_struct *prev = vmg->prev;
662  	struct vm_area_struct *next, *res;
663  	struct vm_area_struct *anon_dup = NULL;
664  	struct vm_area_struct *adjust = NULL;
665  	unsigned long start = vmg->start;
666  	unsigned long end = vmg->end;
667  	bool left_side = vma && start == vma->vm_start;
668  	bool right_side = vma && end == vma->vm_end;
669  	int err = 0;
670  	long adj_start = 0;
671  	bool merge_will_delete_vma, merge_will_delete_next;
672  	bool merge_left, merge_right, merge_both;
673  	bool expanded;
674  
675  	mmap_assert_write_locked(vmg->mm);
676  	VM_WARN_ON(!vma); /* We are modifying a VMA, so caller must specify. */
677  	VM_WARN_ON(vmg->next); /* We set this. */
678  	VM_WARN_ON(prev && start <= prev->vm_start);
679  	VM_WARN_ON(start >= end);
680  	/*
681  	 * If vma == prev, then we are offset into a VMA. Otherwise, we must
682  	 * span a portion of the VMA.
683  	 */
684  	VM_WARN_ON(vma && ((vma != prev && vmg->start != vma->vm_start) ||
685  			   vmg->end > vma->vm_end));
686  	/* The vmi must be positioned within vmg->vma. */
687  	VM_WARN_ON(vma && !(vma_iter_addr(vmg->vmi) >= vma->vm_start &&
688  			    vma_iter_addr(vmg->vmi) < vma->vm_end));
689  
690  	vmg->state = VMA_MERGE_NOMERGE;
691  
692  	/*
693  	 * If a special mapping or if the range being modified is neither at the
694  	 * furthermost left or right side of the VMA, then we have no chance of
695  	 * merging and should abort.
696  	 */
697  	if (vmg->flags & VM_SPECIAL || (!left_side && !right_side))
698  		return NULL;
699  
700  	if (left_side)
701  		merge_left = can_vma_merge_left(vmg);
702  	else
703  		merge_left = false;
704  
705  	if (right_side) {
706  		next = vmg->next = vma_iter_next_range(vmg->vmi);
707  		vma_iter_prev_range(vmg->vmi);
708  
709  		merge_right = can_vma_merge_right(vmg, merge_left);
710  	} else {
711  		merge_right = false;
712  		next = NULL;
713  	}
714  
715  	if (merge_left)		/* If merging prev, position iterator there. */
716  		vma_prev(vmg->vmi);
717  	else if (!merge_right)	/* If we have nothing to merge, abort. */
718  		return NULL;
719  
720  	merge_both = merge_left && merge_right;
721  	/* If we span the entire VMA, a merge implies it will be deleted. */
722  	merge_will_delete_vma = left_side && right_side;
723  
724  	/*
725  	 * If we need to remove vma in its entirety but are unable to do so,
726  	 * we have no sensible recourse but to abort the merge.
727  	 */
728  	if (merge_will_delete_vma && !can_merge_remove_vma(vma))
729  		return NULL;
730  
731  	/*
732  	 * If we merge both VMAs, then next is also deleted. This implies
733  	 * merge_will_delete_vma also.
734  	 */
735  	merge_will_delete_next = merge_both;
736  
737  	/*
738  	 * If we cannot delete next, then we can reduce the operation to merging
739  	 * prev and vma (thereby deleting vma).
740  	 */
741  	if (merge_will_delete_next && !can_merge_remove_vma(next)) {
742  		merge_will_delete_next = false;
743  		merge_right = false;
744  		merge_both = false;
745  	}
746  
747  	/* No matter what happens, we will be adjusting vma. */
748  	vma_start_write(vma);
749  
750  	if (merge_left)
751  		vma_start_write(prev);
752  
753  	if (merge_right)
754  		vma_start_write(next);
755  
756  	if (merge_both) {
757  		/*
758  		 *         |<----->|
759  		 * |-------*********-------|
760  		 *   prev     vma     next
761  		 *  extend   delete  delete
762  		 */
763  
764  		vmg->vma = prev;
765  		vmg->start = prev->vm_start;
766  		vmg->end = next->vm_end;
767  		vmg->pgoff = prev->vm_pgoff;
768  
769  		/*
770  		 * We already ensured anon_vma compatibility above, so now it's
771  		 * simply a case of, if prev has no anon_vma object, which of
772  		 * next or vma contains the anon_vma we must duplicate.
773  		 */
774  		err = dup_anon_vma(prev, next->anon_vma ? next : vma, &anon_dup);
775  	} else if (merge_left) {
776  		/*
777  		 *         |<----->| OR
778  		 *         |<--------->|
779  		 * |-------*************
780  		 *   prev       vma
781  		 *  extend shrink/delete
782  		 */
783  
784  		vmg->vma = prev;
785  		vmg->start = prev->vm_start;
786  		vmg->pgoff = prev->vm_pgoff;
787  
788  		if (!merge_will_delete_vma) {
789  			adjust = vma;
790  			adj_start = vmg->end - vma->vm_start;
791  		}
792  
793  		err = dup_anon_vma(prev, vma, &anon_dup);
794  	} else { /* merge_right */
795  		/*
796  		 *     |<----->| OR
797  		 * |<--------->|
798  		 * *************-------|
799  		 *      vma       next
800  		 * shrink/delete extend
801  		 */
802  
803  		pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
804  
805  		VM_WARN_ON(!merge_right);
806  		/* If we are offset into a VMA, then prev must be vma. */
807  		VM_WARN_ON(vmg->start > vma->vm_start && prev && vma != prev);
808  
809  		if (merge_will_delete_vma) {
810  			vmg->vma = next;
811  			vmg->end = next->vm_end;
812  			vmg->pgoff = next->vm_pgoff - pglen;
813  		} else {
814  			/*
815  			 * We shrink vma and expand next.
816  			 *
817  			 * IMPORTANT: This is the ONLY case where the final
818  			 * merged VMA is NOT vmg->vma, but rather vmg->next.
819  			 */
820  
821  			vmg->start = vma->vm_start;
822  			vmg->end = start;
823  			vmg->pgoff = vma->vm_pgoff;
824  
825  			adjust = next;
826  			adj_start = -(vma->vm_end - start);
827  		}
828  
829  		err = dup_anon_vma(next, vma, &anon_dup);
830  	}
831  
832  	if (err)
833  		goto abort;
834  
835  	/*
836  	 * In nearly all cases, we expand vmg->vma. There is one exception -
837  	 * merge_right where we partially span the VMA. In this case we shrink
838  	 * the end of vmg->vma and adjust the start of vmg->next accordingly.
839  	 */
840  	expanded = !merge_right || merge_will_delete_vma;
841  
842  	if (commit_merge(vmg, adjust,
843  			 merge_will_delete_vma ? vma : NULL,
844  			 merge_will_delete_next ? next : NULL,
845  			 adj_start, expanded)) {
846  		if (anon_dup)
847  			unlink_anon_vmas(anon_dup);
848  
849  		vmg->state = VMA_MERGE_ERROR_NOMEM;
850  		return NULL;
851  	}
852  
853  	res = merge_left ? prev : next;
854  	khugepaged_enter_vma(res, vmg->flags);
855  
856  	vmg->state = VMA_MERGE_SUCCESS;
857  	return res;
858  
859  abort:
860  	vma_iter_set(vmg->vmi, start);
861  	vma_iter_load(vmg->vmi);
862  	vmg->state = VMA_MERGE_ERROR_NOMEM;
863  	return NULL;
864  }
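/*
 * A condensed sketch of how a caller is expected to drive this function
 * (vma_modify() below is the in-tree user); the macro and fields are the
 * ones used later in this file:
 *
 *	VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
 *	vmg.flags = new_flags;            -- the attribute(s) being changed
 *	merged = vma_merge_existing_range(&vmg);
 *	if (!merged)
 *		... fall back to split_vma() on the unmerged range ...
 */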
865  
866  /*
867   * vma_merge_new_range - Attempt to merge a new VMA into address space
868   *
869   * @vmg: Describes the VMA we are adding, in the range @vmg->start to @vmg->end
870   *       (exclusive), which we try to merge with any adjacent VMAs if possible.
871   *
872   * We are about to add a VMA to the address space starting at @vmg->start and
873   * ending at @vmg->end. There are three different possible scenarios:
874   *
875   * 1. There is a VMA with identical properties immediately adjacent to the
876   *    proposed new VMA [@vmg->start, @vmg->end) either before or after it -
877   *    EXPAND that VMA:
878   *
879   * Proposed:       |-----|  or  |-----|
880   * Existing:  |----|                  |----|
881   *
882   * 2. There are VMAs with identical properties immediately adjacent to the
883   *    proposed new VMA [@vmg->start, @vmg->end) both before AND after it -
884   *    EXPAND the former and REMOVE the latter:
885   *
886   * Proposed:       |-----|
887   * Existing:  |----|     |----|
888   *
889   * 3. There are no VMAs immediately adjacent to the proposed new VMA or those
890   *    VMAs do not have identical attributes - NO MERGE POSSIBLE.
891   *
892   * In instances where we can merge, this function returns the expanded VMA which
893   * will have its range adjusted accordingly and the underlying maple tree also
894   * adjusted.
895   *
896   * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer
897   *          to the VMA we expanded.
898   *
899   * This function adjusts @vmg to provide @vmg->next if not already specified,
900   * and adjusts [@vmg->start, @vmg->end) to span the expanded range.
901   *
902   * ASSUMPTIONS:
903   * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
904   * - The caller must have determined that [@vmg->start, @vmg->end) is empty,
905   *   other than VMAs that will be unmapped should the operation succeed.
906   * - The caller must have specified the previous vma in @vmg->prev.
907   * - The caller must have specified the next vma in @vmg->next.
908   * - The caller must have positioned the vmi at or before the gap.
909   */
910  struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
911  {
912  	struct vm_area_struct *prev = vmg->prev;
913  	struct vm_area_struct *next = vmg->next;
914  	unsigned long start = vmg->start;
915  	unsigned long end = vmg->end;
916  	pgoff_t pgoff = vmg->pgoff;
917  	pgoff_t pglen = PHYS_PFN(end - start);
918  	bool can_merge_left, can_merge_right;
919  	bool just_expand = vmg->merge_flags & VMG_FLAG_JUST_EXPAND;
920  
921  	mmap_assert_write_locked(vmg->mm);
922  	VM_WARN_ON(vmg->vma);
923  	/* vmi must point at or before the gap. */
924  	VM_WARN_ON(vma_iter_addr(vmg->vmi) > end);
925  
926  	vmg->state = VMA_MERGE_NOMERGE;
927  
928  	/* Special VMAs are unmergeable, also if no prev/next. */
929  	if ((vmg->flags & VM_SPECIAL) || (!prev && !next))
930  		return NULL;
931  
932  	can_merge_left = can_vma_merge_left(vmg);
933  	can_merge_right = !just_expand && can_vma_merge_right(vmg, can_merge_left);
934  
935  	/* If we can merge with the next VMA, adjust vmg accordingly. */
936  	if (can_merge_right) {
937  		vmg->end = next->vm_end;
938  		vmg->vma = next;
939  		vmg->pgoff = next->vm_pgoff - pglen;
940  	}
941  
942  	/* If we can merge with the previous VMA, adjust vmg accordingly. */
943  	if (can_merge_left) {
944  		vmg->start = prev->vm_start;
945  		vmg->vma = prev;
946  		vmg->pgoff = prev->vm_pgoff;
947  
948  		/*
949  		 * If this merge would result in removal of the next VMA but we
950  		 * are not permitted to do so, reduce the operation to merging
951  		 * prev and vma.
952  		 */
953  		if (can_merge_right && !can_merge_remove_vma(next))
954  			vmg->end = end;
955  
956  		/* In expand-only case we are already positioned at prev. */
957  		if (!just_expand) {
958  			/* Equivalent to going to the previous range. */
959  			vma_prev(vmg->vmi);
960  		}
961  	}
962  
963  	/*
964  	 * Now try to expand adjacent VMA(s). This takes care of removing the
965  	 * following VMA if we have VMAs on both sides.
966  	 */
967  	if (vmg->vma && !vma_expand(vmg)) {
968  		khugepaged_enter_vma(vmg->vma, vmg->flags);
969  		vmg->state = VMA_MERGE_SUCCESS;
970  		return vmg->vma;
971  	}
972  
973  	/* If expansion failed, reset state. Allows us to retry merge later. */
974  	if (!just_expand) {
975  		if (vmg->vma == prev)
976  			vma_iter_set(vmg->vmi, start);
977  		vmg->vma = NULL;
978  		vmg->start = start;
979  		vmg->end = end;
980  		vmg->pgoff = pgoff;
981  	}
982  
983  	return NULL;
984  }
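/*
 * Worked example of the pgoff fix-up above, assuming 4KiB pages and
 * hypothetical numbers: when only a right-hand merge is possible, the new
 * range [start, end) is absorbed into next, so next must describe file
 * offsets starting pglen pages earlier. For a two-page range in front of a
 * next whose vm_pgoff is 12, the merged VMA gets vm_pgoff 12 - 2 == 10,
 * which is exactly the vmg->pgoff the caller supplied when the check in
 * can_vma_merge_before() passed.
 */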
985  
986  /*
987   * vma_expand - Expand an existing VMA
988   *
989   * @vmg: Describes a VMA expansion operation.
990   *
991   * Expand @vma to vmg->start and vmg->end.  Can expand off the start and end.
992   * Will expand over vmg->next if it's different from vmg->vma and vmg->end ==
993   * vmg->next->vm_end.  Checking if the vmg->vma can expand and merge with
994   * vmg->next needs to be handled by the caller.
995   *
996   * Returns: 0 on success.
997   *
998   * ASSUMPTIONS:
999   * - The caller must hold a WRITE lock on vmg->vma->mm->mmap_lock.
1000   * - The caller must have set @vmg->vma and @vmg->next.
1001   */
1002  int vma_expand(struct vma_merge_struct *vmg)
1003  {
1004  	struct vm_area_struct *anon_dup = NULL;
1005  	bool remove_next = false;
1006  	struct vm_area_struct *vma = vmg->vma;
1007  	struct vm_area_struct *next = vmg->next;
1008  
1009  	mmap_assert_write_locked(vmg->mm);
1010  
1011  	vma_start_write(vma);
1012  	if (next && (vma != next) && (vmg->end == next->vm_end)) {
1013  		int ret;
1014  
1015  		remove_next = true;
1016  		/* This should already have been checked by this point. */
1017  		VM_WARN_ON(!can_merge_remove_vma(next));
1018  		vma_start_write(next);
1019  		ret = dup_anon_vma(vma, next, &anon_dup);
1020  		if (ret)
1021  			return ret;
1022  	}
1023  
1024  	/* Not merging but overwriting any part of next is not handled. */
1025  	VM_WARN_ON(next && !remove_next &&
1026  		  next != vma && vmg->end > next->vm_start);
1027  	/* Only handles expanding */
1028  	VM_WARN_ON(vma->vm_start < vmg->start || vma->vm_end > vmg->end);
1029  
1030  	if (commit_merge(vmg, NULL, remove_next ? next : NULL, NULL, 0, true))
1031  		goto nomem;
1032  
1033  	return 0;
1034  
1035  nomem:
1036  	vmg->state = VMA_MERGE_ERROR_NOMEM;
1037  	if (anon_dup)
1038  		unlink_anon_vmas(anon_dup);
1039  	return -ENOMEM;
1040  }
1041  
1042  /*
1043   * vma_shrink() - Reduce an existing VMAs memory area
1044   * @vmi: The vma iterator
1045   * @vma: The VMA to modify
1046   * @start: The new start
1047   * @end: The new end
1048   *
1049   * Returns: 0 on success, -ENOMEM otherwise
1050   */
1051  int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
1052  	       unsigned long start, unsigned long end, pgoff_t pgoff)
1053  {
1054  	struct vma_prepare vp;
1055  
1056  	WARN_ON((vma->vm_start != start) && (vma->vm_end != end));
1057  
1058  	if (vma->vm_start < start)
1059  		vma_iter_config(vmi, vma->vm_start, start);
1060  	else
1061  		vma_iter_config(vmi, end, vma->vm_end);
1062  
1063  	if (vma_iter_prealloc(vmi, NULL))
1064  		return -ENOMEM;
1065  
1066  	vma_start_write(vma);
1067  
1068  	init_vma_prep(&vp, vma);
1069  	vma_prepare(&vp);
1070  	vma_adjust_trans_huge(vma, start, end, 0);
1071  
1072  	vma_iter_clear(vmi);
1073  	vma_set_range(vma, start, end, pgoff);
1074  	vma_complete(&vp, vmi, vma->vm_mm);
1075  	validate_mm(vma->vm_mm);
1076  	return 0;
1077  }
1078  
1079  static inline void vms_clear_ptes(struct vma_munmap_struct *vms,
1080  		    struct ma_state *mas_detach, bool mm_wr_locked)
1081  {
1082  	struct mmu_gather tlb;
1083  
1084  	if (!vms->clear_ptes) /* Nothing to do */
1085  		return;
1086  
1087  	/*
1088  	 * We can free page tables without write-locking mmap_lock because VMAs
1089  	 * were isolated before we downgraded mmap_lock.
1090  	 */
1091  	mas_set(mas_detach, 1);
1092  	lru_add_drain();
1093  	tlb_gather_mmu(&tlb, vms->vma->vm_mm);
1094  	update_hiwater_rss(vms->vma->vm_mm);
1095  	unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end,
1096  		   vms->vma_count, mm_wr_locked);
1097  
1098  	mas_set(mas_detach, 1);
1099  	/* start and end may be different if there is no prev or next vma. */
1100  	free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start,
1101  		      vms->unmap_end, mm_wr_locked);
1102  	tlb_finish_mmu(&tlb);
1103  	vms->clear_ptes = false;
1104  }
1105  
1106  void vms_clean_up_area(struct vma_munmap_struct *vms,
1107  		struct ma_state *mas_detach)
1108  {
1109  	struct vm_area_struct *vma;
1110  
1111  	if (!vms->nr_pages)
1112  		return;
1113  
1114  	vms_clear_ptes(vms, mas_detach, true);
1115  	mas_set(mas_detach, 0);
1116  	mas_for_each(mas_detach, vma, ULONG_MAX)
1117  		vma_close(vma);
1118  }
1119  
1120  /*
1121   * vms_complete_munmap_vmas() - Finish the munmap() operation
1122   * @vms: The vma munmap struct
1123   * @mas_detach: The maple state of the detached vmas
1124   *
1125   * This updates the mm_struct, unmaps the region, frees the resources
1126   * used for the munmap() and may downgrade the lock - if requested.  Everything
1127   * needed to be done once the vma maple tree is updated.
1128   */
1129  void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
1130  		struct ma_state *mas_detach)
1131  {
1132  	struct vm_area_struct *vma;
1133  	struct mm_struct *mm;
1134  
1135  	mm = current->mm;
1136  	mm->map_count -= vms->vma_count;
1137  	mm->locked_vm -= vms->locked_vm;
1138  	if (vms->unlock)
1139  		mmap_write_downgrade(mm);
1140  
1141  	if (!vms->nr_pages)
1142  		return;
1143  
1144  	vms_clear_ptes(vms, mas_detach, !vms->unlock);
1145  	/* Update high watermark before we lower total_vm */
1146  	update_hiwater_vm(mm);
1147  	/* Stat accounting */
1148  	WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm) - vms->nr_pages);
1149  	/* Paranoid bookkeeping */
1150  	VM_WARN_ON(vms->exec_vm > mm->exec_vm);
1151  	VM_WARN_ON(vms->stack_vm > mm->stack_vm);
1152  	VM_WARN_ON(vms->data_vm > mm->data_vm);
1153  	mm->exec_vm -= vms->exec_vm;
1154  	mm->stack_vm -= vms->stack_vm;
1155  	mm->data_vm -= vms->data_vm;
1156  
1157  	/* Remove and clean up vmas */
1158  	mas_set(mas_detach, 0);
1159  	mas_for_each(mas_detach, vma, ULONG_MAX)
1160  		remove_vma(vma, /* unreachable = */ false);
1161  
1162  	vm_unacct_memory(vms->nr_accounted);
1163  	validate_mm(mm);
1164  	if (vms->unlock)
1165  		mmap_read_unlock(mm);
1166  
1167  	__mt_destroy(mas_detach->tree);
1168  }
1169  
1170  /*
1171   * vms_gather_munmap_vmas() - Put all VMAs within a range into a maple tree
1172   * for removal at a later date.  Handles splitting first and last if necessary
1173   * and marking the vmas as isolated.
1174   *
1175   * @vms: The vma munmap struct
1176   * @mas_detach: The maple state tracking the detached tree
1177   *
1178   * Return: 0 on success, error otherwise
1179   */
1180  int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
1181  		struct ma_state *mas_detach)
1182  {
1183  	struct vm_area_struct *next = NULL;
1184  	int error;
1185  
1186  	/*
1187  	 * If we need to split any vma, do it now to save pain later.
1188  	 * Does it split the first one?
1189  	 */
1190  	if (vms->start > vms->vma->vm_start) {
1191  
1192  		/*
1193  		 * Make sure that map_count on return from munmap() will
1194  		 * not exceed its limit; but let map_count go just above
1195  		 * its limit temporarily, to help free resources as expected.
1196  		 */
1197  		if (vms->end < vms->vma->vm_end &&
1198  		    vms->vma->vm_mm->map_count >= sysctl_max_map_count) {
1199  			error = -ENOMEM;
1200  			goto map_count_exceeded;
1201  		}
1202  
1203  		/* Don't bother splitting the VMA if we can't unmap it anyway */
1204  		if (!can_modify_vma(vms->vma)) {
1205  			error = -EPERM;
1206  			goto start_split_failed;
1207  		}
1208  
1209  		error = __split_vma(vms->vmi, vms->vma, vms->start, 1);
1210  		if (error)
1211  			goto start_split_failed;
1212  	}
1213  	vms->prev = vma_prev(vms->vmi);
1214  	if (vms->prev)
1215  		vms->unmap_start = vms->prev->vm_end;
1216  
1217  	/*
1218  	 * Detach a range of VMAs from the mm. Using next as a temp variable as
1219  	 * it is always overwritten.
1220  	 */
1221  	for_each_vma_range(*(vms->vmi), next, vms->end) {
1222  		long nrpages;
1223  
1224  		if (!can_modify_vma(next)) {
1225  			error = -EPERM;
1226  			goto modify_vma_failed;
1227  		}
1228  		/* Does it split the end? */
1229  		if (next->vm_end > vms->end) {
1230  			error = __split_vma(vms->vmi, next, vms->end, 0);
1231  			if (error)
1232  				goto end_split_failed;
1233  		}
1234  		vma_start_write(next);
1235  		mas_set(mas_detach, vms->vma_count++);
1236  		error = mas_store_gfp(mas_detach, next, GFP_KERNEL);
1237  		if (error)
1238  			goto munmap_gather_failed;
1239  
1240  		vma_mark_detached(next, true);
1241  		nrpages = vma_pages(next);
1242  
1243  		vms->nr_pages += nrpages;
1244  		if (next->vm_flags & VM_LOCKED)
1245  			vms->locked_vm += nrpages;
1246  
1247  		if (next->vm_flags & VM_ACCOUNT)
1248  			vms->nr_accounted += nrpages;
1249  
1250  		if (is_exec_mapping(next->vm_flags))
1251  			vms->exec_vm += nrpages;
1252  		else if (is_stack_mapping(next->vm_flags))
1253  			vms->stack_vm += nrpages;
1254  		else if (is_data_mapping(next->vm_flags))
1255  			vms->data_vm += nrpages;
1256  
1257  		if (unlikely(vms->uf)) {
1258  			/*
1259  			 * If userfaultfd_unmap_prep returns an error the vmas
1260  			 * will remain split, but userland will get a
1261  			 * highly unexpected error anyway. This is no
1262  			 * different than the case where the first of the two
1263  			 * __split_vma fails, but we don't undo the first
1264  			 * split, even though we could. This failure is unlikely
1265  			 * enough that it's not worth optimizing for.
1266  			 */
1267  			error = userfaultfd_unmap_prep(next, vms->start,
1268  						       vms->end, vms->uf);
1269  			if (error)
1270  				goto userfaultfd_error;
1271  		}
1272  #ifdef CONFIG_DEBUG_VM_MAPLE_TREE
1273  		BUG_ON(next->vm_start < vms->start);
1274  		BUG_ON(next->vm_start > vms->end);
1275  #endif
1276  	}
1277  
1278  	vms->next = vma_next(vms->vmi);
1279  	if (vms->next)
1280  		vms->unmap_end = vms->next->vm_start;
1281  
1282  #if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
1283  	/* Make sure no VMAs are about to be lost. */
1284  	{
1285  		MA_STATE(test, mas_detach->tree, 0, 0);
1286  		struct vm_area_struct *vma_mas, *vma_test;
1287  		int test_count = 0;
1288  
1289  		vma_iter_set(vms->vmi, vms->start);
1290  		rcu_read_lock();
1291  		vma_test = mas_find(&test, vms->vma_count - 1);
1292  		for_each_vma_range(*(vms->vmi), vma_mas, vms->end) {
1293  			BUG_ON(vma_mas != vma_test);
1294  			test_count++;
1295  			vma_test = mas_next(&test, vms->vma_count - 1);
1296  		}
1297  		rcu_read_unlock();
1298  		BUG_ON(vms->vma_count != test_count);
1299  	}
1300  #endif
1301  
1302  	while (vma_iter_addr(vms->vmi) > vms->start)
1303  		vma_iter_prev_range(vms->vmi);
1304  
1305  	vms->clear_ptes = true;
1306  	return 0;
1307  
1308  userfaultfd_error:
1309  munmap_gather_failed:
1310  end_split_failed:
1311  modify_vma_failed:
1312  	reattach_vmas(mas_detach);
1313  start_split_failed:
1314  map_count_exceeded:
1315  	return error;
1316  }
1317  
1318  /*
1319   * do_vmi_align_munmap() - munmap the aligned region from @start to @end.
1320   * @vmi: The vma iterator
1321   * @vma: The starting vm_area_struct
1322   * @mm: The mm_struct
1323   * @start: The aligned start address to munmap.
1324   * @end: The aligned end address to munmap.
1325   * @uf: The userfaultfd list_head
1326   * @unlock: Set to true to drop the mmap_lock.  unlocking only happens on
1327   * success.
1328   *
1329   * Return: 0 on success and drops the lock if so directed, error and leaves the
1330   * lock held otherwise.
1331   */
1332  int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
1333  		struct mm_struct *mm, unsigned long start, unsigned long end,
1334  		struct list_head *uf, bool unlock)
1335  {
1336  	struct maple_tree mt_detach;
1337  	MA_STATE(mas_detach, &mt_detach, 0, 0);
1338  	mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
1339  	mt_on_stack(mt_detach);
1340  	struct vma_munmap_struct vms;
1341  	int error;
1342  
1343  	init_vma_munmap(&vms, vmi, vma, start, end, uf, unlock);
1344  	error = vms_gather_munmap_vmas(&vms, &mas_detach);
1345  	if (error)
1346  		goto gather_failed;
1347  
1348  	error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL);
1349  	if (error)
1350  		goto clear_tree_failed;
1351  
1352  	/* Point of no return */
1353  	vms_complete_munmap_vmas(&vms, &mas_detach);
1354  	return 0;
1355  
1356  clear_tree_failed:
1357  	reattach_vmas(&mas_detach);
1358  gather_failed:
1359  	validate_mm(mm);
1360  	return error;
1361  }
1362  
1363  /*
1364   * do_vmi_munmap() - munmap a given range.
1365   * @vmi: The vma iterator
1366   * @mm: The mm_struct
1367   * @start: The start address to munmap
1368   * @len: The length of the range to munmap
1369   * @uf: The userfaultfd list_head
1370   * @unlock: set to true if the user wants to drop the mmap_lock on success
1371   *
1372   * This function takes a @vmi that is either pointing to the previous VMA or set
1373   * to MA_START and sets it up to remove the mapping(s).  The @len will be
1374   * page aligned.
1375   *
1376   * Return: 0 on success and drops the lock if so directed, error and leaves the
1377   * lock held otherwise.
1378   */
1379  int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
1380  		  unsigned long start, size_t len, struct list_head *uf,
1381  		  bool unlock)
1382  {
1383  	unsigned long end;
1384  	struct vm_area_struct *vma;
1385  
1386  	if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start)
1387  		return -EINVAL;
1388  
1389  	end = start + PAGE_ALIGN(len);
1390  	if (end == start)
1391  		return -EINVAL;
1392  
1393  	/* Find the first overlapping VMA */
1394  	vma = vma_find(vmi, end);
1395  	if (!vma) {
1396  		if (unlock)
1397  			mmap_write_unlock(mm);
1398  		return 0;
1399  	}
1400  
1401  	return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock);
1402  }
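/*
 * Example of the argument checks above, assuming 4KiB pages and hypothetical
 * values: a request to unmap 0x1800 bytes at 0x10000000 passes the
 * offset_in_page() test (the start is page aligned) and the length is
 * rounded up, so end becomes 0x10002000 and two whole pages are considered.
 * A start of 0x10000200 is rejected with -EINVAL before any VMA is looked
 * up.
 */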
1403  
1404  /*
1405   * We are about to modify one or multiple of a VMA's flags, policy, userfaultfd
1406   * context and anonymous VMA name within the range [start, end).
1407   *
1408   * As a result, we might be able to merge the newly modified VMA range with an
1409   * adjacent VMA with identical properties.
1410   *
1411   * If no merge is possible and the range does not span the entirety of the VMA,
1412   * we then need to split the VMA to accommodate the change.
1413   *
1414   * The function returns either the merged VMA, the original VMA if a split was
1415   * required instead, or an error if the split failed.
1416   */
1417  static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg)
1418  {
1419  	struct vm_area_struct *vma = vmg->vma;
1420  	struct vm_area_struct *merged;
1421  
1422  	/* First, try to merge. */
1423  	merged = vma_merge_existing_range(vmg);
1424  	if (merged)
1425  		return merged;
1426  
1427  	/* Split any preceding portion of the VMA. */
1428  	if (vma->vm_start < vmg->start) {
1429  		int err = split_vma(vmg->vmi, vma, vmg->start, 1);
1430  
1431  		if (err)
1432  			return ERR_PTR(err);
1433  	}
1434  
1435  	/* Split any trailing portion of the VMA. */
1436  	if (vma->vm_end > vmg->end) {
1437  		int err = split_vma(vmg->vmi, vma, vmg->end, 0);
1438  
1439  		if (err)
1440  			return ERR_PTR(err);
1441  	}
1442  
1443  	return vma;
1444  }
1445  
1446  struct vm_area_struct *vma_modify_flags(
1447  	struct vma_iterator *vmi, struct vm_area_struct *prev,
1448  	struct vm_area_struct *vma, unsigned long start, unsigned long end,
1449  	unsigned long new_flags)
1450  {
1451  	VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
1452  
1453  	vmg.flags = new_flags;
1454  
1455  	return vma_modify(&vmg);
1456  }
1457  
1458  struct vm_area_struct
1459  *vma_modify_flags_name(struct vma_iterator *vmi,
1460  		       struct vm_area_struct *prev,
1461  		       struct vm_area_struct *vma,
1462  		       unsigned long start,
1463  		       unsigned long end,
1464  		       unsigned long new_flags,
1465  		       struct anon_vma_name *new_name)
1466  {
1467  	VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
1468  
1469  	vmg.flags = new_flags;
1470  	vmg.anon_name = new_name;
1471  
1472  	return vma_modify(&vmg);
1473  }
1474  
1475  struct vm_area_struct
1476  *vma_modify_policy(struct vma_iterator *vmi,
1477  		   struct vm_area_struct *prev,
1478  		   struct vm_area_struct *vma,
1479  		   unsigned long start, unsigned long end,
1480  		   struct mempolicy *new_pol)
1481  {
1482  	VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
1483  
1484  	vmg.policy = new_pol;
1485  
1486  	return vma_modify(&vmg);
1487  }
1488  
1489  struct vm_area_struct
1490  *vma_modify_flags_uffd(struct vma_iterator *vmi,
1491  		       struct vm_area_struct *prev,
1492  		       struct vm_area_struct *vma,
1493  		       unsigned long start, unsigned long end,
1494  		       unsigned long new_flags,
1495  		       struct vm_userfaultfd_ctx new_ctx)
1496  {
1497  	VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
1498  
1499  	vmg.flags = new_flags;
1500  	vmg.uffd_ctx = new_ctx;
1501  
1502  	return vma_modify(&vmg);
1503  }
1504  
1505  /*
1506   * Expand vma by delta bytes, potentially merging with an immediately adjacent
1507   * VMA with identical properties.
1508   */
1509  struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
1510  					struct vm_area_struct *vma,
1511  					unsigned long delta)
1512  {
1513  	VMG_VMA_STATE(vmg, vmi, vma, vma, vma->vm_end, vma->vm_end + delta);
1514  
1515  	vmg.next = vma_iter_next_rewind(vmi, NULL);
1516  	vmg.vma = NULL; /* We use the VMA to populate VMG fields only. */
1517  
1518  	return vma_merge_new_range(&vmg);
1519  }
1520  
1521  void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb)
1522  {
1523  	vb->count = 0;
1524  }
1525  
1526  static void unlink_file_vma_batch_process(struct unlink_vma_file_batch *vb)
1527  {
1528  	struct address_space *mapping;
1529  	int i;
1530  
1531  	mapping = vb->vmas[0]->vm_file->f_mapping;
1532  	i_mmap_lock_write(mapping);
1533  	for (i = 0; i < vb->count; i++) {
1534  		VM_WARN_ON_ONCE(vb->vmas[i]->vm_file->f_mapping != mapping);
1535  		__remove_shared_vm_struct(vb->vmas[i], mapping);
1536  	}
1537  	i_mmap_unlock_write(mapping);
1538  
1539  	unlink_file_vma_batch_init(vb);
1540  }
1541  
1542  void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb,
1543  			       struct vm_area_struct *vma)
1544  {
1545  	if (vma->vm_file == NULL)
1546  		return;
1547  
1548  	if ((vb->count > 0 && vb->vmas[0]->vm_file != vma->vm_file) ||
1549  	    vb->count == ARRAY_SIZE(vb->vmas))
1550  		unlink_file_vma_batch_process(vb);
1551  
1552  	vb->vmas[vb->count] = vma;
1553  	vb->count++;
1554  }
1555  
1556  void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb)
1557  {
1558  	if (vb->count > 0)
1559  		unlink_file_vma_batch_process(vb);
1560  }
1561  
1562  /*
1563   * Unlink a file-based vm structure from its interval tree, to hide
1564   * vma from rmap and vmtruncate before freeing its page tables.
1565   */
1566  void unlink_file_vma(struct vm_area_struct *vma)
1567  {
1568  	struct file *file = vma->vm_file;
1569  
1570  	if (file) {
1571  		struct address_space *mapping = file->f_mapping;
1572  
1573  		i_mmap_lock_write(mapping);
1574  		__remove_shared_vm_struct(vma, mapping);
1575  		i_mmap_unlock_write(mapping);
1576  	}
1577  }
1578  
1579  void vma_link_file(struct vm_area_struct *vma)
1580  {
1581  	struct file *file = vma->vm_file;
1582  	struct address_space *mapping;
1583  
1584  	if (file) {
1585  		mapping = file->f_mapping;
1586  		i_mmap_lock_write(mapping);
1587  		__vma_link_file(vma, mapping);
1588  		i_mmap_unlock_write(mapping);
1589  	}
1590  }
1591  
1592  int vma_link(struct mm_struct *mm, struct vm_area_struct *vma)
1593  {
1594  	VMA_ITERATOR(vmi, mm, 0);
1595  
1596  	vma_iter_config(&vmi, vma->vm_start, vma->vm_end);
1597  	if (vma_iter_prealloc(&vmi, vma))
1598  		return -ENOMEM;
1599  
1600  	vma_start_write(vma);
1601  	vma_iter_store(&vmi, vma);
1602  	vma_link_file(vma);
1603  	mm->map_count++;
1604  	validate_mm(mm);
1605  	return 0;
1606  }
1607  
1608  /*
1609   * Copy the vma structure to a new location in the same mm,
1610   * prior to moving page table entries, to effect an mremap move.
1611   */
1612  struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
1613  	unsigned long addr, unsigned long len, pgoff_t pgoff,
1614  	bool *need_rmap_locks)
1615  {
1616  	struct vm_area_struct *vma = *vmap;
1617  	unsigned long vma_start = vma->vm_start;
1618  	struct mm_struct *mm = vma->vm_mm;
1619  	struct vm_area_struct *new_vma;
1620  	bool faulted_in_anon_vma = true;
1621  	VMA_ITERATOR(vmi, mm, addr);
1622  	VMG_VMA_STATE(vmg, &vmi, NULL, vma, addr, addr + len);
1623  
1624  	/*
1625  	 * If anonymous vma has not yet been faulted, update new pgoff
1626  	 * to match new location, to increase its chance of merging.
1627  	 */
1628  	if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
1629  		pgoff = addr >> PAGE_SHIFT;
1630  		faulted_in_anon_vma = false;
1631  	}
1632  
1633  	new_vma = find_vma_prev(mm, addr, &vmg.prev);
1634  	if (new_vma && new_vma->vm_start < addr + len)
1635  		return NULL;	/* should never get here */
1636  
1637  	vmg.vma = NULL; /* New VMA range. */
1638  	vmg.pgoff = pgoff;
1639  	vmg.next = vma_iter_next_rewind(&vmi, NULL);
1640  	new_vma = vma_merge_new_range(&vmg);
1641  
1642  	if (new_vma) {
1643  		/*
1644  		 * Source vma may have been merged into new_vma
1645  		 */
1646  		if (unlikely(vma_start >= new_vma->vm_start &&
1647  			     vma_start < new_vma->vm_end)) {
1648  			/*
1649  			 * The only way we can get a vma_merge with
1650  			 * self during an mremap is if the vma hasn't
1651  			 * been faulted in yet and we were allowed to
1652  			 * reset the dst vma->vm_pgoff to the
1653  			 * destination address of the mremap to allow
1654  			 * the merge to happen. mremap must change the
1655  			 * vm_pgoff linearity between src and dst vmas
1656  			 * (in turn preventing a vma_merge) to be
1657  			 * safe. It is only safe to keep the vm_pgoff
1658  			 * linear if there are no pages mapped yet.
1659  			 */
1660  			VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma);
1661  			*vmap = vma = new_vma;
1662  		}
1663  		*need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
1664  	} else {
1665  		new_vma = vm_area_dup(vma);
1666  		if (!new_vma)
1667  			goto out;
1668  		vma_set_range(new_vma, addr, addr + len, pgoff);
1669  		if (vma_dup_policy(vma, new_vma))
1670  			goto out_free_vma;
1671  		if (anon_vma_clone(new_vma, vma))
1672  			goto out_free_mempol;
1673  		if (new_vma->vm_file)
1674  			get_file(new_vma->vm_file);
1675  		if (new_vma->vm_ops && new_vma->vm_ops->open)
1676  			new_vma->vm_ops->open(new_vma);
1677  		if (vma_link(mm, new_vma))
1678  			goto out_vma_link;
1679  		*need_rmap_locks = false;
1680  	}
1681  	return new_vma;
1682  
1683  out_vma_link:
1684  	vma_close(new_vma);
1685  
1686  	if (new_vma->vm_file)
1687  		fput(new_vma->vm_file);
1688  
1689  	unlink_anon_vmas(new_vma);
1690  out_free_mempol:
1691  	mpol_put(vma_policy(new_vma));
1692  out_free_vma:
1693  	vm_area_free(new_vma);
1694  out:
1695  	return NULL;
1696  }
1697  
1698  /*
1699   * Rough compatibility check to quickly see if it's even worth looking
1700   * at sharing an anon_vma.
1701   *
1702   * They need to have the same vm_file, and the flags can only differ
1703   * in things that mprotect may change.
1704   *
1705   * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that
1706   * we can merge the two vma's. For example, we refuse to merge a vma if
1707   * there is a vm_ops->close() function, because that indicates that the
1708   * driver is doing some kind of reference counting. But that doesn't
1709   * really matter for the anon_vma sharing case.
1710   */
1711  static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b)
1712  {
1713  	return a->vm_end == b->vm_start &&
1714  		mpol_equal(vma_policy(a), vma_policy(b)) &&
1715  		a->vm_file == b->vm_file &&
1716  		!((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) &&
1717  		b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
1718  }
1719  
1720  /*
1721   * Do some basic sanity checking to see if we can re-use the anon_vma
1722   * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be
1723   * the same as 'old', the other will be the new one that is trying
1724   * to share the anon_vma.
1725   *
1726   * NOTE! This runs with mmap_lock held for reading, so it is possible that
1727   * the anon_vma of 'old' is concurrently in the process of being set up
1728   * by another page fault trying to merge _that_. But that's ok: if it
1729   * is being set up, that automatically means that it will be a singleton
1730   * acceptable for merging, so we can do all of this optimistically. But
1731   * we do that READ_ONCE() to make sure that we never re-load the pointer.
1732   *
1733   * IOW: that the "list_is_singular()" test on the anon_vma_chain only
1734   * matters for the 'stable anon_vma' case (ie the thing we want to avoid
1735   * is to return an anon_vma that is "complex" due to having gone through
1736   * a fork).
1737   *
1738   * We also make sure that the two vma's are compatible (adjacent,
1739   * and with the same memory policies). That's all stable, even with just
1740   * a read lock on the mmap_lock.
1741   */
1742  static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old,
1743  					  struct vm_area_struct *a,
1744  					  struct vm_area_struct *b)
1745  {
1746  	if (anon_vma_compatible(a, b)) {
1747  		struct anon_vma *anon_vma = READ_ONCE(old->anon_vma);
1748  
1749  		if (anon_vma && list_is_singular(&old->anon_vma_chain))
1750  			return anon_vma;
1751  	}
1752  	return NULL;
1753  }
1754  
1755  /*
1756   * find_mergeable_anon_vma is used by anon_vma_prepare, to check
1757   * neighbouring vmas for a suitable anon_vma, before it goes off
1758   * to allocate a new anon_vma.  It checks because a repetitive
1759   * sequence of mprotects and faults may otherwise lead to distinct
1760   * anon_vmas being allocated, preventing vma merge in subsequent
1761   * mprotect.
1762   */
1763  struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
1764  {
1765  	struct anon_vma *anon_vma = NULL;
1766  	struct vm_area_struct *prev, *next;
1767  	VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_end);
1768  
1769  	/* Try next first. */
1770  	next = vma_iter_load(&vmi);
1771  	if (next) {
1772  		anon_vma = reusable_anon_vma(next, vma, next);
1773  		if (anon_vma)
1774  			return anon_vma;
1775  	}
1776  
1777  	prev = vma_prev(&vmi);
1778  	VM_BUG_ON_VMA(prev != vma, vma);
1779  	prev = vma_prev(&vmi);
1780  	/* Now try prev. */
1781  	if (prev)
1782  		anon_vma = reusable_anon_vma(prev, prev, vma);
1783  
1784  	/*
1785  	 * We might reach here with anon_vma == NULL if we can't find
1786  	 * any reusable anon_vma.
1787  	 * There's no absolute need to look only at touching neighbours:
1788  	 * we could search further afield for "compatible" anon_vmas.
1789  	 * But it would probably just be a waste of time searching,
1790  	 * or lead to too many vmas hanging off the same anon_vma.
1791  	 * We're trying to allow mprotect remerging later on,
1792  	 * not trying to minimize memory used for anon_vmas.
1793  	 */
1794  	return anon_vma;
1795  }
1796  
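/* Does the backing vm_ops want to be notified when a page is first written to? */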
1797  static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops)
1798  {
1799  	return vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite);
1800  }
1801  
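/* Is the VMA both shared and writable? */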
1802  static bool vma_is_shared_writable(struct vm_area_struct *vma)
1803  {
1804  	return (vma->vm_flags & (VM_WRITE | VM_SHARED)) ==
1805  		(VM_WRITE | VM_SHARED);
1806  }
1807  
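/* Can the filesystem backing this VMA write back dirty pages? */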
1808  static bool vma_fs_can_writeback(struct vm_area_struct *vma)
1809  {
1810  	/* No managed pages to writeback. */
1811  	if (vma->vm_flags & VM_PFNMAP)
1812  		return false;
1813  
1814  	return vma->vm_file && vma->vm_file->f_mapping &&
1815  		mapping_can_writeback(vma->vm_file->f_mapping);
1816  }
1817  
1818  /*
1819   * Does this VMA require the underlying folios to have their dirty state
1820   * tracked?
1821   */
1822  bool vma_needs_dirty_tracking(struct vm_area_struct *vma)
1823  {
1824  	/* Only shared, writable VMAs require dirty tracking. */
1825  	if (!vma_is_shared_writable(vma))
1826  		return false;
1827  
1828  	/* Does the filesystem need to be notified? */
1829  	if (vm_ops_needs_writenotify(vma->vm_ops))
1830  		return true;
1831  
1832  	/*
1833  	 * Even if the filesystem doesn't indicate a need for writenotify, if it
1834  	 * can writeback, dirty tracking is still required.
1835  	 */
1836  	return vma_fs_can_writeback(vma);
1837  }
1838  
1839  /*
1840   * Some shared mappings will want the pages marked read-only
1841   * to track write events. If so, we'll downgrade vm_page_prot
1842   * to the private version (using protection_map[] without the
1843   * VM_SHARED bit).
1844   */
1845  bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
1846  {
1847  	/* If it was private or non-writable, the write bit is already clear */
1848  	if (!vma_is_shared_writable(vma))
1849  		return false;
1850  
1851  	/* The backer wishes to know when pages are first written to? */
1852  	if (vm_ops_needs_writenotify(vma->vm_ops))
1853  		return true;
1854  
1855  	/* The open routine did something to the protections that pgprot_modify
1856  	 * won't preserve? */
1857  	if (pgprot_val(vm_page_prot) !=
1858  	    pgprot_val(vm_pgprot_modify(vm_page_prot, vma->vm_flags)))
1859  		return false;
1860  
1861  	/*
1862  	 * Do we need to track softdirty? hugetlb does not support softdirty
1863  	 * tracking yet.
1864  	 */
1865  	if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma))
1866  		return true;
1867  
1868  	/* Do we need write faults for uffd-wp tracking? */
1869  	if (userfaultfd_wp(vma))
1870  		return true;
1871  
1872  	/* Can the mapping track the dirty pages? */
1873  	return vma_fs_can_writeback(vma);
1874  }
1875  
1876  static DEFINE_MUTEX(mm_all_locks_mutex);
1877  
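/*
 * Lock an anon_vma for mm_take_all_locks(), using the LSB of its root
 * rb_root.rb_node pointer to record that the lock is already held.
 */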
1878  static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
1879  {
1880  	if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
1881  		/*
1882  		 * The LSB of the rb_root pointer can't change from under us
1883  		 * because we hold the mm_all_locks_mutex.
1884  		 */
1885  		down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock);
1886  		/*
1887  		 * We can safely modify the rb_root pointer after taking the
1888  		 * anon_vma->root->rwsem. If some other vma in this mm shares
1889  		 * the same anon_vma we won't take it again.
1890  		 *
1891  		 * No need for atomic instructions here; the rb_root pointer
1892  		 * can't change from under us thanks to the
1893  		 * anon_vma->root->rwsem.
1894  		 */
1895  		if (__test_and_set_bit(0, (unsigned long *)
1896  				       &anon_vma->root->rb_root.rb_root.rb_node))
1897  			BUG();
1898  	}
1899  }
1900  
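/*
 * Lock an address_space's i_mmap_rwsem for mm_take_all_locks(), using the
 * AS_MM_ALL_LOCKS flag to record that the lock is already held.
 */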
1901  static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
1902  {
1903  	if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
1904  		/*
1905  		 * AS_MM_ALL_LOCKS can't change from under us because
1906  		 * we hold the mm_all_locks_mutex.
1907  		 *
1908  		 * Operations on ->flags have to be atomic because
1909  		 * even if AS_MM_ALL_LOCKS is stable thanks to the
1910  		 * mm_all_locks_mutex, there may be other cpus
1911  		 * changing other bitflags in parallel to us.
1912  		 */
1913  		if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
1914  			BUG();
1915  		down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock);
1916  	}
1917  }
1918  
1919  /*
1920   * This operation locks against the VM for all pte/vma/mm related
1921   * operations that could ever happen on a certain mm. This includes
1922   * vmtruncate, try_to_unmap, and all page faults.
1923   *
1924   * The caller must take the mmap_lock in write mode before calling
1925   * mm_take_all_locks(). The caller isn't allowed to release the
1926   * mmap_lock until mm_drop_all_locks() returns.
1927   *
1928   * mmap_lock in write mode is required in order to block all operations
1929   * that could modify pagetables and free pages without needing to alter
1930   * the vma layout. It's also needed in write mode to prevent new
1931   * anon_vmas from being associated with existing vmas.
1932   *
1933   * A single task can't take more than one mm_take_all_locks() in a row
1934   * or it would deadlock.
1935   *
1936   * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in
1937   * mapping->flags avoid taking the same lock twice, if more than one
1938   * vma in this mm is backed by the same anon_vma or address_space.
1939   *
1940   * We take locks in the following order, according to the comment at the
1941   * beginning of mm/rmap.c:
1942   *   - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for
1943   *     hugetlb mapping);
1944   *   - all vmas marked locked;
1945   *   - all i_mmap_rwsem locks;
1946   *   - all anon_vma->rwsems.
1947   *
1948   * We can take all locks within these types in any order because the VM code
1949   * doesn't nest them and we are protected from parallel mm_take_all_locks() by
1950   * the mm_all_locks_mutex.
1951   *
1952   * mm_take_all_locks() and mm_drop_all_locks() are expensive operations
1953   * that may have to take thousands of locks.
1954   *
1955   * mm_take_all_locks() can fail if it's interrupted by signals.
1956   */
1957  int mm_take_all_locks(struct mm_struct *mm)
1958  {
1959  	struct vm_area_struct *vma;
1960  	struct anon_vma_chain *avc;
1961  	VMA_ITERATOR(vmi, mm, 0);
1962  
1963  	mmap_assert_write_locked(mm);
1964  
1965  	mutex_lock(&mm_all_locks_mutex);
1966  
1967  	/*
1968  	 * vma_start_write() does not have a complement in mm_drop_all_locks()
1969  	 * because vma_start_write() is always asymmetrical; it marks a VMA as
1970  	 * being written to until mmap_write_unlock() or mmap_write_downgrade()
1971  	 * is reached.
1972  	 */
1973  	for_each_vma(vmi, vma) {
1974  		if (signal_pending(current))
1975  			goto out_unlock;
1976  		vma_start_write(vma);
1977  	}
1978  
1979  	vma_iter_init(&vmi, mm, 0);
1980  	for_each_vma(vmi, vma) {
1981  		if (signal_pending(current))
1982  			goto out_unlock;
1983  		if (vma->vm_file && vma->vm_file->f_mapping &&
1984  				is_vm_hugetlb_page(vma))
1985  			vm_lock_mapping(mm, vma->vm_file->f_mapping);
1986  	}
1987  
1988  	vma_iter_init(&vmi, mm, 0);
1989  	for_each_vma(vmi, vma) {
1990  		if (signal_pending(current))
1991  			goto out_unlock;
1992  		if (vma->vm_file && vma->vm_file->f_mapping &&
1993  				!is_vm_hugetlb_page(vma))
1994  			vm_lock_mapping(mm, vma->vm_file->f_mapping);
1995  	}
1996  
1997  	vma_iter_init(&vmi, mm, 0);
1998  	for_each_vma(vmi, vma) {
1999  		if (signal_pending(current))
2000  			goto out_unlock;
2001  		if (vma->anon_vma)
2002  			list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
2003  				vm_lock_anon_vma(mm, avc->anon_vma);
2004  	}
2005  
2006  	return 0;
2007  
2008  out_unlock:
2009  	mm_drop_all_locks(mm);
2010  	return -EINTR;
2011  }
2012  
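/* Clear the LSB marker and drop an anon_vma lock taken by mm_take_all_locks(). */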
2013  static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
2014  {
2015  	if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
2016  		/*
2017  		 * The LSB of the rb_root pointer can't change to 0 from under
2018  		 * us because we hold the mm_all_locks_mutex.
2019  		 *
2020  		 * We must however clear the bitflag before unlocking
2021  		 * the vma so that users of the anon_vma->rb_root will
2022  		 * never see our bitflag.
2023  		 *
2024  		 * No need for atomic instructions here; the rb_root pointer
2025  		 * can't change from under us until we release the
2026  		 * anon_vma->root->rwsem.
2027  		 */
2028  		if (!__test_and_clear_bit(0, (unsigned long *)
2029  					  &anon_vma->root->rb_root.rb_root.rb_node))
2030  			BUG();
2031  		anon_vma_unlock_write(anon_vma);
2032  	}
2033  }
2034  
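/* Drop an i_mmap lock taken by mm_take_all_locks() and clear AS_MM_ALL_LOCKS. */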
2035  static void vm_unlock_mapping(struct address_space *mapping)
2036  {
2037  	if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
2038  		/*
2039  		 * AS_MM_ALL_LOCKS can't change to 0 from under us
2040  		 * because we hold the mm_all_locks_mutex.
2041  		 */
2042  		i_mmap_unlock_write(mapping);
2043  		if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
2044  					&mapping->flags))
2045  			BUG();
2046  	}
2047  }
2048  
2049  /*
2050   * The mmap_lock cannot be released by the caller until
2051   * mm_drop_all_locks() returns.
2052   */
2053  void mm_drop_all_locks(struct mm_struct *mm)
2054  {
2055  	struct vm_area_struct *vma;
2056  	struct anon_vma_chain *avc;
2057  	VMA_ITERATOR(vmi, mm, 0);
2058  
2059  	mmap_assert_write_locked(mm);
2060  	BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
2061  
2062  	for_each_vma(vmi, vma) {
2063  		if (vma->anon_vma)
2064  			list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
2065  				vm_unlock_anon_vma(avc->anon_vma);
2066  		if (vma->vm_file && vma->vm_file->f_mapping)
2067  			vm_unlock_mapping(vma->vm_file->f_mapping);
2068  	}
2069  
2070  	mutex_unlock(&mm_all_locks_mutex);
2071  }
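/*
 * Illustrative call pattern only (not code from this file): the caller must
 * hold mmap_lock for writing across the whole critical section and must pair
 * mm_take_all_locks() with mm_drop_all_locks() before unlocking, e.g.:
 *
 *	int ret;
 *
 *	mmap_write_lock(mm);
 *	ret = mm_take_all_locks(mm);
 *	if (ret) {
 *		mmap_write_unlock(mm);
 *		return ret;	(-EINTR if a signal was pending)
 *	}
 *	... operate on all VMAs / page tables of mm ...
 *	mm_drop_all_locks(mm);
 *	mmap_write_unlock(mm);
 */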
2072