/* SPDX-License-Identifier: GPL-2.0 */
/*
 *  include/linux/userfaultfd_k.h
 *
 *  Copyright (C) 2015  Red Hat, Inc.
 *
 */

#ifndef _LINUX_USERFAULTFD_K_H
#define _LINUX_USERFAULTFD_K_H

#ifdef CONFIG_USERFAULTFD

#include <linux/userfaultfd.h> /* linux/include/uapi/linux/userfaultfd.h */

#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <asm-generic/pgtable_uffd.h>
#include <linux/hugetlb_inline.h>

/* The set of all possible UFFD-related VM flags. */
#define __VM_UFFD_FLAGS (VM_UFFD_MISSING | VM_UFFD_WP | VM_UFFD_MINOR)

/*
 * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining
 * new flags, since they might collide with O_* ones. We want
 * to re-use O_* flags that couldn't possibly have a meaning
 * from userfaultfd, in order to leave a free define-space for
 * shared O_* flags.
 */
#define UFFD_CLOEXEC O_CLOEXEC
#define UFFD_NONBLOCK O_NONBLOCK

#define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
#define UFFD_FLAGS_SET (UFFD_SHARED_FCNTL_FLAGS)

/*
 * Start with fault_pending_wqh and fault_wqh so they're more likely
 * to be in the same cacheline.
 *
 * Locking order:
 *	fd_wqh.lock
 *		fault_pending_wqh.lock
 *			fault_wqh.lock
 *		event_wqh.lock
 *
 * To avoid deadlocks, IRQs must be disabled when taking any of the above locks,
 * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's
 * also taken in IRQ context.
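 *
 * Illustrative sketch only (hypothetical caller, not part of this header):
 * under the rule above, queueing a pending fault looks roughly like
 *
 *	spin_lock_irq(&ctx->fault_pending_wqh.lock);
 *	__add_wait_queue(&ctx->fault_pending_wqh, &uwq->wq);
 *	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
 *
 * where uwq would point to the caller's wait queue entry.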
 */
struct userfaultfd_ctx {
	/* waitqueue head for the pending (i.e. not read) userfaults */
	wait_queue_head_t fault_pending_wqh;
	/* waitqueue head for the userfaults */
	wait_queue_head_t fault_wqh;
	/* waitqueue head for the pseudo fd to wakeup poll/read */
	wait_queue_head_t fd_wqh;
	/* waitqueue head for events */
	wait_queue_head_t event_wqh;
	/* a refile sequence protected by fault_pending_wqh lock */
	seqcount_spinlock_t refile_seq;
	/* pseudo fd refcounting */
	refcount_t refcount;
	/* userfaultfd syscall flags */
	unsigned int flags;
	/* features requested from the userspace */
	unsigned int features;
	/* released */
	bool released;
	/*
	 * Prevents userfaultfd operations (fill/move/wp) from happening while
	 * one or more non-cooperative events are taking place. Increments are
	 * done in write mode, whereas userfaultfd operations, which include
	 * reading mmap_changing, are done in read mode.
	 */
	struct rw_semaphore map_changing_lock;
	/* memory mappings are changing because of a non-cooperative event */
	atomic_t mmap_changing;
	/* mm with one or more vmas attached to this userfaultfd_ctx */
	struct mm_struct *mm;
};

extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);

/* A combined operation mode + behavior flags. */
typedef unsigned int __bitwise uffd_flags_t;

/* Mutually exclusive modes of operation. */
enum mfill_atomic_mode {
	MFILL_ATOMIC_COPY,
	MFILL_ATOMIC_ZEROPAGE,
	MFILL_ATOMIC_CONTINUE,
	MFILL_ATOMIC_POISON,
	NR_MFILL_ATOMIC_MODES,
};

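/*
 * Layout of a uffd_flags_t: the low MFILL_ATOMIC_MODE_BITS bits encode one
 * mfill_atomic_mode; behavior flags such as MFILL_ATOMIC_WP live in the bits
 * above that.
 */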
#define MFILL_ATOMIC_MODE_BITS (const_ilog2(NR_MFILL_ATOMIC_MODES - 1) + 1)
#define MFILL_ATOMIC_BIT(nr) BIT(MFILL_ATOMIC_MODE_BITS + (nr))
#define MFILL_ATOMIC_FLAG(nr) ((__force uffd_flags_t) MFILL_ATOMIC_BIT(nr))
#define MFILL_ATOMIC_MODE_MASK ((__force uffd_flags_t) (MFILL_ATOMIC_BIT(0) - 1))

static inline bool uffd_flags_mode_is(uffd_flags_t flags, enum mfill_atomic_mode expected)
{
	return (flags & MFILL_ATOMIC_MODE_MASK) == ((__force uffd_flags_t) expected);
}

static inline uffd_flags_t uffd_flags_set_mode(uffd_flags_t flags, enum mfill_atomic_mode mode)
{
	flags &= ~MFILL_ATOMIC_MODE_MASK;
	return flags | ((__force uffd_flags_t) mode);
}

/* Flags controlling behavior. These behavior changes are mode-independent. */
#define MFILL_ATOMIC_WP MFILL_ATOMIC_FLAG(0)
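
/*
 * Usage sketch (illustrative only): a caller picks exactly one mode and may
 * OR in behavior flags on top, e.g.
 *
 *	uffd_flags_t flags = uffd_flags_set_mode(0, MFILL_ATOMIC_COPY);
 *	flags |= MFILL_ATOMIC_WP;
 *
 * Consumers then test the mode with uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY).
 */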

extern int mfill_atomic_install_pte(pmd_t *dst_pmd,
				    struct vm_area_struct *dst_vma,
				    unsigned long dst_addr, struct page *page,
				    bool newly_allocated, uffd_flags_t flags);

extern ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start,
				 unsigned long src_start, unsigned long len,
				 uffd_flags_t flags);
extern ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx,
				     unsigned long dst_start,
				     unsigned long len);
extern ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long dst_start,
				     unsigned long len, uffd_flags_t flags);
extern ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start,
				   unsigned long len, uffd_flags_t flags);
extern int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
			       unsigned long len, bool enable_wp);
extern long uffd_wp_range(struct vm_area_struct *vma,
			  unsigned long start, unsigned long len, bool enable_wp);
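
/*
 * Minimal caller sketch (illustrative; loosely modelled on the UFFDIO_COPY
 * ioctl path, with validation, waking and error handling elided):
 *
 *	uffd_flags_t flags = uffd_flags_set_mode(0, MFILL_ATOMIC_COPY);
 *
 *	if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP)
 *		flags |= MFILL_ATOMIC_WP;
 *
 *	ret = mfill_atomic_copy(ctx, uffdio_copy.dst, uffdio_copy.src,
 *				uffdio_copy.len, flags);
 */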

/* move_pages */
void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2);
void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2);
ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
		   unsigned long src_start, unsigned long len, __u64 flags);
int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
			struct vm_area_struct *dst_vma,
			struct vm_area_struct *src_vma,
			unsigned long dst_addr, unsigned long src_addr);
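
/*
 * Usage sketch (illustrative only; roughly what the UFFDIO_MOVE ioctl path
 * would pass down, with validation and waking elided):
 *
 *	ret = move_pages(ctx, uffdio_move.dst, uffdio_move.src,
 *			 uffdio_move.len, uffdio_move.mode);
 */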

/* mm helpers */
static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
					struct vm_userfaultfd_ctx vm_ctx)
{
	return vma->vm_userfaultfd_ctx.ctx == vm_ctx.ctx;
}

/*
 * Never enable huge pmd sharing on certain uffd-registered VMAs:
 *
 * - VM_UFFD_WP VMAs, because write protect information is per pgtable entry.
 *
 * - VM_UFFD_MINOR VMAs, because otherwise we would never get minor faults for
 *   VMAs which share huge pmds. (If you have two mappings to the same
 *   underlying pages, and fault in the non-UFFD-registered one with a write,
 *   with huge pmd sharing this would *also* setup the second UFFD-registered
 *   mapping, and we'd not get minor faults.)
 */
static inline bool uffd_disable_huge_pmd_share(struct vm_area_struct *vma)
{
	return vma->vm_flags & (VM_UFFD_WP | VM_UFFD_MINOR);
}

/*
 * Don't do fault around for either WP or MINOR registered uffd ranges.  For
 * a MINOR registered range, fault around would be a total disaster: ptes can
 * be installed without notifications.  For WP it should mostly be fine as
 * long as the fault around code checks for pte_none() before the
 * installation; however, to be super safe we just forbid it.
 */
static inline bool uffd_disable_fault_around(struct vm_area_struct *vma)
{
	return vma->vm_flags & (VM_UFFD_WP | VM_UFFD_MINOR);
}

static inline bool userfaultfd_missing(struct vm_area_struct *vma)
{
	return vma->vm_flags & VM_UFFD_MISSING;
}

static inline bool userfaultfd_wp(struct vm_area_struct *vma)
{
	return vma->vm_flags & VM_UFFD_WP;
}

static inline bool userfaultfd_minor(struct vm_area_struct *vma)
{
	return vma->vm_flags & VM_UFFD_MINOR;
}

static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma,
				      pte_t pte)
{
	return userfaultfd_wp(vma) && pte_uffd_wp(pte);
}

static inline bool userfaultfd_huge_pmd_wp(struct vm_area_struct *vma,
					   pmd_t pmd)
{
	return userfaultfd_wp(vma) && pmd_uffd_wp(pmd);
}

static inline bool userfaultfd_armed(struct vm_area_struct *vma)
{
	return vma->vm_flags & __VM_UFFD_FLAGS;
}

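/*
 * Check whether the uffd modes in @vm_flags can be registered on @vma.
 * @wp_async relaxes the memory type restriction when VM_UFFD_WP is the
 * only requested mode.
 */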
static inline bool vma_can_userfault(struct vm_area_struct *vma,
				     unsigned long vm_flags,
				     bool wp_async)
{
	vm_flags &= __VM_UFFD_FLAGS;

	if (vma->vm_flags & VM_DROPPABLE)
		return false;

	if ((vm_flags & VM_UFFD_MINOR) &&
	    (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma)))
		return false;

	/*
	 * If wp async enabled, and WP is the only mode enabled, allow any
	 * memory type.
	 */
	if (wp_async && (vm_flags == VM_UFFD_WP))
		return true;

#ifndef CONFIG_PTE_MARKER_UFFD_WP
	/*
	 * If the user requested uffd-wp but pte markers are not enabled for
	 * uffd-wp, then only anonymous memory is supported; shmem and
	 * hugetlbfs are not.
	 */
	if ((vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma))
		return false;
#endif

	/* By default, allow any of anon|shmem|hugetlb */
	return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
	    vma_is_shmem(vma);
}

extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *);
extern void dup_userfaultfd_complete(struct list_head *);
void dup_userfaultfd_fail(struct list_head *);

extern void mremap_userfaultfd_prep(struct vm_area_struct *,
				    struct vm_userfaultfd_ctx *);
extern void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *,
					unsigned long from, unsigned long to,
					unsigned long len);

extern bool userfaultfd_remove(struct vm_area_struct *vma,
			       unsigned long start,
			       unsigned long end);

extern int userfaultfd_unmap_prep(struct vm_area_struct *vma,
		unsigned long start, unsigned long end, struct list_head *uf);
extern void userfaultfd_unmap_complete(struct mm_struct *mm,
				       struct list_head *uf);
extern bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma);
extern bool userfaultfd_wp_async(struct vm_area_struct *vma);

void userfaultfd_reset_ctx(struct vm_area_struct *vma);

struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
					     struct vm_area_struct *prev,
					     struct vm_area_struct *vma,
					     unsigned long start,
					     unsigned long end);

int userfaultfd_register_range(struct userfaultfd_ctx *ctx,
			       struct vm_area_struct *vma,
			       unsigned long vm_flags,
			       unsigned long start, unsigned long end,
			       bool wp_async);

void userfaultfd_release_new(struct userfaultfd_ctx *ctx);

void userfaultfd_release_all(struct mm_struct *mm,
			     struct userfaultfd_ctx *ctx);

#else /* CONFIG_USERFAULTFD */

/* mm helpers */
static inline vm_fault_t handle_userfault(struct vm_fault *vmf,
				unsigned long reason)
{
	return VM_FAULT_SIGBUS;
}

static inline long uffd_wp_range(struct vm_area_struct *vma,
				 unsigned long start, unsigned long len,
				 bool enable_wp)
{
	return 0;
}

static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
					struct vm_userfaultfd_ctx vm_ctx)
{
	return true;
}

static inline bool userfaultfd_missing(struct vm_area_struct *vma)
{
	return false;
}

static inline bool userfaultfd_wp(struct vm_area_struct *vma)
{
	return false;
}

static inline bool userfaultfd_minor(struct vm_area_struct *vma)
{
	return false;
}

static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma,
				      pte_t pte)
{
	return false;
}

static inline bool userfaultfd_huge_pmd_wp(struct vm_area_struct *vma,
					   pmd_t pmd)
{
	return false;
}

static inline bool userfaultfd_armed(struct vm_area_struct *vma)
{
	return false;
}

static inline int dup_userfaultfd(struct vm_area_struct *vma,
				  struct list_head *l)
{
	return 0;
}

static inline void dup_userfaultfd_complete(struct list_head *l)
{
}

static inline void dup_userfaultfd_fail(struct list_head *l)
{
}

static inline void mremap_userfaultfd_prep(struct vm_area_struct *vma,
					   struct vm_userfaultfd_ctx *ctx)
{
}

static inline void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *ctx,
					       unsigned long from,
					       unsigned long to,
					       unsigned long len)
{
}

static inline bool userfaultfd_remove(struct vm_area_struct *vma,
				      unsigned long start,
				      unsigned long end)
{
	return true;
}

static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma,
					 unsigned long start, unsigned long end,
					 struct list_head *uf)
{
	return 0;
}

static inline void userfaultfd_unmap_complete(struct mm_struct *mm,
					      struct list_head *uf)
{
}

static inline bool uffd_disable_fault_around(struct vm_area_struct *vma)
{
	return false;
}

static inline bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma)
{
	return false;
}

static inline bool userfaultfd_wp_async(struct vm_area_struct *vma)
{
	return false;
}

#endif /* CONFIG_USERFAULTFD */

static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma)
{
	/* Only wr-protect mode uses pte markers */
	if (!userfaultfd_wp(vma))
		return false;

	/* File-based uffd-wp always needs markers */
	if (!vma_is_anonymous(vma))
		return true;

	/*
	 * Anonymous uffd-wp only needs the markers if WP_UNPOPULATED is
	 * enabled (to apply markers on zero pages).
	 */
	return userfaultfd_wp_unpopulated(vma);
}

static inline bool pte_marker_entry_uffd_wp(swp_entry_t entry)
{
#ifdef CONFIG_PTE_MARKER_UFFD_WP
	return is_pte_marker_entry(entry) &&
	    (pte_marker_get(entry) & PTE_MARKER_UFFD_WP);
#else
	return false;
#endif
}

static inline bool pte_marker_uffd_wp(pte_t pte)
{
#ifdef CONFIG_PTE_MARKER_UFFD_WP
	swp_entry_t entry;

	if (!is_swap_pte(pte))
		return false;

	entry = pte_to_swp_entry(pte);

	return pte_marker_entry_uffd_wp(entry);
#else
	return false;
#endif
}

/*
 * Returns true if this is a swap pte and was uffd-wp wr-protected in either
 * form (a pte marker or a normal swap pte), false otherwise.
 */
static inline bool pte_swp_uffd_wp_any(pte_t pte)
{
#ifdef CONFIG_PTE_MARKER_UFFD_WP
	if (!is_swap_pte(pte))
		return false;

	if (pte_swp_uffd_wp(pte))
		return true;

	if (pte_marker_uffd_wp(pte))
		return true;
#endif
	return false;
}

#endif /* _LINUX_USERFAULTFD_K_H */