1  // SPDX-License-Identifier: GPL-2.0-only
2  /*
3   * fs/dax.c - Direct Access filesystem code
4   * Copyright (c) 2013-2014 Intel Corporation
5   * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
6   * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
7   */
8  
9  #include <linux/atomic.h>
10  #include <linux/blkdev.h>
11  #include <linux/buffer_head.h>
12  #include <linux/dax.h>
13  #include <linux/fs.h>
14  #include <linux/highmem.h>
15  #include <linux/memcontrol.h>
16  #include <linux/mm.h>
17  #include <linux/mutex.h>
18  #include <linux/pagevec.h>
19  #include <linux/sched.h>
20  #include <linux/sched/signal.h>
21  #include <linux/uio.h>
22  #include <linux/vmstat.h>
23  #include <linux/pfn_t.h>
24  #include <linux/sizes.h>
25  #include <linux/mmu_notifier.h>
26  #include <linux/iomap.h>
27  #include <linux/rmap.h>
28  #include <asm/pgalloc.h>
29  
30  #define CREATE_TRACE_POINTS
31  #include <trace/events/fs_dax.h>
32  
33  /* We choose 4096 entries - same as per-zone page wait tables */
34  #define DAX_WAIT_TABLE_BITS 12
35  #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
36  
37  /* The 'colour' (ie low bits) within a PMD of a page offset.  */
38  #define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
39  #define PG_PMD_NR	(PMD_SIZE >> PAGE_SHIFT)
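/*
 * With 4 KiB pages and 2 MiB PMDs (e.g. x86-64), PG_PMD_NR is 512 and
 * PG_PMD_COLOUR is 0x1ff.
 */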
40  
41  static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
42  
43  static int __init init_dax_wait_table(void)
44  {
45  	int i;
46  
47  	for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
48  		init_waitqueue_head(wait_table + i);
49  	return 0;
50  }
51  fs_initcall(init_dax_wait_table);
52  
53  /*
54   * DAX pagecache entries use XArray value entries so they can't be mistaken
55   * for pages.  We use one bit for locking, one bit for the entry size (PMD)
56   * and two more to tell us if the entry is a zero page or an empty entry that
57   * is just used for locking.  In total four special bits.
58   *
59   * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
60   * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
61   * block allocation.
62   */
63  #define DAX_SHIFT	(4)
64  #define DAX_LOCKED	(1UL << 0)
65  #define DAX_PMD		(1UL << 1)
66  #define DAX_ZERO_PAGE	(1UL << 2)
67  #define DAX_EMPTY	(1UL << 3)
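/*
 * Worked example (illustrative): a locked PMD-sized entry for pfn 0x1234 is
 * encoded as xa_mk_value((0x1234 << DAX_SHIFT) | DAX_PMD | DAX_LOCKED), i.e.
 * the value 0x12343; dax_to_pfn() shifts the flag bits back out to recover
 * the pfn.
 */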
68  
69  static unsigned long dax_to_pfn(void *entry)
70  {
71  	return xa_to_value(entry) >> DAX_SHIFT;
72  }
73  
74  static void *dax_make_entry(pfn_t pfn, unsigned long flags)
75  {
76  	return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT));
77  }
78  
79  static bool dax_is_locked(void *entry)
80  {
81  	return xa_to_value(entry) & DAX_LOCKED;
82  }
83  
84  static unsigned int dax_entry_order(void *entry)
85  {
86  	if (xa_to_value(entry) & DAX_PMD)
87  		return PMD_ORDER;
88  	return 0;
89  }
90  
91  static unsigned long dax_is_pmd_entry(void *entry)
92  {
93  	return xa_to_value(entry) & DAX_PMD;
94  }
95  
96  static bool dax_is_pte_entry(void *entry)
97  {
98  	return !(xa_to_value(entry) & DAX_PMD);
99  }
100  
101  static int dax_is_zero_entry(void *entry)
102  {
103  	return xa_to_value(entry) & DAX_ZERO_PAGE;
104  }
105  
106  static int dax_is_empty_entry(void *entry)
107  {
108  	return xa_to_value(entry) & DAX_EMPTY;
109  }
110  
111  /*
112   * true if the entry that was found is of a smaller order than the entry
113   * we were looking for
114   */
115  static bool dax_is_conflict(void *entry)
116  {
117  	return entry == XA_RETRY_ENTRY;
118  }
119  
120  /*
121   * DAX page cache entry locking
122   */
123  struct exceptional_entry_key {
124  	struct xarray *xa;
125  	pgoff_t entry_start;
126  };
127  
128  struct wait_exceptional_entry_queue {
129  	wait_queue_entry_t wait;
130  	struct exceptional_entry_key key;
131  };
132  
133  /**
134   * enum dax_wake_mode: waitqueue wakeup behaviour
135   * @WAKE_ALL: wake all waiters in the waitqueue
136   * @WAKE_NEXT: wake only the first waiter in the waitqueue
137   */
138  enum dax_wake_mode {
139  	WAKE_ALL,
140  	WAKE_NEXT,
141  };
142  
143  static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas,
144  		void *entry, struct exceptional_entry_key *key)
145  {
146  	unsigned long hash;
147  	unsigned long index = xas->xa_index;
148  
149  	/*
150  	 * If 'entry' is a PMD, align the 'index' that we use for the wait
151  	 * queue to the start of that PMD.  This ensures that all offsets in
152  	 * the range covered by the PMD map to the same bit lock.
153  	 */
154  	if (dax_is_pmd_entry(entry))
155  		index &= ~PG_PMD_COLOUR;
156  	key->xa = xas->xa;
157  	key->entry_start = index;
158  
159  	hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS);
160  	return wait_table + hash;
161  }
162  
163  static int wake_exceptional_entry_func(wait_queue_entry_t *wait,
164  		unsigned int mode, int sync, void *keyp)
165  {
166  	struct exceptional_entry_key *key = keyp;
167  	struct wait_exceptional_entry_queue *ewait =
168  		container_of(wait, struct wait_exceptional_entry_queue, wait);
169  
170  	if (key->xa != ewait->key.xa ||
171  	    key->entry_start != ewait->key.entry_start)
172  		return 0;
173  	return autoremove_wake_function(wait, mode, sync, NULL);
174  }
175  
176  /*
177   * @entry may no longer be the entry at the index in the mapping.
178   * The important information it's conveying is whether the entry at
179   * this index used to be a PMD entry.
180   */
181  static void dax_wake_entry(struct xa_state *xas, void *entry,
182  			   enum dax_wake_mode mode)
183  {
184  	struct exceptional_entry_key key;
185  	wait_queue_head_t *wq;
186  
187  	wq = dax_entry_waitqueue(xas, entry, &key);
188  
189  	/*
190  	 * Checking for locked entry and prepare_to_wait_exclusive() happens
191  	 * under the i_pages lock, ditto for entry handling in our callers.
192  	 * So at this point all tasks that could have seen our entry locked
193  	 * must be in the waitqueue and the following check will see them.
194  	 */
195  	if (waitqueue_active(wq))
196  		__wake_up(wq, TASK_NORMAL, mode == WAKE_ALL ? 0 : 1, &key);
197  }
198  
199  /*
200   * Look up entry in page cache, wait for it to become unlocked if it
201   * is a DAX entry and return it.  The caller must subsequently call
202   * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry()
203   * if it did.  The entry returned may have a larger order than @order.
204   * If @order is larger than the order of the entry found in i_pages, this
205   * function returns a dax_is_conflict entry.
206   *
207   * Must be called with the i_pages lock held.
208   */
209  static void *get_unlocked_entry(struct xa_state *xas, unsigned int order)
210  {
211  	void *entry;
212  	struct wait_exceptional_entry_queue ewait;
213  	wait_queue_head_t *wq;
214  
215  	init_wait(&ewait.wait);
216  	ewait.wait.func = wake_exceptional_entry_func;
217  
218  	for (;;) {
219  		entry = xas_find_conflict(xas);
220  		if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
221  			return entry;
222  		if (dax_entry_order(entry) < order)
223  			return XA_RETRY_ENTRY;
224  		if (!dax_is_locked(entry))
225  			return entry;
226  
227  		wq = dax_entry_waitqueue(xas, entry, &ewait.key);
228  		prepare_to_wait_exclusive(wq, &ewait.wait,
229  					  TASK_UNINTERRUPTIBLE);
230  		xas_unlock_irq(xas);
231  		xas_reset(xas);
232  		schedule();
233  		finish_wait(wq, &ewait.wait);
234  		xas_lock_irq(xas);
235  	}
236  }
237  
238  /*
239   * The only thing keeping the address space around is the i_pages lock
240   * (it's cycled in clear_inode() after removing the entries from i_pages)
241   * After we call xas_unlock_irq(), we cannot touch xas->xa.
242   */
243  static void wait_entry_unlocked(struct xa_state *xas, void *entry)
244  {
245  	struct wait_exceptional_entry_queue ewait;
246  	wait_queue_head_t *wq;
247  
248  	init_wait(&ewait.wait);
249  	ewait.wait.func = wake_exceptional_entry_func;
250  
251  	wq = dax_entry_waitqueue(xas, entry, &ewait.key);
252  	/*
253  	 * Unlike get_unlocked_entry() there is no guarantee that this
254  	 * path ever successfully retrieves an unlocked entry before an
255  	 * inode dies. Perform a non-exclusive wait in case this path
256  	 * never successfully performs its own wake up.
257  	 */
258  	prepare_to_wait(wq, &ewait.wait, TASK_UNINTERRUPTIBLE);
259  	xas_unlock_irq(xas);
260  	schedule();
261  	finish_wait(wq, &ewait.wait);
262  }
263  
264  static void put_unlocked_entry(struct xa_state *xas, void *entry,
265  			       enum dax_wake_mode mode)
266  {
267  	if (entry && !dax_is_conflict(entry))
268  		dax_wake_entry(xas, entry, mode);
269  }
270  
271  /*
272   * We used the xa_state to get the entry, but then we locked the entry and
273   * dropped the xa_lock, so we know the xa_state is stale and must be reset
274   * before use.
275   */
276  static void dax_unlock_entry(struct xa_state *xas, void *entry)
277  {
278  	void *old;
279  
280  	BUG_ON(dax_is_locked(entry));
281  	xas_reset(xas);
282  	xas_lock_irq(xas);
283  	old = xas_store(xas, entry);
284  	xas_unlock_irq(xas);
285  	BUG_ON(!dax_is_locked(old));
286  	dax_wake_entry(xas, entry, WAKE_NEXT);
287  }
288  
289  /*
290   * Return: The entry stored at this location before it was locked.
291   */
292  static void *dax_lock_entry(struct xa_state *xas, void *entry)
293  {
294  	unsigned long v = xa_to_value(entry);
295  	return xas_store(xas, xa_mk_value(v | DAX_LOCKED));
296  }
297  
298  static unsigned long dax_entry_size(void *entry)
299  {
300  	if (dax_is_zero_entry(entry))
301  		return 0;
302  	else if (dax_is_empty_entry(entry))
303  		return 0;
304  	else if (dax_is_pmd_entry(entry))
305  		return PMD_SIZE;
306  	else
307  		return PAGE_SIZE;
308  }
309  
310  static unsigned long dax_end_pfn(void *entry)
311  {
312  	return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
313  }
314  
315  /*
316   * Iterate through all mapped pfns represented by an entry, i.e. skip
317   * 'empty' and 'zero' entries.
318   */
319  #define for_each_mapped_pfn(entry, pfn) \
320  	for (pfn = dax_to_pfn(entry); \
321  			pfn < dax_end_pfn(entry); pfn++)
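/*
 * For a PMD entry starting at pfn N this visits pfns N .. N + PG_PMD_NR - 1;
 * zero and empty entries have dax_entry_size() == 0, so the loop body is
 * never entered for them.
 */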
322  
323  static inline bool dax_page_is_shared(struct page *page)
324  {
325  	return page->mapping == PAGE_MAPPING_DAX_SHARED;
326  }
327  
328  /*
329   * Set page->mapping to PAGE_MAPPING_DAX_SHARED and increase the refcount
330   * (page->share).
331   */
332  static inline void dax_page_share_get(struct page *page)
333  {
334  	if (page->mapping != PAGE_MAPPING_DAX_SHARED) {
335  		/*
336  		 * Reset the index if the page was already mapped
337  		 * regularly before.
338  		 */
339  		if (page->mapping)
340  			page->share = 1;
341  		page->mapping = PAGE_MAPPING_DAX_SHARED;
342  	}
343  	page->share++;
344  }
345  
346  static inline unsigned long dax_page_share_put(struct page *page)
347  {
348  	return --page->share;
349  }
350  
351  /*
352   * When called from dax_insert_entry(), the shared flag indicates whether this
353   * entry is shared by multiple files.  If so, set page->mapping to
354   * PAGE_MAPPING_DAX_SHARED and use page->share as the refcount.
355   */
356  static void dax_associate_entry(void *entry, struct address_space *mapping,
357  		struct vm_area_struct *vma, unsigned long address, bool shared)
358  {
359  	unsigned long size = dax_entry_size(entry), pfn, index;
360  	int i = 0;
361  
362  	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
363  		return;
364  
365  	index = linear_page_index(vma, address & ~(size - 1));
366  	for_each_mapped_pfn(entry, pfn) {
367  		struct page *page = pfn_to_page(pfn);
368  
369  		if (shared) {
370  			dax_page_share_get(page);
371  		} else {
372  			WARN_ON_ONCE(page->mapping);
373  			page->mapping = mapping;
374  			page->index = index + i++;
375  		}
376  	}
377  }
378  
379  static void dax_disassociate_entry(void *entry, struct address_space *mapping,
380  		bool trunc)
381  {
382  	unsigned long pfn;
383  
384  	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
385  		return;
386  
387  	for_each_mapped_pfn(entry, pfn) {
388  		struct page *page = pfn_to_page(pfn);
389  
390  		WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
391  		if (dax_page_is_shared(page)) {
392  			/* keep the shared flag if this page is still shared */
393  			if (dax_page_share_put(page) > 0)
394  				continue;
395  		} else
396  			WARN_ON_ONCE(page->mapping && page->mapping != mapping);
397  		page->mapping = NULL;
398  		page->index = 0;
399  	}
400  }
401  
402  static struct page *dax_busy_page(void *entry)
403  {
404  	unsigned long pfn;
405  
406  	for_each_mapped_pfn(entry, pfn) {
407  		struct page *page = pfn_to_page(pfn);
408  
409  		if (page_ref_count(page) > 1)
410  			return page;
411  	}
412  	return NULL;
413  }
414  
415  /**
416   * dax_lock_folio - Lock the DAX entry corresponding to a folio
417   * @folio: The folio whose entry we want to lock
418   *
419   * Context: Process context.
420   * Return: A cookie to pass to dax_unlock_folio() or 0 if the entry could
421   * not be locked.
422   */
423  dax_entry_t dax_lock_folio(struct folio *folio)
424  {
425  	XA_STATE(xas, NULL, 0);
426  	void *entry;
427  
428  	/* Ensure folio->mapping isn't freed while we look at it */
429  	rcu_read_lock();
430  	for (;;) {
431  		struct address_space *mapping = READ_ONCE(folio->mapping);
432  
433  		entry = NULL;
434  		if (!mapping || !dax_mapping(mapping))
435  			break;
436  
437  		/*
438  		 * In the device-dax case there's no need to lock, a
439  		 * struct dev_pagemap pin is sufficient to keep the
440  		 * inode alive, and we assume we have dev_pagemap pin
441  		 * otherwise we would not have a valid pfn_to_page()
442  		 * translation.
443  		 */
444  		entry = (void *)~0UL;
445  		if (S_ISCHR(mapping->host->i_mode))
446  			break;
447  
448  		xas.xa = &mapping->i_pages;
449  		xas_lock_irq(&xas);
450  		if (mapping != folio->mapping) {
451  			xas_unlock_irq(&xas);
452  			continue;
453  		}
454  		xas_set(&xas, folio->index);
455  		entry = xas_load(&xas);
456  		if (dax_is_locked(entry)) {
457  			rcu_read_unlock();
458  			wait_entry_unlocked(&xas, entry);
459  			rcu_read_lock();
460  			continue;
461  		}
462  		dax_lock_entry(&xas, entry);
463  		xas_unlock_irq(&xas);
464  		break;
465  	}
466  	rcu_read_unlock();
467  	return (dax_entry_t)entry;
468  }
469  
470  void dax_unlock_folio(struct folio *folio, dax_entry_t cookie)
471  {
472  	struct address_space *mapping = folio->mapping;
473  	XA_STATE(xas, &mapping->i_pages, folio->index);
474  
475  	if (S_ISCHR(mapping->host->i_mode))
476  		return;
477  
478  	dax_unlock_entry(&xas, (void *)cookie);
479  }
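/*
 * Usage sketch (e.g. from memory-failure handling):
 *
 *	dax_entry_t cookie = dax_lock_folio(folio);
 *	if (cookie) {
 *		... operate on the folio while its DAX entry is locked ...
 *		dax_unlock_folio(folio, cookie);
 *	}
 */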
480  
481  /*
482   * dax_lock_mapping_entry - Lock the DAX entry corresponding to a mapping
483   * @mapping: the file's mapping whose entry we want to lock
484   * @index: the offset within this file
485   * @page: output the dax page corresponding to this dax entry
486   *
487   * Return: A cookie to pass to dax_unlock_mapping_entry() or 0 if the entry
488   * could not be locked.
489   */
490  dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, pgoff_t index,
491  		struct page **page)
492  {
493  	XA_STATE(xas, NULL, 0);
494  	void *entry;
495  
496  	rcu_read_lock();
497  	for (;;) {
498  		entry = NULL;
499  		if (!dax_mapping(mapping))
500  			break;
501  
502  		xas.xa = &mapping->i_pages;
503  		xas_lock_irq(&xas);
504  		xas_set(&xas, index);
505  		entry = xas_load(&xas);
506  		if (dax_is_locked(entry)) {
507  			rcu_read_unlock();
508  			wait_entry_unlocked(&xas, entry);
509  			rcu_read_lock();
510  			continue;
511  		}
512  		if (!entry ||
513  		    dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
514  			/*
515  			 * Because we are looking up the entry by the file's
516  			 * mapping and index, it may not have been inserted yet,
517  			 * or it may be a zero/empty entry.  We don't treat this
518  			 * as an error, so return a special value and do not
519  			 * output @page.
520  			 */
521  			entry = (void *)~0UL;
522  		} else {
523  			*page = pfn_to_page(dax_to_pfn(entry));
524  			dax_lock_entry(&xas, entry);
525  		}
526  		xas_unlock_irq(&xas);
527  		break;
528  	}
529  	rcu_read_unlock();
530  	return (dax_entry_t)entry;
531  }
532  
533  void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index,
534  		dax_entry_t cookie)
535  {
536  	XA_STATE(xas, &mapping->i_pages, index);
537  
538  	if (cookie == ~0UL)
539  		return;
540  
541  	dax_unlock_entry(&xas, (void *)cookie);
542  }
543  
544  /*
545   * Find page cache entry at given index. If it is a DAX entry, return it
546   * with the entry locked. If the page cache doesn't contain an entry at
547   * that index, add a locked empty entry.
548   *
549   * When requesting an entry with size DAX_PMD, grab_mapping_entry() will
550   * either return that locked entry or will return VM_FAULT_FALLBACK.
551   * This will happen if there are any PTE entries within the PMD range
552   * that we are requesting.
553   *
554   * We always favor PTE entries over PMD entries. There isn't a flow where we
555   * evict PTE entries in order to 'upgrade' them to a PMD entry.  A PMD
556   * insertion will fail if it finds any PTE entries already in the tree, and a
557   * PTE insertion will cause an existing PMD entry to be unmapped and
558   * downgraded to PTE entries.  This happens for both PMD zero pages as
559   * well as PMD empty entries.
560   *
561   * The exception to this downgrade path is for PMD entries that have
562   * real storage backing them.  We will leave these real PMD entries in
563   * the tree, and PTE writes will simply dirty the entire PMD entry.
564   *
565   * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
566   * persistent memory the benefit is doubtful. We can add that later if we can
567   * show it helps.
568   *
569   * On error, this function does not return an ERR_PTR.  Instead it returns
570   * a VM_FAULT code, encoded as an xarray internal entry.  The ERR_PTR values
571   * overlap with xarray value entries.
572   */
573  static void *grab_mapping_entry(struct xa_state *xas,
574  		struct address_space *mapping, unsigned int order)
575  {
576  	unsigned long index = xas->xa_index;
577  	bool pmd_downgrade;	/* splitting PMD entry into PTE entries? */
578  	void *entry;
579  
580  retry:
581  	pmd_downgrade = false;
582  	xas_lock_irq(xas);
583  	entry = get_unlocked_entry(xas, order);
584  
585  	if (entry) {
586  		if (dax_is_conflict(entry))
587  			goto fallback;
588  		if (!xa_is_value(entry)) {
589  			xas_set_err(xas, -EIO);
590  			goto out_unlock;
591  		}
592  
593  		if (order == 0) {
594  			if (dax_is_pmd_entry(entry) &&
595  			    (dax_is_zero_entry(entry) ||
596  			     dax_is_empty_entry(entry))) {
597  				pmd_downgrade = true;
598  			}
599  		}
600  	}
601  
602  	if (pmd_downgrade) {
603  		/*
604  		 * Make sure 'entry' remains valid while we drop
605  		 * the i_pages lock.
606  		 */
607  		dax_lock_entry(xas, entry);
608  
609  		/*
610  		 * Besides huge zero pages the only other thing that gets
611  		 * downgraded are empty entries which don't need to be
612  		 * unmapped.
613  		 */
614  		if (dax_is_zero_entry(entry)) {
615  			xas_unlock_irq(xas);
616  			unmap_mapping_pages(mapping,
617  					xas->xa_index & ~PG_PMD_COLOUR,
618  					PG_PMD_NR, false);
619  			xas_reset(xas);
620  			xas_lock_irq(xas);
621  		}
622  
623  		dax_disassociate_entry(entry, mapping, false);
624  		xas_store(xas, NULL);	/* undo the PMD join */
625  		dax_wake_entry(xas, entry, WAKE_ALL);
626  		mapping->nrpages -= PG_PMD_NR;
627  		entry = NULL;
628  		xas_set(xas, index);
629  	}
630  
631  	if (entry) {
632  		dax_lock_entry(xas, entry);
633  	} else {
634  		unsigned long flags = DAX_EMPTY;
635  
636  		if (order > 0)
637  			flags |= DAX_PMD;
638  		entry = dax_make_entry(pfn_to_pfn_t(0), flags);
639  		dax_lock_entry(xas, entry);
640  		if (xas_error(xas))
641  			goto out_unlock;
642  		mapping->nrpages += 1UL << order;
643  	}
644  
645  out_unlock:
646  	xas_unlock_irq(xas);
647  	if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM))
648  		goto retry;
649  	if (xas->xa_node == XA_ERROR(-ENOMEM))
650  		return xa_mk_internal(VM_FAULT_OOM);
651  	if (xas_error(xas))
652  		return xa_mk_internal(VM_FAULT_SIGBUS);
653  	return entry;
654  fallback:
655  	xas_unlock_irq(xas);
656  	return xa_mk_internal(VM_FAULT_FALLBACK);
657  }
658  
659  /**
660   * dax_layout_busy_page_range - find first pinned page in @mapping
661   * @mapping: address space to scan for a page with ref count > 1
662   * @start: Starting offset. Page containing 'start' is included.
663   * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX,
664   *       pages from 'start' till the end of file are included.
665   *
666   * DAX requires ZONE_DEVICE mapped pages. These pages are never
667   * 'onlined' to the page allocator so they are considered idle when
668   * page->count == 1. A filesystem uses this interface to determine if
669   * any page in the mapping is busy, i.e. for DMA, or other
670   * get_user_pages() usages.
671   *
672   * It is expected that the filesystem is holding locks to block the
673   * establishment of new mappings in this address_space. I.e. it expects
674   * to be able to run unmap_mapping_range() and subsequently not race
675   * mapping_mapped() becoming true.
676   */
677  struct page *dax_layout_busy_page_range(struct address_space *mapping,
678  					loff_t start, loff_t end)
679  {
680  	void *entry;
681  	unsigned int scanned = 0;
682  	struct page *page = NULL;
683  	pgoff_t start_idx = start >> PAGE_SHIFT;
684  	pgoff_t end_idx;
685  	XA_STATE(xas, &mapping->i_pages, start_idx);
686  
687  	/*
688  	 * In the 'limited' case get_user_pages() for dax is disabled.
689  	 */
690  	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
691  		return NULL;
692  
693  	if (!dax_mapping(mapping) || !mapping_mapped(mapping))
694  		return NULL;
695  
696  	/* If end == LLONG_MAX, all pages from start till the end of file */
697  	if (end == LLONG_MAX)
698  		end_idx = ULONG_MAX;
699  	else
700  		end_idx = end >> PAGE_SHIFT;
701  	/*
702  	 * If we race get_user_pages_fast() here either we'll see the
703  	 * elevated page count in the iteration and wait, or
704  	 * get_user_pages_fast() will see that the page it took a reference
705  	 * against is no longer mapped in the page tables and bail to the
706  	 * get_user_pages() slow path.  The slow path is protected by
707  	 * pte_lock() and pmd_lock(). New references are not taken without
708  	 * holding those locks, and unmap_mapping_pages() will not zero the
709  	 * pte or pmd without holding the respective lock, so we are
710  	 * guaranteed to either see new references or prevent new
711  	 * references from being established.
712  	 */
713  	unmap_mapping_pages(mapping, start_idx, end_idx - start_idx + 1, 0);
714  
715  	xas_lock_irq(&xas);
716  	xas_for_each(&xas, entry, end_idx) {
717  		if (WARN_ON_ONCE(!xa_is_value(entry)))
718  			continue;
719  		if (unlikely(dax_is_locked(entry)))
720  			entry = get_unlocked_entry(&xas, 0);
721  		if (entry)
722  			page = dax_busy_page(entry);
723  		put_unlocked_entry(&xas, entry, WAKE_NEXT);
724  		if (page)
725  			break;
726  		if (++scanned % XA_CHECK_SCHED)
727  			continue;
728  
729  		xas_pause(&xas);
730  		xas_unlock_irq(&xas);
731  		cond_resched();
732  		xas_lock_irq(&xas);
733  	}
734  	xas_unlock_irq(&xas);
735  	return page;
736  }
737  EXPORT_SYMBOL_GPL(dax_layout_busy_page_range);
738  
739  struct page *dax_layout_busy_page(struct address_space *mapping)
740  {
741  	return dax_layout_busy_page_range(mapping, 0, LLONG_MAX);
742  }
743  EXPORT_SYMBOL_GPL(dax_layout_busy_page);
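/*
 * Filesystems typically call dax_layout_busy_page() (or the _range variant)
 * before truncating or hole-punching a DAX file; if a busy page is returned
 * they wait for its refcount to drop back to one and then retry.
 */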
744  
745  static int __dax_invalidate_entry(struct address_space *mapping,
746  					  pgoff_t index, bool trunc)
747  {
748  	XA_STATE(xas, &mapping->i_pages, index);
749  	int ret = 0;
750  	void *entry;
751  
752  	xas_lock_irq(&xas);
753  	entry = get_unlocked_entry(&xas, 0);
754  	if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
755  		goto out;
756  	if (!trunc &&
757  	    (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) ||
758  	     xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE)))
759  		goto out;
760  	dax_disassociate_entry(entry, mapping, trunc);
761  	xas_store(&xas, NULL);
762  	mapping->nrpages -= 1UL << dax_entry_order(entry);
763  	ret = 1;
764  out:
765  	put_unlocked_entry(&xas, entry, WAKE_ALL);
766  	xas_unlock_irq(&xas);
767  	return ret;
768  }
769  
770  static int __dax_clear_dirty_range(struct address_space *mapping,
771  		pgoff_t start, pgoff_t end)
772  {
773  	XA_STATE(xas, &mapping->i_pages, start);
774  	unsigned int scanned = 0;
775  	void *entry;
776  
777  	xas_lock_irq(&xas);
778  	xas_for_each(&xas, entry, end) {
779  		entry = get_unlocked_entry(&xas, 0);
780  		xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
781  		xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
782  		put_unlocked_entry(&xas, entry, WAKE_NEXT);
783  
784  		if (++scanned % XA_CHECK_SCHED)
785  			continue;
786  
787  		xas_pause(&xas);
788  		xas_unlock_irq(&xas);
789  		cond_resched();
790  		xas_lock_irq(&xas);
791  	}
792  	xas_unlock_irq(&xas);
793  
794  	return 0;
795  }
796  
797  /*
798   * Delete DAX entry at @index from @mapping.  Wait for it
799   * to be unlocked before deleting it.
800   */
801  int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
802  {
803  	int ret = __dax_invalidate_entry(mapping, index, true);
804  
805  	/*
806  	 * This gets called from truncate / punch_hole path. As such, the caller
807  	 * must hold locks protecting against concurrent modifications of the
808  	 * page cache (usually fs-private i_mmap_sem for writing). Since the
809  	 * caller has seen a DAX entry for this index, we better find it
810  	 * at that index as well...
811  	 */
812  	WARN_ON_ONCE(!ret);
813  	return ret;
814  }
815  
816  /*
817   * Invalidate DAX entry if it is clean.
818   */
819  int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
820  				      pgoff_t index)
821  {
822  	return __dax_invalidate_entry(mapping, index, false);
823  }
824  
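/*
 * Translate a file position into a page offset within the DAX device backing
 * this iomap extent: iomap->addr is the extent's start address on the device
 * and iomap->offset is its starting offset in the file.
 */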
825  static pgoff_t dax_iomap_pgoff(const struct iomap *iomap, loff_t pos)
826  {
827  	return PHYS_PFN(iomap->addr + (pos & PAGE_MASK) - iomap->offset);
828  }
829  
830  static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter)
831  {
832  	pgoff_t pgoff = dax_iomap_pgoff(&iter->iomap, iter->pos);
833  	void *vto, *kaddr;
834  	long rc;
835  	int id;
836  
837  	id = dax_read_lock();
838  	rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, DAX_ACCESS,
839  				&kaddr, NULL);
840  	if (rc < 0) {
841  		dax_read_unlock(id);
842  		return rc;
843  	}
844  	vto = kmap_atomic(vmf->cow_page);
845  	copy_user_page(vto, kaddr, vmf->address, vmf->cow_page);
846  	kunmap_atomic(vto);
847  	dax_read_unlock(id);
848  	return 0;
849  }
850  
851  /*
852   * MAP_SYNC on a dax mapping guarantees dirty metadata is
853   * flushed on write-faults (non-cow), but not read-faults.
854   */
855  static bool dax_fault_is_synchronous(const struct iomap_iter *iter,
856  		struct vm_area_struct *vma)
857  {
858  	return (iter->flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC) &&
859  		(iter->iomap.flags & IOMAP_F_DIRTY);
860  }
861  
862  /*
863   * By this point grab_mapping_entry() has ensured that we have a locked entry
864   * of the appropriate size so we don't have to worry about downgrading PMDs to
865   * PTEs.  If we happen to be trying to insert a PTE and there is a PMD
866   * already in the tree, we will skip the insertion and just dirty the PMD as
867   * appropriate.
868   */
869  static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
870  		const struct iomap_iter *iter, void *entry, pfn_t pfn,
871  		unsigned long flags)
872  {
873  	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
874  	void *new_entry = dax_make_entry(pfn, flags);
875  	bool write = iter->flags & IOMAP_WRITE;
876  	bool dirty = write && !dax_fault_is_synchronous(iter, vmf->vma);
877  	bool shared = iter->iomap.flags & IOMAP_F_SHARED;
878  
879  	if (dirty)
880  		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
881  
882  	if (shared || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) {
883  		unsigned long index = xas->xa_index;
884  		/* we are replacing a zero page with block mapping */
885  		if (dax_is_pmd_entry(entry))
886  			unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
887  					PG_PMD_NR, false);
888  		else /* pte entry */
889  			unmap_mapping_pages(mapping, index, 1, false);
890  	}
891  
892  	xas_reset(xas);
893  	xas_lock_irq(xas);
894  	if (shared || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
895  		void *old;
896  
897  		dax_disassociate_entry(entry, mapping, false);
898  		dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address,
899  				shared);
900  		/*
901  		 * Only swap our new entry into the page cache if the current
902  		 * entry is a zero page or an empty entry.  If a normal PTE or
903  		 * PMD entry is already in the cache, we leave it alone.  This
904  		 * means that if we are trying to insert a PTE and the
905  		 * existing entry is a PMD, we will just leave the PMD in the
906  		 * tree and dirty it if necessary.
907  		 */
908  		old = dax_lock_entry(xas, new_entry);
909  		WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) |
910  					DAX_LOCKED));
911  		entry = new_entry;
912  	} else {
913  		xas_load(xas);	/* Walk the xa_state */
914  	}
915  
916  	if (dirty)
917  		xas_set_mark(xas, PAGECACHE_TAG_DIRTY);
918  
919  	if (write && shared)
920  		xas_set_mark(xas, PAGECACHE_TAG_TOWRITE);
921  
922  	xas_unlock_irq(xas);
923  	return entry;
924  }
925  
926  static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
927  		struct address_space *mapping, void *entry)
928  {
929  	unsigned long pfn, index, count, end;
930  	long ret = 0;
931  	struct vm_area_struct *vma;
932  
933  	/*
934  	 * A page got tagged dirty in DAX mapping? Something is seriously
935  	 * wrong.
936  	 */
937  	if (WARN_ON(!xa_is_value(entry)))
938  		return -EIO;
939  
940  	if (unlikely(dax_is_locked(entry))) {
941  		void *old_entry = entry;
942  
943  		entry = get_unlocked_entry(xas, 0);
944  
945  		/* Entry got punched out / reallocated? */
946  		if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
947  			goto put_unlocked;
948  		/*
949  		 * Entry got reallocated elsewhere? No need to writeback.
950  		 * We have to compare pfns as we must not bail out due to
951  		 * difference in lockbit or entry type.
952  		 */
953  		if (dax_to_pfn(old_entry) != dax_to_pfn(entry))
954  			goto put_unlocked;
955  		if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
956  					dax_is_zero_entry(entry))) {
957  			ret = -EIO;
958  			goto put_unlocked;
959  		}
960  
961  		/* Another fsync thread may have already done this entry */
962  		if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE))
963  			goto put_unlocked;
964  	}
965  
966  	/* Lock the entry to serialize with page faults */
967  	dax_lock_entry(xas, entry);
968  
969  	/*
970  	 * We can clear the tag now but we have to be careful so that concurrent
971  	 * dax_writeback_one() calls for the same index cannot finish before we
972  	 * actually flush the caches. This is achieved as the calls will look
973  	 * at the entry only under the i_pages lock and once they do that
974  	 * they will see the entry locked and wait for it to unlock.
975  	 */
976  	xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE);
977  	xas_unlock_irq(xas);
978  
979  	/*
980  	 * If dax_writeback_mapping_range() was given a wbc->range_start
981  	 * in the middle of a PMD, the 'index' we use needs to be
982  	 * aligned to the start of the PMD.
983  	 * This allows us to flush for PMD_SIZE and not have to worry about
984  	 * partial PMD writebacks.
985  	 */
986  	pfn = dax_to_pfn(entry);
987  	count = 1UL << dax_entry_order(entry);
988  	index = xas->xa_index & ~(count - 1);
989  	end = index + count - 1;
990  
991  	/* Walk all mappings of a given index of a file and writeprotect them */
992  	i_mmap_lock_read(mapping);
993  	vma_interval_tree_foreach(vma, &mapping->i_mmap, index, end) {
994  		pfn_mkclean_range(pfn, count, index, vma);
995  		cond_resched();
996  	}
997  	i_mmap_unlock_read(mapping);
998  
999  	dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE);
1000  	/*
1001  	 * After we have flushed the cache, we can clear the dirty tag. There
1002  	 * cannot be new dirty data in the pfn after the flush has completed as
1003  	 * the pfn mappings are writeprotected and fault waits for mapping
1004  	 * entry lock.
1005  	 */
1006  	xas_reset(xas);
1007  	xas_lock_irq(xas);
1008  	xas_store(xas, entry);
1009  	xas_clear_mark(xas, PAGECACHE_TAG_DIRTY);
1010  	dax_wake_entry(xas, entry, WAKE_NEXT);
1011  
1012  	trace_dax_writeback_one(mapping->host, index, count);
1013  	return ret;
1014  
1015   put_unlocked:
1016  	put_unlocked_entry(xas, entry, WAKE_NEXT);
1017  	return ret;
1018  }
1019  
1020  /*
1021   * Flush the mapping to the persistent domain within the byte range of [start,
1022   * end]. This is required by data integrity operations to ensure file data is
1023   * on persistent storage prior to completion of the operation.
1024   */
1025  int dax_writeback_mapping_range(struct address_space *mapping,
1026  		struct dax_device *dax_dev, struct writeback_control *wbc)
1027  {
1028  	XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT);
1029  	struct inode *inode = mapping->host;
1030  	pgoff_t end_index = wbc->range_end >> PAGE_SHIFT;
1031  	void *entry;
1032  	int ret = 0;
1033  	unsigned int scanned = 0;
1034  
1035  	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
1036  		return -EIO;
1037  
1038  	if (mapping_empty(mapping) || wbc->sync_mode != WB_SYNC_ALL)
1039  		return 0;
1040  
1041  	trace_dax_writeback_range(inode, xas.xa_index, end_index);
1042  
1043  	tag_pages_for_writeback(mapping, xas.xa_index, end_index);
1044  
1045  	xas_lock_irq(&xas);
1046  	xas_for_each_marked(&xas, entry, end_index, PAGECACHE_TAG_TOWRITE) {
1047  		ret = dax_writeback_one(&xas, dax_dev, mapping, entry);
1048  		if (ret < 0) {
1049  			mapping_set_error(mapping, ret);
1050  			break;
1051  		}
1052  		if (++scanned % XA_CHECK_SCHED)
1053  			continue;
1054  
1055  		xas_pause(&xas);
1056  		xas_unlock_irq(&xas);
1057  		cond_resched();
1058  		xas_lock_irq(&xas);
1059  	}
1060  	xas_unlock_irq(&xas);
1061  	trace_dax_writeback_range_done(inode, xas.xa_index, end_index);
1062  	return ret;
1063  }
1064  EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
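/*
 * Filesystems with DAX inodes typically call dax_writeback_mapping_range()
 * from their ->writepages() method so that fsync()/msync() flush CPU caches
 * for all dirty DAX mappings.
 */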
1065  
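/*
 * Resolve @pos within @iomap to a kernel address and/or pfn via
 * dax_direct_access(), verifying that the returned extent is large enough
 * for @size and that the pfn is suitably aligned.
 */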
1066  static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
1067  		size_t size, void **kaddr, pfn_t *pfnp)
1068  {
1069  	pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
1070  	int id, rc = 0;
1071  	long length;
1072  
1073  	id = dax_read_lock();
1074  	length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
1075  				   DAX_ACCESS, kaddr, pfnp);
1076  	if (length < 0) {
1077  		rc = length;
1078  		goto out;
1079  	}
1080  	if (!pfnp)
1081  		goto out_check_addr;
1082  	rc = -EINVAL;
1083  	if (PFN_PHYS(length) < size)
1084  		goto out;
1085  	if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1))
1086  		goto out;
1087  	/* For larger pages we need devmap */
1088  	if (length > 1 && !pfn_t_devmap(*pfnp))
1089  		goto out;
1090  	rc = 0;
1091  
1092  out_check_addr:
1093  	if (!kaddr)
1094  		goto out;
1095  	if (!*kaddr)
1096  		rc = -EFAULT;
1097  out:
1098  	dax_read_unlock(id);
1099  	return rc;
1100  }
1101  
1102  /**
1103   * dax_iomap_copy_around - Prepare for an unaligned write to a shared/cow page
1104   * by copying the data before and after the range to be written.
1105   * @pos:	address to do copy from.
1106   * @length:	size of copy operation.
1107   * @align_size:	aligned w.r.t align_size (either PMD_SIZE or PAGE_SIZE)
1108   * @srcmap:	iomap srcmap
1109   * @daddr:	destination address to copy to.
1110   *
1111   * This can be called from two places. Either during DAX write fault (page
1112   * aligned), to copy the length size data to daddr. Or, while doing normal DAX
1113   * write operation, dax_iomap_iter() might call this to do the copy of either
1114   * start or end unaligned address. In the latter case the rest of the copy of
1115   * aligned ranges is taken care of by dax_iomap_iter() itself.
1116   * If the srcmap contains invalid data, such as HOLE and UNWRITTEN, zero the
1117   * area to make sure no old data remains.
1118   */
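/*
 * For example, with PAGE_SIZE (4096) alignment, pos == 1536 and length == 1024
 * give head_off == 1536 and pg_end == 4096, so bytes [0, 1536) and
 * [2560, 4096) of the page are copied (or zeroed) around the write.
 */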
1119  static int dax_iomap_copy_around(loff_t pos, uint64_t length, size_t align_size,
1120  		const struct iomap *srcmap, void *daddr)
1121  {
1122  	loff_t head_off = pos & (align_size - 1);
1123  	size_t size = ALIGN(head_off + length, align_size);
1124  	loff_t end = pos + length;
1125  	loff_t pg_end = round_up(end, align_size);
1126  	/* copy_all is usually in page fault case */
1127  	bool copy_all = head_off == 0 && end == pg_end;
1128  	/* zero the edges if srcmap is a HOLE or IOMAP_UNWRITTEN */
1129  	bool zero_edge = srcmap->flags & IOMAP_F_SHARED ||
1130  			 srcmap->type == IOMAP_UNWRITTEN;
1131  	void *saddr = NULL;
1132  	int ret = 0;
1133  
1134  	if (!zero_edge) {
1135  		ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL);
1136  		if (ret)
1137  			return dax_mem2blk_err(ret);
1138  	}
1139  
1140  	if (copy_all) {
1141  		if (zero_edge)
1142  			memset(daddr, 0, size);
1143  		else
1144  			ret = copy_mc_to_kernel(daddr, saddr, length);
1145  		goto out;
1146  	}
1147  
1148  	/* Copy the head part of the range */
1149  	if (head_off) {
1150  		if (zero_edge)
1151  			memset(daddr, 0, head_off);
1152  		else {
1153  			ret = copy_mc_to_kernel(daddr, saddr, head_off);
1154  			if (ret)
1155  				return -EIO;
1156  		}
1157  	}
1158  
1159  	/* Copy the tail part of the range */
1160  	if (end < pg_end) {
1161  		loff_t tail_off = head_off + length;
1162  		loff_t tail_len = pg_end - end;
1163  
1164  		if (zero_edge)
1165  			memset(daddr + tail_off, 0, tail_len);
1166  		else {
1167  			ret = copy_mc_to_kernel(daddr + tail_off,
1168  						saddr + tail_off, tail_len);
1169  			if (ret)
1170  				return -EIO;
1171  		}
1172  	}
1173  out:
1174  	if (zero_edge)
1175  		dax_flush(srcmap->dax_dev, daddr, size);
1176  	return ret ? -EIO : 0;
1177  }
1178  
1179  /*
1180   * The user has performed a load from a hole in the file.  Allocating a new
1181   * page in the file would cause excessive storage usage for workloads with
1182   * sparse files.  Instead we insert a read-only mapping of the 4k zero page.
1183   * If this page is ever written to we will re-fault and change the mapping to
1184   * point to real DAX storage instead.
1185   */
1186  static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf,
1187  		const struct iomap_iter *iter, void **entry)
1188  {
1189  	struct inode *inode = iter->inode;
1190  	unsigned long vaddr = vmf->address;
1191  	pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
1192  	vm_fault_t ret;
1193  
1194  	*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE);
1195  
1196  	ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
1197  	trace_dax_load_hole(inode, vmf, ret);
1198  	return ret;
1199  }
1200  
1201  #ifdef CONFIG_FS_DAX_PMD
1202  static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
1203  		const struct iomap_iter *iter, void **entry)
1204  {
1205  	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
1206  	unsigned long pmd_addr = vmf->address & PMD_MASK;
1207  	struct vm_area_struct *vma = vmf->vma;
1208  	struct inode *inode = mapping->host;
1209  	pgtable_t pgtable = NULL;
1210  	struct folio *zero_folio;
1211  	spinlock_t *ptl;
1212  	pmd_t pmd_entry;
1213  	pfn_t pfn;
1214  
1215  	zero_folio = mm_get_huge_zero_folio(vmf->vma->vm_mm);
1216  
1217  	if (unlikely(!zero_folio))
1218  		goto fallback;
1219  
1220  	pfn = page_to_pfn_t(&zero_folio->page);
1221  	*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn,
1222  				  DAX_PMD | DAX_ZERO_PAGE);
1223  
1224  	if (arch_needs_pgtable_deposit()) {
1225  		pgtable = pte_alloc_one(vma->vm_mm);
1226  		if (!pgtable)
1227  			return VM_FAULT_OOM;
1228  	}
1229  
1230  	ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
1231  	if (!pmd_none(*(vmf->pmd))) {
1232  		spin_unlock(ptl);
1233  		goto fallback;
1234  	}
1235  
1236  	if (pgtable) {
1237  		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
1238  		mm_inc_nr_ptes(vma->vm_mm);
1239  	}
1240  	pmd_entry = mk_pmd(&zero_folio->page, vmf->vma->vm_page_prot);
1241  	pmd_entry = pmd_mkhuge(pmd_entry);
1242  	set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
1243  	spin_unlock(ptl);
1244  	trace_dax_pmd_load_hole(inode, vmf, zero_folio, *entry);
1245  	return VM_FAULT_NOPAGE;
1246  
1247  fallback:
1248  	if (pgtable)
1249  		pte_free(vma->vm_mm, pgtable);
1250  	trace_dax_pmd_load_hole_fallback(inode, vmf, zero_folio, *entry);
1251  	return VM_FAULT_FALLBACK;
1252  }
1253  #else
1254  static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
1255  		const struct iomap_iter *iter, void **entry)
1256  {
1257  	return VM_FAULT_FALLBACK;
1258  }
1259  #endif /* CONFIG_FS_DAX_PMD */
1260  
1261  static s64 dax_unshare_iter(struct iomap_iter *iter)
1262  {
1263  	struct iomap *iomap = &iter->iomap;
1264  	const struct iomap *srcmap = iomap_iter_srcmap(iter);
1265  	loff_t copy_pos = iter->pos;
1266  	u64 copy_len = iomap_length(iter);
1267  	u32 mod;
1268  	int id = 0;
1269  	s64 ret = 0;
1270  	void *daddr = NULL, *saddr = NULL;
1271  
1272  	if (!iomap_want_unshare_iter(iter))
1273  		return iomap_length(iter);
1274  
1275  	/*
1276  	 * Extend the file range to be aligned to fsblock/pagesize, because
1277  	 * we need to copy entire blocks, not just the byte range specified.
1278  	 * Invalidate the mapping because we're about to CoW.
1279  	 */
1280  	mod = offset_in_page(copy_pos);
1281  	if (mod) {
1282  		copy_len += mod;
1283  		copy_pos -= mod;
1284  	}
1285  
1286  	mod = offset_in_page(copy_pos + copy_len);
1287  	if (mod)
1288  		copy_len += PAGE_SIZE - mod;
1289  
1290  	invalidate_inode_pages2_range(iter->inode->i_mapping,
1291  				      copy_pos >> PAGE_SHIFT,
1292  				      (copy_pos + copy_len - 1) >> PAGE_SHIFT);
1293  
1294  	id = dax_read_lock();
1295  	ret = dax_iomap_direct_access(iomap, copy_pos, copy_len, &daddr, NULL);
1296  	if (ret < 0)
1297  		goto out_unlock;
1298  
1299  	ret = dax_iomap_direct_access(srcmap, copy_pos, copy_len, &saddr, NULL);
1300  	if (ret < 0)
1301  		goto out_unlock;
1302  
1303  	if (copy_mc_to_kernel(daddr, saddr, copy_len) == 0)
1304  		ret = iomap_length(iter);
1305  	else
1306  		ret = -EIO;
1307  
1308  out_unlock:
1309  	dax_read_unlock(id);
1310  	return dax_mem2blk_err(ret);
1311  }
1312  
1313  int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len,
1314  		const struct iomap_ops *ops)
1315  {
1316  	struct iomap_iter iter = {
1317  		.inode		= inode,
1318  		.pos		= pos,
1319  		.flags		= IOMAP_WRITE | IOMAP_UNSHARE | IOMAP_DAX,
1320  	};
1321  	loff_t size = i_size_read(inode);
1322  	int ret;
1323  
1324  	if (pos < 0 || pos >= size)
1325  		return 0;
1326  
1327  	iter.len = min(len, size - pos);
1328  	while ((ret = iomap_iter(&iter, ops)) > 0)
1329  		iter.processed = dax_unshare_iter(&iter);
1330  	return ret;
1331  }
1332  EXPORT_SYMBOL_GPL(dax_file_unshare);
1333  
1334  static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size)
1335  {
1336  	const struct iomap *iomap = &iter->iomap;
1337  	const struct iomap *srcmap = iomap_iter_srcmap(iter);
1338  	unsigned offset = offset_in_page(pos);
1339  	pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
1340  	void *kaddr;
1341  	long ret;
1342  
1343  	ret = dax_direct_access(iomap->dax_dev, pgoff, 1, DAX_ACCESS, &kaddr,
1344  				NULL);
1345  	if (ret < 0)
1346  		return dax_mem2blk_err(ret);
1347  
1348  	memset(kaddr + offset, 0, size);
1349  	if (iomap->flags & IOMAP_F_SHARED)
1350  		ret = dax_iomap_copy_around(pos, size, PAGE_SIZE, srcmap,
1351  					    kaddr);
1352  	else
1353  		dax_flush(iomap->dax_dev, kaddr + offset, size);
1354  	return ret;
1355  }
1356  
1357  static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
1358  {
1359  	const struct iomap *iomap = &iter->iomap;
1360  	const struct iomap *srcmap = iomap_iter_srcmap(iter);
1361  	loff_t pos = iter->pos;
1362  	u64 length = iomap_length(iter);
1363  	s64 written = 0;
1364  
1365  	/* already zeroed?  we're done. */
1366  	if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
1367  		return length;
1368  
1369  	/*
1370  	 * invalidate the pages whose sharing state is to be changed
1371  	 * because of CoW.
1372  	 */
1373  	if (iomap->flags & IOMAP_F_SHARED)
1374  		invalidate_inode_pages2_range(iter->inode->i_mapping,
1375  					      pos >> PAGE_SHIFT,
1376  					      (pos + length - 1) >> PAGE_SHIFT);
1377  
1378  	do {
1379  		unsigned offset = offset_in_page(pos);
1380  		unsigned size = min_t(u64, PAGE_SIZE - offset, length);
1381  		pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
1382  		long rc;
1383  		int id;
1384  
1385  		id = dax_read_lock();
1386  		if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE)
1387  			rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
1388  		else
1389  			rc = dax_memzero(iter, pos, size);
1390  		dax_read_unlock(id);
1391  
1392  		if (rc < 0)
1393  			return rc;
1394  		pos += size;
1395  		length -= size;
1396  		written += size;
1397  	} while (length > 0);
1398  
1399  	if (did_zero)
1400  		*did_zero = true;
1401  	return written;
1402  }
1403  
1404  int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
1405  		const struct iomap_ops *ops)
1406  {
1407  	struct iomap_iter iter = {
1408  		.inode		= inode,
1409  		.pos		= pos,
1410  		.len		= len,
1411  		.flags		= IOMAP_DAX | IOMAP_ZERO,
1412  	};
1413  	int ret;
1414  
1415  	while ((ret = iomap_iter(&iter, ops)) > 0)
1416  		iter.processed = dax_zero_iter(&iter, did_zero);
1417  	return ret;
1418  }
1419  EXPORT_SYMBOL_GPL(dax_zero_range);
1420  
1421  int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
1422  		const struct iomap_ops *ops)
1423  {
1424  	unsigned int blocksize = i_blocksize(inode);
1425  	unsigned int off = pos & (blocksize - 1);
1426  
1427  	/* Block boundary? Nothing to do */
1428  	if (!off)
1429  		return 0;
1430  	return dax_zero_range(inode, pos, blocksize - off, did_zero, ops);
1431  }
1432  EXPORT_SYMBOL_GPL(dax_truncate_page);
1433  
1434  static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
1435  		struct iov_iter *iter)
1436  {
1437  	const struct iomap *iomap = &iomi->iomap;
1438  	const struct iomap *srcmap = iomap_iter_srcmap(iomi);
1439  	loff_t length = iomap_length(iomi);
1440  	loff_t pos = iomi->pos;
1441  	struct dax_device *dax_dev = iomap->dax_dev;
1442  	loff_t end = pos + length, done = 0;
1443  	bool write = iov_iter_rw(iter) == WRITE;
1444  	bool cow = write && iomap->flags & IOMAP_F_SHARED;
1445  	ssize_t ret = 0;
1446  	size_t xfer;
1447  	int id;
1448  
1449  	if (!write) {
1450  		end = min(end, i_size_read(iomi->inode));
1451  		if (pos >= end)
1452  			return 0;
1453  
1454  		if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
1455  			return iov_iter_zero(min(length, end - pos), iter);
1456  	}
1457  
1458  	/*
1459  	 * In DAX mode, enforce either pure overwrites of written extents, or
1460  	 * writes to unwritten extents as part of a copy-on-write operation.
1461  	 */
1462  	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED &&
1463  			!(iomap->flags & IOMAP_F_SHARED)))
1464  		return -EIO;
1465  
1466  	/*
1467  	 * Write can allocate block for an area which has a hole page mapped
1468  	 * into page tables. We have to tear down these mappings so that data
1469  	 * written by write(2) is visible in mmap.
1470  	 */
1471  	if (iomap->flags & IOMAP_F_NEW || cow) {
1472  		/*
1473  		 * Filesystem allows CoW on non-shared extents. The src extents
1474  		 * may have been mmapped with dirty mark before. To be able to
1475  		 * invalidate its dax entries, we need to clear the dirty mark
1476  		 * in advance.
1477  		 */
1478  		if (cow)
1479  			__dax_clear_dirty_range(iomi->inode->i_mapping,
1480  						pos >> PAGE_SHIFT,
1481  						(end - 1) >> PAGE_SHIFT);
1482  		invalidate_inode_pages2_range(iomi->inode->i_mapping,
1483  					      pos >> PAGE_SHIFT,
1484  					      (end - 1) >> PAGE_SHIFT);
1485  	}
1486  
1487  	id = dax_read_lock();
1488  	while (pos < end) {
1489  		unsigned offset = pos & (PAGE_SIZE - 1);
1490  		const size_t size = ALIGN(length + offset, PAGE_SIZE);
1491  		pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
1492  		ssize_t map_len;
1493  		bool recovery = false;
1494  		void *kaddr;
1495  
1496  		if (fatal_signal_pending(current)) {
1497  			ret = -EINTR;
1498  			break;
1499  		}
1500  
1501  		map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
1502  				DAX_ACCESS, &kaddr, NULL);
1503  		if (map_len == -EHWPOISON && iov_iter_rw(iter) == WRITE) {
1504  			map_len = dax_direct_access(dax_dev, pgoff,
1505  					PHYS_PFN(size), DAX_RECOVERY_WRITE,
1506  					&kaddr, NULL);
1507  			if (map_len > 0)
1508  				recovery = true;
1509  		}
1510  		if (map_len < 0) {
1511  			ret = dax_mem2blk_err(map_len);
1512  			break;
1513  		}
1514  
1515  		if (cow) {
1516  			ret = dax_iomap_copy_around(pos, length, PAGE_SIZE,
1517  						    srcmap, kaddr);
1518  			if (ret)
1519  				break;
1520  		}
1521  
1522  		map_len = PFN_PHYS(map_len);
1523  		kaddr += offset;
1524  		map_len -= offset;
1525  		if (map_len > end - pos)
1526  			map_len = end - pos;
1527  
1528  		if (recovery)
1529  			xfer = dax_recovery_write(dax_dev, pgoff, kaddr,
1530  					map_len, iter);
1531  		else if (write)
1532  			xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
1533  					map_len, iter);
1534  		else
1535  			xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
1536  					map_len, iter);
1537  
1538  		pos += xfer;
1539  		length -= xfer;
1540  		done += xfer;
1541  
1542  		if (xfer == 0)
1543  			ret = -EFAULT;
1544  		if (xfer < map_len)
1545  			break;
1546  	}
1547  	dax_read_unlock(id);
1548  
1549  	return done ? done : ret;
1550  }
1551  
1552  /**
1553   * dax_iomap_rw - Perform I/O to a DAX file
1554   * @iocb:	The control block for this I/O
1555   * @iter:	The addresses to do I/O from or to
1556   * @ops:	iomap ops passed from the file system
1557   *
1558   * This function performs read and write operations to directly mapped
1559   * persistent memory.  The caller needs to take care of read/write exclusion
1560   * and evicting any page cache pages in the region under I/O.
1561   */
1562  ssize_t
1563  dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
1564  		const struct iomap_ops *ops)
1565  {
1566  	struct iomap_iter iomi = {
1567  		.inode		= iocb->ki_filp->f_mapping->host,
1568  		.pos		= iocb->ki_pos,
1569  		.len		= iov_iter_count(iter),
1570  		.flags		= IOMAP_DAX,
1571  	};
1572  	loff_t done = 0;
1573  	int ret;
1574  
1575  	if (!iomi.len)
1576  		return 0;
1577  
1578  	if (iov_iter_rw(iter) == WRITE) {
1579  		lockdep_assert_held_write(&iomi.inode->i_rwsem);
1580  		iomi.flags |= IOMAP_WRITE;
1581  	} else {
1582  		lockdep_assert_held(&iomi.inode->i_rwsem);
1583  	}
1584  
1585  	if (iocb->ki_flags & IOCB_NOWAIT)
1586  		iomi.flags |= IOMAP_NOWAIT;
1587  
1588  	while ((ret = iomap_iter(&iomi, ops)) > 0)
1589  		iomi.processed = dax_iomap_iter(&iomi, iter);
1590  
1591  	done = iomi.pos - iocb->ki_pos;
1592  	iocb->ki_pos = iomi.pos;
1593  	return done ? done : ret;
1594  }
1595  EXPORT_SYMBOL_GPL(dax_iomap_rw);
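/*
 * Usage sketch (illustrative names, not a specific filesystem): a DAX-aware
 * read path is expected to hold i_rwsem and pass its iomap_ops:
 *
 *	inode_lock_shared(inode);
 *	ret = dax_iomap_rw(iocb, to, &foo_iomap_ops);
 *	inode_unlock_shared(inode);
 */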
1596  
1597  static vm_fault_t dax_fault_return(int error)
1598  {
1599  	if (error == 0)
1600  		return VM_FAULT_NOPAGE;
1601  	return vmf_error(error);
1602  }
1603  
1604  /*
1605   * When handling a synchronous page fault and the inode needs an fsync, we can
1606   * insert the PTE/PMD into the page tables only after that fsync has happened.
1607   * Skip the insertion for now and return the pfn so that the caller can insert
1608   * it after the fsync is done.
1609   */
1610  static vm_fault_t dax_fault_synchronous_pfnp(pfn_t *pfnp, pfn_t pfn)
1611  {
1612  	if (WARN_ON_ONCE(!pfnp))
1613  		return VM_FAULT_SIGBUS;
1614  	*pfnp = pfn;
1615  	return VM_FAULT_NEEDDSYNC;
1616  }
1617  
1618  static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf,
1619  		const struct iomap_iter *iter)
1620  {
1621  	vm_fault_t ret;
1622  	int error = 0;
1623  
1624  	switch (iter->iomap.type) {
1625  	case IOMAP_HOLE:
1626  	case IOMAP_UNWRITTEN:
1627  		clear_user_highpage(vmf->cow_page, vmf->address);
1628  		break;
1629  	case IOMAP_MAPPED:
1630  		error = copy_cow_page_dax(vmf, iter);
1631  		break;
1632  	default:
1633  		WARN_ON_ONCE(1);
1634  		error = -EIO;
1635  		break;
1636  	}
1637  
1638  	if (error)
1639  		return dax_fault_return(error);
1640  
1641  	__SetPageUptodate(vmf->cow_page);
1642  	ret = finish_fault(vmf);
1643  	if (!ret)
1644  		return VM_FAULT_DONE_COW;
1645  	return ret;
1646  }
1647  
1648  /**
1649   * dax_fault_iter - Common actor to handle pfn insertion in PTE/PMD fault.
1650   * @vmf:	vm fault instance
1651   * @iter:	iomap iter
1652   * @pfnp:	pfn to be returned
1653   * @xas:	the dax mapping tree of a file
1654   * @entry:	an unlocked dax entry to be inserted
1655   * @pmd:	true if this is a PMD fault, false for a PTE fault
1656   */
1657  static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
1658  		const struct iomap_iter *iter, pfn_t *pfnp,
1659  		struct xa_state *xas, void **entry, bool pmd)
1660  {
1661  	const struct iomap *iomap = &iter->iomap;
1662  	const struct iomap *srcmap = iomap_iter_srcmap(iter);
1663  	size_t size = pmd ? PMD_SIZE : PAGE_SIZE;
1664  	loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT;
1665  	bool write = iter->flags & IOMAP_WRITE;
1666  	unsigned long entry_flags = pmd ? DAX_PMD : 0;
1667  	int err = 0;
1668  	pfn_t pfn;
1669  	void *kaddr;
1670  
1671  	if (!pmd && vmf->cow_page)
1672  		return dax_fault_cow_page(vmf, iter);
1673  
1674  	/* If we are reading an UNWRITTEN or HOLE extent, return a hole. */
1675  	if (!write &&
1676  	    (iomap->type == IOMAP_UNWRITTEN || iomap->type == IOMAP_HOLE)) {
1677  		if (!pmd)
1678  			return dax_load_hole(xas, vmf, iter, entry);
1679  		return dax_pmd_load_hole(xas, vmf, iter, entry);
1680  	}
1681  
1682  	if (iomap->type != IOMAP_MAPPED && !(iomap->flags & IOMAP_F_SHARED)) {
1683  		WARN_ON_ONCE(1);
1684  		return pmd ? VM_FAULT_FALLBACK : VM_FAULT_SIGBUS;
1685  	}
1686  
1687  	err = dax_iomap_direct_access(iomap, pos, size, &kaddr, &pfn);
1688  	if (err)
1689  		return pmd ? VM_FAULT_FALLBACK : dax_fault_return(err);
1690  
1691  	*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, entry_flags);
1692  
1693  	if (write && iomap->flags & IOMAP_F_SHARED) {
1694  		err = dax_iomap_copy_around(pos, size, size, srcmap, kaddr);
1695  		if (err)
1696  			return dax_fault_return(err);
1697  	}
1698  
1699  	if (dax_fault_is_synchronous(iter, vmf->vma))
1700  		return dax_fault_synchronous_pfnp(pfnp, pfn);
1701  
1702  	/* insert PMD pfn */
1703  	if (pmd)
1704  		return vmf_insert_pfn_pmd(vmf, pfn, write);
1705  
1706  	/* insert PTE pfn */
1707  	if (write)
1708  		return vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
1709  	return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
1710  }
1711  
1712  static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
1713  			       int *iomap_errp, const struct iomap_ops *ops)
1714  {
1715  	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
1716  	XA_STATE(xas, &mapping->i_pages, vmf->pgoff);
1717  	struct iomap_iter iter = {
1718  		.inode		= mapping->host,
1719  		.pos		= (loff_t)vmf->pgoff << PAGE_SHIFT,
1720  		.len		= PAGE_SIZE,
1721  		.flags		= IOMAP_DAX | IOMAP_FAULT,
1722  	};
1723  	vm_fault_t ret = 0;
1724  	void *entry;
1725  	int error;
1726  
1727  	trace_dax_pte_fault(iter.inode, vmf, ret);
1728  	/*
1729  	 * Check that the offset isn't beyond the end of the file now. The caller
1730  	 * is supposed to hold locks serializing us with truncate / punch hole, so
1731  	 * this is a reliable test.
1732  	 */
1733  	if (iter.pos >= i_size_read(iter.inode)) {
1734  		ret = VM_FAULT_SIGBUS;
1735  		goto out;
1736  	}
1737  
1738  	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
1739  		iter.flags |= IOMAP_WRITE;
1740  
1741  	entry = grab_mapping_entry(&xas, mapping, 0);
1742  	if (xa_is_internal(entry)) {
1743  		ret = xa_to_internal(entry);
1744  		goto out;
1745  	}
1746  
1747  	/*
1748  	 * It is possible, particularly with mixed reads & writes to private
1749  	 * mappings, that we have raced with a PMD fault that overlaps with
1750  	 * the PTE we need to set up.  If so just return and the fault will be
1751  	 * retried.
1752  	 */
1753  	if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) {
1754  		ret = VM_FAULT_NOPAGE;
1755  		goto unlock_entry;
1756  	}
1757  
1758  	while ((error = iomap_iter(&iter, ops)) > 0) {
1759  		if (WARN_ON_ONCE(iomap_length(&iter) < PAGE_SIZE)) {
1760  			iter.processed = -EIO;	/* fs corruption? */
1761  			continue;
1762  		}
1763  
1764  		ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, false);
1765  		if (ret != VM_FAULT_SIGBUS &&
1766  		    (iter.iomap.flags & IOMAP_F_NEW)) {
1767  			count_vm_event(PGMAJFAULT);
1768  			count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
1769  			ret |= VM_FAULT_MAJOR;
1770  		}
1771  
1772  		if (!(ret & VM_FAULT_ERROR))
1773  			iter.processed = PAGE_SIZE;
1774  	}
1775  
1776  	if (iomap_errp)
1777  		*iomap_errp = error;
1778  	if (!ret && error)
1779  		ret = dax_fault_return(error);
1780  
1781  unlock_entry:
1782  	dax_unlock_entry(&xas, entry);
1783  out:
1784  	trace_dax_pte_fault_done(iter.inode, vmf, ret);
1785  	return ret;
1786  }
1787  
1788  #ifdef CONFIG_FS_DAX_PMD
1789  static bool dax_fault_check_fallback(struct vm_fault *vmf, struct xa_state *xas,
1790  		pgoff_t max_pgoff)
1791  {
1792  	unsigned long pmd_addr = vmf->address & PMD_MASK;
1793  	bool write = vmf->flags & FAULT_FLAG_WRITE;
1794  
1795  	/*
1796  	 * Make sure that the faulting address's PMD offset (colour) matches
1797  	 * the PMD offset from the start of the file.  This is necessary so
1798  	 * that a PMD range in the page table overlaps exactly with a PMD
1799  	 * range in the page cache.
1800  	 */
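	/*
	 * Worked example (assuming 4KiB pages and 2MiB PMDs, e.g. x86-64):
	 * PG_PMD_COLOUR == 511, so the low 9 bits of (vmf->address >>
	 * PAGE_SHIFT) must equal the low 9 bits of vmf->pgoff.  If the
	 * mapping is shifted by, say, one page relative to the file, every
	 * PMD-sized chunk of the page table would straddle two PMD-sized
	 * chunks of the file, so we have to fall back to PTEs.
	 */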
1801  	if ((vmf->pgoff & PG_PMD_COLOUR) !=
1802  	    ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR))
1803  		return true;
1804  
1805  	/* Fall back to PTEs if we're going to COW */
1806  	if (write && !(vmf->vma->vm_flags & VM_SHARED))
1807  		return true;
1808  
1809  	/* If the PMD would extend outside the VMA */
1810  	if (pmd_addr < vmf->vma->vm_start)
1811  		return true;
1812  	if ((pmd_addr + PMD_SIZE) > vmf->vma->vm_end)
1813  		return true;
1814  
1815  	/* If the PMD would extend beyond the file size */
1816  	if ((xas->xa_index | PG_PMD_COLOUR) >= max_pgoff)
1817  		return true;
1818  
1819  	return false;
1820  }
1821  
1822  static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
1823  			       const struct iomap_ops *ops)
1824  {
1825  	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
1826  	XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER);
1827  	struct iomap_iter iter = {
1828  		.inode		= mapping->host,
1829  		.len		= PMD_SIZE,
1830  		.flags		= IOMAP_DAX | IOMAP_FAULT,
1831  	};
1832  	vm_fault_t ret = VM_FAULT_FALLBACK;
1833  	pgoff_t max_pgoff;
1834  	void *entry;
1835  
1836  	if (vmf->flags & FAULT_FLAG_WRITE)
1837  		iter.flags |= IOMAP_WRITE;
1838  
1839  	/*
1840  	 * Check that the offset isn't beyond the end of the file now. The
1841  	 * caller is supposed to hold locks serializing us with truncate /
1842  	 * punch hole, so this is a reliable test.
1843  	 */
1844  	max_pgoff = DIV_ROUND_UP(i_size_read(iter.inode), PAGE_SIZE);
1845  
1846  	trace_dax_pmd_fault(iter.inode, vmf, max_pgoff, 0);
1847  
1848  	if (xas.xa_index >= max_pgoff) {
1849  		ret = VM_FAULT_SIGBUS;
1850  		goto out;
1851  	}
1852  
1853  	if (dax_fault_check_fallback(vmf, &xas, max_pgoff))
1854  		goto fallback;
1855  
1856  	/*
1857  	 * grab_mapping_entry() will make sure we get an empty PMD entry,
1858  	 * a zero PMD entry or a DAX PMD.  If it can't (because a PTE
1859  	 * entry is already in the array, for instance), it will return
1860  	 * VM_FAULT_FALLBACK.
1861  	 */
1862  	entry = grab_mapping_entry(&xas, mapping, PMD_ORDER);
1863  	if (xa_is_internal(entry)) {
1864  		ret = xa_to_internal(entry);
1865  		goto fallback;
1866  	}
1867  
1868  	/*
1869  	 * It is possible, particularly with mixed reads & writes to private
1870  	 * mappings, that we have raced with a PTE fault that overlaps with
1871  	 * the PMD we need to set up.  If so just return and the fault will be
1872  	 * retried.
1873  	 */
1874  	if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) &&
1875  			!pmd_devmap(*vmf->pmd)) {
1876  		ret = 0;
1877  		goto unlock_entry;
1878  	}
1879  
1880  	iter.pos = (loff_t)xas.xa_index << PAGE_SHIFT;
1881  	while (iomap_iter(&iter, ops) > 0) {
1882  		if (iomap_length(&iter) < PMD_SIZE)
1883  			continue; /* actually breaks out of the loop */
1884  
1885  		ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true);
1886  		if (ret != VM_FAULT_FALLBACK)
1887  			iter.processed = PMD_SIZE;
1888  	}
1889  
1890  unlock_entry:
1891  	dax_unlock_entry(&xas, entry);
1892  fallback:
1893  	if (ret == VM_FAULT_FALLBACK) {
1894  		split_huge_pmd(vmf->vma, vmf->pmd, vmf->address);
1895  		count_vm_event(THP_FAULT_FALLBACK);
1896  	}
1897  out:
1898  	trace_dax_pmd_fault_done(iter.inode, vmf, max_pgoff, ret);
1899  	return ret;
1900  }
1901  #else
1902  static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
1903  			       const struct iomap_ops *ops)
1904  {
1905  	return VM_FAULT_FALLBACK;
1906  }
1907  #endif /* CONFIG_FS_DAX_PMD */
1908  
1909  /**
1910   * dax_iomap_fault - handle a page fault on a DAX file
1911   * @vmf: The description of the fault
1912   * @order: Order of the page to fault in
1913   * @pfnp: PFN to insert for synchronous faults if fsync is required
1914   * @iomap_errp: Storage for detailed error code in case of error
1915   * @ops: Iomap ops passed from the file system
1916   *
1917   * When a page fault occurs, filesystems may call this helper in
1918   * their fault handler for DAX files. dax_iomap_fault() assumes the caller
1919   * has done all the necessary locking for the page fault to proceed
1920   * successfully.
1921   */
1922  vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order,
1923  		    pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops)
1924  {
1925  	if (order == 0)
1926  		return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops);
1927  	else if (order == PMD_ORDER)
1928  		return dax_iomap_pmd_fault(vmf, pfnp, ops);
1929  	else
1930  		return VM_FAULT_FALLBACK;
1931  }
1932  EXPORT_SYMBOL_GPL(dax_iomap_fault);
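/*
 * Illustrative only, not part of this file's code: a rough sketch of how a
 * filesystem might wire dax_iomap_fault() into its vm_operations_struct.
 * The example_* names and example_iomap_ops are placeholders, and the
 * ->huge_fault(vmf, order) signature is assumed to match the kernel this
 * file belongs to.  Real filesystems (e.g. ext4, xfs) add their own locking
 * and journalling around the call; handling of VM_FAULT_NEEDDSYNC is shown
 * after dax_finish_sync_fault() below.
 *
 *	static vm_fault_t example_huge_fault(struct vm_fault *vmf,
 *					     unsigned int order)
 *	{
 *		pfn_t pfn;
 *
 *		return dax_iomap_fault(vmf, order, &pfn, NULL,
 *				       &example_iomap_ops);
 *	}
 *
 *	static vm_fault_t example_fault(struct vm_fault *vmf)
 *	{
 *		return example_huge_fault(vmf, 0);
 *	}
 *
 *	static const struct vm_operations_struct example_dax_vm_ops = {
 *		.fault		= example_fault,
 *		.huge_fault	= example_huge_fault,
 *		.page_mkwrite	= example_fault,
 *		.pfn_mkwrite	= example_fault,
 *	};
 */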
1933  
1934  /*
1935   * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
1936   * @vmf: The description of the fault
1937   * @pfn: PFN to insert
1938   * @order: Order of entry to insert.
1939   *
1940   * This function inserts a writeable PTE or PMD entry into the page tables
1941   * for an mmapped DAX file.  It also marks the page cache entry as dirty.
1942   */
1943  static vm_fault_t
1944  dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
1945  {
1946  	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
1947  	XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order);
1948  	void *entry;
1949  	vm_fault_t ret;
1950  
1951  	xas_lock_irq(&xas);
1952  	entry = get_unlocked_entry(&xas, order);
1953  	/* Did we race with someone splitting or removing the entry? */
1954  	if (!entry || dax_is_conflict(entry) ||
1955  	    (order == 0 && !dax_is_pte_entry(entry))) {
1956  		put_unlocked_entry(&xas, entry, WAKE_NEXT);
1957  		xas_unlock_irq(&xas);
1958  		trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
1959  						      VM_FAULT_NOPAGE);
1960  		return VM_FAULT_NOPAGE;
1961  	}
1962  	xas_set_mark(&xas, PAGECACHE_TAG_DIRTY);
1963  	dax_lock_entry(&xas, entry);
1964  	xas_unlock_irq(&xas);
1965  	if (order == 0)
1966  		ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
1967  #ifdef CONFIG_FS_DAX_PMD
1968  	else if (order == PMD_ORDER)
1969  		ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE);
1970  #endif
1971  	else
1972  		ret = VM_FAULT_FALLBACK;
1973  	dax_unlock_entry(&xas, entry);
1974  	trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret);
1975  	return ret;
1976  }
1977  
1978  /**
1979   * dax_finish_sync_fault - finish synchronous page fault
1980   * @vmf: The description of the fault
1981   * @order: Order of entry to be inserted
1982   * @pfn: PFN to insert
1983   *
1984   * This function ensures that the file range touched by the page fault is
1985   * stored persistently on the media and then inserts the appropriate page
1986   * table entry.
1987   */
1988  vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order,
1989  		pfn_t pfn)
1990  {
1991  	int err;
1992  	loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
1993  	size_t len = PAGE_SIZE << order;
1994  
1995  	err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1);
1996  	if (err)
1997  		return VM_FAULT_SIGBUS;
1998  	return dax_insert_pfn_mkwrite(vmf, pfn, order);
1999  }
2000  EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
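/*
 * Illustrative only, not part of this file's code: a minimal sketch of how a
 * sync-fault capable filesystem's fault handler might consume
 * VM_FAULT_NEEDDSYNC.  dax_iomap_fault() fills in @pfn instead of mapping it
 * when the fault is synchronous, and the caller is expected to persist the
 * metadata before inserting the entry via dax_finish_sync_fault().  The names
 * example_sync_fault and example_iomap_ops are placeholders.
 *
 *	static vm_fault_t example_sync_fault(struct vm_fault *vmf,
 *					     unsigned int order)
 *	{
 *		vm_fault_t ret;
 *		pfn_t pfn;
 *
 *		ret = dax_iomap_fault(vmf, order, &pfn, NULL,
 *				      &example_iomap_ops);
 *		if (ret & VM_FAULT_NEEDDSYNC)
 *			ret = dax_finish_sync_fault(vmf, order, pfn);
 *		return ret;
 *	}
 */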
2001  
2002  static loff_t dax_range_compare_iter(struct iomap_iter *it_src,
2003  		struct iomap_iter *it_dest, u64 len, bool *same)
2004  {
2005  	const struct iomap *smap = &it_src->iomap;
2006  	const struct iomap *dmap = &it_dest->iomap;
2007  	loff_t pos1 = it_src->pos, pos2 = it_dest->pos;
2008  	void *saddr, *daddr;
2009  	int id, ret;
2010  
2011  	len = min(len, min(smap->length, dmap->length));
2012  
2013  	if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) {
2014  		*same = true;
2015  		return len;
2016  	}
2017  
2018  	if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) {
2019  		*same = false;
2020  		return 0;
2021  	}
2022  
2023  	id = dax_read_lock();
2024  	ret = dax_iomap_direct_access(smap, pos1, ALIGN(pos1 + len, PAGE_SIZE),
2025  				      &saddr, NULL);
2026  	if (ret < 0)
2027  		goto out_unlock;
2028  
2029  	ret = dax_iomap_direct_access(dmap, pos2, ALIGN(pos2 + len, PAGE_SIZE),
2030  				      &daddr, NULL);
2031  	if (ret < 0)
2032  		goto out_unlock;
2033  
2034  	*same = !memcmp(saddr, daddr, len);
2035  	if (!*same)
2036  		len = 0;
2037  	dax_read_unlock(id);
2038  	return len;
2039  
2040  out_unlock:
2041  	dax_read_unlock(id);
2042  	return -EIO;
2043  }
2044  
2045  int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
2046  		struct inode *dst, loff_t dstoff, loff_t len, bool *same,
2047  		const struct iomap_ops *ops)
2048  {
2049  	struct iomap_iter src_iter = {
2050  		.inode		= src,
2051  		.pos		= srcoff,
2052  		.len		= len,
2053  		.flags		= IOMAP_DAX,
2054  	};
2055  	struct iomap_iter dst_iter = {
2056  		.inode		= dst,
2057  		.pos		= dstoff,
2058  		.len		= len,
2059  		.flags		= IOMAP_DAX,
2060  	};
2061  	int ret, compared = 0;
2062  
2063  	while ((ret = iomap_iter(&src_iter, ops)) > 0 &&
2064  	       (ret = iomap_iter(&dst_iter, ops)) > 0) {
2065  		compared = dax_range_compare_iter(&src_iter, &dst_iter,
2066  				min(src_iter.len, dst_iter.len), same);
2067  		if (compared < 0)
2068  			return ret;
2069  		src_iter.processed = dst_iter.processed = compared;
2070  	}
2071  	return ret;
2072  }
2073  
2074  int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
2075  			      struct file *file_out, loff_t pos_out,
2076  			      loff_t *len, unsigned int remap_flags,
2077  			      const struct iomap_ops *ops)
2078  {
2079  	return __generic_remap_file_range_prep(file_in, pos_in, file_out,
2080  					       pos_out, len, remap_flags, ops);
2081  }
2082  EXPORT_SYMBOL_GPL(dax_remap_file_range_prep);
2083