// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/swap.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * This file contains the default values for the operation of the
 * Linux VM subsystem. Fine-tuning documentation can be found in
 * Documentation/admin-guide/sysctl/vm.rst.
 * Started 18.12.91
 * Swap aging added 23.2.95, Stephen Tweedie.
 * Buffermem limits added 12.3.98, Rik van Riel.
 */

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/mm_inline.h>
#include <linux/percpu_counter.h>
#include <linux/memremap.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/backing-dev.h>
#include <linux/memcontrol.h>
#include <linux/gfp.h>
#include <linux/uio.h>
#include <linux/hugetlb.h>
#include <linux/page_idle.h>
#include <linux/local_lock.h>
#include <linux/buffer_head.h>

#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/pagemap.h>

/* How many pages do we try to swap or page in/out together? As a power of 2 */
int page_cluster;
const int page_cluster_max = 31;
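/*
 * page_cluster is the log2 of the swap readahead window, exposed as the
 * vm.page-cluster sysctl (see Documentation/admin-guide/sysctl/vm.rst):
 * a value of 3 means up to 1 << 3 = 8 pages are read around a swap fault,
 * and page_cluster_max bounds the values the sysctl will accept.
 */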

struct cpu_fbatches {
	/*
	 * The following folio batches are grouped together because they are protected
	 * by disabling preemption (and interrupts remain enabled).
	 */
	local_lock_t lock;
	struct folio_batch lru_add;
	struct folio_batch lru_deactivate_file;
	struct folio_batch lru_deactivate;
	struct folio_batch lru_lazyfree;
#ifdef CONFIG_SMP
	struct folio_batch lru_activate;
#endif
	/* Protecting the following batches which require disabling interrupts */
	local_lock_t lock_irq;
	struct folio_batch lru_move_tail;
};

static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches) = {
	.lock = INIT_LOCAL_LOCK(lock),
	.lock_irq = INIT_LOCAL_LOCK(lock_irq),
};

static void __page_cache_release(struct folio *folio, struct lruvec **lruvecp,
		unsigned long *flagsp)
{
	if (folio_test_lru(folio)) {
		folio_lruvec_relock_irqsave(folio, lruvecp, flagsp);
		lruvec_del_folio(*lruvecp, folio);
		__folio_clear_lru_flags(folio);
	}
}

/*
 * This path almost never happens for VM activity - pages are normally freed
 * in batches.  But it gets used by networking - and for compound pages.
 */
static void page_cache_release(struct folio *folio)
{
	struct lruvec *lruvec = NULL;
	unsigned long flags;

	__page_cache_release(folio, &lruvec, &flags);
	if (lruvec)
		unlock_page_lruvec_irqrestore(lruvec, flags);
}

void __folio_put(struct folio *folio)
{
	if (unlikely(folio_is_zone_device(folio))) {
		free_zone_device_folio(folio);
		return;
	}

	if (folio_test_hugetlb(folio)) {
		free_huge_folio(folio);
		return;
	}

	page_cache_release(folio);
	folio_unqueue_deferred_split(folio);
	mem_cgroup_uncharge(folio);
	free_unref_page(&folio->page, folio_order(folio));
}
EXPORT_SYMBOL(__folio_put);

/**
 * put_pages_list() - release a list of pages
 * @pages: list of pages threaded on page->lru
 *
 * Release a list of pages which are strung together on page->lru.
 */
void put_pages_list(struct list_head *pages)
{
	struct folio_batch fbatch;
	struct folio *folio, *next;

	folio_batch_init(&fbatch);
	list_for_each_entry_safe(folio, next, pages, lru) {
		if (!folio_put_testzero(folio))
			continue;
		if (folio_test_hugetlb(folio)) {
			free_huge_folio(folio);
			continue;
		}
		/* LRU flag must be clear because it's passed using the lru */
		if (folio_batch_add(&fbatch, folio) > 0)
			continue;
		free_unref_folios(&fbatch);
	}

	if (fbatch.nr)
		free_unref_folios(&fbatch);
	INIT_LIST_HEAD(pages);
}
EXPORT_SYMBOL(put_pages_list);
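/*
 * Sketch of a typical (hypothetical) caller of put_pages_list(): gather folios
 * on a local list threaded through ->lru and release them in one call.  The
 * list head is reinitialised on return:
 *
 *	LIST_HEAD(pages);
 *	...
 *	list_add(&folio->lru, &pages);
 *	...
 *	put_pages_list(&pages);
 */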

typedef void (*move_fn_t)(struct lruvec *lruvec, struct folio *folio);

static void lru_add(struct lruvec *lruvec, struct folio *folio)
{
	int was_unevictable = folio_test_clear_unevictable(folio);
	long nr_pages = folio_nr_pages(folio);

	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);

	/*
	 * Is an smp_mb__after_atomic() still required here, before
	 * folio_evictable() tests the mlocked flag, to rule out the possibility
	 * of stranding an evictable folio on an unevictable LRU?  I think
	 * not, because __munlock_folio() only clears the mlocked flag
	 * while the LRU lock is held.
	 *
	 * (That is not true of __page_cache_release(), and not necessarily
	 * true of folios_put(): but those only clear the mlocked flag after
	 * folio_put_testzero() has excluded any other users of the folio.)
	 */
	if (folio_evictable(folio)) {
		if (was_unevictable)
			__count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
	} else {
		folio_clear_active(folio);
		folio_set_unevictable(folio);
		/*
		 * folio->mlock_count = !!folio_test_mlocked(folio)?
		 * But that leaves __mlock_folio() in doubt whether another
		 * actor has already counted the mlock or not.  Err on the
		 * safe side, underestimate, let page reclaim fix it, rather
		 * than leaving a page on the unevictable LRU indefinitely.
		 */
		folio->mlock_count = 0;
		if (!was_unevictable)
			__count_vm_events(UNEVICTABLE_PGCULLED, nr_pages);
	}

	lruvec_add_folio(lruvec, folio);
	trace_mm_lru_insertion(folio);
}

static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
{
	int i;
	struct lruvec *lruvec = NULL;
	unsigned long flags = 0;

	for (i = 0; i < folio_batch_count(fbatch); i++) {
		struct folio *folio = fbatch->folios[i];

		folio_lruvec_relock_irqsave(folio, &lruvec, &flags);
		move_fn(lruvec, folio);

		folio_set_lru(folio);
	}

	if (lruvec)
		unlock_page_lruvec_irqrestore(lruvec, flags);
	folios_put(fbatch);
}

static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch,
		struct folio *folio, move_fn_t move_fn,
		bool on_lru, bool disable_irq)
{
	unsigned long flags;

	if (on_lru && !folio_test_clear_lru(folio))
		return;

	folio_get(folio);

	if (disable_irq)
		local_lock_irqsave(&cpu_fbatches.lock_irq, flags);
	else
		local_lock(&cpu_fbatches.lock);

	if (!folio_batch_add(this_cpu_ptr(fbatch), folio) || folio_test_large(folio) ||
	    lru_cache_disabled())
		folio_batch_move_lru(this_cpu_ptr(fbatch), move_fn);

	if (disable_irq)
		local_unlock_irqrestore(&cpu_fbatches.lock_irq, flags);
	else
		local_unlock(&cpu_fbatches.lock);
}

#define folio_batch_add_and_move(folio, op, on_lru)						\
	__folio_batch_add_and_move(								\
		&cpu_fbatches.op,								\
		folio,										\
		op,										\
		on_lru,										\
		offsetof(struct cpu_fbatches, op) >= offsetof(struct cpu_fbatches, lock_irq)	\
	)
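
/*
 * folio_batch_add_and_move(folio, op, on_lru) pairs the per-cpu batch
 * cpu_fbatches.op with its move_fn of the same name, and derives the
 * disable_irq argument from the struct layout above: batches declared at or
 * after lock_irq need interrupts disabled, the rest only take the local lock.
 * For example, folio_batch_add_and_move(folio, lru_move_tail, true) queues
 * onto cpu_fbatches.lru_move_tail, drains with lru_move_tail(), and disables
 * interrupts while doing so.
 */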

static void lru_move_tail(struct lruvec *lruvec, struct folio *folio)
{
	if (folio_test_unevictable(folio))
		return;

	lruvec_del_folio(lruvec, folio);
	folio_clear_active(folio);
	lruvec_add_folio_tail(lruvec, folio);
	__count_vm_events(PGROTATED, folio_nr_pages(folio));
}

/*
 * Writeback is about to end against a folio which has been marked for
 * immediate reclaim.  If it still appears to be reclaimable, move it
 * to the tail of the inactive list.
 *
 * folio_rotate_reclaimable() must disable IRQs, to prevent nasty races.
 */
void folio_rotate_reclaimable(struct folio *folio)
{
	if (folio_test_locked(folio) || folio_test_dirty(folio) ||
	    folio_test_unevictable(folio))
		return;

	folio_batch_add_and_move(folio, lru_move_tail, true);
}
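
/*
 * folio_rotate_reclaimable() is normally reached from the writeback
 * completion path, e.g. folio_end_writeback() calls it when the reclaim
 * flag was set while the folio was under writeback.
 */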

void lru_note_cost(struct lruvec *lruvec, bool file,
		   unsigned int nr_io, unsigned int nr_rotated)
{
	unsigned long cost;

	/*
	 * Reflect the relative cost of incurring IO and spending CPU
	 * time on rotations. This doesn't attempt to make a precise
	 * comparison, it just says: if reloads are about comparable
	 * between the LRU lists, or rotations are overwhelmingly
	 * different between them, adjust scan balance for CPU work.
	 */
	cost = nr_io * SWAP_CLUSTER_MAX + nr_rotated;
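	/*
	 * For example, with SWAP_CLUSTER_MAX == 32, charging 2 IOs and
	 * 5 rotations yields cost = 2 * 32 + 5 = 69.
	 */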

	do {
		unsigned long lrusize;

		/*
		 * Holding lruvec->lru_lock is safe here, since either
		 * 1) the lruvec is pinned by the caller in reclaim, or
		 * 2) we come from a pre-LRU page during refault (which also
		 *    holds the rcu lock, so it would be safe even if the page
		 *    was on the LRU and could move simultaneously to a new
		 *    lruvec).
		 */
		spin_lock_irq(&lruvec->lru_lock);
		/* Record cost event */
		if (file)
			lruvec->file_cost += cost;
		else
			lruvec->anon_cost += cost;

		/*
		 * Decay previous events
		 *
		 * Because workloads change over time (and to avoid
		 * overflow) we keep these statistics as a floating
		 * average, which ends up weighing recent refaults
		 * more than old ones.
		 */
		lrusize = lruvec_page_state(lruvec, NR_INACTIVE_ANON) +
			  lruvec_page_state(lruvec, NR_ACTIVE_ANON) +
			  lruvec_page_state(lruvec, NR_INACTIVE_FILE) +
			  lruvec_page_state(lruvec, NR_ACTIVE_FILE);

		if (lruvec->file_cost + lruvec->anon_cost > lrusize / 4) {
			lruvec->file_cost /= 2;
			lruvec->anon_cost /= 2;
		}
		spin_unlock_irq(&lruvec->lru_lock);
	} while ((lruvec = parent_lruvec(lruvec)));
}

void lru_note_cost_refault(struct folio *folio)
{
	lru_note_cost(folio_lruvec(folio), folio_is_file_lru(folio),
		      folio_nr_pages(folio), 0);
}

static void lru_activate(struct lruvec *lruvec, struct folio *folio)
{
	long nr_pages = folio_nr_pages(folio);

	if (folio_test_active(folio) || folio_test_unevictable(folio))
		return;

	lruvec_del_folio(lruvec, folio);
	folio_set_active(folio);
	lruvec_add_folio(lruvec, folio);
	trace_mm_lru_activate(folio);

	__count_vm_events(PGACTIVATE, nr_pages);
	__count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, nr_pages);
}

#ifdef CONFIG_SMP
static void folio_activate_drain(int cpu)
{
	struct folio_batch *fbatch = &per_cpu(cpu_fbatches.lru_activate, cpu);

	if (folio_batch_count(fbatch))
		folio_batch_move_lru(fbatch, lru_activate);
}

void folio_activate(struct folio *folio)
{
	if (folio_test_active(folio) || folio_test_unevictable(folio))
		return;

	folio_batch_add_and_move(folio, lru_activate, true);
}

#else
static inline void folio_activate_drain(int cpu)
{
}

void folio_activate(struct folio *folio)
{
	struct lruvec *lruvec;

	if (!folio_test_clear_lru(folio))
		return;

	lruvec = folio_lruvec_lock_irq(folio);
	lru_activate(lruvec, folio);
	unlock_page_lruvec_irq(lruvec);
	folio_set_lru(folio);
}
#endif

static void __lru_cache_activate_folio(struct folio *folio)
{
	struct folio_batch *fbatch;
	int i;

	local_lock(&cpu_fbatches.lock);
	fbatch = this_cpu_ptr(&cpu_fbatches.lru_add);

	/*
	 * Search backwards on the optimistic assumption that the folio being
	 * activated has just been added to this batch. Note that only
	 * the local batch is examined as a !LRU folio could be in the
	 * process of being released, reclaimed, migrated or on a remote
	 * batch that is currently being drained. Furthermore, marking
	 * a remote batch's folio active potentially hits a race where
	 * a folio is marked active just after it is added to the inactive
	 * list causing accounting errors and BUG_ON checks to trigger.
	 */
	for (i = folio_batch_count(fbatch) - 1; i >= 0; i--) {
		struct folio *batch_folio = fbatch->folios[i];

		if (batch_folio == folio) {
			folio_set_active(folio);
			break;
		}
	}

	local_unlock(&cpu_fbatches.lock);
}

#ifdef CONFIG_LRU_GEN
static void folio_inc_refs(struct folio *folio)
{
	unsigned long new_flags, old_flags = READ_ONCE(folio->flags);

	if (folio_test_unevictable(folio))
		return;

	if (!folio_test_referenced(folio)) {
		folio_set_referenced(folio);
		return;
	}

	if (!folio_test_workingset(folio)) {
		folio_set_workingset(folio);
		return;
	}

	/* see the comment on MAX_NR_TIERS */
	do {
		new_flags = old_flags & LRU_REFS_MASK;
		if (new_flags == LRU_REFS_MASK)
			break;

		new_flags += BIT(LRU_REFS_PGOFF);
		new_flags |= old_flags & ~LRU_REFS_MASK;
	} while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
}
#else
static void folio_inc_refs(struct folio *folio)
{
}
#endif /* CONFIG_LRU_GEN */

/**
 * folio_mark_accessed - Mark a folio as having seen activity.
 * @folio: The folio to mark.
 *
 * This function will perform one of the following transitions:
 *
 * * inactive,unreferenced	->	inactive,referenced
 * * inactive,referenced	->	active,unreferenced
 * * active,unreferenced	->	active,referenced
 *
 * When a newly allocated folio is not yet visible to other users (and thus
 * safe for non-atomic ops), __folio_set_referenced() may be substituted for
 * folio_mark_accessed().
 */
void folio_mark_accessed(struct folio *folio)
{
	if (lru_gen_enabled()) {
		folio_inc_refs(folio);
		return;
	}

	if (!folio_test_referenced(folio)) {
		folio_set_referenced(folio);
	} else if (folio_test_unevictable(folio)) {
		/*
		 * Unevictable pages are on the "LRU_UNEVICTABLE" list. But,
		 * this list is never rotated or maintained, so marking an
		 * unevictable page accessed has no effect.
		 */
	} else if (!folio_test_active(folio)) {
		/*
		 * If the folio is on the LRU, queue it for activation via
		 * cpu_fbatches.lru_activate. Otherwise, assume the folio is in a
		 * folio_batch, mark it active and it'll be moved to the active
		 * LRU on the next drain.
		 */
		if (folio_test_lru(folio))
			folio_activate(folio);
		else
			__lru_cache_activate_folio(folio);
		folio_clear_referenced(folio);
		workingset_activation(folio);
	}
	if (folio_test_idle(folio))
		folio_clear_idle(folio);
}
EXPORT_SYMBOL(folio_mark_accessed);
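/*
 * Example of how folio_mark_accessed() is typically used (sketch): the page
 * cache read path calls it each time a folio is found, so a folio that is
 * inactive,unreferenced on the first access becomes inactive,referenced, and
 * a second access promotes it to active,unreferenced via folio_activate().
 */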

/**
 * folio_add_lru - Add a folio to an LRU list.
 * @folio: The folio to be added to the LRU.
 *
 * Queue the folio for addition to the LRU. The decision on whether
 * to add the folio to the [in]active [file|anon] list is deferred until the
 * folio_batch is drained. This gives a chance for the caller of folio_add_lru()
 * to have the folio added to the active list using folio_mark_accessed().
 */
void folio_add_lru(struct folio *folio)
{
	VM_BUG_ON_FOLIO(folio_test_active(folio) &&
			folio_test_unevictable(folio), folio);
	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);

	/* see the comment in lru_gen_add_folio() */
	if (lru_gen_enabled() && !folio_test_unevictable(folio) &&
	    lru_gen_in_fault() && !(current->flags & PF_MEMALLOC))
		folio_set_active(folio);

	folio_batch_add_and_move(folio, lru_add, false);
}
EXPORT_SYMBOL(folio_add_lru);
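/*
 * A typical caller is filemap_add_folio(), which inserts a freshly allocated
 * folio into the page cache and then calls folio_add_lru(); anonymous faults
 * reach this via folio_add_lru_vma() below.
 */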

/**
 * folio_add_lru_vma() - Add a folio to the appropriate LRU list for this VMA.
 * @folio: The folio to be added to the LRU.
 * @vma: VMA in which the folio is mapped.
 *
 * If the VMA is mlocked, @folio is added to the unevictable list.
 * Otherwise, it is treated the same way as folio_add_lru().
 */
void folio_add_lru_vma(struct folio *folio, struct vm_area_struct *vma)
{
	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);

	if (unlikely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED))
		mlock_new_folio(folio);
	else
		folio_add_lru(folio);
}

/*
 * If the folio cannot be invalidated, it is moved to the
 * inactive list to speed up its reclaim.  It is moved to the
 * head of the list, rather than the tail, to give the flusher
 * threads some time to write it out, as this is much more
 * effective than the single-page writeout from reclaim.
 *
 * If the folio isn't mapped and is dirty or under writeback, it can be
 * reclaimed ASAP by setting the reclaim flag.
 *
 * 1. active, mapped folio -> none
 * 2. active, dirty/writeback folio -> inactive, head, reclaim
 * 3. inactive, mapped folio -> none
 * 4. inactive, dirty/writeback folio -> inactive, head, reclaim
 * 5. inactive, clean -> inactive, tail
 * 6. Others -> none
 *
 * In 4, it moves to the head of the inactive list so the folio is
 * written out by flusher threads as this is much more efficient
 * than the single-page writeout from reclaim.
 */
static void lru_deactivate_file(struct lruvec *lruvec, struct folio *folio)
{
	bool active = folio_test_active(folio);
	long nr_pages = folio_nr_pages(folio);

	if (folio_test_unevictable(folio))
		return;

	/* Some processes are using the folio */
	if (folio_mapped(folio))
		return;

	lruvec_del_folio(lruvec, folio);
	folio_clear_active(folio);
	folio_clear_referenced(folio);

	if (folio_test_writeback(folio) || folio_test_dirty(folio)) {
		/*
		 * Setting the reclaim flag could race with
		 * folio_end_writeback() and confuse readahead.  But the
		 * race window is _really_ small and it's not a critical
		 * problem.
		 */
		lruvec_add_folio(lruvec, folio);
		folio_set_reclaim(folio);
	} else {
		/*
		 * The folio's writeback ended while it was in the batch.
		 * We move that folio to the tail of the inactive list.
		 */
		lruvec_add_folio_tail(lruvec, folio);
		__count_vm_events(PGROTATED, nr_pages);
	}

	if (active) {
		__count_vm_events(PGDEACTIVATE, nr_pages);
		__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
				     nr_pages);
	}
}

static void lru_deactivate(struct lruvec *lruvec, struct folio *folio)
{
	long nr_pages = folio_nr_pages(folio);

	if (folio_test_unevictable(folio) || !(folio_test_active(folio) || lru_gen_enabled()))
		return;

	lruvec_del_folio(lruvec, folio);
	folio_clear_active(folio);
	folio_clear_referenced(folio);
	lruvec_add_folio(lruvec, folio);

	__count_vm_events(PGDEACTIVATE, nr_pages);
	__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages);
}

static void lru_lazyfree(struct lruvec *lruvec, struct folio *folio)
{
	long nr_pages = folio_nr_pages(folio);

	if (!folio_test_anon(folio) || !folio_test_swapbacked(folio) ||
	    folio_test_swapcache(folio) || folio_test_unevictable(folio))
		return;

	lruvec_del_folio(lruvec, folio);
	folio_clear_active(folio);
	folio_clear_referenced(folio);
	/*
	 * Lazyfree folios are clean anonymous folios.  They have
	 * the swapbacked flag cleared, to distinguish them from normal
	 * anonymous folios.
	 */
	folio_clear_swapbacked(folio);
	lruvec_add_folio(lruvec, folio);

	__count_vm_events(PGLAZYFREE, nr_pages);
	__count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, nr_pages);
}

/*
 * Drain pages out of the cpu's folio_batch.
 * Either "cpu" is the current CPU, and preemption has already been
 * disabled; or "cpu" is being hot-unplugged, and is already dead.
 */
void lru_add_drain_cpu(int cpu)
{
	struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu);
	struct folio_batch *fbatch = &fbatches->lru_add;

	if (folio_batch_count(fbatch))
		folio_batch_move_lru(fbatch, lru_add);

	fbatch = &fbatches->lru_move_tail;
	/* Disabling interrupts below acts as a compiler barrier. */
	if (data_race(folio_batch_count(fbatch))) {
		unsigned long flags;

		/* No harm done if a racing interrupt already did this */
		local_lock_irqsave(&cpu_fbatches.lock_irq, flags);
		folio_batch_move_lru(fbatch, lru_move_tail);
		local_unlock_irqrestore(&cpu_fbatches.lock_irq, flags);
	}

	fbatch = &fbatches->lru_deactivate_file;
	if (folio_batch_count(fbatch))
		folio_batch_move_lru(fbatch, lru_deactivate_file);

	fbatch = &fbatches->lru_deactivate;
	if (folio_batch_count(fbatch))
		folio_batch_move_lru(fbatch, lru_deactivate);

	fbatch = &fbatches->lru_lazyfree;
	if (folio_batch_count(fbatch))
		folio_batch_move_lru(fbatch, lru_lazyfree);

	folio_activate_drain(cpu);
}

/**
 * deactivate_file_folio() - Deactivate a file folio.
 * @folio: Folio to deactivate.
 *
 * This function hints to the VM that @folio is a good reclaim candidate,
 * for example if its invalidation fails due to the folio being dirty
 * or under writeback.
 *
 * Context: Caller holds a reference on the folio.
 */
void deactivate_file_folio(struct folio *folio)
{
	/* Deactivating an unevictable folio will not accelerate reclaim */
	if (folio_test_unevictable(folio))
		return;

	folio_batch_add_and_move(folio, lru_deactivate_file, true);
}

/*
 * folio_deactivate - deactivate a folio
 * @folio: folio to deactivate
 *
 * folio_deactivate() moves @folio to the inactive list if @folio was on the
 * active list and was not unevictable. This is done to accelerate the
 * reclaim of @folio.
 */
void folio_deactivate(struct folio *folio)
{
	if (folio_test_unevictable(folio) || !(folio_test_active(folio) || lru_gen_enabled()))
		return;

	folio_batch_add_and_move(folio, lru_deactivate, true);
}

/**
 * folio_mark_lazyfree - make an anon folio lazyfree
 * @folio: folio to deactivate
 *
 * folio_mark_lazyfree() moves @folio to the inactive file list.
 * This is done to accelerate the reclaim of @folio.
 */
void folio_mark_lazyfree(struct folio *folio)
{
	if (!folio_test_anon(folio) || !folio_test_swapbacked(folio) ||
	    folio_test_swapcache(folio) || folio_test_unevictable(folio))
		return;

	folio_batch_add_and_move(folio, lru_lazyfree, true);
}
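
/*
 * folio_mark_lazyfree() is reached from the madvise(MADV_FREE) path: clearing
 * the swapbacked flag here is what later lets reclaim discard the folio
 * instead of swapping it out, provided it is still clean.
 */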

void lru_add_drain(void)
{
	local_lock(&cpu_fbatches.lock);
	lru_add_drain_cpu(smp_processor_id());
	local_unlock(&cpu_fbatches.lock);
	mlock_drain_local();
}

/*
 * It's called from per-cpu workqueue context in SMP case so
 * lru_add_drain_cpu and invalidate_bh_lrus_cpu should run on
 * the same cpu. It shouldn't be a problem in !SMP case since
 * there is only one core and the locks will disable preemption.
 */
static void lru_add_and_bh_lrus_drain(void)
{
	local_lock(&cpu_fbatches.lock);
	lru_add_drain_cpu(smp_processor_id());
	local_unlock(&cpu_fbatches.lock);
	invalidate_bh_lrus_cpu();
	mlock_drain_local();
}

void lru_add_drain_cpu_zone(struct zone *zone)
{
	local_lock(&cpu_fbatches.lock);
	lru_add_drain_cpu(smp_processor_id());
	drain_local_pages(zone);
	local_unlock(&cpu_fbatches.lock);
	mlock_drain_local();
}

#ifdef CONFIG_SMP

static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);

static void lru_add_drain_per_cpu(struct work_struct *dummy)
{
	lru_add_and_bh_lrus_drain();
}

static bool cpu_needs_drain(unsigned int cpu)
{
	struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu);

	/* Check these in order of likelihood that they're not zero */
	return folio_batch_count(&fbatches->lru_add) ||
		folio_batch_count(&fbatches->lru_move_tail) ||
		folio_batch_count(&fbatches->lru_deactivate_file) ||
		folio_batch_count(&fbatches->lru_deactivate) ||
		folio_batch_count(&fbatches->lru_lazyfree) ||
		folio_batch_count(&fbatches->lru_activate) ||
		need_mlock_drain(cpu) ||
		has_bh_in_lru(cpu, NULL);
}

/*
 * Doesn't need any cpu hotplug locking because we do rely on per-cpu
 * kworkers being shut down before our page_alloc_cpu_dead callback is
 * executed on the offlined cpu.
 * Calling this function with cpu hotplug locks held can actually lead
 * to obscure indirect dependencies via WQ context.
 */
static inline void __lru_add_drain_all(bool force_all_cpus)
{
	/*
	 * lru_drain_gen - Global pages generation number
	 *
	 * (A) Definition: global lru_drain_gen = x implies that all generations
	 *     0 < n <= x are already *scheduled* for draining.
	 *
	 * This is an optimization for the highly-contended use case where a
	 * user space workload keeps constantly generating a flow of pages for
	 * each CPU.
	 */
	static unsigned int lru_drain_gen;
	static struct cpumask has_work;
	static DEFINE_MUTEX(lock);
	unsigned cpu, this_gen;

	/*
	 * Make sure nobody triggers this path before mm_percpu_wq is fully
	 * initialized.
	 */
	if (WARN_ON(!mm_percpu_wq))
		return;

	/*
	 * Guarantee folio_batch counter stores visible by this CPU
	 * are visible to other CPUs before loading the current drain
	 * generation.
	 */
	smp_mb();

	/*
	 * (B) Locally cache global LRU draining generation number
	 *
	 * The read barrier ensures that the counter is loaded before the mutex
	 * is taken. It pairs with smp_mb() inside the mutex critical section
	 * at (D).
	 */
	this_gen = smp_load_acquire(&lru_drain_gen);

	mutex_lock(&lock);

	/*
	 * (C) Exit the draining operation if a newer generation, from another
	 * lru_add_drain_all(), was already scheduled for draining. Check (A).
	 */
	if (unlikely(this_gen != lru_drain_gen && !force_all_cpus))
		goto done;

	/*
	 * (D) Increment global generation number
	 *
	 * Pairs with smp_load_acquire() at (B), outside of the critical
	 * section. Use a full memory barrier to guarantee that the
	 * new global drain generation number is stored before loading
	 * folio_batch counters.
	 *
	 * This pairing must be done here, before the for_each_online_cpu loop
	 * below which drains the page vectors.
	 *
	 * Let x, y, and z represent some system CPU numbers, where x < y < z.
	 * Assume CPU #z is in the middle of the for_each_online_cpu loop
	 * below and has already reached CPU #y's per-cpu data. CPU #x comes
	 * along, adds some pages to its per-cpu vectors, then calls
	 * lru_add_drain_all().
	 *
	 * If the paired barrier is done at any later step, e.g. after the
	 * loop, CPU #x will just exit at (C) and miss flushing out all of its
	 * added pages.
	 */
	WRITE_ONCE(lru_drain_gen, lru_drain_gen + 1);
	smp_mb();

	cpumask_clear(&has_work);
	for_each_online_cpu(cpu) {
		struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);

		if (cpu_needs_drain(cpu)) {
			INIT_WORK(work, lru_add_drain_per_cpu);
			queue_work_on(cpu, mm_percpu_wq, work);
			__cpumask_set_cpu(cpu, &has_work);
		}
	}

	for_each_cpu(cpu, &has_work)
		flush_work(&per_cpu(lru_add_drain_work, cpu));

done:
	mutex_unlock(&lock);
}

void lru_add_drain_all(void)
{
	__lru_add_drain_all(false);
}
#else
void lru_add_drain_all(void)
{
	lru_add_drain();
}
#endif /* CONFIG_SMP */

atomic_t lru_disable_count = ATOMIC_INIT(0);

/*
 * lru_cache_disable() needs to be called before we start compiling
 * a list of folios to be migrated using folio_isolate_lru().
 * It drains folios on the LRU caches and then disables the caches on
 * all CPUs until lru_cache_enable() is called.
 *
 * Must be paired with a call to lru_cache_enable().
 */
void lru_cache_disable(void)
{
	atomic_inc(&lru_disable_count);
	/*
	 * Readers of lru_disable_count are protected by either disabling
	 * preemption or rcu_read_lock:
	 *
	 * preempt_disable, local_irq_disable  [bh_lru_lock()]
	 * rcu_read_lock		       [rt_spin_lock CONFIG_PREEMPT_RT]
	 * preempt_disable		       [local_lock !CONFIG_PREEMPT_RT]
	 *
	 * Since v5.1 kernel, synchronize_rcu() is guaranteed to wait on
	 * preempt_disable() regions of code. So any CPU which sees
	 * lru_disable_count = 0 will have exited the critical
	 * section when synchronize_rcu() returns.
	 */
	synchronize_rcu_expedited();
#ifdef CONFIG_SMP
	__lru_add_drain_all(true);
#else
	lru_add_and_bh_lrus_drain();
#endif
}
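
/*
 * Sketch of the intended pairing (hypothetical caller):
 *
 *	lru_cache_disable();
 *	... folio_isolate_lru() on the folios to be migrated ...
 *	... migrate_pages() ...
 *	lru_cache_enable();
 */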

/**
 * folios_put_refs - Reduce the reference count on a batch of folios.
 * @folios: The folios.
 * @refs: The number of refs to subtract from each folio.
 *
 * Like folio_put(), but for a batch of folios.  This is more efficient
 * than writing the loop yourself as it will optimise the locks which need
 * to be taken if the folios are freed.  The folios batch is returned
 * empty and ready to be reused for another batch; there is no need
 * to reinitialise it.  If @refs is NULL, we subtract one from each
 * folio refcount.
 *
 * Context: May be called in process or interrupt context, but not in NMI
 * context.  May be called while holding a spinlock.
 */
void folios_put_refs(struct folio_batch *folios, unsigned int *refs)
{
	int i, j;
	struct lruvec *lruvec = NULL;
	unsigned long flags = 0;

	for (i = 0, j = 0; i < folios->nr; i++) {
		struct folio *folio = folios->folios[i];
		unsigned int nr_refs = refs ? refs[i] : 1;

		if (is_huge_zero_folio(folio))
			continue;

		if (folio_is_zone_device(folio)) {
			if (lruvec) {
				unlock_page_lruvec_irqrestore(lruvec, flags);
				lruvec = NULL;
			}
			if (put_devmap_managed_folio_refs(folio, nr_refs))
				continue;
			if (folio_ref_sub_and_test(folio, nr_refs))
				free_zone_device_folio(folio);
			continue;
		}

		if (!folio_ref_sub_and_test(folio, nr_refs))
			continue;

		/* hugetlb has its own memcg */
		if (folio_test_hugetlb(folio)) {
			if (lruvec) {
				unlock_page_lruvec_irqrestore(lruvec, flags);
				lruvec = NULL;
			}
			free_huge_folio(folio);
			continue;
		}
		folio_unqueue_deferred_split(folio);
		__page_cache_release(folio, &lruvec, &flags);

		if (j != i)
			folios->folios[j] = folio;
		j++;
	}
	if (lruvec)
		unlock_page_lruvec_irqrestore(lruvec, flags);
	if (!j) {
		folio_batch_reinit(folios);
		return;
	}

	folios->nr = j;
	mem_cgroup_uncharge_folios(folios);
	free_unref_folios(folios);
}
EXPORT_SYMBOL(folios_put_refs);
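/*
 * folios_put() is the common wrapper around folios_put_refs() that passes
 * refs == NULL, i.e. it drops exactly one reference per folio in the batch;
 * folio_batch_move_lru() above uses it for that reason.
 */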

/**
 * release_pages - batched put_page()
 * @arg: array of pages to release
 * @nr: number of pages
 *
 * Decrement the reference count on all the pages in @arg.  If it
 * fell to zero, remove the page from the LRU and free it.
 *
 * Note that the argument can be an array of pages, encoded pages,
 * or folio pointers. We ignore any encoded bits, and turn any of
 * them into just a folio that gets free'd.
 */
void release_pages(release_pages_arg arg, int nr)
{
	struct folio_batch fbatch;
	int refs[PAGEVEC_SIZE];
	struct encoded_page **encoded = arg.encoded_pages;
	int i;

	folio_batch_init(&fbatch);
	for (i = 0; i < nr; i++) {
		/* Turn any of the argument types into a folio */
		struct folio *folio = page_folio(encoded_page_ptr(encoded[i]));

		/* Is our next entry actually "nr_pages" -> "nr_refs" ? */
		refs[fbatch.nr] = 1;
		if (unlikely(encoded_page_flags(encoded[i]) &
			     ENCODED_PAGE_BIT_NR_PAGES_NEXT))
			refs[fbatch.nr] = encoded_nr_pages(encoded[++i]);

		if (folio_batch_add(&fbatch, folio) > 0)
			continue;
		folios_put_refs(&fbatch, refs);
	}

	if (fbatch.nr)
		folios_put_refs(&fbatch, refs);
}
EXPORT_SYMBOL(release_pages);
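/*
 * Sketch of a typical (hypothetical) caller of release_pages(): drop the
 * reference taken on each entry of a plain page array (the release_pages_arg
 * union also accepts encoded_page or folio pointer arrays):
 *
 *	struct page *pages[16];
 *	int nr;
 *	... fill pages[0..nr-1], taking a reference on each ...
 *	release_pages(pages, nr);
 */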

/*
 * The folios which we're about to release may be in the deferred lru-addition
 * queues.  That would prevent them from really being freed right now.  That's
 * OK from a correctness point of view but is inefficient - those folios may be
 * cache-warm and we want to give them back to the page allocator ASAP.
 *
 * So __folio_batch_release() will drain those queues here.
 * folio_batch_move_lru() calls folios_put() directly to avoid
 * mutual recursion.
 */
void __folio_batch_release(struct folio_batch *fbatch)
{
	if (!fbatch->percpu_pvec_drained) {
		lru_add_drain();
		fbatch->percpu_pvec_drained = true;
	}
	folios_put(fbatch);
}
EXPORT_SYMBOL(__folio_batch_release);

/**
 * folio_batch_remove_exceptionals() - Prune non-folios from a batch.
 * @fbatch: The batch to prune
 *
 * find_get_entries() fills a batch with both folios and shadow/swap/DAX
 * entries.  This function prunes all the non-folio entries from @fbatch
 * without leaving holes, so that it can be passed on to folio-only batch
 * operations.
 */
void folio_batch_remove_exceptionals(struct folio_batch *fbatch)
{
	unsigned int i, j;

	for (i = 0, j = 0; i < folio_batch_count(fbatch); i++) {
		struct folio *folio = fbatch->folios[i];
		if (!xa_is_value(folio))
			fbatch->folios[j++] = folio;
	}
	fbatch->nr = j;
}

/*
 * Perform any setup for the swap system
 */
void __init swap_setup(void)
{
	unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT);

	/* Use a smaller cluster for small-memory machines */
	if (megs < 16)
		page_cluster = 2;
	else
		page_cluster = 3;
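	/*
	 * With 4 KiB pages this gives swap readahead clusters of 4 pages
	 * (16 KiB) on sub-16 MiB machines and 8 pages (32 KiB) otherwise.
	 */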
	/*
	 * Right now other parts of the system mean that we
	 * _really_ don't want to cluster much more
	 */
}