1  // SPDX-License-Identifier: GPL-2.0
2  /*
3   * Copyright (c) 2000-2005 Silicon Graphics, Inc.
4   * All Rights Reserved.
5   */
6  #include "xfs.h"
7  #include "xfs_fs.h"
8  #include "xfs_shared.h"
9  #include "xfs_format.h"
10  #include "xfs_log_format.h"
11  #include "xfs_trans_resv.h"
12  #include "xfs_mount.h"
13  #include "xfs_inode.h"
14  #include "xfs_trans.h"
15  #include "xfs_trans_priv.h"
16  #include "xfs_inode_item.h"
17  #include "xfs_quota.h"
18  #include "xfs_trace.h"
19  #include "xfs_icache.h"
20  #include "xfs_bmap_util.h"
21  #include "xfs_dquot_item.h"
22  #include "xfs_dquot.h"
23  #include "xfs_reflink.h"
24  #include "xfs_ialloc.h"
25  #include "xfs_ag.h"
26  #include "xfs_log_priv.h"
27  #include "xfs_health.h"
28  
29  #include <linux/iversion.h>
30  
/* Radix tree tags for incore inode tree. */

/* inode is to be reclaimed */
#define XFS_ICI_RECLAIM_TAG	0
/* Inode has speculative preallocations (posteof or cow) to clean. */
#define XFS_ICI_BLOCKGC_TAG	1

/*
 * The goal for walking incore inodes.  These can correspond with incore inode
 * radix tree tags when convenient.  Avoid existing XFS_IWALK namespace.
 */
enum xfs_icwalk_goal {
	/* Goals directly associated with tagged inodes. */
	XFS_ICWALK_BLOCKGC	= XFS_ICI_BLOCKGC_TAG,
	XFS_ICWALK_RECLAIM	= XFS_ICI_RECLAIM_TAG,
};

/*
 * Forward declarations of the static incore inode walkers, which are used
 * in this file before their definitions.
 */
static int xfs_icwalk(struct xfs_mount *mp,
		enum xfs_icwalk_goal goal, struct xfs_icwalk *icw);
static int xfs_icwalk_ag(struct xfs_perag *pag,
		enum xfs_icwalk_goal goal, struct xfs_icwalk *icw);

/*
 * Private inode cache walk flags for struct xfs_icwalk.  Must not
 * coincide with XFS_ICWALK_FLAGS_VALID.
 */

/* Stop scanning after icw_scan_limit inodes. */
#define XFS_ICWALK_FLAG_SCAN_LIMIT	(1U << 28)

/* Allow reclaim to grab inodes that have health problems (i_sick) recorded. */
#define XFS_ICWALK_FLAG_RECLAIM_SICK	(1U << 27)
#define XFS_ICWALK_FLAG_UNION		(1U << 26) /* union filter algorithm */

/* All of the private flags above, combined into one mask. */
#define XFS_ICWALK_PRIVATE_FLAGS	(XFS_ICWALK_FLAG_SCAN_LIMIT | \
					 XFS_ICWALK_FLAG_RECLAIM_SICK | \
					 XFS_ICWALK_FLAG_UNION)

/*
 * Marks for the perag xarray.  Each mark mirrors one of the XFS_ICI_*_TAG
 * radix tree tags above; ici_tag_to_mark() does the translation.
 */
#define XFS_PERAG_RECLAIM_MARK	XA_MARK_0
#define XFS_PERAG_BLOCKGC_MARK	XA_MARK_1
71  
ici_tag_to_mark(unsigned int tag)72  static inline xa_mark_t ici_tag_to_mark(unsigned int tag)
73  {
74  	if (tag == XFS_ICI_RECLAIM_TAG)
75  		return XFS_PERAG_RECLAIM_MARK;
76  	ASSERT(tag == XFS_ICI_BLOCKGC_TAG);
77  	return XFS_PERAG_BLOCKGC_MARK;
78  }
79  
80  /*
81   * Allocate and initialise an xfs_inode.
82   */
83  struct xfs_inode *
xfs_inode_alloc(struct xfs_mount * mp,xfs_ino_t ino)84  xfs_inode_alloc(
85  	struct xfs_mount	*mp,
86  	xfs_ino_t		ino)
87  {
88  	struct xfs_inode	*ip;
89  
90  	/*
91  	 * XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL
92  	 * and return NULL here on ENOMEM.
93  	 */
94  	ip = alloc_inode_sb(mp->m_super, xfs_inode_cache, GFP_KERNEL | __GFP_NOFAIL);
95  
96  	if (inode_init_always(mp->m_super, VFS_I(ip))) {
97  		kmem_cache_free(xfs_inode_cache, ip);
98  		return NULL;
99  	}
100  
101  	/* VFS doesn't initialise i_mode! */
102  	VFS_I(ip)->i_mode = 0;
103  	mapping_set_folio_min_order(VFS_I(ip)->i_mapping,
104  				    M_IGEO(mp)->min_folio_order);
105  
106  	XFS_STATS_INC(mp, vn_active);
107  	ASSERT(atomic_read(&ip->i_pincount) == 0);
108  	ASSERT(ip->i_ino == 0);
109  
110  	/* initialise the xfs inode */
111  	ip->i_ino = ino;
112  	ip->i_mount = mp;
113  	memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
114  	ip->i_cowfp = NULL;
115  	memset(&ip->i_af, 0, sizeof(ip->i_af));
116  	ip->i_af.if_format = XFS_DINODE_FMT_EXTENTS;
117  	memset(&ip->i_df, 0, sizeof(ip->i_df));
118  	ip->i_flags = 0;
119  	ip->i_delayed_blks = 0;
120  	ip->i_diflags2 = mp->m_ino_geo.new_diflags2;
121  	ip->i_nblocks = 0;
122  	ip->i_forkoff = 0;
123  	ip->i_sick = 0;
124  	ip->i_checked = 0;
125  	INIT_WORK(&ip->i_ioend_work, xfs_end_io);
126  	INIT_LIST_HEAD(&ip->i_ioend_list);
127  	spin_lock_init(&ip->i_ioend_lock);
128  	ip->i_next_unlinked = NULLAGINO;
129  	ip->i_prev_unlinked = 0;
130  
131  	return ip;
132  }
133  
134  STATIC void
xfs_inode_free_callback(struct rcu_head * head)135  xfs_inode_free_callback(
136  	struct rcu_head		*head)
137  {
138  	struct inode		*inode = container_of(head, struct inode, i_rcu);
139  	struct xfs_inode	*ip = XFS_I(inode);
140  
141  	switch (VFS_I(ip)->i_mode & S_IFMT) {
142  	case S_IFREG:
143  	case S_IFDIR:
144  	case S_IFLNK:
145  		xfs_idestroy_fork(&ip->i_df);
146  		break;
147  	}
148  
149  	xfs_ifork_zap_attr(ip);
150  
151  	if (ip->i_cowfp) {
152  		xfs_idestroy_fork(ip->i_cowfp);
153  		kmem_cache_free(xfs_ifork_cache, ip->i_cowfp);
154  	}
155  	if (ip->i_itemp) {
156  		ASSERT(!test_bit(XFS_LI_IN_AIL,
157  				 &ip->i_itemp->ili_item.li_flags));
158  		xfs_inode_item_destroy(ip);
159  		ip->i_itemp = NULL;
160  	}
161  
162  	kmem_cache_free(xfs_inode_cache, ip);
163  }
164  
165  static void
__xfs_inode_free(struct xfs_inode * ip)166  __xfs_inode_free(
167  	struct xfs_inode	*ip)
168  {
169  	/* asserts to verify all state is correct here */
170  	ASSERT(atomic_read(&ip->i_pincount) == 0);
171  	ASSERT(!ip->i_itemp || list_empty(&ip->i_itemp->ili_item.li_bio_list));
172  	XFS_STATS_DEC(ip->i_mount, vn_active);
173  
174  	call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
175  }
176  
177  void
xfs_inode_free(struct xfs_inode * ip)178  xfs_inode_free(
179  	struct xfs_inode	*ip)
180  {
181  	ASSERT(!xfs_iflags_test(ip, XFS_IFLUSHING));
182  
183  	/*
184  	 * Because we use RCU freeing we need to ensure the inode always
185  	 * appears to be reclaimed with an invalid inode number when in the
186  	 * free state. The ip->i_flags_lock provides the barrier against lookup
187  	 * races.
188  	 */
189  	spin_lock(&ip->i_flags_lock);
190  	ip->i_flags = XFS_IRECLAIM;
191  	ip->i_ino = 0;
192  	spin_unlock(&ip->i_flags_lock);
193  
194  	__xfs_inode_free(ip);
195  }
196  
197  /*
198   * Queue background inode reclaim work if there are reclaimable inodes and there
199   * isn't reclaim work already scheduled or in progress.
200   */
201  static void
xfs_reclaim_work_queue(struct xfs_mount * mp)202  xfs_reclaim_work_queue(
203  	struct xfs_mount        *mp)
204  {
205  
206  	rcu_read_lock();
207  	if (xa_marked(&mp->m_perags, XFS_PERAG_RECLAIM_MARK)) {
208  		queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
209  			msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
210  	}
211  	rcu_read_unlock();
212  }
213  
214  /*
215   * Background scanning to trim preallocated space. This is queued based on the
216   * 'speculative_prealloc_lifetime' tunable (5m by default).
217   */
218  static inline void
xfs_blockgc_queue(struct xfs_perag * pag)219  xfs_blockgc_queue(
220  	struct xfs_perag	*pag)
221  {
222  	struct xfs_mount	*mp = pag->pag_mount;
223  
224  	if (!xfs_is_blockgc_enabled(mp))
225  		return;
226  
227  	rcu_read_lock();
228  	if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG))
229  		queue_delayed_work(pag->pag_mount->m_blockgc_wq,
230  				   &pag->pag_blockgc_work,
231  				   msecs_to_jiffies(xfs_blockgc_secs * 1000));
232  	rcu_read_unlock();
233  }
234  
235  /* Set a tag on both the AG incore inode tree and the AG radix tree. */
236  static void
xfs_perag_set_inode_tag(struct xfs_perag * pag,xfs_agino_t agino,unsigned int tag)237  xfs_perag_set_inode_tag(
238  	struct xfs_perag	*pag,
239  	xfs_agino_t		agino,
240  	unsigned int		tag)
241  {
242  	struct xfs_mount	*mp = pag->pag_mount;
243  	bool			was_tagged;
244  
245  	lockdep_assert_held(&pag->pag_ici_lock);
246  
247  	was_tagged = radix_tree_tagged(&pag->pag_ici_root, tag);
248  	radix_tree_tag_set(&pag->pag_ici_root, agino, tag);
249  
250  	if (tag == XFS_ICI_RECLAIM_TAG)
251  		pag->pag_ici_reclaimable++;
252  
253  	if (was_tagged)
254  		return;
255  
256  	/* propagate the tag up into the perag radix tree */
257  	xa_set_mark(&mp->m_perags, pag->pag_agno, ici_tag_to_mark(tag));
258  
259  	/* start background work */
260  	switch (tag) {
261  	case XFS_ICI_RECLAIM_TAG:
262  		xfs_reclaim_work_queue(mp);
263  		break;
264  	case XFS_ICI_BLOCKGC_TAG:
265  		xfs_blockgc_queue(pag);
266  		break;
267  	}
268  
269  	trace_xfs_perag_set_inode_tag(pag, _RET_IP_);
270  }
271  
272  /* Clear a tag on both the AG incore inode tree and the AG radix tree. */
273  static void
xfs_perag_clear_inode_tag(struct xfs_perag * pag,xfs_agino_t agino,unsigned int tag)274  xfs_perag_clear_inode_tag(
275  	struct xfs_perag	*pag,
276  	xfs_agino_t		agino,
277  	unsigned int		tag)
278  {
279  	struct xfs_mount	*mp = pag->pag_mount;
280  
281  	lockdep_assert_held(&pag->pag_ici_lock);
282  
283  	/*
284  	 * Reclaim can signal (with a null agino) that it cleared its own tag
285  	 * by removing the inode from the radix tree.
286  	 */
287  	if (agino != NULLAGINO)
288  		radix_tree_tag_clear(&pag->pag_ici_root, agino, tag);
289  	else
290  		ASSERT(tag == XFS_ICI_RECLAIM_TAG);
291  
292  	if (tag == XFS_ICI_RECLAIM_TAG)
293  		pag->pag_ici_reclaimable--;
294  
295  	if (radix_tree_tagged(&pag->pag_ici_root, tag))
296  		return;
297  
298  	/* clear the tag from the perag radix tree */
299  	xa_clear_mark(&mp->m_perags, pag->pag_agno, ici_tag_to_mark(tag));
300  
301  	trace_xfs_perag_clear_inode_tag(pag, _RET_IP_);
302  }
303  
304  /*
305   * Find the next AG after @pag, or the first AG if @pag is NULL.
306   */
307  static struct xfs_perag *
xfs_perag_grab_next_tag(struct xfs_mount * mp,struct xfs_perag * pag,int tag)308  xfs_perag_grab_next_tag(
309  	struct xfs_mount	*mp,
310  	struct xfs_perag	*pag,
311  	int			tag)
312  {
313  	unsigned long		index = 0;
314  
315  	if (pag) {
316  		index = pag->pag_agno + 1;
317  		xfs_perag_rele(pag);
318  	}
319  
320  	rcu_read_lock();
321  	pag = xa_find(&mp->m_perags, &index, ULONG_MAX, ici_tag_to_mark(tag));
322  	if (pag) {
323  		trace_xfs_perag_grab_next_tag(pag, _RET_IP_);
324  		if (!atomic_inc_not_zero(&pag->pag_active_ref))
325  			pag = NULL;
326  	}
327  	rcu_read_unlock();
328  	return pag;
329  }
330  
331  /*
332   * When we recycle a reclaimable inode, we need to re-initialise the VFS inode
333   * part of the structure. This is made more complex by the fact we store
334   * information about the on-disk values in the VFS inode and so we can't just
335   * overwrite the values unconditionally. Hence we save the parameters we
336   * need to retain across reinitialisation, and rewrite them into the VFS inode
337   * after reinitialisation even if it fails.
338   */
339  static int
xfs_reinit_inode(struct xfs_mount * mp,struct inode * inode)340  xfs_reinit_inode(
341  	struct xfs_mount	*mp,
342  	struct inode		*inode)
343  {
344  	int			error;
345  	uint32_t		nlink = inode->i_nlink;
346  	uint32_t		generation = inode->i_generation;
347  	uint64_t		version = inode_peek_iversion(inode);
348  	umode_t			mode = inode->i_mode;
349  	dev_t			dev = inode->i_rdev;
350  	kuid_t			uid = inode->i_uid;
351  	kgid_t			gid = inode->i_gid;
352  	unsigned long		state = inode->i_state;
353  
354  	error = inode_init_always(mp->m_super, inode);
355  
356  	set_nlink(inode, nlink);
357  	inode->i_generation = generation;
358  	inode_set_iversion_queried(inode, version);
359  	inode->i_mode = mode;
360  	inode->i_rdev = dev;
361  	inode->i_uid = uid;
362  	inode->i_gid = gid;
363  	inode->i_state = state;
364  	mapping_set_folio_min_order(inode->i_mapping,
365  				    M_IGEO(mp)->min_folio_order);
366  	return error;
367  }
368  
/*
 * Carefully nudge an inode whose VFS state has been torn down back into a
 * usable state.  Drops the i_flags_lock and the rcu read lock.
 *
 * This function unlocks ip->i_flags_lock and the RCU read lock without taking
 * them first, so the caller must hold both on entry.
 *
 * Returns:
 *	0		the inode was recycled: XFS_INEW is set, the reclaim
 *			tag has been cleared in @pag and the VFS inode state is
 *			I_NEW.  Neither lock is held any more.
 *	-EAGAIN		the ILOCK could not be taken without blocking.  Note
 *			that in this case i_flags_lock and the RCU read lock
 *			have NOT been dropped; the caller must release them.
 *	other		xfs_reinit_inode() failed.  XFS_INEW and XFS_IRECLAIM
 *			have been cleared again and neither lock is held.
 */
static int
xfs_iget_recycle(
	struct xfs_perag	*pag,
	struct xfs_inode	*ip) __releases(&ip->i_flags_lock)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct inode		*inode = VFS_I(ip);
	int			error;

	trace_xfs_iget_recycle(ip);

	/* Early exit: both i_flags_lock and RCU are still held here. */
	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
		return -EAGAIN;

	/*
	 * We need to make it look like the inode is being reclaimed to prevent
	 * the actual reclaim workers from stomping over us while we recycle
	 * the inode.  We can't clear the radix tree tag yet as it requires
	 * pag_ici_lock to be held exclusive.
	 */
	ip->i_flags |= XFS_IRECLAIM;

	spin_unlock(&ip->i_flags_lock);
	rcu_read_unlock();

	ASSERT(!rwsem_is_locked(&inode->i_rwsem));
	error = xfs_reinit_inode(mp, inode);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	if (error) {
		/*
		 * Re-initializing the inode failed, and we are in deep
		 * trouble.  Try to re-add it to the reclaim list.
		 */
		rcu_read_lock();
		spin_lock(&ip->i_flags_lock);
		ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
		ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
		spin_unlock(&ip->i_flags_lock);
		rcu_read_unlock();

		trace_xfs_iget_recycle_fail(ip);
		return error;
	}

	/* pag_ici_lock is taken first, with i_flags_lock nested inside it. */
	spin_lock(&pag->pag_ici_lock);
	spin_lock(&ip->i_flags_lock);

	/*
	 * Clear the per-lifetime state in the inode as we are now effectively
	 * a new inode and need to return to the initial state before reuse
	 * occurs.
	 */
	ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
	ip->i_flags |= XFS_INEW;
	xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
			XFS_ICI_RECLAIM_TAG);
	inode->i_state = I_NEW;
	spin_unlock(&ip->i_flags_lock);
	spin_unlock(&pag->pag_ici_lock);

	return 0;
}
435  
436  /*
437   * If we are allocating a new inode, then check what was returned is
438   * actually a free, empty inode. If we are not allocating an inode,
439   * then check we didn't find a free inode.
440   *
441   * Returns:
442   *	0		if the inode free state matches the lookup context
443   *	-ENOENT		if the inode is free and we are not allocating
444   *	-EFSCORRUPTED	if there is any state mismatch at all
445   */
446  static int
xfs_iget_check_free_state(struct xfs_inode * ip,int flags)447  xfs_iget_check_free_state(
448  	struct xfs_inode	*ip,
449  	int			flags)
450  {
451  	if (flags & XFS_IGET_CREATE) {
452  		/* should be a free inode */
453  		if (VFS_I(ip)->i_mode != 0) {
454  			xfs_warn(ip->i_mount,
455  "Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)",
456  				ip->i_ino, VFS_I(ip)->i_mode);
457  			xfs_agno_mark_sick(ip->i_mount,
458  					XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
459  					XFS_SICK_AG_INOBT);
460  			return -EFSCORRUPTED;
461  		}
462  
463  		if (ip->i_nblocks != 0) {
464  			xfs_warn(ip->i_mount,
465  "Corruption detected! Free inode 0x%llx has blocks allocated!",
466  				ip->i_ino);
467  			xfs_agno_mark_sick(ip->i_mount,
468  					XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
469  					XFS_SICK_AG_INOBT);
470  			return -EFSCORRUPTED;
471  		}
472  		return 0;
473  	}
474  
475  	/* should be an allocated inode */
476  	if (VFS_I(ip)->i_mode == 0)
477  		return -ENOENT;
478  
479  	return 0;
480  }
481  
482  /* Make all pending inactivation work start immediately. */
483  static bool
xfs_inodegc_queue_all(struct xfs_mount * mp)484  xfs_inodegc_queue_all(
485  	struct xfs_mount	*mp)
486  {
487  	struct xfs_inodegc	*gc;
488  	int			cpu;
489  	bool			ret = false;
490  
491  	for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
492  		gc = per_cpu_ptr(mp->m_inodegc, cpu);
493  		if (!llist_empty(&gc->list)) {
494  			mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
495  			ret = true;
496  		}
497  	}
498  
499  	return ret;
500  }
501  
502  /* Wait for all queued work and collect errors */
503  static int
xfs_inodegc_wait_all(struct xfs_mount * mp)504  xfs_inodegc_wait_all(
505  	struct xfs_mount	*mp)
506  {
507  	int			cpu;
508  	int			error = 0;
509  
510  	flush_workqueue(mp->m_inodegc_wq);
511  	for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
512  		struct xfs_inodegc	*gc;
513  
514  		gc = per_cpu_ptr(mp->m_inodegc, cpu);
515  		if (gc->error && !error)
516  			error = gc->error;
517  		gc->error = 0;
518  	}
519  
520  	return error;
521  }
522  
/*
 * Check the validity of the inode we just found in the cache.
 *
 * Every return path leaves the RCU read lock released; the function never
 * takes it itself, so the caller must hold it on entry.
 *
 * @pag:	perag the inode was looked up in
 * @ip:		inode returned by the radix tree lookup
 * @ino:	inode number the caller asked for
 * @flags:	XFS_IGET_* lookup flags
 * @lock_flags:	inode locks to take before returning success, or 0 for none
 *
 * Returns:
 *	0		the inode is usable; it is locked per @lock_flags
 *	-EAGAIN		the inode is in a transitional state (or its number no
 *			longer matches), try the lookup again
 *	-ENOENT		the inode needs inactivation and has no links left, or
 *			the free state check found a free inode
 *	other		error from xfs_iget_check_free_state() or
 *			xfs_iget_recycle()
 */
static int
xfs_iget_cache_hit(
	struct xfs_perag	*pag,
	struct xfs_inode	*ip,
	xfs_ino_t		ino,
	int			flags,
	int			lock_flags) __releases(RCU)
{
	struct inode		*inode = VFS_I(ip);
	struct xfs_mount	*mp = ip->i_mount;
	int			error;

	/*
	 * check for re-use of an inode within an RCU grace period due to the
	 * radix tree nodes not being updated yet. We monitor for this by
	 * setting the inode number to zero before freeing the inode structure.
	 * If the inode has been reallocated and set up, then the inode number
	 * will not match, so check for that, too.
	 */
	spin_lock(&ip->i_flags_lock);
	if (ip->i_ino != ino)
		goto out_skip;

	/*
	 * If we are racing with another cache hit that is currently
	 * instantiating this inode or currently recycling it out of
	 * reclaimable state, wait for the initialisation to complete
	 * before continuing.
	 *
	 * If we're racing with the inactivation worker we also want to wait.
	 * If we're creating a new file, it's possible that the worker
	 * previously marked the inode as free on disk but hasn't finished
	 * updating the incore state yet.  The AGI buffer will be dirty and
	 * locked to the icreate transaction, so a synchronous push of the
	 * inodegc workers would result in deadlock.  For a regular iget, the
	 * worker is running already, so we might as well wait.
	 *
	 * XXX(hch): eventually we should do something equivalent to
	 *	     wait_on_inode to wait for these flags to be cleared
	 *	     instead of polling for it.
	 */
	if (ip->i_flags & (XFS_INEW | XFS_IRECLAIM | XFS_INACTIVATING))
		goto out_skip;

	if (ip->i_flags & XFS_NEED_INACTIVE) {
		/* Unlinked inodes cannot be re-grabbed. */
		if (VFS_I(ip)->i_nlink == 0) {
			error = -ENOENT;
			goto out_error;
		}
		goto out_inodegc_flush;
	}

	/*
	 * Check the inode free state is valid. This also detects lookup
	 * racing with unlinks.
	 */
	error = xfs_iget_check_free_state(ip, flags);
	if (error)
		goto out_error;

	/* Skip inodes that have no vfs state. */
	if ((flags & XFS_IGET_INCORE) &&
	    (ip->i_flags & XFS_IRECLAIMABLE))
		goto out_skip;

	/* The inode fits the selection criteria; process it. */
	if (ip->i_flags & XFS_IRECLAIMABLE) {
		/*
		 * Drops i_flags_lock and RCU read lock, except when it returns
		 * -EAGAIN: then both are still held and out_skip drops them.
		 */
		error = xfs_iget_recycle(pag, ip);
		if (error == -EAGAIN)
			goto out_skip;
		if (error)
			return error;
	} else {
		/* If the VFS inode is being torn down, pause and try again. */
		if (!igrab(inode))
			goto out_skip;

		/* We've got a live one. */
		spin_unlock(&ip->i_flags_lock);
		rcu_read_unlock();
		trace_xfs_iget_hit(ip);
	}

	/* Both locks are gone by now, so taking the inode lock may block. */
	if (lock_flags != 0)
		xfs_ilock(ip, lock_flags);

	if (!(flags & XFS_IGET_INCORE))
		xfs_iflags_clear(ip, XFS_ISTALE);
	XFS_STATS_INC(mp, xs_ig_found);

	return 0;

	/* Transient condition: count it and have the caller retry. */
out_skip:
	trace_xfs_iget_skip(ip);
	XFS_STATS_INC(mp, xs_ig_frecycle);
	error = -EAGAIN;
	/* Common exit for paths that still hold i_flags_lock and RCU. */
out_error:
	spin_unlock(&ip->i_flags_lock);
	rcu_read_unlock();
	return error;

out_inodegc_flush:
	spin_unlock(&ip->i_flags_lock);
	rcu_read_unlock();
	/*
	 * Do not wait for the workers, because the caller could hold an AGI
	 * buffer lock.  We're just going to sleep in a loop anyway.
	 */
	if (xfs_is_inodegc_enabled(mp))
		xfs_inodegc_queue_all(mp);
	return -EAGAIN;
}
640  
/*
 * Handle an inode cache miss: allocate a new incore inode for @ino, map it to
 * its location on disk, read the on-disk inode where that is required, and
 * insert it into the AG's incore inode radix tree.
 *
 * @mp:		mount the inode belongs to
 * @pag:	perag covering @ino
 * @tp:		transaction passed to the mapping and buffer calls
 * @ino:	inode number to instantiate
 * @ipp:	on success, set to the new inode
 * @flags:	XFS_IGET_* lookup flags
 * @lock_flags:	inode locks to take on the new inode, or 0 for none
 *
 * On success the inode is returned in @ipp with XFS_INEW set and locked as
 * requested by @lock_flags.  On failure the inode is destroyed again.
 *
 * Returns:
 *	0		success
 *	-ENOMEM		the incore inode could not be allocated
 *	-EAGAIN		the radix tree preload failed, or the tree insert
 *			failed; the caller may retry the lookup
 *	other		error from xfs_imap(), xfs_imap_to_bp(),
 *			xfs_inode_from_disk() or xfs_iget_check_free_state()
 */
static int
xfs_iget_cache_miss(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	xfs_trans_t		*tp,
	xfs_ino_t		ino,
	struct xfs_inode	**ipp,
	int			flags,
	int			lock_flags)
{
	struct xfs_inode	*ip;
	int			error;
	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ino);

	ip = xfs_inode_alloc(mp, ino);
	if (!ip)
		return -ENOMEM;

	error = xfs_imap(pag, tp, ip->i_ino, &ip->i_imap, flags);
	if (error)
		goto out_destroy;

	/*
	 * For version 5 superblocks, if we are initialising a new inode and we
	 * are not utilising the XFS_FEAT_IKEEP inode cluster mode, we can
	 * simply build the new inode core with a random generation number.
	 *
	 * For version 4 (and older) superblocks, log recovery is dependent on
	 * the i_flushiter field being initialised from the current on-disk
	 * value and hence we must also read the inode off disk even when
	 * initializing new inodes.
	 */
	if (xfs_has_v3inodes(mp) &&
	    (flags & XFS_IGET_CREATE) && !xfs_has_ikeep(mp)) {
		VFS_I(ip)->i_generation = get_random_u32();
	} else {
		struct xfs_buf		*bp;

		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp);
		if (error)
			goto out_destroy;

		error = xfs_inode_from_disk(ip,
				xfs_buf_offset(bp, ip->i_imap.im_boffset));
		if (!error)
			xfs_buf_set_ref(bp, XFS_INO_REF);
		else
			xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
		/* The buffer is released on both the success and error path. */
		xfs_trans_brelse(tp, bp);

		if (error)
			goto out_destroy;
	}

	trace_xfs_iget_miss(ip);

	/*
	 * Check the inode free state is valid. This also detects lookup
	 * racing with unlinks.
	 */
	error = xfs_iget_check_free_state(ip, flags);
	if (error)
		goto out_destroy;

	/*
	 * Preload the radix tree so we can insert safely under the
	 * write spinlock. Note that we cannot sleep inside the preload
	 * region.
	 */
	if (radix_tree_preload(GFP_KERNEL | __GFP_NOLOCKDEP)) {
		error = -EAGAIN;
		goto out_destroy;
	}

	/*
	 * Because the inode hasn't been added to the radix-tree yet it can't
	 * be found by another thread, so we can do the non-sleeping lock here.
	 */
	if (lock_flags) {
		if (!xfs_ilock_nowait(ip, lock_flags))
			BUG();
	}

	/*
	 * These values must be set before inserting the inode into the radix
	 * tree as the moment it is inserted a concurrent lookup (allowed by the
	 * RCU locking mechanism) can find it and that lookup must see that this
	 * is an inode currently under construction (i.e. that XFS_INEW is set).
	 * The ip->i_flags_lock that protects the XFS_INEW flag forms the
	 * memory barrier that ensures this detection works correctly at lookup
	 * time.
	 */
	if (flags & XFS_IGET_DONTCACHE)
		d_mark_dontcache(VFS_I(ip));
	ip->i_udquot = NULL;
	ip->i_gdquot = NULL;
	ip->i_pdquot = NULL;
	xfs_iflags_set(ip, XFS_INEW);

	/* insert the new inode */
	spin_lock(&pag->pag_ici_lock);
	error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
	if (unlikely(error)) {
		/*
		 * Any insert failure is reported to the caller as -EAGAIN;
		 * only -EEXIST is expected, anything else triggers a warning.
		 */
		WARN_ON(error != -EEXIST);
		XFS_STATS_INC(mp, xs_ig_dup);
		error = -EAGAIN;
		goto out_preload_end;
	}
	spin_unlock(&pag->pag_ici_lock);
	radix_tree_preload_end();

	*ipp = ip;
	return 0;

	/* Unwind in reverse order: tree lock, preload, inode lock, inode. */
out_preload_end:
	spin_unlock(&pag->pag_ici_lock);
	radix_tree_preload_end();
	if (lock_flags)
		xfs_iunlock(ip, lock_flags);
out_destroy:
	__destroy_inode(VFS_I(ip));
	xfs_inode_free(ip);
	return error;
}
765  
/*
 * Look up an inode by number in the given file system.  The inode is looked up
 * in the cache held in each AG.  If the inode is found in the cache, initialise
 * the vfs inode if necessary.
 *
 * If it is not in core, read it in from the file system's device, add it to the
 * cache and initialise the vfs inode.
 *
 * The inode is locked according to the value of the lock_flags parameter.
 * Inode lookup is only done during metadata operations and not as part of the
 * data IO path. Hence we only allow locking of the XFS_ILOCK during lookup.
 *
 * A lookup that fails with -EAGAIN is retried after a short delay unless
 * XFS_IGET_INCORE or XFS_IGET_NORETRY is set in @flags, in which case -EAGAIN
 * is returned to the caller.
 *
 * Returns:
 *	0		success, the inode is returned in @ipp
 *	-EINVAL		@ino failed xfs_verify_ino()
 *	-ENODATA	XFS_IGET_INCORE was set and the inode is not cached
 *	other		error from the cache hit or cache miss handling
 */
int
xfs_iget(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_ino_t		ino,
	uint			flags,
	uint			lock_flags,
	struct xfs_inode	**ipp)
{
	struct xfs_inode	*ip;
	struct xfs_perag	*pag;
	xfs_agino_t		agino;
	int			error;

	ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);

	/* reject inode numbers outside existing AGs */
	if (!xfs_verify_ino(mp, ino))
		return -EINVAL;

	XFS_STATS_INC(mp, xs_ig_attempts);

	/* get the perag structure and ensure that it's inode capable */
	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
	agino = XFS_INO_TO_AGINO(mp, ino);

	/* The perag reference is kept across retries of the lookup. */
again:
	error = 0;
	rcu_read_lock();
	ip = radix_tree_lookup(&pag->pag_ici_root, agino);

	if (ip) {
		/* The cache hit path drops the RCU read lock for us. */
		error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
		if (error)
			goto out_error_or_again;
	} else {
		rcu_read_unlock();
		if (flags & XFS_IGET_INCORE) {
			error = -ENODATA;
			goto out_error_or_again;
		}
		XFS_STATS_INC(mp, xs_ig_missed);

		error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
							flags, lock_flags);
		if (error)
			goto out_error_or_again;
	}
	xfs_perag_put(pag);

	*ipp = ip;

	/*
	 * If we have a real type for an on-disk inode, we can setup the inode
	 * now.	 If it's a new inode being created, xfs_init_new_inode will
	 * handle it.
	 */
	if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0)
		xfs_setup_existing_inode(ip);
	return 0;

out_error_or_again:
	if (!(flags & (XFS_IGET_INCORE | XFS_IGET_NORETRY)) &&
	    error == -EAGAIN) {
		delay(1);
		goto again;
	}
	xfs_perag_put(pag);
	return error;
}
848  
849  /*
850   * Grab the inode for reclaim exclusively.
851   *
852   * We have found this inode via a lookup under RCU, so the inode may have
853   * already been freed, or it may be in the process of being recycled by
854   * xfs_iget(). In both cases, the inode will have XFS_IRECLAIM set. If the inode
855   * has been fully recycled by the time we get the i_flags_lock, XFS_IRECLAIMABLE
856   * will not be set. Hence we need to check for both these flag conditions to
857   * avoid inodes that are no longer reclaim candidates.
858   *
859   * Note: checking for other state flags here, under the i_flags_lock or not, is
860   * racy and should be avoided. Those races should be resolved only after we have
861   * ensured that we are able to reclaim this inode and the world can see that we
862   * are going to reclaim it.
863   *
864   * Return true if we grabbed it, false otherwise.
865   */
866  static bool
xfs_reclaim_igrab(struct xfs_inode * ip,struct xfs_icwalk * icw)867  xfs_reclaim_igrab(
868  	struct xfs_inode	*ip,
869  	struct xfs_icwalk	*icw)
870  {
871  	ASSERT(rcu_read_lock_held());
872  
873  	spin_lock(&ip->i_flags_lock);
874  	if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
875  	    __xfs_iflags_test(ip, XFS_IRECLAIM)) {
876  		/* not a reclaim candidate. */
877  		spin_unlock(&ip->i_flags_lock);
878  		return false;
879  	}
880  
881  	/* Don't reclaim a sick inode unless the caller asked for it. */
882  	if (ip->i_sick &&
883  	    (!icw || !(icw->icw_flags & XFS_ICWALK_FLAG_RECLAIM_SICK))) {
884  		spin_unlock(&ip->i_flags_lock);
885  		return false;
886  	}
887  
888  	__xfs_iflags_set(ip, XFS_IRECLAIM);
889  	spin_unlock(&ip->i_flags_lock);
890  	return true;
891  }
892  
/*
 * Inode reclaim is non-blocking, so the default action if progress cannot be
 * made is to "requeue" the inode for reclaim by unlocking it and clearing the
 * XFS_IRECLAIM flag.  If we are in a shutdown state, we don't care about
 * blocking anymore and hence we can wait for the inode to be able to reclaim
 * it.
 *
 * We do no IO here - if callers require inodes to be cleaned they must push the
 * AIL first to trigger writeback of dirty inodes.  This enables writeback to be
 * done in the background in a non-blocking manner, and enables memory reclaim
 * to make progress without blocking.
 *
 * @ip:		inode to reclaim; XFS_IRECLAIM is cleared again on the bail-out
 *		paths, so it is presumably set by the caller beforehand (see
 *		xfs_reclaim_igrab())
 * @pag:	perag whose incore inode radix tree holds @ip
 *
 * The inode is left alone if the ILOCK cannot be taken without blocking, if it
 * is already being flushed, or (while the log is not shut down) if it is
 * pinned or not clean.
 */
static void
xfs_reclaim_inode(
	struct xfs_inode	*ip,
	struct xfs_perag	*pag)
{
	xfs_ino_t		ino = ip->i_ino; /* for radix_tree_delete */

	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
		goto out;
	if (xfs_iflags_test_and_set(ip, XFS_IFLUSHING))
		goto out_iunlock;

	/*
	 * Check for log shutdown because aborting the inode can move the log
	 * tail and corrupt in memory state. This is fine if the log is shut
	 * down, but if the log is still active and only the mount is shut down
	 * then the in-memory log tail movement caused by the abort can be
	 * incorrectly propagated to disk.
	 */
	if (xlog_is_shutdown(ip->i_mount->m_log)) {
		xfs_iunpin_wait(ip);
		xfs_iflush_shutdown_abort(ip);
		goto reclaim;
	}
	if (xfs_ipincount(ip))
		goto out_clear_flush;
	if (!xfs_inode_clean(ip))
		goto out_clear_flush;

	xfs_iflags_clear(ip, XFS_IFLUSHING);
reclaim:
	trace_xfs_inode_reclaiming(ip);

	/*
	 * Because we use RCU freeing we need to ensure the inode always appears
	 * to be reclaimed with an invalid inode number when in the free state.
	 * We do this as early as possible under the ILOCK so that
	 * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to
	 * detect races with us here. By doing this, we guarantee that once
	 * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that
	 * it will see either a valid inode that will serialise correctly, or it
	 * will see an invalid inode that it can skip.
	 */
	spin_lock(&ip->i_flags_lock);
	ip->i_flags = XFS_IRECLAIM;
	ip->i_ino = 0;
	ip->i_sick = 0;
	ip->i_checked = 0;
	spin_unlock(&ip->i_flags_lock);

	ASSERT(!ip->i_itemp || ip->i_itemp->ili_item.li_buf == NULL);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	XFS_STATS_INC(ip->i_mount, xs_ig_reclaims);
	/*
	 * Remove the inode from the per-AG radix tree.
	 *
	 * Because radix_tree_delete won't complain even if the item was never
	 * added to the tree assert that it's been there before to catch
	 * problems with the inode life time early on.
	 *
	 * ip->i_ino was zeroed above, so the inode number saved on entry is
	 * used to compute the radix tree index.
	 */
	spin_lock(&pag->pag_ici_lock);
	if (!radix_tree_delete(&pag->pag_ici_root,
				XFS_INO_TO_AGINO(ip->i_mount, ino)))
		ASSERT(0);
	/* NULLAGINO: the tag went away together with the tree entry. */
	xfs_perag_clear_inode_tag(pag, NULLAGINO, XFS_ICI_RECLAIM_TAG);
	spin_unlock(&pag->pag_ici_lock);

	/*
	 * Here we do an (almost) spurious inode lock in order to coordinate
	 * with inode cache radix tree lookups.  This is because the lookup
	 * can reference the inodes in the cache without taking references.
	 *
	 * We make that OK here by ensuring that we wait until the inode is
	 * unlocked after the lookup before we go ahead and free it.
	 */
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	ASSERT(!ip->i_udquot && !ip->i_gdquot && !ip->i_pdquot);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	ASSERT(xfs_inode_clean(ip));

	__xfs_inode_free(ip);
	return;

	/* Bail-out paths: undo what was taken, in reverse order. */
out_clear_flush:
	xfs_iflags_clear(ip, XFS_IFLUSHING);
out_iunlock:
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
out:
	xfs_iflags_clear(ip, XFS_IRECLAIM);
}
996  
997  /* Reclaim sick inodes if we're unmounting or the fs went down. */
998  static inline bool
xfs_want_reclaim_sick(struct xfs_mount * mp)999  xfs_want_reclaim_sick(
1000  	struct xfs_mount	*mp)
1001  {
1002  	return xfs_is_unmounting(mp) || xfs_has_norecovery(mp) ||
1003  	       xfs_is_shutdown(mp);
1004  }
1005  
1006  void
xfs_reclaim_inodes(struct xfs_mount * mp)1007  xfs_reclaim_inodes(
1008  	struct xfs_mount	*mp)
1009  {
1010  	struct xfs_icwalk	icw = {
1011  		.icw_flags	= 0,
1012  	};
1013  
1014  	if (xfs_want_reclaim_sick(mp))
1015  		icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK;
1016  
1017  	while (xa_marked(&mp->m_perags, XFS_PERAG_RECLAIM_MARK)) {
1018  		xfs_ail_push_all_sync(mp->m_ail);
1019  		xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw);
1020  	}
1021  }
1022  
1023  /*
1024   * The shrinker infrastructure determines how many inodes we should scan for
1025   * reclaim. We want as many clean inodes ready to reclaim as possible, so we
1026   * push the AIL here. We also want to proactively free up memory if we can to
1027   * minimise the amount of work memory reclaim has to do so we kick the
1028   * background reclaim if it isn't already scheduled.
1029   */
1030  long
xfs_reclaim_inodes_nr(struct xfs_mount * mp,unsigned long nr_to_scan)1031  xfs_reclaim_inodes_nr(
1032  	struct xfs_mount	*mp,
1033  	unsigned long		nr_to_scan)
1034  {
1035  	struct xfs_icwalk	icw = {
1036  		.icw_flags	= XFS_ICWALK_FLAG_SCAN_LIMIT,
1037  		.icw_scan_limit	= min_t(unsigned long, LONG_MAX, nr_to_scan),
1038  	};
1039  
1040  	if (xfs_want_reclaim_sick(mp))
1041  		icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK;
1042  
1043  	/* kick background reclaimer and push the AIL */
1044  	xfs_reclaim_work_queue(mp);
1045  	xfs_ail_push_all(mp->m_ail);
1046  
1047  	xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw);
1048  	return 0;
1049  }
1050  
1051  /*
1052   * Return the number of reclaimable inodes in the filesystem for
1053   * the shrinker to determine how much to reclaim.
1054   */
1055  long
xfs_reclaim_inodes_count(struct xfs_mount * mp)1056  xfs_reclaim_inodes_count(
1057  	struct xfs_mount	*mp)
1058  {
1059  	XA_STATE		(xas, &mp->m_perags, 0);
1060  	long			reclaimable = 0;
1061  	struct xfs_perag	*pag;
1062  
1063  	rcu_read_lock();
1064  	xas_for_each_marked(&xas, pag, ULONG_MAX, XFS_PERAG_RECLAIM_MARK) {
1065  		trace_xfs_reclaim_inodes_count(pag, _THIS_IP_);
1066  		reclaimable += pag->pag_ici_reclaimable;
1067  	}
1068  	rcu_read_unlock();
1069  
1070  	return reclaimable;
1071  }
1072  
1073  STATIC bool
xfs_icwalk_match_id(struct xfs_inode * ip,struct xfs_icwalk * icw)1074  xfs_icwalk_match_id(
1075  	struct xfs_inode	*ip,
1076  	struct xfs_icwalk	*icw)
1077  {
1078  	if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) &&
1079  	    !uid_eq(VFS_I(ip)->i_uid, icw->icw_uid))
1080  		return false;
1081  
1082  	if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) &&
1083  	    !gid_eq(VFS_I(ip)->i_gid, icw->icw_gid))
1084  		return false;
1085  
1086  	if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) &&
1087  	    ip->i_projid != icw->icw_prid)
1088  		return false;
1089  
1090  	return true;
1091  }
1092  
1093  /*
1094   * A union-based inode filtering algorithm. Process the inode if any of the
1095   * criteria match. This is for global/internal scans only.
1096   */
1097  STATIC bool
xfs_icwalk_match_id_union(struct xfs_inode * ip,struct xfs_icwalk * icw)1098  xfs_icwalk_match_id_union(
1099  	struct xfs_inode	*ip,
1100  	struct xfs_icwalk	*icw)
1101  {
1102  	if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) &&
1103  	    uid_eq(VFS_I(ip)->i_uid, icw->icw_uid))
1104  		return true;
1105  
1106  	if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) &&
1107  	    gid_eq(VFS_I(ip)->i_gid, icw->icw_gid))
1108  		return true;
1109  
1110  	if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) &&
1111  	    ip->i_projid == icw->icw_prid)
1112  		return true;
1113  
1114  	return false;
1115  }
1116  
1117  /*
1118   * Is this inode @ip eligible for eof/cow block reclamation, given some
1119   * filtering parameters @icw?  The inode is eligible if @icw is null or
1120   * if the predicate functions match.
1121   */
1122  static bool
xfs_icwalk_match(struct xfs_inode * ip,struct xfs_icwalk * icw)1123  xfs_icwalk_match(
1124  	struct xfs_inode	*ip,
1125  	struct xfs_icwalk	*icw)
1126  {
1127  	bool			match;
1128  
1129  	if (!icw)
1130  		return true;
1131  
1132  	if (icw->icw_flags & XFS_ICWALK_FLAG_UNION)
1133  		match = xfs_icwalk_match_id_union(ip, icw);
1134  	else
1135  		match = xfs_icwalk_match_id(ip, icw);
1136  	if (!match)
1137  		return false;
1138  
1139  	/* skip the inode if the file size is too small */
1140  	if ((icw->icw_flags & XFS_ICWALK_FLAG_MINFILESIZE) &&
1141  	    XFS_ISIZE(ip) < icw->icw_min_file_size)
1142  		return false;
1143  
1144  	return true;
1145  }
1146  
1147  /*
1148   * This is a fast pass over the inode cache to try to get reclaim moving on as
1149   * many inodes as possible in a short period of time. It kicks itself every few
1150   * seconds, as well as being kicked by the inode cache shrinker when memory
1151   * goes low.
1152   */
1153  void
xfs_reclaim_worker(struct work_struct * work)1154  xfs_reclaim_worker(
1155  	struct work_struct *work)
1156  {
1157  	struct xfs_mount *mp = container_of(to_delayed_work(work),
1158  					struct xfs_mount, m_reclaim_work);
1159  
1160  	xfs_icwalk(mp, XFS_ICWALK_RECLAIM, NULL);
1161  	xfs_reclaim_work_queue(mp);
1162  }
1163  
1164  STATIC int
xfs_inode_free_eofblocks(struct xfs_inode * ip,struct xfs_icwalk * icw,unsigned int * lockflags)1165  xfs_inode_free_eofblocks(
1166  	struct xfs_inode	*ip,
1167  	struct xfs_icwalk	*icw,
1168  	unsigned int		*lockflags)
1169  {
1170  	bool			wait;
1171  
1172  	wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC);
1173  
1174  	if (!xfs_iflags_test(ip, XFS_IEOFBLOCKS))
1175  		return 0;
1176  
1177  	/*
1178  	 * If the mapping is dirty the operation can block and wait for some
1179  	 * time. Unless we are waiting, skip it.
1180  	 */
1181  	if (!wait && mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
1182  		return 0;
1183  
1184  	if (!xfs_icwalk_match(ip, icw))
1185  		return 0;
1186  
1187  	/*
1188  	 * If the caller is waiting, return -EAGAIN to keep the background
1189  	 * scanner moving and revisit the inode in a subsequent pass.
1190  	 */
1191  	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
1192  		if (wait)
1193  			return -EAGAIN;
1194  		return 0;
1195  	}
1196  	*lockflags |= XFS_IOLOCK_EXCL;
1197  
1198  	if (xfs_can_free_eofblocks(ip))
1199  		return xfs_free_eofblocks(ip);
1200  
1201  	/* inode could be preallocated */
1202  	trace_xfs_inode_free_eofblocks_invalid(ip);
1203  	xfs_inode_clear_eofblocks_tag(ip);
1204  	return 0;
1205  }
1206  
1207  static void
xfs_blockgc_set_iflag(struct xfs_inode * ip,unsigned long iflag)1208  xfs_blockgc_set_iflag(
1209  	struct xfs_inode	*ip,
1210  	unsigned long		iflag)
1211  {
1212  	struct xfs_mount	*mp = ip->i_mount;
1213  	struct xfs_perag	*pag;
1214  
1215  	ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);
1216  
1217  	/*
1218  	 * Don't bother locking the AG and looking up in the radix trees
1219  	 * if we already know that we have the tag set.
1220  	 */
1221  	if (ip->i_flags & iflag)
1222  		return;
1223  	spin_lock(&ip->i_flags_lock);
1224  	ip->i_flags |= iflag;
1225  	spin_unlock(&ip->i_flags_lock);
1226  
1227  	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1228  	spin_lock(&pag->pag_ici_lock);
1229  
1230  	xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
1231  			XFS_ICI_BLOCKGC_TAG);
1232  
1233  	spin_unlock(&pag->pag_ici_lock);
1234  	xfs_perag_put(pag);
1235  }
1236  
1237  void
xfs_inode_set_eofblocks_tag(xfs_inode_t * ip)1238  xfs_inode_set_eofblocks_tag(
1239  	xfs_inode_t	*ip)
1240  {
1241  	trace_xfs_inode_set_eofblocks_tag(ip);
1242  	return xfs_blockgc_set_iflag(ip, XFS_IEOFBLOCKS);
1243  }
1244  
1245  static void
xfs_blockgc_clear_iflag(struct xfs_inode * ip,unsigned long iflag)1246  xfs_blockgc_clear_iflag(
1247  	struct xfs_inode	*ip,
1248  	unsigned long		iflag)
1249  {
1250  	struct xfs_mount	*mp = ip->i_mount;
1251  	struct xfs_perag	*pag;
1252  	bool			clear_tag;
1253  
1254  	ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);
1255  
1256  	spin_lock(&ip->i_flags_lock);
1257  	ip->i_flags &= ~iflag;
1258  	clear_tag = (ip->i_flags & (XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0;
1259  	spin_unlock(&ip->i_flags_lock);
1260  
1261  	if (!clear_tag)
1262  		return;
1263  
1264  	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1265  	spin_lock(&pag->pag_ici_lock);
1266  
1267  	xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
1268  			XFS_ICI_BLOCKGC_TAG);
1269  
1270  	spin_unlock(&pag->pag_ici_lock);
1271  	xfs_perag_put(pag);
1272  }
1273  
1274  void
xfs_inode_clear_eofblocks_tag(xfs_inode_t * ip)1275  xfs_inode_clear_eofblocks_tag(
1276  	xfs_inode_t	*ip)
1277  {
1278  	trace_xfs_inode_clear_eofblocks_tag(ip);
1279  	return xfs_blockgc_clear_iflag(ip, XFS_IEOFBLOCKS);
1280  }
1281  
1282  /*
1283   * Prepare to free COW fork blocks from an inode.
1284   */
1285  static bool
xfs_prep_free_cowblocks(struct xfs_inode * ip,struct xfs_icwalk * icw)1286  xfs_prep_free_cowblocks(
1287  	struct xfs_inode	*ip,
1288  	struct xfs_icwalk	*icw)
1289  {
1290  	bool			sync;
1291  
1292  	sync = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC);
1293  
1294  	/*
1295  	 * Just clear the tag if we have an empty cow fork or none at all. It's
1296  	 * possible the inode was fully unshared since it was originally tagged.
1297  	 */
1298  	if (!xfs_inode_has_cow_data(ip)) {
1299  		trace_xfs_inode_free_cowblocks_invalid(ip);
1300  		xfs_inode_clear_cowblocks_tag(ip);
1301  		return false;
1302  	}
1303  
1304  	/*
1305  	 * A cowblocks trim of an inode can have a significant effect on
1306  	 * fragmentation even when a reasonable COW extent size hint is set.
1307  	 * Therefore, we prefer to not process cowblocks unless they are clean
1308  	 * and idle. We can never process a cowblocks inode that is dirty or has
1309  	 * in-flight I/O under any circumstances, because outstanding writeback
1310  	 * or dio expects targeted COW fork blocks exist through write
1311  	 * completion where they can be remapped into the data fork.
1312  	 *
1313  	 * Therefore, the heuristic used here is to never process inodes
1314  	 * currently opened for write from background (i.e. non-sync) scans. For
1315  	 * sync scans, use the pagecache/dio state of the inode to ensure we
1316  	 * never free COW fork blocks out from under pending I/O.
1317  	 */
1318  	if (!sync && inode_is_open_for_write(VFS_I(ip)))
1319  		return false;
1320  	return xfs_can_free_cowblocks(ip);
1321  }
1322  
1323  /*
1324   * Automatic CoW Reservation Freeing
1325   *
1326   * These functions automatically garbage collect leftover CoW reservations
1327   * that were made on behalf of a cowextsize hint when we start to run out
1328   * of quota or when the reservations sit around for too long.  If the file
1329   * has dirty pages or is undergoing writeback, its CoW reservations will
1330   * be retained.
1331   *
1332   * The actual garbage collection piggybacks off the same code that runs
1333   * the speculative EOF preallocation garbage collector.
1334   */
1335  STATIC int
xfs_inode_free_cowblocks(struct xfs_inode * ip,struct xfs_icwalk * icw,unsigned int * lockflags)1336  xfs_inode_free_cowblocks(
1337  	struct xfs_inode	*ip,
1338  	struct xfs_icwalk	*icw,
1339  	unsigned int		*lockflags)
1340  {
1341  	bool			wait;
1342  	int			ret = 0;
1343  
1344  	wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC);
1345  
1346  	if (!xfs_iflags_test(ip, XFS_ICOWBLOCKS))
1347  		return 0;
1348  
1349  	if (!xfs_prep_free_cowblocks(ip, icw))
1350  		return 0;
1351  
1352  	if (!xfs_icwalk_match(ip, icw))
1353  		return 0;
1354  
1355  	/*
1356  	 * If the caller is waiting, return -EAGAIN to keep the background
1357  	 * scanner moving and revisit the inode in a subsequent pass.
1358  	 */
1359  	if (!(*lockflags & XFS_IOLOCK_EXCL) &&
1360  	    !xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
1361  		if (wait)
1362  			return -EAGAIN;
1363  		return 0;
1364  	}
1365  	*lockflags |= XFS_IOLOCK_EXCL;
1366  
1367  	if (!xfs_ilock_nowait(ip, XFS_MMAPLOCK_EXCL)) {
1368  		if (wait)
1369  			return -EAGAIN;
1370  		return 0;
1371  	}
1372  	*lockflags |= XFS_MMAPLOCK_EXCL;
1373  
1374  	/*
1375  	 * Check again, nobody else should be able to dirty blocks or change
1376  	 * the reflink iflag now that we have the first two locks held.
1377  	 */
1378  	if (xfs_prep_free_cowblocks(ip, icw))
1379  		ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false);
1380  	return ret;
1381  }
1382  
1383  void
xfs_inode_set_cowblocks_tag(xfs_inode_t * ip)1384  xfs_inode_set_cowblocks_tag(
1385  	xfs_inode_t	*ip)
1386  {
1387  	trace_xfs_inode_set_cowblocks_tag(ip);
1388  	return xfs_blockgc_set_iflag(ip, XFS_ICOWBLOCKS);
1389  }
1390  
1391  void
xfs_inode_clear_cowblocks_tag(xfs_inode_t * ip)1392  xfs_inode_clear_cowblocks_tag(
1393  	xfs_inode_t	*ip)
1394  {
1395  	trace_xfs_inode_clear_cowblocks_tag(ip);
1396  	return xfs_blockgc_clear_iflag(ip, XFS_ICOWBLOCKS);
1397  }
1398  
1399  /* Disable post-EOF and CoW block auto-reclamation. */
1400  void
xfs_blockgc_stop(struct xfs_mount * mp)1401  xfs_blockgc_stop(
1402  	struct xfs_mount	*mp)
1403  {
1404  	struct xfs_perag	*pag;
1405  	xfs_agnumber_t		agno;
1406  
1407  	if (!xfs_clear_blockgc_enabled(mp))
1408  		return;
1409  
1410  	for_each_perag(mp, agno, pag)
1411  		cancel_delayed_work_sync(&pag->pag_blockgc_work);
1412  	trace_xfs_blockgc_stop(mp, __return_address);
1413  }
1414  
1415  /* Enable post-EOF and CoW block auto-reclamation. */
1416  void
xfs_blockgc_start(struct xfs_mount * mp)1417  xfs_blockgc_start(
1418  	struct xfs_mount	*mp)
1419  {
1420  	struct xfs_perag	*pag = NULL;
1421  
1422  	if (xfs_set_blockgc_enabled(mp))
1423  		return;
1424  
1425  	trace_xfs_blockgc_start(mp, __return_address);
1426  	while ((pag = xfs_perag_grab_next_tag(mp, pag, XFS_ICI_BLOCKGC_TAG)))
1427  		xfs_blockgc_queue(pag);
1428  }
1429  
/* Inode states in which block gc must leave the inode alone. */
#define XFS_BLOCKGC_NOGRAB_IFLAGS \
	(XFS_INEW | XFS_NEED_INACTIVE | XFS_INACTIVATING | \
	 XFS_IRECLAIMABLE | XFS_IRECLAIM)
1436  /*
1437   * Decide if the given @ip is eligible for garbage collection of speculative
1438   * preallocations, and grab it if so.  Returns true if it's ready to go or
1439   * false if we should just ignore it.
1440   */
1441  static bool
xfs_blockgc_igrab(struct xfs_inode * ip)1442  xfs_blockgc_igrab(
1443  	struct xfs_inode	*ip)
1444  {
1445  	struct inode		*inode = VFS_I(ip);
1446  
1447  	ASSERT(rcu_read_lock_held());
1448  
1449  	/* Check for stale RCU freed inode */
1450  	spin_lock(&ip->i_flags_lock);
1451  	if (!ip->i_ino)
1452  		goto out_unlock_noent;
1453  
1454  	if (ip->i_flags & XFS_BLOCKGC_NOGRAB_IFLAGS)
1455  		goto out_unlock_noent;
1456  	spin_unlock(&ip->i_flags_lock);
1457  
1458  	/* nothing to sync during shutdown */
1459  	if (xfs_is_shutdown(ip->i_mount))
1460  		return false;
1461  
1462  	/* If we can't grab the inode, it must on it's way to reclaim. */
1463  	if (!igrab(inode))
1464  		return false;
1465  
1466  	/* inode is valid */
1467  	return true;
1468  
1469  out_unlock_noent:
1470  	spin_unlock(&ip->i_flags_lock);
1471  	return false;
1472  }
1473  
/*
 * Scan a single grabbed incore inode for removable block preallocations:
 * first post-EOF blocks, then (if that did not fail) COW blocks.  Any locks
 * taken by the helpers are dropped and the inode reference is released
 * before returning.
 */
static int
xfs_blockgc_scan_inode(
	struct xfs_inode	*ip,
	struct xfs_icwalk	*icw)
{
	unsigned int		held = 0;
	int			error;

	error = xfs_inode_free_eofblocks(ip, icw, &held);
	if (!error)
		error = xfs_inode_free_cowblocks(ip, icw, &held);

	if (held)
		xfs_iunlock(ip, held);
	xfs_irele(ip);

	return error;
}
1494  
1495  /* Background worker that trims preallocated space. */
1496  void
xfs_blockgc_worker(struct work_struct * work)1497  xfs_blockgc_worker(
1498  	struct work_struct	*work)
1499  {
1500  	struct xfs_perag	*pag = container_of(to_delayed_work(work),
1501  					struct xfs_perag, pag_blockgc_work);
1502  	struct xfs_mount	*mp = pag->pag_mount;
1503  	int			error;
1504  
1505  	trace_xfs_blockgc_worker(mp, __return_address);
1506  
1507  	error = xfs_icwalk_ag(pag, XFS_ICWALK_BLOCKGC, NULL);
1508  	if (error)
1509  		xfs_info(mp, "AG %u preallocation gc worker failed, err=%d",
1510  				pag->pag_agno, error);
1511  	xfs_blockgc_queue(pag);
1512  }
1513  
1514  /*
1515   * Try to free space in the filesystem by purging inactive inodes, eofblocks
1516   * and cowblocks.
1517   */
1518  int
xfs_blockgc_free_space(struct xfs_mount * mp,struct xfs_icwalk * icw)1519  xfs_blockgc_free_space(
1520  	struct xfs_mount	*mp,
1521  	struct xfs_icwalk	*icw)
1522  {
1523  	int			error;
1524  
1525  	trace_xfs_blockgc_free_space(mp, icw, _RET_IP_);
1526  
1527  	error = xfs_icwalk(mp, XFS_ICWALK_BLOCKGC, icw);
1528  	if (error)
1529  		return error;
1530  
1531  	return xfs_inodegc_flush(mp);
1532  }
1533  
1534  /*
1535   * Reclaim all the free space that we can by scheduling the background blockgc
1536   * and inodegc workers immediately and waiting for them all to clear.
1537   */
1538  int
xfs_blockgc_flush_all(struct xfs_mount * mp)1539  xfs_blockgc_flush_all(
1540  	struct xfs_mount	*mp)
1541  {
1542  	struct xfs_perag	*pag = NULL;
1543  
1544  	trace_xfs_blockgc_flush_all(mp, __return_address);
1545  
1546  	/*
1547  	 * For each blockgc worker, move its queue time up to now.  If it wasn't
1548  	 * queued, it will not be requeued.  Then flush whatever is left.
1549  	 */
1550  	while ((pag = xfs_perag_grab_next_tag(mp, pag, XFS_ICI_BLOCKGC_TAG)))
1551  		mod_delayed_work(pag->pag_mount->m_blockgc_wq,
1552  				&pag->pag_blockgc_work, 0);
1553  
1554  	while ((pag = xfs_perag_grab_next_tag(mp, pag, XFS_ICI_BLOCKGC_TAG)))
1555  		flush_delayed_work(&pag->pag_blockgc_work);
1556  
1557  	return xfs_inodegc_flush(mp);
1558  }
1559  
1560  /*
1561   * Run cow/eofblocks scans on the supplied dquots.  We don't know exactly which
1562   * quota caused an allocation failure, so we make a best effort by including
1563   * each quota under low free space conditions (less than 1% free space) in the
1564   * scan.
1565   *
1566   * Callers must not hold any inode's ILOCK.  If requesting a synchronous scan
1567   * (XFS_ICWALK_FLAG_SYNC), the caller also must not hold any inode's IOLOCK or
1568   * MMAPLOCK.
1569   */
1570  int
xfs_blockgc_free_dquots(struct xfs_mount * mp,struct xfs_dquot * udqp,struct xfs_dquot * gdqp,struct xfs_dquot * pdqp,unsigned int iwalk_flags)1571  xfs_blockgc_free_dquots(
1572  	struct xfs_mount	*mp,
1573  	struct xfs_dquot	*udqp,
1574  	struct xfs_dquot	*gdqp,
1575  	struct xfs_dquot	*pdqp,
1576  	unsigned int		iwalk_flags)
1577  {
1578  	struct xfs_icwalk	icw = {0};
1579  	bool			do_work = false;
1580  
1581  	if (!udqp && !gdqp && !pdqp)
1582  		return 0;
1583  
1584  	/*
1585  	 * Run a scan to free blocks using the union filter to cover all
1586  	 * applicable quotas in a single scan.
1587  	 */
1588  	icw.icw_flags = XFS_ICWALK_FLAG_UNION | iwalk_flags;
1589  
1590  	if (XFS_IS_UQUOTA_ENFORCED(mp) && udqp && xfs_dquot_lowsp(udqp)) {
1591  		icw.icw_uid = make_kuid(mp->m_super->s_user_ns, udqp->q_id);
1592  		icw.icw_flags |= XFS_ICWALK_FLAG_UID;
1593  		do_work = true;
1594  	}
1595  
1596  	if (XFS_IS_UQUOTA_ENFORCED(mp) && gdqp && xfs_dquot_lowsp(gdqp)) {
1597  		icw.icw_gid = make_kgid(mp->m_super->s_user_ns, gdqp->q_id);
1598  		icw.icw_flags |= XFS_ICWALK_FLAG_GID;
1599  		do_work = true;
1600  	}
1601  
1602  	if (XFS_IS_PQUOTA_ENFORCED(mp) && pdqp && xfs_dquot_lowsp(pdqp)) {
1603  		icw.icw_prid = pdqp->q_id;
1604  		icw.icw_flags |= XFS_ICWALK_FLAG_PRID;
1605  		do_work = true;
1606  	}
1607  
1608  	if (!do_work)
1609  		return 0;
1610  
1611  	return xfs_blockgc_free_space(mp, &icw);
1612  }
1613  
1614  /* Run cow/eofblocks scans on the quotas attached to the inode. */
1615  int
xfs_blockgc_free_quota(struct xfs_inode * ip,unsigned int iwalk_flags)1616  xfs_blockgc_free_quota(
1617  	struct xfs_inode	*ip,
1618  	unsigned int		iwalk_flags)
1619  {
1620  	return xfs_blockgc_free_dquots(ip->i_mount,
1621  			xfs_inode_dquot(ip, XFS_DQTYPE_USER),
1622  			xfs_inode_dquot(ip, XFS_DQTYPE_GROUP),
1623  			xfs_inode_dquot(ip, XFS_DQTYPE_PROJ), iwalk_flags);
1624  }
1625  
1626  /* XFS Inode Cache Walking Code */
1627  
/*
 * Inodes are looked up in batches so that lock traffic and radix tree lookups
 * are kept to a minimum.  The batch size trades fewer lookups against stack
 * usage; because this runs in the reclaim path, it must not be too greedy.
 */
#define XFS_LOOKUP_BATCH	32
1635  
1636  
1637  /*
1638   * Decide if we want to grab this inode in anticipation of doing work towards
1639   * the goal.
1640   */
1641  static inline bool
xfs_icwalk_igrab(enum xfs_icwalk_goal goal,struct xfs_inode * ip,struct xfs_icwalk * icw)1642  xfs_icwalk_igrab(
1643  	enum xfs_icwalk_goal	goal,
1644  	struct xfs_inode	*ip,
1645  	struct xfs_icwalk	*icw)
1646  {
1647  	switch (goal) {
1648  	case XFS_ICWALK_BLOCKGC:
1649  		return xfs_blockgc_igrab(ip);
1650  	case XFS_ICWALK_RECLAIM:
1651  		return xfs_reclaim_igrab(ip, icw);
1652  	default:
1653  		return false;
1654  	}
1655  }
1656  
1657  /*
1658   * Process an inode.  Each processing function must handle any state changes
1659   * made by the icwalk igrab function.  Return -EAGAIN to skip an inode.
1660   */
1661  static inline int
xfs_icwalk_process_inode(enum xfs_icwalk_goal goal,struct xfs_inode * ip,struct xfs_perag * pag,struct xfs_icwalk * icw)1662  xfs_icwalk_process_inode(
1663  	enum xfs_icwalk_goal	goal,
1664  	struct xfs_inode	*ip,
1665  	struct xfs_perag	*pag,
1666  	struct xfs_icwalk	*icw)
1667  {
1668  	int			error = 0;
1669  
1670  	switch (goal) {
1671  	case XFS_ICWALK_BLOCKGC:
1672  		error = xfs_blockgc_scan_inode(ip, icw);
1673  		break;
1674  	case XFS_ICWALK_RECLAIM:
1675  		xfs_reclaim_inode(ip, pag);
1676  		break;
1677  	}
1678  	return error;
1679  }
1680  
1681  /*
1682   * For a given per-AG structure @pag and a goal, grab qualifying inodes and
1683   * process them in some manner.
1684   */
1685  static int
xfs_icwalk_ag(struct xfs_perag * pag,enum xfs_icwalk_goal goal,struct xfs_icwalk * icw)1686  xfs_icwalk_ag(
1687  	struct xfs_perag	*pag,
1688  	enum xfs_icwalk_goal	goal,
1689  	struct xfs_icwalk	*icw)
1690  {
1691  	struct xfs_mount	*mp = pag->pag_mount;
1692  	uint32_t		first_index;
1693  	int			last_error = 0;
1694  	int			skipped;
1695  	bool			done;
1696  	int			nr_found;
1697  
1698  restart:
1699  	done = false;
1700  	skipped = 0;
1701  	if (goal == XFS_ICWALK_RECLAIM)
1702  		first_index = READ_ONCE(pag->pag_ici_reclaim_cursor);
1703  	else
1704  		first_index = 0;
1705  	nr_found = 0;
1706  	do {
1707  		struct xfs_inode *batch[XFS_LOOKUP_BATCH];
1708  		int		error = 0;
1709  		int		i;
1710  
1711  		rcu_read_lock();
1712  
1713  		nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
1714  				(void **) batch, first_index,
1715  				XFS_LOOKUP_BATCH, goal);
1716  		if (!nr_found) {
1717  			done = true;
1718  			rcu_read_unlock();
1719  			break;
1720  		}
1721  
1722  		/*
1723  		 * Grab the inodes before we drop the lock. if we found
1724  		 * nothing, nr == 0 and the loop will be skipped.
1725  		 */
1726  		for (i = 0; i < nr_found; i++) {
1727  			struct xfs_inode *ip = batch[i];
1728  
1729  			if (done || !xfs_icwalk_igrab(goal, ip, icw))
1730  				batch[i] = NULL;
1731  
1732  			/*
1733  			 * Update the index for the next lookup. Catch
1734  			 * overflows into the next AG range which can occur if
1735  			 * we have inodes in the last block of the AG and we
1736  			 * are currently pointing to the last inode.
1737  			 *
1738  			 * Because we may see inodes that are from the wrong AG
1739  			 * due to RCU freeing and reallocation, only update the
1740  			 * index if it lies in this AG. It was a race that lead
1741  			 * us to see this inode, so another lookup from the
1742  			 * same index will not find it again.
1743  			 */
1744  			if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
1745  				continue;
1746  			first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
1747  			if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
1748  				done = true;
1749  		}
1750  
1751  		/* unlock now we've grabbed the inodes. */
1752  		rcu_read_unlock();
1753  
1754  		for (i = 0; i < nr_found; i++) {
1755  			if (!batch[i])
1756  				continue;
1757  			error = xfs_icwalk_process_inode(goal, batch[i], pag,
1758  					icw);
1759  			if (error == -EAGAIN) {
1760  				skipped++;
1761  				continue;
1762  			}
1763  			if (error && last_error != -EFSCORRUPTED)
1764  				last_error = error;
1765  		}
1766  
1767  		/* bail out if the filesystem is corrupted.  */
1768  		if (error == -EFSCORRUPTED)
1769  			break;
1770  
1771  		cond_resched();
1772  
1773  		if (icw && (icw->icw_flags & XFS_ICWALK_FLAG_SCAN_LIMIT)) {
1774  			icw->icw_scan_limit -= XFS_LOOKUP_BATCH;
1775  			if (icw->icw_scan_limit <= 0)
1776  				break;
1777  		}
1778  	} while (nr_found && !done);
1779  
1780  	if (goal == XFS_ICWALK_RECLAIM) {
1781  		if (done)
1782  			first_index = 0;
1783  		WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index);
1784  	}
1785  
1786  	if (skipped) {
1787  		delay(1);
1788  		goto restart;
1789  	}
1790  	return last_error;
1791  }
1792  
1793  /* Walk all incore inodes to achieve a given goal. */
1794  static int
xfs_icwalk(struct xfs_mount * mp,enum xfs_icwalk_goal goal,struct xfs_icwalk * icw)1795  xfs_icwalk(
1796  	struct xfs_mount	*mp,
1797  	enum xfs_icwalk_goal	goal,
1798  	struct xfs_icwalk	*icw)
1799  {
1800  	struct xfs_perag	*pag = NULL;
1801  	int			error = 0;
1802  	int			last_error = 0;
1803  
1804  	while ((pag = xfs_perag_grab_next_tag(mp, pag, goal))) {
1805  		error = xfs_icwalk_ag(pag, goal, icw);
1806  		if (error) {
1807  			last_error = error;
1808  			if (error == -EFSCORRUPTED) {
1809  				xfs_perag_rele(pag);
1810  				break;
1811  			}
1812  		}
1813  	}
1814  	return last_error;
1815  	BUILD_BUG_ON(XFS_ICWALK_PRIVATE_FLAGS & XFS_ICWALK_FLAGS_VALID);
1816  }
1817  
#ifdef DEBUG
/*
 * Debug helper: warn about every delalloc extent (one whose start block is a
 * null start block) found in the @whichfork fork of @ip.
 */
static void
xfs_check_delalloc(
	struct xfs_inode	*ip,
	int			whichfork)
{
	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
	struct xfs_iext_cursor	icur;
	struct xfs_bmbt_irec	got;

	if (!ifp)
		return;
	if (!xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got))
		return;

	do {
		if (!isnullstartblock(got.br_startblock))
			continue;

		xfs_warn(ip->i_mount,
	"ino %llx %s fork has delalloc extent at [0x%llx:0x%llx]",
			ip->i_ino,
			whichfork == XFS_DATA_FORK ? "data" : "cow",
			got.br_startoff, got.br_blockcount);
	} while (xfs_iext_next_extent(ifp, &icur, &got));
}
#else
/* Non-debug builds compile the check away. */
#define xfs_check_delalloc(ip, whichfork)	do { } while (0)
#endif
1843  
1844  /* Schedule the inode for reclaim. */
1845  static void
xfs_inodegc_set_reclaimable(struct xfs_inode * ip)1846  xfs_inodegc_set_reclaimable(
1847  	struct xfs_inode	*ip)
1848  {
1849  	struct xfs_mount	*mp = ip->i_mount;
1850  	struct xfs_perag	*pag;
1851  
1852  	if (!xfs_is_shutdown(mp) && ip->i_delayed_blks) {
1853  		xfs_check_delalloc(ip, XFS_DATA_FORK);
1854  		xfs_check_delalloc(ip, XFS_COW_FORK);
1855  		ASSERT(0);
1856  	}
1857  
1858  	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1859  	spin_lock(&pag->pag_ici_lock);
1860  	spin_lock(&ip->i_flags_lock);
1861  
1862  	trace_xfs_inode_set_reclaimable(ip);
1863  	ip->i_flags &= ~(XFS_NEED_INACTIVE | XFS_INACTIVATING);
1864  	ip->i_flags |= XFS_IRECLAIMABLE;
1865  	xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
1866  			XFS_ICI_RECLAIM_TAG);
1867  
1868  	spin_unlock(&ip->i_flags_lock);
1869  	spin_unlock(&pag->pag_ici_lock);
1870  	xfs_perag_put(pag);
1871  }
1872  
/*
 * Inactivate @ip and then schedule it for reclaim.  Inactivation frees all
 * speculative preallocations and possibly the inode itself; it is the last
 * chance to modify an otherwise unreferenced file before incore reclamation.
 * The inode is made reclaimable regardless of the xfs_inactive() result,
 * which is returned to the caller.
 */
static int
xfs_inodegc_inactivate(
	struct xfs_inode	*ip)
{
	int			ret;

	trace_xfs_inode_inactivating(ip);

	ret = xfs_inactive(ip);
	xfs_inodegc_set_reclaimable(ip);

	return ret;
}
1890  
1891  void
xfs_inodegc_worker(struct work_struct * work)1892  xfs_inodegc_worker(
1893  	struct work_struct	*work)
1894  {
1895  	struct xfs_inodegc	*gc = container_of(to_delayed_work(work),
1896  						struct xfs_inodegc, work);
1897  	struct llist_node	*node = llist_del_all(&gc->list);
1898  	struct xfs_inode	*ip, *n;
1899  	struct xfs_mount	*mp = gc->mp;
1900  	unsigned int		nofs_flag;
1901  
1902  	/*
1903  	 * Clear the cpu mask bit and ensure that we have seen the latest
1904  	 * update of the gc structure associated with this CPU. This matches
1905  	 * with the release semantics used when setting the cpumask bit in
1906  	 * xfs_inodegc_queue.
1907  	 */
1908  	cpumask_clear_cpu(gc->cpu, &mp->m_inodegc_cpumask);
1909  	smp_mb__after_atomic();
1910  
1911  	WRITE_ONCE(gc->items, 0);
1912  
1913  	if (!node)
1914  		return;
1915  
1916  	/*
1917  	 * We can allocate memory here while doing writeback on behalf of
1918  	 * memory reclaim.  To avoid memory allocation deadlocks set the
1919  	 * task-wide nofs context for the following operations.
1920  	 */
1921  	nofs_flag = memalloc_nofs_save();
1922  
1923  	ip = llist_entry(node, struct xfs_inode, i_gclist);
1924  	trace_xfs_inodegc_worker(mp, READ_ONCE(gc->shrinker_hits));
1925  
1926  	WRITE_ONCE(gc->shrinker_hits, 0);
1927  	llist_for_each_entry_safe(ip, n, node, i_gclist) {
1928  		int	error;
1929  
1930  		xfs_iflags_set(ip, XFS_INACTIVATING);
1931  		error = xfs_inodegc_inactivate(ip);
1932  		if (error && !gc->error)
1933  			gc->error = error;
1934  	}
1935  
1936  	memalloc_nofs_restore(nofs_flag);
1937  }
1938  
1939  /*
1940   * Expedite all pending inodegc work to run immediately. This does not wait for
1941   * completion of the work.
1942   */
1943  void
xfs_inodegc_push(struct xfs_mount * mp)1944  xfs_inodegc_push(
1945  	struct xfs_mount	*mp)
1946  {
1947  	if (!xfs_is_inodegc_enabled(mp))
1948  		return;
1949  	trace_xfs_inodegc_push(mp, __return_address);
1950  	xfs_inodegc_queue_all(mp);
1951  }
1952  
1953  /*
1954   * Force all currently queued inode inactivation work to run immediately and
1955   * wait for the work to finish.
1956   */
1957  int
xfs_inodegc_flush(struct xfs_mount * mp)1958  xfs_inodegc_flush(
1959  	struct xfs_mount	*mp)
1960  {
1961  	xfs_inodegc_push(mp);
1962  	trace_xfs_inodegc_flush(mp, __return_address);
1963  	return xfs_inodegc_wait_all(mp);
1964  }
1965  
1966  /*
1967   * Flush all the pending work and then disable the inode inactivation background
1968   * workers and wait for them to stop.  Caller must hold sb->s_umount to
1969   * coordinate changes in the inodegc_enabled state.
1970   */
1971  void
xfs_inodegc_stop(struct xfs_mount * mp)1972  xfs_inodegc_stop(
1973  	struct xfs_mount	*mp)
1974  {
1975  	bool			rerun;
1976  
1977  	if (!xfs_clear_inodegc_enabled(mp))
1978  		return;
1979  
1980  	/*
1981  	 * Drain all pending inodegc work, including inodes that could be
1982  	 * queued by racing xfs_inodegc_queue or xfs_inodegc_shrinker_scan
1983  	 * threads that sample the inodegc state just prior to us clearing it.
1984  	 * The inodegc flag state prevents new threads from queuing more
1985  	 * inodes, so we queue pending work items and flush the workqueue until
1986  	 * all inodegc lists are empty.  IOWs, we cannot use drain_workqueue
1987  	 * here because it does not allow other unserialized mechanisms to
1988  	 * reschedule inodegc work while this draining is in progress.
1989  	 */
1990  	xfs_inodegc_queue_all(mp);
1991  	do {
1992  		flush_workqueue(mp->m_inodegc_wq);
1993  		rerun = xfs_inodegc_queue_all(mp);
1994  	} while (rerun);
1995  
1996  	trace_xfs_inodegc_stop(mp, __return_address);
1997  }
1998  
1999  /*
2000   * Enable the inode inactivation background workers and schedule deferred inode
2001   * inactivation work if there is any.  Caller must hold sb->s_umount to
2002   * coordinate changes in the inodegc_enabled state.
2003   */
2004  void
xfs_inodegc_start(struct xfs_mount * mp)2005  xfs_inodegc_start(
2006  	struct xfs_mount	*mp)
2007  {
2008  	if (xfs_set_inodegc_enabled(mp))
2009  		return;
2010  
2011  	trace_xfs_inodegc_start(mp, __return_address);
2012  	xfs_inodegc_queue_all(mp);
2013  }
2014  
#ifdef CONFIG_XFS_RT
/*
 * Decide whether a realtime file warrants scheduling the inactivation worker
 * immediately: true only for a realtime inode when the free rt extent counter
 * compares below the XFS_LOWSP_5_PCNT low-space threshold.
 */
static inline bool
xfs_inodegc_want_queue_rt_file(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;

	return XFS_IS_REALTIME_INODE(ip) &&
	       __percpu_counter_compare(&mp->m_frextents,
				mp->m_low_rtexts[XFS_LOWSP_5_PCNT],
				XFS_FDBLOCKS_BATCH) < 0;
}
#else
/* Without realtime support this check never asks for immediate queuing. */
# define xfs_inodegc_want_queue_rt_file(ip)	(false)
#endif /* CONFIG_XFS_RT */
2035  
2036  /*
2037   * Schedule the inactivation worker when:
2038   *
2039   *  - We've accumulated more than one inode cluster buffer's worth of inodes.
2040   *  - There is less than 5% free space left.
2041   *  - Any of the quotas for this inode are near an enforcement limit.
2042   */
2043  static inline bool
xfs_inodegc_want_queue_work(struct xfs_inode * ip,unsigned int items)2044  xfs_inodegc_want_queue_work(
2045  	struct xfs_inode	*ip,
2046  	unsigned int		items)
2047  {
2048  	struct xfs_mount	*mp = ip->i_mount;
2049  
2050  	if (items > mp->m_ino_geo.inodes_per_cluster)
2051  		return true;
2052  
2053  	if (__percpu_counter_compare(&mp->m_fdblocks,
2054  				mp->m_low_space[XFS_LOWSP_5_PCNT],
2055  				XFS_FDBLOCKS_BATCH) < 0)
2056  		return true;
2057  
2058  	if (xfs_inodegc_want_queue_rt_file(ip))
2059  		return true;
2060  
2061  	if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_USER))
2062  		return true;
2063  
2064  	if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_GROUP))
2065  		return true;
2066  
2067  	if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_PROJ))
2068  		return true;
2069  
2070  	return false;
2071  }
2072  
2073  /*
2074   * Upper bound on the number of inodes in each AG that can be queued for
2075   * inactivation at any given time, to avoid monopolizing the workqueue.
2076   */
2077  #define XFS_INODEGC_MAX_BACKLOG		(4 * XFS_INODES_PER_CHUNK)
2078  
2079  /*
2080   * Make the frontend wait for inactivations when:
2081   *
2082   *  - Memory shrinkers queued the inactivation worker and it hasn't finished.
2083   *  - The queue depth exceeds the maximum allowable percpu backlog.
2084   *
2085   * Note: If we are in a NOFS context here (e.g. current thread is running a
2086   * transaction) the we don't want to block here as inodegc progress may require
2087   * filesystem resources we hold to make progress and that could result in a
2088   * deadlock. Hence we skip out of here if we are in a scoped NOFS context.
2089   */
2090  static inline bool
xfs_inodegc_want_flush_work(struct xfs_inode * ip,unsigned int items,unsigned int shrinker_hits)2091  xfs_inodegc_want_flush_work(
2092  	struct xfs_inode	*ip,
2093  	unsigned int		items,
2094  	unsigned int		shrinker_hits)
2095  {
2096  	if (current->flags & PF_MEMALLOC_NOFS)
2097  		return false;
2098  
2099  	if (shrinker_hits > 0)
2100  		return true;
2101  
2102  	if (items > XFS_INODEGC_MAX_BACKLOG)
2103  		return true;
2104  
2105  	return false;
2106  }
2107  
2108  /*
2109   * Queue a background inactivation worker if there are inodes that need to be
2110   * inactivated and higher level xfs code hasn't disabled the background
2111   * workers.
2112   */
2113  static void
xfs_inodegc_queue(struct xfs_inode * ip)2114  xfs_inodegc_queue(
2115  	struct xfs_inode	*ip)
2116  {
2117  	struct xfs_mount	*mp = ip->i_mount;
2118  	struct xfs_inodegc	*gc;
2119  	int			items;
2120  	unsigned int		shrinker_hits;
2121  	unsigned int		cpu_nr;
2122  	unsigned long		queue_delay = 1;
2123  
2124  	trace_xfs_inode_set_need_inactive(ip);
2125  	spin_lock(&ip->i_flags_lock);
2126  	ip->i_flags |= XFS_NEED_INACTIVE;
2127  	spin_unlock(&ip->i_flags_lock);
2128  
2129  	cpu_nr = get_cpu();
2130  	gc = this_cpu_ptr(mp->m_inodegc);
2131  	llist_add(&ip->i_gclist, &gc->list);
2132  	items = READ_ONCE(gc->items);
2133  	WRITE_ONCE(gc->items, items + 1);
2134  	shrinker_hits = READ_ONCE(gc->shrinker_hits);
2135  
2136  	/*
2137  	 * Ensure the list add is always seen by anyone who finds the cpumask
2138  	 * bit set. This effectively gives the cpumask bit set operation
2139  	 * release ordering semantics.
2140  	 */
2141  	smp_mb__before_atomic();
2142  	if (!cpumask_test_cpu(cpu_nr, &mp->m_inodegc_cpumask))
2143  		cpumask_test_and_set_cpu(cpu_nr, &mp->m_inodegc_cpumask);
2144  
2145  	/*
2146  	 * We queue the work while holding the current CPU so that the work
2147  	 * is scheduled to run on this CPU.
2148  	 */
2149  	if (!xfs_is_inodegc_enabled(mp)) {
2150  		put_cpu();
2151  		return;
2152  	}
2153  
2154  	if (xfs_inodegc_want_queue_work(ip, items))
2155  		queue_delay = 0;
2156  
2157  	trace_xfs_inodegc_queue(mp, __return_address);
2158  	mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
2159  			queue_delay);
2160  	put_cpu();
2161  
2162  	if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) {
2163  		trace_xfs_inodegc_throttle(mp, __return_address);
2164  		flush_delayed_work(&gc->work);
2165  	}
2166  }
2167  
2168  /*
2169   * We set the inode flag atomically with the radix tree tag.  Once we get tag
2170   * lookups on the radix tree, this inode flag can go away.
2171   *
2172   * We always use background reclaim here because even if the inode is clean, it
2173   * still may be under IO and hence we have wait for IO completion to occur
2174   * before we can reclaim the inode. The background reclaim path handles this
2175   * more efficiently than we can here, so simply let background reclaim tear down
2176   * all inodes.
2177   */
2178  void
xfs_inode_mark_reclaimable(struct xfs_inode * ip)2179  xfs_inode_mark_reclaimable(
2180  	struct xfs_inode	*ip)
2181  {
2182  	struct xfs_mount	*mp = ip->i_mount;
2183  	bool			need_inactive;
2184  
2185  	XFS_STATS_INC(mp, vn_reclaim);
2186  
2187  	/*
2188  	 * We should never get here with any of the reclaim flags already set.
2189  	 */
2190  	ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_ALL_IRECLAIM_FLAGS));
2191  
2192  	need_inactive = xfs_inode_needs_inactive(ip);
2193  	if (need_inactive) {
2194  		xfs_inodegc_queue(ip);
2195  		return;
2196  	}
2197  
2198  	/* Going straight to reclaim, so drop the dquots. */
2199  	xfs_qm_dqdetach(ip);
2200  	xfs_inodegc_set_reclaimable(ip);
2201  }
2202  
2203  /*
2204   * Register a phony shrinker so that we can run background inodegc sooner when
2205   * there's memory pressure.  Inactivation does not itself free any memory but
2206   * it does make inodes reclaimable, which eventually frees memory.
2207   *
2208   * The count function, seek value, and batch value are crafted to trigger the
2209   * scan function during the second round of scanning.  Hopefully this means
2210   * that we reclaimed enough memory that initiating metadata transactions won't
2211   * make things worse.
2212   */
2213  #define XFS_INODEGC_SHRINKER_COUNT	(1UL << DEF_PRIORITY)
2214  #define XFS_INODEGC_SHRINKER_BATCH	((XFS_INODEGC_SHRINKER_COUNT / 2) + 1)
2215  
2216  static unsigned long
xfs_inodegc_shrinker_count(struct shrinker * shrink,struct shrink_control * sc)2217  xfs_inodegc_shrinker_count(
2218  	struct shrinker		*shrink,
2219  	struct shrink_control	*sc)
2220  {
2221  	struct xfs_mount	*mp = shrink->private_data;
2222  	struct xfs_inodegc	*gc;
2223  	int			cpu;
2224  
2225  	if (!xfs_is_inodegc_enabled(mp))
2226  		return 0;
2227  
2228  	for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
2229  		gc = per_cpu_ptr(mp->m_inodegc, cpu);
2230  		if (!llist_empty(&gc->list))
2231  			return XFS_INODEGC_SHRINKER_COUNT;
2232  	}
2233  
2234  	return 0;
2235  }
2236  
2237  static unsigned long
xfs_inodegc_shrinker_scan(struct shrinker * shrink,struct shrink_control * sc)2238  xfs_inodegc_shrinker_scan(
2239  	struct shrinker		*shrink,
2240  	struct shrink_control	*sc)
2241  {
2242  	struct xfs_mount	*mp = shrink->private_data;
2243  	struct xfs_inodegc	*gc;
2244  	int			cpu;
2245  	bool			no_items = true;
2246  
2247  	if (!xfs_is_inodegc_enabled(mp))
2248  		return SHRINK_STOP;
2249  
2250  	trace_xfs_inodegc_shrinker_scan(mp, sc, __return_address);
2251  
2252  	for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
2253  		gc = per_cpu_ptr(mp->m_inodegc, cpu);
2254  		if (!llist_empty(&gc->list)) {
2255  			unsigned int	h = READ_ONCE(gc->shrinker_hits);
2256  
2257  			WRITE_ONCE(gc->shrinker_hits, h + 1);
2258  			mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
2259  			no_items = false;
2260  		}
2261  	}
2262  
2263  	/*
2264  	 * If there are no inodes to inactivate, we don't want the shrinker
2265  	 * to think there's deferred work to call us back about.
2266  	 */
2267  	if (no_items)
2268  		return LONG_MAX;
2269  
2270  	return SHRINK_STOP;
2271  }
2272  
2273  /* Register a shrinker so we can accelerate inodegc and throttle queuing. */
2274  int
xfs_inodegc_register_shrinker(struct xfs_mount * mp)2275  xfs_inodegc_register_shrinker(
2276  	struct xfs_mount	*mp)
2277  {
2278  	mp->m_inodegc_shrinker = shrinker_alloc(SHRINKER_NONSLAB,
2279  						"xfs-inodegc:%s",
2280  						mp->m_super->s_id);
2281  	if (!mp->m_inodegc_shrinker)
2282  		return -ENOMEM;
2283  
2284  	mp->m_inodegc_shrinker->count_objects = xfs_inodegc_shrinker_count;
2285  	mp->m_inodegc_shrinker->scan_objects = xfs_inodegc_shrinker_scan;
2286  	mp->m_inodegc_shrinker->seeks = 0;
2287  	mp->m_inodegc_shrinker->batch = XFS_INODEGC_SHRINKER_BATCH;
2288  	mp->m_inodegc_shrinker->private_data = mp;
2289  
2290  	shrinker_register(mp->m_inodegc_shrinker);
2291  
2292  	return 0;
2293  }
2294