1  // SPDX-License-Identifier: GPL-2.0-or-later
2  /*
3   * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
4   * Author: Darrick J. Wong <djwong@kernel.org>
5   */
6  #include "xfs.h"
7  #include "xfs_fs.h"
8  #include "xfs_shared.h"
9  #include "xfs_format.h"
10  #include "xfs_trans_resv.h"
11  #include "xfs_mount.h"
12  #include "xfs_btree.h"
13  #include "xfs_log_format.h"
14  #include "xfs_trans.h"
15  #include "xfs_inode.h"
16  #include "xfs_icache.h"
17  #include "xfs_alloc.h"
18  #include "xfs_alloc_btree.h"
19  #include "xfs_ialloc.h"
20  #include "xfs_ialloc_btree.h"
21  #include "xfs_refcount_btree.h"
22  #include "xfs_rmap.h"
23  #include "xfs_rmap_btree.h"
24  #include "xfs_log.h"
25  #include "xfs_trans_priv.h"
26  #include "xfs_da_format.h"
27  #include "xfs_da_btree.h"
28  #include "xfs_dir2_priv.h"
29  #include "xfs_dir2.h"
30  #include "xfs_attr.h"
31  #include "xfs_reflink.h"
32  #include "xfs_ag.h"
33  #include "xfs_error.h"
34  #include "xfs_quota.h"
35  #include "xfs_exchmaps.h"
36  #include "xfs_rtbitmap.h"
37  #include "scrub/scrub.h"
38  #include "scrub/common.h"
39  #include "scrub/trace.h"
40  #include "scrub/repair.h"
41  #include "scrub/health.h"
42  
43  /* Common code for the metadata scrubbers. */
44  
45  /*
46   * Handling operational errors.
47   *
48   * The *_process_error() family of functions are used to process error return
49   * codes from functions called as part of a scrub operation.
50   *
51   * If there's no error, we return true to tell the caller that it's ok
52   * to move on to the next check in its list.
53   *
54   * For non-verifier errors (e.g. ENOMEM) we return false to tell the
55   * caller that something bad happened, and we preserve *error so that
56   * the caller can return the *error up the stack to userspace.
57   *
58   * Verifier errors (EFSBADCRC/EFSCORRUPTED) are recorded by setting
59   * OFLAG_CORRUPT in sm_flags and the *error is cleared.  In other words,
60   * we track verifier errors (and failed scrub checks) via OFLAG_CORRUPT,
61   * not via return codes.  We return false to tell the caller that
62   * something bad happened.  Since the error has been cleared, the caller
63   * will (presumably) return that zero and scrubbing will move on to
64   * whatever's next.
65   *
66   * ftrace can be used to record the precise metadata location and the
67   * approximate code location of the failed operation.
68   */
69  
70  /* Check for operational errors. */
71  static bool
72  __xchk_process_error(
73  	struct xfs_scrub	*sc,
74  	xfs_agnumber_t		agno,
75  	xfs_agblock_t		bno,
76  	int			*error,
77  	__u32			errflag,
78  	void			*ret_ip)
79  {
80  	switch (*error) {
81  	case 0:
82  		return true;
83  	case -EDEADLOCK:
84  	case -ECHRNG:
85  		/* Used to restart an op with deadlock avoidance. */
86  		trace_xchk_deadlock_retry(
87  				sc->ip ? sc->ip : XFS_I(file_inode(sc->file)),
88  				sc->sm, *error);
89  		break;
90  	case -ECANCELED:
91  		/*
92  		 * ECANCELED here means that the caller set one of the scrub
93  		 * outcome flags (corrupt, xfail, xcorrupt) and wants to exit
94  		 * quickly.  Set error to zero and do not continue.
95  		 */
96  		trace_xchk_op_error(sc, agno, bno, *error, ret_ip);
97  		*error = 0;
98  		break;
99  	case -EFSBADCRC:
100  	case -EFSCORRUPTED:
101  		/* Note the badness but don't abort. */
102  		sc->sm->sm_flags |= errflag;
103  		*error = 0;
104  		fallthrough;
105  	default:
106  		trace_xchk_op_error(sc, agno, bno, *error, ret_ip);
107  		break;
108  	}
109  	return false;
110  }
111  
112  bool
113  xchk_process_error(
114  	struct xfs_scrub	*sc,
115  	xfs_agnumber_t		agno,
116  	xfs_agblock_t		bno,
117  	int			*error)
118  {
119  	return __xchk_process_error(sc, agno, bno, error,
120  			XFS_SCRUB_OFLAG_CORRUPT, __return_address);
121  }
122  
123  bool
124  xchk_xref_process_error(
125  	struct xfs_scrub	*sc,
126  	xfs_agnumber_t		agno,
127  	xfs_agblock_t		bno,
128  	int			*error)
129  {
130  	return __xchk_process_error(sc, agno, bno, error,
131  			XFS_SCRUB_OFLAG_XFAIL, __return_address);
132  }
133  
134  /* Check for operational errors for a file offset. */
135  static bool
136  __xchk_fblock_process_error(
137  	struct xfs_scrub	*sc,
138  	int			whichfork,
139  	xfs_fileoff_t		offset,
140  	int			*error,
141  	__u32			errflag,
142  	void			*ret_ip)
143  {
144  	switch (*error) {
145  	case 0:
146  		return true;
147  	case -EDEADLOCK:
148  	case -ECHRNG:
149  		/* Used to restart an op with deadlock avoidance. */
150  		trace_xchk_deadlock_retry(sc->ip, sc->sm, *error);
151  		break;
152  	case -ECANCELED:
153  		/*
154  		 * ECANCELED here means that the caller set one of the scrub
155  		 * outcome flags (corrupt, xfail, xcorrupt) and wants to exit
156  		 * quickly.  Set error to zero and do not continue.
157  		 */
158  		trace_xchk_file_op_error(sc, whichfork, offset, *error,
159  				ret_ip);
160  		*error = 0;
161  		break;
162  	case -EFSBADCRC:
163  	case -EFSCORRUPTED:
164  		/* Note the badness but don't abort. */
165  		sc->sm->sm_flags |= errflag;
166  		*error = 0;
167  		fallthrough;
168  	default:
169  		trace_xchk_file_op_error(sc, whichfork, offset, *error,
170  				ret_ip);
171  		break;
172  	}
173  	return false;
174  }
175  
176  bool
177  xchk_fblock_process_error(
178  	struct xfs_scrub	*sc,
179  	int			whichfork,
180  	xfs_fileoff_t		offset,
181  	int			*error)
182  {
183  	return __xchk_fblock_process_error(sc, whichfork, offset, error,
184  			XFS_SCRUB_OFLAG_CORRUPT, __return_address);
185  }
186  
187  bool
188  xchk_fblock_xref_process_error(
189  	struct xfs_scrub	*sc,
190  	int			whichfork,
191  	xfs_fileoff_t		offset,
192  	int			*error)
193  {
194  	return __xchk_fblock_process_error(sc, whichfork, offset, error,
195  			XFS_SCRUB_OFLAG_XFAIL, __return_address);
196  }
197  
198  /*
199   * Handling scrub corruption/optimization/warning checks.
200   *
201   * The *_set_{corrupt,preen,warning}() family of functions are used to
202   * record the presence of metadata that is incorrect (corrupt), could be
203   * optimized somehow (preen), or should be flagged for administrative
204   * review but is not incorrect (warn).
205   *
206   * ftrace can be used to record the precise metadata location and
207   * approximate code location of the failed check.
208   */
209  
210  /* Record a block which could be optimized. */
211  void
212  xchk_block_set_preen(
213  	struct xfs_scrub	*sc,
214  	struct xfs_buf		*bp)
215  {
216  	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN;
217  	trace_xchk_block_preen(sc, xfs_buf_daddr(bp), __return_address);
218  }
219  
220  /*
221   * Record an inode which could be optimized.  No buffer is passed in,
222   * so the trace data will use the block location of the inode record
223   * itself.
224   */
225  void
226  xchk_ino_set_preen(
227  	struct xfs_scrub	*sc,
228  	xfs_ino_t		ino)
229  {
230  	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN;
231  	trace_xchk_ino_preen(sc, ino, __return_address);
232  }
233  
234  /* Record something being wrong with the filesystem primary superblock. */
235  void
236  xchk_set_corrupt(
237  	struct xfs_scrub	*sc)
238  {
239  	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
240  	trace_xchk_fs_error(sc, 0, __return_address);
241  }
242  
243  /* Record a corrupt block. */
244  void
245  xchk_block_set_corrupt(
246  	struct xfs_scrub	*sc,
247  	struct xfs_buf		*bp)
248  {
249  	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
250  	trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address);
251  }
252  
253  #ifdef CONFIG_XFS_QUOTA
254  /* Record a corrupt quota counter. */
255  void
256  xchk_qcheck_set_corrupt(
257  	struct xfs_scrub	*sc,
258  	unsigned int		dqtype,
259  	xfs_dqid_t		id)
260  {
261  	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
262  	trace_xchk_qcheck_error(sc, dqtype, id, __return_address);
263  }
264  #endif
265  
266  /* Record a corruption while cross-referencing. */
267  void
268  xchk_block_xref_set_corrupt(
269  	struct xfs_scrub	*sc,
270  	struct xfs_buf		*bp)
271  {
272  	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
273  	trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address);
274  }
275  
276  /*
277   * Record a corrupt inode.  No buffer is passed in here, so the trace
278   * data will use the block location of the inode record itself rather
279   * than that of a specific metadata buffer.
280   */
281  void
282  xchk_ino_set_corrupt(
283  	struct xfs_scrub	*sc,
284  	xfs_ino_t		ino)
285  {
286  	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
287  	trace_xchk_ino_error(sc, ino, __return_address);
288  }
289  
290  /* Record a corruption while cross-referencing with an inode. */
291  void
292  xchk_ino_xref_set_corrupt(
293  	struct xfs_scrub	*sc,
294  	xfs_ino_t		ino)
295  {
296  	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
297  	trace_xchk_ino_error(sc, ino, __return_address);
298  }
299  
300  /* Record corruption in a block indexed by a file fork. */
301  void
302  xchk_fblock_set_corrupt(
303  	struct xfs_scrub	*sc,
304  	int			whichfork,
305  	xfs_fileoff_t		offset)
306  {
307  	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
308  	trace_xchk_fblock_error(sc, whichfork, offset, __return_address);
309  }
310  
311  /* Record a corruption while cross-referencing a fork block. */
312  void
313  xchk_fblock_xref_set_corrupt(
314  	struct xfs_scrub	*sc,
315  	int			whichfork,
316  	xfs_fileoff_t		offset)
317  {
318  	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
319  	trace_xchk_fblock_error(sc, whichfork, offset, __return_address);
320  }
321  
322  /*
323   * Warn about inodes that need administrative review but are not
324   * incorrect.
325   */
326  void
327  xchk_ino_set_warning(
328  	struct xfs_scrub	*sc,
329  	xfs_ino_t		ino)
330  {
331  	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING;
332  	trace_xchk_ino_warning(sc, ino, __return_address);
333  }
334  
335  /* Warn about a block indexed by a file fork that needs review. */
336  void
337  xchk_fblock_set_warning(
338  	struct xfs_scrub	*sc,
339  	int			whichfork,
340  	xfs_fileoff_t		offset)
341  {
342  	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING;
343  	trace_xchk_fblock_warning(sc, whichfork, offset, __return_address);
344  }
345  
346  /* Signal an incomplete scrub. */
347  void
348  xchk_set_incomplete(
349  	struct xfs_scrub	*sc)
350  {
351  	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_INCOMPLETE;
352  	trace_xchk_incomplete(sc, __return_address);
353  }
354  
355  /*
356   * rmap scrubbing -- compute the number of blocks with a given owner,
357   * at least according to the reverse mapping data.
358   */
359  
360  struct xchk_rmap_ownedby_info {
361  	const struct xfs_owner_info	*oinfo;
362  	xfs_filblks_t			*blocks;
363  };
364  
365  STATIC int
366  xchk_count_rmap_ownedby_irec(
367  	struct xfs_btree_cur		*cur,
368  	const struct xfs_rmap_irec	*rec,
369  	void				*priv)
370  {
371  	struct xchk_rmap_ownedby_info	*sroi = priv;
372  	bool				irec_attr;
373  	bool				oinfo_attr;
374  
375  	irec_attr = rec->rm_flags & XFS_RMAP_ATTR_FORK;
376  	oinfo_attr = sroi->oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK;
377  
378  	if (rec->rm_owner != sroi->oinfo->oi_owner)
379  		return 0;
380  
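	/*
	 * Count this record if the owner isn't an inode (so forks don't
	 * apply) or if the record's attr/data fork matches the fork named
	 * in the owner info.
	 */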
381  	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) || irec_attr == oinfo_attr)
382  		(*sroi->blocks) += rec->rm_blockcount;
383  
384  	return 0;
385  }
386  
387  /*
388   * Calculate the number of blocks the rmap thinks are owned by something.
389   * The caller should pass us an rmapbt cursor.
390   */
391  int
392  xchk_count_rmap_ownedby_ag(
393  	struct xfs_scrub		*sc,
394  	struct xfs_btree_cur		*cur,
395  	const struct xfs_owner_info	*oinfo,
396  	xfs_filblks_t			*blocks)
397  {
398  	struct xchk_rmap_ownedby_info	sroi = {
399  		.oinfo			= oinfo,
400  		.blocks			= blocks,
401  	};
402  
403  	*blocks = 0;
404  	return xfs_rmap_query_all(cur, xchk_count_rmap_ownedby_irec,
405  			&sroi);
406  }
407  
408  /*
409   * AG scrubbing
410   *
411   * These helpers facilitate locking an allocation group's header
412   * buffers, setting up cursors for all btrees that are present, and
413   * cleaning everything up once we're through.
414   */
415  
416  /* Decide if we want to return an AG header read failure. */
417  static inline bool
418  want_ag_read_header_failure(
419  	struct xfs_scrub	*sc,
420  	unsigned int		type)
421  {
422  	/* Return all AG header read failures when scanning btrees. */
423  	if (sc->sm->sm_type != XFS_SCRUB_TYPE_AGF &&
424  	    sc->sm->sm_type != XFS_SCRUB_TYPE_AGFL &&
425  	    sc->sm->sm_type != XFS_SCRUB_TYPE_AGI)
426  		return true;
427  	/*
428  	 * If we're scanning a given type of AG header, we only want to
429  	 * see read failures from that specific header.  We'd like the
430  	 * other headers to cross-check them, but this isn't required.
431  	 */
432  	if (sc->sm->sm_type == type)
433  		return true;
434  	return false;
435  }
436  
437  /*
438   * Grab the AG header buffers for the attached perag structure.
439   *
440   * The headers should be released by xchk_ag_free, but as a failsafe we attach
441   * all the buffers we grab to the scrub transaction so they'll all be freed
442   * when we cancel it.
443   */
444  static inline int
445  xchk_perag_read_headers(
446  	struct xfs_scrub	*sc,
447  	struct xchk_ag		*sa)
448  {
449  	int			error;
450  
451  	error = xfs_ialloc_read_agi(sa->pag, sc->tp, 0, &sa->agi_bp);
452  	if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI))
453  		return error;
454  
455  	error = xfs_alloc_read_agf(sa->pag, sc->tp, 0, &sa->agf_bp);
456  	if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF))
457  		return error;
458  
459  	return 0;
460  }
461  
462  /*
463   * Grab the AG headers for the attached perag structure and wait for pending
464   * intents to drain.
465   */
466  int
467  xchk_perag_drain_and_lock(
468  	struct xfs_scrub	*sc)
469  {
470  	struct xchk_ag		*sa = &sc->sa;
471  	int			error = 0;
472  
473  	ASSERT(sa->pag != NULL);
474  	ASSERT(sa->agi_bp == NULL);
475  	ASSERT(sa->agf_bp == NULL);
476  
477  	do {
478  		if (xchk_should_terminate(sc, &error))
479  			return error;
480  
481  		error = xchk_perag_read_headers(sc, sa);
482  		if (error)
483  			return error;
484  
485  		/*
486  		 * If we've grabbed an inode for scrubbing then we assume that
487  		 * holding its ILOCK will suffice to coordinate with any intent
488  		 * chains involving this inode.
489  		 */
490  		if (sc->ip)
491  			return 0;
492  
493  		/*
494  		 * Decide if this AG is quiet enough for all metadata to be
495  		 * consistent with each other.  XFS allows the AG header buffer
496  		 * locks to cycle across transaction rolls while processing
497  		 * chains of deferred ops, which means that there could be
498  		 * other threads in the middle of processing a chain of
499  		 * deferred ops.  For regular operations we are careful about
500  		 * ordering operations to prevent collisions between threads
501  		 * (which is why we don't need a per-AG lock), but scrub and
502  		 * repair have to serialize against chained operations.
503  		 *
504  		 * We just locked all the AG header buffers; now take a look
505  		 * to see if there are any intents in progress.  If there are,
506  		 * drop the AG headers and wait for the intents to drain.
507  		 * Since we hold all the AG header locks for the duration of
508  		 * the scrub, this is the only time we have to sample the
509  		 * intents counter; any threads increasing it after this point
510  		 * can't possibly be in the middle of a chain of AG metadata
511  		 * updates.
512  		 *
513  		 * Obviously, this should be slanted against scrub and in favor
514  		 * of runtime threads.
515  		 */
516  		if (!xfs_perag_intent_busy(sa->pag))
517  			return 0;
518  
519  		if (sa->agf_bp) {
520  			xfs_trans_brelse(sc->tp, sa->agf_bp);
521  			sa->agf_bp = NULL;
522  		}
523  
524  		if (sa->agi_bp) {
525  			xfs_trans_brelse(sc->tp, sa->agi_bp);
526  			sa->agi_bp = NULL;
527  		}
528  
529  		if (!(sc->flags & XCHK_FSGATES_DRAIN))
530  			return -ECHRNG;
531  		error = xfs_perag_intent_drain(sa->pag);
532  		if (error == -ERESTARTSYS)
533  			error = -EINTR;
534  	} while (!error);
535  
536  	return error;
537  }
538  
539  /*
540   * Grab the per-AG structure, grab all AG header buffers, and wait until there
541   * aren't any pending intents.  Returns -ENOENT if we can't grab the perag
542   * structure.
543   */
544  int
545  xchk_ag_read_headers(
546  	struct xfs_scrub	*sc,
547  	xfs_agnumber_t		agno,
548  	struct xchk_ag		*sa)
549  {
550  	struct xfs_mount	*mp = sc->mp;
551  
552  	ASSERT(!sa->pag);
553  	sa->pag = xfs_perag_get(mp, agno);
554  	if (!sa->pag)
555  		return -ENOENT;
556  
557  	return xchk_perag_drain_and_lock(sc);
558  }
559  
560  /* Release all the AG btree cursors. */
561  void
562  xchk_ag_btcur_free(
563  	struct xchk_ag		*sa)
564  {
565  	if (sa->refc_cur)
566  		xfs_btree_del_cursor(sa->refc_cur, XFS_BTREE_ERROR);
567  	if (sa->rmap_cur)
568  		xfs_btree_del_cursor(sa->rmap_cur, XFS_BTREE_ERROR);
569  	if (sa->fino_cur)
570  		xfs_btree_del_cursor(sa->fino_cur, XFS_BTREE_ERROR);
571  	if (sa->ino_cur)
572  		xfs_btree_del_cursor(sa->ino_cur, XFS_BTREE_ERROR);
573  	if (sa->cnt_cur)
574  		xfs_btree_del_cursor(sa->cnt_cur, XFS_BTREE_ERROR);
575  	if (sa->bno_cur)
576  		xfs_btree_del_cursor(sa->bno_cur, XFS_BTREE_ERROR);
577  
578  	sa->refc_cur = NULL;
579  	sa->rmap_cur = NULL;
580  	sa->fino_cur = NULL;
581  	sa->ino_cur = NULL;
582  	sa->bno_cur = NULL;
583  	sa->cnt_cur = NULL;
584  }
585  
586  /* Initialize all the btree cursors for an AG. */
587  void
588  xchk_ag_btcur_init(
589  	struct xfs_scrub	*sc,
590  	struct xchk_ag		*sa)
591  {
592  	struct xfs_mount	*mp = sc->mp;
593  
594  	if (sa->agf_bp) {
595  		/* Set up a bnobt cursor for cross-referencing. */
596  		sa->bno_cur = xfs_bnobt_init_cursor(mp, sc->tp, sa->agf_bp,
597  				sa->pag);
598  		xchk_ag_btree_del_cursor_if_sick(sc, &sa->bno_cur,
599  				XFS_SCRUB_TYPE_BNOBT);
600  
601  		/* Set up a cntbt cursor for cross-referencing. */
602  		sa->cnt_cur = xfs_cntbt_init_cursor(mp, sc->tp, sa->agf_bp,
603  				sa->pag);
604  		xchk_ag_btree_del_cursor_if_sick(sc, &sa->cnt_cur,
605  				XFS_SCRUB_TYPE_CNTBT);
606  
607  		/* Set up a rmapbt cursor for cross-referencing. */
608  		if (xfs_has_rmapbt(mp)) {
609  			sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp,
610  					sa->agf_bp, sa->pag);
611  			xchk_ag_btree_del_cursor_if_sick(sc, &sa->rmap_cur,
612  					XFS_SCRUB_TYPE_RMAPBT);
613  		}
614  
615  		/* Set up a refcountbt cursor for cross-referencing. */
616  		if (xfs_has_reflink(mp)) {
617  			sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
618  					sa->agf_bp, sa->pag);
619  			xchk_ag_btree_del_cursor_if_sick(sc, &sa->refc_cur,
620  					XFS_SCRUB_TYPE_REFCNTBT);
621  		}
622  	}
623  
624  	if (sa->agi_bp) {
625  		/* Set up an inobt cursor for cross-referencing. */
626  		sa->ino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp,
627  				sa->agi_bp);
628  		xchk_ag_btree_del_cursor_if_sick(sc, &sa->ino_cur,
629  				XFS_SCRUB_TYPE_INOBT);
630  
631  		/* Set up a finobt cursor for cross-referencing. */
632  		if (xfs_has_finobt(mp)) {
633  			sa->fino_cur = xfs_finobt_init_cursor(sa->pag, sc->tp,
634  					sa->agi_bp);
635  			xchk_ag_btree_del_cursor_if_sick(sc, &sa->fino_cur,
636  					XFS_SCRUB_TYPE_FINOBT);
637  		}
638  	}
639  }
640  
641  /* Release the AG header context and btree cursors. */
642  void
643  xchk_ag_free(
644  	struct xfs_scrub	*sc,
645  	struct xchk_ag		*sa)
646  {
647  	xchk_ag_btcur_free(sa);
648  	xrep_reset_perag_resv(sc);
649  	if (sa->agf_bp) {
650  		xfs_trans_brelse(sc->tp, sa->agf_bp);
651  		sa->agf_bp = NULL;
652  	}
653  	if (sa->agi_bp) {
654  		xfs_trans_brelse(sc->tp, sa->agi_bp);
655  		sa->agi_bp = NULL;
656  	}
657  	if (sa->pag) {
658  		xfs_perag_put(sa->pag);
659  		sa->pag = NULL;
660  	}
661  }
662  
663  /*
664   * For scrub, grab the perag structure, the AGI, and the AGF headers, in that
665   * order.  Locking order requires us to get the AGI before the AGF.  We use the
666   * transaction to avoid deadlocking on crosslinked metadata buffers; either the
667   * caller passes one in (bmap scrub) or we have to create a transaction
668   * ourselves.  Returns -ENOENT if the perag struct cannot be grabbed.
669   */
670  int
671  xchk_ag_init(
672  	struct xfs_scrub	*sc,
673  	xfs_agnumber_t		agno,
674  	struct xchk_ag		*sa)
675  {
676  	int			error;
677  
678  	error = xchk_ag_read_headers(sc, agno, sa);
679  	if (error)
680  		return error;
681  
682  	xchk_ag_btcur_init(sc, sa);
683  	return 0;
684  }
685  
686  /* Per-scrubber setup functions */
687  
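/* Cancel the scrub transaction and forget about it. */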
688  void
689  xchk_trans_cancel(
690  	struct xfs_scrub	*sc)
691  {
692  	xfs_trans_cancel(sc->tp);
693  	sc->tp = NULL;
694  }
695  
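/* Allocate an empty transaction with no block or log reservation. */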
696  int
697  xchk_trans_alloc_empty(
698  	struct xfs_scrub	*sc)
699  {
700  	return xfs_trans_alloc_empty(sc->mp, &sc->tp);
701  }
702  
703  /*
704   * Grab an empty transaction so that we can re-grab locked buffers if
705   * one of our btrees turns out to be cyclic.
706   *
707   * If we're going to repair something, we need to ask for the largest possible
708   * log reservation so that we can handle the worst case scenario for metadata
709   * updates while rebuilding a metadata item.  We also need to reserve as many
710   * blocks in the head transaction as we think we're going to need to rebuild
711   * the metadata object.
712   */
713  int
714  xchk_trans_alloc(
715  	struct xfs_scrub	*sc,
716  	uint			resblks)
717  {
718  	if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
719  		return xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate,
720  				resblks, 0, 0, &sc->tp);
721  
722  	return xchk_trans_alloc_empty(sc);
723  }
724  
725  /* Set us up with a transaction and an empty context. */
726  int
727  xchk_setup_fs(
728  	struct xfs_scrub	*sc)
729  {
730  	uint			resblks;
731  
732  	resblks = xrep_calc_ag_resblks(sc);
733  	return xchk_trans_alloc(sc, resblks);
734  }
735  
736  /* Set us up with AG headers and btree cursors. */
737  int
738  xchk_setup_ag_btree(
739  	struct xfs_scrub	*sc,
740  	bool			force_log)
741  {
742  	struct xfs_mount	*mp = sc->mp;
743  	int			error;
744  
745  	/*
746  	 * If the caller asks us to checkpoint the log, do so.  This
747  	 * expensive operation should be performed infrequently and only
748  	 * as a last resort.  Any caller that sets force_log should
749  	 * document why they need to do so.
750  	 */
751  	if (force_log) {
752  		error = xchk_checkpoint_log(mp);
753  		if (error)
754  			return error;
755  	}
756  
757  	error = xchk_setup_fs(sc);
758  	if (error)
759  		return error;
760  
761  	return xchk_ag_init(sc, sc->sm->sm_agno, &sc->sa);
762  }
763  
764  /* Push everything out of the log onto disk. */
765  int
766  xchk_checkpoint_log(
767  	struct xfs_mount	*mp)
768  {
769  	int			error;
770  
771  	error = xfs_log_force(mp, XFS_LOG_SYNC);
772  	if (error)
773  		return error;
774  	xfs_ail_push_all_sync(mp->m_ail);
775  	return 0;
776  }
777  
778  /* Verify that an inode is allocated ondisk, then return its cached inode. */
779  int
780  xchk_iget(
781  	struct xfs_scrub	*sc,
782  	xfs_ino_t		inum,
783  	struct xfs_inode	**ipp)
784  {
785  	ASSERT(sc->tp != NULL);
786  
787  	return xfs_iget(sc->mp, sc->tp, inum, XCHK_IGET_FLAGS, 0, ipp);
788  }
789  
790  /*
791   * Try to grab an inode in a manner that avoids races with physical inode
792   * allocation.  If we can't, return the locked AGI buffer so that the caller
793   * can single-step the loading process to see where things went wrong.
794   * Callers must have a valid scrub transaction.
795   *
796   * If the iget succeeds, return 0, a NULL AGI, and the inode.
797   *
798   * If the iget fails, return the error, the locked AGI, and a NULL inode.  This
799   * can include -EINVAL and -ENOENT for invalid inode numbers or inodes that are
800   * no longer allocated; or any other corruption or runtime error.
801   *
802   * If the AGI read fails, return the error, a NULL AGI, and NULL inode.
803   *
804   * If a fatal signal is pending, return -EINTR, a NULL AGI, and a NULL inode.
805   */
806  int
807  xchk_iget_agi(
808  	struct xfs_scrub	*sc,
809  	xfs_ino_t		inum,
810  	struct xfs_buf		**agi_bpp,
811  	struct xfs_inode	**ipp)
812  {
813  	struct xfs_mount	*mp = sc->mp;
814  	struct xfs_trans	*tp = sc->tp;
815  	struct xfs_perag	*pag;
816  	int			error;
817  
818  	ASSERT(sc->tp != NULL);
819  
820  again:
821  	*agi_bpp = NULL;
822  	*ipp = NULL;
823  	error = 0;
824  
825  	if (xchk_should_terminate(sc, &error))
826  		return error;
827  
828  	/*
829  	 * Attach the AGI buffer to the scrub transaction to avoid deadlocks
830  	 * in the iget cache miss path.
831  	 */
832  	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
833  	error = xfs_ialloc_read_agi(pag, tp, 0, agi_bpp);
834  	xfs_perag_put(pag);
835  	if (error)
836  		return error;
837  
838  	error = xfs_iget(mp, tp, inum, XFS_IGET_NORETRY | XCHK_IGET_FLAGS, 0,
839  			ipp);
840  	if (error == -EAGAIN) {
841  		/*
842  		 * The inode may be in core but temporarily unavailable and may
843  		 * require the AGI buffer before it can be returned.  Drop the
844  		 * AGI buffer and retry the lookup.
845  		 *
846  		 * Incore lookup will fail with EAGAIN on a cache hit if the
847  		 * inode is queued to the inactivation list.  The inactivation
848  		 * worker may remove the inode from the unlinked list and hence
849  		 * needs the AGI.
850  		 *
851  		 * Hence xchk_iget_agi() needs to drop the AGI lock on EAGAIN
852  		 * to allow inodegc to make progress and move the inode to
853  		 * IRECLAIMABLE state where xfs_iget will be able to return it
854  		 * again if it can lock the inode.
855  		 */
856  		xfs_trans_brelse(tp, *agi_bpp);
857  		delay(1);
858  		goto again;
859  	}
860  	if (error)
861  		return error;
862  
863  	/* We got the inode, so we can release the AGI. */
864  	ASSERT(*ipp != NULL);
865  	xfs_trans_brelse(tp, *agi_bpp);
866  	*agi_bpp = NULL;
867  	return 0;
868  }
869  
870  #ifdef CONFIG_XFS_QUOTA
871  /*
872   * Try to attach dquots to this inode if we think we might want to repair it.
873   * Callers must not hold any ILOCKs.  If the dquots are broken and cannot be
874   * attached, a quotacheck will be scheduled.
875   */
876  int
877  xchk_ino_dqattach(
878  	struct xfs_scrub	*sc)
879  {
880  	ASSERT(sc->tp != NULL);
881  	ASSERT(sc->ip != NULL);
882  
883  	if (!xchk_could_repair(sc))
884  		return 0;
885  
886  	return xrep_ino_dqattach(sc);
887  }
888  #endif
889  
890  /* Install an inode that we opened by handle for scrubbing. */
891  int
892  xchk_install_handle_inode(
893  	struct xfs_scrub	*sc,
894  	struct xfs_inode	*ip)
895  {
896  	if (VFS_I(ip)->i_generation != sc->sm->sm_gen) {
897  		xchk_irele(sc, ip);
898  		return -ENOENT;
899  	}
900  
901  	sc->ip = ip;
902  	return 0;
903  }
904  
905  /*
906   * Install an already-referenced inode for scrubbing.  Get our own reference to
907   * the inode to make disposal simpler.  The inode must not be in I_FREEING or
908   * I_WILL_FREE state!
909   */
910  int
911  xchk_install_live_inode(
912  	struct xfs_scrub	*sc,
913  	struct xfs_inode	*ip)
914  {
915  	if (!igrab(VFS_I(ip))) {
916  		xchk_ino_set_corrupt(sc, ip->i_ino);
917  		return -EFSCORRUPTED;
918  	}
919  
920  	sc->ip = ip;
921  	return 0;
922  }
923  
924  /*
925   * In preparation to scrub metadata structures that hang off of an inode,
926   * grab either the inode referenced in the scrub control structure or the
927   * inode passed in.  If the inumber does not reference an allocated inode
928   * record, the function returns ENOENT to end the scrub early.  The inode
929   * is not locked.
930   */
931  int
932  xchk_iget_for_scrubbing(
933  	struct xfs_scrub	*sc)
934  {
935  	struct xfs_imap		imap;
936  	struct xfs_mount	*mp = sc->mp;
937  	struct xfs_perag	*pag;
938  	struct xfs_buf		*agi_bp;
939  	struct xfs_inode	*ip_in = XFS_I(file_inode(sc->file));
940  	struct xfs_inode	*ip = NULL;
941  	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, sc->sm->sm_ino);
942  	int			error;
943  
944  	ASSERT(sc->tp == NULL);
945  
946  	/* We want to scan the inode we already had opened. */
947  	if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino)
948  		return xchk_install_live_inode(sc, ip_in);
949  
950  	/* Reject internal metadata files and obviously bad inode numbers. */
951  	if (xfs_internal_inum(mp, sc->sm->sm_ino))
952  		return -ENOENT;
953  	if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino))
954  		return -ENOENT;
955  
956  	/* Try a safe untrusted iget. */
957  	error = xchk_iget_safe(sc, sc->sm->sm_ino, &ip);
958  	if (!error)
959  		return xchk_install_handle_inode(sc, ip);
960  	if (error == -ENOENT)
961  		return error;
962  	if (error != -EINVAL)
963  		goto out_error;
964  
965  	/*
966  	 * EINVAL with IGET_UNTRUSTED probably means one of several things:
967  	 * userspace gave us an inode number that doesn't correspond to fs
968  	 * space; the inode btree lacks a record for this inode; or there is a
969  	 * record, and it says this inode is free.
970  	 *
971  	 * We want to look up this inode in the inobt to distinguish two
972  	 * scenarios: (1) the inobt says the inode is free, in which case
973  	 * there's nothing to do; and (2) the inobt says the inode is
974  	 * allocated, but loading it failed due to corruption.
975  	 *
976  	 * Allocate a transaction and grab the AGI to prevent inobt activity
977  	 * in this AG.  Retry the iget in case someone allocated a new inode
978  	 * after the first iget failed.
979  	 */
980  	error = xchk_trans_alloc(sc, 0);
981  	if (error)
982  		goto out_error;
983  
984  	error = xchk_iget_agi(sc, sc->sm->sm_ino, &agi_bp, &ip);
985  	if (error == 0) {
986  		/* Actually got the inode, so install it. */
987  		xchk_trans_cancel(sc);
988  		return xchk_install_handle_inode(sc, ip);
989  	}
990  	if (error == -ENOENT)
991  		goto out_gone;
992  	if (error != -EINVAL)
993  		goto out_cancel;
994  
995  	/* Ensure that we have protected against inode allocation/freeing. */
996  	if (agi_bp == NULL) {
997  		ASSERT(agi_bp != NULL);
998  		error = -ECANCELED;
999  		goto out_cancel;
1000  	}
1001  
1002  	/*
1003  	 * Untrusted iget failed a second time.  Let's try an inobt lookup.
1004  	 * If the inobt says that this inode cannot exist inside the
1005  	 * filesystem or is not allocated, return ENOENT to signal that
1006  	 * the check can be skipped.
1007  	 *
1008  	 * If the lookup returns corruption, we'll mark this inode corrupt and
1009  	 * exit to userspace.  There's little chance of fixing anything until
1010  	 * the inobt is straightened out, but there's nothing we can do here.
1011  	 *
1012  	 * If the lookup encounters any other error, exit to userspace.
1013  	 *
1014  	 * If the lookup succeeds, something else must be very wrong in the fs
1015  	 * such that setting up the incore inode failed in some strange way.
1016  	 * Treat those as corruptions.
1017  	 */
1018  	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sc->sm->sm_ino));
1019  	if (!pag) {
1020  		error = -EFSCORRUPTED;
1021  		goto out_cancel;
1022  	}
1023  
1024  	error = xfs_imap(pag, sc->tp, sc->sm->sm_ino, &imap,
1025  			XFS_IGET_UNTRUSTED);
1026  	xfs_perag_put(pag);
1027  	if (error == -EINVAL || error == -ENOENT)
1028  		goto out_gone;
1029  	if (!error)
1030  		error = -EFSCORRUPTED;
1031  
1032  out_cancel:
1033  	xchk_trans_cancel(sc);
1034  out_error:
1035  	trace_xchk_op_error(sc, agno, XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino),
1036  			error, __return_address);
1037  	return error;
1038  out_gone:
1039  	/* The file is gone, so there's nothing to check. */
1040  	xchk_trans_cancel(sc);
1041  	return -ENOENT;
1042  }
1043  
1044  /* Release an inode, possibly dropping it in the process. */
1045  void
1046  xchk_irele(
1047  	struct xfs_scrub	*sc,
1048  	struct xfs_inode	*ip)
1049  {
1050  	if (sc->tp) {
1051  		/*
1052  		 * If we are in a transaction, we /cannot/ drop the inode
1053  		 * ourselves, because the VFS will trigger writeback, which
1054  		 * can require a transaction.  Clear DONTCACHE to force the
1055  		 * inode to the LRU, where someone else can take care of
1056  		 * dropping it.
1057  		 *
1058  		 * Note that when we grabbed our reference to the inode, it
1059  		 * could have had an active ref and DONTCACHE set if a sysadmin
1060  		 * is trying to coerce a change in file access mode.  icache
1061  		 * hits do not clear DONTCACHE, so we must do it here.
1062  		 */
1063  		spin_lock(&VFS_I(ip)->i_lock);
1064  		VFS_I(ip)->i_state &= ~I_DONTCACHE;
1065  		spin_unlock(&VFS_I(ip)->i_lock);
1066  	}
1067  
1068  	xfs_irele(ip);
1069  }
1070  
1071  /*
1072   * Set us up to scrub metadata mapped by a file's fork.  Callers must not use
1073   * this to operate on user-accessible regular file data because the MMAPLOCK is
1074   * not taken.
1075   */
1076  int
1077  xchk_setup_inode_contents(
1078  	struct xfs_scrub	*sc,
1079  	unsigned int		resblks)
1080  {
1081  	int			error;
1082  
1083  	error = xchk_iget_for_scrubbing(sc);
1084  	if (error)
1085  		return error;
1086  
1087  	/* Lock the inode so the VFS cannot touch this file. */
1088  	xchk_ilock(sc, XFS_IOLOCK_EXCL);
1089  
1090  	error = xchk_trans_alloc(sc, resblks);
1091  	if (error)
1092  		goto out;
1093  
1094  	error = xchk_ino_dqattach(sc);
1095  	if (error)
1096  		goto out;
1097  
1098  	xchk_ilock(sc, XFS_ILOCK_EXCL);
1099  out:
1100  	/* scrub teardown will unlock and release the inode for us */
1101  	return error;
1102  }
1103  
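/* Lock the inode being scrubbed and remember which locks we hold. */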
1104  void
1105  xchk_ilock(
1106  	struct xfs_scrub	*sc,
1107  	unsigned int		ilock_flags)
1108  {
1109  	xfs_ilock(sc->ip, ilock_flags);
1110  	sc->ilock_flags |= ilock_flags;
1111  }
1112  
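/* Try to lock the inode being scrubbed; record the locks only on success. */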
1113  bool
1114  xchk_ilock_nowait(
1115  	struct xfs_scrub	*sc,
1116  	unsigned int		ilock_flags)
1117  {
1118  	if (xfs_ilock_nowait(sc->ip, ilock_flags)) {
1119  		sc->ilock_flags |= ilock_flags;
1120  		return true;
1121  	}
1122  
1123  	return false;
1124  }
1125  
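/* Unlock the inode being scrubbed and update our lock state bookkeeping. */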
1126  void
1127  xchk_iunlock(
1128  	struct xfs_scrub	*sc,
1129  	unsigned int		ilock_flags)
1130  {
1131  	sc->ilock_flags &= ~ilock_flags;
1132  	xfs_iunlock(sc->ip, ilock_flags);
1133  }
1134  
1135  /*
1136   * Predicate that decides if we need to evaluate the cross-reference check.
1137   * If there was an error accessing the cross-reference btree, just delete
1138   * the cursor and skip the check.
1139   */
1140  bool
1141  xchk_should_check_xref(
1142  	struct xfs_scrub	*sc,
1143  	int			*error,
1144  	struct xfs_btree_cur	**curpp)
1145  {
1146  	/* No point in xref if we already know we're corrupt. */
1147  	if (xchk_skip_xref(sc->sm))
1148  		return false;
1149  
1150  	if (*error == 0)
1151  		return true;
1152  
1153  	if (curpp) {
1154  		/* If we've already given up on xref, just bail out. */
1155  		if (!*curpp)
1156  			return false;
1157  
1158  		/* xref error, delete cursor and bail out. */
1159  		xfs_btree_del_cursor(*curpp, XFS_BTREE_ERROR);
1160  		*curpp = NULL;
1161  	}
1162  
1163  	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL;
1164  	trace_xchk_xref_error(sc, *error, __return_address);
1165  
1166  	/*
1167  	 * Errors encountered during cross-referencing with another
1168  	 * data structure should not cause this scrubber to abort.
1169  	 */
1170  	*error = 0;
1171  	return false;
1172  }
1173  
1174  /* Run the structure verifiers on in-memory buffers to detect bad memory. */
1175  void
1176  xchk_buffer_recheck(
1177  	struct xfs_scrub	*sc,
1178  	struct xfs_buf		*bp)
1179  {
1180  	xfs_failaddr_t		fa;
1181  
1182  	if (bp->b_ops == NULL) {
1183  		xchk_block_set_corrupt(sc, bp);
1184  		return;
1185  	}
1186  	if (bp->b_ops->verify_struct == NULL) {
1187  		xchk_set_incomplete(sc);
1188  		return;
1189  	}
1190  	fa = bp->b_ops->verify_struct(bp);
1191  	if (!fa)
1192  		return;
1193  	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
1194  	trace_xchk_block_error(sc, xfs_buf_daddr(bp), fa);
1195  }
1196  
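/* Run a subordinate scrub of the given type against the current scrub target. */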
1197  static inline int
1198  xchk_metadata_inode_subtype(
1199  	struct xfs_scrub	*sc,
1200  	unsigned int		scrub_type)
1201  {
1202  	struct xfs_scrub_subord	*sub;
1203  	int			error;
1204  
1205  	sub = xchk_scrub_create_subord(sc, scrub_type);
1206  	error = sub->sc.ops->scrub(&sub->sc);
1207  	xchk_scrub_free_subord(sub);
1208  	return error;
1209  }
1210  
1211  /*
1212   * Scrub the attr/data forks of a metadata inode.  The metadata inode must be
1213   * pointed to by sc->ip and the ILOCK must be held.
1214   */
1215  int
1216  xchk_metadata_inode_forks(
1217  	struct xfs_scrub	*sc)
1218  {
1219  	bool			shared;
1220  	int			error;
1221  
1222  	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
1223  		return 0;
1224  
1225  	/* Check the inode record. */
1226  	error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_INODE);
1227  	if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
1228  		return error;
1229  
1230  	/* Metadata inodes don't live on the rt device. */
1231  	if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) {
1232  		xchk_ino_set_corrupt(sc, sc->ip->i_ino);
1233  		return 0;
1234  	}
1235  
1236  	/* They should never participate in reflink. */
1237  	if (xfs_is_reflink_inode(sc->ip)) {
1238  		xchk_ino_set_corrupt(sc, sc->ip->i_ino);
1239  		return 0;
1240  	}
1241  
1242  	/* They also should never have extended attributes. */
1243  	if (xfs_inode_hasattr(sc->ip)) {
1244  		xchk_ino_set_corrupt(sc, sc->ip->i_ino);
1245  		return 0;
1246  	}
1247  
1248  	/* Invoke the data fork scrubber. */
1249  	error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD);
1250  	if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
1251  		return error;
1252  
1253  	/* Look for incorrect shared blocks. */
1254  	if (xfs_has_reflink(sc->mp)) {
1255  		error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip,
1256  				&shared);
1257  		if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0,
1258  				&error))
1259  			return error;
1260  		if (shared)
1261  			xchk_ino_set_corrupt(sc, sc->ip->i_ino);
1262  	}
1263  
1264  	return 0;
1265  }
1266  
1267  /*
1268   * Enable filesystem hooks (i.e. runtime code patching) before starting a scrub
1269   * operation.  Callers must not hold any locks that intersect with the CPU
1270   * hotplug lock (e.g. writeback locks) because code patching must halt the CPUs
1271   * to change kernel code.
1272   */
1273  void
1274  xchk_fsgates_enable(
1275  	struct xfs_scrub	*sc,
1276  	unsigned int		scrub_fsgates)
1277  {
1278  	ASSERT(!(scrub_fsgates & ~XCHK_FSGATES_ALL));
1279  	ASSERT(!(sc->flags & scrub_fsgates));
1280  
1281  	trace_xchk_fsgates_enable(sc, scrub_fsgates);
1282  
1283  	if (scrub_fsgates & XCHK_FSGATES_DRAIN)
1284  		xfs_drain_wait_enable();
1285  
1286  	if (scrub_fsgates & XCHK_FSGATES_QUOTA)
1287  		xfs_dqtrx_hook_enable();
1288  
1289  	if (scrub_fsgates & XCHK_FSGATES_DIRENTS)
1290  		xfs_dir_hook_enable();
1291  
1292  	if (scrub_fsgates & XCHK_FSGATES_RMAP)
1293  		xfs_rmap_hook_enable();
1294  
1295  	sc->flags |= scrub_fsgates;
1296  }
1297  
1298  /*
1299   * Decide if this is a cached inode that's also allocated.  The caller
1300   * must hold a reference to an AG and the AGI buffer lock to prevent inodes
1301   * from being allocated or freed.
1302   *
1303   * Look up an inode by number in the given file system.  If the inode number
1304   * is invalid, return -EINVAL.  If the inode is not in cache, return -ENODATA.
1305   * If the inode is being reclaimed, return -ENODATA because we know the inode
1306   * cache cannot be updating the ondisk metadata.
1307   *
1308   * Otherwise, the incore inode is the one we want, and it is either live,
1309   * somewhere in the inactivation machinery, or reclaimable.  The inode is
1310   * allocated if i_mode is nonzero.  In all three cases, the cached inode will
1311   * be more up to date than the ondisk inode buffer, so we must use the incore
1312   * i_mode.
1313   */
1314  int
1315  xchk_inode_is_allocated(
1316  	struct xfs_scrub	*sc,
1317  	xfs_agino_t		agino,
1318  	bool			*inuse)
1319  {
1320  	struct xfs_mount	*mp = sc->mp;
1321  	struct xfs_perag	*pag = sc->sa.pag;
1322  	xfs_ino_t		ino;
1323  	struct xfs_inode	*ip;
1324  	int			error;
1325  
1326  	/* caller must hold perag reference */
1327  	if (pag == NULL) {
1328  		ASSERT(pag != NULL);
1329  		return -EINVAL;
1330  	}
1331  
1332  	/* caller must have AGI buffer */
1333  	if (sc->sa.agi_bp == NULL) {
1334  		ASSERT(sc->sa.agi_bp != NULL);
1335  		return -EINVAL;
1336  	}
1337  
1338  	/* reject inode numbers outside existing AGs */
1339  	ino = XFS_AGINO_TO_INO(sc->mp, pag->pag_agno, agino);
1340  	if (!xfs_verify_ino(mp, ino))
1341  		return -EINVAL;
1342  
1343  	error = -ENODATA;
1344  	rcu_read_lock();
1345  	ip = radix_tree_lookup(&pag->pag_ici_root, agino);
1346  	if (!ip) {
1347  		/* cache miss */
1348  		goto out_rcu;
1349  	}
1350  
1351  	/*
1352  	 * If the inode number doesn't match, the incore inode got reused
1353  	 * during an RCU grace period and the radix tree hasn't been updated.
1354  	 * This isn't the inode we want.
1355  	 */
1356  	spin_lock(&ip->i_flags_lock);
1357  	if (ip->i_ino != ino)
1358  		goto out_skip;
1359  
1360  	trace_xchk_inode_is_allocated(ip);
1361  
1362  	/*
1363  	 * We have an incore inode that matches the inode we want, and the
1364  	 * caller holds the perag structure and the AGI buffer.  Let's check
1365  	 * our assumptions below:
1366  	 */
1367  
1368  #ifdef DEBUG
1369  	/*
1370  	 * (1) If the incore inode is live (i.e. referenced from the dcache),
1371  	 * it will not be INEW, nor will it be in the inactivation or reclaim
1372  	 * machinery.  The ondisk inode had better be allocated.  This is the
1373  	 * most trivial case.
1374  	 */
1375  	if (!(ip->i_flags & (XFS_NEED_INACTIVE | XFS_INEW | XFS_IRECLAIMABLE |
1376  			     XFS_INACTIVATING))) {
1377  		/* live inode */
1378  		ASSERT(VFS_I(ip)->i_mode != 0);
1379  	}
1380  
1381  	/*
1382  	 * If the incore inode is INEW, there are several possibilities:
1383  	 *
1384  	 * (2) For a file that is being created, note that we allocate the
1385  	 * ondisk inode before allocating, initializing, and adding the incore
1386  	 * inode to the radix tree.
1387  	 *
1388  	 * (3) If the incore inode is being recycled, the inode has to be
1389  	 * allocated because we don't allow freed inodes to be recycled.
1390  	 * Recycling doesn't touch i_mode.
1391  	 */
1392  	if (ip->i_flags & XFS_INEW) {
1393  		/* created on disk already or recycling */
1394  		ASSERT(VFS_I(ip)->i_mode != 0);
1395  	}
1396  
1397  	/*
1398  	 * (4) If the inode is queued for inactivation (NEED_INACTIVE) but
1399  	 * inactivation has not started (!INACTIVATING), it is still allocated.
1400  	 */
1401  	if ((ip->i_flags & XFS_NEED_INACTIVE) &&
1402  	    !(ip->i_flags & XFS_INACTIVATING)) {
1403  		/* definitely before difree */
1404  		ASSERT(VFS_I(ip)->i_mode != 0);
1405  	}
1406  #endif
1407  
1408  	/*
1409  	 * If the incore inode is undergoing inactivation (INACTIVATING), there
1410  	 * are two possibilities:
1411  	 *
1412  	 * (5) It is before the point where it would get freed ondisk, in which
1413  	 * case i_mode is still nonzero.
1414  	 *
1415  	 * (6) It has already been freed, in which case i_mode is zero.
1416  	 *
1417  	 * We don't take the ILOCK here, but difree and dialloc update the AGI,
1418  	 * and we've taken the AGI buffer lock, which prevents that from
1419  	 * happening.
1420  	 */
1421  
1422  	/*
1423  	 * (7) Inodes undergoing inactivation (INACTIVATING) or queued for
1424  	 * reclaim (IRECLAIMABLE) could be allocated or free.  i_mode still
1425  	 * reflects the ondisk state.
1426  	 */
1427  
1428  	/*
1429  	 * (8) If the inode is in IFLUSHING, it's safe to query i_mode because
1430  	 * the flush code uses i_mode to format the ondisk inode.
1431  	 */
1432  
1433  	/*
1434  	 * (9) If the inode is in IRECLAIM and was reachable via the radix
1435  	 * tree, it still has the same i_mode as it did before it entered
1436  	 * reclaim.  The inode object is still alive because we hold the RCU
1437  	 * read lock.
1438  	 */
1439  
1440  	*inuse = VFS_I(ip)->i_mode != 0;
1441  	error = 0;
1442  
1443  out_skip:
1444  	spin_unlock(&ip->i_flags_lock);
1445  out_rcu:
1446  	rcu_read_unlock();
1447  	return error;
1448  }
1449