1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
4 * Author: Darrick J. Wong <djwong@kernel.org>
5 */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_defer.h"
13 #include "xfs_bit.h"
14 #include "xfs_log_format.h"
15 #include "xfs_trans.h"
16 #include "xfs_sb.h"
17 #include "xfs_inode.h"
18 #include "xfs_icache.h"
19 #include "xfs_da_format.h"
20 #include "xfs_da_btree.h"
21 #include "xfs_dir2.h"
22 #include "xfs_dir2_priv.h"
23 #include "xfs_bmap.h"
24 #include "xfs_quota.h"
25 #include "xfs_bmap_btree.h"
26 #include "xfs_trans_space.h"
27 #include "xfs_bmap_util.h"
28 #include "xfs_exchmaps.h"
29 #include "xfs_exchrange.h"
30 #include "xfs_ag.h"
31 #include "xfs_parent.h"
32 #include "scrub/xfs_scrub.h"
33 #include "scrub/scrub.h"
34 #include "scrub/common.h"
35 #include "scrub/trace.h"
36 #include "scrub/repair.h"
37 #include "scrub/tempfile.h"
38 #include "scrub/tempexch.h"
39 #include "scrub/xfile.h"
40 #include "scrub/xfarray.h"
41 #include "scrub/xfblob.h"
42 #include "scrub/iscan.h"
43 #include "scrub/readdir.h"
44 #include "scrub/reap.h"
45 #include "scrub/findparent.h"
46 #include "scrub/orphanage.h"
47 #include "scrub/listxattr.h"
48
/*
 * Directory Repair
 * ================
 *
 * We repair directories by reading the directory data blocks looking for
 * directory entries that look salvageable (name passes verifiers, entry points
 * to a valid allocated inode, etc).  Each entry worth salvaging is stashed in
 * memory, and the stashed entries are periodically replayed into a temporary
 * directory to constrain memory use.  Batching the construction of the
 * temporary directory in this fashion reduces lock cycling of the directory
 * being repaired and the temporary directory, and will later become important
 * for parent pointer scanning.
 *
 * If parent pointers are enabled on this filesystem, we instead reconstruct
 * the directory by visiting each parent pointer of each file in the filesystem
 * and translating the relevant parent pointer records into dirents.  In this
 * case, it is advantageous to stash all directory entries created from parent
 * pointers for a single child file before replaying them into the temporary
 * directory.  To save memory, the live filesystem scan reuses the findparent
 * fields.  Directory repair chooses either parent pointer scanning or
 * directory entry salvaging, but not both.
 *
 * Directory entries added to the temporary directory do not elevate the link
 * counts of the inodes found.  When salvaging completes, the remaining stashed
 * entries are replayed to the temporary directory.  An atomic mapping exchange
 * is used to commit the new directory blocks to the directory being repaired.
 * This will disrupt readdir cursors.
 *
 * Locking Issues
 * --------------
 *
 * If /a, /a/b, and /c are all directories, the VFS does not take i_rwsem on
 * /a/b for a "mv /a/b /c/" operation.  This means that only b's ILOCK protects
 * b's dotdot update.  This is in contrast to every other dotdot update (link,
 * remove, mkdir).  If the repair code drops the ILOCK, it must either
 * revalidate the dotdot entry or use dirent hooks to capture updates from
 * other threads.
 */
87
/* Stashed action: add the named dirent to the temporary directory. */
#define XREP_DIRENT_ADD		(1)

/* Stashed action: delete the named dirent from the temporary directory. */
#define XREP_DIRENT_REMOVE	(2)
93
94 /* Directory entry to be restored in the new directory. */
95 struct xrep_dirent {
96 /* Cookie for retrieval of the dirent name. */
97 xfblob_cookie name_cookie;
98
99 /* Target inode number. */
100 xfs_ino_t ino;
101
102 /* Length of the dirent name. */
103 uint8_t namelen;
104
105 /* File type of the dirent. */
106 uint8_t ftype;
107
108 /* XREP_DIRENT_{ADD,REMOVE} */
109 uint8_t action;
110 };
111
/*
 * Upper bound on the recovered dirent data (dir_entries plus dir_names) that
 * may sit in memory before it gets written to the temporary directory: eight
 * pages.
 */
#define XREP_DIR_MAX_STASH_BYTES	(8 * PAGE_SIZE)
117
118 struct xrep_dir {
119 struct xfs_scrub *sc;
120
121 /* Fixed-size array of xrep_dirent structures. */
122 struct xfarray *dir_entries;
123
124 /* Blobs containing directory entry names. */
125 struct xfblob *dir_names;
126
127 /* Information for exchanging data forks at the end. */
128 struct xrep_tempexch tx;
129
130 /* Preallocated args struct for performing dir operations */
131 struct xfs_da_args args;
132
133 /*
134 * Information used to scan the filesystem to find the inumber of the
135 * dotdot entry for this directory. For directory salvaging when
136 * parent pointers are not enabled, we use the findparent_* functions
137 * on this object and access only the parent_ino field directly.
138 *
139 * When parent pointers are enabled, however, the pptr scanner uses the
140 * iscan, hooks, lock, and parent_ino fields of this object directly.
141 * @pscan.lock coordinates access to dir_entries, dir_names,
142 * parent_ino, subdirs, dirents, and args. This reduces the memory
143 * requirements of this structure.
144 */
145 struct xrep_parent_scan_info pscan;
146
147 /*
148 * Context information for attaching this directory to the lost+found
149 * if this directory does not have a parent.
150 */
151 struct xrep_adoption adoption;
152
153 /* How many subdirectories did we find? */
154 uint64_t subdirs;
155
156 /* How many dirents did we find? */
157 unsigned int dirents;
158
159 /* Should we move this directory to the orphanage? */
160 bool needs_adoption;
161
162 /* Directory entry name, plus the trailing null. */
163 struct xfs_name xname;
164 unsigned char namebuf[MAXNAMELEN];
165 };
166
167 /* Tear down all the incore stuff we created. */
168 static void
xrep_dir_teardown(struct xfs_scrub * sc)169 xrep_dir_teardown(
170 struct xfs_scrub *sc)
171 {
172 struct xrep_dir *rd = sc->buf;
173
174 xrep_findparent_scan_teardown(&rd->pscan);
175 xfblob_destroy(rd->dir_names);
176 xfarray_destroy(rd->dir_entries);
177 }
178
179 /* Set up for a directory repair. */
180 int
xrep_setup_directory(struct xfs_scrub * sc)181 xrep_setup_directory(
182 struct xfs_scrub *sc)
183 {
184 struct xrep_dir *rd;
185 int error;
186
187 xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS);
188
189 error = xrep_orphanage_try_create(sc);
190 if (error)
191 return error;
192
193 error = xrep_tempfile_create(sc, S_IFDIR);
194 if (error)
195 return error;
196
197 rd = kvzalloc(sizeof(struct xrep_dir), XCHK_GFP_FLAGS);
198 if (!rd)
199 return -ENOMEM;
200 rd->sc = sc;
201 rd->xname.name = rd->namebuf;
202 sc->buf = rd;
203
204 return 0;
205 }
206
207 /*
208 * Look up the dotdot entry and confirm that it's really the parent.
209 * Returns NULLFSINO if we don't know what to do.
210 */
211 static inline xfs_ino_t
xrep_dir_lookup_parent(struct xrep_dir * rd)212 xrep_dir_lookup_parent(
213 struct xrep_dir *rd)
214 {
215 struct xfs_scrub *sc = rd->sc;
216 xfs_ino_t ino;
217 int error;
218
219 error = xfs_dir_lookup(sc->tp, sc->ip, &xfs_name_dotdot, &ino, NULL);
220 if (error)
221 return NULLFSINO;
222 if (!xfs_verify_dir_ino(sc->mp, ino))
223 return NULLFSINO;
224
225 error = xrep_findparent_confirm(sc, &ino);
226 if (error)
227 return NULLFSINO;
228
229 return ino;
230 }
231
232 /*
233 * Look up '..' in the dentry cache and confirm that it's really the parent.
234 * Returns NULLFSINO if the dcache misses or if the hit is implausible.
235 */
236 static inline xfs_ino_t
xrep_dir_dcache_parent(struct xrep_dir * rd)237 xrep_dir_dcache_parent(
238 struct xrep_dir *rd)
239 {
240 struct xfs_scrub *sc = rd->sc;
241 xfs_ino_t parent_ino;
242 int error;
243
244 parent_ino = xrep_findparent_from_dcache(sc);
245 if (parent_ino == NULLFSINO)
246 return parent_ino;
247
248 error = xrep_findparent_confirm(sc, &parent_ino);
249 if (error)
250 return NULLFSINO;
251
252 return parent_ino;
253 }
254
255 /* Try to find the parent of the directory being repaired. */
256 STATIC int
xrep_dir_find_parent(struct xrep_dir * rd)257 xrep_dir_find_parent(
258 struct xrep_dir *rd)
259 {
260 xfs_ino_t ino;
261
262 ino = xrep_findparent_self_reference(rd->sc);
263 if (ino != NULLFSINO) {
264 xrep_findparent_scan_finish_early(&rd->pscan, ino);
265 return 0;
266 }
267
268 ino = xrep_dir_dcache_parent(rd);
269 if (ino != NULLFSINO) {
270 xrep_findparent_scan_finish_early(&rd->pscan, ino);
271 return 0;
272 }
273
274 ino = xrep_dir_lookup_parent(rd);
275 if (ino != NULLFSINO) {
276 xrep_findparent_scan_finish_early(&rd->pscan, ino);
277 return 0;
278 }
279
280 /*
281 * A full filesystem scan is the last resort. On a busy filesystem,
282 * the scan can fail with -EBUSY if we cannot grab IOLOCKs. That means
283 * that we don't know what who the parent is, so we should return to
284 * userspace.
285 */
286 return xrep_findparent_scan(&rd->pscan);
287 }
288
289 /*
290 * Decide if we want to salvage this entry. We don't bother with oversized
291 * names or the dot entry.
292 */
293 STATIC int
xrep_dir_want_salvage(struct xrep_dir * rd,const char * name,int namelen,xfs_ino_t ino)294 xrep_dir_want_salvage(
295 struct xrep_dir *rd,
296 const char *name,
297 int namelen,
298 xfs_ino_t ino)
299 {
300 struct xfs_mount *mp = rd->sc->mp;
301
302 /* No pointers to ourselves or to garbage. */
303 if (ino == rd->sc->ip->i_ino)
304 return false;
305 if (!xfs_verify_dir_ino(mp, ino))
306 return false;
307
308 /* No weird looking names or dot entries. */
309 if (namelen >= MAXNAMELEN || namelen <= 0)
310 return false;
311 if (namelen == 1 && name[0] == '.')
312 return false;
313 if (!xfs_dir2_namecheck(name, namelen))
314 return false;
315
316 return true;
317 }
318
319 /*
320 * Remember that we want to create a dirent in the tempdir. These stashed
321 * actions will be replayed later.
322 */
323 STATIC int
xrep_dir_stash_createname(struct xrep_dir * rd,const struct xfs_name * name,xfs_ino_t ino)324 xrep_dir_stash_createname(
325 struct xrep_dir *rd,
326 const struct xfs_name *name,
327 xfs_ino_t ino)
328 {
329 struct xrep_dirent dirent = {
330 .action = XREP_DIRENT_ADD,
331 .ino = ino,
332 .namelen = name->len,
333 .ftype = name->type,
334 };
335 int error;
336
337 trace_xrep_dir_stash_createname(rd->sc->tempip, name, ino);
338
339 error = xfblob_storename(rd->dir_names, &dirent.name_cookie, name);
340 if (error)
341 return error;
342
343 return xfarray_append(rd->dir_entries, &dirent);
344 }
345
346 /*
347 * Remember that we want to remove a dirent from the tempdir. These stashed
348 * actions will be replayed later.
349 */
350 STATIC int
xrep_dir_stash_removename(struct xrep_dir * rd,const struct xfs_name * name,xfs_ino_t ino)351 xrep_dir_stash_removename(
352 struct xrep_dir *rd,
353 const struct xfs_name *name,
354 xfs_ino_t ino)
355 {
356 struct xrep_dirent dirent = {
357 .action = XREP_DIRENT_REMOVE,
358 .ino = ino,
359 .namelen = name->len,
360 .ftype = name->type,
361 };
362 int error;
363
364 trace_xrep_dir_stash_removename(rd->sc->tempip, name, ino);
365
366 error = xfblob_storename(rd->dir_names, &dirent.name_cookie, name);
367 if (error)
368 return error;
369
370 return xfarray_append(rd->dir_entries, &dirent);
371 }
372
373 /* Allocate an in-core record to hold entries while we rebuild the dir data. */
374 STATIC int
xrep_dir_salvage_entry(struct xrep_dir * rd,unsigned char * name,unsigned int namelen,xfs_ino_t ino)375 xrep_dir_salvage_entry(
376 struct xrep_dir *rd,
377 unsigned char *name,
378 unsigned int namelen,
379 xfs_ino_t ino)
380 {
381 struct xfs_name xname = {
382 .name = name,
383 };
384 struct xfs_scrub *sc = rd->sc;
385 struct xfs_inode *ip;
386 unsigned int i = 0;
387 int error = 0;
388
389 if (xchk_should_terminate(sc, &error))
390 return error;
391
392 /*
393 * Truncate the name to the first character that would trip namecheck.
394 * If we no longer have a name after that, ignore this entry.
395 */
396 while (i < namelen && name[i] != 0 && name[i] != '/')
397 i++;
398 if (i == 0)
399 return 0;
400 xname.len = i;
401
402 /* Ignore '..' entries; we already picked the new parent. */
403 if (xname.len == 2 && name[0] == '.' && name[1] == '.') {
404 trace_xrep_dir_salvaged_parent(sc->ip, ino);
405 return 0;
406 }
407
408 trace_xrep_dir_salvage_entry(sc->ip, &xname, ino);
409
410 /*
411 * Compute the ftype or dump the entry if we can't. We don't lock the
412 * inode because inodes can't change type while we have a reference.
413 */
414 error = xchk_iget(sc, ino, &ip);
415 if (error)
416 return 0;
417
418 xname.type = xfs_mode_to_ftype(VFS_I(ip)->i_mode);
419 xchk_irele(sc, ip);
420
421 return xrep_dir_stash_createname(rd, &xname, ino);
422 }
423
424 /* Record a shortform directory entry for later reinsertion. */
425 STATIC int
xrep_dir_salvage_sf_entry(struct xrep_dir * rd,struct xfs_dir2_sf_hdr * sfp,struct xfs_dir2_sf_entry * sfep)426 xrep_dir_salvage_sf_entry(
427 struct xrep_dir *rd,
428 struct xfs_dir2_sf_hdr *sfp,
429 struct xfs_dir2_sf_entry *sfep)
430 {
431 xfs_ino_t ino;
432
433 ino = xfs_dir2_sf_get_ino(rd->sc->mp, sfp, sfep);
434 if (!xrep_dir_want_salvage(rd, sfep->name, sfep->namelen, ino))
435 return 0;
436
437 return xrep_dir_salvage_entry(rd, sfep->name, sfep->namelen, ino);
438 }
439
440 /* Record a regular directory entry for later reinsertion. */
441 STATIC int
xrep_dir_salvage_data_entry(struct xrep_dir * rd,struct xfs_dir2_data_entry * dep)442 xrep_dir_salvage_data_entry(
443 struct xrep_dir *rd,
444 struct xfs_dir2_data_entry *dep)
445 {
446 xfs_ino_t ino;
447
448 ino = be64_to_cpu(dep->inumber);
449 if (!xrep_dir_want_salvage(rd, dep->name, dep->namelen, ino))
450 return 0;
451
452 return xrep_dir_salvage_entry(rd, dep->name, dep->namelen, ino);
453 }
454
455 /* Try to recover block/data format directory entries. */
456 STATIC int
xrep_dir_recover_data(struct xrep_dir * rd,struct xfs_buf * bp)457 xrep_dir_recover_data(
458 struct xrep_dir *rd,
459 struct xfs_buf *bp)
460 {
461 struct xfs_da_geometry *geo = rd->sc->mp->m_dir_geo;
462 unsigned int offset;
463 unsigned int end;
464 int error = 0;
465
466 /*
467 * Loop over the data portion of the block.
468 * Each object is a real entry (dep) or an unused one (dup).
469 */
470 offset = geo->data_entry_offset;
471 end = min_t(unsigned int, BBTOB(bp->b_length),
472 xfs_dir3_data_end_offset(geo, bp->b_addr));
473
474 while (offset < end) {
475 struct xfs_dir2_data_unused *dup = bp->b_addr + offset;
476 struct xfs_dir2_data_entry *dep = bp->b_addr + offset;
477
478 if (xchk_should_terminate(rd->sc, &error))
479 return error;
480
481 /* Skip unused entries. */
482 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
483 offset += be16_to_cpu(dup->length);
484 continue;
485 }
486
487 /* Don't walk off the end of the block. */
488 offset += xfs_dir2_data_entsize(rd->sc->mp, dep->namelen);
489 if (offset > end)
490 break;
491
492 /* Ok, let's save this entry. */
493 error = xrep_dir_salvage_data_entry(rd, dep);
494 if (error)
495 return error;
496
497 }
498
499 return 0;
500 }
501
502 /* Try to recover shortform directory entries. */
503 STATIC int
xrep_dir_recover_sf(struct xrep_dir * rd)504 xrep_dir_recover_sf(
505 struct xrep_dir *rd)
506 {
507 struct xfs_dir2_sf_hdr *hdr;
508 struct xfs_dir2_sf_entry *sfep;
509 struct xfs_dir2_sf_entry *next;
510 struct xfs_ifork *ifp;
511 xfs_ino_t ino;
512 unsigned char *end;
513 int error = 0;
514
515 ifp = xfs_ifork_ptr(rd->sc->ip, XFS_DATA_FORK);
516 hdr = ifp->if_data;
517 end = (unsigned char *)ifp->if_data + ifp->if_bytes;
518
519 ino = xfs_dir2_sf_get_parent_ino(hdr);
520 trace_xrep_dir_salvaged_parent(rd->sc->ip, ino);
521
522 sfep = xfs_dir2_sf_firstentry(hdr);
523 while ((unsigned char *)sfep < end) {
524 if (xchk_should_terminate(rd->sc, &error))
525 return error;
526
527 next = xfs_dir2_sf_nextentry(rd->sc->mp, hdr, sfep);
528 if ((unsigned char *)next > end)
529 break;
530
531 /* Ok, let's save this entry. */
532 error = xrep_dir_salvage_sf_entry(rd, hdr, sfep);
533 if (error)
534 return error;
535
536 sfep = next;
537 }
538
539 return 0;
540 }
541
542 /*
543 * Try to figure out the format of this directory from the data fork mappings
544 * and the directory size. If we can be reasonably sure of format, we can be
545 * more aggressive in salvaging directory entries. On return, @magic_guess
546 * will be set to DIR3_BLOCK_MAGIC if we think this is a "block format"
547 * directory; DIR3_DATA_MAGIC if we think this is a "data format" directory,
548 * and 0 if we can't tell.
549 */
550 STATIC void
xrep_dir_guess_format(struct xrep_dir * rd,__be32 * magic_guess)551 xrep_dir_guess_format(
552 struct xrep_dir *rd,
553 __be32 *magic_guess)
554 {
555 struct xfs_inode *dp = rd->sc->ip;
556 struct xfs_mount *mp = rd->sc->mp;
557 struct xfs_da_geometry *geo = mp->m_dir_geo;
558 xfs_fileoff_t last;
559 int error;
560
561 ASSERT(xfs_has_crc(mp));
562
563 *magic_guess = 0;
564
565 /*
566 * If there's a single directory block and the directory size is
567 * exactly one block, this has to be a single block format directory.
568 */
569 error = xfs_bmap_last_offset(dp, &last, XFS_DATA_FORK);
570 if (!error && XFS_FSB_TO_B(mp, last) == geo->blksize &&
571 dp->i_disk_size == geo->blksize) {
572 *magic_guess = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
573 return;
574 }
575
576 /*
577 * If the last extent before the leaf offset matches the directory
578 * size and the directory size is larger than 1 block, this is a
579 * data format directory.
580 */
581 last = geo->leafblk;
582 error = xfs_bmap_last_before(rd->sc->tp, dp, &last, XFS_DATA_FORK);
583 if (!error &&
584 XFS_FSB_TO_B(mp, last) > geo->blksize &&
585 XFS_FSB_TO_B(mp, last) == dp->i_disk_size) {
586 *magic_guess = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
587 return;
588 }
589 }
590
591 /* Recover directory entries from a specific directory block. */
592 STATIC int
xrep_dir_recover_dirblock(struct xrep_dir * rd,__be32 magic_guess,xfs_dablk_t dabno)593 xrep_dir_recover_dirblock(
594 struct xrep_dir *rd,
595 __be32 magic_guess,
596 xfs_dablk_t dabno)
597 {
598 struct xfs_dir2_data_hdr *hdr;
599 struct xfs_buf *bp;
600 __be32 oldmagic;
601 int error;
602
603 /*
604 * Try to read buffer. We invalidate them in the next step so we don't
605 * bother to set a buffer type or ops.
606 */
607 error = xfs_da_read_buf(rd->sc->tp, rd->sc->ip, dabno,
608 XFS_DABUF_MAP_HOLE_OK, &bp, XFS_DATA_FORK, NULL);
609 if (error || !bp)
610 return error;
611
612 hdr = bp->b_addr;
613 oldmagic = hdr->magic;
614
615 trace_xrep_dir_recover_dirblock(rd->sc->ip, dabno,
616 be32_to_cpu(hdr->magic), be32_to_cpu(magic_guess));
617
618 /*
619 * If we're sure of the block's format, proceed with the salvage
620 * operation using the specified magic number.
621 */
622 if (magic_guess) {
623 hdr->magic = magic_guess;
624 goto recover;
625 }
626
627 /*
628 * If we couldn't guess what type of directory this is, then we will
629 * only salvage entries from directory blocks that match the magic
630 * number and pass verifiers.
631 */
632 switch (hdr->magic) {
633 case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
634 case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
635 if (!xrep_buf_verify_struct(bp, &xfs_dir3_block_buf_ops))
636 goto out;
637 if (xfs_dir3_block_header_check(bp, rd->sc->ip->i_ino) != NULL)
638 goto out;
639 break;
640 case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
641 case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
642 if (!xrep_buf_verify_struct(bp, &xfs_dir3_data_buf_ops))
643 goto out;
644 if (xfs_dir3_data_header_check(bp, rd->sc->ip->i_ino) != NULL)
645 goto out;
646 break;
647 default:
648 goto out;
649 }
650
651 recover:
652 error = xrep_dir_recover_data(rd, bp);
653
654 out:
655 hdr->magic = oldmagic;
656 xfs_trans_brelse(rd->sc->tp, bp);
657 return error;
658 }
659
660 static inline void
xrep_dir_init_args(struct xrep_dir * rd,struct xfs_inode * dp,const struct xfs_name * name)661 xrep_dir_init_args(
662 struct xrep_dir *rd,
663 struct xfs_inode *dp,
664 const struct xfs_name *name)
665 {
666 memset(&rd->args, 0, sizeof(struct xfs_da_args));
667 rd->args.geo = rd->sc->mp->m_dir_geo;
668 rd->args.whichfork = XFS_DATA_FORK;
669 rd->args.owner = rd->sc->ip->i_ino;
670 rd->args.trans = rd->sc->tp;
671 rd->args.dp = dp;
672 if (!name)
673 return;
674 rd->args.name = name->name;
675 rd->args.namelen = name->len;
676 rd->args.filetype = name->type;
677 rd->args.hashval = xfs_dir2_hashname(rd->sc->mp, name);
678 }
679
680 /* Replay a stashed createname into the temporary directory. */
681 STATIC int
xrep_dir_replay_createname(struct xrep_dir * rd,const struct xfs_name * name,xfs_ino_t inum,xfs_extlen_t total)682 xrep_dir_replay_createname(
683 struct xrep_dir *rd,
684 const struct xfs_name *name,
685 xfs_ino_t inum,
686 xfs_extlen_t total)
687 {
688 struct xfs_scrub *sc = rd->sc;
689 struct xfs_inode *dp = rd->sc->tempip;
690 int error;
691
692 ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
693
694 error = xfs_dir_ino_validate(sc->mp, inum);
695 if (error)
696 return error;
697
698 trace_xrep_dir_replay_createname(dp, name, inum);
699
700 xrep_dir_init_args(rd, dp, name);
701 rd->args.inumber = inum;
702 rd->args.total = total;
703 rd->args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
704 return xfs_dir_createname_args(&rd->args);
705 }
706
707 /* Replay a stashed removename onto the temporary directory. */
708 STATIC int
xrep_dir_replay_removename(struct xrep_dir * rd,const struct xfs_name * name,xfs_extlen_t total)709 xrep_dir_replay_removename(
710 struct xrep_dir *rd,
711 const struct xfs_name *name,
712 xfs_extlen_t total)
713 {
714 struct xfs_inode *dp = rd->args.dp;
715
716 ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
717
718 xrep_dir_init_args(rd, dp, name);
719 rd->args.op_flags = 0;
720 rd->args.total = total;
721
722 trace_xrep_dir_replay_removename(dp, name, 0);
723 return xfs_dir_removename_args(&rd->args);
724 }
725
726 /*
727 * Add this stashed incore directory entry to the temporary directory.
728 * The caller must hold the tempdir's IOLOCK, must not hold any ILOCKs, and
729 * must not be in transaction context.
730 */
731 STATIC int
xrep_dir_replay_update(struct xrep_dir * rd,const struct xfs_name * xname,const struct xrep_dirent * dirent)732 xrep_dir_replay_update(
733 struct xrep_dir *rd,
734 const struct xfs_name *xname,
735 const struct xrep_dirent *dirent)
736 {
737 struct xfs_mount *mp = rd->sc->mp;
738 #ifdef DEBUG
739 xfs_ino_t ino;
740 #endif
741 uint resblks;
742 int error;
743
744 resblks = xfs_link_space_res(mp, xname->len);
745 error = xchk_trans_alloc(rd->sc, resblks);
746 if (error)
747 return error;
748
749 /* Lock the temporary directory and join it to the transaction */
750 xrep_tempfile_ilock(rd->sc);
751 xfs_trans_ijoin(rd->sc->tp, rd->sc->tempip, 0);
752
753 switch (dirent->action) {
754 case XREP_DIRENT_ADD:
755 /*
756 * Create a replacement dirent in the temporary directory.
757 * Note that _createname doesn't check for existing entries.
758 * There shouldn't be any in the temporary dir, but we'll
759 * verify this in debug mode.
760 */
761 #ifdef DEBUG
762 error = xchk_dir_lookup(rd->sc, rd->sc->tempip, xname, &ino);
763 if (error != -ENOENT) {
764 ASSERT(error != -ENOENT);
765 goto out_cancel;
766 }
767 #endif
768
769 error = xrep_dir_replay_createname(rd, xname, dirent->ino,
770 resblks);
771 if (error)
772 goto out_cancel;
773
774 if (xname->type == XFS_DIR3_FT_DIR)
775 rd->subdirs++;
776 rd->dirents++;
777 break;
778 case XREP_DIRENT_REMOVE:
779 /*
780 * Remove a dirent from the temporary directory. Note that
781 * _removename doesn't check the inode target of the exist
782 * entry. There should be a perfect match in the temporary
783 * dir, but we'll verify this in debug mode.
784 */
785 #ifdef DEBUG
786 error = xchk_dir_lookup(rd->sc, rd->sc->tempip, xname, &ino);
787 if (error) {
788 ASSERT(error != 0);
789 goto out_cancel;
790 }
791 if (ino != dirent->ino) {
792 ASSERT(ino == dirent->ino);
793 error = -EIO;
794 goto out_cancel;
795 }
796 #endif
797
798 error = xrep_dir_replay_removename(rd, xname, resblks);
799 if (error)
800 goto out_cancel;
801
802 if (xname->type == XFS_DIR3_FT_DIR)
803 rd->subdirs--;
804 rd->dirents--;
805 break;
806 default:
807 ASSERT(0);
808 error = -EIO;
809 goto out_cancel;
810 }
811
812 /* Commit and unlock. */
813 error = xrep_trans_commit(rd->sc);
814 if (error)
815 return error;
816
817 xrep_tempfile_iunlock(rd->sc);
818 return 0;
819 out_cancel:
820 xchk_trans_cancel(rd->sc);
821 xrep_tempfile_iunlock(rd->sc);
822 return error;
823 }
824
825 /*
826 * Flush stashed incore dirent updates that have been recorded by the scanner.
827 * This is done to reduce the memory requirements of the directory rebuild,
828 * since directories can contain up to 32GB of directory data.
829 *
830 * Caller must not hold transactions or ILOCKs. Caller must hold the tempdir
831 * IOLOCK.
832 */
833 STATIC int
xrep_dir_replay_updates(struct xrep_dir * rd)834 xrep_dir_replay_updates(
835 struct xrep_dir *rd)
836 {
837 xfarray_idx_t array_cur;
838 int error;
839
840 /* Add all the salvaged dirents to the temporary directory. */
841 mutex_lock(&rd->pscan.lock);
842 foreach_xfarray_idx(rd->dir_entries, array_cur) {
843 struct xrep_dirent dirent;
844
845 error = xfarray_load(rd->dir_entries, array_cur, &dirent);
846 if (error)
847 goto out_unlock;
848
849 error = xfblob_loadname(rd->dir_names, dirent.name_cookie,
850 &rd->xname, dirent.namelen);
851 if (error)
852 goto out_unlock;
853 rd->xname.type = dirent.ftype;
854 mutex_unlock(&rd->pscan.lock);
855
856 error = xrep_dir_replay_update(rd, &rd->xname, &dirent);
857 if (error)
858 return error;
859 mutex_lock(&rd->pscan.lock);
860 }
861
862 /* Empty out both arrays now that we've added the entries. */
863 xfarray_truncate(rd->dir_entries);
864 xfblob_truncate(rd->dir_names);
865 mutex_unlock(&rd->pscan.lock);
866 return 0;
867 out_unlock:
868 mutex_unlock(&rd->pscan.lock);
869 return error;
870 }
871
872 /*
873 * Periodically flush stashed directory entries to the temporary dir. This
874 * is done to reduce the memory requirements of the directory rebuild, since
875 * directories can contain up to 32GB of directory data.
876 */
877 STATIC int
xrep_dir_flush_stashed(struct xrep_dir * rd)878 xrep_dir_flush_stashed(
879 struct xrep_dir *rd)
880 {
881 int error;
882
883 /*
884 * Entering this function, the scrub context has a reference to the
885 * inode being repaired, the temporary file, and a scrub transaction
886 * that we use during dirent salvaging to avoid livelocking if there
887 * are cycles in the directory structures. We hold ILOCK_EXCL on both
888 * the inode being repaired and the temporary file, though they are
889 * not ijoined to the scrub transaction.
890 *
891 * To constrain kernel memory use, we occasionally write salvaged
892 * dirents from the xfarray and xfblob structures into the temporary
893 * directory in preparation for exchanging the directory structures at
894 * the end. Updating the temporary file requires a transaction, so we
895 * commit the scrub transaction and drop the two ILOCKs so that
896 * we can allocate whatever transaction we want.
897 *
898 * We still hold IOLOCK_EXCL on the inode being repaired, which
899 * prevents anyone from accessing the damaged directory data while we
900 * repair it.
901 */
902 error = xrep_trans_commit(rd->sc);
903 if (error)
904 return error;
905 xchk_iunlock(rd->sc, XFS_ILOCK_EXCL);
906
907 /*
908 * Take the IOLOCK of the temporary file while we modify dirents. This
909 * isn't strictly required because the temporary file is never revealed
910 * to userspace, but we follow the same locking rules. We still hold
911 * sc->ip's IOLOCK.
912 */
913 error = xrep_tempfile_iolock_polled(rd->sc);
914 if (error)
915 return error;
916
917 /* Write to the tempdir all the updates that we've stashed. */
918 error = xrep_dir_replay_updates(rd);
919 xrep_tempfile_iounlock(rd->sc);
920 if (error)
921 return error;
922
923 /*
924 * Recreate the salvage transaction and relock the dir we're salvaging.
925 */
926 error = xchk_trans_alloc(rd->sc, 0);
927 if (error)
928 return error;
929 xchk_ilock(rd->sc, XFS_ILOCK_EXCL);
930 return 0;
931 }
932
933 /* Decide if we've stashed too much dirent data in memory. */
934 static inline bool
xrep_dir_want_flush_stashed(struct xrep_dir * rd)935 xrep_dir_want_flush_stashed(
936 struct xrep_dir *rd)
937 {
938 unsigned long long bytes;
939
940 bytes = xfarray_bytes(rd->dir_entries) + xfblob_bytes(rd->dir_names);
941 return bytes > XREP_DIR_MAX_STASH_BYTES;
942 }
943
944 /* Extract as many directory entries as we can. */
945 STATIC int
xrep_dir_recover(struct xrep_dir * rd)946 xrep_dir_recover(
947 struct xrep_dir *rd)
948 {
949 struct xfs_bmbt_irec got;
950 struct xfs_scrub *sc = rd->sc;
951 struct xfs_da_geometry *geo = sc->mp->m_dir_geo;
952 xfs_fileoff_t offset;
953 xfs_dablk_t dabno;
954 __be32 magic_guess;
955 int nmap;
956 int error;
957
958 xrep_dir_guess_format(rd, &magic_guess);
959
960 /* Iterate each directory data block in the data fork. */
961 for (offset = 0;
962 offset < geo->leafblk;
963 offset = got.br_startoff + got.br_blockcount) {
964 nmap = 1;
965 error = xfs_bmapi_read(sc->ip, offset, geo->leafblk - offset,
966 &got, &nmap, 0);
967 if (error)
968 return error;
969 if (nmap != 1)
970 return -EFSCORRUPTED;
971 if (!xfs_bmap_is_written_extent(&got))
972 continue;
973
974 for (dabno = round_up(got.br_startoff, geo->fsbcount);
975 dabno < got.br_startoff + got.br_blockcount;
976 dabno += geo->fsbcount) {
977 if (xchk_should_terminate(rd->sc, &error))
978 return error;
979
980 error = xrep_dir_recover_dirblock(rd,
981 magic_guess, dabno);
982 if (error)
983 return error;
984
985 /* Flush dirents to constrain memory usage. */
986 if (xrep_dir_want_flush_stashed(rd)) {
987 error = xrep_dir_flush_stashed(rd);
988 if (error)
989 return error;
990 }
991 }
992 }
993
994 return 0;
995 }
996
997 /*
998 * Find all the directory entries for this inode by scraping them out of the
999 * directory leaf blocks by hand, and flushing them into the temp dir.
1000 */
1001 STATIC int
xrep_dir_find_entries(struct xrep_dir * rd)1002 xrep_dir_find_entries(
1003 struct xrep_dir *rd)
1004 {
1005 struct xfs_inode *dp = rd->sc->ip;
1006 int error;
1007
1008 /*
1009 * Salvage directory entries from the old directory, and write them to
1010 * the temporary directory.
1011 */
1012 if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
1013 error = xrep_dir_recover_sf(rd);
1014 } else {
1015 error = xfs_iread_extents(rd->sc->tp, dp, XFS_DATA_FORK);
1016 if (error)
1017 return error;
1018
1019 error = xrep_dir_recover(rd);
1020 }
1021 if (error)
1022 return error;
1023
1024 return xrep_dir_flush_stashed(rd);
1025 }
1026
1027 /* Scan all files in the filesystem for dirents. */
1028 STATIC int
xrep_dir_salvage_entries(struct xrep_dir * rd)1029 xrep_dir_salvage_entries(
1030 struct xrep_dir *rd)
1031 {
1032 struct xfs_scrub *sc = rd->sc;
1033 int error;
1034
1035 /*
1036 * Drop the ILOCK on this directory so that we can scan for this
1037 * directory's parent. Figure out who is going to be the parent of
1038 * this directory, then retake the ILOCK so that we can salvage
1039 * directory entries.
1040 */
1041 xchk_iunlock(sc, XFS_ILOCK_EXCL);
1042 error = xrep_dir_find_parent(rd);
1043 xchk_ilock(sc, XFS_ILOCK_EXCL);
1044 if (error)
1045 return error;
1046
1047 /*
1048 * Collect directory entries by parsing raw leaf blocks to salvage
1049 * whatever we can. When we're done, free the staging memory before
1050 * exchanging the directories to reduce memory usage.
1051 */
1052 error = xrep_dir_find_entries(rd);
1053 if (error)
1054 return error;
1055
1056 /*
1057 * Cancel the repair transaction and drop the ILOCK so that we can
1058 * (later) use the atomic mapping exchange functions to compute the
1059 * correct block reservations and re-lock the inodes.
1060 *
1061 * We still hold IOLOCK_EXCL (aka i_rwsem) which will prevent directory
1062 * modifications, but there's nothing to prevent userspace from reading
1063 * the directory until we're ready for the exchange operation. Reads
1064 * will return -EIO without shutting down the fs, so we're ok with
1065 * that.
1066 *
1067 * The VFS can change dotdot on us, but the findparent scan will keep
1068 * our incore parent inode up to date. See the note on locking issues
1069 * for more details.
1070 */
1071 error = xrep_trans_commit(sc);
1072 if (error)
1073 return error;
1074
1075 xchk_iunlock(sc, XFS_ILOCK_EXCL);
1076 return 0;
1077 }
1078
1079
1080 /*
1081 * Examine a parent pointer of a file. If it leads us back to the directory
1082 * that we're rebuilding, create an incore dirent from the parent pointer and
1083 * stash it.
1084 */
1085 STATIC int
xrep_dir_scan_pptr(struct xfs_scrub * sc,struct xfs_inode * ip,unsigned int attr_flags,const unsigned char * name,unsigned int namelen,const void * value,unsigned int valuelen,void * priv)1086 xrep_dir_scan_pptr(
1087 struct xfs_scrub *sc,
1088 struct xfs_inode *ip,
1089 unsigned int attr_flags,
1090 const unsigned char *name,
1091 unsigned int namelen,
1092 const void *value,
1093 unsigned int valuelen,
1094 void *priv)
1095 {
1096 struct xfs_name xname = {
1097 .name = name,
1098 .len = namelen,
1099 .type = xfs_mode_to_ftype(VFS_I(ip)->i_mode),
1100 };
1101 xfs_ino_t parent_ino;
1102 uint32_t parent_gen;
1103 struct xrep_dir *rd = priv;
1104 int error;
1105
1106 if (!(attr_flags & XFS_ATTR_PARENT))
1107 return 0;
1108
1109 /*
1110 * Ignore parent pointers that point back to a different dir, list the
1111 * wrong generation number, or are invalid.
1112 */
1113 error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value,
1114 valuelen, &parent_ino, &parent_gen);
1115 if (error)
1116 return error;
1117
1118 if (parent_ino != sc->ip->i_ino ||
1119 parent_gen != VFS_I(sc->ip)->i_generation)
1120 return 0;
1121
1122 mutex_lock(&rd->pscan.lock);
1123 error = xrep_dir_stash_createname(rd, &xname, ip->i_ino);
1124 mutex_unlock(&rd->pscan.lock);
1125 return error;
1126 }
1127
1128 /*
1129 * If this child dirent points to the directory being repaired, remember that
1130 * fact so that we can reset the dotdot entry if necessary.
1131 */
1132 STATIC int
xrep_dir_scan_dirent(struct xfs_scrub * sc,struct xfs_inode * dp,xfs_dir2_dataptr_t dapos,const struct xfs_name * name,xfs_ino_t ino,void * priv)1133 xrep_dir_scan_dirent(
1134 struct xfs_scrub *sc,
1135 struct xfs_inode *dp,
1136 xfs_dir2_dataptr_t dapos,
1137 const struct xfs_name *name,
1138 xfs_ino_t ino,
1139 void *priv)
1140 {
1141 struct xrep_dir *rd = priv;
1142
1143 /* Dirent doesn't point to this directory. */
1144 if (ino != rd->sc->ip->i_ino)
1145 return 0;
1146
1147 /* Ignore garbage inum. */
1148 if (!xfs_verify_dir_ino(rd->sc->mp, ino))
1149 return 0;
1150
1151 /* No weird looking names. */
1152 if (name->len >= MAXNAMELEN || name->len <= 0)
1153 return 0;
1154
1155 /* Don't pick up dot or dotdot entries; we only want child dirents. */
1156 if (xfs_dir2_samename(name, &xfs_name_dotdot) ||
1157 xfs_dir2_samename(name, &xfs_name_dot))
1158 return 0;
1159
1160 trace_xrep_dir_stash_createname(sc->tempip, &xfs_name_dotdot,
1161 dp->i_ino);
1162
1163 xrep_findparent_scan_found(&rd->pscan, dp->i_ino);
1164 return 0;
1165 }
1166
1167 /*
1168 * Decide if we want to look for child dirents or parent pointers in this file.
1169 * Skip the dir being repaired and any files being used to stage repairs.
1170 */
1171 static inline bool
xrep_dir_want_scan(struct xrep_dir * rd,const struct xfs_inode * ip)1172 xrep_dir_want_scan(
1173 struct xrep_dir *rd,
1174 const struct xfs_inode *ip)
1175 {
1176 return ip != rd->sc->ip && !xrep_is_tempfile(ip);
1177 }
1178
1179 /*
1180 * Take ILOCK on a file that we want to scan.
1181 *
1182 * Select ILOCK_EXCL if the file is a directory with an unloaded data bmbt or
1183 * has an unloaded attr bmbt. Otherwise, take ILOCK_SHARED.
1184 */
1185 static inline unsigned int
xrep_dir_scan_ilock(struct xrep_dir * rd,struct xfs_inode * ip)1186 xrep_dir_scan_ilock(
1187 struct xrep_dir *rd,
1188 struct xfs_inode *ip)
1189 {
1190 uint lock_mode = XFS_ILOCK_SHARED;
1191
1192 /* Need to take the shared ILOCK to advance the iscan cursor. */
1193 if (!xrep_dir_want_scan(rd, ip))
1194 goto lock;
1195
1196 if (S_ISDIR(VFS_I(ip)->i_mode) && xfs_need_iread_extents(&ip->i_df)) {
1197 lock_mode = XFS_ILOCK_EXCL;
1198 goto lock;
1199 }
1200
1201 if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af))
1202 lock_mode = XFS_ILOCK_EXCL;
1203
1204 lock:
1205 xfs_ilock(ip, lock_mode);
1206 return lock_mode;
1207 }
1208
1209 /*
1210 * Scan this file for relevant child dirents or parent pointers that point to
1211 * the directory we're rebuilding.
1212 */
1213 STATIC int
xrep_dir_scan_file(struct xrep_dir * rd,struct xfs_inode * ip)1214 xrep_dir_scan_file(
1215 struct xrep_dir *rd,
1216 struct xfs_inode *ip)
1217 {
1218 unsigned int lock_mode;
1219 int error = 0;
1220
1221 lock_mode = xrep_dir_scan_ilock(rd, ip);
1222
1223 if (!xrep_dir_want_scan(rd, ip))
1224 goto scan_done;
1225
1226 /*
1227 * If the extended attributes look as though they has been zapped by
1228 * the inode record repair code, we cannot scan for parent pointers.
1229 */
1230 if (xchk_pptr_looks_zapped(ip)) {
1231 error = -EBUSY;
1232 goto scan_done;
1233 }
1234
1235 error = xchk_xattr_walk(rd->sc, ip, xrep_dir_scan_pptr, NULL, rd);
1236 if (error)
1237 goto scan_done;
1238
1239 if (S_ISDIR(VFS_I(ip)->i_mode)) {
1240 /*
1241 * If the directory looks as though it has been zapped by the
1242 * inode record repair code, we cannot scan for child dirents.
1243 */
1244 if (xchk_dir_looks_zapped(ip)) {
1245 error = -EBUSY;
1246 goto scan_done;
1247 }
1248
1249 error = xchk_dir_walk(rd->sc, ip, xrep_dir_scan_dirent, rd);
1250 if (error)
1251 goto scan_done;
1252 }
1253
1254 scan_done:
1255 xchk_iscan_mark_visited(&rd->pscan.iscan, ip);
1256 xfs_iunlock(ip, lock_mode);
1257 return error;
1258 }
1259
1260 /*
1261 * Scan all files in the filesystem for parent pointers that we can turn into
1262 * replacement dirents, and a dirent that we can use to set the dotdot pointer.
1263 */
1264 STATIC int
xrep_dir_scan_dirtree(struct xrep_dir * rd)1265 xrep_dir_scan_dirtree(
1266 struct xrep_dir *rd)
1267 {
1268 struct xfs_scrub *sc = rd->sc;
1269 struct xfs_inode *ip;
1270 int error;
1271
1272 /* Roots of directory trees are their own parents. */
1273 if (sc->ip == sc->mp->m_rootip)
1274 xrep_findparent_scan_found(&rd->pscan, sc->ip->i_ino);
1275
1276 /*
1277 * Filesystem scans are time consuming. Drop the directory ILOCK and
1278 * all other resources for the duration of the scan and hope for the
1279 * best. The live update hooks will keep our scan information up to
1280 * date even though we've dropped the locks.
1281 */
1282 xchk_trans_cancel(sc);
1283 if (sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL))
1284 xchk_iunlock(sc, sc->ilock_flags & (XFS_ILOCK_SHARED |
1285 XFS_ILOCK_EXCL));
1286 error = xchk_trans_alloc_empty(sc);
1287 if (error)
1288 return error;
1289
1290 while ((error = xchk_iscan_iter(&rd->pscan.iscan, &ip)) == 1) {
1291 bool flush;
1292
1293 error = xrep_dir_scan_file(rd, ip);
1294 xchk_irele(sc, ip);
1295 if (error)
1296 break;
1297
1298 /* Flush stashed dirent updates to constrain memory usage. */
1299 mutex_lock(&rd->pscan.lock);
1300 flush = xrep_dir_want_flush_stashed(rd);
1301 mutex_unlock(&rd->pscan.lock);
1302 if (flush) {
1303 xchk_trans_cancel(sc);
1304
1305 error = xrep_tempfile_iolock_polled(sc);
1306 if (error)
1307 break;
1308
1309 error = xrep_dir_replay_updates(rd);
1310 xrep_tempfile_iounlock(sc);
1311 if (error)
1312 break;
1313
1314 error = xchk_trans_alloc_empty(sc);
1315 if (error)
1316 break;
1317 }
1318
1319 if (xchk_should_terminate(sc, &error))
1320 break;
1321 }
1322 xchk_iscan_iter_finish(&rd->pscan.iscan);
1323 if (error) {
1324 /*
1325 * If we couldn't grab an inode that was busy with a state
1326 * change, change the error code so that we exit to userspace
1327 * as quickly as possible.
1328 */
1329 if (error == -EBUSY)
1330 return -ECANCELED;
1331 return error;
1332 }
1333
1334 /*
1335 * Cancel the empty transaction so that we can (later) use the atomic
1336 * file mapping exchange functions to lock files and commit the new
1337 * directory.
1338 */
1339 xchk_trans_cancel(rd->sc);
1340 return 0;
1341 }
1342
1343 /*
1344 * Capture dirent updates being made by other threads which are relevant to the
1345 * directory being repaired.
1346 */
1347 STATIC int
xrep_dir_live_update(struct notifier_block * nb,unsigned long action,void * data)1348 xrep_dir_live_update(
1349 struct notifier_block *nb,
1350 unsigned long action,
1351 void *data)
1352 {
1353 struct xfs_dir_update_params *p = data;
1354 struct xrep_dir *rd;
1355 struct xfs_scrub *sc;
1356 int error = 0;
1357
1358 rd = container_of(nb, struct xrep_dir, pscan.dhook.dirent_hook.nb);
1359 sc = rd->sc;
1360
1361 /*
1362 * This thread updated a child dirent in the directory that we're
1363 * rebuilding. Stash the update for replay against the temporary
1364 * directory.
1365 */
1366 if (p->dp->i_ino == sc->ip->i_ino &&
1367 xchk_iscan_want_live_update(&rd->pscan.iscan, p->ip->i_ino)) {
1368 mutex_lock(&rd->pscan.lock);
1369 if (p->delta > 0)
1370 error = xrep_dir_stash_createname(rd, p->name,
1371 p->ip->i_ino);
1372 else
1373 error = xrep_dir_stash_removename(rd, p->name,
1374 p->ip->i_ino);
1375 mutex_unlock(&rd->pscan.lock);
1376 if (error)
1377 goto out_abort;
1378 }
1379
1380 /*
1381 * This thread updated another directory's child dirent that points to
1382 * the directory that we're rebuilding, so remember the new dotdot
1383 * target.
1384 */
1385 if (p->ip->i_ino == sc->ip->i_ino &&
1386 xchk_iscan_want_live_update(&rd->pscan.iscan, p->dp->i_ino)) {
1387 if (p->delta > 0) {
1388 trace_xrep_dir_stash_createname(sc->tempip,
1389 &xfs_name_dotdot,
1390 p->dp->i_ino);
1391
1392 xrep_findparent_scan_found(&rd->pscan, p->dp->i_ino);
1393 } else {
1394 trace_xrep_dir_stash_removename(sc->tempip,
1395 &xfs_name_dotdot,
1396 rd->pscan.parent_ino);
1397
1398 xrep_findparent_scan_found(&rd->pscan, NULLFSINO);
1399 }
1400 }
1401
1402 return NOTIFY_DONE;
1403 out_abort:
1404 xchk_iscan_abort(&rd->pscan.iscan);
1405 return NOTIFY_DONE;
1406 }
1407
1408 /*
1409 * Free all the directory blocks and reset the data fork. The caller must
1410 * join the inode to the transaction. This function returns with the inode
1411 * joined to a clean scrub transaction.
1412 */
1413 STATIC int
xrep_dir_reset_fork(struct xrep_dir * rd,xfs_ino_t parent_ino)1414 xrep_dir_reset_fork(
1415 struct xrep_dir *rd,
1416 xfs_ino_t parent_ino)
1417 {
1418 struct xfs_scrub *sc = rd->sc;
1419 struct xfs_ifork *ifp = xfs_ifork_ptr(sc->tempip, XFS_DATA_FORK);
1420 int error;
1421
1422 /* Unmap all the directory buffers. */
1423 if (xfs_ifork_has_extents(ifp)) {
1424 error = xrep_reap_ifork(sc, sc->tempip, XFS_DATA_FORK);
1425 if (error)
1426 return error;
1427 }
1428
1429 trace_xrep_dir_reset_fork(sc->tempip, parent_ino);
1430
1431 /* Reset the data fork to an empty data fork. */
1432 xfs_idestroy_fork(ifp);
1433 ifp->if_bytes = 0;
1434 sc->tempip->i_disk_size = 0;
1435
1436 /* Reinitialize the short form directory. */
1437 xrep_dir_init_args(rd, sc->tempip, NULL);
1438 return xfs_dir2_sf_create(&rd->args, parent_ino);
1439 }
1440
1441 /*
1442 * Prepare both inodes' directory forks for exchanging mappings. Promote the
1443 * tempfile from short format to leaf format, and if the file being repaired
1444 * has a short format data fork, turn it into an empty extent list.
1445 */
1446 STATIC int
xrep_dir_swap_prep(struct xfs_scrub * sc,bool temp_local,bool ip_local)1447 xrep_dir_swap_prep(
1448 struct xfs_scrub *sc,
1449 bool temp_local,
1450 bool ip_local)
1451 {
1452 int error;
1453
1454 /*
1455 * If the tempfile's directory is in shortform format, convert that to
1456 * a single leaf extent so that we can use the atomic mapping exchange.
1457 */
1458 if (temp_local) {
1459 struct xfs_da_args args = {
1460 .dp = sc->tempip,
1461 .geo = sc->mp->m_dir_geo,
1462 .whichfork = XFS_DATA_FORK,
1463 .trans = sc->tp,
1464 .total = 1,
1465 .owner = sc->ip->i_ino,
1466 };
1467
1468 error = xfs_dir2_sf_to_block(&args);
1469 if (error)
1470 return error;
1471
1472 /*
1473 * Roll the deferred log items to get us back to a clean
1474 * transaction.
1475 */
1476 error = xfs_defer_finish(&sc->tp);
1477 if (error)
1478 return error;
1479 }
1480
1481 /*
1482 * If the file being repaired had a shortform data fork, convert that
1483 * to an empty extent list in preparation for the atomic mapping
1484 * exchange.
1485 */
1486 if (ip_local) {
1487 struct xfs_ifork *ifp;
1488
1489 ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
1490 xfs_idestroy_fork(ifp);
1491 ifp->if_format = XFS_DINODE_FMT_EXTENTS;
1492 ifp->if_nextents = 0;
1493 ifp->if_bytes = 0;
1494 ifp->if_data = NULL;
1495 ifp->if_height = 0;
1496
1497 xfs_trans_log_inode(sc->tp, sc->ip,
1498 XFS_ILOG_CORE | XFS_ILOG_DDATA);
1499 }
1500
1501 return 0;
1502 }
1503
1504 /*
1505 * Replace the inode number of a directory entry.
1506 */
1507 static int
xrep_dir_replace(struct xrep_dir * rd,struct xfs_inode * dp,const struct xfs_name * name,xfs_ino_t inum,xfs_extlen_t total)1508 xrep_dir_replace(
1509 struct xrep_dir *rd,
1510 struct xfs_inode *dp,
1511 const struct xfs_name *name,
1512 xfs_ino_t inum,
1513 xfs_extlen_t total)
1514 {
1515 struct xfs_scrub *sc = rd->sc;
1516 int error;
1517
1518 ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
1519
1520 error = xfs_dir_ino_validate(sc->mp, inum);
1521 if (error)
1522 return error;
1523
1524 xrep_dir_init_args(rd, dp, name);
1525 rd->args.inumber = inum;
1526 rd->args.total = total;
1527 return xfs_dir_replace_args(&rd->args);
1528 }
1529
1530 /*
1531 * Reset the link count of this directory and adjust the unlinked list pointers
1532 * as needed.
1533 */
1534 STATIC int
xrep_dir_set_nlink(struct xrep_dir * rd)1535 xrep_dir_set_nlink(
1536 struct xrep_dir *rd)
1537 {
1538 struct xfs_scrub *sc = rd->sc;
1539 struct xfs_inode *dp = sc->ip;
1540 struct xfs_perag *pag;
1541 unsigned int new_nlink = min_t(unsigned long long,
1542 rd->subdirs + 2,
1543 XFS_NLINK_PINNED);
1544 int error;
1545
1546 /*
1547 * The directory is not on the incore unlinked list, which means that
1548 * it needs to be reachable via the directory tree. Update the nlink
1549 * with our observed link count. If the directory has no parent, it
1550 * will be moved to the orphanage.
1551 */
1552 if (!xfs_inode_on_unlinked_list(dp))
1553 goto reset_nlink;
1554
1555 /*
1556 * The directory is on the unlinked list and we did not find any
1557 * dirents. Set the link count to zero and let the directory
1558 * inactivate when the last reference drops.
1559 */
1560 if (rd->dirents == 0) {
1561 rd->needs_adoption = false;
1562 new_nlink = 0;
1563 goto reset_nlink;
1564 }
1565
1566 /*
1567 * The directory is on the unlinked list and we found dirents. This
1568 * directory needs to be reachable via the directory tree. Remove the
1569 * dir from the unlinked list and update nlink with the observed link
1570 * count. If the directory has no parent, it will be moved to the
1571 * orphanage.
1572 */
1573 pag = xfs_perag_get(sc->mp, XFS_INO_TO_AGNO(sc->mp, dp->i_ino));
1574 if (!pag) {
1575 ASSERT(0);
1576 return -EFSCORRUPTED;
1577 }
1578
1579 error = xfs_iunlink_remove(sc->tp, pag, dp);
1580 xfs_perag_put(pag);
1581 if (error)
1582 return error;
1583
1584 reset_nlink:
1585 if (VFS_I(dp)->i_nlink != new_nlink)
1586 set_nlink(VFS_I(dp), new_nlink);
1587 return 0;
1588 }
1589
1590 /*
1591 * Finish replaying stashed dirent updates, allocate a transaction for
1592 * exchanging data fork mappings, and take the ILOCKs of both directories
1593 * before we commit the new directory structure.
1594 */
1595 STATIC int
xrep_dir_finalize_tempdir(struct xrep_dir * rd)1596 xrep_dir_finalize_tempdir(
1597 struct xrep_dir *rd)
1598 {
1599 struct xfs_scrub *sc = rd->sc;
1600 int error;
1601
1602 if (!xfs_has_parent(sc->mp))
1603 return xrep_tempexch_trans_alloc(sc, XFS_DATA_FORK, &rd->tx);
1604
1605 /*
1606 * Repair relies on the ILOCK to quiesce all possible dirent updates.
1607 * Replay all queued dirent updates into the tempdir before exchanging
1608 * the contents, even if that means dropping the ILOCKs and the
1609 * transaction.
1610 */
1611 do {
1612 error = xrep_dir_replay_updates(rd);
1613 if (error)
1614 return error;
1615
1616 error = xrep_tempexch_trans_alloc(sc, XFS_DATA_FORK, &rd->tx);
1617 if (error)
1618 return error;
1619
1620 if (xfarray_length(rd->dir_entries) == 0)
1621 break;
1622
1623 xchk_trans_cancel(sc);
1624 xrep_tempfile_iunlock_both(sc);
1625 } while (!xchk_should_terminate(sc, &error));
1626 return error;
1627 }
1628
1629 /* Exchange the temporary directory's data fork with the one being repaired. */
1630 STATIC int
xrep_dir_swap(struct xrep_dir * rd)1631 xrep_dir_swap(
1632 struct xrep_dir *rd)
1633 {
1634 struct xfs_scrub *sc = rd->sc;
1635 bool ip_local, temp_local;
1636 int error = 0;
1637
1638 /*
1639 * If we never found the parent for this directory, temporarily assign
1640 * the root dir as the parent; we'll move this to the orphanage after
1641 * exchanging the dir contents. We hold the ILOCK of the dir being
1642 * repaired, so we're not worried about racy updates of dotdot.
1643 */
1644 ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL);
1645 if (rd->pscan.parent_ino == NULLFSINO) {
1646 rd->needs_adoption = true;
1647 rd->pscan.parent_ino = rd->sc->mp->m_sb.sb_rootino;
1648 }
1649
1650 /*
1651 * Reset the temporary directory's '..' entry to point to the parent
1652 * that we found. The temporary directory was created with the root
1653 * directory as the parent, so we can skip this if repairing a
1654 * subdirectory of the root.
1655 *
1656 * It's also possible that this replacement could also expand a sf
1657 * tempdir into block format.
1658 */
1659 if (rd->pscan.parent_ino != sc->mp->m_rootip->i_ino) {
1660 error = xrep_dir_replace(rd, rd->sc->tempip, &xfs_name_dotdot,
1661 rd->pscan.parent_ino, rd->tx.req.resblks);
1662 if (error)
1663 return error;
1664 }
1665
1666 /*
1667 * Changing the dot and dotdot entries could have changed the shape of
1668 * the directory, so we recompute these.
1669 */
1670 ip_local = sc->ip->i_df.if_format == XFS_DINODE_FMT_LOCAL;
1671 temp_local = sc->tempip->i_df.if_format == XFS_DINODE_FMT_LOCAL;
1672
1673 /*
1674 * If the both files have a local format data fork and the rebuilt
1675 * directory data would fit in the repaired file's data fork, copy
1676 * the contents from the tempfile and update the directory link count.
1677 * We're done now.
1678 */
1679 if (ip_local && temp_local &&
1680 sc->tempip->i_disk_size <= xfs_inode_data_fork_size(sc->ip)) {
1681 xrep_tempfile_copyout_local(sc, XFS_DATA_FORK);
1682 return xrep_dir_set_nlink(rd);
1683 }
1684
1685 /*
1686 * Clean the transaction before we start working on exchanging
1687 * directory contents.
1688 */
1689 error = xrep_tempfile_roll_trans(rd->sc);
1690 if (error)
1691 return error;
1692
1693 /* Otherwise, make sure both data forks are in block-mapping mode. */
1694 error = xrep_dir_swap_prep(sc, temp_local, ip_local);
1695 if (error)
1696 return error;
1697
1698 /*
1699 * Set nlink of the directory in the same transaction sequence that
1700 * (atomically) commits the new directory data.
1701 */
1702 error = xrep_dir_set_nlink(rd);
1703 if (error)
1704 return error;
1705
1706 return xrep_tempexch_contents(sc, &rd->tx);
1707 }
1708
1709 /*
1710 * Exchange the new directory contents (which we created in the tempfile) with
1711 * the directory being repaired.
1712 */
1713 STATIC int
xrep_dir_rebuild_tree(struct xrep_dir * rd)1714 xrep_dir_rebuild_tree(
1715 struct xrep_dir *rd)
1716 {
1717 struct xfs_scrub *sc = rd->sc;
1718 int error;
1719
1720 trace_xrep_dir_rebuild_tree(sc->ip, rd->pscan.parent_ino);
1721
1722 /*
1723 * Take the IOLOCK on the temporary file so that we can run dir
1724 * operations with the same locks held as we would for a normal file.
1725 * We still hold sc->ip's IOLOCK.
1726 */
1727 error = xrep_tempfile_iolock_polled(rd->sc);
1728 if (error)
1729 return error;
1730
1731 /*
1732 * Allocate transaction, lock inodes, and make sure that we've replayed
1733 * all the stashed dirent updates to the tempdir. After this point,
1734 * we're ready to exchange data fork mappings.
1735 */
1736 error = xrep_dir_finalize_tempdir(rd);
1737 if (error)
1738 return error;
1739
1740 if (xchk_iscan_aborted(&rd->pscan.iscan))
1741 return -ECANCELED;
1742
1743 /*
1744 * Exchange the tempdir's data fork with the file being repaired. This
1745 * recreates the transaction and re-takes the ILOCK in the scrub
1746 * context.
1747 */
1748 error = xrep_dir_swap(rd);
1749 if (error)
1750 return error;
1751
1752 /*
1753 * Release the old directory blocks and reset the data fork of the temp
1754 * directory to an empty shortform directory because inactivation does
1755 * nothing for directories.
1756 */
1757 error = xrep_dir_reset_fork(rd, sc->mp->m_rootip->i_ino);
1758 if (error)
1759 return error;
1760
1761 /*
1762 * Roll to get a transaction without any inodes joined to it. Then we
1763 * can drop the tempfile's ILOCK and IOLOCK before doing more work on
1764 * the scrub target directory.
1765 */
1766 error = xfs_trans_roll(&sc->tp);
1767 if (error)
1768 return error;
1769
1770 xrep_tempfile_iunlock(sc);
1771 xrep_tempfile_iounlock(sc);
1772 return 0;
1773 }
1774
1775 /* Set up the filesystem scan so we can regenerate directory entries. */
1776 STATIC int
xrep_dir_setup_scan(struct xrep_dir * rd)1777 xrep_dir_setup_scan(
1778 struct xrep_dir *rd)
1779 {
1780 struct xfs_scrub *sc = rd->sc;
1781 char *descr;
1782 int error;
1783
1784 /* Set up some staging memory for salvaging dirents. */
1785 descr = xchk_xfile_ino_descr(sc, "directory entries");
1786 error = xfarray_create(descr, 0, sizeof(struct xrep_dirent),
1787 &rd->dir_entries);
1788 kfree(descr);
1789 if (error)
1790 return error;
1791
1792 descr = xchk_xfile_ino_descr(sc, "directory entry names");
1793 error = xfblob_create(descr, &rd->dir_names);
1794 kfree(descr);
1795 if (error)
1796 goto out_xfarray;
1797
1798 if (xfs_has_parent(sc->mp))
1799 error = __xrep_findparent_scan_start(sc, &rd->pscan,
1800 xrep_dir_live_update);
1801 else
1802 error = xrep_findparent_scan_start(sc, &rd->pscan);
1803 if (error)
1804 goto out_xfblob;
1805
1806 return 0;
1807
1808 out_xfblob:
1809 xfblob_destroy(rd->dir_names);
1810 rd->dir_names = NULL;
1811 out_xfarray:
1812 xfarray_destroy(rd->dir_entries);
1813 rd->dir_entries = NULL;
1814 return error;
1815 }
1816
1817 /*
1818 * Move the current file to the orphanage.
1819 *
1820 * Caller must hold IOLOCK_EXCL on @sc->ip, and no other inode locks. Upon
1821 * successful return, the scrub transaction will have enough extra reservation
1822 * to make the move; it will hold IOLOCK_EXCL and ILOCK_EXCL of @sc->ip and the
1823 * orphanage; and both inodes will be ijoined.
1824 */
1825 STATIC int
xrep_dir_move_to_orphanage(struct xrep_dir * rd)1826 xrep_dir_move_to_orphanage(
1827 struct xrep_dir *rd)
1828 {
1829 struct xfs_scrub *sc = rd->sc;
1830 xfs_ino_t orig_parent, new_parent;
1831 int error;
1832
1833 /*
1834 * We are about to drop the ILOCK on sc->ip to lock the orphanage and
1835 * prepare for the adoption. Therefore, look up the old dotdot entry
1836 * for sc->ip so that we can compare it after we re-lock sc->ip.
1837 */
1838 error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &orig_parent);
1839 if (error)
1840 return error;
1841
1842 /*
1843 * Drop the ILOCK on the scrub target and commit the transaction.
1844 * Adoption computes its own resource requirements and gathers the
1845 * necessary components.
1846 */
1847 error = xrep_trans_commit(sc);
1848 if (error)
1849 return error;
1850 xchk_iunlock(sc, XFS_ILOCK_EXCL);
1851
1852 /* If we can take the orphanage's iolock then we're ready to move. */
1853 if (!xrep_orphanage_ilock_nowait(sc, XFS_IOLOCK_EXCL)) {
1854 xchk_iunlock(sc, sc->ilock_flags);
1855 error = xrep_orphanage_iolock_two(sc);
1856 if (error)
1857 return error;
1858 }
1859
1860 /* Grab transaction and ILOCK the two files. */
1861 error = xrep_adoption_trans_alloc(sc, &rd->adoption);
1862 if (error)
1863 return error;
1864
1865 error = xrep_adoption_compute_name(&rd->adoption, &rd->xname);
1866 if (error)
1867 return error;
1868
1869 /*
1870 * Now that we've reacquired the ILOCK on sc->ip, look up the dotdot
1871 * entry again. If the parent changed or the child was unlinked while
1872 * the child directory was unlocked, we don't need to move the child to
1873 * the orphanage after all.
1874 */
1875 error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &new_parent);
1876 if (error)
1877 return error;
1878
1879 /*
1880 * Attach to the orphanage if we still have a linked directory and it
1881 * hasn't been moved.
1882 */
1883 if (orig_parent == new_parent && VFS_I(sc->ip)->i_nlink > 0) {
1884 error = xrep_adoption_move(&rd->adoption);
1885 if (error)
1886 return error;
1887 }
1888
1889 /*
1890 * Launder the scrub transaction so we can drop the orphanage ILOCK
1891 * and IOLOCK. Return holding the scrub target's ILOCK and IOLOCK.
1892 */
1893 error = xrep_adoption_trans_roll(&rd->adoption);
1894 if (error)
1895 return error;
1896
1897 xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL);
1898 xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL);
1899 return 0;
1900 }
1901
1902 /*
1903 * Repair the directory metadata.
1904 *
1905 * XXX: Directory entry buffers can be multiple fsblocks in size. The buffer
1906 * cache in XFS can't handle aliased multiblock buffers, so this might
1907 * misbehave if the directory blocks are crosslinked with other filesystem
1908 * metadata.
1909 *
1910 * XXX: Is it necessary to check the dcache for this directory to make sure
1911 * that we always recreate every cached entry?
1912 */
1913 int
xrep_directory(struct xfs_scrub * sc)1914 xrep_directory(
1915 struct xfs_scrub *sc)
1916 {
1917 struct xrep_dir *rd = sc->buf;
1918 int error;
1919
1920 /* The rmapbt is required to reap the old data fork. */
1921 if (!xfs_has_rmapbt(sc->mp))
1922 return -EOPNOTSUPP;
1923 /* We require atomic file exchange range to rebuild anything. */
1924 if (!xfs_has_exchange_range(sc->mp))
1925 return -EOPNOTSUPP;
1926
1927 error = xrep_dir_setup_scan(rd);
1928 if (error)
1929 return error;
1930
1931 if (xfs_has_parent(sc->mp))
1932 error = xrep_dir_scan_dirtree(rd);
1933 else
1934 error = xrep_dir_salvage_entries(rd);
1935 if (error)
1936 goto out_teardown;
1937
1938 /* Last chance to abort before we start committing fixes. */
1939 if (xchk_should_terminate(sc, &error))
1940 goto out_teardown;
1941
1942 error = xrep_dir_rebuild_tree(rd);
1943 if (error)
1944 goto out_teardown;
1945
1946 if (rd->needs_adoption) {
1947 if (!xrep_orphanage_can_adopt(rd->sc))
1948 error = -EFSCORRUPTED;
1949 else
1950 error = xrep_dir_move_to_orphanage(rd);
1951 if (error)
1952 goto out_teardown;
1953 }
1954
1955 out_teardown:
1956 xrep_dir_teardown(sc);
1957 return error;
1958 }
1959