1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Copyright (c) 2020-2024 Oracle. All Rights Reserved. 4 * Author: Darrick J. Wong <djwong@kernel.org> 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_trans_resv.h" 11 #include "xfs_mount.h" 12 #include "xfs_defer.h" 13 #include "xfs_bit.h" 14 #include "xfs_log_format.h" 15 #include "xfs_trans.h" 16 #include "xfs_sb.h" 17 #include "xfs_inode.h" 18 #include "xfs_icache.h" 19 #include "xfs_da_format.h" 20 #include "xfs_da_btree.h" 21 #include "xfs_dir2.h" 22 #include "xfs_dir2_priv.h" 23 #include "xfs_bmap.h" 24 #include "xfs_quota.h" 25 #include "xfs_bmap_btree.h" 26 #include "xfs_trans_space.h" 27 #include "xfs_bmap_util.h" 28 #include "xfs_exchmaps.h" 29 #include "xfs_exchrange.h" 30 #include "xfs_ag.h" 31 #include "xfs_parent.h" 32 #include "scrub/xfs_scrub.h" 33 #include "scrub/scrub.h" 34 #include "scrub/common.h" 35 #include "scrub/trace.h" 36 #include "scrub/repair.h" 37 #include "scrub/tempfile.h" 38 #include "scrub/tempexch.h" 39 #include "scrub/xfile.h" 40 #include "scrub/xfarray.h" 41 #include "scrub/xfblob.h" 42 #include "scrub/iscan.h" 43 #include "scrub/readdir.h" 44 #include "scrub/reap.h" 45 #include "scrub/findparent.h" 46 #include "scrub/orphanage.h" 47 #include "scrub/listxattr.h" 48 49 /* 50 * Directory Repair 51 * ================ 52 * 53 * We repair directories by reading the directory data blocks looking for 54 * directory entries that look salvageable (name passes verifiers, entry points 55 * to a valid allocated inode, etc). Each entry worth salvaging is stashed in 56 * memory, and the stashed entries are periodically replayed into a temporary 57 * directory to constrain memory use. Batching the construction of the 58 * temporary directory in this fashion reduces lock cycling of the directory 59 * being repaired and the temporary directory, and will later become important 60 * for parent pointer scanning. 
61 * 62 * If parent pointers are enabled on this filesystem, we instead reconstruct 63 * the directory by visiting each parent pointer of each file in the filesystem 64 * and translating the relevant parent pointer records into dirents. In this 65 * case, it is advantageous to stash all directory entries created from parent 66 * pointers for a single child file before replaying them into the temporary 67 * directory. To save memory, the live filesystem scan reuses the findparent 68 * fields. Directory repair chooses either parent pointer scanning or 69 * directory entry salvaging, but not both. 70 * 71 * Directory entries added to the temporary directory do not elevate the link 72 * counts of the inodes found. When salvaging completes, the remaining stashed 73 * entries are replayed to the temporary directory. An atomic mapping exchange 74 * is used to commit the new directory blocks to the directory being repaired. 75 * This will disrupt readdir cursors. 76 * 77 * Locking Issues 78 * -------------- 79 * 80 * If /a, /a/b, and /c are all directories, the VFS does not take i_rwsem on 81 * /a/b for a "mv /a/b /c/" operation. This means that only b's ILOCK protects 82 * b's dotdot update. This is in contrast to every other dotdot update (link, 83 * remove, mkdir). If the repair code drops the ILOCK, it must either 84 * revalidate the dotdot entry or use dirent hooks to capture updates from 85 * other threads. 86 */ 87 88 /* Create a dirent in the tempdir. */ 89 #define XREP_DIRENT_ADD (1) 90 91 /* Remove a dirent from the tempdir. */ 92 #define XREP_DIRENT_REMOVE (2) 93 94 /* Directory entry to be restored in the new directory. */ 95 struct xrep_dirent { 96 /* Cookie for retrieval of the dirent name. */ 97 xfblob_cookie name_cookie; 98 99 /* Target inode number. */ 100 xfs_ino_t ino; 101 102 /* Length of the dirent name. */ 103 uint8_t namelen; 104 105 /* File type of the dirent. 
*/ 106 uint8_t ftype; 107 108 /* XREP_DIRENT_{ADD,REMOVE} */ 109 uint8_t action; 110 }; 111 112 /* 113 * Stash up to 8 pages of recovered dirent data in dir_entries and dir_names 114 * before we write them to the temp dir. 115 */ 116 #define XREP_DIR_MAX_STASH_BYTES (PAGE_SIZE * 8) 117 118 struct xrep_dir { 119 struct xfs_scrub *sc; 120 121 /* Fixed-size array of xrep_dirent structures. */ 122 struct xfarray *dir_entries; 123 124 /* Blobs containing directory entry names. */ 125 struct xfblob *dir_names; 126 127 /* Information for exchanging data forks at the end. */ 128 struct xrep_tempexch tx; 129 130 /* Preallocated args struct for performing dir operations */ 131 struct xfs_da_args args; 132 133 /* 134 * Information used to scan the filesystem to find the inumber of the 135 * dotdot entry for this directory. For directory salvaging when 136 * parent pointers are not enabled, we use the findparent_* functions 137 * on this object and access only the parent_ino field directly. 138 * 139 * When parent pointers are enabled, however, the pptr scanner uses the 140 * iscan, hooks, lock, and parent_ino fields of this object directly. 141 * @pscan.lock coordinates access to dir_entries, dir_names, 142 * parent_ino, subdirs, dirents, and args. This reduces the memory 143 * requirements of this structure. 144 */ 145 struct xrep_parent_scan_info pscan; 146 147 /* 148 * Context information for attaching this directory to the lost+found 149 * if this directory does not have a parent. 150 */ 151 struct xrep_adoption adoption; 152 153 /* How many subdirectories did we find? */ 154 uint64_t subdirs; 155 156 /* How many dirents did we find? */ 157 unsigned int dirents; 158 159 /* Should we move this directory to the orphanage? */ 160 bool needs_adoption; 161 162 /* Directory entry name, plus the trailing null. */ 163 struct xfs_name xname; 164 unsigned char namebuf[MAXNAMELEN]; 165 }; 166 167 /* Tear down all the incore stuff we created. 
*/ 168 static void xrep_dir_teardown(struct xfs_scrub * sc)169 xrep_dir_teardown( 170 struct xfs_scrub *sc) 171 { 172 struct xrep_dir *rd = sc->buf; 173 174 xrep_findparent_scan_teardown(&rd->pscan); 175 xfblob_destroy(rd->dir_names); 176 xfarray_destroy(rd->dir_entries); 177 } 178 179 /* Set up for a directory repair. */ 180 int xrep_setup_directory(struct xfs_scrub * sc)181 xrep_setup_directory( 182 struct xfs_scrub *sc) 183 { 184 struct xrep_dir *rd; 185 int error; 186 187 xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS); 188 189 error = xrep_orphanage_try_create(sc); 190 if (error) 191 return error; 192 193 error = xrep_tempfile_create(sc, S_IFDIR); 194 if (error) 195 return error; 196 197 rd = kvzalloc(sizeof(struct xrep_dir), XCHK_GFP_FLAGS); 198 if (!rd) 199 return -ENOMEM; 200 rd->sc = sc; 201 rd->xname.name = rd->namebuf; 202 sc->buf = rd; 203 204 return 0; 205 } 206 207 /* 208 * Look up the dotdot entry and confirm that it's really the parent. 209 * Returns NULLFSINO if we don't know what to do. 210 */ 211 static inline xfs_ino_t xrep_dir_lookup_parent(struct xrep_dir * rd)212 xrep_dir_lookup_parent( 213 struct xrep_dir *rd) 214 { 215 struct xfs_scrub *sc = rd->sc; 216 xfs_ino_t ino; 217 int error; 218 219 error = xfs_dir_lookup(sc->tp, sc->ip, &xfs_name_dotdot, &ino, NULL); 220 if (error) 221 return NULLFSINO; 222 if (!xfs_verify_dir_ino(sc->mp, ino)) 223 return NULLFSINO; 224 225 error = xrep_findparent_confirm(sc, &ino); 226 if (error) 227 return NULLFSINO; 228 229 return ino; 230 } 231 232 /* 233 * Look up '..' in the dentry cache and confirm that it's really the parent. 234 * Returns NULLFSINO if the dcache misses or if the hit is implausible. 
235 */ 236 static inline xfs_ino_t xrep_dir_dcache_parent(struct xrep_dir * rd)237 xrep_dir_dcache_parent( 238 struct xrep_dir *rd) 239 { 240 struct xfs_scrub *sc = rd->sc; 241 xfs_ino_t parent_ino; 242 int error; 243 244 parent_ino = xrep_findparent_from_dcache(sc); 245 if (parent_ino == NULLFSINO) 246 return parent_ino; 247 248 error = xrep_findparent_confirm(sc, &parent_ino); 249 if (error) 250 return NULLFSINO; 251 252 return parent_ino; 253 } 254 255 /* Try to find the parent of the directory being repaired. */ 256 STATIC int xrep_dir_find_parent(struct xrep_dir * rd)257 xrep_dir_find_parent( 258 struct xrep_dir *rd) 259 { 260 xfs_ino_t ino; 261 262 ino = xrep_findparent_self_reference(rd->sc); 263 if (ino != NULLFSINO) { 264 xrep_findparent_scan_finish_early(&rd->pscan, ino); 265 return 0; 266 } 267 268 ino = xrep_dir_dcache_parent(rd); 269 if (ino != NULLFSINO) { 270 xrep_findparent_scan_finish_early(&rd->pscan, ino); 271 return 0; 272 } 273 274 ino = xrep_dir_lookup_parent(rd); 275 if (ino != NULLFSINO) { 276 xrep_findparent_scan_finish_early(&rd->pscan, ino); 277 return 0; 278 } 279 280 /* 281 * A full filesystem scan is the last resort. On a busy filesystem, 282 * the scan can fail with -EBUSY if we cannot grab IOLOCKs. That means 283 * that we don't know what who the parent is, so we should return to 284 * userspace. 285 */ 286 return xrep_findparent_scan(&rd->pscan); 287 } 288 289 /* 290 * Decide if we want to salvage this entry. We don't bother with oversized 291 * names or the dot entry. 292 */ 293 STATIC int xrep_dir_want_salvage(struct xrep_dir * rd,const char * name,int namelen,xfs_ino_t ino)294 xrep_dir_want_salvage( 295 struct xrep_dir *rd, 296 const char *name, 297 int namelen, 298 xfs_ino_t ino) 299 { 300 struct xfs_mount *mp = rd->sc->mp; 301 302 /* No pointers to ourselves or to garbage. 
*/ 303 if (ino == rd->sc->ip->i_ino) 304 return false; 305 if (!xfs_verify_dir_ino(mp, ino)) 306 return false; 307 308 /* No weird looking names or dot entries. */ 309 if (namelen >= MAXNAMELEN || namelen <= 0) 310 return false; 311 if (namelen == 1 && name[0] == '.') 312 return false; 313 if (!xfs_dir2_namecheck(name, namelen)) 314 return false; 315 316 return true; 317 } 318 319 /* 320 * Remember that we want to create a dirent in the tempdir. These stashed 321 * actions will be replayed later. 322 */ 323 STATIC int xrep_dir_stash_createname(struct xrep_dir * rd,const struct xfs_name * name,xfs_ino_t ino)324 xrep_dir_stash_createname( 325 struct xrep_dir *rd, 326 const struct xfs_name *name, 327 xfs_ino_t ino) 328 { 329 struct xrep_dirent dirent = { 330 .action = XREP_DIRENT_ADD, 331 .ino = ino, 332 .namelen = name->len, 333 .ftype = name->type, 334 }; 335 int error; 336 337 trace_xrep_dir_stash_createname(rd->sc->tempip, name, ino); 338 339 error = xfblob_storename(rd->dir_names, &dirent.name_cookie, name); 340 if (error) 341 return error; 342 343 return xfarray_append(rd->dir_entries, &dirent); 344 } 345 346 /* 347 * Remember that we want to remove a dirent from the tempdir. These stashed 348 * actions will be replayed later. 349 */ 350 STATIC int xrep_dir_stash_removename(struct xrep_dir * rd,const struct xfs_name * name,xfs_ino_t ino)351 xrep_dir_stash_removename( 352 struct xrep_dir *rd, 353 const struct xfs_name *name, 354 xfs_ino_t ino) 355 { 356 struct xrep_dirent dirent = { 357 .action = XREP_DIRENT_REMOVE, 358 .ino = ino, 359 .namelen = name->len, 360 .ftype = name->type, 361 }; 362 int error; 363 364 trace_xrep_dir_stash_removename(rd->sc->tempip, name, ino); 365 366 error = xfblob_storename(rd->dir_names, &dirent.name_cookie, name); 367 if (error) 368 return error; 369 370 return xfarray_append(rd->dir_entries, &dirent); 371 } 372 373 /* Allocate an in-core record to hold entries while we rebuild the dir data. 
*/
STATIC int
xrep_dir_salvage_entry(
	struct xrep_dir		*rd,
	unsigned char		*name,
	unsigned int		namelen,
	xfs_ino_t		ino)
{
	struct xfs_name		xname = {
		.name		= name,
	};
	struct xfs_scrub	*sc = rd->sc;
	struct xfs_inode	*ip;
	unsigned int		i = 0;
	int			error = 0;

	if (xchk_should_terminate(sc, &error))
		return error;

	/*
	 * Truncate the name to the first character that would trip namecheck.
	 * If we no longer have a name after that, ignore this entry.
	 */
	while (i < namelen && name[i] != 0 && name[i] != '/')
		i++;
	if (i == 0)
		return 0;
	xname.len = i;

	/* Ignore '..' entries; we already picked the new parent. */
	if (xname.len == 2 && name[0] == '.' && name[1] == '.') {
		trace_xrep_dir_salvaged_parent(sc->ip, ino);
		return 0;
	}

	trace_xrep_dir_salvage_entry(sc->ip, &xname, ino);

	/*
	 * Compute the ftype or dump the entry if we can't. We don't lock the
	 * inode because inodes can't change type while we have a reference.
	 * A failed iget is not an error for the repair; the entry is simply
	 * not salvaged.
	 */
	error = xchk_iget(sc, ino, &ip);
	if (error)
		return 0;

	xname.type = xfs_mode_to_ftype(VFS_I(ip)->i_mode);
	xchk_irele(sc, ip);

	return xrep_dir_stash_createname(rd, &xname, ino);
}

/* Record a shortform directory entry for later reinsertion.
*/ 425 STATIC int xrep_dir_salvage_sf_entry(struct xrep_dir * rd,struct xfs_dir2_sf_hdr * sfp,struct xfs_dir2_sf_entry * sfep)426 xrep_dir_salvage_sf_entry( 427 struct xrep_dir *rd, 428 struct xfs_dir2_sf_hdr *sfp, 429 struct xfs_dir2_sf_entry *sfep) 430 { 431 xfs_ino_t ino; 432 433 ino = xfs_dir2_sf_get_ino(rd->sc->mp, sfp, sfep); 434 if (!xrep_dir_want_salvage(rd, sfep->name, sfep->namelen, ino)) 435 return 0; 436 437 return xrep_dir_salvage_entry(rd, sfep->name, sfep->namelen, ino); 438 } 439 440 /* Record a regular directory entry for later reinsertion. */ 441 STATIC int xrep_dir_salvage_data_entry(struct xrep_dir * rd,struct xfs_dir2_data_entry * dep)442 xrep_dir_salvage_data_entry( 443 struct xrep_dir *rd, 444 struct xfs_dir2_data_entry *dep) 445 { 446 xfs_ino_t ino; 447 448 ino = be64_to_cpu(dep->inumber); 449 if (!xrep_dir_want_salvage(rd, dep->name, dep->namelen, ino)) 450 return 0; 451 452 return xrep_dir_salvage_entry(rd, dep->name, dep->namelen, ino); 453 } 454 455 /* Try to recover block/data format directory entries. */ 456 STATIC int xrep_dir_recover_data(struct xrep_dir * rd,struct xfs_buf * bp)457 xrep_dir_recover_data( 458 struct xrep_dir *rd, 459 struct xfs_buf *bp) 460 { 461 struct xfs_da_geometry *geo = rd->sc->mp->m_dir_geo; 462 unsigned int offset; 463 unsigned int end; 464 int error = 0; 465 466 /* 467 * Loop over the data portion of the block. 468 * Each object is a real entry (dep) or an unused one (dup). 469 */ 470 offset = geo->data_entry_offset; 471 end = min_t(unsigned int, BBTOB(bp->b_length), 472 xfs_dir3_data_end_offset(geo, bp->b_addr)); 473 474 while (offset < end) { 475 struct xfs_dir2_data_unused *dup = bp->b_addr + offset; 476 struct xfs_dir2_data_entry *dep = bp->b_addr + offset; 477 478 if (xchk_should_terminate(rd->sc, &error)) 479 return error; 480 481 /* Skip unused entries. 
*/ 482 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { 483 offset += be16_to_cpu(dup->length); 484 continue; 485 } 486 487 /* Don't walk off the end of the block. */ 488 offset += xfs_dir2_data_entsize(rd->sc->mp, dep->namelen); 489 if (offset > end) 490 break; 491 492 /* Ok, let's save this entry. */ 493 error = xrep_dir_salvage_data_entry(rd, dep); 494 if (error) 495 return error; 496 497 } 498 499 return 0; 500 } 501 502 /* Try to recover shortform directory entries. */ 503 STATIC int xrep_dir_recover_sf(struct xrep_dir * rd)504 xrep_dir_recover_sf( 505 struct xrep_dir *rd) 506 { 507 struct xfs_dir2_sf_hdr *hdr; 508 struct xfs_dir2_sf_entry *sfep; 509 struct xfs_dir2_sf_entry *next; 510 struct xfs_ifork *ifp; 511 xfs_ino_t ino; 512 unsigned char *end; 513 int error = 0; 514 515 ifp = xfs_ifork_ptr(rd->sc->ip, XFS_DATA_FORK); 516 hdr = ifp->if_data; 517 end = (unsigned char *)ifp->if_data + ifp->if_bytes; 518 519 ino = xfs_dir2_sf_get_parent_ino(hdr); 520 trace_xrep_dir_salvaged_parent(rd->sc->ip, ino); 521 522 sfep = xfs_dir2_sf_firstentry(hdr); 523 while ((unsigned char *)sfep < end) { 524 if (xchk_should_terminate(rd->sc, &error)) 525 return error; 526 527 next = xfs_dir2_sf_nextentry(rd->sc->mp, hdr, sfep); 528 if ((unsigned char *)next > end) 529 break; 530 531 /* Ok, let's save this entry. */ 532 error = xrep_dir_salvage_sf_entry(rd, hdr, sfep); 533 if (error) 534 return error; 535 536 sfep = next; 537 } 538 539 return 0; 540 } 541 542 /* 543 * Try to figure out the format of this directory from the data fork mappings 544 * and the directory size. If we can be reasonably sure of format, we can be 545 * more aggressive in salvaging directory entries. On return, @magic_guess 546 * will be set to DIR3_BLOCK_MAGIC if we think this is a "block format" 547 * directory; DIR3_DATA_MAGIC if we think this is a "data format" directory, 548 * and 0 if we can't tell. 
549 */ 550 STATIC void xrep_dir_guess_format(struct xrep_dir * rd,__be32 * magic_guess)551 xrep_dir_guess_format( 552 struct xrep_dir *rd, 553 __be32 *magic_guess) 554 { 555 struct xfs_inode *dp = rd->sc->ip; 556 struct xfs_mount *mp = rd->sc->mp; 557 struct xfs_da_geometry *geo = mp->m_dir_geo; 558 xfs_fileoff_t last; 559 int error; 560 561 ASSERT(xfs_has_crc(mp)); 562 563 *magic_guess = 0; 564 565 /* 566 * If there's a single directory block and the directory size is 567 * exactly one block, this has to be a single block format directory. 568 */ 569 error = xfs_bmap_last_offset(dp, &last, XFS_DATA_FORK); 570 if (!error && XFS_FSB_TO_B(mp, last) == geo->blksize && 571 dp->i_disk_size == geo->blksize) { 572 *magic_guess = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC); 573 return; 574 } 575 576 /* 577 * If the last extent before the leaf offset matches the directory 578 * size and the directory size is larger than 1 block, this is a 579 * data format directory. 580 */ 581 last = geo->leafblk; 582 error = xfs_bmap_last_before(rd->sc->tp, dp, &last, XFS_DATA_FORK); 583 if (!error && 584 XFS_FSB_TO_B(mp, last) > geo->blksize && 585 XFS_FSB_TO_B(mp, last) == dp->i_disk_size) { 586 *magic_guess = cpu_to_be32(XFS_DIR3_DATA_MAGIC); 587 return; 588 } 589 } 590 591 /* Recover directory entries from a specific directory block. */ 592 STATIC int xrep_dir_recover_dirblock(struct xrep_dir * rd,__be32 magic_guess,xfs_dablk_t dabno)593 xrep_dir_recover_dirblock( 594 struct xrep_dir *rd, 595 __be32 magic_guess, 596 xfs_dablk_t dabno) 597 { 598 struct xfs_dir2_data_hdr *hdr; 599 struct xfs_buf *bp; 600 __be32 oldmagic; 601 int error; 602 603 /* 604 * Try to read buffer. We invalidate them in the next step so we don't 605 * bother to set a buffer type or ops. 
606 */ 607 error = xfs_da_read_buf(rd->sc->tp, rd->sc->ip, dabno, 608 XFS_DABUF_MAP_HOLE_OK, &bp, XFS_DATA_FORK, NULL); 609 if (error || !bp) 610 return error; 611 612 hdr = bp->b_addr; 613 oldmagic = hdr->magic; 614 615 trace_xrep_dir_recover_dirblock(rd->sc->ip, dabno, 616 be32_to_cpu(hdr->magic), be32_to_cpu(magic_guess)); 617 618 /* 619 * If we're sure of the block's format, proceed with the salvage 620 * operation using the specified magic number. 621 */ 622 if (magic_guess) { 623 hdr->magic = magic_guess; 624 goto recover; 625 } 626 627 /* 628 * If we couldn't guess what type of directory this is, then we will 629 * only salvage entries from directory blocks that match the magic 630 * number and pass verifiers. 631 */ 632 switch (hdr->magic) { 633 case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC): 634 case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC): 635 if (!xrep_buf_verify_struct(bp, &xfs_dir3_block_buf_ops)) 636 goto out; 637 if (xfs_dir3_block_header_check(bp, rd->sc->ip->i_ino) != NULL) 638 goto out; 639 break; 640 case cpu_to_be32(XFS_DIR2_DATA_MAGIC): 641 case cpu_to_be32(XFS_DIR3_DATA_MAGIC): 642 if (!xrep_buf_verify_struct(bp, &xfs_dir3_data_buf_ops)) 643 goto out; 644 if (xfs_dir3_data_header_check(bp, rd->sc->ip->i_ino) != NULL) 645 goto out; 646 break; 647 default: 648 goto out; 649 } 650 651 recover: 652 error = xrep_dir_recover_data(rd, bp); 653 654 out: 655 hdr->magic = oldmagic; 656 xfs_trans_brelse(rd->sc->tp, bp); 657 return error; 658 } 659 660 static inline void xrep_dir_init_args(struct xrep_dir * rd,struct xfs_inode * dp,const struct xfs_name * name)661 xrep_dir_init_args( 662 struct xrep_dir *rd, 663 struct xfs_inode *dp, 664 const struct xfs_name *name) 665 { 666 memset(&rd->args, 0, sizeof(struct xfs_da_args)); 667 rd->args.geo = rd->sc->mp->m_dir_geo; 668 rd->args.whichfork = XFS_DATA_FORK; 669 rd->args.owner = rd->sc->ip->i_ino; 670 rd->args.trans = rd->sc->tp; 671 rd->args.dp = dp; 672 if (!name) 673 return; 674 rd->args.name = name->name; 675 
rd->args.namelen = name->len; 676 rd->args.filetype = name->type; 677 rd->args.hashval = xfs_dir2_hashname(rd->sc->mp, name); 678 } 679 680 /* Replay a stashed createname into the temporary directory. */ 681 STATIC int xrep_dir_replay_createname(struct xrep_dir * rd,const struct xfs_name * name,xfs_ino_t inum,xfs_extlen_t total)682 xrep_dir_replay_createname( 683 struct xrep_dir *rd, 684 const struct xfs_name *name, 685 xfs_ino_t inum, 686 xfs_extlen_t total) 687 { 688 struct xfs_scrub *sc = rd->sc; 689 struct xfs_inode *dp = rd->sc->tempip; 690 int error; 691 692 ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); 693 694 error = xfs_dir_ino_validate(sc->mp, inum); 695 if (error) 696 return error; 697 698 trace_xrep_dir_replay_createname(dp, name, inum); 699 700 xrep_dir_init_args(rd, dp, name); 701 rd->args.inumber = inum; 702 rd->args.total = total; 703 rd->args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; 704 return xfs_dir_createname_args(&rd->args); 705 } 706 707 /* Replay a stashed removename onto the temporary directory. */ 708 STATIC int xrep_dir_replay_removename(struct xrep_dir * rd,const struct xfs_name * name,xfs_extlen_t total)709 xrep_dir_replay_removename( 710 struct xrep_dir *rd, 711 const struct xfs_name *name, 712 xfs_extlen_t total) 713 { 714 struct xfs_inode *dp = rd->args.dp; 715 716 ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); 717 718 xrep_dir_init_args(rd, dp, name); 719 rd->args.op_flags = 0; 720 rd->args.total = total; 721 722 trace_xrep_dir_replay_removename(dp, name, 0); 723 return xfs_dir_removename_args(&rd->args); 724 } 725 726 /* 727 * Add this stashed incore directory entry to the temporary directory. 728 * The caller must hold the tempdir's IOLOCK, must not hold any ILOCKs, and 729 * must not be in transaction context. 
730 */ 731 STATIC int xrep_dir_replay_update(struct xrep_dir * rd,const struct xfs_name * xname,const struct xrep_dirent * dirent)732 xrep_dir_replay_update( 733 struct xrep_dir *rd, 734 const struct xfs_name *xname, 735 const struct xrep_dirent *dirent) 736 { 737 struct xfs_mount *mp = rd->sc->mp; 738 #ifdef DEBUG 739 xfs_ino_t ino; 740 #endif 741 uint resblks; 742 int error; 743 744 resblks = xfs_link_space_res(mp, xname->len); 745 error = xchk_trans_alloc(rd->sc, resblks); 746 if (error) 747 return error; 748 749 /* Lock the temporary directory and join it to the transaction */ 750 xrep_tempfile_ilock(rd->sc); 751 xfs_trans_ijoin(rd->sc->tp, rd->sc->tempip, 0); 752 753 switch (dirent->action) { 754 case XREP_DIRENT_ADD: 755 /* 756 * Create a replacement dirent in the temporary directory. 757 * Note that _createname doesn't check for existing entries. 758 * There shouldn't be any in the temporary dir, but we'll 759 * verify this in debug mode. 760 */ 761 #ifdef DEBUG 762 error = xchk_dir_lookup(rd->sc, rd->sc->tempip, xname, &ino); 763 if (error != -ENOENT) { 764 ASSERT(error != -ENOENT); 765 goto out_cancel; 766 } 767 #endif 768 769 error = xrep_dir_replay_createname(rd, xname, dirent->ino, 770 resblks); 771 if (error) 772 goto out_cancel; 773 774 if (xname->type == XFS_DIR3_FT_DIR) 775 rd->subdirs++; 776 rd->dirents++; 777 break; 778 case XREP_DIRENT_REMOVE: 779 /* 780 * Remove a dirent from the temporary directory. Note that 781 * _removename doesn't check the inode target of the exist 782 * entry. There should be a perfect match in the temporary 783 * dir, but we'll verify this in debug mode. 
784 */ 785 #ifdef DEBUG 786 error = xchk_dir_lookup(rd->sc, rd->sc->tempip, xname, &ino); 787 if (error) { 788 ASSERT(error != 0); 789 goto out_cancel; 790 } 791 if (ino != dirent->ino) { 792 ASSERT(ino == dirent->ino); 793 error = -EIO; 794 goto out_cancel; 795 } 796 #endif 797 798 error = xrep_dir_replay_removename(rd, xname, resblks); 799 if (error) 800 goto out_cancel; 801 802 if (xname->type == XFS_DIR3_FT_DIR) 803 rd->subdirs--; 804 rd->dirents--; 805 break; 806 default: 807 ASSERT(0); 808 error = -EIO; 809 goto out_cancel; 810 } 811 812 /* Commit and unlock. */ 813 error = xrep_trans_commit(rd->sc); 814 if (error) 815 return error; 816 817 xrep_tempfile_iunlock(rd->sc); 818 return 0; 819 out_cancel: 820 xchk_trans_cancel(rd->sc); 821 xrep_tempfile_iunlock(rd->sc); 822 return error; 823 } 824 825 /* 826 * Flush stashed incore dirent updates that have been recorded by the scanner. 827 * This is done to reduce the memory requirements of the directory rebuild, 828 * since directories can contain up to 32GB of directory data. 829 * 830 * Caller must not hold transactions or ILOCKs. Caller must hold the tempdir 831 * IOLOCK. 832 */ 833 STATIC int xrep_dir_replay_updates(struct xrep_dir * rd)834 xrep_dir_replay_updates( 835 struct xrep_dir *rd) 836 { 837 xfarray_idx_t array_cur; 838 int error; 839 840 /* Add all the salvaged dirents to the temporary directory. 
*/ 841 mutex_lock(&rd->pscan.lock); 842 foreach_xfarray_idx(rd->dir_entries, array_cur) { 843 struct xrep_dirent dirent; 844 845 error = xfarray_load(rd->dir_entries, array_cur, &dirent); 846 if (error) 847 goto out_unlock; 848 849 error = xfblob_loadname(rd->dir_names, dirent.name_cookie, 850 &rd->xname, dirent.namelen); 851 if (error) 852 goto out_unlock; 853 rd->xname.type = dirent.ftype; 854 mutex_unlock(&rd->pscan.lock); 855 856 error = xrep_dir_replay_update(rd, &rd->xname, &dirent); 857 if (error) 858 return error; 859 mutex_lock(&rd->pscan.lock); 860 } 861 862 /* Empty out both arrays now that we've added the entries. */ 863 xfarray_truncate(rd->dir_entries); 864 xfblob_truncate(rd->dir_names); 865 mutex_unlock(&rd->pscan.lock); 866 return 0; 867 out_unlock: 868 mutex_unlock(&rd->pscan.lock); 869 return error; 870 } 871 872 /* 873 * Periodically flush stashed directory entries to the temporary dir. This 874 * is done to reduce the memory requirements of the directory rebuild, since 875 * directories can contain up to 32GB of directory data. 876 */ 877 STATIC int xrep_dir_flush_stashed(struct xrep_dir * rd)878 xrep_dir_flush_stashed( 879 struct xrep_dir *rd) 880 { 881 int error; 882 883 /* 884 * Entering this function, the scrub context has a reference to the 885 * inode being repaired, the temporary file, and a scrub transaction 886 * that we use during dirent salvaging to avoid livelocking if there 887 * are cycles in the directory structures. We hold ILOCK_EXCL on both 888 * the inode being repaired and the temporary file, though they are 889 * not ijoined to the scrub transaction. 890 * 891 * To constrain kernel memory use, we occasionally write salvaged 892 * dirents from the xfarray and xfblob structures into the temporary 893 * directory in preparation for exchanging the directory structures at 894 * the end. 
Updating the temporary file requires a transaction, so we 895 * commit the scrub transaction and drop the two ILOCKs so that 896 * we can allocate whatever transaction we want. 897 * 898 * We still hold IOLOCK_EXCL on the inode being repaired, which 899 * prevents anyone from accessing the damaged directory data while we 900 * repair it. 901 */ 902 error = xrep_trans_commit(rd->sc); 903 if (error) 904 return error; 905 xchk_iunlock(rd->sc, XFS_ILOCK_EXCL); 906 907 /* 908 * Take the IOLOCK of the temporary file while we modify dirents. This 909 * isn't strictly required because the temporary file is never revealed 910 * to userspace, but we follow the same locking rules. We still hold 911 * sc->ip's IOLOCK. 912 */ 913 error = xrep_tempfile_iolock_polled(rd->sc); 914 if (error) 915 return error; 916 917 /* Write to the tempdir all the updates that we've stashed. */ 918 error = xrep_dir_replay_updates(rd); 919 xrep_tempfile_iounlock(rd->sc); 920 if (error) 921 return error; 922 923 /* 924 * Recreate the salvage transaction and relock the dir we're salvaging. 925 */ 926 error = xchk_trans_alloc(rd->sc, 0); 927 if (error) 928 return error; 929 xchk_ilock(rd->sc, XFS_ILOCK_EXCL); 930 return 0; 931 } 932 933 /* Decide if we've stashed too much dirent data in memory. */ 934 static inline bool xrep_dir_want_flush_stashed(struct xrep_dir * rd)935 xrep_dir_want_flush_stashed( 936 struct xrep_dir *rd) 937 { 938 unsigned long long bytes; 939 940 bytes = xfarray_bytes(rd->dir_entries) + xfblob_bytes(rd->dir_names); 941 return bytes > XREP_DIR_MAX_STASH_BYTES; 942 } 943 944 /* Extract as many directory entries as we can. 
*/ 945 STATIC int xrep_dir_recover(struct xrep_dir * rd)946 xrep_dir_recover( 947 struct xrep_dir *rd) 948 { 949 struct xfs_bmbt_irec got; 950 struct xfs_scrub *sc = rd->sc; 951 struct xfs_da_geometry *geo = sc->mp->m_dir_geo; 952 xfs_fileoff_t offset; 953 xfs_dablk_t dabno; 954 __be32 magic_guess; 955 int nmap; 956 int error; 957 958 xrep_dir_guess_format(rd, &magic_guess); 959 960 /* Iterate each directory data block in the data fork. */ 961 for (offset = 0; 962 offset < geo->leafblk; 963 offset = got.br_startoff + got.br_blockcount) { 964 nmap = 1; 965 error = xfs_bmapi_read(sc->ip, offset, geo->leafblk - offset, 966 &got, &nmap, 0); 967 if (error) 968 return error; 969 if (nmap != 1) 970 return -EFSCORRUPTED; 971 if (!xfs_bmap_is_written_extent(&got)) 972 continue; 973 974 for (dabno = round_up(got.br_startoff, geo->fsbcount); 975 dabno < got.br_startoff + got.br_blockcount; 976 dabno += geo->fsbcount) { 977 if (xchk_should_terminate(rd->sc, &error)) 978 return error; 979 980 error = xrep_dir_recover_dirblock(rd, 981 magic_guess, dabno); 982 if (error) 983 return error; 984 985 /* Flush dirents to constrain memory usage. */ 986 if (xrep_dir_want_flush_stashed(rd)) { 987 error = xrep_dir_flush_stashed(rd); 988 if (error) 989 return error; 990 } 991 } 992 } 993 994 return 0; 995 } 996 997 /* 998 * Find all the directory entries for this inode by scraping them out of the 999 * directory leaf blocks by hand, and flushing them into the temp dir. 1000 */ 1001 STATIC int xrep_dir_find_entries(struct xrep_dir * rd)1002 xrep_dir_find_entries( 1003 struct xrep_dir *rd) 1004 { 1005 struct xfs_inode *dp = rd->sc->ip; 1006 int error; 1007 1008 /* 1009 * Salvage directory entries from the old directory, and write them to 1010 * the temporary directory. 
1011 */ 1012 if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { 1013 error = xrep_dir_recover_sf(rd); 1014 } else { 1015 error = xfs_iread_extents(rd->sc->tp, dp, XFS_DATA_FORK); 1016 if (error) 1017 return error; 1018 1019 error = xrep_dir_recover(rd); 1020 } 1021 if (error) 1022 return error; 1023 1024 return xrep_dir_flush_stashed(rd); 1025 } 1026 1027 /* Scan all files in the filesystem for dirents. */ 1028 STATIC int xrep_dir_salvage_entries(struct xrep_dir * rd)1029 xrep_dir_salvage_entries( 1030 struct xrep_dir *rd) 1031 { 1032 struct xfs_scrub *sc = rd->sc; 1033 int error; 1034 1035 /* 1036 * Drop the ILOCK on this directory so that we can scan for this 1037 * directory's parent. Figure out who is going to be the parent of 1038 * this directory, then retake the ILOCK so that we can salvage 1039 * directory entries. 1040 */ 1041 xchk_iunlock(sc, XFS_ILOCK_EXCL); 1042 error = xrep_dir_find_parent(rd); 1043 xchk_ilock(sc, XFS_ILOCK_EXCL); 1044 if (error) 1045 return error; 1046 1047 /* 1048 * Collect directory entries by parsing raw leaf blocks to salvage 1049 * whatever we can. When we're done, free the staging memory before 1050 * exchanging the directories to reduce memory usage. 1051 */ 1052 error = xrep_dir_find_entries(rd); 1053 if (error) 1054 return error; 1055 1056 /* 1057 * Cancel the repair transaction and drop the ILOCK so that we can 1058 * (later) use the atomic mapping exchange functions to compute the 1059 * correct block reservations and re-lock the inodes. 1060 * 1061 * We still hold IOLOCK_EXCL (aka i_rwsem) which will prevent directory 1062 * modifications, but there's nothing to prevent userspace from reading 1063 * the directory until we're ready for the exchange operation. Reads 1064 * will return -EIO without shutting down the fs, so we're ok with 1065 * that. 1066 * 1067 * The VFS can change dotdot on us, but the findparent scan will keep 1068 * our incore parent inode up to date. See the note on locking issues 1069 * for more details. 
1070 */ 1071 error = xrep_trans_commit(sc); 1072 if (error) 1073 return error; 1074 1075 xchk_iunlock(sc, XFS_ILOCK_EXCL); 1076 return 0; 1077 } 1078 1079 1080 /* 1081 * Examine a parent pointer of a file. If it leads us back to the directory 1082 * that we're rebuilding, create an incore dirent from the parent pointer and 1083 * stash it. 1084 */ 1085 STATIC int xrep_dir_scan_pptr(struct xfs_scrub * sc,struct xfs_inode * ip,unsigned int attr_flags,const unsigned char * name,unsigned int namelen,const void * value,unsigned int valuelen,void * priv)1086 xrep_dir_scan_pptr( 1087 struct xfs_scrub *sc, 1088 struct xfs_inode *ip, 1089 unsigned int attr_flags, 1090 const unsigned char *name, 1091 unsigned int namelen, 1092 const void *value, 1093 unsigned int valuelen, 1094 void *priv) 1095 { 1096 struct xfs_name xname = { 1097 .name = name, 1098 .len = namelen, 1099 .type = xfs_mode_to_ftype(VFS_I(ip)->i_mode), 1100 }; 1101 xfs_ino_t parent_ino; 1102 uint32_t parent_gen; 1103 struct xrep_dir *rd = priv; 1104 int error; 1105 1106 if (!(attr_flags & XFS_ATTR_PARENT)) 1107 return 0; 1108 1109 /* 1110 * Ignore parent pointers that point back to a different dir, list the 1111 * wrong generation number, or are invalid. 1112 */ 1113 error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value, 1114 valuelen, &parent_ino, &parent_gen); 1115 if (error) 1116 return error; 1117 1118 if (parent_ino != sc->ip->i_ino || 1119 parent_gen != VFS_I(sc->ip)->i_generation) 1120 return 0; 1121 1122 mutex_lock(&rd->pscan.lock); 1123 error = xrep_dir_stash_createname(rd, &xname, ip->i_ino); 1124 mutex_unlock(&rd->pscan.lock); 1125 return error; 1126 } 1127 1128 /* 1129 * If this child dirent points to the directory being repaired, remember that 1130 * fact so that we can reset the dotdot entry if necessary. 
1131 */ 1132 STATIC int xrep_dir_scan_dirent(struct xfs_scrub * sc,struct xfs_inode * dp,xfs_dir2_dataptr_t dapos,const struct xfs_name * name,xfs_ino_t ino,void * priv)1133 xrep_dir_scan_dirent( 1134 struct xfs_scrub *sc, 1135 struct xfs_inode *dp, 1136 xfs_dir2_dataptr_t dapos, 1137 const struct xfs_name *name, 1138 xfs_ino_t ino, 1139 void *priv) 1140 { 1141 struct xrep_dir *rd = priv; 1142 1143 /* Dirent doesn't point to this directory. */ 1144 if (ino != rd->sc->ip->i_ino) 1145 return 0; 1146 1147 /* Ignore garbage inum. */ 1148 if (!xfs_verify_dir_ino(rd->sc->mp, ino)) 1149 return 0; 1150 1151 /* No weird looking names. */ 1152 if (name->len >= MAXNAMELEN || name->len <= 0) 1153 return 0; 1154 1155 /* Don't pick up dot or dotdot entries; we only want child dirents. */ 1156 if (xfs_dir2_samename(name, &xfs_name_dotdot) || 1157 xfs_dir2_samename(name, &xfs_name_dot)) 1158 return 0; 1159 1160 trace_xrep_dir_stash_createname(sc->tempip, &xfs_name_dotdot, 1161 dp->i_ino); 1162 1163 xrep_findparent_scan_found(&rd->pscan, dp->i_ino); 1164 return 0; 1165 } 1166 1167 /* 1168 * Decide if we want to look for child dirents or parent pointers in this file. 1169 * Skip the dir being repaired and any files being used to stage repairs. 1170 */ 1171 static inline bool xrep_dir_want_scan(struct xrep_dir * rd,const struct xfs_inode * ip)1172 xrep_dir_want_scan( 1173 struct xrep_dir *rd, 1174 const struct xfs_inode *ip) 1175 { 1176 return ip != rd->sc->ip && !xrep_is_tempfile(ip); 1177 } 1178 1179 /* 1180 * Take ILOCK on a file that we want to scan. 1181 * 1182 * Select ILOCK_EXCL if the file is a directory with an unloaded data bmbt or 1183 * has an unloaded attr bmbt. Otherwise, take ILOCK_SHARED. 
1184 */ 1185 static inline unsigned int xrep_dir_scan_ilock(struct xrep_dir * rd,struct xfs_inode * ip)1186 xrep_dir_scan_ilock( 1187 struct xrep_dir *rd, 1188 struct xfs_inode *ip) 1189 { 1190 uint lock_mode = XFS_ILOCK_SHARED; 1191 1192 /* Need to take the shared ILOCK to advance the iscan cursor. */ 1193 if (!xrep_dir_want_scan(rd, ip)) 1194 goto lock; 1195 1196 if (S_ISDIR(VFS_I(ip)->i_mode) && xfs_need_iread_extents(&ip->i_df)) { 1197 lock_mode = XFS_ILOCK_EXCL; 1198 goto lock; 1199 } 1200 1201 if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af)) 1202 lock_mode = XFS_ILOCK_EXCL; 1203 1204 lock: 1205 xfs_ilock(ip, lock_mode); 1206 return lock_mode; 1207 } 1208 1209 /* 1210 * Scan this file for relevant child dirents or parent pointers that point to 1211 * the directory we're rebuilding. 1212 */ 1213 STATIC int xrep_dir_scan_file(struct xrep_dir * rd,struct xfs_inode * ip)1214 xrep_dir_scan_file( 1215 struct xrep_dir *rd, 1216 struct xfs_inode *ip) 1217 { 1218 unsigned int lock_mode; 1219 int error = 0; 1220 1221 lock_mode = xrep_dir_scan_ilock(rd, ip); 1222 1223 if (!xrep_dir_want_scan(rd, ip)) 1224 goto scan_done; 1225 1226 /* 1227 * If the extended attributes look as though they has been zapped by 1228 * the inode record repair code, we cannot scan for parent pointers. 1229 */ 1230 if (xchk_pptr_looks_zapped(ip)) { 1231 error = -EBUSY; 1232 goto scan_done; 1233 } 1234 1235 error = xchk_xattr_walk(rd->sc, ip, xrep_dir_scan_pptr, NULL, rd); 1236 if (error) 1237 goto scan_done; 1238 1239 if (S_ISDIR(VFS_I(ip)->i_mode)) { 1240 /* 1241 * If the directory looks as though it has been zapped by the 1242 * inode record repair code, we cannot scan for child dirents. 
1243 */ 1244 if (xchk_dir_looks_zapped(ip)) { 1245 error = -EBUSY; 1246 goto scan_done; 1247 } 1248 1249 error = xchk_dir_walk(rd->sc, ip, xrep_dir_scan_dirent, rd); 1250 if (error) 1251 goto scan_done; 1252 } 1253 1254 scan_done: 1255 xchk_iscan_mark_visited(&rd->pscan.iscan, ip); 1256 xfs_iunlock(ip, lock_mode); 1257 return error; 1258 } 1259 1260 /* 1261 * Scan all files in the filesystem for parent pointers that we can turn into 1262 * replacement dirents, and a dirent that we can use to set the dotdot pointer. 1263 */ 1264 STATIC int xrep_dir_scan_dirtree(struct xrep_dir * rd)1265 xrep_dir_scan_dirtree( 1266 struct xrep_dir *rd) 1267 { 1268 struct xfs_scrub *sc = rd->sc; 1269 struct xfs_inode *ip; 1270 int error; 1271 1272 /* Roots of directory trees are their own parents. */ 1273 if (sc->ip == sc->mp->m_rootip) 1274 xrep_findparent_scan_found(&rd->pscan, sc->ip->i_ino); 1275 1276 /* 1277 * Filesystem scans are time consuming. Drop the directory ILOCK and 1278 * all other resources for the duration of the scan and hope for the 1279 * best. The live update hooks will keep our scan information up to 1280 * date even though we've dropped the locks. 1281 */ 1282 xchk_trans_cancel(sc); 1283 if (sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) 1284 xchk_iunlock(sc, sc->ilock_flags & (XFS_ILOCK_SHARED | 1285 XFS_ILOCK_EXCL)); 1286 error = xchk_trans_alloc_empty(sc); 1287 if (error) 1288 return error; 1289 1290 while ((error = xchk_iscan_iter(&rd->pscan.iscan, &ip)) == 1) { 1291 bool flush; 1292 1293 error = xrep_dir_scan_file(rd, ip); 1294 xchk_irele(sc, ip); 1295 if (error) 1296 break; 1297 1298 /* Flush stashed dirent updates to constrain memory usage. 
*/ 1299 mutex_lock(&rd->pscan.lock); 1300 flush = xrep_dir_want_flush_stashed(rd); 1301 mutex_unlock(&rd->pscan.lock); 1302 if (flush) { 1303 xchk_trans_cancel(sc); 1304 1305 error = xrep_tempfile_iolock_polled(sc); 1306 if (error) 1307 break; 1308 1309 error = xrep_dir_replay_updates(rd); 1310 xrep_tempfile_iounlock(sc); 1311 if (error) 1312 break; 1313 1314 error = xchk_trans_alloc_empty(sc); 1315 if (error) 1316 break; 1317 } 1318 1319 if (xchk_should_terminate(sc, &error)) 1320 break; 1321 } 1322 xchk_iscan_iter_finish(&rd->pscan.iscan); 1323 if (error) { 1324 /* 1325 * If we couldn't grab an inode that was busy with a state 1326 * change, change the error code so that we exit to userspace 1327 * as quickly as possible. 1328 */ 1329 if (error == -EBUSY) 1330 return -ECANCELED; 1331 return error; 1332 } 1333 1334 /* 1335 * Cancel the empty transaction so that we can (later) use the atomic 1336 * file mapping exchange functions to lock files and commit the new 1337 * directory. 1338 */ 1339 xchk_trans_cancel(rd->sc); 1340 return 0; 1341 } 1342 1343 /* 1344 * Capture dirent updates being made by other threads which are relevant to the 1345 * directory being repaired. 1346 */ 1347 STATIC int xrep_dir_live_update(struct notifier_block * nb,unsigned long action,void * data)1348 xrep_dir_live_update( 1349 struct notifier_block *nb, 1350 unsigned long action, 1351 void *data) 1352 { 1353 struct xfs_dir_update_params *p = data; 1354 struct xrep_dir *rd; 1355 struct xfs_scrub *sc; 1356 int error = 0; 1357 1358 rd = container_of(nb, struct xrep_dir, pscan.dhook.dirent_hook.nb); 1359 sc = rd->sc; 1360 1361 /* 1362 * This thread updated a child dirent in the directory that we're 1363 * rebuilding. Stash the update for replay against the temporary 1364 * directory. 
1365 */ 1366 if (p->dp->i_ino == sc->ip->i_ino && 1367 xchk_iscan_want_live_update(&rd->pscan.iscan, p->ip->i_ino)) { 1368 mutex_lock(&rd->pscan.lock); 1369 if (p->delta > 0) 1370 error = xrep_dir_stash_createname(rd, p->name, 1371 p->ip->i_ino); 1372 else 1373 error = xrep_dir_stash_removename(rd, p->name, 1374 p->ip->i_ino); 1375 mutex_unlock(&rd->pscan.lock); 1376 if (error) 1377 goto out_abort; 1378 } 1379 1380 /* 1381 * This thread updated another directory's child dirent that points to 1382 * the directory that we're rebuilding, so remember the new dotdot 1383 * target. 1384 */ 1385 if (p->ip->i_ino == sc->ip->i_ino && 1386 xchk_iscan_want_live_update(&rd->pscan.iscan, p->dp->i_ino)) { 1387 if (p->delta > 0) { 1388 trace_xrep_dir_stash_createname(sc->tempip, 1389 &xfs_name_dotdot, 1390 p->dp->i_ino); 1391 1392 xrep_findparent_scan_found(&rd->pscan, p->dp->i_ino); 1393 } else { 1394 trace_xrep_dir_stash_removename(sc->tempip, 1395 &xfs_name_dotdot, 1396 rd->pscan.parent_ino); 1397 1398 xrep_findparent_scan_found(&rd->pscan, NULLFSINO); 1399 } 1400 } 1401 1402 return NOTIFY_DONE; 1403 out_abort: 1404 xchk_iscan_abort(&rd->pscan.iscan); 1405 return NOTIFY_DONE; 1406 } 1407 1408 /* 1409 * Free all the directory blocks and reset the data fork. The caller must 1410 * join the inode to the transaction. This function returns with the inode 1411 * joined to a clean scrub transaction. 1412 */ 1413 STATIC int xrep_dir_reset_fork(struct xrep_dir * rd,xfs_ino_t parent_ino)1414 xrep_dir_reset_fork( 1415 struct xrep_dir *rd, 1416 xfs_ino_t parent_ino) 1417 { 1418 struct xfs_scrub *sc = rd->sc; 1419 struct xfs_ifork *ifp = xfs_ifork_ptr(sc->tempip, XFS_DATA_FORK); 1420 int error; 1421 1422 /* Unmap all the directory buffers. 
*/ 1423 if (xfs_ifork_has_extents(ifp)) { 1424 error = xrep_reap_ifork(sc, sc->tempip, XFS_DATA_FORK); 1425 if (error) 1426 return error; 1427 } 1428 1429 trace_xrep_dir_reset_fork(sc->tempip, parent_ino); 1430 1431 /* Reset the data fork to an empty data fork. */ 1432 xfs_idestroy_fork(ifp); 1433 ifp->if_bytes = 0; 1434 sc->tempip->i_disk_size = 0; 1435 1436 /* Reinitialize the short form directory. */ 1437 xrep_dir_init_args(rd, sc->tempip, NULL); 1438 return xfs_dir2_sf_create(&rd->args, parent_ino); 1439 } 1440 1441 /* 1442 * Prepare both inodes' directory forks for exchanging mappings. Promote the 1443 * tempfile from short format to leaf format, and if the file being repaired 1444 * has a short format data fork, turn it into an empty extent list. 1445 */ 1446 STATIC int xrep_dir_swap_prep(struct xfs_scrub * sc,bool temp_local,bool ip_local)1447 xrep_dir_swap_prep( 1448 struct xfs_scrub *sc, 1449 bool temp_local, 1450 bool ip_local) 1451 { 1452 int error; 1453 1454 /* 1455 * If the tempfile's directory is in shortform format, convert that to 1456 * a single leaf extent so that we can use the atomic mapping exchange. 1457 */ 1458 if (temp_local) { 1459 struct xfs_da_args args = { 1460 .dp = sc->tempip, 1461 .geo = sc->mp->m_dir_geo, 1462 .whichfork = XFS_DATA_FORK, 1463 .trans = sc->tp, 1464 .total = 1, 1465 .owner = sc->ip->i_ino, 1466 }; 1467 1468 error = xfs_dir2_sf_to_block(&args); 1469 if (error) 1470 return error; 1471 1472 /* 1473 * Roll the deferred log items to get us back to a clean 1474 * transaction. 1475 */ 1476 error = xfs_defer_finish(&sc->tp); 1477 if (error) 1478 return error; 1479 } 1480 1481 /* 1482 * If the file being repaired had a shortform data fork, convert that 1483 * to an empty extent list in preparation for the atomic mapping 1484 * exchange. 
1485 */ 1486 if (ip_local) { 1487 struct xfs_ifork *ifp; 1488 1489 ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); 1490 xfs_idestroy_fork(ifp); 1491 ifp->if_format = XFS_DINODE_FMT_EXTENTS; 1492 ifp->if_nextents = 0; 1493 ifp->if_bytes = 0; 1494 ifp->if_data = NULL; 1495 ifp->if_height = 0; 1496 1497 xfs_trans_log_inode(sc->tp, sc->ip, 1498 XFS_ILOG_CORE | XFS_ILOG_DDATA); 1499 } 1500 1501 return 0; 1502 } 1503 1504 /* 1505 * Replace the inode number of a directory entry. 1506 */ 1507 static int xrep_dir_replace(struct xrep_dir * rd,struct xfs_inode * dp,const struct xfs_name * name,xfs_ino_t inum,xfs_extlen_t total)1508 xrep_dir_replace( 1509 struct xrep_dir *rd, 1510 struct xfs_inode *dp, 1511 const struct xfs_name *name, 1512 xfs_ino_t inum, 1513 xfs_extlen_t total) 1514 { 1515 struct xfs_scrub *sc = rd->sc; 1516 int error; 1517 1518 ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); 1519 1520 error = xfs_dir_ino_validate(sc->mp, inum); 1521 if (error) 1522 return error; 1523 1524 xrep_dir_init_args(rd, dp, name); 1525 rd->args.inumber = inum; 1526 rd->args.total = total; 1527 return xfs_dir_replace_args(&rd->args); 1528 } 1529 1530 /* 1531 * Reset the link count of this directory and adjust the unlinked list pointers 1532 * as needed. 1533 */ 1534 STATIC int xrep_dir_set_nlink(struct xrep_dir * rd)1535 xrep_dir_set_nlink( 1536 struct xrep_dir *rd) 1537 { 1538 struct xfs_scrub *sc = rd->sc; 1539 struct xfs_inode *dp = sc->ip; 1540 struct xfs_perag *pag; 1541 unsigned int new_nlink = min_t(unsigned long long, 1542 rd->subdirs + 2, 1543 XFS_NLINK_PINNED); 1544 int error; 1545 1546 /* 1547 * The directory is not on the incore unlinked list, which means that 1548 * it needs to be reachable via the directory tree. Update the nlink 1549 * with our observed link count. If the directory has no parent, it 1550 * will be moved to the orphanage. 
1551 */ 1552 if (!xfs_inode_on_unlinked_list(dp)) 1553 goto reset_nlink; 1554 1555 /* 1556 * The directory is on the unlinked list and we did not find any 1557 * dirents. Set the link count to zero and let the directory 1558 * inactivate when the last reference drops. 1559 */ 1560 if (rd->dirents == 0) { 1561 rd->needs_adoption = false; 1562 new_nlink = 0; 1563 goto reset_nlink; 1564 } 1565 1566 /* 1567 * The directory is on the unlinked list and we found dirents. This 1568 * directory needs to be reachable via the directory tree. Remove the 1569 * dir from the unlinked list and update nlink with the observed link 1570 * count. If the directory has no parent, it will be moved to the 1571 * orphanage. 1572 */ 1573 pag = xfs_perag_get(sc->mp, XFS_INO_TO_AGNO(sc->mp, dp->i_ino)); 1574 if (!pag) { 1575 ASSERT(0); 1576 return -EFSCORRUPTED; 1577 } 1578 1579 error = xfs_iunlink_remove(sc->tp, pag, dp); 1580 xfs_perag_put(pag); 1581 if (error) 1582 return error; 1583 1584 reset_nlink: 1585 if (VFS_I(dp)->i_nlink != new_nlink) 1586 set_nlink(VFS_I(dp), new_nlink); 1587 return 0; 1588 } 1589 1590 /* 1591 * Finish replaying stashed dirent updates, allocate a transaction for 1592 * exchanging data fork mappings, and take the ILOCKs of both directories 1593 * before we commit the new directory structure. 1594 */ 1595 STATIC int xrep_dir_finalize_tempdir(struct xrep_dir * rd)1596 xrep_dir_finalize_tempdir( 1597 struct xrep_dir *rd) 1598 { 1599 struct xfs_scrub *sc = rd->sc; 1600 int error; 1601 1602 if (!xfs_has_parent(sc->mp)) 1603 return xrep_tempexch_trans_alloc(sc, XFS_DATA_FORK, &rd->tx); 1604 1605 /* 1606 * Repair relies on the ILOCK to quiesce all possible dirent updates. 1607 * Replay all queued dirent updates into the tempdir before exchanging 1608 * the contents, even if that means dropping the ILOCKs and the 1609 * transaction. 
1610 */ 1611 do { 1612 error = xrep_dir_replay_updates(rd); 1613 if (error) 1614 return error; 1615 1616 error = xrep_tempexch_trans_alloc(sc, XFS_DATA_FORK, &rd->tx); 1617 if (error) 1618 return error; 1619 1620 if (xfarray_length(rd->dir_entries) == 0) 1621 break; 1622 1623 xchk_trans_cancel(sc); 1624 xrep_tempfile_iunlock_both(sc); 1625 } while (!xchk_should_terminate(sc, &error)); 1626 return error; 1627 } 1628 1629 /* Exchange the temporary directory's data fork with the one being repaired. */ 1630 STATIC int xrep_dir_swap(struct xrep_dir * rd)1631 xrep_dir_swap( 1632 struct xrep_dir *rd) 1633 { 1634 struct xfs_scrub *sc = rd->sc; 1635 bool ip_local, temp_local; 1636 int error = 0; 1637 1638 /* 1639 * If we never found the parent for this directory, temporarily assign 1640 * the root dir as the parent; we'll move this to the orphanage after 1641 * exchanging the dir contents. We hold the ILOCK of the dir being 1642 * repaired, so we're not worried about racy updates of dotdot. 1643 */ 1644 ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL); 1645 if (rd->pscan.parent_ino == NULLFSINO) { 1646 rd->needs_adoption = true; 1647 rd->pscan.parent_ino = rd->sc->mp->m_sb.sb_rootino; 1648 } 1649 1650 /* 1651 * Reset the temporary directory's '..' entry to point to the parent 1652 * that we found. The temporary directory was created with the root 1653 * directory as the parent, so we can skip this if repairing a 1654 * subdirectory of the root. 1655 * 1656 * It's also possible that this replacement could also expand a sf 1657 * tempdir into block format. 1658 */ 1659 if (rd->pscan.parent_ino != sc->mp->m_rootip->i_ino) { 1660 error = xrep_dir_replace(rd, rd->sc->tempip, &xfs_name_dotdot, 1661 rd->pscan.parent_ino, rd->tx.req.resblks); 1662 if (error) 1663 return error; 1664 } 1665 1666 /* 1667 * Changing the dot and dotdot entries could have changed the shape of 1668 * the directory, so we recompute these. 
1669 */ 1670 ip_local = sc->ip->i_df.if_format == XFS_DINODE_FMT_LOCAL; 1671 temp_local = sc->tempip->i_df.if_format == XFS_DINODE_FMT_LOCAL; 1672 1673 /* 1674 * If the both files have a local format data fork and the rebuilt 1675 * directory data would fit in the repaired file's data fork, copy 1676 * the contents from the tempfile and update the directory link count. 1677 * We're done now. 1678 */ 1679 if (ip_local && temp_local && 1680 sc->tempip->i_disk_size <= xfs_inode_data_fork_size(sc->ip)) { 1681 xrep_tempfile_copyout_local(sc, XFS_DATA_FORK); 1682 return xrep_dir_set_nlink(rd); 1683 } 1684 1685 /* 1686 * Clean the transaction before we start working on exchanging 1687 * directory contents. 1688 */ 1689 error = xrep_tempfile_roll_trans(rd->sc); 1690 if (error) 1691 return error; 1692 1693 /* Otherwise, make sure both data forks are in block-mapping mode. */ 1694 error = xrep_dir_swap_prep(sc, temp_local, ip_local); 1695 if (error) 1696 return error; 1697 1698 /* 1699 * Set nlink of the directory in the same transaction sequence that 1700 * (atomically) commits the new directory data. 1701 */ 1702 error = xrep_dir_set_nlink(rd); 1703 if (error) 1704 return error; 1705 1706 return xrep_tempexch_contents(sc, &rd->tx); 1707 } 1708 1709 /* 1710 * Exchange the new directory contents (which we created in the tempfile) with 1711 * the directory being repaired. 1712 */ 1713 STATIC int xrep_dir_rebuild_tree(struct xrep_dir * rd)1714 xrep_dir_rebuild_tree( 1715 struct xrep_dir *rd) 1716 { 1717 struct xfs_scrub *sc = rd->sc; 1718 int error; 1719 1720 trace_xrep_dir_rebuild_tree(sc->ip, rd->pscan.parent_ino); 1721 1722 /* 1723 * Take the IOLOCK on the temporary file so that we can run dir 1724 * operations with the same locks held as we would for a normal file. 1725 * We still hold sc->ip's IOLOCK. 
1726 */ 1727 error = xrep_tempfile_iolock_polled(rd->sc); 1728 if (error) 1729 return error; 1730 1731 /* 1732 * Allocate transaction, lock inodes, and make sure that we've replayed 1733 * all the stashed dirent updates to the tempdir. After this point, 1734 * we're ready to exchange data fork mappings. 1735 */ 1736 error = xrep_dir_finalize_tempdir(rd); 1737 if (error) 1738 return error; 1739 1740 if (xchk_iscan_aborted(&rd->pscan.iscan)) 1741 return -ECANCELED; 1742 1743 /* 1744 * Exchange the tempdir's data fork with the file being repaired. This 1745 * recreates the transaction and re-takes the ILOCK in the scrub 1746 * context. 1747 */ 1748 error = xrep_dir_swap(rd); 1749 if (error) 1750 return error; 1751 1752 /* 1753 * Release the old directory blocks and reset the data fork of the temp 1754 * directory to an empty shortform directory because inactivation does 1755 * nothing for directories. 1756 */ 1757 error = xrep_dir_reset_fork(rd, sc->mp->m_rootip->i_ino); 1758 if (error) 1759 return error; 1760 1761 /* 1762 * Roll to get a transaction without any inodes joined to it. Then we 1763 * can drop the tempfile's ILOCK and IOLOCK before doing more work on 1764 * the scrub target directory. 1765 */ 1766 error = xfs_trans_roll(&sc->tp); 1767 if (error) 1768 return error; 1769 1770 xrep_tempfile_iunlock(sc); 1771 xrep_tempfile_iounlock(sc); 1772 return 0; 1773 } 1774 1775 /* Set up the filesystem scan so we can regenerate directory entries. */ 1776 STATIC int xrep_dir_setup_scan(struct xrep_dir * rd)1777 xrep_dir_setup_scan( 1778 struct xrep_dir *rd) 1779 { 1780 struct xfs_scrub *sc = rd->sc; 1781 char *descr; 1782 int error; 1783 1784 /* Set up some staging memory for salvaging dirents. 
*/ 1785 descr = xchk_xfile_ino_descr(sc, "directory entries"); 1786 error = xfarray_create(descr, 0, sizeof(struct xrep_dirent), 1787 &rd->dir_entries); 1788 kfree(descr); 1789 if (error) 1790 return error; 1791 1792 descr = xchk_xfile_ino_descr(sc, "directory entry names"); 1793 error = xfblob_create(descr, &rd->dir_names); 1794 kfree(descr); 1795 if (error) 1796 goto out_xfarray; 1797 1798 if (xfs_has_parent(sc->mp)) 1799 error = __xrep_findparent_scan_start(sc, &rd->pscan, 1800 xrep_dir_live_update); 1801 else 1802 error = xrep_findparent_scan_start(sc, &rd->pscan); 1803 if (error) 1804 goto out_xfblob; 1805 1806 return 0; 1807 1808 out_xfblob: 1809 xfblob_destroy(rd->dir_names); 1810 rd->dir_names = NULL; 1811 out_xfarray: 1812 xfarray_destroy(rd->dir_entries); 1813 rd->dir_entries = NULL; 1814 return error; 1815 } 1816 1817 /* 1818 * Move the current file to the orphanage. 1819 * 1820 * Caller must hold IOLOCK_EXCL on @sc->ip, and no other inode locks. Upon 1821 * successful return, the scrub transaction will have enough extra reservation 1822 * to make the move; it will hold IOLOCK_EXCL and ILOCK_EXCL of @sc->ip and the 1823 * orphanage; and both inodes will be ijoined. 1824 */ 1825 STATIC int xrep_dir_move_to_orphanage(struct xrep_dir * rd)1826 xrep_dir_move_to_orphanage( 1827 struct xrep_dir *rd) 1828 { 1829 struct xfs_scrub *sc = rd->sc; 1830 xfs_ino_t orig_parent, new_parent; 1831 int error; 1832 1833 /* 1834 * We are about to drop the ILOCK on sc->ip to lock the orphanage and 1835 * prepare for the adoption. Therefore, look up the old dotdot entry 1836 * for sc->ip so that we can compare it after we re-lock sc->ip. 1837 */ 1838 error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &orig_parent); 1839 if (error) 1840 return error; 1841 1842 /* 1843 * Drop the ILOCK on the scrub target and commit the transaction. 1844 * Adoption computes its own resource requirements and gathers the 1845 * necessary components. 
1846 */ 1847 error = xrep_trans_commit(sc); 1848 if (error) 1849 return error; 1850 xchk_iunlock(sc, XFS_ILOCK_EXCL); 1851 1852 /* If we can take the orphanage's iolock then we're ready to move. */ 1853 if (!xrep_orphanage_ilock_nowait(sc, XFS_IOLOCK_EXCL)) { 1854 xchk_iunlock(sc, sc->ilock_flags); 1855 error = xrep_orphanage_iolock_two(sc); 1856 if (error) 1857 return error; 1858 } 1859 1860 /* Grab transaction and ILOCK the two files. */ 1861 error = xrep_adoption_trans_alloc(sc, &rd->adoption); 1862 if (error) 1863 return error; 1864 1865 error = xrep_adoption_compute_name(&rd->adoption, &rd->xname); 1866 if (error) 1867 return error; 1868 1869 /* 1870 * Now that we've reacquired the ILOCK on sc->ip, look up the dotdot 1871 * entry again. If the parent changed or the child was unlinked while 1872 * the child directory was unlocked, we don't need to move the child to 1873 * the orphanage after all. 1874 */ 1875 error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &new_parent); 1876 if (error) 1877 return error; 1878 1879 /* 1880 * Attach to the orphanage if we still have a linked directory and it 1881 * hasn't been moved. 1882 */ 1883 if (orig_parent == new_parent && VFS_I(sc->ip)->i_nlink > 0) { 1884 error = xrep_adoption_move(&rd->adoption); 1885 if (error) 1886 return error; 1887 } 1888 1889 /* 1890 * Launder the scrub transaction so we can drop the orphanage ILOCK 1891 * and IOLOCK. Return holding the scrub target's ILOCK and IOLOCK. 1892 */ 1893 error = xrep_adoption_trans_roll(&rd->adoption); 1894 if (error) 1895 return error; 1896 1897 xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL); 1898 xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL); 1899 return 0; 1900 } 1901 1902 /* 1903 * Repair the directory metadata. 1904 * 1905 * XXX: Directory entry buffers can be multiple fsblocks in size. The buffer 1906 * cache in XFS can't handle aliased multiblock buffers, so this might 1907 * misbehave if the directory blocks are crosslinked with other filesystem 1908 * metadata. 
1909 * 1910 * XXX: Is it necessary to check the dcache for this directory to make sure 1911 * that we always recreate every cached entry? 1912 */ 1913 int xrep_directory(struct xfs_scrub * sc)1914 xrep_directory( 1915 struct xfs_scrub *sc) 1916 { 1917 struct xrep_dir *rd = sc->buf; 1918 int error; 1919 1920 /* The rmapbt is required to reap the old data fork. */ 1921 if (!xfs_has_rmapbt(sc->mp)) 1922 return -EOPNOTSUPP; 1923 /* We require atomic file exchange range to rebuild anything. */ 1924 if (!xfs_has_exchange_range(sc->mp)) 1925 return -EOPNOTSUPP; 1926 1927 error = xrep_dir_setup_scan(rd); 1928 if (error) 1929 return error; 1930 1931 if (xfs_has_parent(sc->mp)) 1932 error = xrep_dir_scan_dirtree(rd); 1933 else 1934 error = xrep_dir_salvage_entries(rd); 1935 if (error) 1936 goto out_teardown; 1937 1938 /* Last chance to abort before we start committing fixes. */ 1939 if (xchk_should_terminate(sc, &error)) 1940 goto out_teardown; 1941 1942 error = xrep_dir_rebuild_tree(rd); 1943 if (error) 1944 goto out_teardown; 1945 1946 if (rd->needs_adoption) { 1947 if (!xrep_orphanage_can_adopt(rd->sc)) 1948 error = -EFSCORRUPTED; 1949 else 1950 error = xrep_dir_move_to_orphanage(rd); 1951 if (error) 1952 goto out_teardown; 1953 } 1954 1955 out_teardown: 1956 xrep_dir_teardown(sc); 1957 return error; 1958 } 1959