1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
4 * Author: Darrick J. Wong <djwong@kernel.org>
5 */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_defer.h"
13 #include "xfs_btree.h"
14 #include "xfs_btree_staging.h"
15 #include "xfs_bit.h"
16 #include "xfs_log_format.h"
17 #include "xfs_trans.h"
18 #include "xfs_sb.h"
19 #include "xfs_alloc.h"
20 #include "xfs_alloc_btree.h"
21 #include "xfs_rmap.h"
22 #include "xfs_rmap_btree.h"
23 #include "xfs_inode.h"
24 #include "xfs_refcount.h"
25 #include "xfs_extent_busy.h"
26 #include "xfs_health.h"
27 #include "xfs_bmap.h"
28 #include "xfs_ialloc.h"
29 #include "xfs_ag.h"
30 #include "scrub/xfs_scrub.h"
31 #include "scrub/scrub.h"
32 #include "scrub/common.h"
33 #include "scrub/btree.h"
34 #include "scrub/trace.h"
35 #include "scrub/repair.h"
36 #include "scrub/bitmap.h"
37 #include "scrub/agb_bitmap.h"
38 #include "scrub/xfile.h"
39 #include "scrub/xfarray.h"
40 #include "scrub/newbt.h"
41 #include "scrub/reap.h"
42
43 /*
44 * Free Space Btree Repair
45 * =======================
46 *
47 * The reverse mappings are supposed to record all space usage for the entire
48 * AG. Therefore, we can recreate the free extent records in an AG by looking
49 * for gaps in the physical extents recorded in the rmapbt. These records are
50 * staged in @free_records. Identifying the gaps is more difficult on a
51 * reflink filesystem because rmap records are allowed to overlap.
52 *
53 * Because the final step of building a new index is to free the space used by
54 * the old index, repair needs to find that space. Unfortunately, all
55 * structures that live in the free space (bnobt, cntbt, rmapbt, agfl) share
56 * the same rmapbt owner code (OWN_AG), so this is not straightforward.
57 *
58 * The scan of the reverse mapping information records the space used by OWN_AG
59 * in @old_allocbt_blocks, which (at this stage) is somewhat misnamed. While
60 * walking the rmapbt records, we create a second bitmap @not_allocbt_blocks to
61 * record all visited rmap btree blocks and all blocks owned by the AGFL.
62 *
63 * After that is where the definitions of old_allocbt_blocks shifts. This
64 * expression identifies possible former bnobt/cntbt blocks:
65 *
66 * (OWN_AG blocks) & ~(rmapbt blocks | agfl blocks);
67 *
68 * Substituting from above definitions, that becomes:
69 *
70 * old_allocbt_blocks & ~not_allocbt_blocks
71 *
72 * The OWN_AG bitmap itself isn't needed after this point, so what we really do
73 * instead is:
74 *
75 * old_allocbt_blocks &= ~not_allocbt_blocks;
76 *
77 * After this point, @old_allocbt_blocks is a bitmap of alleged former
78 * bnobt/cntbt blocks. The xagb_bitmap_disunion operation modifies its first
79 * parameter in place to avoid copying records around.
80 *
81 * Next, some of the space described by @free_records are diverted to the newbt
82 * reservation and used to format new btree blocks. The remaining records are
83 * written to the new btree indices. We reconstruct both bnobt and cntbt at
84 * the same time since we've already done all the work.
85 *
86 * We use the prefix 'xrep_abt' here because we regenerate both free space
87 * allocation btrees at the same time.
88 */
89
90 struct xrep_abt {
91 /* Blocks owned by the rmapbt or the agfl. */
92 struct xagb_bitmap not_allocbt_blocks;
93
94 /* All OWN_AG blocks. */
95 struct xagb_bitmap old_allocbt_blocks;
96
97 /*
98 * New bnobt information. All btree block reservations are added to
99 * the reservation list in new_bnobt.
100 */
101 struct xrep_newbt new_bnobt;
102
103 /* new cntbt information */
104 struct xrep_newbt new_cntbt;
105
106 /* Free space extents. */
107 struct xfarray *free_records;
108
109 struct xfs_scrub *sc;
110
111 /* Number of non-null records in @free_records. */
112 uint64_t nr_real_records;
113
114 /* get_records()'s position in the free space record array. */
115 xfarray_idx_t array_cur;
116
117 /*
118 * Next block we anticipate seeing in the rmap records. If the next
119 * rmap record is greater than next_agbno, we have found unused space.
120 */
121 xfs_agblock_t next_agbno;
122
123 /* Number of free blocks in this AG. */
124 xfs_agblock_t nr_blocks;
125
126 /* Longest free extent we found in the AG. */
127 xfs_agblock_t longest;
128 };
129
130 /* Set up to repair AG free space btrees. */
131 int
xrep_setup_ag_allocbt(struct xfs_scrub * sc)132 xrep_setup_ag_allocbt(
133 struct xfs_scrub *sc)
134 {
135 unsigned int busy_gen;
136
137 /*
138 * Make sure the busy extent list is clear because we can't put extents
139 * on there twice.
140 */
141 busy_gen = READ_ONCE(sc->sa.pag->pagb_gen);
142 if (xfs_extent_busy_list_empty(sc->sa.pag))
143 return 0;
144
145 return xfs_extent_busy_flush(sc->tp, sc->sa.pag, busy_gen, 0);
146 }
147
148 /* Check for any obvious conflicts in the free extent. */
149 STATIC int
xrep_abt_check_free_ext(struct xfs_scrub * sc,const struct xfs_alloc_rec_incore * rec)150 xrep_abt_check_free_ext(
151 struct xfs_scrub *sc,
152 const struct xfs_alloc_rec_incore *rec)
153 {
154 enum xbtree_recpacking outcome;
155 int error;
156
157 if (xfs_alloc_check_irec(sc->sa.pag, rec) != NULL)
158 return -EFSCORRUPTED;
159
160 /* Must not be an inode chunk. */
161 error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur,
162 rec->ar_startblock, rec->ar_blockcount, &outcome);
163 if (error)
164 return error;
165 if (outcome != XBTREE_RECPACKING_EMPTY)
166 return -EFSCORRUPTED;
167
168 /* Must not be shared or CoW staging. */
169 if (sc->sa.refc_cur) {
170 error = xfs_refcount_has_records(sc->sa.refc_cur,
171 XFS_REFC_DOMAIN_SHARED, rec->ar_startblock,
172 rec->ar_blockcount, &outcome);
173 if (error)
174 return error;
175 if (outcome != XBTREE_RECPACKING_EMPTY)
176 return -EFSCORRUPTED;
177
178 error = xfs_refcount_has_records(sc->sa.refc_cur,
179 XFS_REFC_DOMAIN_COW, rec->ar_startblock,
180 rec->ar_blockcount, &outcome);
181 if (error)
182 return error;
183 if (outcome != XBTREE_RECPACKING_EMPTY)
184 return -EFSCORRUPTED;
185 }
186
187 return 0;
188 }
189
190 /*
191 * Stash a free space record for all the space since the last bno we found
192 * all the way up to @end.
193 */
194 static int
xrep_abt_stash(struct xrep_abt * ra,xfs_agblock_t end)195 xrep_abt_stash(
196 struct xrep_abt *ra,
197 xfs_agblock_t end)
198 {
199 struct xfs_alloc_rec_incore arec = {
200 .ar_startblock = ra->next_agbno,
201 .ar_blockcount = end - ra->next_agbno,
202 };
203 struct xfs_scrub *sc = ra->sc;
204 int error = 0;
205
206 if (xchk_should_terminate(sc, &error))
207 return error;
208
209 error = xrep_abt_check_free_ext(ra->sc, &arec);
210 if (error)
211 return error;
212
213 trace_xrep_abt_found(sc->mp, sc->sa.pag->pag_agno, &arec);
214
215 error = xfarray_append(ra->free_records, &arec);
216 if (error)
217 return error;
218
219 ra->nr_blocks += arec.ar_blockcount;
220 return 0;
221 }
222
223 /* Record extents that aren't in use from gaps in the rmap records. */
224 STATIC int
xrep_abt_walk_rmap(struct xfs_btree_cur * cur,const struct xfs_rmap_irec * rec,void * priv)225 xrep_abt_walk_rmap(
226 struct xfs_btree_cur *cur,
227 const struct xfs_rmap_irec *rec,
228 void *priv)
229 {
230 struct xrep_abt *ra = priv;
231 int error;
232
233 /* Record all the OWN_AG blocks... */
234 if (rec->rm_owner == XFS_RMAP_OWN_AG) {
235 error = xagb_bitmap_set(&ra->old_allocbt_blocks,
236 rec->rm_startblock, rec->rm_blockcount);
237 if (error)
238 return error;
239 }
240
241 /* ...and all the rmapbt blocks... */
242 error = xagb_bitmap_set_btcur_path(&ra->not_allocbt_blocks, cur);
243 if (error)
244 return error;
245
246 /* ...and all the free space. */
247 if (rec->rm_startblock > ra->next_agbno) {
248 error = xrep_abt_stash(ra, rec->rm_startblock);
249 if (error)
250 return error;
251 }
252
253 /*
254 * rmap records can overlap on reflink filesystems, so project
255 * next_agbno as far out into the AG space as we currently know about.
256 */
257 ra->next_agbno = max_t(xfs_agblock_t, ra->next_agbno,
258 rec->rm_startblock + rec->rm_blockcount);
259 return 0;
260 }
261
262 /* Collect an AGFL block for the not-to-release list. */
263 static int
xrep_abt_walk_agfl(struct xfs_mount * mp,xfs_agblock_t agbno,void * priv)264 xrep_abt_walk_agfl(
265 struct xfs_mount *mp,
266 xfs_agblock_t agbno,
267 void *priv)
268 {
269 struct xrep_abt *ra = priv;
270
271 return xagb_bitmap_set(&ra->not_allocbt_blocks, agbno, 1);
272 }
273
274 /*
275 * Compare two free space extents by block number. We want to sort in order of
276 * increasing block number.
277 */
278 static int
xrep_bnobt_extent_cmp(const void * a,const void * b)279 xrep_bnobt_extent_cmp(
280 const void *a,
281 const void *b)
282 {
283 const struct xfs_alloc_rec_incore *ap = a;
284 const struct xfs_alloc_rec_incore *bp = b;
285
286 if (ap->ar_startblock > bp->ar_startblock)
287 return 1;
288 else if (ap->ar_startblock < bp->ar_startblock)
289 return -1;
290 return 0;
291 }
292
293 /*
294 * Re-sort the free extents by block number so that we can put the records into
295 * the bnobt in the correct order. Make sure the records do not overlap in
296 * physical space.
297 */
298 STATIC int
xrep_bnobt_sort_records(struct xrep_abt * ra)299 xrep_bnobt_sort_records(
300 struct xrep_abt *ra)
301 {
302 struct xfs_alloc_rec_incore arec;
303 xfarray_idx_t cur = XFARRAY_CURSOR_INIT;
304 xfs_agblock_t next_agbno = 0;
305 int error;
306
307 error = xfarray_sort(ra->free_records, xrep_bnobt_extent_cmp, 0);
308 if (error)
309 return error;
310
311 while ((error = xfarray_iter(ra->free_records, &cur, &arec)) == 1) {
312 if (arec.ar_startblock < next_agbno)
313 return -EFSCORRUPTED;
314
315 next_agbno = arec.ar_startblock + arec.ar_blockcount;
316 }
317
318 return error;
319 }
320
321 /*
322 * Compare two free space extents by length and then block number. We want
323 * to sort first in order of increasing length and then in order of increasing
324 * block number.
325 */
326 static int
xrep_cntbt_extent_cmp(const void * a,const void * b)327 xrep_cntbt_extent_cmp(
328 const void *a,
329 const void *b)
330 {
331 const struct xfs_alloc_rec_incore *ap = a;
332 const struct xfs_alloc_rec_incore *bp = b;
333
334 if (ap->ar_blockcount > bp->ar_blockcount)
335 return 1;
336 else if (ap->ar_blockcount < bp->ar_blockcount)
337 return -1;
338 return xrep_bnobt_extent_cmp(a, b);
339 }
340
341 /*
342 * Sort the free extents by length so so that we can put the records into the
343 * cntbt in the correct order. Don't let userspace kill us if we're resorting
344 * after allocating btree blocks.
345 */
346 STATIC int
xrep_cntbt_sort_records(struct xrep_abt * ra,bool is_resort)347 xrep_cntbt_sort_records(
348 struct xrep_abt *ra,
349 bool is_resort)
350 {
351 return xfarray_sort(ra->free_records, xrep_cntbt_extent_cmp,
352 is_resort ? 0 : XFARRAY_SORT_KILLABLE);
353 }
354
355 /*
356 * Iterate all reverse mappings to find (1) the gaps between rmap records (all
357 * unowned space), (2) the OWN_AG extents (which encompass the free space
358 * btrees, the rmapbt, and the agfl), (3) the rmapbt blocks, and (4) the AGFL
359 * blocks. The free space is (1) + (2) - (3) - (4).
360 */
361 STATIC int
xrep_abt_find_freespace(struct xrep_abt * ra)362 xrep_abt_find_freespace(
363 struct xrep_abt *ra)
364 {
365 struct xfs_scrub *sc = ra->sc;
366 struct xfs_mount *mp = sc->mp;
367 struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
368 struct xfs_buf *agfl_bp;
369 xfs_agblock_t agend;
370 int error;
371
372 xagb_bitmap_init(&ra->not_allocbt_blocks);
373
374 xrep_ag_btcur_init(sc, &sc->sa);
375
376 /*
377 * Iterate all the reverse mappings to find gaps in the physical
378 * mappings, all the OWN_AG blocks, and all the rmapbt extents.
379 */
380 error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_abt_walk_rmap, ra);
381 if (error)
382 goto err;
383
384 /* Insert a record for space between the last rmap and EOAG. */
385 agend = be32_to_cpu(agf->agf_length);
386 if (ra->next_agbno < agend) {
387 error = xrep_abt_stash(ra, agend);
388 if (error)
389 goto err;
390 }
391
392 /* Collect all the AGFL blocks. */
393 error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
394 if (error)
395 goto err;
396
397 error = xfs_agfl_walk(mp, agf, agfl_bp, xrep_abt_walk_agfl, ra);
398 if (error)
399 goto err_agfl;
400
401 /* Compute the old bnobt/cntbt blocks. */
402 error = xagb_bitmap_disunion(&ra->old_allocbt_blocks,
403 &ra->not_allocbt_blocks);
404 if (error)
405 goto err_agfl;
406
407 ra->nr_real_records = xfarray_length(ra->free_records);
408 err_agfl:
409 xfs_trans_brelse(sc->tp, agfl_bp);
410 err:
411 xchk_ag_btcur_free(&sc->sa);
412 xagb_bitmap_destroy(&ra->not_allocbt_blocks);
413 return error;
414 }
415
416 /*
417 * We're going to use the observed free space records to reserve blocks for the
418 * new free space btrees, so we play an iterative game where we try to converge
419 * on the number of blocks we need:
420 *
421 * 1. Estimate how many blocks we'll need to store the records.
422 * 2. If the first free record has more blocks than we need, we're done.
423 * We will have to re-sort the records prior to building the cntbt.
424 * 3. If that record has exactly the number of blocks we need, null out the
425 * record. We're done.
426 * 4. Otherwise, we still need more blocks. Null out the record, subtract its
427 * length from the number of blocks we need, and go back to step 1.
428 *
429 * Fortunately, we don't have to do any transaction work to play this game, so
430 * we don't have to tear down the staging cursors.
431 */
432 STATIC int
xrep_abt_reserve_space(struct xrep_abt * ra,struct xfs_btree_cur * bno_cur,struct xfs_btree_cur * cnt_cur,bool * needs_resort)433 xrep_abt_reserve_space(
434 struct xrep_abt *ra,
435 struct xfs_btree_cur *bno_cur,
436 struct xfs_btree_cur *cnt_cur,
437 bool *needs_resort)
438 {
439 struct xfs_scrub *sc = ra->sc;
440 xfarray_idx_t record_nr;
441 unsigned int allocated = 0;
442 int error = 0;
443
444 record_nr = xfarray_length(ra->free_records) - 1;
445 do {
446 struct xfs_alloc_rec_incore arec;
447 uint64_t required;
448 unsigned int desired;
449 unsigned int len;
450
451 /* Compute how many blocks we'll need. */
452 error = xfs_btree_bload_compute_geometry(cnt_cur,
453 &ra->new_cntbt.bload, ra->nr_real_records);
454 if (error)
455 break;
456
457 error = xfs_btree_bload_compute_geometry(bno_cur,
458 &ra->new_bnobt.bload, ra->nr_real_records);
459 if (error)
460 break;
461
462 /* How many btree blocks do we need to store all records? */
463 required = ra->new_bnobt.bload.nr_blocks +
464 ra->new_cntbt.bload.nr_blocks;
465 ASSERT(required < INT_MAX);
466
467 /* If we've reserved enough blocks, we're done. */
468 if (allocated >= required)
469 break;
470
471 desired = required - allocated;
472
473 /* We need space but there's none left; bye! */
474 if (ra->nr_real_records == 0) {
475 error = -ENOSPC;
476 break;
477 }
478
479 /* Grab the first record from the list. */
480 error = xfarray_load(ra->free_records, record_nr, &arec);
481 if (error)
482 break;
483
484 ASSERT(arec.ar_blockcount <= UINT_MAX);
485 len = min_t(unsigned int, arec.ar_blockcount, desired);
486
487 trace_xrep_newbt_alloc_ag_blocks(sc->mp, sc->sa.pag->pag_agno,
488 arec.ar_startblock, len, XFS_RMAP_OWN_AG);
489
490 error = xrep_newbt_add_extent(&ra->new_bnobt, sc->sa.pag,
491 arec.ar_startblock, len);
492 if (error)
493 break;
494 allocated += len;
495 ra->nr_blocks -= len;
496
497 if (arec.ar_blockcount > desired) {
498 /*
499 * Record has more space than we need. The number of
500 * free records doesn't change, so shrink the free
501 * record, inform the caller that the records are no
502 * longer sorted by length, and exit.
503 */
504 arec.ar_startblock += desired;
505 arec.ar_blockcount -= desired;
506 error = xfarray_store(ra->free_records, record_nr,
507 &arec);
508 if (error)
509 break;
510
511 *needs_resort = true;
512 return 0;
513 }
514
515 /*
516 * We're going to use up the entire record, so unset it and
517 * move on to the next one. This changes the number of free
518 * records (but doesn't break the sorting order), so we must
519 * go around the loop once more to re-run _bload_init.
520 */
521 error = xfarray_unset(ra->free_records, record_nr);
522 if (error)
523 break;
524 ra->nr_real_records--;
525 record_nr--;
526 } while (1);
527
528 return error;
529 }
530
531 STATIC int
xrep_abt_dispose_one(struct xrep_abt * ra,struct xrep_newbt_resv * resv)532 xrep_abt_dispose_one(
533 struct xrep_abt *ra,
534 struct xrep_newbt_resv *resv)
535 {
536 struct xfs_scrub *sc = ra->sc;
537 struct xfs_perag *pag = sc->sa.pag;
538 xfs_agblock_t free_agbno = resv->agbno + resv->used;
539 xfs_extlen_t free_aglen = resv->len - resv->used;
540 int error;
541
542 ASSERT(pag == resv->pag);
543
544 /* Add a deferred rmap for each extent we used. */
545 if (resv->used > 0)
546 xfs_rmap_alloc_extent(sc->tp, pag->pag_agno, resv->agbno,
547 resv->used, XFS_RMAP_OWN_AG);
548
549 /*
550 * For each reserved btree block we didn't use, add it to the free
551 * space btree. We didn't touch fdblocks when we reserved them, so
552 * we don't touch it now.
553 */
554 if (free_aglen == 0)
555 return 0;
556
557 trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno,
558 free_aglen, ra->new_bnobt.oinfo.oi_owner);
559
560 error = __xfs_free_extent(sc->tp, resv->pag, free_agbno, free_aglen,
561 &ra->new_bnobt.oinfo, XFS_AG_RESV_IGNORE, true);
562 if (error)
563 return error;
564
565 return xrep_defer_finish(sc);
566 }
567
568 /*
569 * Deal with all the space we reserved. Blocks that were allocated for the
570 * free space btrees need to have a (deferred) rmap added for the OWN_AG
571 * allocation, and blocks that didn't get used can be freed via the usual
572 * (deferred) means.
573 */
574 STATIC void
xrep_abt_dispose_reservations(struct xrep_abt * ra,int error)575 xrep_abt_dispose_reservations(
576 struct xrep_abt *ra,
577 int error)
578 {
579 struct xrep_newbt_resv *resv, *n;
580
581 if (error)
582 goto junkit;
583
584 list_for_each_entry_safe(resv, n, &ra->new_bnobt.resv_list, list) {
585 error = xrep_abt_dispose_one(ra, resv);
586 if (error)
587 goto junkit;
588 }
589
590 junkit:
591 list_for_each_entry_safe(resv, n, &ra->new_bnobt.resv_list, list) {
592 xfs_perag_put(resv->pag);
593 list_del(&resv->list);
594 kfree(resv);
595 }
596
597 xrep_newbt_cancel(&ra->new_bnobt);
598 xrep_newbt_cancel(&ra->new_cntbt);
599 }
600
601 /* Retrieve free space data for bulk load. */
602 STATIC int
xrep_abt_get_records(struct xfs_btree_cur * cur,unsigned int idx,struct xfs_btree_block * block,unsigned int nr_wanted,void * priv)603 xrep_abt_get_records(
604 struct xfs_btree_cur *cur,
605 unsigned int idx,
606 struct xfs_btree_block *block,
607 unsigned int nr_wanted,
608 void *priv)
609 {
610 struct xfs_alloc_rec_incore *arec = &cur->bc_rec.a;
611 struct xrep_abt *ra = priv;
612 union xfs_btree_rec *block_rec;
613 unsigned int loaded;
614 int error;
615
616 for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
617 error = xfarray_load_next(ra->free_records, &ra->array_cur,
618 arec);
619 if (error)
620 return error;
621
622 ra->longest = max(ra->longest, arec->ar_blockcount);
623
624 block_rec = xfs_btree_rec_addr(cur, idx, block);
625 cur->bc_ops->init_rec_from_cur(cur, block_rec);
626 }
627
628 return loaded;
629 }
630
631 /* Feed one of the new btree blocks to the bulk loader. */
632 STATIC int
xrep_abt_claim_block(struct xfs_btree_cur * cur,union xfs_btree_ptr * ptr,void * priv)633 xrep_abt_claim_block(
634 struct xfs_btree_cur *cur,
635 union xfs_btree_ptr *ptr,
636 void *priv)
637 {
638 struct xrep_abt *ra = priv;
639
640 return xrep_newbt_claim_block(cur, &ra->new_bnobt, ptr);
641 }
642
643 /*
644 * Reset the AGF counters to reflect the free space btrees that we just
645 * rebuilt, then reinitialize the per-AG data.
646 */
647 STATIC int
xrep_abt_reset_counters(struct xrep_abt * ra)648 xrep_abt_reset_counters(
649 struct xrep_abt *ra)
650 {
651 struct xfs_scrub *sc = ra->sc;
652 struct xfs_perag *pag = sc->sa.pag;
653 struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
654 unsigned int freesp_btreeblks = 0;
655
656 /*
657 * Compute the contribution to agf_btreeblks for the new free space
658 * btrees. This is the computed btree size minus anything we didn't
659 * use.
660 */
661 freesp_btreeblks += ra->new_bnobt.bload.nr_blocks - 1;
662 freesp_btreeblks += ra->new_cntbt.bload.nr_blocks - 1;
663
664 freesp_btreeblks -= xrep_newbt_unused_blocks(&ra->new_bnobt);
665 freesp_btreeblks -= xrep_newbt_unused_blocks(&ra->new_cntbt);
666
667 /*
668 * The AGF header contains extra information related to the free space
669 * btrees, so we must update those fields here.
670 */
671 agf->agf_btreeblks = cpu_to_be32(freesp_btreeblks +
672 (be32_to_cpu(agf->agf_rmap_blocks) - 1));
673 agf->agf_freeblks = cpu_to_be32(ra->nr_blocks);
674 agf->agf_longest = cpu_to_be32(ra->longest);
675 xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_BTREEBLKS |
676 XFS_AGF_LONGEST |
677 XFS_AGF_FREEBLKS);
678
679 /*
680 * After we commit the new btree to disk, it is possible that the
681 * process to reap the old btree blocks will race with the AIL trying
682 * to checkpoint the old btree blocks into the filesystem. If the new
683 * tree is shorter than the old one, the allocbt write verifier will
684 * fail and the AIL will shut down the filesystem.
685 *
686 * To avoid this, save the old incore btree height values as the alt
687 * height values before re-initializing the perag info from the updated
688 * AGF to capture all the new values.
689 */
690 pag->pagf_repair_bno_level = pag->pagf_bno_level;
691 pag->pagf_repair_cnt_level = pag->pagf_cnt_level;
692
693 /* Reinitialize with the values we just logged. */
694 return xrep_reinit_pagf(sc);
695 }
696
697 /*
698 * Use the collected free space information to stage new free space btrees.
699 * If this is successful we'll return with the new btree root
700 * information logged to the repair transaction but not yet committed.
701 */
702 STATIC int
xrep_abt_build_new_trees(struct xrep_abt * ra)703 xrep_abt_build_new_trees(
704 struct xrep_abt *ra)
705 {
706 struct xfs_scrub *sc = ra->sc;
707 struct xfs_btree_cur *bno_cur;
708 struct xfs_btree_cur *cnt_cur;
709 struct xfs_perag *pag = sc->sa.pag;
710 bool needs_resort = false;
711 int error;
712
713 /*
714 * Sort the free extents by length so that we can set up the free space
715 * btrees in as few extents as possible. This reduces the amount of
716 * deferred rmap / free work we have to do at the end.
717 */
718 error = xrep_cntbt_sort_records(ra, false);
719 if (error)
720 return error;
721
722 /*
723 * Prepare to construct the new btree by reserving disk space for the
724 * new btree and setting up all the accounting information we'll need
725 * to root the new btree while it's under construction and before we
726 * attach it to the AG header.
727 */
728 xrep_newbt_init_bare(&ra->new_bnobt, sc);
729 xrep_newbt_init_bare(&ra->new_cntbt, sc);
730
731 ra->new_bnobt.bload.get_records = xrep_abt_get_records;
732 ra->new_cntbt.bload.get_records = xrep_abt_get_records;
733
734 ra->new_bnobt.bload.claim_block = xrep_abt_claim_block;
735 ra->new_cntbt.bload.claim_block = xrep_abt_claim_block;
736
737 /* Allocate cursors for the staged btrees. */
738 bno_cur = xfs_bnobt_init_cursor(sc->mp, NULL, NULL, pag);
739 xfs_btree_stage_afakeroot(bno_cur, &ra->new_bnobt.afake);
740
741 cnt_cur = xfs_cntbt_init_cursor(sc->mp, NULL, NULL, pag);
742 xfs_btree_stage_afakeroot(cnt_cur, &ra->new_cntbt.afake);
743
744 /* Last chance to abort before we start committing fixes. */
745 if (xchk_should_terminate(sc, &error))
746 goto err_cur;
747
748 /* Reserve the space we'll need for the new btrees. */
749 error = xrep_abt_reserve_space(ra, bno_cur, cnt_cur, &needs_resort);
750 if (error)
751 goto err_cur;
752
753 /*
754 * If we need to re-sort the free extents by length, do so so that we
755 * can put the records into the cntbt in the correct order.
756 */
757 if (needs_resort) {
758 error = xrep_cntbt_sort_records(ra, needs_resort);
759 if (error)
760 goto err_cur;
761 }
762
763 /*
764 * Due to btree slack factors, it's possible for a new btree to be one
765 * level taller than the old btree. Update the alternate incore btree
766 * height so that we don't trip the verifiers when writing the new
767 * btree blocks to disk.
768 */
769 pag->pagf_repair_bno_level = ra->new_bnobt.bload.btree_height;
770 pag->pagf_repair_cnt_level = ra->new_cntbt.bload.btree_height;
771
772 /* Load the free space by length tree. */
773 ra->array_cur = XFARRAY_CURSOR_INIT;
774 ra->longest = 0;
775 error = xfs_btree_bload(cnt_cur, &ra->new_cntbt.bload, ra);
776 if (error)
777 goto err_levels;
778
779 error = xrep_bnobt_sort_records(ra);
780 if (error)
781 goto err_levels;
782
783 /* Load the free space by block number tree. */
784 ra->array_cur = XFARRAY_CURSOR_INIT;
785 error = xfs_btree_bload(bno_cur, &ra->new_bnobt.bload, ra);
786 if (error)
787 goto err_levels;
788
789 /*
790 * Install the new btrees in the AG header. After this point the old
791 * btrees are no longer accessible and the new trees are live.
792 */
793 xfs_allocbt_commit_staged_btree(bno_cur, sc->tp, sc->sa.agf_bp);
794 xfs_btree_del_cursor(bno_cur, 0);
795 xfs_allocbt_commit_staged_btree(cnt_cur, sc->tp, sc->sa.agf_bp);
796 xfs_btree_del_cursor(cnt_cur, 0);
797
798 /* Reset the AGF counters now that we've changed the btree shape. */
799 error = xrep_abt_reset_counters(ra);
800 if (error)
801 goto err_newbt;
802
803 /* Dispose of any unused blocks and the accounting information. */
804 xrep_abt_dispose_reservations(ra, error);
805
806 return xrep_roll_ag_trans(sc);
807
808 err_levels:
809 pag->pagf_repair_bno_level = 0;
810 pag->pagf_repair_cnt_level = 0;
811 err_cur:
812 xfs_btree_del_cursor(cnt_cur, error);
813 xfs_btree_del_cursor(bno_cur, error);
814 err_newbt:
815 xrep_abt_dispose_reservations(ra, error);
816 return error;
817 }
818
819 /*
820 * Now that we've logged the roots of the new btrees, invalidate all of the
821 * old blocks and free them.
822 */
823 STATIC int
xrep_abt_remove_old_trees(struct xrep_abt * ra)824 xrep_abt_remove_old_trees(
825 struct xrep_abt *ra)
826 {
827 struct xfs_perag *pag = ra->sc->sa.pag;
828 int error;
829
830 /* Free the old btree blocks if they're not in use. */
831 error = xrep_reap_agblocks(ra->sc, &ra->old_allocbt_blocks,
832 &XFS_RMAP_OINFO_AG, XFS_AG_RESV_IGNORE);
833 if (error)
834 return error;
835
836 /*
837 * Now that we've zapped all the old allocbt blocks we can turn off
838 * the alternate height mechanism.
839 */
840 pag->pagf_repair_bno_level = 0;
841 pag->pagf_repair_cnt_level = 0;
842 return 0;
843 }
844
845 /* Repair the freespace btrees for some AG. */
846 int
xrep_allocbt(struct xfs_scrub * sc)847 xrep_allocbt(
848 struct xfs_scrub *sc)
849 {
850 struct xrep_abt *ra;
851 struct xfs_mount *mp = sc->mp;
852 char *descr;
853 int error;
854
855 /* We require the rmapbt to rebuild anything. */
856 if (!xfs_has_rmapbt(mp))
857 return -EOPNOTSUPP;
858
859 ra = kzalloc(sizeof(struct xrep_abt), XCHK_GFP_FLAGS);
860 if (!ra)
861 return -ENOMEM;
862 ra->sc = sc;
863
864 /* We rebuild both data structures. */
865 sc->sick_mask = XFS_SICK_AG_BNOBT | XFS_SICK_AG_CNTBT;
866
867 /*
868 * Make sure the busy extent list is clear because we can't put extents
869 * on there twice. In theory we cleared this before we started, but
870 * let's not risk the filesystem.
871 */
872 if (!xfs_extent_busy_list_empty(sc->sa.pag)) {
873 error = -EDEADLOCK;
874 goto out_ra;
875 }
876
877 /* Set up enough storage to handle maximally fragmented free space. */
878 descr = xchk_xfile_ag_descr(sc, "free space records");
879 error = xfarray_create(descr, mp->m_sb.sb_agblocks / 2,
880 sizeof(struct xfs_alloc_rec_incore),
881 &ra->free_records);
882 kfree(descr);
883 if (error)
884 goto out_ra;
885
886 /* Collect the free space data and find the old btree blocks. */
887 xagb_bitmap_init(&ra->old_allocbt_blocks);
888 error = xrep_abt_find_freespace(ra);
889 if (error)
890 goto out_bitmap;
891
892 /* Rebuild the free space information. */
893 error = xrep_abt_build_new_trees(ra);
894 if (error)
895 goto out_bitmap;
896
897 /* Kill the old trees. */
898 error = xrep_abt_remove_old_trees(ra);
899 if (error)
900 goto out_bitmap;
901
902 out_bitmap:
903 xagb_bitmap_destroy(&ra->old_allocbt_blocks);
904 xfarray_destroy(ra->free_records);
905 out_ra:
906 kfree(ra);
907 return error;
908 }
909
910 /* Make sure both btrees are ok after we've rebuilt them. */
911 int
xrep_revalidate_allocbt(struct xfs_scrub * sc)912 xrep_revalidate_allocbt(
913 struct xfs_scrub *sc)
914 {
915 __u32 old_type = sc->sm->sm_type;
916 int error;
917
918 /*
919 * We must update sm_type temporarily so that the tree-to-tree cross
920 * reference checks will work in the correct direction, and also so
921 * that tracing will report correctly if there are more errors.
922 */
923 sc->sm->sm_type = XFS_SCRUB_TYPE_BNOBT;
924 error = xchk_allocbt(sc);
925 if (error)
926 goto out;
927
928 sc->sm->sm_type = XFS_SCRUB_TYPE_CNTBT;
929 error = xchk_allocbt(sc);
930 out:
931 sc->sm->sm_type = old_type;
932 return error;
933 }
934