1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * Copyright (C) 2022-2023 Oracle. All Rights Reserved.
4 * Author: Darrick J. Wong <djwong@kernel.org>
5 */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_btree.h"
13 #include "xfs_btree_staging.h"
14 #include "xfs_log_format.h"
15 #include "xfs_trans.h"
16 #include "xfs_sb.h"
17 #include "xfs_inode.h"
18 #include "xfs_alloc.h"
19 #include "xfs_rmap.h"
20 #include "xfs_ag.h"
21 #include "xfs_defer.h"
22 #include "scrub/scrub.h"
23 #include "scrub/common.h"
24 #include "scrub/trace.h"
25 #include "scrub/repair.h"
26 #include "scrub/newbt.h"
27
28 /*
29 * Estimate proper slack values for a btree that's being reloaded.
30 *
31 * Under most circumstances, we'll take whatever default loading value the
32 * btree bulk loading code calculates for us. However, there are some
33 * exceptions to this rule:
34 *
35 * (0) If someone turned one of the debug knobs.
36 * (1) If this is a per-AG btree and the AG has less than 10% space free.
37 * (2) If this is an inode btree and the FS has less than 10% space free.
38
39 * In either case, format the new btree blocks almost completely full to
40 * minimize space usage.
41 */
42 static void
xrep_newbt_estimate_slack(struct xrep_newbt * xnr)43 xrep_newbt_estimate_slack(
44 struct xrep_newbt *xnr)
45 {
46 struct xfs_scrub *sc = xnr->sc;
47 struct xfs_btree_bload *bload = &xnr->bload;
48 uint64_t free;
49 uint64_t sz;
50
51 /*
52 * The xfs_globals values are set to -1 (i.e. take the bload defaults)
53 * unless someone has set them otherwise, so we just pull the values
54 * here.
55 */
56 bload->leaf_slack = xfs_globals.bload_leaf_slack;
57 bload->node_slack = xfs_globals.bload_node_slack;
58
59 if (sc->ops->type == ST_PERAG) {
60 free = sc->sa.pag->pagf_freeblks;
61 sz = xfs_ag_block_count(sc->mp, sc->sa.pag->pag_agno);
62 } else {
63 free = percpu_counter_sum(&sc->mp->m_fdblocks);
64 sz = sc->mp->m_sb.sb_dblocks;
65 }
66
67 /* No further changes if there's more than 10% free space left. */
68 if (free >= div_u64(sz, 10))
69 return;
70
71 /*
72 * We're low on space; load the btrees as tightly as possible. Leave
73 * a couple of open slots in each btree block so that we don't end up
74 * splitting the btrees like crazy after a mount.
75 */
76 if (bload->leaf_slack < 0)
77 bload->leaf_slack = 2;
78 if (bload->node_slack < 0)
79 bload->node_slack = 2;
80 }
81
82 /* Initialize accounting resources for staging a new AG btree. */
83 void
xrep_newbt_init_ag(struct xrep_newbt * xnr,struct xfs_scrub * sc,const struct xfs_owner_info * oinfo,xfs_fsblock_t alloc_hint,enum xfs_ag_resv_type resv)84 xrep_newbt_init_ag(
85 struct xrep_newbt *xnr,
86 struct xfs_scrub *sc,
87 const struct xfs_owner_info *oinfo,
88 xfs_fsblock_t alloc_hint,
89 enum xfs_ag_resv_type resv)
90 {
91 memset(xnr, 0, sizeof(struct xrep_newbt));
92 xnr->sc = sc;
93 xnr->oinfo = *oinfo; /* structure copy */
94 xnr->alloc_hint = alloc_hint;
95 xnr->resv = resv;
96 INIT_LIST_HEAD(&xnr->resv_list);
97 xnr->bload.max_dirty = XFS_B_TO_FSBT(sc->mp, 256U << 10); /* 256K */
98 xrep_newbt_estimate_slack(xnr);
99 }
100
101 /* Initialize accounting resources for staging a new inode fork btree. */
102 int
xrep_newbt_init_inode(struct xrep_newbt * xnr,struct xfs_scrub * sc,int whichfork,const struct xfs_owner_info * oinfo)103 xrep_newbt_init_inode(
104 struct xrep_newbt *xnr,
105 struct xfs_scrub *sc,
106 int whichfork,
107 const struct xfs_owner_info *oinfo)
108 {
109 struct xfs_ifork *ifp;
110
111 ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS);
112 if (!ifp)
113 return -ENOMEM;
114
115 xrep_newbt_init_ag(xnr, sc, oinfo,
116 XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino),
117 XFS_AG_RESV_NONE);
118 xnr->ifake.if_fork = ifp;
119 xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, whichfork);
120 return 0;
121 }
122
123 /*
124 * Initialize accounting resources for staging a new btree. Callers are
125 * expected to add their own reservations (and clean them up) manually.
126 */
127 void
xrep_newbt_init_bare(struct xrep_newbt * xnr,struct xfs_scrub * sc)128 xrep_newbt_init_bare(
129 struct xrep_newbt *xnr,
130 struct xfs_scrub *sc)
131 {
132 xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK,
133 XFS_AG_RESV_NONE);
134 }
135
136 /*
137 * Designate specific blocks to be used to build our new btree. @pag must be
138 * a passive reference.
139 */
140 STATIC int
xrep_newbt_add_blocks(struct xrep_newbt * xnr,struct xfs_perag * pag,const struct xfs_alloc_arg * args)141 xrep_newbt_add_blocks(
142 struct xrep_newbt *xnr,
143 struct xfs_perag *pag,
144 const struct xfs_alloc_arg *args)
145 {
146 struct xfs_mount *mp = xnr->sc->mp;
147 struct xrep_newbt_resv *resv;
148 int error;
149
150 resv = kmalloc(sizeof(struct xrep_newbt_resv), XCHK_GFP_FLAGS);
151 if (!resv)
152 return -ENOMEM;
153
154 INIT_LIST_HEAD(&resv->list);
155 resv->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
156 resv->len = args->len;
157 resv->used = 0;
158 resv->pag = xfs_perag_hold(pag);
159
160 if (args->tp) {
161 ASSERT(xnr->oinfo.oi_offset == 0);
162
163 error = xfs_alloc_schedule_autoreap(args,
164 XFS_FREE_EXTENT_SKIP_DISCARD, &resv->autoreap);
165 if (error)
166 goto out_pag;
167 }
168
169 list_add_tail(&resv->list, &xnr->resv_list);
170 return 0;
171 out_pag:
172 xfs_perag_put(resv->pag);
173 kfree(resv);
174 return error;
175 }
176
177 /*
178 * Add an extent to the new btree reservation pool. Callers are required to
179 * reap this reservation manually if the repair is cancelled. @pag must be a
180 * passive reference.
181 */
182 int
xrep_newbt_add_extent(struct xrep_newbt * xnr,struct xfs_perag * pag,xfs_agblock_t agbno,xfs_extlen_t len)183 xrep_newbt_add_extent(
184 struct xrep_newbt *xnr,
185 struct xfs_perag *pag,
186 xfs_agblock_t agbno,
187 xfs_extlen_t len)
188 {
189 struct xfs_mount *mp = xnr->sc->mp;
190 struct xfs_alloc_arg args = {
191 .tp = NULL, /* no autoreap */
192 .oinfo = xnr->oinfo,
193 .fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, agbno),
194 .len = len,
195 .resv = xnr->resv,
196 };
197
198 return xrep_newbt_add_blocks(xnr, pag, &args);
199 }
200
201 /* Don't let our allocation hint take us beyond this AG */
202 static inline void
xrep_newbt_validate_ag_alloc_hint(struct xrep_newbt * xnr)203 xrep_newbt_validate_ag_alloc_hint(
204 struct xrep_newbt *xnr)
205 {
206 struct xfs_scrub *sc = xnr->sc;
207 xfs_agnumber_t agno = XFS_FSB_TO_AGNO(sc->mp, xnr->alloc_hint);
208
209 if (agno == sc->sa.pag->pag_agno &&
210 xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
211 return;
212
213 xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno,
214 XFS_AGFL_BLOCK(sc->mp) + 1);
215 }
216
217 /* Allocate disk space for a new per-AG btree. */
218 STATIC int
xrep_newbt_alloc_ag_blocks(struct xrep_newbt * xnr,uint64_t nr_blocks)219 xrep_newbt_alloc_ag_blocks(
220 struct xrep_newbt *xnr,
221 uint64_t nr_blocks)
222 {
223 struct xfs_scrub *sc = xnr->sc;
224 struct xfs_mount *mp = sc->mp;
225 int error = 0;
226
227 ASSERT(sc->sa.pag != NULL);
228
229 while (nr_blocks > 0) {
230 struct xfs_alloc_arg args = {
231 .tp = sc->tp,
232 .mp = mp,
233 .oinfo = xnr->oinfo,
234 .minlen = 1,
235 .maxlen = nr_blocks,
236 .prod = 1,
237 .resv = xnr->resv,
238 };
239 xfs_agnumber_t agno;
240
241 xrep_newbt_validate_ag_alloc_hint(xnr);
242
243 if (xnr->alloc_vextent)
244 error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
245 else
246 error = xfs_alloc_vextent_near_bno(&args,
247 xnr->alloc_hint);
248 if (error)
249 return error;
250 if (args.fsbno == NULLFSBLOCK)
251 return -ENOSPC;
252
253 agno = XFS_FSB_TO_AGNO(mp, args.fsbno);
254
255 trace_xrep_newbt_alloc_ag_blocks(mp, agno,
256 XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
257 xnr->oinfo.oi_owner);
258
259 if (agno != sc->sa.pag->pag_agno) {
260 ASSERT(agno == sc->sa.pag->pag_agno);
261 return -EFSCORRUPTED;
262 }
263
264 error = xrep_newbt_add_blocks(xnr, sc->sa.pag, &args);
265 if (error)
266 return error;
267
268 nr_blocks -= args.len;
269 xnr->alloc_hint = args.fsbno + args.len;
270
271 error = xrep_defer_finish(sc);
272 if (error)
273 return error;
274 }
275
276 return 0;
277 }
278
279 /* Don't let our allocation hint take us beyond EOFS */
280 static inline void
xrep_newbt_validate_file_alloc_hint(struct xrep_newbt * xnr)281 xrep_newbt_validate_file_alloc_hint(
282 struct xrep_newbt *xnr)
283 {
284 struct xfs_scrub *sc = xnr->sc;
285
286 if (xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
287 return;
288
289 xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, 0, XFS_AGFL_BLOCK(sc->mp) + 1);
290 }
291
292 /* Allocate disk space for our new file-based btree. */
293 STATIC int
xrep_newbt_alloc_file_blocks(struct xrep_newbt * xnr,uint64_t nr_blocks)294 xrep_newbt_alloc_file_blocks(
295 struct xrep_newbt *xnr,
296 uint64_t nr_blocks)
297 {
298 struct xfs_scrub *sc = xnr->sc;
299 struct xfs_mount *mp = sc->mp;
300 int error = 0;
301
302 while (nr_blocks > 0) {
303 struct xfs_alloc_arg args = {
304 .tp = sc->tp,
305 .mp = mp,
306 .oinfo = xnr->oinfo,
307 .minlen = 1,
308 .maxlen = nr_blocks,
309 .prod = 1,
310 .resv = xnr->resv,
311 };
312 struct xfs_perag *pag;
313 xfs_agnumber_t agno;
314
315 xrep_newbt_validate_file_alloc_hint(xnr);
316
317 if (xnr->alloc_vextent)
318 error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
319 else
320 error = xfs_alloc_vextent_start_ag(&args,
321 xnr->alloc_hint);
322 if (error)
323 return error;
324 if (args.fsbno == NULLFSBLOCK)
325 return -ENOSPC;
326
327 agno = XFS_FSB_TO_AGNO(mp, args.fsbno);
328
329 trace_xrep_newbt_alloc_file_blocks(mp, agno,
330 XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
331 xnr->oinfo.oi_owner);
332
333 pag = xfs_perag_get(mp, agno);
334 if (!pag) {
335 ASSERT(0);
336 return -EFSCORRUPTED;
337 }
338
339 error = xrep_newbt_add_blocks(xnr, pag, &args);
340 xfs_perag_put(pag);
341 if (error)
342 return error;
343
344 nr_blocks -= args.len;
345 xnr->alloc_hint = args.fsbno + args.len;
346
347 error = xrep_defer_finish(sc);
348 if (error)
349 return error;
350 }
351
352 return 0;
353 }
354
355 /* Allocate disk space for our new btree. */
356 int
xrep_newbt_alloc_blocks(struct xrep_newbt * xnr,uint64_t nr_blocks)357 xrep_newbt_alloc_blocks(
358 struct xrep_newbt *xnr,
359 uint64_t nr_blocks)
360 {
361 if (xnr->sc->ip)
362 return xrep_newbt_alloc_file_blocks(xnr, nr_blocks);
363 return xrep_newbt_alloc_ag_blocks(xnr, nr_blocks);
364 }
365
366 /*
367 * Free the unused part of a space extent that was reserved for a new ondisk
368 * structure. Returns the number of EFIs logged or a negative errno.
369 */
370 STATIC int
xrep_newbt_free_extent(struct xrep_newbt * xnr,struct xrep_newbt_resv * resv,bool btree_committed)371 xrep_newbt_free_extent(
372 struct xrep_newbt *xnr,
373 struct xrep_newbt_resv *resv,
374 bool btree_committed)
375 {
376 struct xfs_scrub *sc = xnr->sc;
377 xfs_agblock_t free_agbno = resv->agbno;
378 xfs_extlen_t free_aglen = resv->len;
379 xfs_fsblock_t fsbno;
380 int error;
381
382 if (!btree_committed || resv->used == 0) {
383 /*
384 * If we're not committing a new btree or we didn't use the
385 * space reservation, let the existing EFI free the entire
386 * space extent.
387 */
388 trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno,
389 free_agbno, free_aglen, xnr->oinfo.oi_owner);
390 xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
391 return 1;
392 }
393
394 /*
395 * We used space and committed the btree. Cancel the autoreap, remove
396 * the written blocks from the reservation, and possibly log a new EFI
397 * to free any unused reservation space.
398 */
399 xfs_alloc_cancel_autoreap(sc->tp, &resv->autoreap);
400 free_agbno += resv->used;
401 free_aglen -= resv->used;
402
403 if (free_aglen == 0)
404 return 0;
405
406 trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno,
407 free_aglen, xnr->oinfo.oi_owner);
408
409 ASSERT(xnr->resv != XFS_AG_RESV_AGFL);
410 ASSERT(xnr->resv != XFS_AG_RESV_IGNORE);
411
412 /*
413 * Use EFIs to free the reservations. This reduces the chance
414 * that we leak blocks if the system goes down.
415 */
416 fsbno = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno, free_agbno);
417 error = xfs_free_extent_later(sc->tp, fsbno, free_aglen, &xnr->oinfo,
418 xnr->resv, XFS_FREE_EXTENT_SKIP_DISCARD);
419 if (error)
420 return error;
421
422 return 1;
423 }
424
425 /* Free all the accounting info and disk space we reserved for a new btree. */
426 STATIC int
xrep_newbt_free(struct xrep_newbt * xnr,bool btree_committed)427 xrep_newbt_free(
428 struct xrep_newbt *xnr,
429 bool btree_committed)
430 {
431 struct xfs_scrub *sc = xnr->sc;
432 struct xrep_newbt_resv *resv, *n;
433 unsigned int freed = 0;
434 int error = 0;
435
436 /*
437 * If the filesystem already went down, we can't free the blocks. Skip
438 * ahead to freeing the incore metadata because we can't fix anything.
439 */
440 if (xfs_is_shutdown(sc->mp))
441 goto junkit;
442
443 list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
444 int ret;
445
446 ret = xrep_newbt_free_extent(xnr, resv, btree_committed);
447 list_del(&resv->list);
448 xfs_perag_put(resv->pag);
449 kfree(resv);
450 if (ret < 0) {
451 error = ret;
452 goto junkit;
453 }
454
455 freed += ret;
456 if (freed >= XREP_MAX_ITRUNCATE_EFIS) {
457 error = xrep_defer_finish(sc);
458 if (error)
459 goto junkit;
460 freed = 0;
461 }
462 }
463
464 if (freed)
465 error = xrep_defer_finish(sc);
466
467 junkit:
468 /*
469 * If we still have reservations attached to @newbt, cleanup must have
470 * failed and the filesystem is about to go down. Clean up the incore
471 * reservations and try to commit to freeing the space we used.
472 */
473 list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
474 xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
475 list_del(&resv->list);
476 xfs_perag_put(resv->pag);
477 kfree(resv);
478 }
479
480 if (sc->ip) {
481 kmem_cache_free(xfs_ifork_cache, xnr->ifake.if_fork);
482 xnr->ifake.if_fork = NULL;
483 }
484
485 return error;
486 }
487
488 /*
489 * Free all the accounting info and unused disk space allocations after
490 * committing a new btree.
491 */
492 int
xrep_newbt_commit(struct xrep_newbt * xnr)493 xrep_newbt_commit(
494 struct xrep_newbt *xnr)
495 {
496 return xrep_newbt_free(xnr, true);
497 }
498
499 /*
500 * Free all the accounting info and all of the disk space we reserved for a new
501 * btree that we're not going to commit. We want to try to roll things back
502 * cleanly for things like ENOSPC midway through allocation.
503 */
504 void
xrep_newbt_cancel(struct xrep_newbt * xnr)505 xrep_newbt_cancel(
506 struct xrep_newbt *xnr)
507 {
508 xrep_newbt_free(xnr, false);
509 }
510
511 /* Feed one of the reserved btree blocks to the bulk loader. */
512 int
xrep_newbt_claim_block(struct xfs_btree_cur * cur,struct xrep_newbt * xnr,union xfs_btree_ptr * ptr)513 xrep_newbt_claim_block(
514 struct xfs_btree_cur *cur,
515 struct xrep_newbt *xnr,
516 union xfs_btree_ptr *ptr)
517 {
518 struct xrep_newbt_resv *resv;
519 struct xfs_mount *mp = cur->bc_mp;
520 xfs_agblock_t agbno;
521
522 /*
523 * The first item in the list should always have a free block unless
524 * we're completely out.
525 */
526 resv = list_first_entry(&xnr->resv_list, struct xrep_newbt_resv, list);
527 if (resv->used == resv->len)
528 return -ENOSPC;
529
530 /*
531 * Peel off a block from the start of the reservation. We allocate
532 * blocks in order to place blocks on disk in increasing record or key
533 * order. The block reservations tend to end up on the list in
534 * decreasing order, which hopefully results in leaf blocks ending up
535 * together.
536 */
537 agbno = resv->agbno + resv->used;
538 resv->used++;
539
540 /* If we used all the blocks in this reservation, move it to the end. */
541 if (resv->used == resv->len)
542 list_move_tail(&resv->list, &xnr->resv_list);
543
544 trace_xrep_newbt_claim_block(mp, resv->pag->pag_agno, agbno, 1,
545 xnr->oinfo.oi_owner);
546
547 if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
548 ptr->l = cpu_to_be64(XFS_AGB_TO_FSB(mp, resv->pag->pag_agno,
549 agbno));
550 else
551 ptr->s = cpu_to_be32(agbno);
552
553 /* Relog all the EFIs. */
554 return xrep_defer_finish(xnr->sc);
555 }
556
557 /* How many reserved blocks are unused? */
558 unsigned int
xrep_newbt_unused_blocks(struct xrep_newbt * xnr)559 xrep_newbt_unused_blocks(
560 struct xrep_newbt *xnr)
561 {
562 struct xrep_newbt_resv *resv;
563 unsigned int unused = 0;
564
565 list_for_each_entry(resv, &xnr->resv_list, list)
566 unused += resv->len - resv->used;
567 return unused;
568 }
569