// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2014 Red Hat, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_trans.h"
#include "xfs_alloc.h"
#include "xfs_btree.h"
#include "xfs_btree_staging.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_health.h"
#include "xfs_trace.h"
#include "xfs_error.h"
#include "xfs_extent_busy.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_buf_mem.h"
#include "xfs_btree_mem.h"

static struct kmem_cache	*xfs_rmapbt_cur_cache;

/*
 * Reverse map btree.
 *
 * This is a per-ag tree used to track the owner(s) of a given extent. With
 * reflink it is possible for there to be multiple owners, which is a departure
 * from classic XFS. Owner records for data extents are inserted when the
 * extent is mapped and removed when an extent is unmapped.  Owner records for
 * all other block types (i.e. metadata) are inserted when an extent is
 * allocated and removed when an extent is freed. There can only be one owner
 * of a metadata extent, usually an inode or some other metadata structure like
 * an AG btree.
 *
 * The rmap btree is part of the free space management, so blocks for the tree
 * are sourced from the agfl. Hence we need transaction reservation support for
 * this tree so that the freelist is always large enough. This also affects the
 * minimum space we need to leave free in the AG.
 *
 * The tree is ordered by [ag block, owner, offset]. This is a large key size,
 * but it is the only way to enforce unique keys when a block can be owned by
 * multiple files at any offset. There's no need to order/search by extent
 * size for online updating/management of the tree. It is intended that most
 * reverse lookups will be to find the owner(s) of a particular block, or to
 * try to recover tree and file data from corrupt primary metadata.
 */
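
/*
 * For example (illustrative numbers, not from this file): two reflinked
 * owners of AG block 100 are kept as separate records that sort as
 *
 *	[startblock 100, owner inode 129, offset 0]
 *	[startblock 100, owner inode 130, offset 16]
 *
 * The startblock compares first, then the owner, then the offset, so both
 * records coexist with unique keys.
 */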

static struct xfs_btree_cur *
xfs_rmapbt_dup_cursor(
	struct xfs_btree_cur	*cur)
{
	return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp,
				cur->bc_ag.agbp, cur->bc_ag.pag);
}

STATIC void
xfs_rmapbt_set_root(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*ptr,
	int				inc)
{
	struct xfs_buf		*agbp = cur->bc_ag.agbp;
	struct xfs_agf		*agf = agbp->b_addr;

	ASSERT(ptr->s != 0);

	agf->agf_rmap_root = ptr->s;
	be32_add_cpu(&agf->agf_rmap_level, inc);
	cur->bc_ag.pag->pagf_rmap_level += inc;

	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
}

STATIC int
xfs_rmapbt_alloc_block(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*start,
	union xfs_btree_ptr		*new,
	int				*stat)
{
	struct xfs_buf		*agbp = cur->bc_ag.agbp;
	struct xfs_agf		*agf = agbp->b_addr;
	struct xfs_perag	*pag = cur->bc_ag.pag;
	struct xfs_alloc_arg	args = { .len = 1 };
	int			error;
	xfs_agblock_t		bno;

	/* Allocate the new block from the freelist. If we can't, give up. */
	error = xfs_alloc_get_freelist(pag, cur->bc_tp, cur->bc_ag.agbp,
				       &bno, 1);
	if (error)
		return error;
	if (bno == NULLAGBLOCK) {
		*stat = 0;
		return 0;
	}

	xfs_extent_busy_reuse(cur->bc_mp, pag, bno, 1, false);

	new->s = cpu_to_be32(bno);
	be32_add_cpu(&agf->agf_rmap_blocks, 1);
	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);

	/*
	 * Since rmapbt blocks are sourced from the AGFL, they are allocated one
	 * at a time and the reservation updates don't require a transaction.
	 */
	xfs_ag_resv_alloc_extent(pag, XFS_AG_RESV_RMAPBT, &args);

	*stat = 1;
	return 0;
}

STATIC int
xfs_rmapbt_free_block(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp)
{
	struct xfs_buf		*agbp = cur->bc_ag.agbp;
	struct xfs_agf		*agf = agbp->b_addr;
	struct xfs_perag	*pag = cur->bc_ag.pag;
	xfs_agblock_t		bno;
	int			error;

	bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp));
	be32_add_cpu(&agf->agf_rmap_blocks, -1);
	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
	error = xfs_alloc_put_freelist(pag, cur->bc_tp, agbp, NULL, bno, 1);
	if (error)
		return error;

	xfs_extent_busy_insert(cur->bc_tp, pag, bno, 1,
			      XFS_EXTENT_BUSY_SKIP_DISCARD);

	xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1);
	return 0;
}

STATIC int
xfs_rmapbt_get_minrecs(
	struct xfs_btree_cur	*cur,
	int			level)
{
	return cur->bc_mp->m_rmap_mnr[level != 0];
}

STATIC int
xfs_rmapbt_get_maxrecs(
	struct xfs_btree_cur	*cur,
	int			level)
{
	return cur->bc_mp->m_rmap_mxr[level != 0];
}

/*
 * Convert the ondisk record's offset field into the ondisk key's offset field.
 * Fork and bmbt are significant parts of the rmap record key, but written
 * status is merely a record attribute.
 */
static inline __be64 ondisk_rec_offset_to_key(const union xfs_btree_rec *rec)
{
	return rec->rmap.rm_offset & ~cpu_to_be64(XFS_RMAP_OFF_UNWRITTEN);
}
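
/*
 * A consequence of the masking above (an illustrative observation, not
 * from the original source): a written and an unwritten mapping that agree
 * on fork, bmbt status and offset produce identical key offsets, so
 * converting an unwritten extent to written does not move its record
 * within the tree.
 */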

STATIC void
xfs_rmapbt_init_key_from_rec(
	union xfs_btree_key		*key,
	const union xfs_btree_rec	*rec)
{
	key->rmap.rm_startblock = rec->rmap.rm_startblock;
	key->rmap.rm_owner = rec->rmap.rm_owner;
	key->rmap.rm_offset = ondisk_rec_offset_to_key(rec);
}

/*
 * The high key for a reverse mapping record can be computed by shifting
 * the startblock and offset to the highest value that would still map
 * to that record.  In practice this means that we add blockcount-1 to
 * the startblock for all records, and if the record is for a data/attr
 * fork mapping, we add blockcount-1 to the offset too.
 */
STATIC void
xfs_rmapbt_init_high_key_from_rec(
	union xfs_btree_key		*key,
	const union xfs_btree_rec	*rec)
{
	uint64_t			off;
	int				adj;

	adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1;

	key->rmap.rm_startblock = rec->rmap.rm_startblock;
	be32_add_cpu(&key->rmap.rm_startblock, adj);
	key->rmap.rm_owner = rec->rmap.rm_owner;
	key->rmap.rm_offset = ondisk_rec_offset_to_key(rec);
	if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) ||
	    XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset)))
		return;
	off = be64_to_cpu(key->rmap.rm_offset);
	off = (XFS_RMAP_OFF(off) + adj) | (off & ~XFS_RMAP_OFF_MASK);
	key->rmap.rm_offset = cpu_to_be64(off);
}
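
/*
 * Worked example (illustrative numbers): a data fork mapping with
 * startblock 100, blockcount 8 and offset 50 covers AG blocks 100-107, so
 * its high key is [107, owner, 57].  A record for a non-inode owner such
 * as an AG btree keeps its low-key offset, i.e. [107, owner, offset],
 * because the offset carries no file mapping in that case.
 */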

STATIC void
xfs_rmapbt_init_rec_from_cur(
	struct xfs_btree_cur	*cur,
	union xfs_btree_rec	*rec)
{
	rec->rmap.rm_startblock = cpu_to_be32(cur->bc_rec.r.rm_startblock);
	rec->rmap.rm_blockcount = cpu_to_be32(cur->bc_rec.r.rm_blockcount);
	rec->rmap.rm_owner = cpu_to_be64(cur->bc_rec.r.rm_owner);
	rec->rmap.rm_offset = cpu_to_be64(
			xfs_rmap_irec_offset_pack(&cur->bc_rec.r));
}

STATIC void
xfs_rmapbt_init_ptr_from_cur(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr)
{
	struct xfs_agf		*agf = cur->bc_ag.agbp->b_addr;

	ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno));

	ptr->s = agf->agf_rmap_root;
}

/*
 * Mask the appropriate parts of the ondisk key field for a key comparison.
 * Fork and bmbt are significant parts of the rmap record key, but written
 * status is merely a record attribute.
 */
static inline uint64_t offset_keymask(uint64_t offset)
{
	return offset & ~XFS_RMAP_OFF_UNWRITTEN;
}

STATIC int64_t
xfs_rmapbt_key_diff(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*key)
{
	struct xfs_rmap_irec		*rec = &cur->bc_rec.r;
	const struct xfs_rmap_key	*kp = &key->rmap;
	__u64				x, y;
	int64_t				d;

	d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock;
	if (d)
		return d;

	x = be64_to_cpu(kp->rm_owner);
	y = rec->rm_owner;
	if (x > y)
		return 1;
	else if (y > x)
		return -1;

	x = offset_keymask(be64_to_cpu(kp->rm_offset));
	y = offset_keymask(xfs_rmap_irec_offset_pack(rec));
	if (x > y)
		return 1;
	else if (y > x)
		return -1;
	return 0;
}
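
/*
 * The sign convention above, with an illustrative value: the result is
 * positive when the given key sorts after the record cached in the cursor
 * (e.g. key startblock 101 against cursor startblock 100 yields
 * 101 - 100 = 1), negative when it sorts before, and zero on a match.
 */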

STATIC int64_t
xfs_rmapbt_diff_two_keys(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*k1,
	const union xfs_btree_key	*k2,
	const union xfs_btree_key	*mask)
{
	const struct xfs_rmap_key	*kp1 = &k1->rmap;
	const struct xfs_rmap_key	*kp2 = &k2->rmap;
	int64_t				d;
	__u64				x, y;

	/* Doesn't make sense to mask off the physical space part */
	ASSERT(!mask || mask->rmap.rm_startblock);

	d = (int64_t)be32_to_cpu(kp1->rm_startblock) -
		     be32_to_cpu(kp2->rm_startblock);
	if (d)
		return d;

	if (!mask || mask->rmap.rm_owner) {
		x = be64_to_cpu(kp1->rm_owner);
		y = be64_to_cpu(kp2->rm_owner);
		if (x > y)
			return 1;
		else if (y > x)
			return -1;
	}

	if (!mask || mask->rmap.rm_offset) {
		/* Doesn't make sense to allow offset but not owner */
		ASSERT(!mask || mask->rmap.rm_owner);

		x = offset_keymask(be64_to_cpu(kp1->rm_offset));
		y = offset_keymask(be64_to_cpu(kp2->rm_offset));
		if (x > y)
			return 1;
		else if (y > x)
			return -1;
	}

	return 0;
}

static xfs_failaddr_t
xfs_rmapbt_verify(
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	struct xfs_perag	*pag = bp->b_pag;
	xfs_failaddr_t		fa;
	unsigned int		level;

	/*
	 * magic number and level verification
	 *
	 * During growfs operations, we can't verify the exact level or owner as
	 * the perag is not fully initialised and hence not attached to the
	 * buffer.  In this case, check against the maximum tree depth.
	 *
	 * Similarly, during log recovery we will have a perag structure
	 * attached, but the agf information will not yet have been initialised
	 * from the on disk AGF. Again, we can only check against maximum limits
	 * in this case.
	 */
	if (!xfs_verify_magic(bp, block->bb_magic))
		return __this_address;

	if (!xfs_has_rmapbt(mp))
		return __this_address;
	fa = xfs_btree_agblock_v5hdr_verify(bp);
	if (fa)
		return fa;

	level = be16_to_cpu(block->bb_level);
	if (pag && xfs_perag_initialised_agf(pag)) {
		unsigned int	maxlevel = pag->pagf_rmap_level;

#ifdef CONFIG_XFS_ONLINE_REPAIR
		/*
		 * Online repair could be rewriting the free space btrees, so
		 * we'll validate against the larger of either tree while this
		 * is going on.
		 */
		maxlevel = max_t(unsigned int, maxlevel,
				pag->pagf_repair_rmap_level);
#endif
		if (level >= maxlevel)
			return __this_address;
	} else if (level >= mp->m_rmap_maxlevels)
		return __this_address;

	return xfs_btree_agblock_verify(bp, mp->m_rmap_mxr[level != 0]);
}

static void
xfs_rmapbt_read_verify(
	struct xfs_buf	*bp)
{
	xfs_failaddr_t	fa;

	if (!xfs_btree_agblock_verify_crc(bp))
		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
	else {
		fa = xfs_rmapbt_verify(bp);
		if (fa)
			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
	}

	if (bp->b_error)
		trace_xfs_btree_corrupt(bp, _RET_IP_);
}

static void
xfs_rmapbt_write_verify(
	struct xfs_buf	*bp)
{
	xfs_failaddr_t	fa;

	fa = xfs_rmapbt_verify(bp);
	if (fa) {
		trace_xfs_btree_corrupt(bp, _RET_IP_);
		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
		return;
	}
	xfs_btree_agblock_calc_crc(bp);
}

const struct xfs_buf_ops xfs_rmapbt_buf_ops = {
	.name			= "xfs_rmapbt",
	.magic			= { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) },
	.verify_read		= xfs_rmapbt_read_verify,
	.verify_write		= xfs_rmapbt_write_verify,
	.verify_struct		= xfs_rmapbt_verify,
};

STATIC int
xfs_rmapbt_keys_inorder(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*k1,
	const union xfs_btree_key	*k2)
{
	uint32_t		x;
	uint32_t		y;
	uint64_t		a;
	uint64_t		b;

	x = be32_to_cpu(k1->rmap.rm_startblock);
	y = be32_to_cpu(k2->rmap.rm_startblock);
	if (x < y)
		return 1;
	else if (x > y)
		return 0;
	a = be64_to_cpu(k1->rmap.rm_owner);
	b = be64_to_cpu(k2->rmap.rm_owner);
	if (a < b)
		return 1;
	else if (a > b)
		return 0;
	a = offset_keymask(be64_to_cpu(k1->rmap.rm_offset));
	b = offset_keymask(be64_to_cpu(k2->rmap.rm_offset));
	if (a <= b)
		return 1;
	return 0;
}

STATIC int
xfs_rmapbt_recs_inorder(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_rec	*r1,
	const union xfs_btree_rec	*r2)
{
	uint32_t		x;
	uint32_t		y;
	uint64_t		a;
	uint64_t		b;

	x = be32_to_cpu(r1->rmap.rm_startblock);
	y = be32_to_cpu(r2->rmap.rm_startblock);
	if (x < y)
		return 1;
	else if (x > y)
		return 0;
	a = be64_to_cpu(r1->rmap.rm_owner);
	b = be64_to_cpu(r2->rmap.rm_owner);
	if (a < b)
		return 1;
	else if (a > b)
		return 0;
	a = offset_keymask(be64_to_cpu(r1->rmap.rm_offset));
	b = offset_keymask(be64_to_cpu(r2->rmap.rm_offset));
	if (a <= b)
		return 1;
	return 0;
}

STATIC enum xbtree_key_contig
xfs_rmapbt_keys_contiguous(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*key1,
	const union xfs_btree_key	*key2,
	const union xfs_btree_key	*mask)
{
	ASSERT(!mask || mask->rmap.rm_startblock);

	/*
	 * We only support checking contiguity of the physical space component.
	 * If any callers ever need more specificity than that, they'll have to
	 * implement it here.
	 */
	ASSERT(!mask || (!mask->rmap.rm_owner && !mask->rmap.rm_offset));

	return xbtree_key_contig(be32_to_cpu(key1->rmap.rm_startblock),
				 be32_to_cpu(key2->rmap.rm_startblock));
}
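
/*
 * E.g. (illustrative startblocks): keys at 107 and 108 are contiguous in
 * physical space, while 107 and 110 leave a gap.  Only the startblock
 * component participates, per the mask assertions above.
 */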

const struct xfs_btree_ops xfs_rmapbt_ops = {
	.name			= "rmap",
	.type			= XFS_BTREE_TYPE_AG,
	.geom_flags		= XFS_BTGEO_OVERLAPPING,

	.rec_len		= sizeof(struct xfs_rmap_rec),
	/* Overlapping btree; 2 keys per pointer. */
	.key_len		= 2 * sizeof(struct xfs_rmap_key),
	.ptr_len		= XFS_BTREE_SHORT_PTR_LEN,

	.lru_refs		= XFS_RMAP_BTREE_REF,
	.statoff		= XFS_STATS_CALC_INDEX(xs_rmap_2),
	.sick_mask		= XFS_SICK_AG_RMAPBT,

	.dup_cursor		= xfs_rmapbt_dup_cursor,
	.set_root		= xfs_rmapbt_set_root,
	.alloc_block		= xfs_rmapbt_alloc_block,
	.free_block		= xfs_rmapbt_free_block,
	.get_minrecs		= xfs_rmapbt_get_minrecs,
	.get_maxrecs		= xfs_rmapbt_get_maxrecs,
	.init_key_from_rec	= xfs_rmapbt_init_key_from_rec,
	.init_high_key_from_rec	= xfs_rmapbt_init_high_key_from_rec,
	.init_rec_from_cur	= xfs_rmapbt_init_rec_from_cur,
	.init_ptr_from_cur	= xfs_rmapbt_init_ptr_from_cur,
	.key_diff		= xfs_rmapbt_key_diff,
	.buf_ops		= &xfs_rmapbt_buf_ops,
	.diff_two_keys		= xfs_rmapbt_diff_two_keys,
	.keys_inorder		= xfs_rmapbt_keys_inorder,
	.recs_inorder		= xfs_rmapbt_recs_inorder,
	.keys_contiguous	= xfs_rmapbt_keys_contiguous,
};

/*
 * Create a new reverse mapping btree cursor.
 *
 * For staging cursors tp and agbp are NULL.
 */
struct xfs_btree_cur *
xfs_rmapbt_init_cursor(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp,
	struct xfs_perag	*pag)
{
	struct xfs_btree_cur	*cur;

	cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rmapbt_ops,
			mp->m_rmap_maxlevels, xfs_rmapbt_cur_cache);
	cur->bc_ag.pag = xfs_perag_hold(pag);
	cur->bc_ag.agbp = agbp;
	if (agbp) {
		struct xfs_agf		*agf = agbp->b_addr;

		cur->bc_nlevels = be32_to_cpu(agf->agf_rmap_level);
	}
	return cur;
}
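
/*
 * Typical caller pattern, as a minimal sketch (locking, transaction setup
 * and error handling elided; not taken from this file):
 *
 *	struct xfs_btree_cur	*cur;
 *	int			error;
 *
 *	cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag);
 *	error = ...queries and updates via the generic btree code...;
 *	xfs_btree_del_cursor(cur, error);
 */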

#ifdef CONFIG_XFS_BTREE_IN_MEM
static inline unsigned int
xfs_rmapbt_mem_block_maxrecs(
	unsigned int		blocklen,
	bool			leaf)
{
	if (leaf)
		return blocklen / sizeof(struct xfs_rmap_rec);
	return blocklen /
		(2 * sizeof(struct xfs_rmap_key) + sizeof(__be64));
}
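
/*
 * Worked example (illustrative, assuming a 4096-byte in-memory block, a
 * 72-byte long-format CRC header, 24-byte records and 20-byte keys):
 * blocklen is 4024, so a leaf holds 4024 / 24 = 167 records and a node
 * holds 4024 / (40 + 8) = 83 key/pointer sets.
 */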

/*
 * Validate an in-memory rmap btree block.  Callers are allowed to generate an
 * in-memory btree even if the ondisk feature is not enabled.
 */
static xfs_failaddr_t
xfs_rmapbt_mem_verify(
	struct xfs_buf		*bp)
{
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	xfs_failaddr_t		fa;
	unsigned int		level;
	unsigned int		maxrecs;

	if (!xfs_verify_magic(bp, block->bb_magic))
		return __this_address;

	fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
	if (fa)
		return fa;

	level = be16_to_cpu(block->bb_level);
	if (level >= xfs_rmapbt_maxlevels_ondisk())
		return __this_address;

	maxrecs = xfs_rmapbt_mem_block_maxrecs(
			XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN, level == 0);
	return xfs_btree_memblock_verify(bp, maxrecs);
}

static void
xfs_rmapbt_mem_rw_verify(
	struct xfs_buf	*bp)
{
	xfs_failaddr_t	fa = xfs_rmapbt_mem_verify(bp);

	if (fa)
		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
}

/* skip crc checks on in-memory btrees to save time */
static const struct xfs_buf_ops xfs_rmapbt_mem_buf_ops = {
	.name			= "xfs_rmapbt_mem",
	.magic			= { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) },
	.verify_read		= xfs_rmapbt_mem_rw_verify,
	.verify_write		= xfs_rmapbt_mem_rw_verify,
	.verify_struct		= xfs_rmapbt_mem_verify,
};

const struct xfs_btree_ops xfs_rmapbt_mem_ops = {
	.name			= "mem_rmap",
	.type			= XFS_BTREE_TYPE_MEM,
	.geom_flags		= XFS_BTGEO_OVERLAPPING,

	.rec_len		= sizeof(struct xfs_rmap_rec),
	/* Overlapping btree; 2 keys per pointer. */
	.key_len		= 2 * sizeof(struct xfs_rmap_key),
	.ptr_len		= XFS_BTREE_LONG_PTR_LEN,

	.lru_refs		= XFS_RMAP_BTREE_REF,
	.statoff		= XFS_STATS_CALC_INDEX(xs_rmap_mem_2),

	.dup_cursor		= xfbtree_dup_cursor,
	.set_root		= xfbtree_set_root,
	.alloc_block		= xfbtree_alloc_block,
	.free_block		= xfbtree_free_block,
	.get_minrecs		= xfbtree_get_minrecs,
	.get_maxrecs		= xfbtree_get_maxrecs,
	.init_key_from_rec	= xfs_rmapbt_init_key_from_rec,
	.init_high_key_from_rec	= xfs_rmapbt_init_high_key_from_rec,
	.init_rec_from_cur	= xfs_rmapbt_init_rec_from_cur,
	.init_ptr_from_cur	= xfbtree_init_ptr_from_cur,
	.key_diff		= xfs_rmapbt_key_diff,
	.buf_ops		= &xfs_rmapbt_mem_buf_ops,
	.diff_two_keys		= xfs_rmapbt_diff_two_keys,
	.keys_inorder		= xfs_rmapbt_keys_inorder,
	.recs_inorder		= xfs_rmapbt_recs_inorder,
	.keys_contiguous	= xfs_rmapbt_keys_contiguous,
};

/* Create a cursor for an in-memory btree. */
struct xfs_btree_cur *
xfs_rmapbt_mem_cursor(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	struct xfbtree		*xfbt)
{
	struct xfs_btree_cur	*cur;
	struct xfs_mount	*mp = pag->pag_mount;

	cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rmapbt_mem_ops,
			xfs_rmapbt_maxlevels_ondisk(), xfs_rmapbt_cur_cache);
	cur->bc_mem.xfbtree = xfbt;
	cur->bc_nlevels = xfbt->nlevels;

	cur->bc_mem.pag = xfs_perag_hold(pag);
	return cur;
}

/* Create an in-memory rmap btree. */
int
xfs_rmapbt_mem_init(
	struct xfs_mount	*mp,
	struct xfbtree		*xfbt,
	struct xfs_buftarg	*btp,
	xfs_agnumber_t		agno)
{
	xfbt->owner = agno;
	return xfbtree_init(mp, xfbt, btp, &xfs_rmapbt_mem_ops);
}

/* Compute the max possible height for reverse mapping btrees in memory. */
static unsigned int
xfs_rmapbt_mem_maxlevels(void)
{
	unsigned int		minrecs[2];
	unsigned int		blocklen;

	blocklen = XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN;

	minrecs[0] = xfs_rmapbt_mem_block_maxrecs(blocklen, true) / 2;
	minrecs[1] = xfs_rmapbt_mem_block_maxrecs(blocklen, false) / 2;

	/*
	 * How tall can an in-memory rmap btree become if we filled the entire
	 * AG with rmap records?
	 */
	return xfs_btree_compute_maxlevels(minrecs,
			XFS_MAX_AG_BYTES / sizeof(struct xfs_rmap_rec));
}
#else
# define xfs_rmapbt_mem_maxlevels()	(0)
#endif /* CONFIG_XFS_BTREE_IN_MEM */

/*
 * Install a new reverse mapping btree root.  Caller is responsible for
 * invalidating and freeing the old btree blocks.
 */
void
xfs_rmapbt_commit_staged_btree(
	struct xfs_btree_cur	*cur,
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp)
{
	struct xfs_agf		*agf = agbp->b_addr;
	struct xbtree_afakeroot	*afake = cur->bc_ag.afake;

	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);

	agf->agf_rmap_root = cpu_to_be32(afake->af_root);
	agf->agf_rmap_level = cpu_to_be32(afake->af_levels);
	agf->agf_rmap_blocks = cpu_to_be32(afake->af_blocks);
	xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS |
				    XFS_AGF_RMAP_BLOCKS);
	xfs_btree_commit_afakeroot(cur, tp, agbp);
}

/* Calculate number of records in a reverse mapping btree block. */
static inline unsigned int
xfs_rmapbt_block_maxrecs(
	unsigned int		blocklen,
	bool			leaf)
{
	if (leaf)
		return blocklen / sizeof(struct xfs_rmap_rec);
	return blocklen /
		(2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rmap_ptr_t));
}
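
/*
 * Worked example (illustrative, assuming a 4096-byte block and the 56-byte
 * short-format CRC header deducted by xfs_rmapbt_maxrecs() below):
 * blocklen is 4040, so a leaf holds 4040 / 24 = 168 records and a node
 * holds 4040 / (40 + 4) = 91 key/pointer sets.
 */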

/*
 * Calculate the number of records in an rmap btree block, net of the space
 * consumed by the block header.
 */
unsigned int
xfs_rmapbt_maxrecs(
	struct xfs_mount	*mp,
	unsigned int		blocklen,
	bool			leaf)
{
	blocklen -= XFS_RMAP_BLOCK_LEN;
	return xfs_rmapbt_block_maxrecs(blocklen, leaf);
}

/* Compute the max possible height for reverse mapping btrees. */
unsigned int
xfs_rmapbt_maxlevels_ondisk(void)
{
	unsigned int		minrecs[2];
	unsigned int		blocklen;

	blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN;

	minrecs[0] = xfs_rmapbt_block_maxrecs(blocklen, true) / 2;
	minrecs[1] = xfs_rmapbt_block_maxrecs(blocklen, false) / 2;

	/*
	 * Compute the asymptotic maxlevels for an rmapbt on any reflink fs.
	 *
	 * On a reflink filesystem, each AG block can have up to 2^32 (per the
	 * refcount record format) owners, which means that theoretically we
	 * could face up to 2^64 rmap records.  However, we're likely to run
	 * out of blocks in the AG long before that happens, which means that
	 * we must compute the max height based on what the btree will look
	 * like if it consumes almost all the blocks in the AG due to maximal
	 * sharing factor.
	 */
	return max(xfs_btree_space_to_height(minrecs, XFS_MAX_CRC_AG_BLOCKS),
		   xfs_rmapbt_mem_maxlevels());
}
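
/*
 * For instance (illustrative arithmetic from the definitions above,
 * assuming a 1024-byte minimum CRC block and the 56-byte short header):
 * blocklen is 968, so the worst-case half-full blocks hold
 * (968 / 24) / 2 = 20 leaf records and (968 / 44) / 2 = 11 key/pointer
 * sets, and the height bound follows from how many such blocks fit in a
 * maximally sized AG.
 */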

/* Compute the maximum height of an rmap btree. */
void
xfs_rmapbt_compute_maxlevels(
	struct xfs_mount		*mp)
{
	if (!xfs_has_rmapbt(mp)) {
		mp->m_rmap_maxlevels = 0;
		return;
	}

	if (xfs_has_reflink(mp)) {
		/*
		 * Compute the asymptotic maxlevels for an rmap btree on a
		 * filesystem that supports reflink.
		 *
		 * On a reflink filesystem, each AG block can have up to 2^32
		 * (per the refcount record format) owners, which means that
		 * theoretically we could face up to 2^64 rmap records.
		 * However, we're likely to run out of blocks in the AG long
		 * before that happens, which means that we must compute the
		 * max height based on what the btree will look like if it
		 * consumes almost all the blocks in the AG due to maximal
		 * sharing factor.
		 */
		mp->m_rmap_maxlevels = xfs_btree_space_to_height(mp->m_rmap_mnr,
				mp->m_sb.sb_agblocks);
	} else {
		/*
		 * If there's no block sharing, compute the maximum rmapbt
		 * height assuming one rmap record per AG block.
		 */
		mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(
				mp->m_rmap_mnr, mp->m_sb.sb_agblocks);
	}
	ASSERT(mp->m_rmap_maxlevels <= xfs_rmapbt_maxlevels_ondisk());
}

/* Calculate the rmap btree size for some records. */
xfs_extlen_t
xfs_rmapbt_calc_size(
	struct xfs_mount	*mp,
	unsigned long long	len)
{
	return xfs_btree_calc_size(mp->m_rmap_mnr, len);
}

/*
 * Calculate the maximum rmap btree size.
 */
xfs_extlen_t
xfs_rmapbt_max_size(
	struct xfs_mount	*mp,
	xfs_agblock_t		agblocks)
{
	/* Bail out if we're uninitialized, which can happen in mkfs. */
	if (mp->m_rmap_mxr[0] == 0)
		return 0;

	return xfs_rmapbt_calc_size(mp, agblocks);
}

/*
 * Figure out how many blocks to reserve and how many are used by this btree.
 */
int
xfs_rmapbt_calc_reserves(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	struct xfs_perag	*pag,
	xfs_extlen_t		*ask,
	xfs_extlen_t		*used)
{
	struct xfs_buf		*agbp;
	struct xfs_agf		*agf;
	xfs_agblock_t		agblocks;
	xfs_extlen_t		tree_len;
	int			error;

	if (!xfs_has_rmapbt(mp))
		return 0;

	error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
	if (error)
		return error;

	agf = agbp->b_addr;
	agblocks = be32_to_cpu(agf->agf_length);
	tree_len = be32_to_cpu(agf->agf_rmap_blocks);
	xfs_trans_brelse(tp, agbp);

	/*
	 * The log is permanently allocated, so the space it occupies will
	 * never be available for the kinds of things that would require btree
	 * expansion.  We therefore can pretend the space isn't there.
	 */
	if (xfs_ag_contains_log(mp, pag->pag_agno))
		agblocks -= mp->m_sb.sb_logblocks;

	/* Reserve 1% of the AG or enough for 1 block per record. */
	*ask += max(agblocks / 100, xfs_rmapbt_max_size(mp, agblocks));
	*used += tree_len;

	return error;
}
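
/*
 * E.g. (illustrative numbers): for an AG of 1048576 usable blocks the ask
 * grows by max(1048576 / 100, xfs_rmapbt_max_size(mp, 1048576)), i.e. at
 * least 10485 blocks, while the blocks the tree already occupies count as
 * used.
 */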

int __init
xfs_rmapbt_init_cur_cache(void)
{
	xfs_rmapbt_cur_cache = kmem_cache_create("xfs_rmapbt_cur",
			xfs_btree_cur_sizeof(xfs_rmapbt_maxlevels_ondisk()),
			0, 0, NULL);

	if (!xfs_rmapbt_cur_cache)
		return -ENOMEM;
	return 0;
}

void
xfs_rmapbt_destroy_cur_cache(void)
{
	kmem_cache_destroy(xfs_rmapbt_cur_cache);
	xfs_rmapbt_cur_cache = NULL;
}