/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_FORMAT_H
#define _BCACHEFS_FORMAT_H

/*
 * bcachefs on disk data structures
 *
 * OVERVIEW:
 *
 * There are three main types of on disk data structures in bcachefs (this is
 * reduced from 5 in bcache)
 *
 *  - superblock
 *  - journal
 *  - btree
 *
 * The btree is the primary structure; most metadata exists as keys in the
 * various btrees. There are only a small number of btrees, they're not
 * sharded - we have one btree for extents, another for inodes, et cetera.
 *
 * SUPERBLOCK:
 *
 * The superblock contains the location of the journal, the list of devices in
 * the filesystem, and in general any metadata we need in order to decide
 * whether we can start a filesystem, or that we need prior to reading the
 * journal/btree roots.
 *
 * The superblock is extensible, and most of the contents of the superblock are
 * in variable length, type tagged fields; see struct bch_sb_field.
 *
 * Backup superblocks do not reside in a fixed location; also, superblocks do
 * not have a fixed size. To locate backup superblocks we have struct
 * bch_sb_layout; we store a copy of this inside every superblock, and also
 * before the first superblock.
 *
 * JOURNAL:
 *
 * The journal primarily records btree updates in the order they occurred;
 * journal replay consists of just iterating over all the keys in the open
 * journal entries and re-inserting them into the btrees.
 *
 * The journal also contains entry types for the btree roots, and blacklisted
 * journal sequence numbers (see journal_seq_blacklist.c).
 *
 * BTREE:
 *
 * bcachefs btrees are copy on write b+ trees, where nodes are big (typically
 * 128k-256k) and log structured. We use struct btree_node for writing the first
 * entry in a given node (offset 0), and struct btree_node_entry for all
 * subsequent writes.
 *
 * After the header, btree node entries contain a list of keys in sorted order.
 * Values are stored inline with the keys; since values are variable length (and
 * keys effectively are variable length too, due to packing) we can't do random
 * access without building up additional in memory tables in the btree node read
 * path.
 *
 * BTREE KEYS (struct bkey):
 *
 * The various btrees share a common format for the key - so as to avoid
 * switching in fastpath lookup/comparison code - but define their own
 * structures for the key values.
 *
 * The size of a key/value pair is stored as a u8 in units of u64s, so the max
 * size is just under 2k. The common part also contains a type tag for the
 * value, and a format field indicating whether the key is packed or not (and
 * also meant to allow adding new key fields in the future, if desired).
 *
 * bkeys, when stored within a btree node, may also be packed. In that case, the
 * bkey_format in that node is used to unpack it. Packed bkeys mean that we can
 * be generous with field sizes in the common part of the key format (64 bit
 * inode number, 64 bit offset, 96 bit version field, etc.) for negligible cost.
 */

#include <asm/types.h>
#include <asm/byteorder.h>
#include <linux/kernel.h>
#include <linux/uuid.h>
#include <uapi/linux/magic.h>
#include "vstructs.h"

#ifdef __KERNEL__
typedef uuid_t __uuid_t;
#endif

#define BITMASK(name, type, field, offset, end)				\
static const __maybe_unused unsigned	name##_OFFSET = offset;		\
static const __maybe_unused unsigned	name##_BITS = (end - offset);	\
									\
static inline __u64 name(const type *k)					\
{									\
	return (k->field >> offset) & ~(~0ULL << (end - offset));	\
}									\
									\
static inline void SET_##name(type *k, __u64 v)				\
{									\
	k->field &= ~(~(~0ULL << (end - offset)) << offset);		\
	k->field |= (v & ~(~0ULL << (end - offset))) << offset;		\
}

#define LE_BITMASK(_bits, name, type, field, offset, end)		\
static const __maybe_unused unsigned	name##_OFFSET = offset;		\
static const __maybe_unused unsigned	name##_BITS = (end - offset);	\
static const __maybe_unused __u##_bits	name##_MAX = (1ULL << (end - offset)) - 1;\
									\
static inline __u64 name(const type *k)					\
{									\
	return (__le##_bits##_to_cpu(k->field) >> offset) &		\
		~(~0ULL << (end - offset));				\
}									\
									\
static inline void SET_##name(type *k, __u64 v)				\
{									\
	__u##_bits new = __le##_bits##_to_cpu(k->field);		\
									\
	new &= ~(~(~0ULL << (end - offset)) << offset);			\
	new |= (v & ~(~0ULL << (end - offset))) << offset;		\
	k->field = __cpu_to_le##_bits(new);				\
}

#define LE16_BITMASK(n, t, f, o, e)	LE_BITMASK(16, n, t, f, o, e)
#define LE32_BITMASK(n, t, f, o, e)	LE_BITMASK(32, n, t, f, o, e)
#define LE64_BITMASK(n, t, f, o, e)	LE_BITMASK(64, n, t, f, o, e)
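
/*
 * Usage sketch (illustrative, not part of the on disk format): LE_BITMASK()
 * generates endian-safe getter/setter pairs for a bitfield within a
 * little-endian integer. E.g. BCH_SB_CLEAN, defined below as
 *
 *	LE64_BITMASK(BCH_SB_CLEAN, struct bch_sb, flags[0], 1, 2);
 *
 * yields accessors for bits [1,2) of flags[0]:
 *
 *	if (!BCH_SB_CLEAN(sb))			// read, byte swapping as needed
 *		SET_BCH_SB_CLEAN(sb, 1);	// write, preserving other bits
 */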

struct bkey_format {
	__u8		key_u64s;
	__u8		nr_fields;
	/* One unused slot for now: */
	__u8		bits_per_field[6];
	__le64		field_offset[6];
};
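
/*
 * Decoding sketch (illustrative, not the full unpack path): each of the six
 * key fields is stored in bits_per_field[i] bits, biased by field_offset[i];
 * recovering one field from its packed bits is roughly
 *
 *	static __u64 unpack_field(const struct bkey_format *f,
 *				  __u64 packed_bits, unsigned i)
 *	{
 *		return packed_bits + __le64_to_cpu(f->field_offset[i]);
 *	}
 *
 * so fields clustered near a common base value pack into very few bits, and
 * key_u64s gives the total size of a packed key in u64s.
 */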

/* Btree keys - all units are in sectors */

struct bpos {
	/*
	 * Word order matches machine byte order - btree code treats a bpos as a
	 * single large integer, for search/comparison purposes
	 *
	 * Note that wherever a bpos is embedded in another on disk data
	 * structure, it has to be byte swabbed when reading in metadata that
	 * wasn't written in native endian order:
	 */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	__u32		snapshot;
	__u64		offset;
	__u64		inode;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
	__u64		inode;
	__u64		offset;		/* Points to end of extent - sectors */
	__u32		snapshot;
#else
#error edit for your odd byteorder.
#endif
} __packed
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
__aligned(4)
#endif
;

#define KEY_INODE_MAX			((__u64)~0ULL)
#define KEY_OFFSET_MAX			((__u64)~0ULL)
#define KEY_SNAPSHOT_MAX		((__u32)~0U)
#define KEY_SIZE_MAX			((__u32)~0U)

static inline struct bpos SPOS(__u64 inode, __u64 offset, __u32 snapshot)
{
	return (struct bpos) {
		.inode		= inode,
		.offset		= offset,
		.snapshot	= snapshot,
	};
}

#define POS_MIN				SPOS(0, 0, 0)
#define POS_MAX				SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, 0)
#define SPOS_MAX			SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, KEY_SNAPSHOT_MAX)
#define POS(_inode, _offset)		SPOS(_inode, _offset, 0)
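
/*
 * Usage sketch (illustrative): btrees that don't use the snapshot field
 * construct positions with POS(), leaving snapshot 0; snapshot-aware code
 * uses SPOS():
 *
 *	struct bpos p1 = POS(inode_nr, offset);			// snapshot == 0
 *	struct bpos p2 = SPOS(inode_nr, offset, snapshot);
 *
 * Since a bpos compares as a single large integer with snapshot as the least
 * significant field, POS(i, o) sorts before SPOS(i, o, s) for any s > 0.
 */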

/* Empty placeholder struct, for container_of() */
struct bch_val {
	__u64		__nothing[0];
};

struct bversion {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	__u64		lo;
	__u32		hi;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
	__u32		hi;
	__u64		lo;
#endif
} __packed
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
__aligned(4)
#endif
;

struct bkey {
	/* Size of combined key and value, in u64s */
	__u8		u64s;

	/* Format of key (0 for format local to btree node) */
#if defined(__LITTLE_ENDIAN_BITFIELD)
	__u8		format:7,
			needs_whiteout:1;
#elif defined (__BIG_ENDIAN_BITFIELD)
	__u8		needs_whiteout:1,
			format:7;
#else
#error edit for your odd byteorder.
#endif

	/* Type of the value */
	__u8		type;

#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	__u8		pad[1];

	struct bversion	bversion;
	__u32		size;		/* extent size, in sectors */
	struct bpos	p;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
	struct bpos	p;
	__u32		size;		/* extent size, in sectors */
	struct bversion	bversion;

	__u8		pad[1];
#endif
} __packed
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
/*
 * The big-endian version of bkey can't be compiled by rustc with the "aligned"
 * attr since it doesn't allow types to have both "packed" and "aligned" attrs.
 * So for Rust compatibility, don't include this. It can be included in the LE
 * version because the "packed" attr is redundant in that case.
 *
 * History: (quoting Kent)
 *
 * Specifically, when i was designing bkey, I wanted the header to be no
 * bigger than necessary so that bkey_packed could use the rest. That means that
 * decently often extent keys will fit into only 8 bytes, instead of spilling over
 * to 16.
 *
 * But packed_bkey treats the part after the header - the packed section -
 * as a single multi word, variable length integer. And bkey, the unpacked
 * version, is just a special case version of a bkey_packed; all the packed
 * bkey code will work on keys in any packed format, the in-memory
 * representation of an unpacked key also is just one type of packed key...
 *
 * So that constrains the key part of a big endian bkey to start right
 * after the header.
 *
 * If we ever do a bkey_v2 and need to expand the header by another byte for
 * some reason - that will clean up this wart.
 */
__aligned(8)
#endif
;

struct bkey_packed {
	__u64		_data[0];

	/* Size of combined key and value, in u64s */
	__u8		u64s;

	/* Format of key (0 for format local to btree node) */

	/*
	 * XXX: next incompat on disk format change, switch format and
	 * needs_whiteout - bkey_packed() will be cheaper if format is the high
	 * bits of the bitfield
	 */
#if defined(__LITTLE_ENDIAN_BITFIELD)
	__u8		format:7,
			needs_whiteout:1;
#elif defined (__BIG_ENDIAN_BITFIELD)
	__u8		needs_whiteout:1,
			format:7;
#endif

	/* Type of the value */
	__u8		type;
	__u8		key_start[0];

	/*
	 * We copy bkeys with struct assignment in various places, and while
	 * that shouldn't be done with packed bkeys we can't disallow it in C,
	 * and it's legal to cast a bkey to a bkey_packed - so padding it out
	 * to the same size as struct bkey should hopefully be safest.
	 */
	__u8		pad[sizeof(struct bkey) - 3];
} __packed __aligned(8);

typedef struct {
	__le64			lo;
	__le64			hi;
} bch_le128;

#define BKEY_U64s			(sizeof(struct bkey) / sizeof(__u64))
#define BKEY_U64s_MAX			U8_MAX
#define BKEY_VAL_U64s_MAX		(BKEY_U64s_MAX - BKEY_U64s)

#define KEY_PACKED_BITS_START		24

#define KEY_FORMAT_LOCAL_BTREE		0
#define KEY_FORMAT_CURRENT		1

enum bch_bkey_fields {
	BKEY_FIELD_INODE,
	BKEY_FIELD_OFFSET,
	BKEY_FIELD_SNAPSHOT,
	BKEY_FIELD_SIZE,
	BKEY_FIELD_VERSION_HI,
	BKEY_FIELD_VERSION_LO,
	BKEY_NR_FIELDS,
};

#define bkey_format_field(name, field)					\
	[BKEY_FIELD_##name] = (sizeof(((struct bkey *) NULL)->field) * 8)

#define BKEY_FORMAT_CURRENT						\
((struct bkey_format) {							\
	.key_u64s	= BKEY_U64s,					\
	.nr_fields	= BKEY_NR_FIELDS,				\
	.bits_per_field = {						\
		bkey_format_field(INODE,	p.inode),		\
		bkey_format_field(OFFSET,	p.offset),		\
		bkey_format_field(SNAPSHOT,	p.snapshot),		\
		bkey_format_field(SIZE,		size),			\
		bkey_format_field(VERSION_HI,	bversion.hi),		\
		bkey_format_field(VERSION_LO,	bversion.lo),		\
	},								\
})
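
/*
 * Worked example (illustrative): with the current struct bkey layout,
 * BKEY_FORMAT_CURRENT expands to
 *
 *	.bits_per_field	= { 64, 64, 32, 32, 32, 64 },
 *
 * (inode, offset, snapshot, size, version hi/lo) with all field_offsets 0 -
 * i.e. every field at its full in-memory width. Btree nodes generally write
 * a narrower node-local format computed from the keys they actually contain.
 */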

/* bkey with inline value */
struct bkey_i {
	__u64			_data[0];

	struct bkey	k;
	struct bch_val	v;
};

#define POS_KEY(_pos)							\
((struct bkey) {							\
	.u64s		= BKEY_U64s,					\
	.format		= KEY_FORMAT_CURRENT,				\
	.p		= _pos,						\
})

#define KEY(_inode, _offset, _size)					\
((struct bkey) {							\
	.u64s		= BKEY_U64s,					\
	.format		= KEY_FORMAT_CURRENT,				\
	.p		= POS(_inode, _offset),				\
	.size		= _size,					\
})

static inline void bkey_init(struct bkey *k)
{
	*k = KEY(0, 0, 0);
}

#define bkey_bytes(_k)		((_k)->u64s * sizeof(__u64))

#define __BKEY_PADDED(key, pad)					\
	struct bkey_i key; __u64 key ## _pad[pad]
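
/*
 * Usage sketch (illustrative): __BKEY_PADDED() declares a bkey_i plus scratch
 * space for an inline value of known maximum size, e.g. on the stack:
 *
 *	struct { __BKEY_PADDED(k, 8); } tmp;
 *
 *	bkey_init(&tmp.k.k);		// tmp.k.k = KEY(0, 0, 0)
 *	// ... fill in up to 8 u64s of value, bumping tmp.k.k.u64s to match
 */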

/*
 * - DELETED keys are used internally to mark keys that should be ignored but
 *   override keys in composition order.  Their version number is ignored.
 *
 * - DISCARDED keys indicate that the data is all 0s because it has been
 *   discarded. DISCARDs may have a version; if the version is nonzero the key
 *   will be persistent, otherwise the key will be dropped whenever the btree
 *   node is rewritten (like DELETED keys).
 *
 * - ERROR: any read of the data returns a read error, as the data was lost due
 *   to a failing device. Like DISCARDED keys, they can be removed (overridden)
 *   by new writes or cluster-wide GC. Node repair can also overwrite them with
 *   the same or a more recent version number, but not with an older version
 *   number.
 *
 * - WHITEOUT: for hash table btrees
 */
#define BCH_BKEY_TYPES()				\
	x(deleted,		0)			\
	x(whiteout,		1)			\
	x(error,		2)			\
	x(cookie,		3)			\
	x(hash_whiteout,	4)			\
	x(btree_ptr,		5)			\
	x(extent,		6)			\
	x(reservation,		7)			\
	x(inode,		8)			\
	x(inode_generation,	9)			\
	x(dirent,		10)			\
	x(xattr,		11)			\
	x(alloc,		12)			\
	x(quota,		13)			\
	x(stripe,		14)			\
	x(reflink_p,		15)			\
	x(reflink_v,		16)			\
	x(inline_data,		17)			\
	x(btree_ptr_v2,		18)			\
	x(indirect_inline_data,	19)			\
	x(alloc_v2,		20)			\
	x(subvolume,		21)			\
	x(snapshot,		22)			\
	x(inode_v2,		23)			\
	x(alloc_v3,		24)			\
	x(set,			25)			\
	x(lru,			26)			\
	x(alloc_v4,		27)			\
	x(backpointer,		28)			\
	x(inode_v3,		29)			\
	x(bucket_gens,		30)			\
	x(snapshot_tree,	31)			\
	x(logged_op_truncate,	32)			\
	x(logged_op_finsert,	33)			\
	x(accounting,		34)

enum bch_bkey_type {
#define x(name, nr) KEY_TYPE_##name	= nr,
	BCH_BKEY_TYPES()
#undef x
	KEY_TYPE_MAX,
};

struct bch_deleted {
	struct bch_val		v;
};

struct bch_whiteout {
	struct bch_val		v;
};

struct bch_error {
	struct bch_val		v;
};

struct bch_cookie {
	struct bch_val		v;
	__le64			cookie;
};

struct bch_hash_whiteout {
	struct bch_val		v;
};

struct bch_set {
	struct bch_val		v;
};

/* 128 bits, sufficient for cryptographic MACs: */
struct bch_csum {
	__le64			lo;
	__le64			hi;
} __packed __aligned(8);

struct bch_backpointer {
	struct bch_val		v;
	__u8			btree_id;
	__u8			level;
	__u8			data_type;
	__u64			bucket_offset:40;
	__u32			bucket_len;
	struct bpos		pos;
} __packed __aligned(8);

/* Optional/variable size superblock sections: */

struct bch_sb_field {
	__u64			_data[0];
	__le32			u64s;
	__le32			type;
};

#define BCH_SB_FIELDS()				\
	x(journal,			0)	\
	x(members_v1,			1)	\
	x(crypt,			2)	\
	x(replicas_v0,			3)	\
	x(quota,			4)	\
	x(disk_groups,			5)	\
	x(clean,			6)	\
	x(replicas,			7)	\
	x(journal_seq_blacklist,	8)	\
	x(journal_v2,			9)	\
	x(counters,			10)	\
	x(members_v2,			11)	\
	x(errors,			12)	\
	x(ext,				13)	\
	x(downgrade,			14)

#include "alloc_background_format.h"
#include "dirent_format.h"
#include "disk_accounting_format.h"
#include "disk_groups_format.h"
#include "extents_format.h"
#include "ec_format.h"
#include "inode_format.h"
#include "journal_seq_blacklist_format.h"
#include "logged_ops_format.h"
#include "lru_format.h"
#include "quota_format.h"
#include "reflink_format.h"
#include "replicas_format.h"
#include "snapshot_format.h"
#include "subvolume_format.h"
#include "sb-counters_format.h"
#include "sb-downgrade_format.h"
#include "sb-errors_format.h"
#include "sb-members_format.h"
#include "xattr_format.h"

enum bch_sb_field_type {
#define x(f, nr)	BCH_SB_FIELD_##f = nr,
	BCH_SB_FIELDS()
#undef x
	BCH_SB_FIELD_NR
};

/*
 * Most superblock fields are replicated in all devices' superblocks - a few are
 * not:
 */
#define BCH_SINGLE_DEVICE_SB_FIELDS		\
	((1U << BCH_SB_FIELD_journal)|		\
	 (1U << BCH_SB_FIELD_journal_v2))

/* BCH_SB_FIELD_journal: */

struct bch_sb_field_journal {
	struct bch_sb_field	field;
	__le64			buckets[];
};

struct bch_sb_field_journal_v2 {
	struct bch_sb_field	field;

	struct bch_sb_field_journal_v2_entry {
		__le64		start;
		__le64		nr;
	}			d[];
};

/* BCH_SB_FIELD_crypt: */

struct nonce {
	__le32			d[4];
};

struct bch_key {
	__le64			key[4];
};

#define BCH_KEY_MAGIC					\
	(((__u64) 'b' <<  0)|((__u64) 'c' <<  8)|		\
	 ((__u64) 'h' << 16)|((__u64) '*' << 24)|		\
	 ((__u64) '*' << 32)|((__u64) 'k' << 40)|		\
	 ((__u64) 'e' << 48)|((__u64) 'y' << 56))

struct bch_encrypted_key {
	__le64			magic;
	struct bch_key		key;
};

/*
 * If this field is present in the superblock, it stores an encryption key which
 * is used to encrypt all other data/metadata. The key will normally be encrypted
 * with the key userspace provides, but if encryption has been turned off we'll
 * just store the master key unencrypted in the superblock so we can access the
 * previously encrypted data.
 */
struct bch_sb_field_crypt {
	struct bch_sb_field	field;

	__le64			flags;
	__le64			kdf_flags;
	struct bch_encrypted_key key;
};

LE64_BITMASK(BCH_CRYPT_KDF_TYPE,	struct bch_sb_field_crypt, flags, 0, 4);

enum bch_kdf_types {
	BCH_KDF_SCRYPT		= 0,
	BCH_KDF_NR		= 1,
};

/* stored as base 2 log of scrypt params: */
LE64_BITMASK(BCH_KDF_SCRYPT_N,	struct bch_sb_field_crypt, kdf_flags,  0, 16);
LE64_BITMASK(BCH_KDF_SCRYPT_R,	struct bch_sb_field_crypt, kdf_flags, 16, 32);
LE64_BITMASK(BCH_KDF_SCRYPT_P,	struct bch_sb_field_crypt, kdf_flags, 32, 48);

/*
 * On clean shutdown, store btree roots and current journal sequence number in
 * the superblock:
 */
struct jset_entry {
	__le16			u64s;
	__u8			btree_id;
	__u8			level;
	__u8			type; /* designates what this jset holds */
	__u8			pad[3];

	struct bkey_i		start[0];
	__u64			_data[];
};

struct bch_sb_field_clean {
	struct bch_sb_field	field;

	__le32			flags;
	__le16			_read_clock; /* no longer used */
	__le16			_write_clock;
	__le64			journal_seq;

	struct jset_entry	start[0];
	__u64			_data[];
};

struct bch_sb_field_ext {
	struct bch_sb_field	field;
	__le64			recovery_passes_required[2];
	__le64			errors_silent[8];
	__le64			btrees_lost_data;
};

/* Superblock: */

/*
 * New versioning scheme:
 * One common version number for all on disk data structures - superblock, btree
 * nodes, journal entries
 */
#define BCH_VERSION_MAJOR(_v)		((__u16) ((_v) >> 10))
#define BCH_VERSION_MINOR(_v)		((__u16) ((_v) & ~(~0U << 10)))
#define BCH_VERSION(_major, _minor)	(((_major) << 10)|(_minor) << 0)
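
/*
 * Worked example (illustrative): versions pack as major << 10 | minor, so
 *
 *	BCH_VERSION(1, 7)		== 0x407
 *	BCH_VERSION_MAJOR(0x407)	== 1
 *	BCH_VERSION_MINOR(0x407)	== 7
 *
 * giving a single totally ordered version number across major releases.
 */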

/*
 * field 1:		version name
 * field 2:		BCH_VERSION(major, minor)
 * field 3:		recovery passes required on upgrade
 */
#define BCH_METADATA_VERSIONS()						\
	x(bkey_renumber,		BCH_VERSION(0, 10))		\
	x(inode_btree_change,		BCH_VERSION(0, 11))		\
	x(snapshot,			BCH_VERSION(0, 12))		\
	x(inode_backpointers,		BCH_VERSION(0, 13))		\
	x(btree_ptr_sectors_written,	BCH_VERSION(0, 14))		\
	x(snapshot_2,			BCH_VERSION(0, 15))		\
	x(reflink_p_fix,		BCH_VERSION(0, 16))		\
	x(subvol_dirent,		BCH_VERSION(0, 17))		\
	x(inode_v2,			BCH_VERSION(0, 18))		\
	x(freespace,			BCH_VERSION(0, 19))		\
	x(alloc_v4,			BCH_VERSION(0, 20))		\
	x(new_data_types,		BCH_VERSION(0, 21))		\
	x(backpointers,			BCH_VERSION(0, 22))		\
	x(inode_v3,			BCH_VERSION(0, 23))		\
	x(unwritten_extents,		BCH_VERSION(0, 24))		\
	x(bucket_gens,			BCH_VERSION(0, 25))		\
	x(lru_v2,			BCH_VERSION(0, 26))		\
	x(fragmentation_lru,		BCH_VERSION(0, 27))		\
	x(no_bps_in_alloc_keys,		BCH_VERSION(0, 28))		\
	x(snapshot_trees,		BCH_VERSION(0, 29))		\
	x(major_minor,			BCH_VERSION(1,  0))		\
	x(snapshot_skiplists,		BCH_VERSION(1,  1))		\
	x(deleted_inodes,		BCH_VERSION(1,  2))		\
	x(rebalance_work,		BCH_VERSION(1,  3))		\
	x(member_seq,			BCH_VERSION(1,  4))		\
	x(subvolume_fs_parent,		BCH_VERSION(1,  5))		\
	x(btree_subvolume_children,	BCH_VERSION(1,  6))		\
	x(mi_btree_bitmap,		BCH_VERSION(1,  7))		\
	x(bucket_stripe_sectors,	BCH_VERSION(1,  8))		\
	x(disk_accounting_v2,		BCH_VERSION(1,  9))		\
	x(disk_accounting_v3,		BCH_VERSION(1, 10))		\
	x(disk_accounting_inum,		BCH_VERSION(1, 11))		\
	x(rebalance_work_acct_fix,	BCH_VERSION(1, 12))		\
	x(inode_has_child_snapshots,	BCH_VERSION(1, 13))

enum bcachefs_metadata_version {
	bcachefs_metadata_version_min = 9,
#define x(t, n)	bcachefs_metadata_version_##t = n,
	BCH_METADATA_VERSIONS()
#undef x
	bcachefs_metadata_version_max
};

static const __maybe_unused
unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_rebalance_work;

#define bcachefs_metadata_version_current	(bcachefs_metadata_version_max - 1)

#define BCH_SB_SECTOR			8

#define BCH_SB_LAYOUT_SIZE_BITS_MAX	16 /* 32 MB */

struct bch_sb_layout {
	__uuid_t		magic;	/* bcachefs superblock UUID */
	__u8			layout_type;
	__u8			sb_max_size_bits; /* base 2 of 512 byte sectors */
	__u8			nr_superblocks;
	__u8			pad[5];
	__le64			sb_offset[61];
} __packed __aligned(8);

#define BCH_SB_LAYOUT_SECTOR	7

/*
 * @offset	- sector where this sb was written
 * @version	- on disk format version
 * @version_min	- Oldest metadata version this filesystem contains; so we can
 *		  safely drop compatibility code and refuse to mount filesystems
 *		  we'd need it for
 * @magic	- identifies as a bcachefs superblock (BCHFS_MAGIC)
 * @uuid	- used for generating various magic numbers and identifying
 *                member devices, never changes
 * @user_uuid	- user visible UUID, may be changed
 * @label	- filesystem label
 * @seq		- identifies most recent superblock, incremented each time
 *		  superblock is written
 * @features	- enabled incompatible features
 */
struct bch_sb {
	struct bch_csum		csum;
	__le16			version;
	__le16			version_min;
	__le16			pad[2];
	__uuid_t		magic;
	__uuid_t		uuid;
	__uuid_t		user_uuid;
	__u8			label[BCH_SB_LABEL_SIZE];
	__le64			offset;
	__le64			seq;

	__le16			block_size;
	__u8			dev_idx;
	__u8			nr_devices;
	__le32			u64s;

	__le64			time_base_lo;
	__le32			time_base_hi;
	__le32			time_precision;

	__le64			flags[7];
	__le64			write_time;
	__le64			features[2];
	__le64			compat[2];

	struct bch_sb_layout	layout;

	struct bch_sb_field	start[0];
	__le64			_data[];
} __packed __aligned(8);

/*
 * Flags:
 * BCH_SB_INITIALIZED	- set on first mount
 * BCH_SB_CLEAN		- did we shut down cleanly? Just a hint, doesn't affect
 *			  behaviour of mount/recovery path:
 * BCH_SB_INODE_32BIT	- limit inode numbers to 32 bits
 * BCH_SB_128_BIT_MACS	- 128 bit macs instead of 80
 * BCH_SB_ENCRYPTION_TYPE - if nonzero encryption is enabled; overrides
 *			   DATA/META_CSUM_TYPE. Also indicates encryption
 *			   algorithm in use, if/when we get more than one
 */

LE16_BITMASK(BCH_SB_BLOCK_SIZE,		struct bch_sb, block_size, 0, 16);

LE64_BITMASK(BCH_SB_INITIALIZED,	struct bch_sb, flags[0],  0,  1);
LE64_BITMASK(BCH_SB_CLEAN,		struct bch_sb, flags[0],  1,  2);
LE64_BITMASK(BCH_SB_CSUM_TYPE,		struct bch_sb, flags[0],  2,  8);
LE64_BITMASK(BCH_SB_ERROR_ACTION,	struct bch_sb, flags[0],  8, 12);

LE64_BITMASK(BCH_SB_BTREE_NODE_SIZE,	struct bch_sb, flags[0], 12, 28);

LE64_BITMASK(BCH_SB_GC_RESERVE,		struct bch_sb, flags[0], 28, 33);
LE64_BITMASK(BCH_SB_ROOT_RESERVE,	struct bch_sb, flags[0], 33, 40);

LE64_BITMASK(BCH_SB_META_CSUM_TYPE,	struct bch_sb, flags[0], 40, 44);
LE64_BITMASK(BCH_SB_DATA_CSUM_TYPE,	struct bch_sb, flags[0], 44, 48);

LE64_BITMASK(BCH_SB_META_REPLICAS_WANT,	struct bch_sb, flags[0], 48, 52);
LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT,	struct bch_sb, flags[0], 52, 56);

LE64_BITMASK(BCH_SB_POSIX_ACL,		struct bch_sb, flags[0], 56, 57);
LE64_BITMASK(BCH_SB_USRQUOTA,		struct bch_sb, flags[0], 57, 58);
LE64_BITMASK(BCH_SB_GRPQUOTA,		struct bch_sb, flags[0], 58, 59);
LE64_BITMASK(BCH_SB_PRJQUOTA,		struct bch_sb, flags[0], 59, 60);

LE64_BITMASK(BCH_SB_HAS_ERRORS,		struct bch_sb, flags[0], 60, 61);
LE64_BITMASK(BCH_SB_HAS_TOPOLOGY_ERRORS,struct bch_sb, flags[0], 61, 62);

LE64_BITMASK(BCH_SB_BIG_ENDIAN,		struct bch_sb, flags[0], 62, 63);
LE64_BITMASK(BCH_SB_PROMOTE_WHOLE_EXTENTS,
					struct bch_sb, flags[0], 63, 64);

LE64_BITMASK(BCH_SB_STR_HASH_TYPE,	struct bch_sb, flags[1],  0,  4);
LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_LO,struct bch_sb, flags[1],  4,  8);
LE64_BITMASK(BCH_SB_INODE_32BIT,	struct bch_sb, flags[1],  8,  9);

LE64_BITMASK(BCH_SB_128_BIT_MACS,	struct bch_sb, flags[1],  9, 10);
LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE,	struct bch_sb, flags[1], 10, 14);

/*
 * Max size of an extent that may require bouncing to read or write
 * (checksummed, compressed): 64k
 */
LE64_BITMASK(BCH_SB_ENCODED_EXTENT_MAX_BITS,
					struct bch_sb, flags[1], 14, 20);

LE64_BITMASK(BCH_SB_META_REPLICAS_REQ,	struct bch_sb, flags[1], 20, 24);
LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ,	struct bch_sb, flags[1], 24, 28);

LE64_BITMASK(BCH_SB_PROMOTE_TARGET,	struct bch_sb, flags[1], 28, 40);
LE64_BITMASK(BCH_SB_FOREGROUND_TARGET,	struct bch_sb, flags[1], 40, 52);
LE64_BITMASK(BCH_SB_BACKGROUND_TARGET,	struct bch_sb, flags[1], 52, 64);

LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO,
					struct bch_sb, flags[2],  0,  4);
LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES,	struct bch_sb, flags[2],  4, 64);

LE64_BITMASK(BCH_SB_ERASURE_CODE,	struct bch_sb, flags[3],  0, 16);
LE64_BITMASK(BCH_SB_METADATA_TARGET,	struct bch_sb, flags[3], 16, 28);
LE64_BITMASK(BCH_SB_SHARD_INUMS,	struct bch_sb, flags[3], 28, 29);
LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30);
LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62);
LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63);
LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32);
LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33);
LE64_BITMASK(BCH_SB_NOCOW,		struct bch_sb, flags[4], 33, 34);
LE64_BITMASK(BCH_SB_WRITE_BUFFER_SIZE,	struct bch_sb, flags[4], 34, 54);
LE64_BITMASK(BCH_SB_VERSION_UPGRADE,	struct bch_sb, flags[4], 54, 56);

LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_HI,struct bch_sb, flags[4], 56, 60);
LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI,
					struct bch_sb, flags[4], 60, 64);

LE64_BITMASK(BCH_SB_VERSION_UPGRADE_COMPLETE,
					struct bch_sb, flags[5],  0, 16);
LE64_BITMASK(BCH_SB_ALLOCATOR_STUCK_TIMEOUT,
					struct bch_sb, flags[5], 16, 32);

static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)
{
	return BCH_SB_COMPRESSION_TYPE_LO(sb) | (BCH_SB_COMPRESSION_TYPE_HI(sb) << 4);
}

static inline void SET_BCH_SB_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v)
{
	SET_BCH_SB_COMPRESSION_TYPE_LO(sb, v);
	SET_BCH_SB_COMPRESSION_TYPE_HI(sb, v >> 4);
}

static inline __u64 BCH_SB_BACKGROUND_COMPRESSION_TYPE(const struct bch_sb *sb)
{
	return BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO(sb) |
		(BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI(sb) << 4);
}

static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v)
{
	SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO(sb, v);
	SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI(sb, v >> 4);
}

/*
 * Features:
 *
 * journal_seq_blacklist_v3:	gates BCH_SB_FIELD_journal_seq_blacklist
 * reflink:			gates KEY_TYPE_reflink
 * inline_data:			gates KEY_TYPE_inline_data
 * new_siphash:			gates BCH_STR_HASH_siphash
 * new_extent_overwrite:	gates BTREE_NODE_NEW_EXTENT_OVERWRITE
 */
#define BCH_SB_FEATURES()			\
	x(lz4,				0)	\
	x(gzip,				1)	\
	x(zstd,				2)	\
	x(atomic_nlink,			3)	\
	x(ec,				4)	\
	x(journal_seq_blacklist_v3,	5)	\
	x(reflink,			6)	\
	x(new_siphash,			7)	\
	x(inline_data,			8)	\
	x(new_extent_overwrite,		9)	\
	x(incompressible,		10)	\
	x(btree_ptr_v2,			11)	\
	x(extents_above_btree_updates,	12)	\
	x(btree_updates_journalled,	13)	\
	x(reflink_inline_data,		14)	\
	x(new_varint,			15)	\
	x(journal_no_flush,		16)	\
	x(alloc_v2,			17)	\
	x(extents_across_btree_nodes,	18)

#define BCH_SB_FEATURES_ALWAYS				\
	((1ULL << BCH_FEATURE_new_extent_overwrite)|	\
	 (1ULL << BCH_FEATURE_extents_above_btree_updates)|\
	 (1ULL << BCH_FEATURE_btree_updates_journalled)|\
	 (1ULL << BCH_FEATURE_alloc_v2)|\
	 (1ULL << BCH_FEATURE_extents_across_btree_nodes))

#define BCH_SB_FEATURES_ALL				\
	(BCH_SB_FEATURES_ALWAYS|			\
	 (1ULL << BCH_FEATURE_new_siphash)|		\
	 (1ULL << BCH_FEATURE_btree_ptr_v2)|		\
	 (1ULL << BCH_FEATURE_new_varint)|		\
	 (1ULL << BCH_FEATURE_journal_no_flush))

enum bch_sb_feature {
#define x(f, n) BCH_FEATURE_##f,
	BCH_SB_FEATURES()
#undef x
	BCH_FEATURE_NR,
};

#define BCH_SB_COMPAT()					\
	x(alloc_info,				0)	\
	x(alloc_metadata,			1)	\
	x(extents_above_btree_updates_done,	2)	\
	x(bformat_overflow_done,		3)

enum bch_sb_compat {
#define x(f, n) BCH_COMPAT_##f,
	BCH_SB_COMPAT()
#undef x
	BCH_COMPAT_NR,
};

/* options: */

#define BCH_VERSION_UPGRADE_OPTS()	\
	x(compatible,		0)	\
	x(incompatible,		1)	\
	x(none,			2)

enum bch_version_upgrade_opts {
#define x(t, n) BCH_VERSION_UPGRADE_##t = n,
	BCH_VERSION_UPGRADE_OPTS()
#undef x
};

#define BCH_REPLICAS_MAX		4U

#define BCH_BKEY_PTRS_MAX		16U

#define BCH_ERROR_ACTIONS()		\
	x(continue,		0)	\
	x(fix_safe,		1)	\
	x(panic,		2)	\
	x(ro,			3)

enum bch_error_actions {
#define x(t, n) BCH_ON_ERROR_##t = n,
	BCH_ERROR_ACTIONS()
#undef x
	BCH_ON_ERROR_NR
};

#define BCH_STR_HASH_TYPES()		\
	x(crc32c,		0)	\
	x(crc64,		1)	\
	x(siphash_old,		2)	\
	x(siphash,		3)

enum bch_str_hash_type {
#define x(t, n) BCH_STR_HASH_##t = n,
	BCH_STR_HASH_TYPES()
#undef x
	BCH_STR_HASH_NR
};

#define BCH_STR_HASH_OPTS()		\
	x(crc32c,		0)	\
	x(crc64,		1)	\
	x(siphash,		2)

enum bch_str_hash_opts {
#define x(t, n) BCH_STR_HASH_OPT_##t = n,
	BCH_STR_HASH_OPTS()
#undef x
	BCH_STR_HASH_OPT_NR
};

#define BCH_CSUM_TYPES()			\
	x(none,				0)	\
	x(crc32c_nonzero,		1)	\
	x(crc64_nonzero,		2)	\
	x(chacha20_poly1305_80,		3)	\
	x(chacha20_poly1305_128,	4)	\
	x(crc32c,			5)	\
	x(crc64,			6)	\
	x(xxhash,			7)

enum bch_csum_type {
#define x(t, n) BCH_CSUM_##t = n,
	BCH_CSUM_TYPES()
#undef x
	BCH_CSUM_NR
};

static const __maybe_unused unsigned bch_crc_bytes[] = {
	[BCH_CSUM_none]				= 0,
	[BCH_CSUM_crc32c_nonzero]		= 4,
	[BCH_CSUM_crc32c]			= 4,
	[BCH_CSUM_crc64_nonzero]		= 8,
	[BCH_CSUM_crc64]			= 8,
	[BCH_CSUM_xxhash]			= 8,
	[BCH_CSUM_chacha20_poly1305_80]		= 10,
	[BCH_CSUM_chacha20_poly1305_128]	= 16,
};

static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type)
{
	switch (type) {
	case BCH_CSUM_chacha20_poly1305_80:
	case BCH_CSUM_chacha20_poly1305_128:
		return true;
	default:
		return false;
	}
}

#define BCH_CSUM_OPTS()			\
	x(none,			0)	\
	x(crc32c,		1)	\
	x(crc64,		2)	\
	x(xxhash,		3)

enum bch_csum_opts {
#define x(t, n) BCH_CSUM_OPT_##t = n,
	BCH_CSUM_OPTS()
#undef x
	BCH_CSUM_OPT_NR
};

#define BCH_COMPRESSION_TYPES()		\
	x(none,			0)	\
	x(lz4_old,		1)	\
	x(gzip,			2)	\
	x(lz4,			3)	\
	x(zstd,			4)	\
	x(incompressible,	5)

enum bch_compression_type {
#define x(t, n) BCH_COMPRESSION_TYPE_##t = n,
	BCH_COMPRESSION_TYPES()
#undef x
	BCH_COMPRESSION_TYPE_NR
};

#define BCH_COMPRESSION_OPTS()		\
	x(none,		0)		\
	x(lz4,		1)		\
	x(gzip,		2)		\
	x(zstd,		3)

enum bch_compression_opts {
#define x(t, n) BCH_COMPRESSION_OPT_##t = n,
	BCH_COMPRESSION_OPTS()
#undef x
	BCH_COMPRESSION_OPT_NR
};

/*
 * Magic numbers
 *
 * The various other data structures have their own magic numbers, which are
 * xored with the first part of the cache set's UUID
 */

#define BCACHE_MAGIC							\
	UUID_INIT(0xc68573f6, 0x4e1a, 0x45ca,				\
		  0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81)
#define BCHFS_MAGIC							\
	UUID_INIT(0xc68573f6, 0x66ce, 0x90a9,				\
		  0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef)

#define BCACHEFS_STATFS_MAGIC		BCACHEFS_SUPER_MAGIC

#define JSET_MAGIC		__cpu_to_le64(0x245235c1a3625032ULL)
#define BSET_MAGIC		__cpu_to_le64(0x90135c78b99e07f5ULL)

static inline __le64 __bch2_sb_magic(struct bch_sb *sb)
{
	__le64 ret;

	memcpy(&ret, &sb->uuid, sizeof(ret));
	return ret;
}

static inline __u64 __jset_magic(struct bch_sb *sb)
{
	return __le64_to_cpu(__bch2_sb_magic(sb) ^ JSET_MAGIC);
}

static inline __u64 __bset_magic(struct bch_sb *sb)
{
	return __le64_to_cpu(__bch2_sb_magic(sb) ^ BSET_MAGIC);
}

/* Journal */

#define JSET_KEYS_U64s	(sizeof(struct jset_entry) / sizeof(__u64))

#define BCH_JSET_ENTRY_TYPES()			\
	x(btree_keys,		0)		\
	x(btree_root,		1)		\
	x(prio_ptrs,		2)		\
	x(blacklist,		3)		\
	x(blacklist_v2,		4)		\
	x(usage,		5)		\
	x(data_usage,		6)		\
	x(clock,		7)		\
	x(dev_usage,		8)		\
	x(log,			9)		\
	x(overwrite,		10)		\
	x(write_buffer_keys,	11)		\
	x(datetime,		12)

enum bch_jset_entry_type {
#define x(f, nr)	BCH_JSET_ENTRY_##f	= nr,
	BCH_JSET_ENTRY_TYPES()
#undef x
	BCH_JSET_ENTRY_NR
};

static inline bool jset_entry_is_key(struct jset_entry *e)
{
	switch (e->type) {
	case BCH_JSET_ENTRY_btree_keys:
	case BCH_JSET_ENTRY_btree_root:
	case BCH_JSET_ENTRY_write_buffer_keys:
		return true;
	}

	return false;
}

/*
 * Journal sequence numbers can be blacklisted: bsets record the max sequence
 * number of all the journal entries they contain updates for, so that on
 * recovery we can ignore those bsets that contain index updates newer than what
 * made it into the journal.
 *
 * This means that we can't reuse that journal_seq - we have to skip it, and
 * then record that we skipped it so that the next time we crash and recover we
 * don't think there was a missing journal entry.
 */
struct jset_entry_blacklist {
	struct jset_entry	entry;
	__le64			seq;
};

struct jset_entry_blacklist_v2 {
	struct jset_entry	entry;
	__le64			start;
	__le64			end;
};

#define BCH_FS_USAGE_TYPES()			\
	x(reserved,		0)		\
	x(inodes,		1)		\
	x(key_version,		2)

enum bch_fs_usage_type {
#define x(f, nr)	BCH_FS_USAGE_##f	= nr,
	BCH_FS_USAGE_TYPES()
#undef x
	BCH_FS_USAGE_NR
};

struct jset_entry_usage {
	struct jset_entry	entry;
	__le64			v;
} __packed;

struct jset_entry_data_usage {
	struct jset_entry	entry;
	__le64			v;
	struct bch_replicas_entry_v1 r;
} __packed;

struct jset_entry_clock {
	struct jset_entry	entry;
	__u8			rw;
	__u8			pad[7];
	__le64			time;
} __packed;

struct jset_entry_dev_usage_type {
	__le64			buckets;
	__le64			sectors;
	__le64			fragmented;
} __packed;

struct jset_entry_dev_usage {
	struct jset_entry	entry;
	__le32			dev;
	__u32			pad;

	__le64			_buckets_ec;		/* No longer used */
	__le64			_buckets_unavailable;	/* No longer used */

	struct jset_entry_dev_usage_type d[];
};

static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage *u)
{
	return (vstruct_bytes(&u->entry) - sizeof(struct jset_entry_dev_usage)) /
		sizeof(struct jset_entry_dev_usage_type);
}

struct jset_entry_log {
	struct jset_entry	entry;
	u8			d[];
} __packed __aligned(8);

struct jset_entry_datetime {
	struct jset_entry	entry;
	__le64			seconds;
} __packed __aligned(8);

/*
 * On disk format for a journal entry:
 * seq is monotonically increasing; every journal entry has its own unique
 * sequence number.
 *
 * last_seq is the oldest journal entry that still has keys the btree hasn't
 * flushed to disk yet.
 *
 * version is for on disk format changes.
 */
struct jset {
	struct bch_csum		csum;

	__le64			magic;
	__le64			seq;
	__le32			version;
	__le32			flags;

	__le32			u64s; /* size of d[] in u64s */

	__u8			encrypted_start[0];

	__le16			_read_clock; /* no longer used */
	__le16			_write_clock;

	/* Sequence number of oldest dirty journal entry */
	__le64			last_seq;

	struct jset_entry	start[0];
	__u64			_data[];
} __packed __aligned(8);

LE32_BITMASK(JSET_CSUM_TYPE,	struct jset, flags, 0, 4);
LE32_BITMASK(JSET_BIG_ENDIAN,	struct jset, flags, 4, 5);
LE32_BITMASK(JSET_NO_FLUSH,	struct jset, flags, 5, 6);

#define BCH_JOURNAL_BUCKETS_MIN		8

/* Btree: */

enum btree_id_flags {
	BTREE_ID_EXTENTS	= BIT(0),
	BTREE_ID_SNAPSHOTS	= BIT(1),
	BTREE_ID_SNAPSHOT_FIELD	= BIT(2),
	BTREE_ID_DATA		= BIT(3),
};

#define BCH_BTREE_IDS()								\
	x(extents,		0,	BTREE_ID_EXTENTS|BTREE_ID_SNAPSHOTS|BTREE_ID_DATA,\
	  BIT_ULL(KEY_TYPE_whiteout)|						\
	  BIT_ULL(KEY_TYPE_error)|						\
	  BIT_ULL(KEY_TYPE_cookie)|						\
	  BIT_ULL(KEY_TYPE_extent)|						\
	  BIT_ULL(KEY_TYPE_reservation)|					\
	  BIT_ULL(KEY_TYPE_reflink_p)|						\
	  BIT_ULL(KEY_TYPE_inline_data))					\
	x(inodes,		1,	BTREE_ID_SNAPSHOTS,			\
	  BIT_ULL(KEY_TYPE_whiteout)|						\
	  BIT_ULL(KEY_TYPE_inode)|						\
	  BIT_ULL(KEY_TYPE_inode_v2)|						\
	  BIT_ULL(KEY_TYPE_inode_v3)|						\
	  BIT_ULL(KEY_TYPE_inode_generation))					\
	x(dirents,		2,	BTREE_ID_SNAPSHOTS,			\
	  BIT_ULL(KEY_TYPE_whiteout)|						\
	  BIT_ULL(KEY_TYPE_hash_whiteout)|					\
	  BIT_ULL(KEY_TYPE_dirent))						\
	x(xattrs,		3,	BTREE_ID_SNAPSHOTS,			\
	  BIT_ULL(KEY_TYPE_whiteout)|						\
	  BIT_ULL(KEY_TYPE_cookie)|						\
	  BIT_ULL(KEY_TYPE_hash_whiteout)|					\
	  BIT_ULL(KEY_TYPE_xattr))						\
	x(alloc,		4,	0,					\
	  BIT_ULL(KEY_TYPE_alloc)|						\
	  BIT_ULL(KEY_TYPE_alloc_v2)|						\
	  BIT_ULL(KEY_TYPE_alloc_v3)|						\
	  BIT_ULL(KEY_TYPE_alloc_v4))						\
	x(quotas,		5,	0,					\
	  BIT_ULL(KEY_TYPE_quota))						\
	x(stripes,		6,	0,					\
	  BIT_ULL(KEY_TYPE_stripe))						\
	x(reflink,		7,	BTREE_ID_EXTENTS|BTREE_ID_DATA,		\
	  BIT_ULL(KEY_TYPE_reflink_v)|						\
	  BIT_ULL(KEY_TYPE_indirect_inline_data)|				\
	  BIT_ULL(KEY_TYPE_error))						\
	x(subvolumes,		8,	0,					\
	  BIT_ULL(KEY_TYPE_subvolume))						\
	x(snapshots,		9,	0,					\
	  BIT_ULL(KEY_TYPE_snapshot))						\
	x(lru,			10,	0,					\
	  BIT_ULL(KEY_TYPE_set))						\
	x(freespace,		11,	BTREE_ID_EXTENTS,			\
	  BIT_ULL(KEY_TYPE_set))						\
	x(need_discard,		12,	0,					\
	  BIT_ULL(KEY_TYPE_set))						\
	x(backpointers,		13,	0,					\
	  BIT_ULL(KEY_TYPE_backpointer))					\
	x(bucket_gens,		14,	0,					\
	  BIT_ULL(KEY_TYPE_bucket_gens))					\
	x(snapshot_trees,	15,	0,					\
	  BIT_ULL(KEY_TYPE_snapshot_tree))					\
	x(deleted_inodes,	16,	BTREE_ID_SNAPSHOT_FIELD,		\
	  BIT_ULL(KEY_TYPE_set))						\
	x(logged_ops,		17,	0,					\
	  BIT_ULL(KEY_TYPE_logged_op_truncate)|					\
	  BIT_ULL(KEY_TYPE_logged_op_finsert))					\
	x(rebalance_work,	18,	BTREE_ID_SNAPSHOT_FIELD,		\
	  BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie))			\
	x(subvolume_children,	19,	0,					\
	  BIT_ULL(KEY_TYPE_set))						\
	x(accounting,		20,	BTREE_ID_SNAPSHOT_FIELD,		\
	  BIT_ULL(KEY_TYPE_accounting))						\

enum btree_id {
#define x(name, nr, ...) BTREE_ID_##name = nr,
	BCH_BTREE_IDS()
#undef x
	BTREE_ID_NR
};

/*
 * Maximum number of btrees that we will _ever_ have under the current scheme,
 * where we refer to them with 64 bit bitfields - and we also need a bit for
 * the interior btree node type:
 */
#define BTREE_ID_NR_MAX		63

static inline bool btree_id_is_alloc(enum btree_id id)
{
	switch (id) {
	case BTREE_ID_alloc:
	case BTREE_ID_backpointers:
	case BTREE_ID_need_discard:
	case BTREE_ID_freespace:
	case BTREE_ID_bucket_gens:
		return true;
	default:
		return false;
	}
}

#define BTREE_MAX_DEPTH		4U

/* Btree nodes */

/*
 * Btree nodes
 *
 * On disk a btree node is a list/log of these; within each set the keys are
 * sorted
 */
struct bset {
	__le64			seq;

	/*
	 * Highest journal entry this bset contains keys for.
	 * If on recovery we don't see that journal entry, this bset is ignored:
	 * this allows us to preserve the order of all index updates after a
	 * crash, since the journal records a total order of all index updates
	 * and anything that didn't make it to the journal doesn't get used.
	 */
	__le64			journal_seq;

	__le32			flags;
	__le16			version;
	__le16			u64s; /* count of d[] in u64s */

	struct bkey_packed	start[0];
	__u64			_data[];
} __packed __aligned(8);

LE32_BITMASK(BSET_CSUM_TYPE,	struct bset, flags, 0, 4);

LE32_BITMASK(BSET_BIG_ENDIAN,	struct bset, flags, 4, 5);
LE32_BITMASK(BSET_SEPARATE_WHITEOUTS,
				struct bset, flags, 5, 6);

/* Sector offset within the btree node: */
LE32_BITMASK(BSET_OFFSET,	struct bset, flags, 16, 32);

struct btree_node {
	struct bch_csum		csum;
	__le64			magic;

	/* this flags field is encrypted, unlike bset->flags: */
	__le64			flags;

	/* Closed interval: */
	struct bpos		min_key;
	struct bpos		max_key;
	struct bch_extent_ptr	_ptr; /* not used anymore */
	struct bkey_format	format;

	union {
	struct bset		keys;
	struct {
		__u8		pad[22];
		__le16		u64s;
		__u64		_data[0];
	};
	};
} __packed __aligned(8);

LE64_BITMASK(BTREE_NODE_ID_LO,	struct btree_node, flags,  0,  4);
LE64_BITMASK(BTREE_NODE_LEVEL,	struct btree_node, flags,  4,  8);
LE64_BITMASK(BTREE_NODE_NEW_EXTENT_OVERWRITE,
				struct btree_node, flags,  8,  9);
LE64_BITMASK(BTREE_NODE_ID_HI,	struct btree_node, flags,  9, 25);
/* 25-32 unused */
LE64_BITMASK(BTREE_NODE_SEQ,	struct btree_node, flags, 32, 64);

static inline __u64 BTREE_NODE_ID(struct btree_node *n)
{
	return BTREE_NODE_ID_LO(n) | (BTREE_NODE_ID_HI(n) << 4);
}

static inline void SET_BTREE_NODE_ID(struct btree_node *n, __u64 v)
{
	SET_BTREE_NODE_ID_LO(n, v);
	SET_BTREE_NODE_ID_HI(n, v >> 4);
}

struct btree_node_entry {
	struct bch_csum		csum;

	union {
	struct bset		keys;
	struct {
		__u8		pad[22];
		__le16		u64s;
		__u64		_data[0];
	};
	};
} __packed __aligned(8);

#endif /* _BCACHEFS_FORMAT_H */