/* SPDX-License-Identifier: GPL-2.0 */

#ifndef BTRFS_BLOCK_GROUP_H
#define BTRFS_BLOCK_GROUP_H

#include <linux/atomic.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/refcount.h>
#include <linux/wait.h>
#include <linux/sizes.h>
#include <linux/rwsem.h>
#include <linux/rbtree.h>
#include <uapi/linux/btrfs_tree.h>
#include "free-space-cache.h"

struct btrfs_chunk_map;
struct btrfs_fs_info;
struct btrfs_inode;
struct btrfs_trans_handle;

enum btrfs_disk_cache_state {
	BTRFS_DC_WRITTEN,
	BTRFS_DC_ERROR,
	BTRFS_DC_CLEAR,
	BTRFS_DC_SETUP,
};

enum btrfs_block_group_size_class {
	/* Unset */
	BTRFS_BG_SZ_NONE,
	/* 0 < size <= 128K */
	BTRFS_BG_SZ_SMALL,
	/* 128K < size <= 8M */
	BTRFS_BG_SZ_MEDIUM,
	/* 8M < size < BG_LENGTH */
	BTRFS_BG_SZ_LARGE,
};

/*
 * This describes the state of the block group for async discard.  It is
 * needed because discard runs in two passes, with extent discarding
 * prioritized over bitmap discarding.  BTRFS_DISCARD_RESET_CURSOR is set
 * when we are resetting between lists to prevent contention for discard
 * state variables (e.g. discard_cursor).
 */
enum btrfs_discard_state {
	BTRFS_DISCARD_EXTENTS,
	BTRFS_DISCARD_BITMAPS,
	BTRFS_DISCARD_RESET_CURSOR,
};

/*
 * Control flags for do_chunk_alloc()'s force field.
 *
 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk if we really need one.
 *
 * CHUNK_ALLOC_LIMITED means to only try to allocate one if we have very few
 * chunks already allocated.  This is used as part of the clustering code to
 * help make sure we have a good pool of storage to cluster in, without
 * filling the FS with empty chunks.
 *
 * CHUNK_ALLOC_FORCE means it must try to allocate one.
 *
 * CHUNK_ALLOC_FORCE_FOR_EXTENT is like CHUNK_ALLOC_FORCE, but called from
 * find_free_extent(), which also activates the zone.
 */
enum btrfs_chunk_alloc_enum {
	CHUNK_ALLOC_NO_FORCE,
	CHUNK_ALLOC_LIMITED,
	CHUNK_ALLOC_FORCE,
	CHUNK_ALLOC_FORCE_FOR_EXTENT,
};
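
/*
 * Illustrative sketch only (no such caller lives in this header): a typical
 * allocation path picks a target profile with one of the
 * btrfs_*_alloc_profile() helpers defined later in this file and lets the
 * chunk allocator decide whether a new chunk is actually needed:
 *
 *	u64 flags = btrfs_data_alloc_profile(fs_info);
 *	int ret = btrfs_chunk_alloc(trans, flags, CHUNK_ALLOC_NO_FORCE);
 *
 * Paths that must get a new chunk go through btrfs_force_chunk_alloc() or
 * pass CHUNK_ALLOC_FORCE / CHUNK_ALLOC_FORCE_FOR_EXTENT instead.
 */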

/* Block group flags set at runtime */
enum btrfs_block_group_flags {
	BLOCK_GROUP_FLAG_IREF,
	BLOCK_GROUP_FLAG_REMOVED,
	BLOCK_GROUP_FLAG_TO_COPY,
	BLOCK_GROUP_FLAG_RELOCATING_REPAIR,
	BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED,
	BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
	BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
	/* Does the block group need to be added to the free space tree? */
	BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE,
	/* Indicate that the block group is placed on a sequential zone */
	BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE,
	/*
	 * Indicate that the block group is in the list of new block groups of
	 * a transaction.
	 */
	BLOCK_GROUP_FLAG_NEW,
};

enum btrfs_caching_type {
	BTRFS_CACHE_NO,
	BTRFS_CACHE_STARTED,
	BTRFS_CACHE_FINISHED,
	BTRFS_CACHE_ERROR,
};

struct btrfs_caching_control {
	struct list_head list;
	struct mutex mutex;
	wait_queue_head_t wait;
	struct btrfs_work work;
	struct btrfs_block_group *block_group;
	/* Track progress of caching during allocation. */
	atomic_t progress;
	refcount_t count;
};

/* Once caching_thread() finds this much free space, it will wake up waiters. */
#define CACHING_CTL_WAKE_UP SZ_2M

struct btrfs_block_group {
	struct btrfs_fs_info *fs_info;
	struct btrfs_inode *inode;
	spinlock_t lock;
	u64 start;
	u64 length;
	u64 pinned;
	u64 reserved;
	u64 used;
	u64 delalloc_bytes;
	u64 bytes_super;
	u64 flags;
	u64 cache_generation;
	u64 global_root_id;

	/*
	 * The last committed used bytes of this block group: if the above
	 * @used is still the same as @commit_used, we don't need to update
	 * the block group item of this block group.
	 */
	u64 commit_used;
	/*
	 * If the free space extent count exceeds this number, convert the block
	 * group to bitmaps.
	 */
	u32 bitmap_high_thresh;

	/*
	 * If the free space extent count drops below this number, convert the
	 * block group back to extents.
	 */
	u32 bitmap_low_thresh;

	/*
	 * This is only used for delayed data space allocation, because only
	 * the data space allocation and the related metadata update can be
	 * done across transactions.
	 */
	struct rw_semaphore data_rwsem;

	/* For raid56, this is a full stripe, without parity */
	unsigned long full_stripe_len;
	unsigned long runtime_flags;

	unsigned int ro;

	int disk_cache_state;

	/* Cache tracking stuff */
	int cached;
	struct btrfs_caching_control *caching_ctl;

	struct btrfs_space_info *space_info;

	/* Free space cache stuff */
	struct btrfs_free_space_ctl *free_space_ctl;

	/* Block group cache stuff */
	struct rb_node cache_node;

	/* For block groups in the same raid type */
	struct list_head list;

	refcount_t refs;

	/*
	 * List of struct btrfs_free_cluster for this block group.  Today it
	 * will only have one thing on it, but that may change.
	 */
	struct list_head cluster_list;

	/*
	 * Used for several lists:
	 *
	 * 1) struct btrfs_fs_info::unused_bgs
	 * 2) struct btrfs_fs_info::reclaim_bgs
	 * 3) struct btrfs_transaction::deleted_bgs
	 * 4) struct btrfs_trans_handle::new_bgs
	 */
	struct list_head bg_list;

	/* For read-only block groups */
	struct list_head ro_list;

	/*
	 * When non-zero it means the block group's logical address and its
	 * device extents can not be reused for future block group allocations
	 * until the counter goes down to 0. This is to prevent them from being
	 * reused while some task is still using the block group after it was
	 * deleted - we want to make sure they can only be reused for new block
	 * groups after that task is done with the deleted block group.
	 */
	atomic_t frozen;

	/* For discard operations */
	struct list_head discard_list;
	int discard_index;
	u64 discard_eligible_time;
	u64 discard_cursor;
	enum btrfs_discard_state discard_state;

	/* For dirty block groups */
	struct list_head dirty_list;
	struct list_head io_list;

	struct btrfs_io_ctl io_ctl;

	/*
	 * Incremented when doing extent allocations and holding a read lock
	 * on the space_info's groups_sem semaphore.
	 * Decremented when an ordered extent that represents an IO against this
	 * block group's range is created (after it's added to its inode's
	 * root's list of ordered extents) or immediately after the allocation
	 * if it's a metadata extent or fallocate extent (for these cases we
	 * don't create ordered extents).
	 */
	atomic_t reservations;

	/*
	 * Incremented while holding the spinlock *lock* by a task checking if
	 * it can perform a nocow write (incremented if the value for the *ro*
	 * field is 0). Decremented by such tasks once they create an ordered
	 * extent or before that if some error happens before reaching that step.
	 * This is to prevent races between block group relocation and nocow
	 * writes through direct IO.
	 */
	atomic_t nocow_writers;

	/* Lock for free space tree operations. */
	struct mutex free_space_lock;

	/*
	 * Number of extents in this block group used for swap files.
	 * All accesses protected by the spinlock 'lock'.
	 */
	int swap_extents;

	/*
	 * Allocation offset for the block group to implement sequential
	 * allocation. This is used only on a zoned filesystem.
	 */
	u64 alloc_offset;
	u64 zone_unusable;
	u64 zone_capacity;
	u64 meta_write_pointer;
	struct btrfs_chunk_map *physical_map;
	struct list_head active_bg_list;
	struct work_struct zone_finish_work;
	struct extent_buffer *last_eb;
	enum btrfs_block_group_size_class size_class;
	u64 reclaim_mark;
};

static inline u64 btrfs_block_group_end(const struct btrfs_block_group *block_group)
{
	return (block_group->start + block_group->length);
}

static inline bool btrfs_is_block_group_used(const struct btrfs_block_group *bg)
{
	lockdep_assert_held(&bg->lock);

	return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0);
}
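
/*
 * Illustrative sketch only: callers must hold the block group's spinlock
 * around this check, matching the lockdep assertion above.  For example, a
 * hypothetical cleanup path could do:
 *
 *	bool used;
 *
 *	spin_lock(&bg->lock);
 *	used = btrfs_is_block_group_used(bg);
 *	spin_unlock(&bg->lock);
 *	if (!used)
 *		btrfs_mark_bg_unused(bg);
 */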

static inline bool btrfs_is_block_group_data_only(const struct btrfs_block_group *block_group)
{
	/*
	 * In mixed mode the fragmentation is expected to be high, lowering the
	 * efficiency, so only proper data block groups are considered.
	 */
	return (block_group->flags & BTRFS_BLOCK_GROUP_DATA) &&
	       !(block_group->flags & BTRFS_BLOCK_GROUP_METADATA);
}

#ifdef CONFIG_BTRFS_DEBUG
int btrfs_should_fragment_free_space(const struct btrfs_block_group *block_group);
#endif

struct btrfs_block_group *btrfs_lookup_first_block_group(
		struct btrfs_fs_info *info, u64 bytenr);
struct btrfs_block_group *btrfs_lookup_block_group(
		struct btrfs_fs_info *info, u64 bytenr);
struct btrfs_block_group *btrfs_next_block_group(
		struct btrfs_block_group *cache);
void btrfs_get_block_group(struct btrfs_block_group *cache);
void btrfs_put_block_group(struct btrfs_block_group *cache);
void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
					const u64 start);
void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg);
struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info,
						  u64 bytenr);
void btrfs_dec_nocow_writers(struct btrfs_block_group *bg);
void btrfs_wait_nocow_writers(struct btrfs_block_group *bg);
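
/*
 * Illustrative sketch only: a nocow writer checks whether the block group
 * containing @bytenr currently allows nocow writes, and drops the counter
 * once its ordered extent exists (or earlier on error):
 *
 *	struct btrfs_block_group *bg;
 *
 *	bg = btrfs_inc_nocow_writers(fs_info, bytenr);
 *	if (!bg)
 *		return false;
 *	... create the ordered extent covering the nocow range ...
 *	btrfs_dec_nocow_writers(bg);
 */
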
void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
					   u64 num_bytes);
int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait);
struct btrfs_caching_control *btrfs_get_caching_control(
		struct btrfs_block_group *cache);
int btrfs_add_new_free_space(struct btrfs_block_group *block_group,
			     u64 start, u64 end, u64 *total_added_ret);
struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
				struct btrfs_fs_info *fs_info,
				const u64 chunk_offset);
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
			     struct btrfs_chunk_map *map);
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
void btrfs_mark_bg_unused(struct btrfs_block_group *bg);
void btrfs_reclaim_bgs_work(struct work_struct *work);
void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info);
void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg);
int btrfs_read_block_groups(struct btrfs_fs_info *info);
struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
						 u64 type,
						 u64 chunk_offset, u64 size);
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans);
int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
			     bool do_chunk_alloc);
void btrfs_dec_block_group_ro(struct btrfs_block_group *cache);
int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans);
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans);
int btrfs_setup_space_cache(struct btrfs_trans_handle *trans);
int btrfs_update_block_group(struct btrfs_trans_handle *trans,
			     u64 bytenr, u64 num_bytes, bool alloc);
int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
			     u64 ram_bytes, u64 num_bytes, int delalloc,
			     bool force_wrong_size_class);
void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
			       u64 num_bytes, int delalloc);
int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
		      enum btrfs_chunk_alloc_enum force);
int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type);
void check_system_chunk(struct btrfs_trans_handle *trans, const u64 type);
void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
				  bool is_item_insertion);
u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags);
void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
int btrfs_free_block_groups(struct btrfs_fs_info *info);
int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
		     u64 physical, u64 **logical, int *naddrs, int *stripe_len);

static inline u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info)
{
	return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA);
}

static inline u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info)
{
	return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA);
}

static inline u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info)
{
	return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
}

static inline int btrfs_block_group_done(const struct btrfs_block_group *cache)
{
	smp_mb();
	return cache->cached == BTRFS_CACHE_FINISHED ||
		cache->cached == BTRFS_CACHE_ERROR;
}
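
/*
 * Illustrative sketch only, loosely modelled on the extent allocator (the
 * exact sequence in fs/btrfs/extent-tree.c differs): kick off caching of a
 * block group and wait until enough free space has been found for an
 * allocation of @num_bytes:
 *
 *	if (!btrfs_block_group_done(cache)) {
 *		ret = btrfs_cache_block_group(cache, false);
 *		if (ret)
 *			return ret;
 *		btrfs_wait_block_group_cache_progress(cache, num_bytes);
 *	}
 */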

void btrfs_freeze_block_group(struct btrfs_block_group *cache);
void btrfs_unfreeze_block_group(struct btrfs_block_group *cache);
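
/*
 * Illustrative sketch only, tied to the 'frozen' counter documented in
 * struct btrfs_block_group above: a task that needs the block group's
 * logical address and device extents to stay reserved across a possible
 * deletion brackets its use like this:
 *
 *	btrfs_freeze_block_group(bg);
 *	... keep using bg->start / bg->length ...
 *	btrfs_unfreeze_block_group(bg);
 *
 * Only when the last unfreeze drops 'frozen' to zero can the range be
 * reused by new block groups.
 */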

bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg);
void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount);
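
/*
 * Illustrative sketch only: activating a swap file extent inside a block
 * group raises the per-block-group swap extent count, and deactivating the
 * swap file later drops it by the number of extents that were activated
 * (1 in this hypothetical example, with hypothetical error handling):
 *
 *	if (!btrfs_inc_block_group_swap_extents(bg))
 *		return -EINVAL;
 *	...
 *	btrfs_dec_block_group_swap_extents(bg, 1);
 */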

enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size);
int btrfs_use_block_group_size_class(struct btrfs_block_group *bg,
				     enum btrfs_block_group_size_class size_class,
				     bool force_wrong_size_class);
bool btrfs_block_group_should_use_size_class(const struct btrfs_block_group *bg);

#endif /* BTRFS_BLOCK_GROUP_H */