1  // SPDX-License-Identifier: GPL-2.0
2  
3  #include <linux/slab.h>
4  #include <trace/events/btrfs.h>
5  #include "messages.h"
6  #include "ctree.h"
7  #include "extent_io.h"
8  #include "extent-io-tree.h"
9  #include "btrfs_inode.h"
10  
11  static struct kmem_cache *extent_state_cache;
12  
static inline bool extent_state_in_tree(const struct extent_state *state)
14  {
15  	return !RB_EMPTY_NODE(&state->rb_node);
16  }
17  
18  #ifdef CONFIG_BTRFS_DEBUG
19  static LIST_HEAD(states);
20  static DEFINE_SPINLOCK(leak_lock);
21  
static inline void btrfs_leak_debug_add_state(struct extent_state *state)
23  {
24  	unsigned long flags;
25  
26  	spin_lock_irqsave(&leak_lock, flags);
27  	list_add(&state->leak_list, &states);
28  	spin_unlock_irqrestore(&leak_lock, flags);
29  }
30  
static inline void btrfs_leak_debug_del_state(struct extent_state *state)
32  {
33  	unsigned long flags;
34  
35  	spin_lock_irqsave(&leak_lock, flags);
36  	list_del(&state->leak_list);
37  	spin_unlock_irqrestore(&leak_lock, flags);
38  }
39  
static inline void btrfs_extent_state_leak_debug_check(void)
41  {
42  	struct extent_state *state;
43  
44  	while (!list_empty(&states)) {
45  		state = list_entry(states.next, struct extent_state, leak_list);
46  		pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
47  		       state->start, state->end, state->state,
48  		       extent_state_in_tree(state),
49  		       refcount_read(&state->refs));
50  		list_del(&state->leak_list);
51  		WARN_ON_ONCE(1);
52  		kmem_cache_free(extent_state_cache, state);
53  	}
54  }
55  
56  #define btrfs_debug_check_extent_io_range(tree, start, end)		\
57  	__btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
static inline void __btrfs_debug_check_extent_io_range(const char *caller,
59  						       struct extent_io_tree *tree,
60  						       u64 start, u64 end)
61  {
62  	const struct btrfs_inode *inode;
63  	u64 isize;
64  
65  	if (tree->owner != IO_TREE_INODE_IO)
66  		return;
67  
68  	inode = extent_io_tree_to_inode_const(tree);
69  	isize = i_size_read(&inode->vfs_inode);
70  	if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
71  		btrfs_debug_rl(inode->root->fs_info,
72  		    "%s: ino %llu isize %llu odd range [%llu,%llu]",
73  			caller, btrfs_ino(inode), isize, start, end);
74  	}
75  }
76  #else
77  #define btrfs_leak_debug_add_state(state)		do {} while (0)
78  #define btrfs_leak_debug_del_state(state)		do {} while (0)
79  #define btrfs_extent_state_leak_debug_check()		do {} while (0)
80  #define btrfs_debug_check_extent_io_range(c, s, e)	do {} while (0)
81  #endif
82  
84  /*
85   * The only tree allowed to set the inode is IO_TREE_INODE_IO.
86   */
static bool is_inode_io_tree(const struct extent_io_tree *tree)
88  {
89  	return tree->owner == IO_TREE_INODE_IO;
90  }
91  
92  /* Return the inode if it's valid for the given tree, otherwise NULL. */
struct btrfs_inode *extent_io_tree_to_inode(struct extent_io_tree *tree)
94  {
95  	if (tree->owner == IO_TREE_INODE_IO)
96  		return tree->inode;
97  	return NULL;
98  }
99  
100  /* Read-only access to the inode. */
const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_tree *tree)
102  {
103  	if (tree->owner == IO_TREE_INODE_IO)
104  		return tree->inode;
105  	return NULL;
106  }
107  
108  /* For read-only access to fs_info. */
const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tree *tree)
110  {
111  	if (tree->owner == IO_TREE_INODE_IO)
112  		return tree->inode->root->fs_info;
113  	return tree->fs_info;
114  }
115  
void extent_io_tree_init(struct btrfs_fs_info *fs_info,
117  			 struct extent_io_tree *tree, unsigned int owner)
118  {
119  	tree->state = RB_ROOT;
120  	spin_lock_init(&tree->lock);
121  	tree->fs_info = fs_info;
122  	tree->owner = owner;
123  }
124  
125  /*
126   * Empty an io tree, removing and freeing every extent state record from the
127   * tree. This should be called once we are sure no other task can access the
128   * tree anymore, so no tree updates happen after we empty the tree and there
129   * aren't any waiters on any extent state record (EXTENT_LOCK_BITS are never
130   * set on any extent state when calling this function).
131   */
void extent_io_tree_release(struct extent_io_tree *tree)
133  {
134  	struct rb_root root;
135  	struct extent_state *state;
136  	struct extent_state *tmp;
137  
138  	spin_lock(&tree->lock);
139  	root = tree->state;
140  	tree->state = RB_ROOT;
141  	rbtree_postorder_for_each_entry_safe(state, tmp, &root, rb_node) {
142  		/* Clear node to keep free_extent_state() happy. */
143  		RB_CLEAR_NODE(&state->rb_node);
144  		ASSERT(!(state->state & EXTENT_LOCK_BITS));
145  		/*
146  		 * No need for a memory barrier here, as we are holding the tree
147  		 * lock and we only change the waitqueue while holding that lock
148  		 * (see wait_extent_bit()).
149  		 */
150  		ASSERT(!waitqueue_active(&state->wq));
151  		free_extent_state(state);
152  		cond_resched_lock(&tree->lock);
153  	}
154  	/*
155  	 * Should still be empty even after a reschedule, no other task should
156  	 * be accessing the tree anymore.
157  	 */
158  	ASSERT(RB_EMPTY_ROOT(&tree->state));
159  	spin_unlock(&tree->lock);
160  }
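
/*
 * Illustrative lifecycle sketch, not taken from a specific caller: an io tree
 * is initialized once, used to set and clear bits on byte ranges, and released
 * only when no other task can reach it anymore. The owner value below is just
 * an example.
 *
 *	struct extent_io_tree tree;
 *
 *	extent_io_tree_init(fs_info, &tree, IO_TREE_FS_EXCLUDED_EXTENTS);
 *	... set and clear extent bits on ranges of the tree ...
 *	extent_io_tree_release(&tree);
 */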
161  
static struct extent_state *alloc_extent_state(gfp_t mask)
163  {
164  	struct extent_state *state;
165  
	/*
	 * The given mask might not be appropriate for the slab allocator,
	 * drop the unsupported bits.
	 */
170  	mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
171  	state = kmem_cache_alloc(extent_state_cache, mask);
172  	if (!state)
173  		return state;
174  	state->state = 0;
175  	RB_CLEAR_NODE(&state->rb_node);
176  	btrfs_leak_debug_add_state(state);
177  	refcount_set(&state->refs, 1);
178  	init_waitqueue_head(&state->wq);
179  	trace_alloc_extent_state(state, mask, _RET_IP_);
180  	return state;
181  }
182  
static struct extent_state *alloc_extent_state_atomic(struct extent_state *prealloc)
184  {
185  	if (!prealloc)
186  		prealloc = alloc_extent_state(GFP_ATOMIC);
187  
188  	return prealloc;
189  }
190  
void free_extent_state(struct extent_state *state)
192  {
193  	if (!state)
194  		return;
195  	if (refcount_dec_and_test(&state->refs)) {
196  		WARN_ON(extent_state_in_tree(state));
197  		btrfs_leak_debug_del_state(state);
198  		trace_free_extent_state(state, _RET_IP_);
199  		kmem_cache_free(extent_state_cache, state);
200  	}
201  }
202  
static int add_extent_changeset(struct extent_state *state, u32 bits,
204  				 struct extent_changeset *changeset,
205  				 int set)
206  {
207  	int ret;
208  
209  	if (!changeset)
210  		return 0;
211  	if (set && (state->state & bits) == bits)
212  		return 0;
213  	if (!set && (state->state & bits) == 0)
214  		return 0;
215  	changeset->bytes_changed += state->end - state->start + 1;
216  	ret = ulist_add(&changeset->range_changed, state->start, state->end,
217  			GFP_ATOMIC);
218  	return ret;
219  }
220  
static inline struct extent_state *next_state(struct extent_state *state)
222  {
223  	struct rb_node *next = rb_next(&state->rb_node);
224  
225  	if (next)
226  		return rb_entry(next, struct extent_state, rb_node);
227  	else
228  		return NULL;
229  }
230  
static inline struct extent_state *prev_state(struct extent_state *state)
232  {
233  	struct rb_node *next = rb_prev(&state->rb_node);
234  
235  	if (next)
236  		return rb_entry(next, struct extent_state, rb_node);
237  	else
238  		return NULL;
239  }
240  
241  /*
242   * Search @tree for an entry that contains @offset. Such entry would have
243   * entry->start <= offset && entry->end >= offset.
244   *
245   * @tree:       the tree to search
246   * @offset:     offset that should fall within an entry in @tree
247   * @node_ret:   pointer where new node should be anchored (used when inserting an
248   *	        entry in the tree)
249   * @parent_ret: points to entry which would have been the parent of the entry,
250   *               containing @offset
251   *
252   * Return a pointer to the entry that contains @offset byte address and don't change
253   * @node_ret and @parent_ret.
254   *
255   * If no such entry exists, return pointer to entry that ends before @offset
256   * and fill parameters @node_ret and @parent_ret, ie. does not return NULL.
257   */
static inline struct extent_state *tree_search_for_insert(struct extent_io_tree *tree,
259  							  u64 offset,
260  							  struct rb_node ***node_ret,
261  							  struct rb_node **parent_ret)
262  {
263  	struct rb_root *root = &tree->state;
264  	struct rb_node **node = &root->rb_node;
265  	struct rb_node *prev = NULL;
266  	struct extent_state *entry = NULL;
267  
268  	while (*node) {
269  		prev = *node;
270  		entry = rb_entry(prev, struct extent_state, rb_node);
271  
272  		if (offset < entry->start)
273  			node = &(*node)->rb_left;
274  		else if (offset > entry->end)
275  			node = &(*node)->rb_right;
276  		else
277  			return entry;
278  	}
279  
280  	if (node_ret)
281  		*node_ret = node;
282  	if (parent_ret)
283  		*parent_ret = prev;
284  
285  	/* Search neighbors until we find the first one past the end */
286  	while (entry && offset > entry->end)
287  		entry = next_state(entry);
288  
289  	return entry;
290  }
291  
292  /*
293   * Search offset in the tree or fill neighbor rbtree node pointers.
294   *
295   * @tree:      the tree to search
296   * @offset:    offset that should fall within an entry in @tree
297   * @next_ret:  pointer to the first entry whose range ends after @offset
298   * @prev_ret:  pointer to the first entry whose range begins before @offset
299   *
300   * Return a pointer to the entry that contains @offset byte address. If no
301   * such entry exists, then return NULL and fill @prev_ret and @next_ret.
302   * Otherwise return the found entry and other pointers are left untouched.
303   */
static struct extent_state *tree_search_prev_next(struct extent_io_tree *tree,
305  						  u64 offset,
306  						  struct extent_state **prev_ret,
307  						  struct extent_state **next_ret)
308  {
309  	struct rb_root *root = &tree->state;
310  	struct rb_node **node = &root->rb_node;
311  	struct extent_state *orig_prev;
312  	struct extent_state *entry = NULL;
313  
314  	ASSERT(prev_ret);
315  	ASSERT(next_ret);
316  
317  	while (*node) {
318  		entry = rb_entry(*node, struct extent_state, rb_node);
319  
320  		if (offset < entry->start)
321  			node = &(*node)->rb_left;
322  		else if (offset > entry->end)
323  			node = &(*node)->rb_right;
324  		else
325  			return entry;
326  	}
327  
328  	orig_prev = entry;
329  	while (entry && offset > entry->end)
330  		entry = next_state(entry);
331  	*next_ret = entry;
332  	entry = orig_prev;
333  
334  	while (entry && offset < entry->start)
335  		entry = prev_state(entry);
336  	*prev_ret = entry;
337  
338  	return NULL;
339  }
340  
341  /*
342   * Inexact rb-tree search, return the next entry if @offset is not found
343   */
static inline struct extent_state *tree_search(struct extent_io_tree *tree, u64 offset)
345  {
346  	return tree_search_for_insert(tree, offset, NULL, NULL);
347  }
348  
static void extent_io_tree_panic(const struct extent_io_tree *tree,
350  				 const struct extent_state *state,
351  				 const char *opname,
352  				 int err)
353  {
354  	btrfs_panic(extent_io_tree_to_fs_info(tree), err,
355  		    "extent io tree error on %s state start %llu end %llu",
356  		    opname, state->start, state->end);
357  }
358  
static void merge_prev_state(struct extent_io_tree *tree, struct extent_state *state)
360  {
361  	struct extent_state *prev;
362  
363  	prev = prev_state(state);
364  	if (prev && prev->end == state->start - 1 && prev->state == state->state) {
365  		if (is_inode_io_tree(tree))
366  			btrfs_merge_delalloc_extent(extent_io_tree_to_inode(tree),
367  						    state, prev);
368  		state->start = prev->start;
369  		rb_erase(&prev->rb_node, &tree->state);
370  		RB_CLEAR_NODE(&prev->rb_node);
371  		free_extent_state(prev);
372  	}
373  }
374  
static void merge_next_state(struct extent_io_tree *tree, struct extent_state *state)
376  {
377  	struct extent_state *next;
378  
379  	next = next_state(state);
380  	if (next && next->start == state->end + 1 && next->state == state->state) {
381  		if (is_inode_io_tree(tree))
382  			btrfs_merge_delalloc_extent(extent_io_tree_to_inode(tree),
383  						    state, next);
384  		state->end = next->end;
385  		rb_erase(&next->rb_node, &tree->state);
386  		RB_CLEAR_NODE(&next->rb_node);
387  		free_extent_state(next);
388  	}
389  }
390  
391  /*
392   * Utility function to look for merge candidates inside a given range.  Any
393   * extents with matching state are merged together into a single extent in the
394   * tree.  Extents with EXTENT_IO in their state field are not merged because
395   * the end_io handlers need to be able to do operations on them without
396   * sleeping (or doing allocations/splits).
397   *
398   * This should be called with the tree lock held.
399   */
static void merge_state(struct extent_io_tree *tree, struct extent_state *state)
401  {
402  	if (state->state & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY))
403  		return;
404  
405  	merge_prev_state(tree, state);
406  	merge_next_state(tree, state);
407  }
408  
static void set_state_bits(struct extent_io_tree *tree,
410  			   struct extent_state *state,
411  			   u32 bits, struct extent_changeset *changeset)
412  {
413  	u32 bits_to_set = bits & ~EXTENT_CTLBITS;
414  	int ret;
415  
416  	if (is_inode_io_tree(tree))
417  		btrfs_set_delalloc_extent(extent_io_tree_to_inode(tree), state, bits);
418  
419  	ret = add_extent_changeset(state, bits_to_set, changeset, 1);
420  	BUG_ON(ret < 0);
421  	state->state |= bits_to_set;
422  }
423  
424  /*
425   * Insert an extent_state struct into the tree.  'bits' are set on the
426   * struct before it is inserted.
427   *
428   * Returns a pointer to the struct extent_state record containing the range
429   * requested for insertion, which may be the same as the given struct or it
430   * may be an existing record in the tree that was expanded to accommodate the
431   * requested range. In case of an extent_state different from the one that was
 * given, the latter can be freed or reused by the caller.
433   *
434   * On error it returns an error pointer.
435   *
436   * The tree lock is not taken internally.  This is a utility function and
437   * probably isn't what you want to call (see set/clear_extent_bit).
438   */
static struct extent_state *insert_state(struct extent_io_tree *tree,
440  					 struct extent_state *state,
441  					 u32 bits,
442  					 struct extent_changeset *changeset)
443  {
444  	struct rb_node **node;
445  	struct rb_node *parent = NULL;
446  	const u64 start = state->start - 1;
447  	const u64 end = state->end + 1;
448  	const bool try_merge = !(bits & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY));
449  
450  	set_state_bits(tree, state, bits, changeset);
451  
452  	node = &tree->state.rb_node;
453  	while (*node) {
454  		struct extent_state *entry;
455  
456  		parent = *node;
457  		entry = rb_entry(parent, struct extent_state, rb_node);
458  
459  		if (state->end < entry->start) {
460  			if (try_merge && end == entry->start &&
461  			    state->state == entry->state) {
462  				if (is_inode_io_tree(tree))
463  					btrfs_merge_delalloc_extent(
464  							extent_io_tree_to_inode(tree),
465  							state, entry);
466  				entry->start = state->start;
467  				merge_prev_state(tree, entry);
468  				state->state = 0;
469  				return entry;
470  			}
471  			node = &(*node)->rb_left;
472  		} else if (state->end > entry->end) {
473  			if (try_merge && entry->end == start &&
474  			    state->state == entry->state) {
475  				if (is_inode_io_tree(tree))
476  					btrfs_merge_delalloc_extent(
477  							extent_io_tree_to_inode(tree),
478  							state, entry);
479  				entry->end = state->end;
480  				merge_next_state(tree, entry);
481  				state->state = 0;
482  				return entry;
483  			}
484  			node = &(*node)->rb_right;
485  		} else {
486  			return ERR_PTR(-EEXIST);
487  		}
488  	}
489  
490  	rb_link_node(&state->rb_node, parent, node);
491  	rb_insert_color(&state->rb_node, &tree->state);
492  
493  	return state;
494  }
495  
496  /*
497   * Insert state to @tree to the location given by @node and @parent.
498   */
static void insert_state_fast(struct extent_io_tree *tree,
500  			      struct extent_state *state, struct rb_node **node,
501  			      struct rb_node *parent, unsigned bits,
502  			      struct extent_changeset *changeset)
503  {
504  	set_state_bits(tree, state, bits, changeset);
505  	rb_link_node(&state->rb_node, parent, node);
506  	rb_insert_color(&state->rb_node, &tree->state);
507  	merge_state(tree, state);
508  }
509  
510  /*
511   * Split a given extent state struct in two, inserting the preallocated
512   * struct 'prealloc' as the newly created second half.  'split' indicates an
513   * offset inside 'orig' where it should be split.
514   *
515   * Before calling,
516   * the tree has 'orig' at [orig->start, orig->end].  After calling, there
517   * are two extent state structs in the tree:
518   * prealloc: [orig->start, split - 1]
519   * orig: [ split, orig->end ]
520   *
521   * The tree locks are not taken by this function. They need to be held
522   * by the caller.
523   */
static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
525  		       struct extent_state *prealloc, u64 split)
526  {
527  	struct rb_node *parent = NULL;
528  	struct rb_node **node;
529  
530  	if (is_inode_io_tree(tree))
531  		btrfs_split_delalloc_extent(extent_io_tree_to_inode(tree), orig,
532  					    split);
533  
534  	prealloc->start = orig->start;
535  	prealloc->end = split - 1;
536  	prealloc->state = orig->state;
537  	orig->start = split;
538  
539  	parent = &orig->rb_node;
540  	node = &parent;
541  	while (*node) {
542  		struct extent_state *entry;
543  
544  		parent = *node;
545  		entry = rb_entry(parent, struct extent_state, rb_node);
546  
547  		if (prealloc->end < entry->start) {
548  			node = &(*node)->rb_left;
549  		} else if (prealloc->end > entry->end) {
550  			node = &(*node)->rb_right;
551  		} else {
552  			free_extent_state(prealloc);
553  			return -EEXIST;
554  		}
555  	}
556  
557  	rb_link_node(&prealloc->rb_node, parent, node);
558  	rb_insert_color(&prealloc->rb_node, &tree->state);
559  
560  	return 0;
561  }
562  
563  /*
564   * Utility function to clear some bits in an extent state struct.  It will
565   * optionally wake up anyone waiting on this state (wake == 1).
566   *
567   * If no bits are set on the state struct after clearing things, the
568   * struct is freed and removed from the tree
569   */
static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
571  					    struct extent_state *state,
572  					    u32 bits, int wake,
573  					    struct extent_changeset *changeset)
574  {
575  	struct extent_state *next;
576  	u32 bits_to_clear = bits & ~EXTENT_CTLBITS;
577  	int ret;
578  
579  	if (is_inode_io_tree(tree))
580  		btrfs_clear_delalloc_extent(extent_io_tree_to_inode(tree), state,
581  					    bits);
582  
583  	ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
584  	BUG_ON(ret < 0);
585  	state->state &= ~bits_to_clear;
586  	if (wake)
587  		wake_up(&state->wq);
588  	if (state->state == 0) {
589  		next = next_state(state);
590  		if (extent_state_in_tree(state)) {
591  			rb_erase(&state->rb_node, &tree->state);
592  			RB_CLEAR_NODE(&state->rb_node);
593  			free_extent_state(state);
594  		} else {
595  			WARN_ON(1);
596  		}
597  	} else {
598  		merge_state(tree, state);
599  		next = next_state(state);
600  	}
601  	return next;
602  }
603  
604  /*
605   * Detect if extent bits request NOWAIT semantics and set the gfp mask accordingly,
606   * unset the EXTENT_NOWAIT bit.
607   */
static void set_gfp_mask_from_bits(u32 *bits, gfp_t *mask)
609  {
610  	*mask = (*bits & EXTENT_NOWAIT ? GFP_NOWAIT : GFP_NOFS);
611  	*bits &= EXTENT_NOWAIT - 1;
612  }
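
/*
 * A minimal sketch of how a caller is expected to use EXTENT_NOWAIT (the bit
 * choice is an assumption for illustration): the control bit selects GFP_NOWAIT
 * instead of the default GFP_NOFS and is then stripped from the bits.
 *
 *	u32 bits = EXTENT_DELALLOC | EXTENT_NOWAIT;
 *	gfp_t mask;
 *
 *	set_gfp_mask_from_bits(&bits, &mask);
 *	now mask == GFP_NOWAIT and bits == EXTENT_DELALLOC
 */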
613  
614  /*
615   * Clear some bits on a range in the tree.  This may require splitting or
616   * inserting elements in the tree, so the gfp mask is used to indicate which
617   * allocations or sleeping are allowed.
618   *
619   * The range [start, end] is inclusive.
620   *
621   * This takes the tree lock, and returns 0 on success and < 0 on error.
622   */
int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
624  		       u32 bits, struct extent_state **cached_state,
625  		       struct extent_changeset *changeset)
626  {
627  	struct extent_state *state;
628  	struct extent_state *cached;
629  	struct extent_state *prealloc = NULL;
630  	u64 last_end;
631  	int err;
632  	int clear = 0;
633  	int wake;
634  	int delete = (bits & EXTENT_CLEAR_ALL_BITS);
635  	gfp_t mask;
636  
637  	set_gfp_mask_from_bits(&bits, &mask);
638  	btrfs_debug_check_extent_io_range(tree, start, end);
639  	trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);
640  
641  	if (delete)
642  		bits |= ~EXTENT_CTLBITS;
643  
644  	if (bits & EXTENT_DELALLOC)
645  		bits |= EXTENT_NORESERVE;
646  
647  	wake = ((bits & EXTENT_LOCK_BITS) ? 1 : 0);
648  	if (bits & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY))
649  		clear = 1;
650  again:
651  	if (!prealloc) {
652  		/*
653  		 * Don't care for allocation failure here because we might end
654  		 * up not needing the pre-allocated extent state at all, which
655  		 * is the case if we only have in the tree extent states that
		 * cover our input range and don't cover any other range.
657  		 * If we end up needing a new extent state we allocate it later.
658  		 */
659  		prealloc = alloc_extent_state(mask);
660  	}
661  
662  	spin_lock(&tree->lock);
663  	if (cached_state) {
664  		cached = *cached_state;
665  
666  		if (clear) {
667  			*cached_state = NULL;
668  			cached_state = NULL;
669  		}
670  
671  		if (cached && extent_state_in_tree(cached) &&
672  		    cached->start <= start && cached->end > start) {
673  			if (clear)
674  				refcount_dec(&cached->refs);
675  			state = cached;
676  			goto hit_next;
677  		}
678  		if (clear)
679  			free_extent_state(cached);
680  	}
681  
682  	/* This search will find the extents that end after our range starts. */
683  	state = tree_search(tree, start);
684  	if (!state)
685  		goto out;
686  hit_next:
687  	if (state->start > end)
688  		goto out;
689  	WARN_ON(state->end < start);
690  	last_end = state->end;
691  
692  	/* The state doesn't have the wanted bits, go ahead. */
693  	if (!(state->state & bits)) {
694  		state = next_state(state);
695  		goto next;
696  	}
697  
698  	/*
699  	 *     | ---- desired range ---- |
700  	 *  | state | or
701  	 *  | ------------- state -------------- |
702  	 *
703  	 * We need to split the extent we found, and may flip bits on second
704  	 * half.
705  	 *
706  	 * If the extent we found extends past our range, we just split and
707  	 * search again.  It'll get split again the next time though.
708  	 *
709  	 * If the extent we found is inside our range, we clear the desired bit
710  	 * on it.
711  	 */
712  
713  	if (state->start < start) {
714  		prealloc = alloc_extent_state_atomic(prealloc);
715  		if (!prealloc)
716  			goto search_again;
717  		err = split_state(tree, state, prealloc, start);
718  		if (err)
719  			extent_io_tree_panic(tree, state, "split", err);
720  
721  		prealloc = NULL;
722  		if (err)
723  			goto out;
724  		if (state->end <= end) {
725  			state = clear_state_bit(tree, state, bits, wake, changeset);
726  			goto next;
727  		}
728  		goto search_again;
729  	}
730  	/*
731  	 * | ---- desired range ---- |
732  	 *                        | state |
733  	 * We need to split the extent, and clear the bit on the first half.
734  	 */
735  	if (state->start <= end && state->end > end) {
736  		prealloc = alloc_extent_state_atomic(prealloc);
737  		if (!prealloc)
738  			goto search_again;
739  		err = split_state(tree, state, prealloc, end + 1);
740  		if (err)
741  			extent_io_tree_panic(tree, state, "split", err);
742  
743  		if (wake)
744  			wake_up(&state->wq);
745  
746  		clear_state_bit(tree, prealloc, bits, wake, changeset);
747  
748  		prealloc = NULL;
749  		goto out;
750  	}
751  
752  	state = clear_state_bit(tree, state, bits, wake, changeset);
753  next:
754  	if (last_end == (u64)-1)
755  		goto out;
756  	start = last_end + 1;
757  	if (start <= end && state && !need_resched())
758  		goto hit_next;
759  
760  search_again:
761  	if (start > end)
762  		goto out;
763  	spin_unlock(&tree->lock);
764  	if (gfpflags_allow_blocking(mask))
765  		cond_resched();
766  	goto again;
767  
768  out:
769  	spin_unlock(&tree->lock);
770  	if (prealloc)
771  		free_extent_state(prealloc);
772  
773  	return 0;
774  
775  }
776  
777  /*
778   * Wait for one or more bits to clear on a range in the state tree.
779   * The range [start, end] is inclusive.
780   * The tree lock is taken by this function
781   */
static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
783  			    u32 bits, struct extent_state **cached_state)
784  {
785  	struct extent_state *state;
786  
787  	btrfs_debug_check_extent_io_range(tree, start, end);
788  
789  	spin_lock(&tree->lock);
790  again:
791  	/*
792  	 * Maintain cached_state, as we may not remove it from the tree if there
793  	 * are more bits than the bits we're waiting on set on this state.
794  	 */
795  	if (cached_state && *cached_state) {
796  		state = *cached_state;
797  		if (extent_state_in_tree(state) &&
798  		    state->start <= start && start < state->end)
799  			goto process_node;
800  	}
801  	while (1) {
802  		/*
803  		 * This search will find all the extents that end after our
804  		 * range starts.
805  		 */
806  		state = tree_search(tree, start);
807  process_node:
808  		if (!state)
809  			break;
810  		if (state->start > end)
811  			goto out;
812  
813  		if (state->state & bits) {
814  			DEFINE_WAIT(wait);
815  
816  			start = state->start;
817  			refcount_inc(&state->refs);
818  			prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
819  			spin_unlock(&tree->lock);
820  			schedule();
821  			spin_lock(&tree->lock);
822  			finish_wait(&state->wq, &wait);
823  			free_extent_state(state);
824  			goto again;
825  		}
826  		start = state->end + 1;
827  
828  		if (start > end)
829  			break;
830  
831  		if (!cond_resched_lock(&tree->lock)) {
832  			state = next_state(state);
833  			goto process_node;
834  		}
835  	}
836  out:
837  	/* This state is no longer useful, clear it and free it up. */
838  	if (cached_state && *cached_state) {
839  		state = *cached_state;
840  		*cached_state = NULL;
841  		free_extent_state(state);
842  	}
843  	spin_unlock(&tree->lock);
844  }
845  
static void cache_state_if_flags(struct extent_state *state,
847  				 struct extent_state **cached_ptr,
848  				 unsigned flags)
849  {
850  	if (cached_ptr && !(*cached_ptr)) {
851  		if (!flags || (state->state & flags)) {
852  			*cached_ptr = state;
853  			refcount_inc(&state->refs);
854  		}
855  	}
856  }
857  
static void cache_state(struct extent_state *state,
859  			struct extent_state **cached_ptr)
860  {
861  	return cache_state_if_flags(state, cached_ptr, EXTENT_LOCK_BITS | EXTENT_BOUNDARY);
862  }
863  
864  /*
865   * Find the first state struct with 'bits' set after 'start', and return it.
 * tree->lock must be held.  NULL will be returned if nothing was found after
867   * 'start'.
868   */
static struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
870  							u64 start, u32 bits)
871  {
872  	struct extent_state *state;
873  
874  	/*
875  	 * This search will find all the extents that end after our range
876  	 * starts.
877  	 */
878  	state = tree_search(tree, start);
879  	while (state) {
880  		if (state->end >= start && (state->state & bits))
881  			return state;
882  		state = next_state(state);
883  	}
884  	return NULL;
885  }
886  
887  /*
888   * Find the first offset in the io tree with one or more @bits set.
889   *
890   * Note: If there are multiple bits set in @bits, any of them will match.
891   *
892   * Return true if we find something, and update @start_ret and @end_ret.
893   * Return false if we found nothing.
894   */
bool find_first_extent_bit(struct extent_io_tree *tree, u64 start,
896  			   u64 *start_ret, u64 *end_ret, u32 bits,
897  			   struct extent_state **cached_state)
898  {
899  	struct extent_state *state;
900  	bool ret = false;
901  
902  	spin_lock(&tree->lock);
903  	if (cached_state && *cached_state) {
904  		state = *cached_state;
905  		if (state->end == start - 1 && extent_state_in_tree(state)) {
906  			while ((state = next_state(state)) != NULL) {
907  				if (state->state & bits)
908  					break;
909  			}
910  			/*
911  			 * If we found the next extent state, clear cached_state
912  			 * so that we can cache the next extent state below and
913  			 * avoid future calls going over the same extent state
914  			 * again. If we haven't found any, clear as well since
915  			 * it's now useless.
916  			 */
917  			free_extent_state(*cached_state);
918  			*cached_state = NULL;
919  			if (state)
920  				goto got_it;
921  			goto out;
922  		}
923  		free_extent_state(*cached_state);
924  		*cached_state = NULL;
925  	}
926  
927  	state = find_first_extent_bit_state(tree, start, bits);
928  got_it:
929  	if (state) {
930  		cache_state_if_flags(state, cached_state, 0);
931  		*start_ret = state->start;
932  		*end_ret = state->end;
933  		ret = true;
934  	}
935  out:
936  	spin_unlock(&tree->lock);
937  	return ret;
938  }
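
/*
 * Hypothetical caller sketch: walk every range that has a given bit set,
 * reusing the cached state so consecutive calls don't restart the rbtree
 * search from the root. EXTENT_DIRTY is just an example bit.
 *
 *	struct extent_state *cached = NULL;
 *	u64 cur = 0, found_start, found_end;
 *
 *	while (find_first_extent_bit(tree, cur, &found_start, &found_end,
 *				     EXTENT_DIRTY, &cached)) {
 *		... process [found_start, found_end] ...
 *		cur = found_end + 1;
 *	}
 *	free_extent_state(cached);
 */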
939  
940  /*
941   * Find a contiguous area of bits
942   *
943   * @tree:      io tree to check
944   * @start:     offset to start the search from
945   * @start_ret: the first offset we found with the bits set
946   * @end_ret:   the final contiguous range of the bits that were set
947   * @bits:      bits to look for
948   *
949   * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
950   * to set bits appropriately, and then merge them again.  During this time it
951   * will drop the tree->lock, so use this helper if you want to find the actual
952   * contiguous area for given bits.  We will search to the first bit we find, and
953   * then walk down the tree until we find a non-contiguous area.  The area
954   * returned will be the full contiguous area with the bits set.
955   */
int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
957  			       u64 *start_ret, u64 *end_ret, u32 bits)
958  {
959  	struct extent_state *state;
960  	int ret = 1;
961  
962  	ASSERT(!btrfs_fs_incompat(extent_io_tree_to_fs_info(tree), NO_HOLES));
963  
964  	spin_lock(&tree->lock);
965  	state = find_first_extent_bit_state(tree, start, bits);
966  	if (state) {
967  		*start_ret = state->start;
968  		*end_ret = state->end;
969  		while ((state = next_state(state)) != NULL) {
970  			if (state->start > (*end_ret + 1))
971  				break;
972  			*end_ret = state->end;
973  		}
974  		ret = 0;
975  	}
976  	spin_unlock(&tree->lock);
977  	return ret;
978  }
979  
980  /*
981   * Find a contiguous range of bytes in the file marked as delalloc, not more
 * than 'max_bytes'.  start and end are used to return the range.
983   *
984   * True is returned if we find something, false if nothing was in the tree.
985   */
bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
987  			       u64 *end, u64 max_bytes,
988  			       struct extent_state **cached_state)
989  {
990  	struct extent_state *state;
991  	u64 cur_start = *start;
992  	bool found = false;
993  	u64 total_bytes = 0;
994  
995  	spin_lock(&tree->lock);
996  
997  	/*
998  	 * This search will find all the extents that end after our range
999  	 * starts.
1000  	 */
1001  	state = tree_search(tree, cur_start);
1002  	if (!state) {
1003  		*end = (u64)-1;
1004  		goto out;
1005  	}
1006  
1007  	while (state) {
1008  		if (found && (state->start != cur_start ||
1009  			      (state->state & EXTENT_BOUNDARY))) {
1010  			goto out;
1011  		}
1012  		if (!(state->state & EXTENT_DELALLOC)) {
1013  			if (!found)
1014  				*end = state->end;
1015  			goto out;
1016  		}
1017  		if (!found) {
1018  			*start = state->start;
1019  			*cached_state = state;
1020  			refcount_inc(&state->refs);
1021  		}
1022  		found = true;
1023  		*end = state->end;
1024  		cur_start = state->end + 1;
1025  		total_bytes += state->end - state->start + 1;
1026  		if (total_bytes >= max_bytes)
1027  			break;
1028  		state = next_state(state);
1029  	}
1030  out:
1031  	spin_unlock(&tree->lock);
1032  	return found;
1033  }
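
/*
 * Sketch of a hypothetical caller: *start is seeded with the offset of
 * interest, max_bytes caps how much delalloc is returned, and the cached
 * state (if set) holds a reference that must be dropped with
 * free_extent_state(). BTRFS_MAX_EXTENT_SIZE is used only as an example cap.
 *
 *	struct extent_state *cached = NULL;
 *	u64 dstart = offset;
 *	u64 dend;
 *
 *	if (btrfs_find_delalloc_range(tree, &dstart, &dend,
 *				      BTRFS_MAX_EXTENT_SIZE, &cached)) {
 *		... [dstart, dend] is a contiguous delalloc range ...
 *	}
 *	free_extent_state(cached);
 */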
1034  
1035  /*
1036   * Set some bits on a range in the tree.  This may require allocations or
1037   * sleeping. By default all allocations use GFP_NOFS, use EXTENT_NOWAIT for
1038   * GFP_NOWAIT.
1039   *
1040   * If any of the exclusive bits are set, this will fail with -EEXIST if some
1041   * part of the range already has the desired bits set.  The extent_state of the
1042   * existing range is returned in failed_state in this case, and the start of the
1043   * existing range is returned in failed_start.  failed_state is used as an
1044   * optimization for wait_extent_bit, failed_start must be used as the source of
1045   * truth as failed_state may have changed since we returned.
1046   *
 * [start, end] is inclusive. This takes the tree lock.
1048   */
static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
1050  			    u32 bits, u64 *failed_start,
1051  			    struct extent_state **failed_state,
1052  			    struct extent_state **cached_state,
1053  			    struct extent_changeset *changeset)
1054  {
1055  	struct extent_state *state;
1056  	struct extent_state *prealloc = NULL;
1057  	struct rb_node **p = NULL;
1058  	struct rb_node *parent = NULL;
1059  	int ret = 0;
1060  	u64 last_start;
1061  	u64 last_end;
1062  	u32 exclusive_bits = (bits & EXTENT_LOCK_BITS);
1063  	gfp_t mask;
1064  
1065  	set_gfp_mask_from_bits(&bits, &mask);
1066  	btrfs_debug_check_extent_io_range(tree, start, end);
1067  	trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);
1068  
1069  	if (exclusive_bits)
1070  		ASSERT(failed_start);
1071  	else
1072  		ASSERT(failed_start == NULL && failed_state == NULL);
1073  again:
1074  	if (!prealloc) {
1075  		/*
1076  		 * Don't care for allocation failure here because we might end
1077  		 * up not needing the pre-allocated extent state at all, which
1078  		 * is the case if we only have in the tree extent states that
		 * cover our input range and don't cover any other range.
1080  		 * If we end up needing a new extent state we allocate it later.
1081  		 */
1082  		prealloc = alloc_extent_state(mask);
1083  	}
1084  	/* Optimistically preallocate the extent changeset ulist node. */
1085  	if (changeset)
1086  		extent_changeset_prealloc(changeset, mask);
1087  
1088  	spin_lock(&tree->lock);
1089  	if (cached_state && *cached_state) {
1090  		state = *cached_state;
1091  		if (state->start <= start && state->end > start &&
1092  		    extent_state_in_tree(state))
1093  			goto hit_next;
1094  	}
1095  	/*
1096  	 * This search will find all the extents that end after our range
1097  	 * starts.
1098  	 */
1099  	state = tree_search_for_insert(tree, start, &p, &parent);
1100  	if (!state) {
1101  		prealloc = alloc_extent_state_atomic(prealloc);
1102  		if (!prealloc)
1103  			goto search_again;
1104  		prealloc->start = start;
1105  		prealloc->end = end;
1106  		insert_state_fast(tree, prealloc, p, parent, bits, changeset);
1107  		cache_state(prealloc, cached_state);
1108  		prealloc = NULL;
1109  		goto out;
1110  	}
1111  hit_next:
1112  	last_start = state->start;
1113  	last_end = state->end;
1114  
1115  	/*
1116  	 * | ---- desired range ---- |
1117  	 * | state |
1118  	 *
1119  	 * Just lock what we found and keep going
1120  	 */
1121  	if (state->start == start && state->end <= end) {
1122  		if (state->state & exclusive_bits) {
1123  			*failed_start = state->start;
1124  			cache_state(state, failed_state);
1125  			ret = -EEXIST;
1126  			goto out;
1127  		}
1128  
1129  		set_state_bits(tree, state, bits, changeset);
1130  		cache_state(state, cached_state);
1131  		merge_state(tree, state);
1132  		if (last_end == (u64)-1)
1133  			goto out;
1134  		start = last_end + 1;
1135  		state = next_state(state);
1136  		if (start < end && state && state->start == start &&
1137  		    !need_resched())
1138  			goto hit_next;
1139  		goto search_again;
1140  	}
1141  
1142  	/*
1143  	 *     | ---- desired range ---- |
1144  	 * | state |
1145  	 *   or
1146  	 * | ------------- state -------------- |
1147  	 *
1148  	 * We need to split the extent we found, and may flip bits on second
1149  	 * half.
1150  	 *
1151  	 * If the extent we found extends past our range, we just split and
1152  	 * search again.  It'll get split again the next time though.
1153  	 *
1154  	 * If the extent we found is inside our range, we set the desired bit
1155  	 * on it.
1156  	 */
1157  	if (state->start < start) {
1158  		if (state->state & exclusive_bits) {
1159  			*failed_start = start;
1160  			cache_state(state, failed_state);
1161  			ret = -EEXIST;
1162  			goto out;
1163  		}
1164  
1165  		/*
1166  		 * If this extent already has all the bits we want set, then
1167  		 * skip it, not necessary to split it or do anything with it.
1168  		 */
1169  		if ((state->state & bits) == bits) {
1170  			start = state->end + 1;
1171  			cache_state(state, cached_state);
1172  			goto search_again;
1173  		}
1174  
1175  		prealloc = alloc_extent_state_atomic(prealloc);
1176  		if (!prealloc)
1177  			goto search_again;
1178  		ret = split_state(tree, state, prealloc, start);
1179  		if (ret)
1180  			extent_io_tree_panic(tree, state, "split", ret);
1181  
1182  		prealloc = NULL;
1183  		if (ret)
1184  			goto out;
1185  		if (state->end <= end) {
1186  			set_state_bits(tree, state, bits, changeset);
1187  			cache_state(state, cached_state);
1188  			merge_state(tree, state);
1189  			if (last_end == (u64)-1)
1190  				goto out;
1191  			start = last_end + 1;
1192  			state = next_state(state);
1193  			if (start < end && state && state->start == start &&
1194  			    !need_resched())
1195  				goto hit_next;
1196  		}
1197  		goto search_again;
1198  	}
1199  	/*
1200  	 * | ---- desired range ---- |
1201  	 *     | state | or               | state |
1202  	 *
1203  	 * There's a hole, we need to insert something in it and ignore the
1204  	 * extent we found.
1205  	 */
1206  	if (state->start > start) {
1207  		u64 this_end;
1208  		struct extent_state *inserted_state;
1209  
1210  		if (end < last_start)
1211  			this_end = end;
1212  		else
1213  			this_end = last_start - 1;
1214  
1215  		prealloc = alloc_extent_state_atomic(prealloc);
1216  		if (!prealloc)
1217  			goto search_again;
1218  
1219  		/*
		 * Avoid freeing 'prealloc' if it can be merged with the later
1221  		 * extent.
1222  		 */
1223  		prealloc->start = start;
1224  		prealloc->end = this_end;
1225  		inserted_state = insert_state(tree, prealloc, bits, changeset);
1226  		if (IS_ERR(inserted_state)) {
1227  			ret = PTR_ERR(inserted_state);
1228  			extent_io_tree_panic(tree, prealloc, "insert", ret);
1229  		}
1230  
1231  		cache_state(inserted_state, cached_state);
1232  		if (inserted_state == prealloc)
1233  			prealloc = NULL;
1234  		start = this_end + 1;
1235  		goto search_again;
1236  	}
1237  	/*
1238  	 * | ---- desired range ---- |
1239  	 *                        | state |
1240  	 *
1241  	 * We need to split the extent, and set the bit on the first half
1242  	 */
1243  	if (state->start <= end && state->end > end) {
1244  		if (state->state & exclusive_bits) {
1245  			*failed_start = start;
1246  			cache_state(state, failed_state);
1247  			ret = -EEXIST;
1248  			goto out;
1249  		}
1250  
1251  		prealloc = alloc_extent_state_atomic(prealloc);
1252  		if (!prealloc)
1253  			goto search_again;
1254  		ret = split_state(tree, state, prealloc, end + 1);
1255  		if (ret)
1256  			extent_io_tree_panic(tree, state, "split", ret);
1257  
1258  		set_state_bits(tree, prealloc, bits, changeset);
1259  		cache_state(prealloc, cached_state);
1260  		merge_state(tree, prealloc);
1261  		prealloc = NULL;
1262  		goto out;
1263  	}
1264  
1265  search_again:
1266  	if (start > end)
1267  		goto out;
1268  	spin_unlock(&tree->lock);
1269  	if (gfpflags_allow_blocking(mask))
1270  		cond_resched();
1271  	goto again;
1272  
1273  out:
1274  	spin_unlock(&tree->lock);
1275  	if (prealloc)
1276  		free_extent_state(prealloc);
1277  
1278  	return ret;
1279  
1280  }
1281  
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
1283  		   u32 bits, struct extent_state **cached_state)
1284  {
1285  	return __set_extent_bit(tree, start, end, bits, NULL, NULL,
1286  				cached_state, NULL);
1287  }
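
/*
 * A hedged example of the usual set/clear pairing (bit choice and range are
 * assumptions): both calls default to GFP_NOFS allocations and can share a
 * cached extent state to speed up the second lookup.
 *
 *	struct extent_state *cached = NULL;
 *
 *	set_extent_bit(tree, start, end, EXTENT_DIRTY, &cached);
 *	...
 *	clear_extent_bit(tree, start, end, EXTENT_DIRTY, &cached);
 *	free_extent_state(cached);
 */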
1288  
1289  /*
1290   * Convert all bits in a given range from one bit to another
1291   *
1292   * @tree:	the io tree to search
1293   * @start:	the start offset in bytes
1294   * @end:	the end offset in bytes (inclusive)
1295   * @bits:	the bits to set in this range
1296   * @clear_bits:	the bits to clear in this range
1297   * @cached_state:	state that we're going to cache
1298   *
1299   * This will go through and set bits for the given range.  If any states exist
1300   * already in this range they are set with the given bit and cleared of the
1301   * clear_bits.  This is only meant to be used by things that are mergeable, ie.
1302   * converting from say DELALLOC to DIRTY.  This is not meant to be used with
1303   * boundary bits like LOCK.
1304   *
1305   * All allocations are done with GFP_NOFS.
1306   */
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
1308  		       u32 bits, u32 clear_bits,
1309  		       struct extent_state **cached_state)
1310  {
1311  	struct extent_state *state;
1312  	struct extent_state *prealloc = NULL;
1313  	struct rb_node **p = NULL;
1314  	struct rb_node *parent = NULL;
1315  	int ret = 0;
1316  	u64 last_start;
1317  	u64 last_end;
1318  	bool first_iteration = true;
1319  
1320  	btrfs_debug_check_extent_io_range(tree, start, end);
1321  	trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits,
1322  				       clear_bits);
1323  
1324  again:
1325  	if (!prealloc) {
1326  		/*
1327  		 * Best effort, don't worry if extent state allocation fails
1328  		 * here for the first iteration. We might have a cached state
1329  		 * that matches exactly the target range, in which case no
1330  		 * extent state allocations are needed. We'll only know this
1331  		 * after locking the tree.
1332  		 */
1333  		prealloc = alloc_extent_state(GFP_NOFS);
1334  		if (!prealloc && !first_iteration)
1335  			return -ENOMEM;
1336  	}
1337  
1338  	spin_lock(&tree->lock);
1339  	if (cached_state && *cached_state) {
1340  		state = *cached_state;
1341  		if (state->start <= start && state->end > start &&
1342  		    extent_state_in_tree(state))
1343  			goto hit_next;
1344  	}
1345  
1346  	/*
1347  	 * This search will find all the extents that end after our range
1348  	 * starts.
1349  	 */
1350  	state = tree_search_for_insert(tree, start, &p, &parent);
1351  	if (!state) {
1352  		prealloc = alloc_extent_state_atomic(prealloc);
1353  		if (!prealloc) {
1354  			ret = -ENOMEM;
1355  			goto out;
1356  		}
1357  		prealloc->start = start;
1358  		prealloc->end = end;
1359  		insert_state_fast(tree, prealloc, p, parent, bits, NULL);
1360  		cache_state(prealloc, cached_state);
1361  		prealloc = NULL;
1362  		goto out;
1363  	}
1364  hit_next:
1365  	last_start = state->start;
1366  	last_end = state->end;
1367  
1368  	/*
1369  	 * | ---- desired range ---- |
1370  	 * | state |
1371  	 *
1372  	 * Just lock what we found and keep going.
1373  	 */
1374  	if (state->start == start && state->end <= end) {
1375  		set_state_bits(tree, state, bits, NULL);
1376  		cache_state(state, cached_state);
1377  		state = clear_state_bit(tree, state, clear_bits, 0, NULL);
1378  		if (last_end == (u64)-1)
1379  			goto out;
1380  		start = last_end + 1;
1381  		if (start < end && state && state->start == start &&
1382  		    !need_resched())
1383  			goto hit_next;
1384  		goto search_again;
1385  	}
1386  
1387  	/*
1388  	 *     | ---- desired range ---- |
1389  	 * | state |
1390  	 *   or
1391  	 * | ------------- state -------------- |
1392  	 *
1393  	 * We need to split the extent we found, and may flip bits on second
1394  	 * half.
1395  	 *
1396  	 * If the extent we found extends past our range, we just split and
1397  	 * search again.  It'll get split again the next time though.
1398  	 *
1399  	 * If the extent we found is inside our range, we set the desired bit
1400  	 * on it.
1401  	 */
1402  	if (state->start < start) {
1403  		prealloc = alloc_extent_state_atomic(prealloc);
1404  		if (!prealloc) {
1405  			ret = -ENOMEM;
1406  			goto out;
1407  		}
1408  		ret = split_state(tree, state, prealloc, start);
1409  		if (ret)
1410  			extent_io_tree_panic(tree, state, "split", ret);
1411  		prealloc = NULL;
1412  		if (ret)
1413  			goto out;
1414  		if (state->end <= end) {
1415  			set_state_bits(tree, state, bits, NULL);
1416  			cache_state(state, cached_state);
1417  			state = clear_state_bit(tree, state, clear_bits, 0, NULL);
1418  			if (last_end == (u64)-1)
1419  				goto out;
1420  			start = last_end + 1;
1421  			if (start < end && state && state->start == start &&
1422  			    !need_resched())
1423  				goto hit_next;
1424  		}
1425  		goto search_again;
1426  	}
1427  	/*
1428  	 * | ---- desired range ---- |
1429  	 *     | state | or               | state |
1430  	 *
1431  	 * There's a hole, we need to insert something in it and ignore the
1432  	 * extent we found.
1433  	 */
1434  	if (state->start > start) {
1435  		u64 this_end;
1436  		struct extent_state *inserted_state;
1437  
1438  		if (end < last_start)
1439  			this_end = end;
1440  		else
1441  			this_end = last_start - 1;
1442  
1443  		prealloc = alloc_extent_state_atomic(prealloc);
1444  		if (!prealloc) {
1445  			ret = -ENOMEM;
1446  			goto out;
1447  		}
1448  
1449  		/*
		 * Avoid freeing 'prealloc' if it can be merged with the later
1451  		 * extent.
1452  		 */
1453  		prealloc->start = start;
1454  		prealloc->end = this_end;
1455  		inserted_state = insert_state(tree, prealloc, bits, NULL);
1456  		if (IS_ERR(inserted_state)) {
1457  			ret = PTR_ERR(inserted_state);
1458  			extent_io_tree_panic(tree, prealloc, "insert", ret);
1459  		}
1460  		cache_state(inserted_state, cached_state);
1461  		if (inserted_state == prealloc)
1462  			prealloc = NULL;
1463  		start = this_end + 1;
1464  		goto search_again;
1465  	}
1466  	/*
1467  	 * | ---- desired range ---- |
1468  	 *                        | state |
1469  	 *
1470  	 * We need to split the extent, and set the bit on the first half.
1471  	 */
1472  	if (state->start <= end && state->end > end) {
1473  		prealloc = alloc_extent_state_atomic(prealloc);
1474  		if (!prealloc) {
1475  			ret = -ENOMEM;
1476  			goto out;
1477  		}
1478  
1479  		ret = split_state(tree, state, prealloc, end + 1);
1480  		if (ret)
1481  			extent_io_tree_panic(tree, state, "split", ret);
1482  
1483  		set_state_bits(tree, prealloc, bits, NULL);
1484  		cache_state(prealloc, cached_state);
1485  		clear_state_bit(tree, prealloc, clear_bits, 0, NULL);
1486  		prealloc = NULL;
1487  		goto out;
1488  	}
1489  
1490  search_again:
1491  	if (start > end)
1492  		goto out;
1493  	spin_unlock(&tree->lock);
1494  	cond_resched();
1495  	first_iteration = false;
1496  	goto again;
1497  
1498  out:
1499  	spin_unlock(&tree->lock);
1500  	if (prealloc)
1501  		free_extent_state(prealloc);
1502  
1503  	return ret;
1504  }
1505  
1506  /*
1507   * Find the first range that has @bits not set. This range could start before
1508   * @start.
1509   *
1510   * @tree:      the tree to search
1511   * @start:     offset at/after which the found extent should start
1512   * @start_ret: records the beginning of the range
1513   * @end_ret:   records the end of the range (inclusive)
1514   * @bits:      the set of bits which must be unset
1515   *
1516   * Since unallocated range is also considered one which doesn't have the bits
1517   * set it's possible that @end_ret contains -1, this happens in case the range
1518   * spans (last_range_end, end of device]. In this case it's up to the caller to
1519   * trim @end_ret to the appropriate size.
1520   */
void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
1522  				 u64 *start_ret, u64 *end_ret, u32 bits)
1523  {
1524  	struct extent_state *state;
1525  	struct extent_state *prev = NULL, *next = NULL;
1526  
1527  	spin_lock(&tree->lock);
1528  
1529  	/* Find first extent with bits cleared */
1530  	while (1) {
1531  		state = tree_search_prev_next(tree, start, &prev, &next);
1532  		if (!state && !next && !prev) {
1533  			/*
1534  			 * Tree is completely empty, send full range and let
1535  			 * caller deal with it
1536  			 */
1537  			*start_ret = 0;
1538  			*end_ret = -1;
1539  			goto out;
1540  		} else if (!state && !next) {
1541  			/*
1542  			 * We are past the last allocated chunk, set start at
1543  			 * the end of the last extent.
1544  			 */
1545  			*start_ret = prev->end + 1;
1546  			*end_ret = -1;
1547  			goto out;
1548  		} else if (!state) {
1549  			state = next;
1550  		}
1551  
1552  		/*
1553  		 * At this point 'state' either contains 'start' or start is
1554  		 * before 'state'
1555  		 */
1556  		if (in_range(start, state->start, state->end - state->start + 1)) {
1557  			if (state->state & bits) {
1558  				/*
1559  				 * |--range with bits sets--|
1560  				 *    |
1561  				 *    start
1562  				 */
1563  				start = state->end + 1;
1564  			} else {
1565  				/*
1566  				 * 'start' falls within a range that doesn't
1567  				 * have the bits set, so take its start as the
1568  				 * beginning of the desired range
1569  				 *
1570  				 * |--range with bits cleared----|
1571  				 *      |
1572  				 *      start
1573  				 */
1574  				*start_ret = state->start;
1575  				break;
1576  			}
1577  		} else {
1578  			/*
1579  			 * |---prev range---|---hole/unset---|---node range---|
1580  			 *                          |
1581  			 *                        start
1582  			 *
1583  			 *                        or
1584  			 *
1585  			 * |---hole/unset--||--first node--|
1586  			 * 0   |
1587  			 *    start
1588  			 */
1589  			if (prev)
1590  				*start_ret = prev->end + 1;
1591  			else
1592  				*start_ret = 0;
1593  			break;
1594  		}
1595  	}
1596  
1597  	/*
1598  	 * Find the longest stretch from start until an entry which has the
1599  	 * bits set
1600  	 */
1601  	while (state) {
1602  		if (state->end >= start && !(state->state & bits)) {
1603  			*end_ret = state->end;
1604  		} else {
1605  			*end_ret = state->start - 1;
1606  			break;
1607  		}
1608  		state = next_state(state);
1609  	}
1610  out:
1611  	spin_unlock(&tree->lock);
1612  }
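
/*
 * Illustrative use (the bit is an assumption): ask for the first range around
 * or after @start that does not have CHUNK_ALLOCATED set; the result may begin
 * before @start, and @hole_end can come back as -1 past the last entry, so the
 * caller trims it to the device size itself.
 *
 *	u64 hole_start, hole_end;
 *
 *	find_first_clear_extent_bit(tree, start, &hole_start, &hole_end,
 *				    CHUNK_ALLOCATED);
 */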
1613  
1614  /*
1615   * Count the number of bytes in the tree that have a given bit(s) set for a
1616   * given range.
1617   *
1618   * @tree:         The io tree to search.
1619   * @start:        The start offset of the range. This value is updated to the
1620   *                offset of the first byte found with the given bit(s), so it
1621   *                can end up being bigger than the initial value.
1622   * @search_end:   The end offset (inclusive value) of the search range.
 * @max_bytes:    The maximum byte count we are interested in. The search stops
1624   *                once it reaches this count.
1625   * @bits:         The bits the range must have in order to be accounted for.
1626   *                If multiple bits are set, then only subranges that have all
1627   *                the bits set are accounted for.
1628   * @contig:       Indicate if we should ignore holes in the range or not. If
1629   *                this is true, then stop once we find a hole.
1630   * @cached_state: A cached state to be used across multiple calls to this
1631   *                function in order to speedup searches. Use NULL if this is
1632   *                called only once or if each call does not start where the
1633   *                previous one ended.
1634   *
1635   * Returns the total number of bytes found within the given range that have
1636   * all given bits set. If the returned number of bytes is greater than zero
1637   * then @start is updated with the offset of the first byte with the bits set.
1638   */
1639  u64 count_range_bits(struct extent_io_tree *tree,
1640  		     u64 *start, u64 search_end, u64 max_bytes,
1641  		     u32 bits, int contig,
1642  		     struct extent_state **cached_state)
1643  {
1644  	struct extent_state *state = NULL;
1645  	struct extent_state *cached;
1646  	u64 cur_start = *start;
1647  	u64 total_bytes = 0;
1648  	u64 last = 0;
1649  	int found = 0;
1650  
1651  	if (WARN_ON(search_end < cur_start))
1652  		return 0;
1653  
1654  	spin_lock(&tree->lock);
1655  
1656  	if (!cached_state || !*cached_state)
1657  		goto search;
1658  
1659  	cached = *cached_state;
1660  
1661  	if (!extent_state_in_tree(cached))
1662  		goto search;
1663  
1664  	if (cached->start <= cur_start && cur_start <= cached->end) {
1665  		state = cached;
1666  	} else if (cached->start > cur_start) {
1667  		struct extent_state *prev;
1668  
1669  		/*
1670  		 * The cached state starts after our search range's start. Check
1671  		 * if the previous state record starts at or before the range we
1672  		 * are looking for, and if so, use it - this is a common case
1673  		 * when there are holes between records in the tree. If there is
1674  		 * no previous state record, we can start from our cached state.
1675  		 */
1676  		prev = prev_state(cached);
1677  		if (!prev)
1678  			state = cached;
1679  		else if (prev->start <= cur_start && cur_start <= prev->end)
1680  			state = prev;
1681  	}
1682  
1683  	/*
1684  	 * This search will find the first state record that ends at or after
1685  	 * the start of our range.
1686  	 */
1687  search:
1688  	if (!state)
1689  		state = tree_search(tree, cur_start);
1690  
1691  	while (state) {
1692  		if (state->start > search_end)
1693  			break;
1694  		if (contig && found && state->start > last + 1)
1695  			break;
1696  		if (state->end >= cur_start && (state->state & bits) == bits) {
1697  			total_bytes += min(search_end, state->end) + 1 -
1698  				       max(cur_start, state->start);
1699  			if (total_bytes >= max_bytes)
1700  				break;
1701  			if (!found) {
1702  				*start = max(cur_start, state->start);
1703  				found = 1;
1704  			}
1705  			last = state->end;
1706  		} else if (contig && found) {
1707  			break;
1708  		}
1709  		state = next_state(state);
1710  	}
1711  
1712  	if (cached_state) {
1713  		free_extent_state(*cached_state);
1714  		*cached_state = state;
1715  		if (state)
1716  			refcount_inc(&state->refs);
1717  	}
1718  
1719  	spin_unlock(&tree->lock);
1720  
1721  	return total_bytes;
1722  }
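
/*
 * Editor's note, an illustrative sketch that is not part of the original
 * file: counting how many bytes of [start, end] have EXTENT_DELALLOC set.
 * Only count_range_bits() and free_extent_state() come from this file; the
 * helper name and the chosen bit are hypothetical.
 */
static u64 __maybe_unused example_count_delalloc(struct extent_io_tree *tree,
						 u64 start, u64 end)
{
	struct extent_state *cached = NULL;
	u64 found_start = start;
	u64 bytes;

	/* No byte limit, non-contiguous accounting (holes are skipped). */
	bytes = count_range_bits(tree, &found_start, end, (u64)-1,
				 EXTENT_DELALLOC, 0, &cached);
	if (bytes > 0)
		pr_debug("%llu delalloc bytes starting at %llu\n",
			 bytes, found_start);

	/* Drop the reference taken on the returned cached state, if any. */
	if (cached)
		free_extent_state(cached);

	return bytes;
}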
1723  
1724  /*
1725   * Check if any part of the range [@start, @end] has the single @bit set.
1726   */
1727  bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit)
1728  {
1729  	struct extent_state *state = NULL;
1730  	bool bitset = false;
1731  
1732  	ASSERT(is_power_of_2(bit));
1733  
1734  	spin_lock(&tree->lock);
1735  	state = tree_search(tree, start);
1736  	while (state && start <= end) {
1737  		if (state->start > end)
1738  			break;
1739  
1740  		if (state->state & bit) {
1741  			bitset = true;
1742  			break;
1743  		}
1744  
1745  		/* If state->end is (u64)-1, start will overflow to 0 */
1746  		start = state->end + 1;
1747  		if (start > end || start == 0)
1748  			break;
1749  		state = next_state(state);
1750  	}
1751  	spin_unlock(&tree->lock);
1752  	return bitset;
1753  }
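
/*
 * Editor's note, an illustrative sketch that is not part of the original
 * file: using test_range_bit_exists() to ask whether any byte of a range is
 * currently locked.  The helper name is hypothetical; EXTENT_LOCKED comes
 * from extent-io-tree.h.
 */
static bool __maybe_unused example_range_partially_locked(struct extent_io_tree *tree,
							   u64 start, u64 end)
{
	/* True if at least one byte in [start, end] has EXTENT_LOCKED set. */
	return test_range_bit_exists(tree, start, end, EXTENT_LOCKED);
}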
1754  
1755  /*
1756   * Check if the whole range [@start, @end] (end inclusive) has the single @bit set.
1757   */
1758  bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit,
1759  		    struct extent_state *cached)
1760  {
1761  	struct extent_state *state = NULL;
1762  	bool bitset = true;
1763  
1764  	ASSERT(is_power_of_2(bit));
1765  
1766  	spin_lock(&tree->lock);
1767  	if (cached && extent_state_in_tree(cached) && cached->start <= start &&
1768  	    cached->end > start)
1769  		state = cached;
1770  	else
1771  		state = tree_search(tree, start);
1772  	while (state && start <= end) {
1773  		if (state->start > start) {
1774  			bitset = false;
1775  			break;
1776  		}
1777  
1778  		if (state->start > end)
1779  			break;
1780  
1781  		if ((state->state & bit) == 0) {
1782  			bitset = false;
1783  			break;
1784  		}
1785  
1786  		if (state->end == (u64)-1)
1787  			break;
1788  
1789  		/*
1790  		 * Last entry (if state->end is (u64)-1 and overflow happens),
1791  		 * or next entry starts after the range.
1792  		 */
1793  		start = state->end + 1;
1794  		if (start > end || start == 0)
1795  			break;
1796  		state = next_state(state);
1797  	}
1798  
1799  	/* We ran out of states and were still inside of our range. */
1800  	if (!state)
1801  		bitset = false;
1802  	spin_unlock(&tree->lock);
1803  	return bitset;
1804  }
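
/*
 * Editor's note, an illustrative sketch that is not part of the original
 * file: in contrast to test_range_bit_exists() above, test_range_bit() only
 * returns true when every byte of the range carries the bit.  The helper name
 * and the EXTENT_UPTODATE bit are only for illustration.
 */
static bool __maybe_unused example_range_fully_uptodate(struct extent_io_tree *tree,
							u64 start, u64 end)
{
	/* No cached state hint is available, so pass NULL and search the tree. */
	return test_range_bit(tree, start, end, EXTENT_UPTODATE, NULL);
}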
1805  
1806  /* Wrappers around set/clear extent bit */
1807  int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1808  			   u32 bits, struct extent_changeset *changeset)
1809  {
1810  	/*
1811  	 * We don't support EXTENT_LOCK_BITS yet, because the changeset records
1812  	 * any bit that changes: for EXTENT_LOCK_BITS this would either fail
1813  	 * with -EEXIST or end up recording the whole range in the changeset.
1814  	 */
1815  	ASSERT(!(bits & EXTENT_LOCK_BITS));
1816  
1817  	return __set_extent_bit(tree, start, end, bits, NULL, NULL, NULL, changeset);
1818  }
1819  
1820  int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1821  			     u32 bits, struct extent_changeset *changeset)
1822  {
1823  	/*
1824  	 * Don't support EXTENT_LOCK_BITS case, same reason as
1825  	 * set_record_extent_bits().
1826  	 */
1827  	ASSERT(!(bits & EXTENT_LOCK_BITS));
1828  
1829  	return __clear_extent_bit(tree, start, end, bits, NULL, changeset);
1830  }
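
/*
 * Editor's note, an illustrative sketch that is not part of the original
 * file: recording how many bytes actually changed while setting a bit,
 * similar to how the qgroup reservation code uses these wrappers.  It assumes
 * the extent_changeset_alloc()/extent_changeset_free() helpers and the
 * bytes_changed member declared in extent-io-tree.h; the function name and
 * the EXTENT_QGROUP_RESERVED bit choice are illustrative.
 */
static int __maybe_unused example_record_reserved(struct extent_io_tree *tree,
						  u64 start, u64 end)
{
	struct extent_changeset *changeset;
	int ret;

	changeset = extent_changeset_alloc();
	if (!changeset)
		return -ENOMEM;

	/* Only bytes that did not already have the bit set are accounted. */
	ret = set_record_extent_bits(tree, start, end, EXTENT_QGROUP_RESERVED,
				     changeset);
	if (!ret)
		pr_debug("newly reserved bytes: %llu\n", changeset->bytes_changed);

	extent_changeset_free(changeset);
	return ret;
}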
1831  
1832  bool __try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
1833  		       struct extent_state **cached)
1834  {
1835  	int err;
1836  	u64 failed_start;
1837  
1838  	err = __set_extent_bit(tree, start, end, bits, &failed_start,
1839  			       NULL, cached, NULL);
1840  	if (err == -EEXIST) {
1841  		if (failed_start > start)
1842  			clear_extent_bit(tree, start, failed_start - 1, bits, cached);
1843  		return false;
1844  	}
1845  	return true;
1846  }
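
/*
 * Editor's note, an illustrative sketch that is not part of the original
 * file: a non-blocking attempt to lock a range, as a no-wait path might do.
 * Real callers normally use the try_lock_extent()/unlock_extent() wrappers
 * assumed to be declared in extent-io-tree.h; the helper name here is
 * hypothetical.
 */
static bool __maybe_unused example_try_lock_range(struct extent_io_tree *tree,
						  u64 start, u64 end)
{
	struct extent_state *cached = NULL;

	/* On failure nothing remains locked, see the -EEXIST handling above. */
	if (!__try_lock_extent(tree, start, end, EXTENT_LOCKED, &cached))
		return false;

	/* ... work that requires the range to be locked goes here ... */

	unlock_extent(tree, start, end, &cached);
	return true;
}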
1847  
1848  /*
1849   * Either insert or lock the state struct for the range [start, end]. If any
1850   * part of the range is already locked, wait for it to be unlocked and retry.
1851   */
1852  int __lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
1853  		  struct extent_state **cached_state)
1854  {
1855  	struct extent_state *failed_state = NULL;
1856  	int err;
1857  	u64 failed_start;
1858  
1859  	err = __set_extent_bit(tree, start, end, bits, &failed_start,
1860  			       &failed_state, cached_state, NULL);
1861  	while (err == -EEXIST) {
1862  		if (failed_start != start)
1863  			clear_extent_bit(tree, start, failed_start - 1,
1864  					 bits, cached_state);
1865  
1866  		wait_extent_bit(tree, failed_start, end, bits, &failed_state);
1867  		err = __set_extent_bit(tree, start, end, bits,
1868  				       &failed_start, &failed_state,
1869  				       cached_state, NULL);
1870  	}
1871  	return err;
1872  }
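
/*
 * Editor's note, an illustrative sketch that is not part of the original
 * file: the common blocking lock/unlock pattern, reusing the cached state
 * returned by the lock to make the unlock cheaper.  Real callers normally use
 * the lock_extent()/unlock_extent() wrappers assumed to be declared in
 * extent-io-tree.h; the helper name is hypothetical and error handling is
 * omitted.
 */
static void __maybe_unused example_locked_region(struct extent_io_tree *tree,
						 u64 start, u64 end)
{
	struct extent_state *cached = NULL;

	/* Sleeps until every byte of [start, end] could be locked. */
	__lock_extent(tree, start, end, EXTENT_LOCKED, &cached);

	/* ... the range is exclusively ours here ... */

	/* The cached state lets the unlock skip the tree search. */
	unlock_extent(tree, start, end, &cached);
}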
1873  
1874  void __cold extent_state_free_cachep(void)
1875  {
1876  	btrfs_extent_state_leak_debug_check();
1877  	kmem_cache_destroy(extent_state_cache);
1878  }
1879  
1880  int __init extent_state_init_cachep(void)
1881  {
1882  	extent_state_cache = kmem_cache_create("btrfs_extent_state",
1883  					       sizeof(struct extent_state), 0, 0,
1884  					       NULL);
1885  	if (!extent_state_cache)
1886  		return -ENOMEM;
1887  
1888  	return 0;
1889  }
1890