1  // SPDX-License-Identifier: GPL-2.0
2  /*
3   * Copyright (C) 2007 Oracle.  All rights reserved.
4   */
5  
6  #include <linux/sched.h>
7  #include <linux/sched/signal.h>
8  #include <linux/pagemap.h>
9  #include <linux/writeback.h>
10  #include <linux/blkdev.h>
11  #include <linux/sort.h>
12  #include <linux/rcupdate.h>
13  #include <linux/kthread.h>
14  #include <linux/slab.h>
15  #include <linux/ratelimit.h>
16  #include <linux/percpu_counter.h>
17  #include <linux/lockdep.h>
18  #include <linux/crc32c.h>
19  #include "ctree.h"
20  #include "extent-tree.h"
21  #include "transaction.h"
22  #include "disk-io.h"
23  #include "print-tree.h"
24  #include "volumes.h"
25  #include "raid56.h"
26  #include "locking.h"
27  #include "free-space-cache.h"
28  #include "free-space-tree.h"
29  #include "qgroup.h"
30  #include "ref-verify.h"
31  #include "space-info.h"
32  #include "block-rsv.h"
33  #include "discard.h"
34  #include "zoned.h"
35  #include "dev-replace.h"
36  #include "fs.h"
37  #include "accessors.h"
38  #include "root-tree.h"
39  #include "file-item.h"
40  #include "orphan.h"
41  #include "tree-checker.h"
42  #include "raid-stripe-tree.h"
43  
44  #undef SCRAMBLE_DELAYED_REFS
45  
46  
47  static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
48  			       struct btrfs_delayed_ref_head *href,
49  			       struct btrfs_delayed_ref_node *node,
50  			       struct btrfs_delayed_extent_op *extra_op);
51  static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
52  				    struct extent_buffer *leaf,
53  				    struct btrfs_extent_item *ei);
54  static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
55  				      u64 parent, u64 root_objectid,
56  				      u64 flags, u64 owner, u64 offset,
57  				      struct btrfs_key *ins, int ref_mod, u64 oref_root);
58  static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
59  				     struct btrfs_delayed_ref_node *node,
60  				     struct btrfs_delayed_extent_op *extent_op);
61  static int find_next_key(struct btrfs_path *path, int level,
62  			 struct btrfs_key *key);
63  
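/* Return true if the block group's flags contain all of the requested bits. */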
64  static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
65  {
66  	return (cache->flags & bits) == bits;
67  }
68  
69  /* simple helper to search for an existing data extent at a given offset */
70  int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
71  {
72  	struct btrfs_root *root = btrfs_extent_root(fs_info, start);
73  	int ret;
74  	struct btrfs_key key;
75  	struct btrfs_path *path;
76  
77  	path = btrfs_alloc_path();
78  	if (!path)
79  		return -ENOMEM;
80  
81  	key.objectid = start;
82  	key.offset = len;
83  	key.type = BTRFS_EXTENT_ITEM_KEY;
84  	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
85  	btrfs_free_path(path);
86  	return ret;
87  }
88  
89  /*
90   * helper function to lookup reference count and flags of a tree block.
91   *
92   * the head node for delayed ref is used to store the sum of all the
93   * reference count modifications queued up in the rbtree. the head
94   * node may also store the extent flags to set. This way you can check
95   * to see what the reference count and extent flags will be once all of
96   * the delayed refs have been processed.
97   */
98  int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
99  			     struct btrfs_fs_info *fs_info, u64 bytenr,
100  			     u64 offset, int metadata, u64 *refs, u64 *flags,
101  			     u64 *owning_root)
102  {
103  	struct btrfs_root *extent_root;
104  	struct btrfs_delayed_ref_head *head;
105  	struct btrfs_delayed_ref_root *delayed_refs;
106  	struct btrfs_path *path;
107  	struct btrfs_key key;
108  	u64 num_refs;
109  	u64 extent_flags;
110  	u64 owner = 0;
111  	int ret;
112  
113  	/*
114  	 * If we don't have skinny metadata, don't bother doing anything
115  	 * different
116  	 */
117  	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
118  		offset = fs_info->nodesize;
119  		metadata = 0;
120  	}
121  
122  	path = btrfs_alloc_path();
123  	if (!path)
124  		return -ENOMEM;
125  
126  search_again:
127  	key.objectid = bytenr;
128  	key.offset = offset;
129  	if (metadata)
130  		key.type = BTRFS_METADATA_ITEM_KEY;
131  	else
132  		key.type = BTRFS_EXTENT_ITEM_KEY;
133  
134  	extent_root = btrfs_extent_root(fs_info, bytenr);
135  	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
136  	if (ret < 0)
137  		goto out_free;
138  
139  	if (ret > 0 && key.type == BTRFS_METADATA_ITEM_KEY) {
140  		if (path->slots[0]) {
141  			path->slots[0]--;
142  			btrfs_item_key_to_cpu(path->nodes[0], &key,
143  					      path->slots[0]);
144  			if (key.objectid == bytenr &&
145  			    key.type == BTRFS_EXTENT_ITEM_KEY &&
146  			    key.offset == fs_info->nodesize)
147  				ret = 0;
148  		}
149  	}
150  
151  	if (ret == 0) {
152  		struct extent_buffer *leaf = path->nodes[0];
153  		struct btrfs_extent_item *ei;
154  		const u32 item_size = btrfs_item_size(leaf, path->slots[0]);
155  
156  		if (unlikely(item_size < sizeof(*ei))) {
157  			ret = -EUCLEAN;
158  			btrfs_err(fs_info,
159  			"unexpected extent item size, has %u expect >= %zu",
160  				  item_size, sizeof(*ei));
161  			btrfs_abort_transaction(trans, ret);
162  			goto out_free;
163  		}
164  
165  		ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
166  		num_refs = btrfs_extent_refs(leaf, ei);
167  		if (unlikely(num_refs == 0)) {
168  			ret = -EUCLEAN;
169  			btrfs_err(fs_info,
170  		"unexpected zero reference count for extent item (%llu %u %llu)",
171  				  key.objectid, key.type, key.offset);
172  			btrfs_abort_transaction(trans, ret);
173  			goto out_free;
174  		}
175  		extent_flags = btrfs_extent_flags(leaf, ei);
176  		owner = btrfs_get_extent_owner_root(fs_info, leaf, path->slots[0]);
177  	} else {
178  		num_refs = 0;
179  		extent_flags = 0;
180  		ret = 0;
181  	}
182  
183  	delayed_refs = &trans->transaction->delayed_refs;
184  	spin_lock(&delayed_refs->lock);
185  	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
186  	if (head) {
187  		if (!mutex_trylock(&head->mutex)) {
188  			refcount_inc(&head->refs);
189  			spin_unlock(&delayed_refs->lock);
190  
191  			btrfs_release_path(path);
192  
193  			/*
194  			 * Mutex was contended, block until it's released and try
195  			 * again
196  			 */
197  			mutex_lock(&head->mutex);
198  			mutex_unlock(&head->mutex);
199  			btrfs_put_delayed_ref_head(head);
200  			goto search_again;
201  		}
202  		spin_lock(&head->lock);
203  		if (head->extent_op && head->extent_op->update_flags)
204  			extent_flags |= head->extent_op->flags_to_set;
205  
206  		num_refs += head->ref_mod;
207  		spin_unlock(&head->lock);
208  		mutex_unlock(&head->mutex);
209  	}
210  	spin_unlock(&delayed_refs->lock);
211  
212  	WARN_ON(num_refs == 0);
213  	if (refs)
214  		*refs = num_refs;
215  	if (flags)
216  		*flags = extent_flags;
217  	if (owning_root)
218  		*owning_root = owner;
219  out_free:
220  	btrfs_free_path(path);
221  	return ret;
222  }
223  
224  /*
225   * Back reference rules.  Back refs have three main goals:
226   *
227   * 1) differentiate between all holders of references to an extent so that
228   *    when a reference is dropped we can make sure it was a valid reference
229   *    before freeing the extent.
230   *
231   * 2) Provide enough information to quickly find the holders of an extent
232   *    if we notice a given block is corrupted or bad.
233   *
234   * 3) Make it easy to migrate blocks for FS shrinking or storage pool
235   *    maintenance.  This is actually the same as #2, but with a slightly
236   *    different use case.
237   *
238   * There are two kinds of back refs. The implicit back refs is optimized
239   * for pointers in non-shared tree blocks. For a given pointer in a block,
240   * back refs of this kind provide information about the block's owner tree
241   * and the pointer's key. This information allows us to find the block by
242   * b-tree searching. The full back refs is for pointers in tree blocks not
243   * referenced by their owner trees. The location of tree block is recorded
244   * in the back refs. Actually the full back refs is generic, and can be
245   * used in all cases the implicit back refs is used. The major shortcoming
246   * of the full back refs is its overhead. Every time a tree block gets
247   * COWed, we have to update back refs entry for all pointers in it.
248   *
249   * For a newly allocated tree block, we use implicit back refs for
250   * pointers in it. This means most tree related operations only involve
251   * implicit back refs. For a tree block created in old transaction, the
252   * only way to drop a reference to it is COW it. So we can detect the
253   * event that tree block loses its owner tree's reference and do the
254   * back refs conversion.
255   *
256   * When a tree block is COWed through a tree, there are four cases:
257   *
258   * The reference count of the block is one and the tree is the block's
259   * owner tree. Nothing to do in this case.
260   *
261   * The reference count of the block is one and the tree is not the
262   * block's owner tree. In this case, full back refs is used for pointers
263   * in the block. Remove these full back refs, add implicit back refs for
264   * every pointer in the new block.
265   *
266   * The reference count of the block is greater than one and the tree is
267   * the block's owner tree. In this case, implicit back refs is used for
268   * pointers in the block. Add full back refs for every pointer in the
269   * block, increase lower level extents' reference counts. The original
270   * implicit back refs are inherited by the new block.
271   *
272   * The reference count of the block is greater than one and the tree is
273   * not the block's owner tree. Add implicit back refs for every pointer in
274   * the new block, increase lower level extents' reference count.
275   *
276   * Back Reference Key composing:
277   *
278   * The key objectid corresponds to the first byte in the extent,
279   * The key type is used to differentiate between types of back refs.
280   * There are different meanings of the key offset for different types
281   * of back refs.
282   *
283   * File extents can be referenced by:
284   *
285   * - multiple snapshots, subvolumes, or different generations in one subvol
286   * - different files inside a single subvolume
287   * - different offsets inside a file (bookend extents in file.c)
288   *
289   * The extent ref structure for the implicit back refs has fields for:
290   *
291   * - Objectid of the subvolume root
292   * - objectid of the file holding the reference
293   * - original offset in the file
294   * - how many bookend extents
295   *
296   * The key offset for the implicit back refs is hash of the first
297   * three fields.
298   *
299   * The extent ref structure for the full back refs has a field for:
300   *
301   * - number of pointers in the tree leaf
302   *
303   * The key offset for the full back refs is the first byte of
304   * the tree leaf
305   *
306   * When a file extent is allocated, the implicit back refs is used
307   * and the fields are filled in:
308   *
309   *     (root_key.objectid, inode objectid, offset in file, 1)
310   *
311   * When a file extent is removed by file truncation, we find the
312   * corresponding implicit back refs and check the following fields:
313   *
314   *     (btrfs_header_owner(leaf), inode objectid, offset in file)
315   *
316   * Btree extents can be referenced by:
317   *
318   * - Different subvolumes
319   *
320   * Both the implicit back refs and the full back refs for tree blocks
321   * only consist of key. The key offset for the implicit back refs is
322   * objectid of block's owner tree. The key offset for the full back refs
323   * is the first byte of parent block.
324   *
325   * When implicit back refs is used, information about the lowest key and
326   * level of the tree block are required. This information is stored in
327   * the tree block info structure.
328   */
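
/*
 * Illustrative sketch (hypothetical bytenr/objectid values): a data extent
 * written by inode 257 at file offset 0 in subvolume 5 gets an extent item
 * plus an implicit (keyed) data backref roughly like
 *
 *     (<bytenr> EXTENT_ITEM <num_bytes>)
 *         extent refs 1 flags DATA
 *         extent data backref root 5 objectid 257 offset 0 count 1
 *
 * whereas the same extent referenced only through a shared (relocated) leaf
 * would instead carry a full backref
 *
 *         shared data backref parent <leaf bytenr> count 1
 */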
329  
330  /*
331   * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
332   * is_data == BTRFS_REF_TYPE_DATA, data type is required,
333   * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
334   */
335  int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
336  				     struct btrfs_extent_inline_ref *iref,
337  				     enum btrfs_inline_ref_type is_data)
338  {
339  	struct btrfs_fs_info *fs_info = eb->fs_info;
340  	int type = btrfs_extent_inline_ref_type(eb, iref);
341  	u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
342  
343  	if (type == BTRFS_EXTENT_OWNER_REF_KEY) {
344  		ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
345  		return type;
346  	}
347  
348  	if (type == BTRFS_TREE_BLOCK_REF_KEY ||
349  	    type == BTRFS_SHARED_BLOCK_REF_KEY ||
350  	    type == BTRFS_SHARED_DATA_REF_KEY ||
351  	    type == BTRFS_EXTENT_DATA_REF_KEY) {
352  		if (is_data == BTRFS_REF_TYPE_BLOCK) {
353  			if (type == BTRFS_TREE_BLOCK_REF_KEY)
354  				return type;
355  			if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
356  				ASSERT(fs_info);
357  				/*
358  				 * Every shared one has parent tree block,
359  				 * which must be aligned to sector size.
360  				 */
361  				if (offset && IS_ALIGNED(offset, fs_info->sectorsize))
362  					return type;
363  			}
364  		} else if (is_data == BTRFS_REF_TYPE_DATA) {
365  			if (type == BTRFS_EXTENT_DATA_REF_KEY)
366  				return type;
367  			if (type == BTRFS_SHARED_DATA_REF_KEY) {
368  				ASSERT(fs_info);
369  				/*
370  				 * Every shared one has parent tree block,
371  				 * which must be aligned to sector size.
372  				 */
373  				if (offset &&
374  				    IS_ALIGNED(offset, fs_info->sectorsize))
375  					return type;
376  			}
377  		} else {
378  			ASSERT(is_data == BTRFS_REF_TYPE_ANY);
379  			return type;
380  		}
381  	}
382  
383  	WARN_ON(1);
384  	btrfs_print_leaf(eb);
385  	btrfs_err(fs_info,
386  		  "eb %llu iref 0x%lx invalid extent inline ref type %d",
387  		  eb->start, (unsigned long)iref, type);
388  
389  	return BTRFS_REF_TYPE_INVALID;
390  }
391  
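/*
 * Hash used as the key offset of keyed EXTENT_DATA_REF items: a crc32c of
 * the root objectid in the upper bits combined with a crc32c of the inode
 * objectid and file offset in the lower bits.
 */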
392  u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
393  {
394  	u32 high_crc = ~(u32)0;
395  	u32 low_crc = ~(u32)0;
396  	__le64 lenum;
397  
398  	lenum = cpu_to_le64(root_objectid);
399  	high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
400  	lenum = cpu_to_le64(owner);
401  	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
402  	lenum = cpu_to_le64(offset);
403  	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
404  
405  	return ((u64)high_crc << 31) ^ (u64)low_crc;
406  }
407  
408  static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
409  				     struct btrfs_extent_data_ref *ref)
410  {
411  	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
412  				    btrfs_extent_data_ref_objectid(leaf, ref),
413  				    btrfs_extent_data_ref_offset(leaf, ref));
414  }
415  
416  static int match_extent_data_ref(struct extent_buffer *leaf,
417  				 struct btrfs_extent_data_ref *ref,
418  				 u64 root_objectid, u64 owner, u64 offset)
419  {
420  	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
421  	    btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
422  	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
423  		return 0;
424  	return 1;
425  }
426  
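/*
 * Find the keyed data backref item for an extent: a SHARED_DATA_REF keyed by
 * the parent block when one is given, otherwise an EXTENT_DATA_REF keyed by
 * the (root, owner, offset) hash, scanning forward over hash collisions.
 * Returns 0 if found, -ENOENT if not, or a negative errno on error.
 */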
427  static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
428  					   struct btrfs_path *path,
429  					   u64 bytenr, u64 parent,
430  					   u64 root_objectid,
431  					   u64 owner, u64 offset)
432  {
433  	struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
434  	struct btrfs_key key;
435  	struct btrfs_extent_data_ref *ref;
436  	struct extent_buffer *leaf;
437  	u32 nritems;
438  	int recow;
439  	int ret;
440  
441  	key.objectid = bytenr;
442  	if (parent) {
443  		key.type = BTRFS_SHARED_DATA_REF_KEY;
444  		key.offset = parent;
445  	} else {
446  		key.type = BTRFS_EXTENT_DATA_REF_KEY;
447  		key.offset = hash_extent_data_ref(root_objectid,
448  						  owner, offset);
449  	}
450  again:
451  	recow = 0;
452  	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
453  	if (ret < 0)
454  		return ret;
455  
456  	if (parent) {
457  		if (ret)
458  			return -ENOENT;
459  		return 0;
460  	}
461  
462  	ret = -ENOENT;
463  	leaf = path->nodes[0];
464  	nritems = btrfs_header_nritems(leaf);
465  	while (1) {
466  		if (path->slots[0] >= nritems) {
467  			ret = btrfs_next_leaf(root, path);
468  			if (ret) {
469  				if (ret > 0)
470  					return -ENOENT;
471  				return ret;
472  			}
473  
474  			leaf = path->nodes[0];
475  			nritems = btrfs_header_nritems(leaf);
476  			recow = 1;
477  		}
478  
479  		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
480  		if (key.objectid != bytenr ||
481  		    key.type != BTRFS_EXTENT_DATA_REF_KEY)
482  			goto fail;
483  
484  		ref = btrfs_item_ptr(leaf, path->slots[0],
485  				     struct btrfs_extent_data_ref);
486  
487  		if (match_extent_data_ref(leaf, ref, root_objectid,
488  					  owner, offset)) {
489  			if (recow) {
490  				btrfs_release_path(path);
491  				goto again;
492  			}
493  			ret = 0;
494  			break;
495  		}
496  		path->slots[0]++;
497  	}
498  fail:
499  	return ret;
500  }
501  
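/*
 * Insert a non-inline data backref item (shared, or keyed by hash) for an
 * extent, or bump the count of an existing one.  Hash collisions on
 * EXTENT_DATA_REF keys are handled by incrementing the key offset until a
 * matching or free slot is found.
 */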
502  static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
503  					   struct btrfs_path *path,
504  					   struct btrfs_delayed_ref_node *node,
505  					   u64 bytenr)
506  {
507  	struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
508  	struct btrfs_key key;
509  	struct extent_buffer *leaf;
510  	u64 owner = btrfs_delayed_ref_owner(node);
511  	u64 offset = btrfs_delayed_ref_offset(node);
512  	u32 size;
513  	u32 num_refs;
514  	int ret;
515  
516  	key.objectid = bytenr;
517  	if (node->parent) {
518  		key.type = BTRFS_SHARED_DATA_REF_KEY;
519  		key.offset = node->parent;
520  		size = sizeof(struct btrfs_shared_data_ref);
521  	} else {
522  		key.type = BTRFS_EXTENT_DATA_REF_KEY;
523  		key.offset = hash_extent_data_ref(node->ref_root, owner, offset);
524  		size = sizeof(struct btrfs_extent_data_ref);
525  	}
526  
527  	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
528  	if (ret && ret != -EEXIST)
529  		goto fail;
530  
531  	leaf = path->nodes[0];
532  	if (node->parent) {
533  		struct btrfs_shared_data_ref *ref;
534  		ref = btrfs_item_ptr(leaf, path->slots[0],
535  				     struct btrfs_shared_data_ref);
536  		if (ret == 0) {
537  			btrfs_set_shared_data_ref_count(leaf, ref, node->ref_mod);
538  		} else {
539  			num_refs = btrfs_shared_data_ref_count(leaf, ref);
540  			num_refs += node->ref_mod;
541  			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
542  		}
543  	} else {
544  		struct btrfs_extent_data_ref *ref;
545  		while (ret == -EEXIST) {
546  			ref = btrfs_item_ptr(leaf, path->slots[0],
547  					     struct btrfs_extent_data_ref);
548  			if (match_extent_data_ref(leaf, ref, node->ref_root,
549  						  owner, offset))
550  				break;
551  			btrfs_release_path(path);
552  			key.offset++;
553  			ret = btrfs_insert_empty_item(trans, root, path, &key,
554  						      size);
555  			if (ret && ret != -EEXIST)
556  				goto fail;
557  
558  			leaf = path->nodes[0];
559  		}
560  		ref = btrfs_item_ptr(leaf, path->slots[0],
561  				     struct btrfs_extent_data_ref);
562  		if (ret == 0) {
563  			btrfs_set_extent_data_ref_root(leaf, ref, node->ref_root);
564  			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
565  			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
566  			btrfs_set_extent_data_ref_count(leaf, ref, node->ref_mod);
567  		} else {
568  			num_refs = btrfs_extent_data_ref_count(leaf, ref);
569  			num_refs += node->ref_mod;
570  			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
571  		}
572  	}
573  	btrfs_mark_buffer_dirty(trans, leaf);
574  	ret = 0;
575  fail:
576  	btrfs_release_path(path);
577  	return ret;
578  }
579  
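/*
 * Drop @refs_to_drop references from the keyed data backref the path points
 * to, deleting the item once its count reaches zero.
 */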
580  static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
581  					   struct btrfs_root *root,
582  					   struct btrfs_path *path,
583  					   int refs_to_drop)
584  {
585  	struct btrfs_key key;
586  	struct btrfs_extent_data_ref *ref1 = NULL;
587  	struct btrfs_shared_data_ref *ref2 = NULL;
588  	struct extent_buffer *leaf;
589  	u32 num_refs = 0;
590  	int ret = 0;
591  
592  	leaf = path->nodes[0];
593  	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
594  
595  	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
596  		ref1 = btrfs_item_ptr(leaf, path->slots[0],
597  				      struct btrfs_extent_data_ref);
598  		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
599  	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
600  		ref2 = btrfs_item_ptr(leaf, path->slots[0],
601  				      struct btrfs_shared_data_ref);
602  		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
603  	} else {
604  		btrfs_err(trans->fs_info,
605  			  "unrecognized backref key (%llu %u %llu)",
606  			  key.objectid, key.type, key.offset);
607  		btrfs_abort_transaction(trans, -EUCLEAN);
608  		return -EUCLEAN;
609  	}
610  
611  	BUG_ON(num_refs < refs_to_drop);
612  	num_refs -= refs_to_drop;
613  
614  	if (num_refs == 0) {
615  		ret = btrfs_del_item(trans, root, path);
616  	} else {
617  		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
618  			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
619  		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
620  			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
621  		btrfs_mark_buffer_dirty(trans, leaf);
622  	}
623  	return ret;
624  }
625  
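/*
 * Return the reference count stored in the data backref at the current path
 * slot, or in the given inline ref if @iref is set.
 */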
626  static noinline u32 extent_data_ref_count(struct btrfs_path *path,
627  					  struct btrfs_extent_inline_ref *iref)
628  {
629  	struct btrfs_key key;
630  	struct extent_buffer *leaf;
631  	struct btrfs_extent_data_ref *ref1;
632  	struct btrfs_shared_data_ref *ref2;
633  	u32 num_refs = 0;
634  	int type;
635  
636  	leaf = path->nodes[0];
637  	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
638  
639  	if (iref) {
640  		/*
641  		 * If type is invalid, we should have bailed out earlier than
642  		 * this call.
643  		 */
644  		type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
645  		ASSERT(type != BTRFS_REF_TYPE_INVALID);
646  		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
647  			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
648  			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
649  		} else {
650  			ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
651  			num_refs = btrfs_shared_data_ref_count(leaf, ref2);
652  		}
653  	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
654  		ref1 = btrfs_item_ptr(leaf, path->slots[0],
655  				      struct btrfs_extent_data_ref);
656  		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
657  	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
658  		ref2 = btrfs_item_ptr(leaf, path->slots[0],
659  				      struct btrfs_shared_data_ref);
660  		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
661  	} else {
662  		WARN_ON(1);
663  	}
664  	return num_refs;
665  }
666  
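/*
 * Find the keyed tree block backref item: a SHARED_BLOCK_REF keyed by the
 * parent block, or a TREE_BLOCK_REF keyed by the owning root.  Returns 0 if
 * found and -ENOENT if not.
 */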
667  static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
668  					  struct btrfs_path *path,
669  					  u64 bytenr, u64 parent,
670  					  u64 root_objectid)
671  {
672  	struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
673  	struct btrfs_key key;
674  	int ret;
675  
676  	key.objectid = bytenr;
677  	if (parent) {
678  		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
679  		key.offset = parent;
680  	} else {
681  		key.type = BTRFS_TREE_BLOCK_REF_KEY;
682  		key.offset = root_objectid;
683  	}
684  
685  	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
686  	if (ret > 0)
687  		ret = -ENOENT;
688  	return ret;
689  }
690  
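/*
 * Insert a keyed tree block backref item.  Tree block backrefs carry no
 * payload, so the item is inserted with a size of zero.
 */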
691  static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
692  					  struct btrfs_path *path,
693  					  struct btrfs_delayed_ref_node *node,
694  					  u64 bytenr)
695  {
696  	struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
697  	struct btrfs_key key;
698  	int ret;
699  
700  	key.objectid = bytenr;
701  	if (node->parent) {
702  		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
703  		key.offset = node->parent;
704  	} else {
705  		key.type = BTRFS_TREE_BLOCK_REF_KEY;
706  		key.offset = node->ref_root;
707  	}
708  
709  	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
710  	btrfs_release_path(path);
711  	return ret;
712  }
713  
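/*
 * Map (parent, owner) to the backref key type: tree block refs for metadata
 * owners (owner < BTRFS_FIRST_FREE_OBJECTID), data refs otherwise, using the
 * shared variants when a parent block is given.
 */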
714  static inline int extent_ref_type(u64 parent, u64 owner)
715  {
716  	int type;
717  	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
718  		if (parent > 0)
719  			type = BTRFS_SHARED_BLOCK_REF_KEY;
720  		else
721  			type = BTRFS_TREE_BLOCK_REF_KEY;
722  	} else {
723  		if (parent > 0)
724  			type = BTRFS_SHARED_DATA_REF_KEY;
725  		else
726  			type = BTRFS_EXTENT_DATA_REF_KEY;
727  	}
728  	return type;
729  }
730  
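/*
 * Starting at @level, walk up the path and return in @key the key that
 * immediately follows the current slot.  Returns 0 on success, or 1 if the
 * path already points at the last key of the tree.
 */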
731  static int find_next_key(struct btrfs_path *path, int level,
732  			 struct btrfs_key *key)
733  
734  {
735  	for (; level < BTRFS_MAX_LEVEL; level++) {
736  		if (!path->nodes[level])
737  			break;
738  		if (path->slots[level] + 1 >=
739  		    btrfs_header_nritems(path->nodes[level]))
740  			continue;
741  		if (level == 0)
742  			btrfs_item_key_to_cpu(path->nodes[level], key,
743  					      path->slots[level] + 1);
744  		else
745  			btrfs_node_key_to_cpu(path->nodes[level], key,
746  					      path->slots[level] + 1);
747  		return 0;
748  	}
749  	return 1;
750  }
751  
752  /*
753   * look for inline back ref. if back ref is found, *ref_ret is set
754   * to the address of inline back ref, and 0 is returned.
755   *
756   * if back ref isn't found, *ref_ret is set to the address where it
757   * should be inserted, and -ENOENT is returned.
758   *
759   * if insert is true and there are too many inline back refs, the path
760   * points to the extent item, and -EAGAIN is returned.
761   *
762   * NOTE: inline back refs are ordered in the same way that back ref
763   *	 items in the tree are ordered.
764   */
765  static noinline_for_stack
766  int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
767  				 struct btrfs_path *path,
768  				 struct btrfs_extent_inline_ref **ref_ret,
769  				 u64 bytenr, u64 num_bytes,
770  				 u64 parent, u64 root_objectid,
771  				 u64 owner, u64 offset, int insert)
772  {
773  	struct btrfs_fs_info *fs_info = trans->fs_info;
774  	struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr);
775  	struct btrfs_key key;
776  	struct extent_buffer *leaf;
777  	struct btrfs_extent_item *ei;
778  	struct btrfs_extent_inline_ref *iref;
779  	u64 flags;
780  	u64 item_size;
781  	unsigned long ptr;
782  	unsigned long end;
783  	int extra_size;
784  	int type;
785  	int want;
786  	int ret;
787  	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
788  	int needed;
789  
790  	key.objectid = bytenr;
791  	key.type = BTRFS_EXTENT_ITEM_KEY;
792  	key.offset = num_bytes;
793  
794  	want = extent_ref_type(parent, owner);
795  	if (insert) {
796  		extra_size = btrfs_extent_inline_ref_size(want);
797  		path->search_for_extension = 1;
798  		path->keep_locks = 1;
799  	} else
800  		extra_size = -1;
801  
802  	/*
803  	 * Owner is our level, so we can just add one to get the level for the
804  	 * block we are interested in.
805  	 */
806  	if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
807  		key.type = BTRFS_METADATA_ITEM_KEY;
808  		key.offset = owner;
809  	}
810  
811  again:
812  	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
813  	if (ret < 0)
814  		goto out;
815  
816  	/*
817  	 * We may be a newly converted file system which still has the old fat
818  	 * extent entries for metadata, so try and see if we have one of those.
819  	 */
820  	if (ret > 0 && skinny_metadata) {
821  		skinny_metadata = false;
822  		if (path->slots[0]) {
823  			path->slots[0]--;
824  			btrfs_item_key_to_cpu(path->nodes[0], &key,
825  					      path->slots[0]);
826  			if (key.objectid == bytenr &&
827  			    key.type == BTRFS_EXTENT_ITEM_KEY &&
828  			    key.offset == num_bytes)
829  				ret = 0;
830  		}
831  		if (ret) {
832  			key.objectid = bytenr;
833  			key.type = BTRFS_EXTENT_ITEM_KEY;
834  			key.offset = num_bytes;
835  			btrfs_release_path(path);
836  			goto again;
837  		}
838  	}
839  
840  	if (ret && !insert) {
841  		ret = -ENOENT;
842  		goto out;
843  	} else if (WARN_ON(ret)) {
844  		btrfs_print_leaf(path->nodes[0]);
845  		btrfs_err(fs_info,
846  "extent item not found for insert, bytenr %llu num_bytes %llu parent %llu root_objectid %llu owner %llu offset %llu",
847  			  bytenr, num_bytes, parent, root_objectid, owner,
848  			  offset);
849  		ret = -EUCLEAN;
850  		goto out;
851  	}
852  
853  	leaf = path->nodes[0];
854  	item_size = btrfs_item_size(leaf, path->slots[0]);
855  	if (unlikely(item_size < sizeof(*ei))) {
856  		ret = -EUCLEAN;
857  		btrfs_err(fs_info,
858  			  "unexpected extent item size, has %llu expect >= %zu",
859  			  item_size, sizeof(*ei));
860  		btrfs_abort_transaction(trans, ret);
861  		goto out;
862  	}
863  
864  	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
865  	flags = btrfs_extent_flags(leaf, ei);
866  
867  	ptr = (unsigned long)(ei + 1);
868  	end = (unsigned long)ei + item_size;
869  
870  	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
871  		ptr += sizeof(struct btrfs_tree_block_info);
872  		BUG_ON(ptr > end);
873  	}
874  
875  	if (owner >= BTRFS_FIRST_FREE_OBJECTID)
876  		needed = BTRFS_REF_TYPE_DATA;
877  	else
878  		needed = BTRFS_REF_TYPE_BLOCK;
879  
880  	ret = -ENOENT;
881  	while (ptr < end) {
882  		iref = (struct btrfs_extent_inline_ref *)ptr;
883  		type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
884  		if (type == BTRFS_EXTENT_OWNER_REF_KEY) {
885  			ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
886  			ptr += btrfs_extent_inline_ref_size(type);
887  			continue;
888  		}
889  		if (type == BTRFS_REF_TYPE_INVALID) {
890  			ret = -EUCLEAN;
891  			goto out;
892  		}
893  
894  		if (want < type)
895  			break;
896  		if (want > type) {
897  			ptr += btrfs_extent_inline_ref_size(type);
898  			continue;
899  		}
900  
901  		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
902  			struct btrfs_extent_data_ref *dref;
903  			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
904  			if (match_extent_data_ref(leaf, dref, root_objectid,
905  						  owner, offset)) {
906  				ret = 0;
907  				break;
908  			}
909  			if (hash_extent_data_ref_item(leaf, dref) <
910  			    hash_extent_data_ref(root_objectid, owner, offset))
911  				break;
912  		} else {
913  			u64 ref_offset;
914  			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
915  			if (parent > 0) {
916  				if (parent == ref_offset) {
917  					ret = 0;
918  					break;
919  				}
920  				if (ref_offset < parent)
921  					break;
922  			} else {
923  				if (root_objectid == ref_offset) {
924  					ret = 0;
925  					break;
926  				}
927  				if (ref_offset < root_objectid)
928  					break;
929  			}
930  		}
931  		ptr += btrfs_extent_inline_ref_size(type);
932  	}
933  
934  	if (unlikely(ptr > end)) {
935  		ret = -EUCLEAN;
936  		btrfs_print_leaf(path->nodes[0]);
937  		btrfs_crit(fs_info,
938  "overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu",
939  			   path->slots[0], root_objectid, owner, offset, parent);
940  		goto out;
941  	}
942  
943  	if (ret == -ENOENT && insert) {
944  		if (item_size + extra_size >=
945  		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
946  			ret = -EAGAIN;
947  			goto out;
948  		}
949  		/*
950  		 * To add new inline back ref, we have to make sure
951  		 * there is no corresponding back ref item.
952  		 * For simplicity, we just do not add new inline back
953  		 * ref if there is any kind of item for this block
954  		 */
955  		if (find_next_key(path, 0, &key) == 0 &&
956  		    key.objectid == bytenr &&
957  		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
958  			ret = -EAGAIN;
959  			goto out;
960  		}
961  	}
962  	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
963  out:
964  	if (insert) {
965  		path->keep_locks = 0;
966  		path->search_for_extension = 0;
967  		btrfs_unlock_up_safe(path, 1);
968  	}
969  	return ret;
970  }
971  
972  /*
973   * helper to add new inline back ref
974   */
975  static noinline_for_stack
976  void setup_inline_extent_backref(struct btrfs_trans_handle *trans,
977  				 struct btrfs_path *path,
978  				 struct btrfs_extent_inline_ref *iref,
979  				 u64 parent, u64 root_objectid,
980  				 u64 owner, u64 offset, int refs_to_add,
981  				 struct btrfs_delayed_extent_op *extent_op)
982  {
983  	struct extent_buffer *leaf;
984  	struct btrfs_extent_item *ei;
985  	unsigned long ptr;
986  	unsigned long end;
987  	unsigned long item_offset;
988  	u64 refs;
989  	int size;
990  	int type;
991  
992  	leaf = path->nodes[0];
993  	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
994  	item_offset = (unsigned long)iref - (unsigned long)ei;
995  
996  	type = extent_ref_type(parent, owner);
997  	size = btrfs_extent_inline_ref_size(type);
998  
999  	btrfs_extend_item(trans, path, size);
1000  
1001  	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1002  	refs = btrfs_extent_refs(leaf, ei);
1003  	refs += refs_to_add;
1004  	btrfs_set_extent_refs(leaf, ei, refs);
1005  	if (extent_op)
1006  		__run_delayed_extent_op(extent_op, leaf, ei);
1007  
1008  	ptr = (unsigned long)ei + item_offset;
1009  	end = (unsigned long)ei + btrfs_item_size(leaf, path->slots[0]);
1010  	if (ptr < end - size)
1011  		memmove_extent_buffer(leaf, ptr + size, ptr,
1012  				      end - size - ptr);
1013  
1014  	iref = (struct btrfs_extent_inline_ref *)ptr;
1015  	btrfs_set_extent_inline_ref_type(leaf, iref, type);
1016  	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1017  		struct btrfs_extent_data_ref *dref;
1018  		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1019  		btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1020  		btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1021  		btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1022  		btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1023  	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1024  		struct btrfs_shared_data_ref *sref;
1025  		sref = (struct btrfs_shared_data_ref *)(iref + 1);
1026  		btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1027  		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1028  	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1029  		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1030  	} else {
1031  		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1032  	}
1033  	btrfs_mark_buffer_dirty(trans, leaf);
1034  }
1035  
1036  static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1037  				 struct btrfs_path *path,
1038  				 struct btrfs_extent_inline_ref **ref_ret,
1039  				 u64 bytenr, u64 num_bytes, u64 parent,
1040  				 u64 root_objectid, u64 owner, u64 offset)
1041  {
1042  	int ret;
1043  
1044  	ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
1045  					   num_bytes, parent, root_objectid,
1046  					   owner, offset, 0);
1047  	if (ret != -ENOENT)
1048  		return ret;
1049  
1050  	btrfs_release_path(path);
1051  	*ref_ret = NULL;
1052  
1053  	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1054  		ret = lookup_tree_block_ref(trans, path, bytenr, parent,
1055  					    root_objectid);
1056  	} else {
1057  		ret = lookup_extent_data_ref(trans, path, bytenr, parent,
1058  					     root_objectid, owner, offset);
1059  	}
1060  	return ret;
1061  }
1062  
1063  /*
1064   * helper to update/remove inline back ref
1065   */
1066  static noinline_for_stack int update_inline_extent_backref(
1067  				  struct btrfs_trans_handle *trans,
1068  				  struct btrfs_path *path,
1069  				  struct btrfs_extent_inline_ref *iref,
1070  				  int refs_to_mod,
1071  				  struct btrfs_delayed_extent_op *extent_op)
1072  {
1073  	struct extent_buffer *leaf = path->nodes[0];
1074  	struct btrfs_fs_info *fs_info = leaf->fs_info;
1075  	struct btrfs_extent_item *ei;
1076  	struct btrfs_extent_data_ref *dref = NULL;
1077  	struct btrfs_shared_data_ref *sref = NULL;
1078  	unsigned long ptr;
1079  	unsigned long end;
1080  	u32 item_size;
1081  	int size;
1082  	int type;
1083  	u64 refs;
1084  
1085  	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1086  	refs = btrfs_extent_refs(leaf, ei);
1087  	if (unlikely(refs_to_mod < 0 && refs + refs_to_mod <= 0)) {
1088  		struct btrfs_key key;
1089  		u32 extent_size;
1090  
1091  		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1092  		if (key.type == BTRFS_METADATA_ITEM_KEY)
1093  			extent_size = fs_info->nodesize;
1094  		else
1095  			extent_size = key.offset;
1096  		btrfs_print_leaf(leaf);
1097  		btrfs_err(fs_info,
1098  	"invalid refs_to_mod for extent %llu num_bytes %u, has %d expect >= -%llu",
1099  			  key.objectid, extent_size, refs_to_mod, refs);
1100  		return -EUCLEAN;
1101  	}
1102  	refs += refs_to_mod;
1103  	btrfs_set_extent_refs(leaf, ei, refs);
1104  	if (extent_op)
1105  		__run_delayed_extent_op(extent_op, leaf, ei);
1106  
1107  	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
1108  	/*
1109  	 * Function btrfs_get_extent_inline_ref_type() has already printed
1110  	 * error messages.
1111  	 */
1112  	if (unlikely(type == BTRFS_REF_TYPE_INVALID))
1113  		return -EUCLEAN;
1114  
1115  	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1116  		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1117  		refs = btrfs_extent_data_ref_count(leaf, dref);
1118  	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1119  		sref = (struct btrfs_shared_data_ref *)(iref + 1);
1120  		refs = btrfs_shared_data_ref_count(leaf, sref);
1121  	} else {
1122  		refs = 1;
1123  		/*
1124  		 * For tree blocks we can only drop one ref for it, and tree
1125  		 * blocks should not have refs > 1.
1126  		 *
1127  		 * Furthermore if we're inserting a new inline backref, we
1128  		 * won't reach this path either. That would be
1129  		 * setup_inline_extent_backref().
1130  		 */
1131  		if (unlikely(refs_to_mod != -1)) {
1132  			struct btrfs_key key;
1133  
1134  			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1135  
1136  			btrfs_print_leaf(leaf);
1137  			btrfs_err(fs_info,
1138  			"invalid refs_to_mod for tree block %llu, has %d expect -1",
1139  				  key.objectid, refs_to_mod);
1140  			return -EUCLEAN;
1141  		}
1142  	}
1143  
1144  	if (unlikely(refs_to_mod < 0 && refs < -refs_to_mod)) {
1145  		struct btrfs_key key;
1146  		u32 extent_size;
1147  
1148  		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1149  		if (key.type == BTRFS_METADATA_ITEM_KEY)
1150  			extent_size = fs_info->nodesize;
1151  		else
1152  			extent_size = key.offset;
1153  		btrfs_print_leaf(leaf);
1154  		btrfs_err(fs_info,
1155  "invalid refs_to_mod for backref entry, iref %lu extent %llu num_bytes %u, has %d expect >= -%llu",
1156  			  (unsigned long)iref, key.objectid, extent_size,
1157  			  refs_to_mod, refs);
1158  		return -EUCLEAN;
1159  	}
1160  	refs += refs_to_mod;
1161  
1162  	if (refs > 0) {
1163  		if (type == BTRFS_EXTENT_DATA_REF_KEY)
1164  			btrfs_set_extent_data_ref_count(leaf, dref, refs);
1165  		else
1166  			btrfs_set_shared_data_ref_count(leaf, sref, refs);
1167  	} else {
1168  		size =  btrfs_extent_inline_ref_size(type);
1169  		item_size = btrfs_item_size(leaf, path->slots[0]);
1170  		ptr = (unsigned long)iref;
1171  		end = (unsigned long)ei + item_size;
1172  		if (ptr + size < end)
1173  			memmove_extent_buffer(leaf, ptr, ptr + size,
1174  					      end - ptr - size);
1175  		item_size -= size;
1176  		btrfs_truncate_item(trans, path, item_size, 1);
1177  	}
1178  	btrfs_mark_buffer_dirty(trans, leaf);
1179  	return 0;
1180  }
1181  
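/*
 * Add @refs_to_add references as an inline backref: update the matching
 * inline ref if one already exists, otherwise insert a new one.  -EAGAIN
 * from the lookup is passed back so the caller can fall back to a keyed
 * backref item.
 */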
1182  static noinline_for_stack
1183  int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1184  				 struct btrfs_path *path,
1185  				 u64 bytenr, u64 num_bytes, u64 parent,
1186  				 u64 root_objectid, u64 owner,
1187  				 u64 offset, int refs_to_add,
1188  				 struct btrfs_delayed_extent_op *extent_op)
1189  {
1190  	struct btrfs_extent_inline_ref *iref;
1191  	int ret;
1192  
1193  	ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
1194  					   num_bytes, parent, root_objectid,
1195  					   owner, offset, 1);
1196  	if (ret == 0) {
1197  		/*
1198  		 * We're adding refs to a tree block we already own, this
1199  		 * should not happen at all.
1200  		 */
1201  		if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1202  			btrfs_print_leaf(path->nodes[0]);
1203  			btrfs_crit(trans->fs_info,
1204  "adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu slot %u",
1205  				   bytenr, num_bytes, root_objectid, path->slots[0]);
1206  			return -EUCLEAN;
1207  		}
1208  		ret = update_inline_extent_backref(trans, path, iref,
1209  						   refs_to_add, extent_op);
1210  	} else if (ret == -ENOENT) {
1211  		setup_inline_extent_backref(trans, path, iref, parent,
1212  					    root_objectid, owner, offset,
1213  					    refs_to_add, extent_op);
1214  		ret = 0;
1215  	}
1216  	return ret;
1217  }
1218  
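/*
 * Drop references from a backref: inline refs are updated (or removed) in
 * place, keyed data refs go through remove_extent_data_ref(), and keyed tree
 * block refs are deleted outright.
 */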
1219  static int remove_extent_backref(struct btrfs_trans_handle *trans,
1220  				 struct btrfs_root *root,
1221  				 struct btrfs_path *path,
1222  				 struct btrfs_extent_inline_ref *iref,
1223  				 int refs_to_drop, int is_data)
1224  {
1225  	int ret = 0;
1226  
1227  	BUG_ON(!is_data && refs_to_drop != 1);
1228  	if (iref)
1229  		ret = update_inline_extent_backref(trans, path, iref,
1230  						   -refs_to_drop, NULL);
1231  	else if (is_data)
1232  		ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
1233  	else
1234  		ret = btrfs_del_item(trans, root, path);
1235  	return ret;
1236  }
1237  
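/*
 * Discard the byte range [start, start + len) on @bdev, skipping any
 * superblock copies that fall inside it and splitting the rest into
 * BTRFS_MAX_DISCARD_CHUNK_SIZE pieces.  The number of bytes actually
 * discarded is returned in @discarded_bytes.
 */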
1238  static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
1239  			       u64 *discarded_bytes)
1240  {
1241  	int j, ret = 0;
1242  	u64 bytes_left, end;
1243  	u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT);
1244  
1245  	/* Adjust the range to be aligned to 512B sectors if necessary. */
1246  	if (start != aligned_start) {
1247  		len -= aligned_start - start;
1248  		len = round_down(len, 1 << SECTOR_SHIFT);
1249  		start = aligned_start;
1250  	}
1251  
1252  	*discarded_bytes = 0;
1253  
1254  	if (!len)
1255  		return 0;
1256  
1257  	end = start + len;
1258  	bytes_left = len;
1259  
1260  	/* Skip any superblocks on this device. */
1261  	for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
1262  		u64 sb_start = btrfs_sb_offset(j);
1263  		u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
1264  		u64 size = sb_start - start;
1265  
1266  		if (!in_range(sb_start, start, bytes_left) &&
1267  		    !in_range(sb_end, start, bytes_left) &&
1268  		    !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
1269  			continue;
1270  
1271  		/*
1272  		 * Superblock spans beginning of range.  Adjust start and
1273  		 * try again.
1274  		 */
1275  		if (sb_start <= start) {
1276  			start += sb_end - start;
1277  			if (start > end) {
1278  				bytes_left = 0;
1279  				break;
1280  			}
1281  			bytes_left = end - start;
1282  			continue;
1283  		}
1284  
1285  		if (size) {
1286  			ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
1287  						   size >> SECTOR_SHIFT,
1288  						   GFP_NOFS);
1289  			if (!ret)
1290  				*discarded_bytes += size;
1291  			else if (ret != -EOPNOTSUPP)
1292  				return ret;
1293  		}
1294  
1295  		start = sb_end;
1296  		if (start > end) {
1297  			bytes_left = 0;
1298  			break;
1299  		}
1300  		bytes_left = end - start;
1301  	}
1302  
1303  	while (bytes_left) {
1304  		u64 bytes_to_discard = min(BTRFS_MAX_DISCARD_CHUNK_SIZE, bytes_left);
1305  
1306  		ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
1307  					   bytes_to_discard >> SECTOR_SHIFT,
1308  					   GFP_NOFS);
1309  
1310  		if (ret) {
1311  			if (ret != -EOPNOTSUPP)
1312  				break;
1313  			continue;
1314  		}
1315  
1316  		start += bytes_to_discard;
1317  		bytes_left -= bytes_to_discard;
1318  		*discarded_bytes += bytes_to_discard;
1319  
1320  		if (btrfs_trim_interrupted()) {
1321  			ret = -ERESTARTSYS;
1322  			break;
1323  		}
1324  	}
1325  
1326  	return ret;
1327  }
1328  
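/*
 * Discard a single mapped stripe: a zone reset on zoned devices (also sent
 * to the dev-replace target while a replace is running), or a regular
 * discard if the device supports it.  The number of bytes affected is
 * returned in @bytes.
 */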
1329  static int do_discard_extent(struct btrfs_discard_stripe *stripe, u64 *bytes)
1330  {
1331  	struct btrfs_device *dev = stripe->dev;
1332  	struct btrfs_fs_info *fs_info = dev->fs_info;
1333  	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
1334  	u64 phys = stripe->physical;
1335  	u64 len = stripe->length;
1336  	u64 discarded = 0;
1337  	int ret = 0;
1338  
1339  	/* Zone reset on a zoned filesystem */
1340  	if (btrfs_can_zone_reset(dev, phys, len)) {
1341  		u64 src_disc;
1342  
1343  		ret = btrfs_reset_device_zone(dev, phys, len, &discarded);
1344  		if (ret)
1345  			goto out;
1346  
1347  		if (!btrfs_dev_replace_is_ongoing(dev_replace) ||
1348  		    dev != dev_replace->srcdev)
1349  			goto out;
1350  
1351  		src_disc = discarded;
1352  
1353  		/* Send to replace target as well */
1354  		ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len,
1355  					      &discarded);
1356  		discarded += src_disc;
1357  	} else if (bdev_max_discard_sectors(stripe->dev->bdev)) {
1358  		ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded);
1359  	} else {
1360  		ret = 0;
1361  		*bytes = 0;
1362  	}
1363  
1364  out:
1365  	*bytes = discarded;
1366  	return ret;
1367  }
1368  
1369  int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
1370  			 u64 num_bytes, u64 *actual_bytes)
1371  {
1372  	int ret = 0;
1373  	u64 discarded_bytes = 0;
1374  	u64 end = bytenr + num_bytes;
1375  	u64 cur = bytenr;
1376  
1377  	/*
1378  	 * Avoid races with device replace and make sure the devices in the
1379  	 * stripes don't go away while we are discarding.
1380  	 */
1381  	btrfs_bio_counter_inc_blocked(fs_info);
1382  	while (cur < end) {
1383  		struct btrfs_discard_stripe *stripes;
1384  		unsigned int num_stripes;
1385  		int i;
1386  
1387  		num_bytes = end - cur;
1388  		stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes);
1389  		if (IS_ERR(stripes)) {
1390  			ret = PTR_ERR(stripes);
1391  			if (ret == -EOPNOTSUPP)
1392  				ret = 0;
1393  			break;
1394  		}
1395  
1396  		for (i = 0; i < num_stripes; i++) {
1397  			struct btrfs_discard_stripe *stripe = stripes + i;
1398  			u64 bytes;
1399  
1400  			if (!stripe->dev->bdev) {
1401  				ASSERT(btrfs_test_opt(fs_info, DEGRADED));
1402  				continue;
1403  			}
1404  
1405  			if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
1406  					&stripe->dev->dev_state))
1407  				continue;
1408  
1409  			ret = do_discard_extent(stripe, &bytes);
1410  			if (ret) {
1411  				/*
1412  				 * Keep going if discard is not supported by the
1413  				 * device.
1414  				 */
1415  				if (ret != -EOPNOTSUPP)
1416  					break;
1417  				ret = 0;
1418  			} else {
1419  				discarded_bytes += bytes;
1420  			}
1421  		}
1422  		kfree(stripes);
1423  		if (ret)
1424  			break;
1425  		cur += num_bytes;
1426  	}
1427  	btrfs_bio_counter_dec(fs_info);
1428  	if (actual_bytes)
1429  		*actual_bytes = discarded_bytes;
1430  	return ret;
1431  }
1432  
1433  /* Can return -ENOMEM */
1434  int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1435  			 struct btrfs_ref *generic_ref)
1436  {
1437  	struct btrfs_fs_info *fs_info = trans->fs_info;
1438  	int ret;
1439  
1440  	ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
1441  	       generic_ref->action);
1442  	BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
1443  	       generic_ref->ref_root == BTRFS_TREE_LOG_OBJECTID);
1444  
1445  	if (generic_ref->type == BTRFS_REF_METADATA)
1446  		ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
1447  	else
1448  		ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0);
1449  
1450  	btrfs_ref_tree_mod(fs_info, generic_ref);
1451  
1452  	return ret;
1453  }
1454  
1455  /*
1456   * Insert backreference for a given extent.
1457   *
1458   * The counterpart is in __btrfs_free_extent(), with examples and more details
1459   * how it works.
1460   *
1461   * @trans:	    Handle of transaction
1462   *
1463   * @node:	    The delayed ref node used to get the bytenr/length for
1464   *		    extent whose references are incremented.
1465   *
1466   * @extent_op       Pointer to a structure, holding information necessary when
1467   *                  updating a tree block's flags
1468   *
1469   */
1470  static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1471  				  struct btrfs_delayed_ref_node *node,
1472  				  struct btrfs_delayed_extent_op *extent_op)
1473  {
1474  	struct btrfs_path *path;
1475  	struct extent_buffer *leaf;
1476  	struct btrfs_extent_item *item;
1477  	struct btrfs_key key;
1478  	u64 bytenr = node->bytenr;
1479  	u64 num_bytes = node->num_bytes;
1480  	u64 owner = btrfs_delayed_ref_owner(node);
1481  	u64 offset = btrfs_delayed_ref_offset(node);
1482  	u64 refs;
1483  	int refs_to_add = node->ref_mod;
1484  	int ret;
1485  
1486  	path = btrfs_alloc_path();
1487  	if (!path)
1488  		return -ENOMEM;
1489  
1490  	/* this will setup the path even if it fails to insert the back ref */
1491  	ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
1492  					   node->parent, node->ref_root, owner,
1493  					   offset, refs_to_add, extent_op);
1494  	if ((ret < 0 && ret != -EAGAIN) || !ret)
1495  		goto out;
1496  
1497  	/*
1498  	 * Ok we had -EAGAIN which means we didn't have space to insert an
1499  	 * inline extent ref, so just update the reference count and add a
1500  	 * normal backref.
1501  	 */
1502  	leaf = path->nodes[0];
1503  	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1504  	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1505  	refs = btrfs_extent_refs(leaf, item);
1506  	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
1507  	if (extent_op)
1508  		__run_delayed_extent_op(extent_op, leaf, item);
1509  
1510  	btrfs_mark_buffer_dirty(trans, leaf);
1511  	btrfs_release_path(path);
1512  
1513  	/* now insert the actual backref */
1514  	if (owner < BTRFS_FIRST_FREE_OBJECTID)
1515  		ret = insert_tree_block_ref(trans, path, node, bytenr);
1516  	else
1517  		ret = insert_extent_data_ref(trans, path, node, bytenr);
1518  
1519  	if (ret)
1520  		btrfs_abort_transaction(trans, ret);
1521  out:
1522  	btrfs_free_path(path);
1523  	return ret;
1524  }
1525  
1526  static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info,
1527  				     struct btrfs_delayed_ref_head *href)
1528  {
1529  	u64 root = href->owning_root;
1530  
1531  	/*
1532  	 * Don't check must_insert_reserved, as this is called from contexts
1533  	 * where it has already been unset.
1534  	 */
1535  	if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE ||
1536  	    !href->is_data || !is_fstree(root))
1537  		return;
1538  
1539  	btrfs_qgroup_free_refroot(fs_info, root, href->reserved_bytes,
1540  				  BTRFS_QGROUP_RSV_DATA);
1541  }
1542  
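/*
 * Process a single delayed data ref: insert the extent item for a freshly
 * allocated extent, add a backref to an existing extent, or drop a
 * reference, depending on the delayed ref action.
 */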
1543  static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
1544  				struct btrfs_delayed_ref_head *href,
1545  				struct btrfs_delayed_ref_node *node,
1546  				struct btrfs_delayed_extent_op *extent_op,
1547  				bool insert_reserved)
1548  {
1549  	int ret = 0;
1550  	u64 parent = 0;
1551  	u64 flags = 0;
1552  
1553  	trace_run_delayed_data_ref(trans->fs_info, node);
1554  
1555  	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
1556  		parent = node->parent;
1557  
1558  	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1559  		struct btrfs_key key;
1560  		struct btrfs_squota_delta delta = {
1561  			.root = href->owning_root,
1562  			.num_bytes = node->num_bytes,
1563  			.is_data = true,
1564  			.is_inc	= true,
1565  			.generation = trans->transid,
1566  		};
1567  		u64 owner = btrfs_delayed_ref_owner(node);
1568  		u64 offset = btrfs_delayed_ref_offset(node);
1569  
1570  		if (extent_op)
1571  			flags |= extent_op->flags_to_set;
1572  
1573  		key.objectid = node->bytenr;
1574  		key.type = BTRFS_EXTENT_ITEM_KEY;
1575  		key.offset = node->num_bytes;
1576  
1577  		ret = alloc_reserved_file_extent(trans, parent, node->ref_root,
1578  						 flags, owner, offset, &key,
1579  						 node->ref_mod,
1580  						 href->owning_root);
1581  		free_head_ref_squota_rsv(trans->fs_info, href);
1582  		if (!ret)
1583  			ret = btrfs_record_squota_delta(trans->fs_info, &delta);
1584  	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
1585  		ret = __btrfs_inc_extent_ref(trans, node, extent_op);
1586  	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
1587  		ret = __btrfs_free_extent(trans, href, node, extent_op);
1588  	} else {
1589  		BUG();
1590  	}
1591  	return ret;
1592  }
1593  
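/*
 * Apply a delayed extent op to an extent item: update the extent flags
 * and/or the first key stored in the tree block info.
 */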
1594  static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
1595  				    struct extent_buffer *leaf,
1596  				    struct btrfs_extent_item *ei)
1597  {
1598  	u64 flags = btrfs_extent_flags(leaf, ei);
1599  	if (extent_op->update_flags) {
1600  		flags |= extent_op->flags_to_set;
1601  		btrfs_set_extent_flags(leaf, ei, flags);
1602  	}
1603  
1604  	if (extent_op->update_key) {
1605  		struct btrfs_tree_block_info *bi;
1606  		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
1607  		bi = (struct btrfs_tree_block_info *)(ei + 1);
1608  		btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
1609  	}
1610  }
1611  
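/*
 * Apply a pending extent op (flags and/or tree block key update) to the
 * extent or metadata item of the given delayed ref head.
 */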
1612  static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
1613  				 struct btrfs_delayed_ref_head *head,
1614  				 struct btrfs_delayed_extent_op *extent_op)
1615  {
1616  	struct btrfs_fs_info *fs_info = trans->fs_info;
1617  	struct btrfs_root *root;
1618  	struct btrfs_key key;
1619  	struct btrfs_path *path;
1620  	struct btrfs_extent_item *ei;
1621  	struct extent_buffer *leaf;
1622  	u32 item_size;
1623  	int ret;
1624  	int metadata = 1;
1625  
1626  	if (TRANS_ABORTED(trans))
1627  		return 0;
1628  
1629  	if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
1630  		metadata = 0;
1631  
1632  	path = btrfs_alloc_path();
1633  	if (!path)
1634  		return -ENOMEM;
1635  
1636  	key.objectid = head->bytenr;
1637  
1638  	if (metadata) {
1639  		key.type = BTRFS_METADATA_ITEM_KEY;
1640  		key.offset = head->level;
1641  	} else {
1642  		key.type = BTRFS_EXTENT_ITEM_KEY;
1643  		key.offset = head->num_bytes;
1644  	}
1645  
1646  	root = btrfs_extent_root(fs_info, key.objectid);
1647  again:
1648  	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1649  	if (ret < 0) {
1650  		goto out;
1651  	} else if (ret > 0) {
1652  		if (metadata) {
1653  			if (path->slots[0] > 0) {
1654  				path->slots[0]--;
1655  				btrfs_item_key_to_cpu(path->nodes[0], &key,
1656  						      path->slots[0]);
1657  				if (key.objectid == head->bytenr &&
1658  				    key.type == BTRFS_EXTENT_ITEM_KEY &&
1659  				    key.offset == head->num_bytes)
1660  					ret = 0;
1661  			}
1662  			if (ret > 0) {
1663  				btrfs_release_path(path);
1664  				metadata = 0;
1665  
1666  				key.objectid = head->bytenr;
1667  				key.offset = head->num_bytes;
1668  				key.type = BTRFS_EXTENT_ITEM_KEY;
1669  				goto again;
1670  			}
1671  		} else {
1672  			ret = -EUCLEAN;
1673  			btrfs_err(fs_info,
1674  		  "missing extent item for extent %llu num_bytes %llu level %d",
1675  				  head->bytenr, head->num_bytes, head->level);
1676  			goto out;
1677  		}
1678  	}
1679  
1680  	leaf = path->nodes[0];
1681  	item_size = btrfs_item_size(leaf, path->slots[0]);
1682  
1683  	if (unlikely(item_size < sizeof(*ei))) {
1684  		ret = -EUCLEAN;
1685  		btrfs_err(fs_info,
1686  			  "unexpected extent item size, has %u expect >= %zu",
1687  			  item_size, sizeof(*ei));
1688  		btrfs_abort_transaction(trans, ret);
1689  		goto out;
1690  	}
1691  
1692  	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1693  	__run_delayed_extent_op(extent_op, leaf, ei);
1694  
1695  	btrfs_mark_buffer_dirty(trans, leaf);
1696  out:
1697  	btrfs_free_path(path);
1698  	return ret;
1699  }
1700  
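/*
 * Run a single delayed tree block reference. Tree blocks carry exactly one
 * reference per delayed ref node, so ref_mod must be 1.
 */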
1701  static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
1702  				struct btrfs_delayed_ref_head *href,
1703  				struct btrfs_delayed_ref_node *node,
1704  				struct btrfs_delayed_extent_op *extent_op,
1705  				bool insert_reserved)
1706  {
1707  	int ret = 0;
1708  	struct btrfs_fs_info *fs_info = trans->fs_info;
1709  	u64 parent = 0;
1710  	u64 ref_root = 0;
1711  
1712  	trace_run_delayed_tree_ref(trans->fs_info, node);
1713  
1714  	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1715  		parent = node->parent;
1716  	ref_root = node->ref_root;
1717  
1718  	if (unlikely(node->ref_mod != 1)) {
1719  		btrfs_err(trans->fs_info,
1720  	"btree block %llu has %d references rather than 1: action %d ref_root %llu parent %llu",
1721  			  node->bytenr, node->ref_mod, node->action, ref_root,
1722  			  parent);
1723  		return -EUCLEAN;
1724  	}
1725  	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1726  		struct btrfs_squota_delta delta = {
1727  			.root = href->owning_root,
1728  			.num_bytes = fs_info->nodesize,
1729  			.is_data = false,
1730  			.is_inc = true,
1731  			.generation = trans->transid,
1732  		};
1733  
1734  		ret = alloc_reserved_tree_block(trans, node, extent_op);
1735  		if (!ret)
1736  			btrfs_record_squota_delta(fs_info, &delta);
1737  	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
1738  		ret = __btrfs_inc_extent_ref(trans, node, extent_op);
1739  	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
1740  		ret = __btrfs_free_extent(trans, href, node, extent_op);
1741  	} else {
1742  		BUG();
1743  	}
1744  	return ret;
1745  }
1746  
1747  /* helper function to actually process a single delayed ref entry */
1748  static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
1749  			       struct btrfs_delayed_ref_head *href,
1750  			       struct btrfs_delayed_ref_node *node,
1751  			       struct btrfs_delayed_extent_op *extent_op,
1752  			       bool insert_reserved)
1753  {
1754  	int ret = 0;
1755  
1756  	if (TRANS_ABORTED(trans)) {
1757  		if (insert_reserved) {
1758  			btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
1759  			free_head_ref_squota_rsv(trans->fs_info, href);
1760  		}
1761  		return 0;
1762  	}
1763  
1764  	if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
1765  	    node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1766  		ret = run_delayed_tree_ref(trans, href, node, extent_op,
1767  					   insert_reserved);
1768  	else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
1769  		 node->type == BTRFS_SHARED_DATA_REF_KEY)
1770  		ret = run_delayed_data_ref(trans, href, node, extent_op,
1771  					   insert_reserved);
1772  	else if (node->type == BTRFS_EXTENT_OWNER_REF_KEY)
1773  		ret = 0;
1774  	else
1775  		BUG();
1776  	if (ret && insert_reserved)
1777  		btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
1778  	if (ret < 0)
1779  		btrfs_err(trans->fs_info,
1780  "failed to run delayed ref for logical %llu num_bytes %llu type %u action %u ref_mod %d: %d",
1781  			  node->bytenr, node->num_bytes, node->type,
1782  			  node->action, node->ref_mod, ret);
1783  	return ret;
1784  }
1785  
1786  static inline struct btrfs_delayed_ref_node *
1787  select_delayed_ref(struct btrfs_delayed_ref_head *head)
1788  {
1789  	struct btrfs_delayed_ref_node *ref;
1790  
1791  	if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
1792  		return NULL;
1793  
1794  	/*
1795  	 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
1796  	 * This is to prevent a ref count from going down to zero, which deletes
1797  	 * the extent item from the extent tree, when there still are references
1798  	 * to add, which would fail because they would not find the extent item.
1799  	 */
1800  	if (!list_empty(&head->ref_add_list))
1801  		return list_first_entry(&head->ref_add_list,
1802  				struct btrfs_delayed_ref_node, add_list);
1803  
1804  	ref = rb_entry(rb_first_cached(&head->ref_tree),
1805  		       struct btrfs_delayed_ref_node, ref_node);
1806  	ASSERT(list_empty(&ref->add_list));
1807  	return ref;
1808  }
1809  
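/*
 * Give a previously selected ref head back to the delayed ref root so that it
 * can be picked up and processed again later.
 */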
1810  static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
1811  				      struct btrfs_delayed_ref_head *head)
1812  {
1813  	spin_lock(&delayed_refs->lock);
1814  	head->processing = false;
1815  	delayed_refs->num_heads_ready++;
1816  	spin_unlock(&delayed_refs->lock);
1817  	btrfs_delayed_ref_unlock(head);
1818  }
1819  
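/*
 * Grab the pending extent op of a ref head, if any. When must_insert_reserved
 * is set the flags/key are applied when the reserved extent is inserted, so
 * the op is freed and NULL is returned.
 */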
1820  static struct btrfs_delayed_extent_op *cleanup_extent_op(
1821  				struct btrfs_delayed_ref_head *head)
1822  {
1823  	struct btrfs_delayed_extent_op *extent_op = head->extent_op;
1824  
1825  	if (!extent_op)
1826  		return NULL;
1827  
1828  	if (head->must_insert_reserved) {
1829  		head->extent_op = NULL;
1830  		btrfs_free_delayed_extent_op(extent_op);
1831  		return NULL;
1832  	}
1833  	return extent_op;
1834  }
1835  
1836  static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
1837  				     struct btrfs_delayed_ref_head *head)
1838  {
1839  	struct btrfs_delayed_extent_op *extent_op;
1840  	int ret;
1841  
1842  	extent_op = cleanup_extent_op(head);
1843  	if (!extent_op)
1844  		return 0;
1845  	head->extent_op = NULL;
1846  	spin_unlock(&head->lock);
1847  	ret = run_delayed_extent_op(trans, head, extent_op);
1848  	btrfs_free_delayed_extent_op(extent_op);
1849  	return ret ? ret : 1;
1850  }
1851  
1852  u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
1853  				  struct btrfs_delayed_ref_root *delayed_refs,
1854  				  struct btrfs_delayed_ref_head *head)
1855  {
1856  	u64 ret = 0;
1857  
1858  	/*
1859  	 * We had csum deletions accounted for in our delayed refs rsv, so we
1860  	 * need to drop the csum leaves for this update from our delayed_refs_rsv.
1861  	 */
1862  	if (head->total_ref_mod < 0 && head->is_data) {
1863  		int nr_csums;
1864  
1865  		spin_lock(&delayed_refs->lock);
1866  		delayed_refs->pending_csums -= head->num_bytes;
1867  		spin_unlock(&delayed_refs->lock);
1868  		nr_csums = btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
1869  
1870  		btrfs_delayed_refs_rsv_release(fs_info, 0, nr_csums);
1871  
1872  		ret = btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums);
1873  	}
1874  	/* must_insert_reserved can be set only if we didn't run the head ref. */
1875  	if (head->must_insert_reserved)
1876  		free_head_ref_squota_rsv(fs_info, head);
1877  
1878  	return ret;
1879  }
1880  
1881  static int cleanup_ref_head(struct btrfs_trans_handle *trans,
1882  			    struct btrfs_delayed_ref_head *head,
1883  			    u64 *bytes_released)
1884  {
1885  
1886  	struct btrfs_fs_info *fs_info = trans->fs_info;
1887  	struct btrfs_delayed_ref_root *delayed_refs;
1888  	int ret;
1889  
1890  	delayed_refs = &trans->transaction->delayed_refs;
1891  
1892  	ret = run_and_cleanup_extent_op(trans, head);
1893  	if (ret < 0) {
1894  		unselect_delayed_ref_head(delayed_refs, head);
1895  		btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
1896  		return ret;
1897  	} else if (ret) {
1898  		return ret;
1899  	}
1900  
1901  	/*
1902  	 * Need to drop our head ref lock and re-acquire the delayed ref lock
1903  	 * and then re-check to make sure nobody got added.
1904  	 */
1905  	spin_unlock(&head->lock);
1906  	spin_lock(&delayed_refs->lock);
1907  	spin_lock(&head->lock);
1908  	if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
1909  		spin_unlock(&head->lock);
1910  		spin_unlock(&delayed_refs->lock);
1911  		return 1;
1912  	}
1913  	btrfs_delete_ref_head(delayed_refs, head);
1914  	spin_unlock(&head->lock);
1915  	spin_unlock(&delayed_refs->lock);
1916  
1917  	if (head->must_insert_reserved) {
1918  		btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1);
1919  		if (head->is_data) {
1920  			struct btrfs_root *csum_root;
1921  
1922  			csum_root = btrfs_csum_root(fs_info, head->bytenr);
1923  			ret = btrfs_del_csums(trans, csum_root, head->bytenr,
1924  					      head->num_bytes);
1925  		}
1926  	}
1927  
1928  	*bytes_released += btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
1929  
1930  	trace_run_delayed_ref_head(fs_info, head, 0);
1931  	btrfs_delayed_ref_unlock(head);
1932  	btrfs_put_delayed_ref_head(head);
1933  	return ret;
1934  }
1935  
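/*
 * Select the next delayed ref head to process and lock it. Returns NULL when
 * there is nothing to do and ERR_PTR(-EAGAIN) when the head went away while
 * we were waiting for its mutex.
 */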
1936  static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
1937  					struct btrfs_trans_handle *trans)
1938  {
1939  	struct btrfs_delayed_ref_root *delayed_refs =
1940  		&trans->transaction->delayed_refs;
1941  	struct btrfs_delayed_ref_head *head = NULL;
1942  	int ret;
1943  
1944  	spin_lock(&delayed_refs->lock);
1945  	head = btrfs_select_ref_head(delayed_refs);
1946  	if (!head) {
1947  		spin_unlock(&delayed_refs->lock);
1948  		return head;
1949  	}
1950  
1951  	/*
1952  	 * Grab the lock that says we are going to process all the refs for
1953  	 * this head
1954  	 */
1955  	ret = btrfs_delayed_ref_lock(delayed_refs, head);
1956  	spin_unlock(&delayed_refs->lock);
1957  
1958  	/*
1959  	 * We may have dropped the spin lock to get the head mutex lock, and
1960  	 * that might have given someone else time to free the head.  If that's
1961  	 * true, it has been removed from our list and we can move on.
1962  	 */
1963  	if (ret == -EAGAIN)
1964  		head = ERR_PTR(-EAGAIN);
1965  
1966  	return head;
1967  }
1968  
1969  static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
1970  					   struct btrfs_delayed_ref_head *locked_ref,
1971  					   u64 *bytes_released)
1972  {
1973  	struct btrfs_fs_info *fs_info = trans->fs_info;
1974  	struct btrfs_delayed_ref_root *delayed_refs;
1975  	struct btrfs_delayed_extent_op *extent_op;
1976  	struct btrfs_delayed_ref_node *ref;
1977  	bool must_insert_reserved;
1978  	int ret;
1979  
1980  	delayed_refs = &trans->transaction->delayed_refs;
1981  
1982  	lockdep_assert_held(&locked_ref->mutex);
1983  	lockdep_assert_held(&locked_ref->lock);
1984  
1985  	while ((ref = select_delayed_ref(locked_ref))) {
1986  		if (ref->seq &&
1987  		    btrfs_check_delayed_seq(fs_info, ref->seq)) {
1988  			spin_unlock(&locked_ref->lock);
1989  			unselect_delayed_ref_head(delayed_refs, locked_ref);
1990  			return -EAGAIN;
1991  		}
1992  
1993  		rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
1994  		RB_CLEAR_NODE(&ref->ref_node);
1995  		if (!list_empty(&ref->add_list))
1996  			list_del(&ref->add_list);
1997  		/*
1998  		 * When we play the delayed ref, also correct the ref_mod on
1999  		 * head
2000  		 */
2001  		switch (ref->action) {
2002  		case BTRFS_ADD_DELAYED_REF:
2003  		case BTRFS_ADD_DELAYED_EXTENT:
2004  			locked_ref->ref_mod -= ref->ref_mod;
2005  			break;
2006  		case BTRFS_DROP_DELAYED_REF:
2007  			locked_ref->ref_mod += ref->ref_mod;
2008  			break;
2009  		default:
2010  			WARN_ON(1);
2011  		}
2012  		atomic_dec(&delayed_refs->num_entries);
2013  
2014  		/*
2015  		 * Record the must_insert_reserved flag before we drop the
2016  		 * spin lock.
2017  		 */
2018  		must_insert_reserved = locked_ref->must_insert_reserved;
2019  		/*
2020  		 * Unsetting this on the head ref relinquishes ownership of
2021  		 * the rsv_bytes, so it is critical that every possible code
2022  		 * path from here forward frees all reserves including qgroup
2023  		 * reserve.
2024  		 */
2025  		locked_ref->must_insert_reserved = false;
2026  
2027  		extent_op = locked_ref->extent_op;
2028  		locked_ref->extent_op = NULL;
2029  		spin_unlock(&locked_ref->lock);
2030  
2031  		ret = run_one_delayed_ref(trans, locked_ref, ref, extent_op,
2032  					  must_insert_reserved);
2033  		btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
2034  		*bytes_released += btrfs_calc_delayed_ref_bytes(fs_info, 1);
2035  
2036  		btrfs_free_delayed_extent_op(extent_op);
2037  		if (ret) {
2038  			unselect_delayed_ref_head(delayed_refs, locked_ref);
2039  			btrfs_put_delayed_ref(ref);
2040  			return ret;
2041  		}
2042  
2043  		btrfs_put_delayed_ref(ref);
2044  		cond_resched();
2045  
2046  		spin_lock(&locked_ref->lock);
2047  		btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref);
2048  	}
2049  
2050  	return 0;
2051  }
2052  
2053  /*
2054   * Returns 0 on success or if called with an already aborted transaction.
2055   * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2056   */
2057  static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2058  					     u64 min_bytes)
2059  {
2060  	struct btrfs_fs_info *fs_info = trans->fs_info;
2061  	struct btrfs_delayed_ref_root *delayed_refs;
2062  	struct btrfs_delayed_ref_head *locked_ref = NULL;
2063  	int ret;
2064  	unsigned long count = 0;
2065  	unsigned long max_count = 0;
2066  	u64 bytes_processed = 0;
2067  
2068  	delayed_refs = &trans->transaction->delayed_refs;
2069  	if (min_bytes == 0) {
2070  		max_count = delayed_refs->num_heads_ready;
2071  		min_bytes = U64_MAX;
2072  	}
2073  
2074  	do {
2075  		if (!locked_ref) {
2076  			locked_ref = btrfs_obtain_ref_head(trans);
2077  			if (IS_ERR_OR_NULL(locked_ref)) {
2078  				if (PTR_ERR(locked_ref) == -EAGAIN) {
2079  					continue;
2080  				} else {
2081  					break;
2082  				}
2083  			}
2084  			count++;
2085  		}
2086  		/*
2087  		 * We need to try and merge add/drops of the same ref since we
2088  		 * can run into issues with relocate dropping the implicit ref
2089  		 * and then it being added back again before the drop can
2090  		 * finish.  If we merged anything we need to re-loop so we can
2091  		 * get a good ref.
2092  		 * Or we can get node references of the same type that weren't
2093  		 * merged when created due to bumps in the tree mod seq, and
2094  		 * we need to merge them to prevent adding an inline extent
2095  		 * backref before dropping it (triggering a BUG_ON at
2096  		 * insert_inline_extent_backref()).
2097  		 */
2098  		spin_lock(&locked_ref->lock);
2099  		btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref);
2100  
2101  		ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, &bytes_processed);
2102  		if (ret < 0 && ret != -EAGAIN) {
2103  			/*
2104  			 * Error, btrfs_run_delayed_refs_for_head already
2105  			 * unlocked everything so just bail out
2106  			 */
2107  			return ret;
2108  		} else if (!ret) {
2109  			/*
2110  			 * Success, perform the usual cleanup of a processed
2111  			 * head
2112  			 */
2113  			ret = cleanup_ref_head(trans, locked_ref, &bytes_processed);
2114  			if (ret > 0) {
2115  				/* We dropped our lock, we need to loop. */
2116  				ret = 0;
2117  				continue;
2118  			} else if (ret) {
2119  				return ret;
2120  			}
2121  		}
2122  
2123  		/*
2124  		 * Either success case or btrfs_run_delayed_refs_for_head
2125  		 * returned -EAGAIN, meaning we need to select another head
2126  		 */
2127  
2128  		locked_ref = NULL;
2129  		cond_resched();
2130  	} while ((min_bytes != U64_MAX && bytes_processed < min_bytes) ||
2131  		 (max_count > 0 && count < max_count) ||
2132  		 locked_ref);
2133  
2134  	return 0;
2135  }
2136  
2137  #ifdef SCRAMBLE_DELAYED_REFS
2138  /*
2139   * Normally delayed refs get processed in ascending bytenr order. This
2140   * correlates in most cases to the order added. To expose dependencies on this
2141   * order, we start to process the tree in the middle instead of the beginning
2142   */
2143  static u64 find_middle(struct rb_root *root)
2144  {
2145  	struct rb_node *n = root->rb_node;
2146  	struct btrfs_delayed_ref_node *entry;
2147  	int alt = 1;
2148  	u64 middle;
2149  	u64 first = 0, last = 0;
2150  
2151  	n = rb_first(root);
2152  	if (n) {
2153  		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2154  		first = entry->bytenr;
2155  	}
2156  	n = rb_last(root);
2157  	if (n) {
2158  		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2159  		last = entry->bytenr;
2160  	}
2161  	n = root->rb_node;
2162  
2163  	while (n) {
2164  		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2165  		WARN_ON(!entry->in_tree);
2166  
2167  		middle = entry->bytenr;
2168  
2169  		if (alt)
2170  			n = n->rb_left;
2171  		else
2172  			n = n->rb_right;
2173  
2174  		alt = 1 - alt;
2175  	}
2176  	return middle;
2177  }
2178  #endif
2179  
2180  /*
2181   * Start processing the delayed reference count updates and extent insertions
2182   * we have queued up so far.
2183   *
2184   * @trans:	Transaction handle.
2185   * @min_bytes:	How many bytes of delayed references to process. After this
2186   *		many bytes we stop processing delayed references if there are
2187   *		any more. If 0 it means to run all existing delayed references,
2188   *		but not new ones added after running all existing ones.
2189   *		Use (u64)-1 (U64_MAX) to run all existing delayed references
2190   *		plus any new ones that are added.
2191   *
2192   * Returns 0 on success or if called with an aborted transaction
2193   * Returns <0 on error and aborts the transaction
2194   */
2195  int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, u64 min_bytes)
2196  {
2197  	struct btrfs_fs_info *fs_info = trans->fs_info;
2198  	struct btrfs_delayed_ref_root *delayed_refs;
2199  	int ret;
2200  
2201  	/* We'll clean this up in btrfs_cleanup_transaction */
2202  	if (TRANS_ABORTED(trans))
2203  		return 0;
2204  
2205  	if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
2206  		return 0;
2207  
2208  	delayed_refs = &trans->transaction->delayed_refs;
2209  again:
2210  #ifdef SCRAMBLE_DELAYED_REFS
2211  	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2212  #endif
2213  	ret = __btrfs_run_delayed_refs(trans, min_bytes);
2214  	if (ret < 0) {
2215  		btrfs_abort_transaction(trans, ret);
2216  		return ret;
2217  	}
2218  
2219  	if (min_bytes == U64_MAX) {
2220  		btrfs_create_pending_block_groups(trans);
2221  
2222  		spin_lock(&delayed_refs->lock);
2223  		if (RB_EMPTY_ROOT(&delayed_refs->href_root.rb_root)) {
2224  			spin_unlock(&delayed_refs->lock);
2225  			return 0;
2226  		}
2227  		spin_unlock(&delayed_refs->lock);
2228  
2229  		cond_resched();
2230  		goto again;
2231  	}
2232  
2233  	return 0;
2234  }
2235  
2236  int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2237  				struct extent_buffer *eb, u64 flags)
2238  {
2239  	struct btrfs_delayed_extent_op *extent_op;
2240  	int ret;
2241  
2242  	extent_op = btrfs_alloc_delayed_extent_op();
2243  	if (!extent_op)
2244  		return -ENOMEM;
2245  
2246  	extent_op->flags_to_set = flags;
2247  	extent_op->update_flags = true;
2248  	extent_op->update_key = false;
2249  
2250  	ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len,
2251  					  btrfs_header_level(eb), extent_op);
2252  	if (ret)
2253  		btrfs_free_delayed_extent_op(extent_op);
2254  	return ret;
2255  }
2256  
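/*
 * Check whether the delayed refs queued for @bytenr contain a reference from
 * anything other than the given root/objectid/offset. Returns 1 if such a
 * cross reference exists, 0 if not, and -EAGAIN if the head is contended.
 */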
2257  static noinline int check_delayed_ref(struct btrfs_root *root,
2258  				      struct btrfs_path *path,
2259  				      u64 objectid, u64 offset, u64 bytenr)
2260  {
2261  	struct btrfs_delayed_ref_head *head;
2262  	struct btrfs_delayed_ref_node *ref;
2263  	struct btrfs_delayed_ref_root *delayed_refs;
2264  	struct btrfs_transaction *cur_trans;
2265  	struct rb_node *node;
2266  	int ret = 0;
2267  
2268  	spin_lock(&root->fs_info->trans_lock);
2269  	cur_trans = root->fs_info->running_transaction;
2270  	if (cur_trans)
2271  		refcount_inc(&cur_trans->use_count);
2272  	spin_unlock(&root->fs_info->trans_lock);
2273  	if (!cur_trans)
2274  		return 0;
2275  
2276  	delayed_refs = &cur_trans->delayed_refs;
2277  	spin_lock(&delayed_refs->lock);
2278  	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
2279  	if (!head) {
2280  		spin_unlock(&delayed_refs->lock);
2281  		btrfs_put_transaction(cur_trans);
2282  		return 0;
2283  	}
2284  
2285  	if (!mutex_trylock(&head->mutex)) {
2286  		if (path->nowait) {
2287  			spin_unlock(&delayed_refs->lock);
2288  			btrfs_put_transaction(cur_trans);
2289  			return -EAGAIN;
2290  		}
2291  
2292  		refcount_inc(&head->refs);
2293  		spin_unlock(&delayed_refs->lock);
2294  
2295  		btrfs_release_path(path);
2296  
2297  		/*
2298  		 * Mutex was contended, block until it's released and let
2299  		 * caller try again
2300  		 */
2301  		mutex_lock(&head->mutex);
2302  		mutex_unlock(&head->mutex);
2303  		btrfs_put_delayed_ref_head(head);
2304  		btrfs_put_transaction(cur_trans);
2305  		return -EAGAIN;
2306  	}
2307  	spin_unlock(&delayed_refs->lock);
2308  
2309  	spin_lock(&head->lock);
2310  	/*
2311  	 * XXX: We should replace this with a proper search function in the
2312  	 * future.
2313  	 */
2314  	for (node = rb_first_cached(&head->ref_tree); node;
2315  	     node = rb_next(node)) {
2316  		u64 ref_owner;
2317  		u64 ref_offset;
2318  
2319  		ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
2320  		/* If it's a shared ref we know a cross reference exists */
2321  		if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
2322  			ret = 1;
2323  			break;
2324  		}
2325  
2326  		ref_owner = btrfs_delayed_ref_owner(ref);
2327  		ref_offset = btrfs_delayed_ref_offset(ref);
2328  
2329  		/*
2330  		 * If our ref doesn't match the one we're currently looking at
2331  		 * then we have a cross reference.
2332  		 */
2333  		if (ref->ref_root != btrfs_root_id(root) ||
2334  		    ref_owner != objectid || ref_offset != offset) {
2335  			ret = 1;
2336  			break;
2337  		}
2338  	}
2339  	spin_unlock(&head->lock);
2340  	mutex_unlock(&head->mutex);
2341  	btrfs_put_transaction(cur_trans);
2342  	return ret;
2343  }
2344  
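/*
 * Check the committed extent tree for references to @bytenr other than a
 * single inline data ref from @root/@objectid/@offset. Returns 0 if the
 * extent is provably not shared, 1 if it is (or may be) shared, -ENOENT if
 * no matching extent item exists, or a negative error.
 */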
2345  static noinline int check_committed_ref(struct btrfs_root *root,
2346  					struct btrfs_path *path,
2347  					u64 objectid, u64 offset, u64 bytenr,
2348  					bool strict)
2349  {
2350  	struct btrfs_fs_info *fs_info = root->fs_info;
2351  	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr);
2352  	struct extent_buffer *leaf;
2353  	struct btrfs_extent_data_ref *ref;
2354  	struct btrfs_extent_inline_ref *iref;
2355  	struct btrfs_extent_item *ei;
2356  	struct btrfs_key key;
2357  	u32 item_size;
2358  	u32 expected_size;
2359  	int type;
2360  	int ret;
2361  
2362  	key.objectid = bytenr;
2363  	key.offset = (u64)-1;
2364  	key.type = BTRFS_EXTENT_ITEM_KEY;
2365  
2366  	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2367  	if (ret < 0)
2368  		goto out;
2369  	if (ret == 0) {
2370  		/*
2371  		 * Key with offset -1 found, there would have to exist an extent
2372  		 * item with such offset, but this is out of the valid range.
2373  		 */
2374  		ret = -EUCLEAN;
2375  		goto out;
2376  	}
2377  
2378  	ret = -ENOENT;
2379  	if (path->slots[0] == 0)
2380  		goto out;
2381  
2382  	path->slots[0]--;
2383  	leaf = path->nodes[0];
2384  	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2385  
2386  	if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2387  		goto out;
2388  
2389  	ret = 1;
2390  	item_size = btrfs_item_size(leaf, path->slots[0]);
2391  	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2392  	expected_size = sizeof(*ei) + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY);
2393  
2394  	/* No inline refs; we need to bail before checking for owner ref. */
2395  	if (item_size == sizeof(*ei))
2396  		goto out;
2397  
2398  	/* Check for an owner ref; skip over it to the real inline refs. */
2399  	iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2400  	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
2401  	if (btrfs_fs_incompat(fs_info, SIMPLE_QUOTA) && type == BTRFS_EXTENT_OWNER_REF_KEY) {
2402  		expected_size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY);
2403  		iref = (struct btrfs_extent_inline_ref *)(iref + 1);
2404  	}
2405  
2406  	/* If extent item has more than 1 inline ref then it's shared */
2407  	if (item_size != expected_size)
2408  		goto out;
2409  
2410  	/*
2411  	 * If extent created before last snapshot => it's shared unless the
2412  	 * snapshot has been deleted. Use the heuristic if strict is false.
2413  	 */
2414  	if (!strict &&
2415  	    (btrfs_extent_generation(leaf, ei) <=
2416  	     btrfs_root_last_snapshot(&root->root_item)))
2417  		goto out;
2418  
2419  	/* If this extent has SHARED_DATA_REF then it's shared */
2420  	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
2421  	if (type != BTRFS_EXTENT_DATA_REF_KEY)
2422  		goto out;
2423  
2424  	ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2425  	if (btrfs_extent_refs(leaf, ei) !=
2426  	    btrfs_extent_data_ref_count(leaf, ref) ||
2427  	    btrfs_extent_data_ref_root(leaf, ref) != btrfs_root_id(root) ||
2428  	    btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2429  	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
2430  		goto out;
2431  
2432  	ret = 0;
2433  out:
2434  	return ret;
2435  }
2436  
2437  int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
2438  			  u64 bytenr, bool strict, struct btrfs_path *path)
2439  {
2440  	int ret;
2441  
2442  	do {
2443  		ret = check_committed_ref(root, path, objectid,
2444  					  offset, bytenr, strict);
2445  		if (ret && ret != -ENOENT)
2446  			goto out;
2447  
2448  		ret = check_delayed_ref(root, path, objectid, offset, bytenr);
2449  	} while (ret == -EAGAIN);
2450  
2451  out:
2452  	btrfs_release_path(path);
2453  	if (btrfs_is_data_reloc_root(root))
2454  		WARN_ON(ret > 0);
2455  	return ret;
2456  }
2457  
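/*
 * Add or drop one reference for every extent pointer in the given tree block:
 * data extents referenced from a leaf, or child blocks referenced from a node.
 */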
2458  static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2459  			   struct btrfs_root *root,
2460  			   struct extent_buffer *buf,
2461  			   int full_backref, int inc)
2462  {
2463  	struct btrfs_fs_info *fs_info = root->fs_info;
2464  	u64 parent;
2465  	u64 ref_root;
2466  	u32 nritems;
2467  	struct btrfs_key key;
2468  	struct btrfs_file_extent_item *fi;
2469  	bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
2470  	int i;
2471  	int action;
2472  	int level;
2473  	int ret = 0;
2474  
2475  	if (btrfs_is_testing(fs_info))
2476  		return 0;
2477  
2478  	ref_root = btrfs_header_owner(buf);
2479  	nritems = btrfs_header_nritems(buf);
2480  	level = btrfs_header_level(buf);
2481  
2482  	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && level == 0)
2483  		return 0;
2484  
2485  	if (full_backref)
2486  		parent = buf->start;
2487  	else
2488  		parent = 0;
2489  	if (inc)
2490  		action = BTRFS_ADD_DELAYED_REF;
2491  	else
2492  		action = BTRFS_DROP_DELAYED_REF;
2493  
2494  	for (i = 0; i < nritems; i++) {
2495  		struct btrfs_ref ref = {
2496  			.action = action,
2497  			.parent = parent,
2498  			.ref_root = ref_root,
2499  		};
2500  
2501  		if (level == 0) {
2502  			btrfs_item_key_to_cpu(buf, &key, i);
2503  			if (key.type != BTRFS_EXTENT_DATA_KEY)
2504  				continue;
2505  			fi = btrfs_item_ptr(buf, i,
2506  					    struct btrfs_file_extent_item);
2507  			if (btrfs_file_extent_type(buf, fi) ==
2508  			    BTRFS_FILE_EXTENT_INLINE)
2509  				continue;
2510  			ref.bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2511  			if (ref.bytenr == 0)
2512  				continue;
2513  
2514  			ref.num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
2515  			ref.owning_root = ref_root;
2516  
2517  			key.offset -= btrfs_file_extent_offset(buf, fi);
2518  			btrfs_init_data_ref(&ref, key.objectid, key.offset,
2519  					    btrfs_root_id(root), for_reloc);
2520  			if (inc)
2521  				ret = btrfs_inc_extent_ref(trans, &ref);
2522  			else
2523  				ret = btrfs_free_extent(trans, &ref);
2524  			if (ret)
2525  				goto fail;
2526  		} else {
2527  			/* We don't know the owning_root, leave as 0. */
2528  			ref.bytenr = btrfs_node_blockptr(buf, i);
2529  			ref.num_bytes = fs_info->nodesize;
2530  
2531  			btrfs_init_tree_ref(&ref, level - 1,
2532  					    btrfs_root_id(root), for_reloc);
2533  			if (inc)
2534  				ret = btrfs_inc_extent_ref(trans, &ref);
2535  			else
2536  				ret = btrfs_free_extent(trans, &ref);
2537  			if (ret)
2538  				goto fail;
2539  		}
2540  	}
2541  	return 0;
2542  fail:
2543  	return ret;
2544  }
2545  
2546  int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2547  		  struct extent_buffer *buf, int full_backref)
2548  {
2549  	return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
2550  }
2551  
2552  int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2553  		  struct extent_buffer *buf, int full_backref)
2554  {
2555  	return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
2556  }
2557  
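/*
 * Return the block group allocation profile (data, system or metadata) to use
 * for allocations done on behalf of the given root.
 */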
2558  static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
2559  {
2560  	struct btrfs_fs_info *fs_info = root->fs_info;
2561  	u64 flags;
2562  	u64 ret;
2563  
2564  	if (data)
2565  		flags = BTRFS_BLOCK_GROUP_DATA;
2566  	else if (root == fs_info->chunk_root)
2567  		flags = BTRFS_BLOCK_GROUP_SYSTEM;
2568  	else
2569  		flags = BTRFS_BLOCK_GROUP_METADATA;
2570  
2571  	ret = btrfs_get_alloc_profile(fs_info, flags);
2572  	return ret;
2573  }
2574  
2575  static u64 first_logical_byte(struct btrfs_fs_info *fs_info)
2576  {
2577  	struct rb_node *leftmost;
2578  	u64 bytenr = 0;
2579  
2580  	read_lock(&fs_info->block_group_cache_lock);
2581  	/* Get the block group with the lowest logical start address. */
2582  	leftmost = rb_first_cached(&fs_info->block_group_cache_tree);
2583  	if (leftmost) {
2584  		struct btrfs_block_group *bg;
2585  
2586  		bg = rb_entry(leftmost, struct btrfs_block_group, cache_node);
2587  		bytenr = bg->start;
2588  	}
2589  	read_unlock(&fs_info->block_group_cache_lock);
2590  
2591  	return bytenr;
2592  }
2593  
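/*
 * Account a range as pinned in its block group and space_info and mark it
 * dirty in the transaction's pinned_extents tree so it is unpinned at commit.
 */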
2594  static int pin_down_extent(struct btrfs_trans_handle *trans,
2595  			   struct btrfs_block_group *cache,
2596  			   u64 bytenr, u64 num_bytes, int reserved)
2597  {
2598  	struct btrfs_fs_info *fs_info = cache->fs_info;
2599  
2600  	spin_lock(&cache->space_info->lock);
2601  	spin_lock(&cache->lock);
2602  	cache->pinned += num_bytes;
2603  	btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
2604  					     num_bytes);
2605  	if (reserved) {
2606  		cache->reserved -= num_bytes;
2607  		cache->space_info->bytes_reserved -= num_bytes;
2608  	}
2609  	spin_unlock(&cache->lock);
2610  	spin_unlock(&cache->space_info->lock);
2611  
2612  	set_extent_bit(&trans->transaction->pinned_extents, bytenr,
2613  		       bytenr + num_bytes - 1, EXTENT_DIRTY, NULL);
2614  	return 0;
2615  }
2616  
2617  int btrfs_pin_extent(struct btrfs_trans_handle *trans,
2618  		     u64 bytenr, u64 num_bytes, int reserved)
2619  {
2620  	struct btrfs_block_group *cache;
2621  
2622  	cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
2623  	BUG_ON(!cache); /* Logic error */
2624  
2625  	pin_down_extent(trans, cache, bytenr, num_bytes, reserved);
2626  
2627  	btrfs_put_block_group(cache);
2628  	return 0;
2629  }
2630  
2631  int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
2632  				    const struct extent_buffer *eb)
2633  {
2634  	struct btrfs_block_group *cache;
2635  	int ret;
2636  
2637  	cache = btrfs_lookup_block_group(trans->fs_info, eb->start);
2638  	if (!cache)
2639  		return -EINVAL;
2640  
2641  	/*
2642  	 * Fully cache the free space first so that our pin removes the free space
2643  	 * from the cache.
2644  	 */
2645  	ret = btrfs_cache_block_group(cache, true);
2646  	if (ret)
2647  		goto out;
2648  
2649  	pin_down_extent(trans, cache, eb->start, eb->len, 0);
2650  
2651  	/* remove us from the free space cache (if we're there at all) */
2652  	ret = btrfs_remove_free_space(cache, eb->start, eb->len);
2653  out:
2654  	btrfs_put_block_group(cache);
2655  	return ret;
2656  }
2657  
2658  static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
2659  				   u64 start, u64 num_bytes)
2660  {
2661  	int ret;
2662  	struct btrfs_block_group *block_group;
2663  
2664  	block_group = btrfs_lookup_block_group(fs_info, start);
2665  	if (!block_group)
2666  		return -EINVAL;
2667  
2668  	ret = btrfs_cache_block_group(block_group, true);
2669  	if (ret)
2670  		goto out;
2671  
2672  	ret = btrfs_remove_free_space(block_group, start, num_bytes);
2673  out:
2674  	btrfs_put_block_group(block_group);
2675  	return ret;
2676  }
2677  
2678  int btrfs_exclude_logged_extents(struct extent_buffer *eb)
2679  {
2680  	struct btrfs_fs_info *fs_info = eb->fs_info;
2681  	struct btrfs_file_extent_item *item;
2682  	struct btrfs_key key;
2683  	int found_type;
2684  	int i;
2685  	int ret = 0;
2686  
2687  	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
2688  		return 0;
2689  
2690  	for (i = 0; i < btrfs_header_nritems(eb); i++) {
2691  		btrfs_item_key_to_cpu(eb, &key, i);
2692  		if (key.type != BTRFS_EXTENT_DATA_KEY)
2693  			continue;
2694  		item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
2695  		found_type = btrfs_file_extent_type(eb, item);
2696  		if (found_type == BTRFS_FILE_EXTENT_INLINE)
2697  			continue;
2698  		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
2699  			continue;
2700  		key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
2701  		key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
2702  		ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
2703  		if (ret)
2704  			break;
2705  	}
2706  
2707  	return ret;
2708  }
2709  
2710  static void
2711  btrfs_inc_block_group_reservations(struct btrfs_block_group *bg)
2712  {
2713  	atomic_inc(&bg->reservations);
2714  }
2715  
2716  /*
2717   * Returns the free cluster for the given space info and sets empty_cluster to
2718   * what it should be based on the mount options.
2719   */
2720  static struct btrfs_free_cluster *
2721  fetch_cluster_info(struct btrfs_fs_info *fs_info,
2722  		   struct btrfs_space_info *space_info, u64 *empty_cluster)
2723  {
2724  	struct btrfs_free_cluster *ret = NULL;
2725  
2726  	*empty_cluster = 0;
2727  	if (btrfs_mixed_space_info(space_info))
2728  		return ret;
2729  
2730  	if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
2731  		ret = &fs_info->meta_alloc_cluster;
2732  		if (btrfs_test_opt(fs_info, SSD))
2733  			*empty_cluster = SZ_2M;
2734  		else
2735  			*empty_cluster = SZ_64K;
2736  	} else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
2737  		   btrfs_test_opt(fs_info, SSD_SPREAD)) {
2738  		*empty_cluster = SZ_2M;
2739  		ret = &fs_info->data_alloc_cluster;
2740  	}
2741  
2742  	return ret;
2743  }
2744  
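/*
 * Unpin a range of previously pinned extents: optionally return the space to
 * the free space cache, update the pinned counters of the affected block
 * groups and space infos, and refill the global block reserve if possible.
 */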
2745  static int unpin_extent_range(struct btrfs_fs_info *fs_info,
2746  			      u64 start, u64 end,
2747  			      const bool return_free_space)
2748  {
2749  	struct btrfs_block_group *cache = NULL;
2750  	struct btrfs_space_info *space_info;
2751  	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
2752  	struct btrfs_free_cluster *cluster = NULL;
2753  	u64 len;
2754  	u64 total_unpinned = 0;
2755  	u64 empty_cluster = 0;
2756  	bool readonly;
2757  	int ret = 0;
2758  
2759  	while (start <= end) {
2760  		readonly = false;
2761  		if (!cache ||
2762  		    start >= cache->start + cache->length) {
2763  			if (cache)
2764  				btrfs_put_block_group(cache);
2765  			total_unpinned = 0;
2766  			cache = btrfs_lookup_block_group(fs_info, start);
2767  			if (cache == NULL) {
2768  				/* Logic error, something removed the block group. */
2769  				ret = -EUCLEAN;
2770  				goto out;
2771  			}
2772  
2773  			cluster = fetch_cluster_info(fs_info,
2774  						     cache->space_info,
2775  						     &empty_cluster);
2776  			empty_cluster <<= 1;
2777  		}
2778  
2779  		len = cache->start + cache->length - start;
2780  		len = min(len, end + 1 - start);
2781  
2782  		if (return_free_space)
2783  			btrfs_add_free_space(cache, start, len);
2784  
2785  		start += len;
2786  		total_unpinned += len;
2787  		space_info = cache->space_info;
2788  
2789  		/*
2790  		 * If this space cluster has been marked as fragmented and we've
2791  		 * unpinned enough in this block group to potentially allow a
2792  		 * cluster to be created inside of it, go ahead and clear the
2793  		 * fragmented check.
2794  		 */
2795  		if (cluster && cluster->fragmented &&
2796  		    total_unpinned > empty_cluster) {
2797  			spin_lock(&cluster->lock);
2798  			cluster->fragmented = 0;
2799  			spin_unlock(&cluster->lock);
2800  		}
2801  
2802  		spin_lock(&space_info->lock);
2803  		spin_lock(&cache->lock);
2804  		cache->pinned -= len;
2805  		btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
2806  		space_info->max_extent_size = 0;
2807  		if (cache->ro) {
2808  			space_info->bytes_readonly += len;
2809  			readonly = true;
2810  		} else if (btrfs_is_zoned(fs_info)) {
2811  			/* Need reset before reusing in a zoned block group */
2812  			btrfs_space_info_update_bytes_zone_unusable(fs_info, space_info,
2813  								    len);
2814  			readonly = true;
2815  		}
2816  		spin_unlock(&cache->lock);
2817  		if (!readonly && return_free_space &&
2818  		    global_rsv->space_info == space_info) {
2819  			spin_lock(&global_rsv->lock);
2820  			if (!global_rsv->full) {
2821  				u64 to_add = min(len, global_rsv->size -
2822  						      global_rsv->reserved);
2823  
2824  				global_rsv->reserved += to_add;
2825  				btrfs_space_info_update_bytes_may_use(fs_info,
2826  						space_info, to_add);
2827  				if (global_rsv->reserved >= global_rsv->size)
2828  					global_rsv->full = 1;
2829  				len -= to_add;
2830  			}
2831  			spin_unlock(&global_rsv->lock);
2832  		}
2833  		/* Add to any tickets we may have */
2834  		if (!readonly && return_free_space && len)
2835  			btrfs_try_granting_tickets(fs_info, space_info);
2836  		spin_unlock(&space_info->lock);
2837  	}
2838  
2839  	if (cache)
2840  		btrfs_put_block_group(cache);
2841  out:
2842  	return ret;
2843  }
2844  
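/*
 * Called during transaction commit to unpin all extents pinned in this
 * transaction (discarding them first if discard is enabled) and to discard
 * the block groups that were deleted during the transaction.
 */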
2845  int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
2846  {
2847  	struct btrfs_fs_info *fs_info = trans->fs_info;
2848  	struct btrfs_block_group *block_group, *tmp;
2849  	struct list_head *deleted_bgs;
2850  	struct extent_io_tree *unpin;
2851  	u64 start;
2852  	u64 end;
2853  	int ret;
2854  
2855  	unpin = &trans->transaction->pinned_extents;
2856  
2857  	while (!TRANS_ABORTED(trans)) {
2858  		struct extent_state *cached_state = NULL;
2859  
2860  		mutex_lock(&fs_info->unused_bg_unpin_mutex);
2861  		if (!find_first_extent_bit(unpin, 0, &start, &end,
2862  					   EXTENT_DIRTY, &cached_state)) {
2863  			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
2864  			break;
2865  		}
2866  
2867  		if (btrfs_test_opt(fs_info, DISCARD_SYNC))
2868  			ret = btrfs_discard_extent(fs_info, start,
2869  						   end + 1 - start, NULL);
2870  
2871  		clear_extent_dirty(unpin, start, end, &cached_state);
2872  		ret = unpin_extent_range(fs_info, start, end, true);
2873  		BUG_ON(ret);
2874  		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
2875  		free_extent_state(cached_state);
2876  		cond_resched();
2877  	}
2878  
2879  	if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
2880  		btrfs_discard_calc_delay(&fs_info->discard_ctl);
2881  		btrfs_discard_schedule_work(&fs_info->discard_ctl, true);
2882  	}
2883  
2884  	/*
2885  	 * Transaction is finished.  We don't need the lock anymore.  We
2886  	 * do need to clean up the block groups in case of a transaction
2887  	 * abort.
2888  	 */
2889  	deleted_bgs = &trans->transaction->deleted_bgs;
2890  	list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
2891  		u64 trimmed = 0;
2892  
2893  		ret = -EROFS;
2894  		if (!TRANS_ABORTED(trans))
2895  			ret = btrfs_discard_extent(fs_info,
2896  						   block_group->start,
2897  						   block_group->length,
2898  						   &trimmed);
2899  
2900  		list_del_init(&block_group->bg_list);
2901  		btrfs_unfreeze_block_group(block_group);
2902  		btrfs_put_block_group(block_group);
2903  
2904  		if (ret) {
2905  			const char *errstr = btrfs_decode_error(ret);
2906  			btrfs_warn(fs_info,
2907  			   "discard failed while removing blockgroup: errno=%d %s",
2908  				   ret, errstr);
2909  		}
2910  	}
2911  
2912  	return 0;
2913  }
2914  
2915  /*
2916   * Parse an extent item's inline extents looking for a simple quotas owner ref.
2917   *
2918   * @fs_info:	the btrfs_fs_info for this mount
2919   * @leaf:	a leaf in the extent tree containing the extent item
2920   * @slot:	the slot in the leaf where the extent item is found
2921   *
2922   * Returns the objectid of the root that originally allocated the extent item
2923   * if the inline owner ref is expected and present, otherwise 0.
2924   *
2925   * If an extent item has an owner ref item, it will be the first inline ref
2926   * item. Therefore the logic is to check whether there are any inline ref
2927   * items, then check the type of the first one.
2928   */
2929  u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info,
2930  				struct extent_buffer *leaf, int slot)
2931  {
2932  	struct btrfs_extent_item *ei;
2933  	struct btrfs_extent_inline_ref *iref;
2934  	struct btrfs_extent_owner_ref *oref;
2935  	unsigned long ptr;
2936  	unsigned long end;
2937  	int type;
2938  
2939  	if (!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA))
2940  		return 0;
2941  
2942  	ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
2943  	ptr = (unsigned long)(ei + 1);
2944  	end = (unsigned long)ei + btrfs_item_size(leaf, slot);
2945  
2946  	/* No inline ref items of any kind, can't check type. */
2947  	if (ptr == end)
2948  		return 0;
2949  
2950  	iref = (struct btrfs_extent_inline_ref *)ptr;
2951  	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
2952  
2953  	/* We found an owner ref, get the root out of it. */
2954  	if (type == BTRFS_EXTENT_OWNER_REF_KEY) {
2955  		oref = (struct btrfs_extent_owner_ref *)(&iref->offset);
2956  		return btrfs_extent_owner_ref_root_id(leaf, oref);
2957  	}
2958  
2959  	/* We have inline refs, but not an owner ref. */
2960  	return 0;
2961  }
2962  
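/*
 * Final accounting after the last reference of an extent was dropped: delete
 * its checksums and raid stripe extent (data only), record the simple quota
 * delta, return the range to the free space tree and update the block group
 * counters.
 */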
2963  static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
2964  				     u64 bytenr, struct btrfs_squota_delta *delta)
2965  {
2966  	int ret;
2967  	u64 num_bytes = delta->num_bytes;
2968  
2969  	if (delta->is_data) {
2970  		struct btrfs_root *csum_root;
2971  
2972  		csum_root = btrfs_csum_root(trans->fs_info, bytenr);
2973  		ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes);
2974  		if (ret) {
2975  			btrfs_abort_transaction(trans, ret);
2976  			return ret;
2977  		}
2978  
2979  		ret = btrfs_delete_raid_extent(trans, bytenr, num_bytes);
2980  		if (ret) {
2981  			btrfs_abort_transaction(trans, ret);
2982  			return ret;
2983  		}
2984  	}
2985  
2986  	ret = btrfs_record_squota_delta(trans->fs_info, delta);
2987  	if (ret) {
2988  		btrfs_abort_transaction(trans, ret);
2989  		return ret;
2990  	}
2991  
2992  	ret = add_to_free_space_tree(trans, bytenr, num_bytes);
2993  	if (ret) {
2994  		btrfs_abort_transaction(trans, ret);
2995  		return ret;
2996  	}
2997  
2998  	ret = btrfs_update_block_group(trans, bytenr, num_bytes, false);
2999  	if (ret)
3000  		btrfs_abort_transaction(trans, ret);
3001  
3002  	return ret;
3003  }
3004  
3005  #define abort_and_dump(trans, path, fmt, args...)	\
3006  ({							\
3007  	btrfs_abort_transaction(trans, -EUCLEAN);	\
3008  	btrfs_print_leaf(path->nodes[0]);		\
3009  	btrfs_crit(trans->fs_info, fmt, ##args);	\
3010  })
3011  
3012  /*
3013   * Drop one or more refs of @node.
3014   *
3015   * 1. Locate the extent refs.
3016   *    It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item.
3017   *    Locate it, then reduce the refs number or remove the ref line completely.
3018   *
3019   * 2. Update the refs count in EXTENT/METADATA_ITEM
3020   *
3021   * Inline backref case:
3022   *
3023   * in extent tree we have:
3024   *
3025   * 	item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
3026   *		refs 2 gen 6 flags DATA
3027   *		extent data backref root FS_TREE objectid 258 offset 0 count 1
3028   *		extent data backref root FS_TREE objectid 257 offset 0 count 1
3029   *
3030   * This function gets called with:
3031   *
3032   *    node->bytenr = 13631488
3033   *    node->num_bytes = 1048576
3034   *    root_objectid = FS_TREE
3035   *    owner_objectid = 257
3036   *    owner_offset = 0
3037   *    refs_to_drop = 1
3038   *
3039   * Then we should get something like:
3040   *
3041   * 	item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
3042   *		refs 1 gen 6 flags DATA
3043   *		extent data backref root FS_TREE objectid 258 offset 0 count 1
3044   *
3045   * Keyed backref case:
3046   *
3047   * in extent tree we have:
3048   *
3049   *	item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
3050   *		refs 754 gen 6 flags DATA
3051   *	[...]
3052   *	item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28
3053   *		extent data backref root FS_TREE objectid 866 offset 0 count 1
3054   *
3055   * This function gets called with:
3056   *
3057   *    node->bytenr = 13631488
3058   *    node->num_bytes = 1048576
3059   *    root_objectid = FS_TREE
3060   *    owner_objectid = 866
3061   *    owner_offset = 0
3062   *    refs_to_drop = 1
3063   *
3064   * Then we should get something like:
3065   *
3066   *	item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
3067   *		refs 753 gen 6 flags DATA
3068   *
3069   * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed.
3070   */
3071  static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3072  			       struct btrfs_delayed_ref_head *href,
3073  			       struct btrfs_delayed_ref_node *node,
3074  			       struct btrfs_delayed_extent_op *extent_op)
3075  {
3076  	struct btrfs_fs_info *info = trans->fs_info;
3077  	struct btrfs_key key;
3078  	struct btrfs_path *path;
3079  	struct btrfs_root *extent_root;
3080  	struct extent_buffer *leaf;
3081  	struct btrfs_extent_item *ei;
3082  	struct btrfs_extent_inline_ref *iref;
3083  	int ret;
3084  	int is_data;
3085  	int extent_slot = 0;
3086  	int found_extent = 0;
3087  	int num_to_del = 1;
3088  	int refs_to_drop = node->ref_mod;
3089  	u32 item_size;
3090  	u64 refs;
3091  	u64 bytenr = node->bytenr;
3092  	u64 num_bytes = node->num_bytes;
3093  	u64 owner_objectid = btrfs_delayed_ref_owner(node);
3094  	u64 owner_offset = btrfs_delayed_ref_offset(node);
3095  	bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
3096  	u64 delayed_ref_root = href->owning_root;
3097  
3098  	extent_root = btrfs_extent_root(info, bytenr);
3099  	ASSERT(extent_root);
3100  
3101  	path = btrfs_alloc_path();
3102  	if (!path)
3103  		return -ENOMEM;
3104  
3105  	is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
3106  
3107  	if (!is_data && refs_to_drop != 1) {
3108  		btrfs_crit(info,
3109  "invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u",
3110  			   node->bytenr, refs_to_drop);
3111  		ret = -EINVAL;
3112  		btrfs_abort_transaction(trans, ret);
3113  		goto out;
3114  	}
3115  
3116  	if (is_data)
3117  		skinny_metadata = false;
3118  
3119  	ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
3120  				    node->parent, node->ref_root, owner_objectid,
3121  				    owner_offset);
3122  	if (ret == 0) {
3123  		/*
3124  		 * Either the inline backref or the SHARED_DATA_REF/
3125  		 * SHARED_BLOCK_REF is found
3126  		 *
3127  		 * Here is a quick path to locate EXTENT/METADATA_ITEM.
3128  		 * It's possible the EXTENT/METADATA_ITEM is near current slot.
3129  		 */
3130  		extent_slot = path->slots[0];
3131  		while (extent_slot >= 0) {
3132  			btrfs_item_key_to_cpu(path->nodes[0], &key,
3133  					      extent_slot);
3134  			if (key.objectid != bytenr)
3135  				break;
3136  			if (key.type == BTRFS_EXTENT_ITEM_KEY &&
3137  			    key.offset == num_bytes) {
3138  				found_extent = 1;
3139  				break;
3140  			}
3141  			if (key.type == BTRFS_METADATA_ITEM_KEY &&
3142  			    key.offset == owner_objectid) {
3143  				found_extent = 1;
3144  				break;
3145  			}
3146  
3147  			/* Quick path didn't find the EXTENT/METADATA_ITEM */
3148  			if (path->slots[0] - extent_slot > 5)
3149  				break;
3150  			extent_slot--;
3151  		}
3152  
3153  		if (!found_extent) {
3154  			if (iref) {
3155  				abort_and_dump(trans, path,
3156  "invalid iref slot %u, no EXTENT/METADATA_ITEM found but has inline extent ref",
3157  					   path->slots[0]);
3158  				ret = -EUCLEAN;
3159  				goto out;
3160  			}
3161  			/* Must be SHARED_* item, remove the backref first */
3162  			ret = remove_extent_backref(trans, extent_root, path,
3163  						    NULL, refs_to_drop, is_data);
3164  			if (ret) {
3165  				btrfs_abort_transaction(trans, ret);
3166  				goto out;
3167  			}
3168  			btrfs_release_path(path);
3169  
3170  			/* Slow path to locate EXTENT/METADATA_ITEM */
3171  			key.objectid = bytenr;
3172  			key.type = BTRFS_EXTENT_ITEM_KEY;
3173  			key.offset = num_bytes;
3174  
3175  			if (!is_data && skinny_metadata) {
3176  				key.type = BTRFS_METADATA_ITEM_KEY;
3177  				key.offset = owner_objectid;
3178  			}
3179  
3180  			ret = btrfs_search_slot(trans, extent_root,
3181  						&key, path, -1, 1);
3182  			if (ret > 0 && skinny_metadata && path->slots[0]) {
3183  				/*
3184  				 * Couldn't find our skinny metadata item,
3185  				 * see if we have ye olde extent item.
3186  				 */
3187  				path->slots[0]--;
3188  				btrfs_item_key_to_cpu(path->nodes[0], &key,
3189  						      path->slots[0]);
3190  				if (key.objectid == bytenr &&
3191  				    key.type == BTRFS_EXTENT_ITEM_KEY &&
3192  				    key.offset == num_bytes)
3193  					ret = 0;
3194  			}
3195  
3196  			if (ret > 0 && skinny_metadata) {
3197  				skinny_metadata = false;
3198  				key.objectid = bytenr;
3199  				key.type = BTRFS_EXTENT_ITEM_KEY;
3200  				key.offset = num_bytes;
3201  				btrfs_release_path(path);
3202  				ret = btrfs_search_slot(trans, extent_root,
3203  							&key, path, -1, 1);
3204  			}
3205  
3206  			if (ret) {
3207  				if (ret > 0)
3208  					btrfs_print_leaf(path->nodes[0]);
3209  				btrfs_err(info,
3210  			"umm, got %d back from search, was looking for %llu, slot %d",
3211  					  ret, bytenr, path->slots[0]);
3212  			}
3213  			if (ret < 0) {
3214  				btrfs_abort_transaction(trans, ret);
3215  				goto out;
3216  			}
3217  			extent_slot = path->slots[0];
3218  		}
3219  	} else if (WARN_ON(ret == -ENOENT)) {
3220  		abort_and_dump(trans, path,
3221  "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu slot %d",
3222  			       bytenr, node->parent, node->ref_root, owner_objectid,
3223  			       owner_offset, path->slots[0]);
3224  		goto out;
3225  	} else {
3226  		btrfs_abort_transaction(trans, ret);
3227  		goto out;
3228  	}
3229  
3230  	leaf = path->nodes[0];
3231  	item_size = btrfs_item_size(leaf, extent_slot);
3232  	if (unlikely(item_size < sizeof(*ei))) {
3233  		ret = -EUCLEAN;
3234  		btrfs_err(trans->fs_info,
3235  			  "unexpected extent item size, has %u expect >= %zu",
3236  			  item_size, sizeof(*ei));
3237  		btrfs_abort_transaction(trans, ret);
3238  		goto out;
3239  	}
3240  	ei = btrfs_item_ptr(leaf, extent_slot,
3241  			    struct btrfs_extent_item);
3242  	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
3243  	    key.type == BTRFS_EXTENT_ITEM_KEY) {
3244  		struct btrfs_tree_block_info *bi;
3245  
3246  		if (item_size < sizeof(*ei) + sizeof(*bi)) {
3247  			abort_and_dump(trans, path,
3248  "invalid extent item size for key (%llu, %u, %llu) slot %u owner %llu, has %u expect >= %zu",
3249  				       key.objectid, key.type, key.offset,
3250  				       path->slots[0], owner_objectid, item_size,
3251  				       sizeof(*ei) + sizeof(*bi));
3252  			ret = -EUCLEAN;
3253  			goto out;
3254  		}
3255  		bi = (struct btrfs_tree_block_info *)(ei + 1);
3256  		WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
3257  	}
3258  
3259  	refs = btrfs_extent_refs(leaf, ei);
3260  	if (refs < refs_to_drop) {
3261  		abort_and_dump(trans, path,
3262  		"trying to drop %d refs but we only have %llu for bytenr %llu slot %u",
3263  			       refs_to_drop, refs, bytenr, path->slots[0]);
3264  		ret = -EUCLEAN;
3265  		goto out;
3266  	}
3267  	refs -= refs_to_drop;
3268  
3269  	if (refs > 0) {
3270  		if (extent_op)
3271  			__run_delayed_extent_op(extent_op, leaf, ei);
3272  		/*
3273  		 * In the case of an inline back ref, the reference count will
3274  		 * be updated by remove_extent_backref
3275  		 */
3276  		if (iref) {
3277  			if (!found_extent) {
3278  				abort_and_dump(trans, path,
3279  "invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found, slot %u",
3280  					       path->slots[0]);
3281  				ret = -EUCLEAN;
3282  				goto out;
3283  			}
3284  		} else {
3285  			btrfs_set_extent_refs(leaf, ei, refs);
3286  			btrfs_mark_buffer_dirty(trans, leaf);
3287  		}
3288  		if (found_extent) {
3289  			ret = remove_extent_backref(trans, extent_root, path,
3290  						    iref, refs_to_drop, is_data);
3291  			if (ret) {
3292  				btrfs_abort_transaction(trans, ret);
3293  				goto out;
3294  			}
3295  		}
3296  	} else {
3297  		struct btrfs_squota_delta delta = {
3298  			.root = delayed_ref_root,
3299  			.num_bytes = num_bytes,
3300  			.is_data = is_data,
3301  			.is_inc = false,
3302  			.generation = btrfs_extent_generation(leaf, ei),
3303  		};
3304  
3305  		/* In this branch refs == 1 */
3306  		if (found_extent) {
3307  			if (is_data && refs_to_drop !=
3308  			    extent_data_ref_count(path, iref)) {
3309  				abort_and_dump(trans, path,
3310  		"invalid refs_to_drop, current refs %u refs_to_drop %u slot %u",
3311  					       extent_data_ref_count(path, iref),
3312  					       refs_to_drop, path->slots[0]);
3313  				ret = -EUCLEAN;
3314  				goto out;
3315  			}
3316  			if (iref) {
3317  				if (path->slots[0] != extent_slot) {
3318  					abort_and_dump(trans, path,
3319  "invalid iref, extent item key (%llu %u %llu) slot %u doesn't have wanted iref",
3320  						       key.objectid, key.type,
3321  						       key.offset, path->slots[0]);
3322  					ret = -EUCLEAN;
3323  					goto out;
3324  				}
3325  			} else {
3326  				/*
3327  				 * No inline ref, we must be at a SHARED_* item,
3328  				 * and since it's a single ref the layout must be:
3329  				 * |	extent_slot	  ||extent_slot + 1|
3330  				 * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ]
3331  				 */
3332  				if (path->slots[0] != extent_slot + 1) {
3333  					abort_and_dump(trans, path,
3334  	"invalid SHARED_* item slot %u, previous item is not EXTENT/METADATA_ITEM",
3335  						       path->slots[0]);
3336  					ret = -EUCLEAN;
3337  					goto out;
3338  				}
3339  				path->slots[0] = extent_slot;
3340  				num_to_del = 2;
3341  			}
3342  		}
3343  		/*
3344  		 * We can't infer the data owner from the delayed ref, so we need
3345  		 * to try to get it from the owning ref item.
3346  		 *
3347  		 * If it is not present, then that extent was not written under
3348  		 * simple quotas mode, so we don't need to account for its deletion.
3349  		 */
3350  		if (is_data)
3351  			delta.root = btrfs_get_extent_owner_root(trans->fs_info,
3352  								 leaf, extent_slot);
3353  
3354  		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
3355  				      num_to_del);
3356  		if (ret) {
3357  			btrfs_abort_transaction(trans, ret);
3358  			goto out;
3359  		}
3360  		btrfs_release_path(path);
3361  
3362  		ret = do_free_extent_accounting(trans, bytenr, &delta);
3363  	}
3364  	btrfs_release_path(path);
3365  
3366  out:
3367  	btrfs_free_path(path);
3368  	return ret;
3369  }
3370  
3371  /*
3372   * when we free a block, it is possible (and likely) that we free the last
3373   * delayed ref for that extent as well.  This searches the delayed ref tree for
3374   * a given extent, and if there are no other delayed refs to be processed, it
3375   * removes it from the tree.
3376   */
3377  static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
3378  				      u64 bytenr)
3379  {
3380  	struct btrfs_delayed_ref_head *head;
3381  	struct btrfs_delayed_ref_root *delayed_refs;
3382  	int ret = 0;
3383  
3384  	delayed_refs = &trans->transaction->delayed_refs;
3385  	spin_lock(&delayed_refs->lock);
3386  	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
3387  	if (!head)
3388  		goto out_delayed_unlock;
3389  
3390  	spin_lock(&head->lock);
3391  	if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
3392  		goto out;
3393  
3394  	if (cleanup_extent_op(head) != NULL)
3395  		goto out;
3396  
3397  	/*
3398  	 * waiting for the lock here would deadlock.  If someone else has it
3399  	 * locked they are already in the process of dropping it anyway
3400  	 */
3401  	if (!mutex_trylock(&head->mutex))
3402  		goto out;
3403  
3404  	btrfs_delete_ref_head(delayed_refs, head);
3405  	head->processing = false;
3406  
3407  	spin_unlock(&head->lock);
3408  	spin_unlock(&delayed_refs->lock);
3409  
3410  	BUG_ON(head->extent_op);
3411  	if (head->must_insert_reserved)
3412  		ret = 1;
3413  
3414  	btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
3415  	mutex_unlock(&head->mutex);
3416  	btrfs_put_delayed_ref_head(head);
3417  	return ret;
3418  out:
3419  	spin_unlock(&head->lock);
3420  
3421  out_delayed_unlock:
3422  	spin_unlock(&delayed_refs->lock);
3423  	return 0;
3424  }
3425  
3426  int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
3427  			  u64 root_id,
3428  			  struct extent_buffer *buf,
3429  			  u64 parent, int last_ref)
3430  {
3431  	struct btrfs_fs_info *fs_info = trans->fs_info;
3432  	struct btrfs_block_group *bg;
3433  	int ret;
3434  
3435  	if (root_id != BTRFS_TREE_LOG_OBJECTID) {
3436  		struct btrfs_ref generic_ref = {
3437  			.action = BTRFS_DROP_DELAYED_REF,
3438  			.bytenr = buf->start,
3439  			.num_bytes = buf->len,
3440  			.parent = parent,
3441  			.owning_root = btrfs_header_owner(buf),
3442  			.ref_root = root_id,
3443  		};
3444  
3445  		/*
3446  		 * Assert that the extent buffer is not cleared due to
3447  		 * EXTENT_BUFFER_ZONED_ZEROOUT. Please refer to
3448  		 * btrfs_clear_buffer_dirty() and btree_csum_one_bio() for
3449  		 * details.
3450  		 */
3451  		ASSERT(btrfs_header_bytenr(buf) != 0);
3452  
3453  		btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), 0, false);
3454  		btrfs_ref_tree_mod(fs_info, &generic_ref);
3455  		ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL);
3456  		if (ret < 0)
3457  			return ret;
3458  	}
3459  
3460  	if (!last_ref)
3461  		return 0;
3462  
3463  	if (btrfs_header_generation(buf) != trans->transid)
3464  		goto out;
3465  
3466  	if (root_id != BTRFS_TREE_LOG_OBJECTID) {
3467  		ret = check_ref_cleanup(trans, buf->start);
3468  		if (!ret)
3469  			goto out;
3470  	}
3471  
3472  	bg = btrfs_lookup_block_group(fs_info, buf->start);
3473  
3474  	if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
3475  		pin_down_extent(trans, bg, buf->start, buf->len, 1);
3476  		btrfs_put_block_group(bg);
3477  		goto out;
3478  	}
3479  
3480  	/*
3481  	 * If there are tree mod log users we may have recorded mod log
3482  	 * operations for this node.  If we re-allocate this node we
3483  	 * could replay operations on this node that happened when it
3484  	 * existed in a completely different root.  For example if it
3485  	 * was part of root A, then was reallocated to root B, and we
3486  	 * are doing a btrfs_old_search_slot(root b), we could replay
3487  	 * operations that happened when the block was part of root A,
3488  	 * giving us an inconsistent view of the btree.
3489  	 *
3490  	 * We are safe from races here because at this point no other
3491  	 * node or root points to this extent buffer, so if after this
3492  	 * check a new tree mod log user joins we will not have an
3493  	 * existing log of operations on this node that we have to
3494  	 * contend with.
3495  	 */
3496  
3497  	if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)
3498  		     || btrfs_is_zoned(fs_info)) {
3499  		pin_down_extent(trans, bg, buf->start, buf->len, 1);
3500  		btrfs_put_block_group(bg);
3501  		goto out;
3502  	}
3503  
3504  	WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
3505  
3506  	btrfs_add_free_space(bg, buf->start, buf->len);
3507  	btrfs_free_reserved_bytes(bg, buf->len, 0);
3508  	btrfs_put_block_group(bg);
3509  	trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
3510  
3511  out:
3512  
3513  	/*
3514  	 * Deleting the buffer, clear the corrupt flag since it doesn't
3515  	 * matter anymore.
3516  	 */
3517  	clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
3518  	return 0;
3519  }
3520  
3521  /* Can return -ENOMEM */
3522  int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
3523  {
3524  	struct btrfs_fs_info *fs_info = trans->fs_info;
3525  	int ret;
3526  
3527  	if (btrfs_is_testing(fs_info))
3528  		return 0;
3529  
3530  	/*
3531  	 * tree log blocks never actually go into the extent allocation
3532  	 * tree, just update pinning info and exit early.
3533  	 */
3534  	if (ref->ref_root == BTRFS_TREE_LOG_OBJECTID) {
3535  		btrfs_pin_extent(trans, ref->bytenr, ref->num_bytes, 1);
3536  		ret = 0;
3537  	} else if (ref->type == BTRFS_REF_METADATA) {
3538  		ret = btrfs_add_delayed_tree_ref(trans, ref, NULL);
3539  	} else {
3540  		ret = btrfs_add_delayed_data_ref(trans, ref, 0);
3541  	}
3542  
3543  	if (ref->ref_root != BTRFS_TREE_LOG_OBJECTID)
3544  		btrfs_ref_tree_mod(fs_info, ref);
3545  
3546  	return ret;
3547  }
3548  
3549  enum btrfs_loop_type {
3550  	/*
3551  	 * Start caching block groups but do not wait for progress or for them
3552  	 * to be done.
3553  	 */
3554  	LOOP_CACHING_NOWAIT,
3555  
3556  	/*
3557  	 * If the block group isn't cached yet, wait until its free_space is at
3558  	 * least the amount of space we're waiting for.
3559  	 */
3560  	LOOP_CACHING_WAIT,
3561  
3562  	/*
3563  	 * Allow allocations to happen from block groups that do not yet have a
3564  	 * size classification.
3565  	 */
3566  	LOOP_UNSET_SIZE_CLASS,
3567  
3568  	/*
3569  	 * Allocate a chunk and then retry the allocation.
3570  	 */
3571  	LOOP_ALLOC_CHUNK,
3572  
3573  	/*
3574  	 * Ignore the size class restrictions for this allocation.
3575  	 */
3576  	LOOP_WRONG_SIZE_CLASS,
3577  
3578  	/*
3579  	 * Ignore the empty size, only try to allocate the number of bytes
3580  	 * needed for this allocation.
3581  	 */
3582  	LOOP_NO_EMPTY_SIZE,
3583  };
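
/*
 * Illustrative sketch (not part of the allocator; the helper names below are
 * made up): find_free_extent() walks the stages above in order.  Each time a
 * full pass over the block groups of the current raid index finds nothing,
 * find_free_extent_update_loop() advances ffe_ctl->loop and the search is
 * retried, roughly:
 *
 *	for (loop = LOOP_CACHING_NOWAIT; loop <= LOOP_NO_EMPTY_SIZE; loop++) {
 *		if (search_all_block_groups() == 0)
 *			return 0;
 *		if (loop == LOOP_ALLOC_CHUNK)
 *			allocate_new_chunk();
 *	}
 *	return -ENOSPC;
 *
 * search_all_block_groups() and allocate_new_chunk() are hypothetical
 * stand-ins for the search loop in find_free_extent() and btrfs_chunk_alloc().
 */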
3584  
3585  static inline void
3586  btrfs_lock_block_group(struct btrfs_block_group *cache,
3587  		       int delalloc)
3588  {
3589  	if (delalloc)
3590  		down_read(&cache->data_rwsem);
3591  }
3592  
3593  static inline void btrfs_grab_block_group(struct btrfs_block_group *cache,
3594  		       int delalloc)
3595  {
3596  	btrfs_get_block_group(cache);
3597  	if (delalloc)
3598  		down_read(&cache->data_rwsem);
3599  }
3600  
3601  static struct btrfs_block_group *btrfs_lock_cluster(
3602  		   struct btrfs_block_group *block_group,
3603  		   struct btrfs_free_cluster *cluster,
3604  		   int delalloc)
3605  	__acquires(&cluster->refill_lock)
3606  {
3607  	struct btrfs_block_group *used_bg = NULL;
3608  
3609  	spin_lock(&cluster->refill_lock);
3610  	while (1) {
3611  		used_bg = cluster->block_group;
3612  		if (!used_bg)
3613  			return NULL;
3614  
3615  		if (used_bg == block_group)
3616  			return used_bg;
3617  
3618  		btrfs_get_block_group(used_bg);
3619  
3620  		if (!delalloc)
3621  			return used_bg;
3622  
3623  		if (down_read_trylock(&used_bg->data_rwsem))
3624  			return used_bg;
3625  
3626  		spin_unlock(&cluster->refill_lock);
3627  
3628  		/* We should only have one-level nested. */
3629  		down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);
3630  
3631  		spin_lock(&cluster->refill_lock);
3632  		if (used_bg == cluster->block_group)
3633  			return used_bg;
3634  
3635  		up_read(&used_bg->data_rwsem);
3636  		btrfs_put_block_group(used_bg);
3637  	}
3638  }
3639  
3640  static inline void
3641  btrfs_release_block_group(struct btrfs_block_group *cache,
3642  			 int delalloc)
3643  {
3644  	if (delalloc)
3645  		up_read(&cache->data_rwsem);
3646  	btrfs_put_block_group(cache);
3647  }
3648  
3649  /*
3650   * Helper function for find_free_extent().
3651   *
3652   * Return -ENOENT to inform the caller that we need to fall back to unclustered mode.
3653   * Return >0 to inform the caller that we found nothing.
3654   * Return 0 when we have found a location and set ffe_ctl->found_offset.
3655   */
3656  static int find_free_extent_clustered(struct btrfs_block_group *bg,
3657  				      struct find_free_extent_ctl *ffe_ctl,
3658  				      struct btrfs_block_group **cluster_bg_ret)
3659  {
3660  	struct btrfs_block_group *cluster_bg;
3661  	struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3662  	u64 aligned_cluster;
3663  	u64 offset;
3664  	int ret;
3665  
3666  	cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
3667  	if (!cluster_bg)
3668  		goto refill_cluster;
3669  	if (cluster_bg != bg && (cluster_bg->ro ||
3670  	    !block_group_bits(cluster_bg, ffe_ctl->flags)))
3671  		goto release_cluster;
3672  
3673  	offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
3674  			ffe_ctl->num_bytes, cluster_bg->start,
3675  			&ffe_ctl->max_extent_size);
3676  	if (offset) {
3677  		/* We have a block, we're done */
3678  		spin_unlock(&last_ptr->refill_lock);
3679  		trace_btrfs_reserve_extent_cluster(cluster_bg, ffe_ctl);
3680  		*cluster_bg_ret = cluster_bg;
3681  		ffe_ctl->found_offset = offset;
3682  		return 0;
3683  	}
3684  	WARN_ON(last_ptr->block_group != cluster_bg);
3685  
3686  release_cluster:
3687  	/*
3688  	 * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new cluster, so
3689  	 * let's just skip it and let the allocator find whatever block it can
3690  	 * find. If we reach this point, we will have tried the cluster
3691  	 * allocator plenty of times and not have found anything, so we are
3692  	 * likely way too fragmented for the clustering stuff to find anything.
3693  	 *
3694  	 * However, if the cluster is taken from the current block group,
3695  	 * release the cluster first, so that we stand a better chance of
3696  	 * succeeding in the unclustered allocation.
3697  	 */
3698  	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
3699  		spin_unlock(&last_ptr->refill_lock);
3700  		btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
3701  		return -ENOENT;
3702  	}
3703  
3704  	/* This cluster didn't work out, free it and start over */
3705  	btrfs_return_cluster_to_free_space(NULL, last_ptr);
3706  
3707  	if (cluster_bg != bg)
3708  		btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
3709  
3710  refill_cluster:
3711  	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
3712  		spin_unlock(&last_ptr->refill_lock);
3713  		return -ENOENT;
3714  	}
3715  
3716  	aligned_cluster = max_t(u64,
3717  			ffe_ctl->empty_cluster + ffe_ctl->empty_size,
3718  			bg->full_stripe_len);
3719  	ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start,
3720  			ffe_ctl->num_bytes, aligned_cluster);
3721  	if (ret == 0) {
3722  		/* Now pull our allocation out of this cluster */
3723  		offset = btrfs_alloc_from_cluster(bg, last_ptr,
3724  				ffe_ctl->num_bytes, ffe_ctl->search_start,
3725  				&ffe_ctl->max_extent_size);
3726  		if (offset) {
3727  			/* We found one, proceed */
3728  			spin_unlock(&last_ptr->refill_lock);
3729  			ffe_ctl->found_offset = offset;
3730  			trace_btrfs_reserve_extent_cluster(bg, ffe_ctl);
3731  			return 0;
3732  		}
3733  	}
3734  	/*
3735  	 * At this point we either didn't find a cluster or we weren't able to
3736  	 * allocate a block from our cluster.  Free the cluster we've been
3737  	 * trying to use, and go to the next block group.
3738  	 */
3739  	btrfs_return_cluster_to_free_space(NULL, last_ptr);
3740  	spin_unlock(&last_ptr->refill_lock);
3741  	return 1;
3742  }
3743  
3744  /*
3745   * Return >0 to inform the caller that we found nothing
3746   * Return 0 when we found a free extent and set ffe_ctl->found_offset
3747   */
3748  static int find_free_extent_unclustered(struct btrfs_block_group *bg,
3749  					struct find_free_extent_ctl *ffe_ctl)
3750  {
3751  	struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3752  	u64 offset;
3753  
3754  	/*
3755  	 * We are doing an unclustered allocation, set the fragmented flag so
3756  	 * we don't bother trying to setup a cluster again until we get more
3757  	 * space.
3758  	 */
3759  	if (unlikely(last_ptr)) {
3760  		spin_lock(&last_ptr->lock);
3761  		last_ptr->fragmented = 1;
3762  		spin_unlock(&last_ptr->lock);
3763  	}
3764  	if (ffe_ctl->cached) {
3765  		struct btrfs_free_space_ctl *free_space_ctl;
3766  
3767  		free_space_ctl = bg->free_space_ctl;
3768  		spin_lock(&free_space_ctl->tree_lock);
3769  		if (free_space_ctl->free_space <
3770  		    ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
3771  		    ffe_ctl->empty_size) {
3772  			ffe_ctl->total_free_space = max_t(u64,
3773  					ffe_ctl->total_free_space,
3774  					free_space_ctl->free_space);
3775  			spin_unlock(&free_space_ctl->tree_lock);
3776  			return 1;
3777  		}
3778  		spin_unlock(&free_space_ctl->tree_lock);
3779  	}
3780  
3781  	offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
3782  			ffe_ctl->num_bytes, ffe_ctl->empty_size,
3783  			&ffe_ctl->max_extent_size);
3784  	if (!offset)
3785  		return 1;
3786  	ffe_ctl->found_offset = offset;
3787  	return 0;
3788  }
3789  
3790  static int do_allocation_clustered(struct btrfs_block_group *block_group,
3791  				   struct find_free_extent_ctl *ffe_ctl,
3792  				   struct btrfs_block_group **bg_ret)
3793  {
3794  	int ret;
3795  
3796  	/* We want to try and use the cluster allocator, so let's look there */
3797  	if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) {
3798  		ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret);
3799  		if (ret >= 0)
3800  			return ret;
3801  		/* ret == -ENOENT case falls through */
3802  	}
3803  
3804  	return find_free_extent_unclustered(block_group, ffe_ctl);
3805  }
3806  
3807  /*
3808   * Tree-log block group locking
3809   * ============================
3810   *
3811   * fs_info::treelog_bg_lock protects the fs_info::treelog_bg which
3812   * indicates the starting address of a block group, which is reserved only
3813   * for tree-log metadata.
3814   *
3815   * Lock nesting
3816   * ============
3817   *
3818   * space_info::lock
3819   *   block_group::lock
3820   *     fs_info::treelog_bg_lock
3821   */
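
/*
 * Illustrative sketch of the nesting order above, mirroring the order in
 * which do_allocation_zoned() takes and releases these locks when it needs
 * all of them at once:
 *
 *	spin_lock(&space_info->lock);
 *	spin_lock(&block_group->lock);
 *	spin_lock(&fs_info->treelog_bg_lock);
 *	spin_lock(&fs_info->relocation_bg_lock);
 *	...
 *	spin_unlock(&fs_info->relocation_bg_lock);
 *	spin_unlock(&fs_info->treelog_bg_lock);
 *	spin_unlock(&block_group->lock);
 *	spin_unlock(&space_info->lock);
 */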
3822  
3823  /*
3824   * Simple allocator for sequential-only block group. It only allows sequential
3825   * allocation. No need to play with trees. This function also reserves the
3826   * bytes as in btrfs_add_reserved_bytes.
3827   */
3828  static int do_allocation_zoned(struct btrfs_block_group *block_group,
3829  			       struct find_free_extent_ctl *ffe_ctl,
3830  			       struct btrfs_block_group **bg_ret)
3831  {
3832  	struct btrfs_fs_info *fs_info = block_group->fs_info;
3833  	struct btrfs_space_info *space_info = block_group->space_info;
3834  	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3835  	u64 start = block_group->start;
3836  	u64 num_bytes = ffe_ctl->num_bytes;
3837  	u64 avail;
3838  	u64 bytenr = block_group->start;
3839  	u64 log_bytenr;
3840  	u64 data_reloc_bytenr;
3841  	int ret = 0;
3842  	bool skip = false;
3843  
3844  	ASSERT(btrfs_is_zoned(block_group->fs_info));
3845  
3846  	/*
3847  	 * Do not allow non-tree-log blocks in the dedicated tree-log block
3848  	 * group, and vice versa.
3849  	 */
3850  	spin_lock(&fs_info->treelog_bg_lock);
3851  	log_bytenr = fs_info->treelog_bg;
3852  	if (log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) ||
3853  			   (!ffe_ctl->for_treelog && bytenr == log_bytenr)))
3854  		skip = true;
3855  	spin_unlock(&fs_info->treelog_bg_lock);
3856  	if (skip)
3857  		return 1;
3858  
3859  	/*
3860  	 * Do not allow non-relocation blocks in the dedicated relocation block
3861  	 * group, and vice versa.
3862  	 */
3863  	spin_lock(&fs_info->relocation_bg_lock);
3864  	data_reloc_bytenr = fs_info->data_reloc_bg;
3865  	if (data_reloc_bytenr &&
3866  	    ((ffe_ctl->for_data_reloc && bytenr != data_reloc_bytenr) ||
3867  	     (!ffe_ctl->for_data_reloc && bytenr == data_reloc_bytenr)))
3868  		skip = true;
3869  	spin_unlock(&fs_info->relocation_bg_lock);
3870  	if (skip)
3871  		return 1;
3872  
3873  	/* Check RO and no space case before trying to activate it */
3874  	spin_lock(&block_group->lock);
3875  	if (block_group->ro || btrfs_zoned_bg_is_full(block_group)) {
3876  		ret = 1;
3877  		/*
3878  		 * May need to clear fs_info->{treelog,data_reloc}_bg.
3879  		 * Return the error after taking the locks.
3880  		 */
3881  	}
3882  	spin_unlock(&block_group->lock);
3883  
3884  	/* Metadata block group is activated at write time. */
3885  	if (!ret && (block_group->flags & BTRFS_BLOCK_GROUP_DATA) &&
3886  	    !btrfs_zone_activate(block_group)) {
3887  		ret = 1;
3888  		/*
3889  		 * May need to clear fs_info->{treelog,data_reloc}_bg.
3890  		 * Return the error after taking the locks.
3891  		 */
3892  	}
3893  
3894  	spin_lock(&space_info->lock);
3895  	spin_lock(&block_group->lock);
3896  	spin_lock(&fs_info->treelog_bg_lock);
3897  	spin_lock(&fs_info->relocation_bg_lock);
3898  
3899  	if (ret)
3900  		goto out;
3901  
3902  	ASSERT(!ffe_ctl->for_treelog ||
3903  	       block_group->start == fs_info->treelog_bg ||
3904  	       fs_info->treelog_bg == 0);
3905  	ASSERT(!ffe_ctl->for_data_reloc ||
3906  	       block_group->start == fs_info->data_reloc_bg ||
3907  	       fs_info->data_reloc_bg == 0);
3908  
3909  	if (block_group->ro ||
3910  	    (!ffe_ctl->for_data_reloc &&
3911  	     test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))) {
3912  		ret = 1;
3913  		goto out;
3914  	}
3915  
3916  	/*
3917  	 * Do not allow a currently used block group to become the dedicated
3918  	 * tree-log block group.
3919  	 */
3920  	if (ffe_ctl->for_treelog && !fs_info->treelog_bg &&
3921  	    (block_group->used || block_group->reserved)) {
3922  		ret = 1;
3923  		goto out;
3924  	}
3925  
3926  	/*
3927  	 * Do not allow a currently used block group to become the dedicated
3928  	 * data relocation block group.
3929  	 */
3930  	if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg &&
3931  	    (block_group->used || block_group->reserved)) {
3932  		ret = 1;
3933  		goto out;
3934  	}
3935  
3936  	WARN_ON_ONCE(block_group->alloc_offset > block_group->zone_capacity);
3937  	avail = block_group->zone_capacity - block_group->alloc_offset;
3938  	if (avail < num_bytes) {
3939  		if (ffe_ctl->max_extent_size < avail) {
3940  			/*
3941  			 * With the sequential allocator, free space is always
3942  			 * contiguous
3943  			 */
3944  			ffe_ctl->max_extent_size = avail;
3945  			ffe_ctl->total_free_space = avail;
3946  		}
3947  		ret = 1;
3948  		goto out;
3949  	}
3950  
3951  	if (ffe_ctl->for_treelog && !fs_info->treelog_bg)
3952  		fs_info->treelog_bg = block_group->start;
3953  
3954  	if (ffe_ctl->for_data_reloc) {
3955  		if (!fs_info->data_reloc_bg)
3956  			fs_info->data_reloc_bg = block_group->start;
3957  		/*
3958  		 * Do not allow allocations from this block group, unless they
3959  		 * are for data relocation. Compared to increasing ->ro, setting
3960  		 * the BLOCK_GROUP_FLAG_ZONED_DATA_RELOC flag still allows nocow
3961  		 * writers to come in. See btrfs_inc_nocow_writers().
3962  		 *
3963  		 * We need to disable allocations here to avoid allocating a
3964  		 * regular (non-relocation) data extent. With a mix of relocation
3965  		 * extents and regular extents, we could dispatch WRITE commands
3966  		 * (for relocation extents) and ZONE APPEND commands (for
3967  		 * regular extents) to the same zone at the same time, which
3968  		 * would easily break the write pointer.
3969  		 *
3970  		 * Also, this flag prevents this block group from being zone finished.
3971  		 */
3972  		set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags);
3973  	}
3974  
3975  	ffe_ctl->found_offset = start + block_group->alloc_offset;
3976  	block_group->alloc_offset += num_bytes;
3977  	spin_lock(&ctl->tree_lock);
3978  	ctl->free_space -= num_bytes;
3979  	spin_unlock(&ctl->tree_lock);
3980  
3981  	/*
3982  	 * We do not check if found_offset is aligned to stripesize. The
3983  	 * address is anyway rewritten when using zone append writing.
3984  	 */
3985  
3986  	ffe_ctl->search_start = ffe_ctl->found_offset;
3987  
3988  out:
3989  	if (ret && ffe_ctl->for_treelog)
3990  		fs_info->treelog_bg = 0;
3991  	if (ret && ffe_ctl->for_data_reloc)
3992  		fs_info->data_reloc_bg = 0;
3993  	spin_unlock(&fs_info->relocation_bg_lock);
3994  	spin_unlock(&fs_info->treelog_bg_lock);
3995  	spin_unlock(&block_group->lock);
3996  	spin_unlock(&space_info->lock);
3997  	return ret;
3998  }
3999  
4000  static int do_allocation(struct btrfs_block_group *block_group,
4001  			 struct find_free_extent_ctl *ffe_ctl,
4002  			 struct btrfs_block_group **bg_ret)
4003  {
4004  	switch (ffe_ctl->policy) {
4005  	case BTRFS_EXTENT_ALLOC_CLUSTERED:
4006  		return do_allocation_clustered(block_group, ffe_ctl, bg_ret);
4007  	case BTRFS_EXTENT_ALLOC_ZONED:
4008  		return do_allocation_zoned(block_group, ffe_ctl, bg_ret);
4009  	default:
4010  		BUG();
4011  	}
4012  }
4013  
4014  static void release_block_group(struct btrfs_block_group *block_group,
4015  				struct find_free_extent_ctl *ffe_ctl,
4016  				int delalloc)
4017  {
4018  	switch (ffe_ctl->policy) {
4019  	case BTRFS_EXTENT_ALLOC_CLUSTERED:
4020  		ffe_ctl->retry_uncached = false;
4021  		break;
4022  	case BTRFS_EXTENT_ALLOC_ZONED:
4023  		/* Nothing to do */
4024  		break;
4025  	default:
4026  		BUG();
4027  	}
4028  
4029  	BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
4030  	       ffe_ctl->index);
4031  	btrfs_release_block_group(block_group, delalloc);
4032  }
4033  
4034  static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl,
4035  				   struct btrfs_key *ins)
4036  {
4037  	struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
4038  
4039  	if (!ffe_ctl->use_cluster && last_ptr) {
4040  		spin_lock(&last_ptr->lock);
4041  		last_ptr->window_start = ins->objectid;
4042  		spin_unlock(&last_ptr->lock);
4043  	}
4044  }
4045  
4046  static void found_extent(struct find_free_extent_ctl *ffe_ctl,
4047  			 struct btrfs_key *ins)
4048  {
4049  	switch (ffe_ctl->policy) {
4050  	case BTRFS_EXTENT_ALLOC_CLUSTERED:
4051  		found_extent_clustered(ffe_ctl, ins);
4052  		break;
4053  	case BTRFS_EXTENT_ALLOC_ZONED:
4054  		/* Nothing to do */
4055  		break;
4056  	default:
4057  		BUG();
4058  	}
4059  }
4060  
4061  static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info,
4062  				    struct find_free_extent_ctl *ffe_ctl)
4063  {
4064  	/* Block group's activeness is not a requirement for METADATA block groups. */
4065  	if (!(ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA))
4066  		return 0;
4067  
4068  	/* If we can activate a new zone, just allocate a chunk and use it */
4069  	if (btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags))
4070  		return 0;
4071  
4072  	/*
4073  	 * We already reached the max active zones. Try to finish one block
4074  	 * group to make room for a new block group. This is only possible
4075  	 * for a data block group because btrfs_zone_finish() may need to wait
4076  	 * for a running transaction which can cause a deadlock for metadata
4077  	 * allocation.
4078  	 */
4079  	if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) {
4080  		int ret = btrfs_zone_finish_one_bg(fs_info);
4081  
4082  		if (ret == 1)
4083  			return 0;
4084  		else if (ret < 0)
4085  			return ret;
4086  	}
4087  
4088  	/*
4089  	 * If we have enough free space left in an already active block group
4090  	 * and we can't activate any other zone now, do not allow allocating a
4091  	 * new chunk and let find_free_extent() retry with a smaller size.
4092  	 */
4093  	if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size)
4094  		return -ENOSPC;
4095  
4096  	/*
4097  	 * Not even min_alloc_size is left in any block group. Since we cannot
4098  	 * activate a new block group, allocating a new one may not help. Tell
4099  	 * the caller to try again and hope it makes progress by writing out
4100  	 * some parts of the region. That is only possible for data block
4101  	 * groups, where a part of the region can be written.
4102  	 */
4103  	if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA)
4104  		return -EAGAIN;
4105  
4106  	/*
4107  	 * We cannot activate a new block group and there is not enough space
4108  	 * left in any block group. So, allocating a new block group may not
4109  	 * help. But there is nothing else to do anyway, so let's go with it.
4110  	 */
4111  	return 0;
4112  }
4113  
4114  static int can_allocate_chunk(struct btrfs_fs_info *fs_info,
4115  			      struct find_free_extent_ctl *ffe_ctl)
4116  {
4117  	switch (ffe_ctl->policy) {
4118  	case BTRFS_EXTENT_ALLOC_CLUSTERED:
4119  		return 0;
4120  	case BTRFS_EXTENT_ALLOC_ZONED:
4121  		return can_allocate_chunk_zoned(fs_info, ffe_ctl);
4122  	default:
4123  		BUG();
4124  	}
4125  }
4126  
4127  /*
4128   * Return >0 means the caller needs to re-search for a free extent.
4129   * Return 0 means we have the needed free extent.
4130   * Return <0 means we failed to locate any free extent.
4131   */
4132  static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
4133  					struct btrfs_key *ins,
4134  					struct find_free_extent_ctl *ffe_ctl,
4135  					bool full_search)
4136  {
4137  	struct btrfs_root *root = fs_info->chunk_root;
4138  	int ret;
4139  
4140  	if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
4141  	    ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
4142  		ffe_ctl->orig_have_caching_bg = true;
4143  
4144  	if (ins->objectid) {
4145  		found_extent(ffe_ctl, ins);
4146  		return 0;
4147  	}
4148  
4149  	if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
4150  		return 1;
4151  
4152  	ffe_ctl->index++;
4153  	if (ffe_ctl->index < BTRFS_NR_RAID_TYPES)
4154  		return 1;
4155  
4156  	/* See the comments for btrfs_loop_type for an explanation of the phases. */
4157  	if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
4158  		ffe_ctl->index = 0;
4159  		/*
4160  		 * We want to skip the LOOP_CACHING_WAIT step if we don't have
4161  		 * any uncached bgs and we've already done a full search
4162  		 * through.
4163  		 */
4164  		if (ffe_ctl->loop == LOOP_CACHING_NOWAIT &&
4165  		    (!ffe_ctl->orig_have_caching_bg && full_search))
4166  			ffe_ctl->loop++;
4167  		ffe_ctl->loop++;
4168  
4169  		if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
4170  			struct btrfs_trans_handle *trans;
4171  			int exist = 0;
4172  
4173  			/* Check if the allocation policy allows creating a new chunk */
4174  			ret = can_allocate_chunk(fs_info, ffe_ctl);
4175  			if (ret)
4176  				return ret;
4177  
4178  			trans = current->journal_info;
4179  			if (trans)
4180  				exist = 1;
4181  			else
4182  				trans = btrfs_join_transaction(root);
4183  
4184  			if (IS_ERR(trans)) {
4185  				ret = PTR_ERR(trans);
4186  				return ret;
4187  			}
4188  
4189  			ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
4190  						CHUNK_ALLOC_FORCE_FOR_EXTENT);
4191  
4192  			/* Do not bail out on ENOSPC since we can do more. */
4193  			if (ret == -ENOSPC) {
4194  				ret = 0;
4195  				ffe_ctl->loop++;
4196  			} else if (ret < 0)
4198  				btrfs_abort_transaction(trans, ret);
4199  			else
4200  				ret = 0;
4201  			if (!exist)
4202  				btrfs_end_transaction(trans);
4203  			if (ret)
4204  				return ret;
4205  		}
4206  
4207  		if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
4208  			if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED)
4209  				return -ENOSPC;
4210  
4211  			/*
4212  			 * Don't loop again if we already have no empty_size and
4213  			 * no empty_cluster.
4214  			 */
4215  			if (ffe_ctl->empty_size == 0 &&
4216  			    ffe_ctl->empty_cluster == 0)
4217  				return -ENOSPC;
4218  			ffe_ctl->empty_size = 0;
4219  			ffe_ctl->empty_cluster = 0;
4220  		}
4221  		return 1;
4222  	}
4223  	return -ENOSPC;
4224  }
4225  
4226  static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_ctl,
4227  					      struct btrfs_block_group *bg)
4228  {
4229  	if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED)
4230  		return true;
4231  	if (!btrfs_block_group_should_use_size_class(bg))
4232  		return true;
4233  	if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS)
4234  		return true;
4235  	if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS &&
4236  	    bg->size_class == BTRFS_BG_SZ_NONE)
4237  		return true;
4238  	return ffe_ctl->size_class == bg->size_class;
4239  }
4240  
4241  static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
4242  					struct find_free_extent_ctl *ffe_ctl,
4243  					struct btrfs_space_info *space_info,
4244  					struct btrfs_key *ins)
4245  {
4246  	/*
4247  	 * If our free space is heavily fragmented we may not be able to make
4248  	 * big contiguous allocations, so instead of doing the expensive search
4249  	 * for free space, simply return ENOSPC with our max_extent_size so we
4250  	 * can go ahead and search for a more manageable chunk.
4251  	 *
4252  	 * If our max_extent_size is large enough for our allocation simply
4253  	 * disable clustering since we will likely not be able to find enough
4254  	 * space to create a cluster and induce latency trying.
4255  	 */
4256  	if (space_info->max_extent_size) {
4257  		spin_lock(&space_info->lock);
4258  		if (space_info->max_extent_size &&
4259  		    ffe_ctl->num_bytes > space_info->max_extent_size) {
4260  			ins->offset = space_info->max_extent_size;
4261  			spin_unlock(&space_info->lock);
4262  			return -ENOSPC;
4263  		} else if (space_info->max_extent_size) {
4264  			ffe_ctl->use_cluster = false;
4265  		}
4266  		spin_unlock(&space_info->lock);
4267  	}
4268  
4269  	ffe_ctl->last_ptr = fetch_cluster_info(fs_info, space_info,
4270  					       &ffe_ctl->empty_cluster);
4271  	if (ffe_ctl->last_ptr) {
4272  		struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
4273  
4274  		spin_lock(&last_ptr->lock);
4275  		if (last_ptr->block_group)
4276  			ffe_ctl->hint_byte = last_ptr->window_start;
4277  		if (last_ptr->fragmented) {
4278  			/*
4279  			 * We still set window_start so we can keep track of the
4280  			 * last place we found an allocation to try and save
4281  			 * some time.
4282  			 */
4283  			ffe_ctl->hint_byte = last_ptr->window_start;
4284  			ffe_ctl->use_cluster = false;
4285  		}
4286  		spin_unlock(&last_ptr->lock);
4287  	}
4288  
4289  	return 0;
4290  }
4291  
4292  static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info,
4293  				    struct find_free_extent_ctl *ffe_ctl)
4294  {
4295  	if (ffe_ctl->for_treelog) {
4296  		spin_lock(&fs_info->treelog_bg_lock);
4297  		if (fs_info->treelog_bg)
4298  			ffe_ctl->hint_byte = fs_info->treelog_bg;
4299  		spin_unlock(&fs_info->treelog_bg_lock);
4300  	} else if (ffe_ctl->for_data_reloc) {
4301  		spin_lock(&fs_info->relocation_bg_lock);
4302  		if (fs_info->data_reloc_bg)
4303  			ffe_ctl->hint_byte = fs_info->data_reloc_bg;
4304  		spin_unlock(&fs_info->relocation_bg_lock);
4305  	} else if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) {
4306  		struct btrfs_block_group *block_group;
4307  
4308  		spin_lock(&fs_info->zone_active_bgs_lock);
4309  		list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) {
4310  			/*
4311  			 * No lock is needed here because avail is monotonically
4312  			 * decreasing, and this is just a hint.
4313  			 */
4314  			u64 avail = block_group->zone_capacity - block_group->alloc_offset;
4315  
4316  			if (block_group_bits(block_group, ffe_ctl->flags) &&
4317  			    avail >= ffe_ctl->num_bytes) {
4318  				ffe_ctl->hint_byte = block_group->start;
4319  				break;
4320  			}
4321  		}
4322  		spin_unlock(&fs_info->zone_active_bgs_lock);
4323  	}
4324  
4325  	return 0;
4326  }
4327  
4328  static int prepare_allocation(struct btrfs_fs_info *fs_info,
4329  			      struct find_free_extent_ctl *ffe_ctl,
4330  			      struct btrfs_space_info *space_info,
4331  			      struct btrfs_key *ins)
4332  {
4333  	switch (ffe_ctl->policy) {
4334  	case BTRFS_EXTENT_ALLOC_CLUSTERED:
4335  		return prepare_allocation_clustered(fs_info, ffe_ctl,
4336  						    space_info, ins);
4337  	case BTRFS_EXTENT_ALLOC_ZONED:
4338  		return prepare_allocation_zoned(fs_info, ffe_ctl);
4339  	default:
4340  		BUG();
4341  	}
4342  }
4343  
4344  /*
4345   * walks the btree of allocated extents and finds a hole of a given size.
4346   * The key ins is changed to record the hole:
4347   * ins->objectid == start position
4348   * ins->type == BTRFS_EXTENT_ITEM_KEY
4349   * ins->offset == the size of the hole.
4350   * Any available blocks before search_start are skipped.
4351   *
4352   * If there is no suitable free space, we will record the max size of
4353   * the free space extent currently.
4354   *
4355   * The overall logic and call chain:
4356   *
4357   * find_free_extent()
4358   * |- Iterate through all block groups
4359   * |  |- Get a valid block group
4360   * |  |- Try to do clustered allocation in that block group
4361   * |  |- Try to do unclustered allocation in that block group
4362   * |  |- Check if the result is valid
4363   * |  |  |- If valid, then exit
4364   * |  |- Jump to next block group
4365   * |
4366   * |- Push harder to find free extents
4367   *    |- If not found, re-iterate all block groups
4368   */
4369  static noinline int find_free_extent(struct btrfs_root *root,
4370  				     struct btrfs_key *ins,
4371  				     struct find_free_extent_ctl *ffe_ctl)
4372  {
4373  	struct btrfs_fs_info *fs_info = root->fs_info;
4374  	int ret = 0;
4375  	int cache_block_group_error = 0;
4376  	struct btrfs_block_group *block_group = NULL;
4377  	struct btrfs_space_info *space_info;
4378  	bool full_search = false;
4379  
4380  	WARN_ON(ffe_ctl->num_bytes < fs_info->sectorsize);
4381  
4382  	ffe_ctl->search_start = 0;
4383  	/* For clustered allocation */
4384  	ffe_ctl->empty_cluster = 0;
4385  	ffe_ctl->last_ptr = NULL;
4386  	ffe_ctl->use_cluster = true;
4387  	ffe_ctl->have_caching_bg = false;
4388  	ffe_ctl->orig_have_caching_bg = false;
4389  	ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags);
4390  	ffe_ctl->loop = 0;
4391  	ffe_ctl->retry_uncached = false;
4392  	ffe_ctl->cached = 0;
4393  	ffe_ctl->max_extent_size = 0;
4394  	ffe_ctl->total_free_space = 0;
4395  	ffe_ctl->found_offset = 0;
4396  	ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
4397  	ffe_ctl->size_class = btrfs_calc_block_group_size_class(ffe_ctl->num_bytes);
4398  
4399  	if (btrfs_is_zoned(fs_info))
4400  		ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED;
4401  
4402  	ins->type = BTRFS_EXTENT_ITEM_KEY;
4403  	ins->objectid = 0;
4404  	ins->offset = 0;
4405  
4406  	trace_find_free_extent(root, ffe_ctl);
4407  
4408  	space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags);
4409  	if (!space_info) {
4410  		btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags);
4411  		return -ENOSPC;
4412  	}
4413  
4414  	ret = prepare_allocation(fs_info, ffe_ctl, space_info, ins);
4415  	if (ret < 0)
4416  		return ret;
4417  
4418  	ffe_ctl->search_start = max(ffe_ctl->search_start,
4419  				    first_logical_byte(fs_info));
4420  	ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte);
4421  	if (ffe_ctl->search_start == ffe_ctl->hint_byte) {
4422  		block_group = btrfs_lookup_block_group(fs_info,
4423  						       ffe_ctl->search_start);
4424  		/*
4425  		 * we don't want to use the block group if it doesn't match our
4426  		 * allocation bits, or if it's not cached.
4427  		 *
4428  		 * However if we are re-searching with an ideal block group
4429  		 * picked out then we don't care that the block group is cached.
4430  		 */
4431  		if (block_group && block_group_bits(block_group, ffe_ctl->flags) &&
4432  		    block_group->cached != BTRFS_CACHE_NO) {
4433  			down_read(&space_info->groups_sem);
4434  			if (list_empty(&block_group->list) ||
4435  			    block_group->ro) {
4436  				/*
4437  				 * someone is removing this block group,
4438  				 * we can't jump into the have_block_group
4439  				 * target because our list pointers are not
4440  				 * valid
4441  				 */
4442  				btrfs_put_block_group(block_group);
4443  				up_read(&space_info->groups_sem);
4444  			} else {
4445  				ffe_ctl->index = btrfs_bg_flags_to_raid_index(
4446  							block_group->flags);
4447  				btrfs_lock_block_group(block_group,
4448  						       ffe_ctl->delalloc);
4449  				ffe_ctl->hinted = true;
4450  				goto have_block_group;
4451  			}
4452  		} else if (block_group) {
4453  			btrfs_put_block_group(block_group);
4454  		}
4455  	}
4456  search:
4457  	trace_find_free_extent_search_loop(root, ffe_ctl);
4458  	ffe_ctl->have_caching_bg = false;
4459  	if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) ||
4460  	    ffe_ctl->index == 0)
4461  		full_search = true;
4462  	down_read(&space_info->groups_sem);
4463  	list_for_each_entry(block_group,
4464  			    &space_info->block_groups[ffe_ctl->index], list) {
4465  		struct btrfs_block_group *bg_ret;
4466  
4467  		ffe_ctl->hinted = false;
4468  		/* If the block group is read-only, we can skip it entirely. */
4469  		if (unlikely(block_group->ro)) {
4470  			if (ffe_ctl->for_treelog)
4471  				btrfs_clear_treelog_bg(block_group);
4472  			if (ffe_ctl->for_data_reloc)
4473  				btrfs_clear_data_reloc_bg(block_group);
4474  			continue;
4475  		}
4476  
4477  		btrfs_grab_block_group(block_group, ffe_ctl->delalloc);
4478  		ffe_ctl->search_start = block_group->start;
4479  
4480  		/*
4481  		 * this can happen if we end up cycling through all the
4482  		 * raid types, but we want to make sure we only allocate
4483  		 * for the proper type.
4484  		 */
4485  		if (!block_group_bits(block_group, ffe_ctl->flags)) {
4486  			u64 extra = BTRFS_BLOCK_GROUP_DUP |
4487  				BTRFS_BLOCK_GROUP_RAID1_MASK |
4488  				BTRFS_BLOCK_GROUP_RAID56_MASK |
4489  				BTRFS_BLOCK_GROUP_RAID10;
4490  
4491  			/*
4492  			 * if they asked for extra copies and this block group
4493  			 * doesn't provide them, bail.  This does allow us to
4494  			 * fill raid0 from raid1.
4495  			 */
4496  			if ((ffe_ctl->flags & extra) && !(block_group->flags & extra))
4497  				goto loop;
4498  
4499  			/*
4500  			 * This block group has different flags than we want.
4501  			 * It's possible that we have MIXED_GROUP flag but no
4502  			 * block group is mixed.  Just skip such a block group.
4503  			 */
4504  			btrfs_release_block_group(block_group, ffe_ctl->delalloc);
4505  			continue;
4506  		}
4507  
4508  have_block_group:
4509  		trace_find_free_extent_have_block_group(root, ffe_ctl, block_group);
4510  		ffe_ctl->cached = btrfs_block_group_done(block_group);
4511  		if (unlikely(!ffe_ctl->cached)) {
4512  			ffe_ctl->have_caching_bg = true;
4513  			ret = btrfs_cache_block_group(block_group, false);
4514  
4515  			/*
4516  			 * If we get ENOMEM here or something else we want to
4517  			 * try other block groups, because it may not be fatal.
4518  			 * However if we can't find anything else we need to
4519  			 * save our return here so that we return the actual
4520  			 * error that caused problems, not ENOSPC.
4521  			 */
4522  			if (ret < 0) {
4523  				if (!cache_block_group_error)
4524  					cache_block_group_error = ret;
4525  				ret = 0;
4526  				goto loop;
4527  			}
4528  			ret = 0;
4529  		}
4530  
4531  		if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) {
4532  			if (!cache_block_group_error)
4533  				cache_block_group_error = -EIO;
4534  			goto loop;
4535  		}
4536  
4537  		if (!find_free_extent_check_size_class(ffe_ctl, block_group))
4538  			goto loop;
4539  
4540  		bg_ret = NULL;
4541  		ret = do_allocation(block_group, ffe_ctl, &bg_ret);
4542  		if (ret > 0)
4543  			goto loop;
4544  
4545  		if (bg_ret && bg_ret != block_group) {
4546  			btrfs_release_block_group(block_group, ffe_ctl->delalloc);
4547  			block_group = bg_ret;
4548  		}
4549  
4550  		/* Checks */
4551  		ffe_ctl->search_start = round_up(ffe_ctl->found_offset,
4552  						 fs_info->stripesize);
4553  
4554  		/* move on to the next group */
4555  		if (ffe_ctl->search_start + ffe_ctl->num_bytes >
4556  		    block_group->start + block_group->length) {
4557  			btrfs_add_free_space_unused(block_group,
4558  					    ffe_ctl->found_offset,
4559  					    ffe_ctl->num_bytes);
4560  			goto loop;
4561  		}
4562  
4563  		if (ffe_ctl->found_offset < ffe_ctl->search_start)
4564  			btrfs_add_free_space_unused(block_group,
4565  					ffe_ctl->found_offset,
4566  					ffe_ctl->search_start - ffe_ctl->found_offset);
4567  
4568  		ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes,
4569  					       ffe_ctl->num_bytes,
4570  					       ffe_ctl->delalloc,
4571  					       ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS);
4572  		if (ret == -EAGAIN) {
4573  			btrfs_add_free_space_unused(block_group,
4574  					ffe_ctl->found_offset,
4575  					ffe_ctl->num_bytes);
4576  			goto loop;
4577  		}
4578  		btrfs_inc_block_group_reservations(block_group);
4579  
4580  		/* we are all good, let's return */
4581  		ins->objectid = ffe_ctl->search_start;
4582  		ins->offset = ffe_ctl->num_bytes;
4583  
4584  		trace_btrfs_reserve_extent(block_group, ffe_ctl);
4585  		btrfs_release_block_group(block_group, ffe_ctl->delalloc);
4586  		break;
4587  loop:
4588  		if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
4589  		    !ffe_ctl->retry_uncached) {
4590  			ffe_ctl->retry_uncached = true;
4591  			btrfs_wait_block_group_cache_progress(block_group,
4592  						ffe_ctl->num_bytes +
4593  						ffe_ctl->empty_cluster +
4594  						ffe_ctl->empty_size);
4595  			goto have_block_group;
4596  		}
4597  		release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc);
4598  		cond_resched();
4599  	}
4600  	up_read(&space_info->groups_sem);
4601  
4602  	ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search);
4603  	if (ret > 0)
4604  		goto search;
4605  
4606  	if (ret == -ENOSPC && !cache_block_group_error) {
4607  		/*
4608  		 * Use ffe_ctl->total_free_space as fallback if we can't find
4609  		 * any contiguous hole.
4610  		 */
4611  		if (!ffe_ctl->max_extent_size)
4612  			ffe_ctl->max_extent_size = ffe_ctl->total_free_space;
4613  		spin_lock(&space_info->lock);
4614  		space_info->max_extent_size = ffe_ctl->max_extent_size;
4615  		spin_unlock(&space_info->lock);
4616  		ins->offset = ffe_ctl->max_extent_size;
4617  	} else if (ret == -ENOSPC) {
4618  		ret = cache_block_group_error;
4619  	}
4620  	return ret;
4621  }
4622  
4623  /*
4624   * Entry point to the extent allocator. Tries to find a hole that is at least
4625   * as big as @num_bytes.
4626   *
4627   * @root           -	The root that will contain this extent
4628   *
4629   * @ram_bytes      -	The amount of space in ram that @num_bytes take. This
4630   *			is used for accounting purposes. This value differs
4631   *			from @num_bytes only in the case of compressed extents.
4632   *
4633   * @num_bytes      -	Number of bytes to allocate on-disk.
4634   *
4635   * @min_alloc_size -	Indicates the minimum amount of space that the
4636   *			allocator should try to satisfy. In some cases
4637   *			@num_bytes may be larger than what is required and if
4638   *			the filesystem is fragmented then allocation fails.
4639   *			However, the presence of @min_alloc_size gives a
4640   *			chance to try and satisfy the smaller allocation.
4641   *
4642   * @empty_size     -	A hint that you plan on doing more COW. This is the
4643   *			size in bytes the allocator should try to find free
4644   *			next to the block it returns.  This is just a hint and
4645   *			may be ignored by the allocator.
4646   *
4647   * @hint_byte      -	Hint to the allocator to start searching above the byte
4648   *			address passed. It might be ignored.
4649   *
4650   * @ins            -	This key is modified to record the found hole. It will
4651   *			have the following values:
4652   *			ins->objectid == start position
4653   *			ins->type == BTRFS_EXTENT_ITEM_KEY
4654   *			ins->offset == the size of the hole.
4655   *
4656   * @is_data        -	Boolean flag indicating whether an extent is
4657   *			allocated for data (true) or metadata (false)
4658   *
4659   * @delalloc       -	Boolean flag indicating whether this allocation is for
4660   *			delalloc or not. If 'true' data_rwsem of block groups
4661   *			is going to be acquired.
4662   *
4663   *
4664   * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
4665   * case -ENOSPC is returned then @ins->offset will contain the size of the
4666   * largest available hole the allocator managed to find.
4667   */
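/*
 * Example call (an illustrative sketch, not a real call site; the 1 MiB
 * length and the data/delalloc arguments are assumptions):
 *
 *	struct btrfs_key ins = { 0 };
 *	int ret;
 *
 *	ret = btrfs_reserve_extent(root, SZ_1M, SZ_1M, fs_info->sectorsize,
 *				   0, 0, &ins, 1, 1);
 *
 * On success, ins.objectid is the start of the reserved extent and ins.offset
 * its length.  On -ENOSPC, ins.offset holds the largest hole the allocator
 * managed to find, which can be used to retry with a smaller size.
 */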
4668  int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
4669  			 u64 num_bytes, u64 min_alloc_size,
4670  			 u64 empty_size, u64 hint_byte,
4671  			 struct btrfs_key *ins, int is_data, int delalloc)
4672  {
4673  	struct btrfs_fs_info *fs_info = root->fs_info;
4674  	struct find_free_extent_ctl ffe_ctl = {};
4675  	bool final_tried = num_bytes == min_alloc_size;
4676  	u64 flags;
4677  	int ret;
4678  	bool for_treelog = (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID);
4679  	bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data);
4680  
4681  	flags = get_alloc_profile_by_root(root, is_data);
4682  again:
4683  	WARN_ON(num_bytes < fs_info->sectorsize);
4684  
4685  	ffe_ctl.ram_bytes = ram_bytes;
4686  	ffe_ctl.num_bytes = num_bytes;
4687  	ffe_ctl.min_alloc_size = min_alloc_size;
4688  	ffe_ctl.empty_size = empty_size;
4689  	ffe_ctl.flags = flags;
4690  	ffe_ctl.delalloc = delalloc;
4691  	ffe_ctl.hint_byte = hint_byte;
4692  	ffe_ctl.for_treelog = for_treelog;
4693  	ffe_ctl.for_data_reloc = for_data_reloc;
4694  
4695  	ret = find_free_extent(root, ins, &ffe_ctl);
4696  	if (!ret && !is_data) {
4697  		btrfs_dec_block_group_reservations(fs_info, ins->objectid);
4698  	} else if (ret == -ENOSPC) {
4699  		if (!final_tried && ins->offset) {
4700  			num_bytes = min(num_bytes >> 1, ins->offset);
4701  			num_bytes = round_down(num_bytes,
4702  					       fs_info->sectorsize);
4703  			num_bytes = max(num_bytes, min_alloc_size);
4704  			ram_bytes = num_bytes;
4705  			if (num_bytes == min_alloc_size)
4706  				final_tried = true;
4707  			goto again;
4708  		} else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
4709  			struct btrfs_space_info *sinfo;
4710  
4711  			sinfo = btrfs_find_space_info(fs_info, flags);
4712  			btrfs_err(fs_info,
4713  	"allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d",
4714  				  flags, num_bytes, for_treelog, for_data_reloc);
4715  			if (sinfo)
4716  				btrfs_dump_space_info(fs_info, sinfo,
4717  						      num_bytes, 1);
4718  		}
4719  	}
4720  
4721  	return ret;
4722  }
4723  
4724  int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
4725  			       u64 start, u64 len, int delalloc)
4726  {
4727  	struct btrfs_block_group *cache;
4728  
4729  	cache = btrfs_lookup_block_group(fs_info, start);
4730  	if (!cache) {
4731  		btrfs_err(fs_info, "Unable to find block group for %llu",
4732  			  start);
4733  		return -ENOSPC;
4734  	}
4735  
4736  	btrfs_add_free_space(cache, start, len);
4737  	btrfs_free_reserved_bytes(cache, len, delalloc);
4738  	trace_btrfs_reserved_extent_free(fs_info, start, len);
4739  
4740  	btrfs_put_block_group(cache);
4741  	return 0;
4742  }
4743  
4744  int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans,
4745  			      const struct extent_buffer *eb)
4746  {
4747  	struct btrfs_block_group *cache;
4748  	int ret = 0;
4749  
4750  	cache = btrfs_lookup_block_group(trans->fs_info, eb->start);
4751  	if (!cache) {
4752  		btrfs_err(trans->fs_info, "unable to find block group for %llu",
4753  			  eb->start);
4754  		return -ENOSPC;
4755  	}
4756  
4757  	ret = pin_down_extent(trans, cache, eb->start, eb->len, 1);
4758  	btrfs_put_block_group(cache);
4759  	return ret;
4760  }
4761  
4762  static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr,
4763  				 u64 num_bytes)
4764  {
4765  	struct btrfs_fs_info *fs_info = trans->fs_info;
4766  	int ret;
4767  
4768  	ret = remove_from_free_space_tree(trans, bytenr, num_bytes);
4769  	if (ret)
4770  		return ret;
4771  
4772  	ret = btrfs_update_block_group(trans, bytenr, num_bytes, true);
4773  	if (ret) {
4774  		ASSERT(!ret);
4775  		btrfs_err(fs_info, "update block group failed for %llu %llu",
4776  			  bytenr, num_bytes);
4777  		return ret;
4778  	}
4779  
4780  	trace_btrfs_reserved_extent_alloc(fs_info, bytenr, num_bytes);
4781  	return 0;
4782  }
4783  
4784  static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4785  				      u64 parent, u64 root_objectid,
4786  				      u64 flags, u64 owner, u64 offset,
4787  				      struct btrfs_key *ins, int ref_mod, u64 oref_root)
4788  {
4789  	struct btrfs_fs_info *fs_info = trans->fs_info;
4790  	struct btrfs_root *extent_root;
4791  	int ret;
4792  	struct btrfs_extent_item *extent_item;
4793  	struct btrfs_extent_owner_ref *oref;
4794  	struct btrfs_extent_inline_ref *iref;
4795  	struct btrfs_path *path;
4796  	struct extent_buffer *leaf;
4797  	int type;
4798  	u32 size;
4799  	const bool simple_quota = (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE);
4800  
4801  	if (parent > 0)
4802  		type = BTRFS_SHARED_DATA_REF_KEY;
4803  	else
4804  		type = BTRFS_EXTENT_DATA_REF_KEY;
4805  
4806  	size = sizeof(*extent_item);
4807  	if (simple_quota)
4808  		size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY);
4809  	size += btrfs_extent_inline_ref_size(type);
4810  
4811  	path = btrfs_alloc_path();
4812  	if (!path)
4813  		return -ENOMEM;
4814  
4815  	extent_root = btrfs_extent_root(fs_info, ins->objectid);
4816  	ret = btrfs_insert_empty_item(trans, extent_root, path, ins, size);
4817  	if (ret) {
4818  		btrfs_free_path(path);
4819  		return ret;
4820  	}
4821  
4822  	leaf = path->nodes[0];
4823  	extent_item = btrfs_item_ptr(leaf, path->slots[0],
4824  				     struct btrfs_extent_item);
4825  	btrfs_set_extent_refs(leaf, extent_item, ref_mod);
4826  	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
4827  	btrfs_set_extent_flags(leaf, extent_item,
4828  			       flags | BTRFS_EXTENT_FLAG_DATA);
4829  
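	/*
	 * Inline refs follow the extent item directly: an optional owner ref
	 * when simple quotas are enabled, then the shared or regular data ref.
	 */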
4830  	iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
4831  	if (simple_quota) {
4832  		btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_EXTENT_OWNER_REF_KEY);
4833  		oref = (struct btrfs_extent_owner_ref *)(&iref->offset);
4834  		btrfs_set_extent_owner_ref_root_id(leaf, oref, oref_root);
4835  		iref = (struct btrfs_extent_inline_ref *)(oref + 1);
4836  	}
4837  	btrfs_set_extent_inline_ref_type(leaf, iref, type);
4838  
4839  	if (parent > 0) {
4840  		struct btrfs_shared_data_ref *ref;
4841  		ref = (struct btrfs_shared_data_ref *)(iref + 1);
4842  		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
4843  		btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
4844  	} else {
4845  		struct btrfs_extent_data_ref *ref;
4846  		ref = (struct btrfs_extent_data_ref *)(&iref->offset);
4847  		btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
4848  		btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
4849  		btrfs_set_extent_data_ref_offset(leaf, ref, offset);
4850  		btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
4851  	}
4852  
4853  	btrfs_mark_buffer_dirty(trans, path->nodes[0]);
4854  	btrfs_free_path(path);
4855  
4856  	return alloc_reserved_extent(trans, ins->objectid, ins->offset);
4857  }
4858  
4859  static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
4860  				     struct btrfs_delayed_ref_node *node,
4861  				     struct btrfs_delayed_extent_op *extent_op)
4862  {
4863  	struct btrfs_fs_info *fs_info = trans->fs_info;
4864  	struct btrfs_root *extent_root;
4865  	int ret;
4866  	struct btrfs_extent_item *extent_item;
4867  	struct btrfs_key extent_key;
4868  	struct btrfs_tree_block_info *block_info;
4869  	struct btrfs_extent_inline_ref *iref;
4870  	struct btrfs_path *path;
4871  	struct extent_buffer *leaf;
4872  	u32 size = sizeof(*extent_item) + sizeof(*iref);
4873  	const u64 flags = (extent_op ? extent_op->flags_to_set : 0);
4874  	/* The owner of a tree block is the level. */
4875  	int level = btrfs_delayed_ref_owner(node);
4876  	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
4877  
4878  	extent_key.objectid = node->bytenr;
4879  	if (skinny_metadata) {
4880  		/* The owner of a tree block is the level. */
4881  		extent_key.offset = level;
4882  		extent_key.type = BTRFS_METADATA_ITEM_KEY;
4883  	} else {
4884  		extent_key.offset = node->num_bytes;
4885  		extent_key.type = BTRFS_EXTENT_ITEM_KEY;
4886  		size += sizeof(*block_info);
4887  	}
4888  
4889  	path = btrfs_alloc_path();
4890  	if (!path)
4891  		return -ENOMEM;
4892  
4893  	extent_root = btrfs_extent_root(fs_info, extent_key.objectid);
4894  	ret = btrfs_insert_empty_item(trans, extent_root, path, &extent_key,
4895  				      size);
4896  	if (ret) {
4897  		btrfs_free_path(path);
4898  		return ret;
4899  	}
4900  
4901  	leaf = path->nodes[0];
4902  	extent_item = btrfs_item_ptr(leaf, path->slots[0],
4903  				     struct btrfs_extent_item);
4904  	btrfs_set_extent_refs(leaf, extent_item, 1);
4905  	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
4906  	btrfs_set_extent_flags(leaf, extent_item,
4907  			       flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
4908  
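	/*
	 * With skinny metadata the level is stored in the key, so no
	 * tree_block_info is needed; otherwise embed the block info (key and
	 * level) before the inline ref.
	 */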
4909  	if (skinny_metadata) {
4910  		iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
4911  	} else {
4912  		block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
4913  		btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
4914  		btrfs_set_tree_block_level(leaf, block_info, level);
4915  		iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
4916  	}
4917  
4918  	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
4919  		btrfs_set_extent_inline_ref_type(leaf, iref,
4920  						 BTRFS_SHARED_BLOCK_REF_KEY);
4921  		btrfs_set_extent_inline_ref_offset(leaf, iref, node->parent);
4922  	} else {
4923  		btrfs_set_extent_inline_ref_type(leaf, iref,
4924  						 BTRFS_TREE_BLOCK_REF_KEY);
4925  		btrfs_set_extent_inline_ref_offset(leaf, iref, node->ref_root);
4926  	}
4927  
4928  	btrfs_mark_buffer_dirty(trans, leaf);
4929  	btrfs_free_path(path);
4930  
4931  	return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize);
4932  }
4933  
4934  int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4935  				     struct btrfs_root *root, u64 owner,
4936  				     u64 offset, u64 ram_bytes,
4937  				     struct btrfs_key *ins)
4938  {
4939  	struct btrfs_ref generic_ref = {
4940  		.action = BTRFS_ADD_DELAYED_EXTENT,
4941  		.bytenr = ins->objectid,
4942  		.num_bytes = ins->offset,
4943  		.owning_root = btrfs_root_id(root),
4944  		.ref_root = btrfs_root_id(root),
4945  	};
4946  
4947  	ASSERT(generic_ref.ref_root != BTRFS_TREE_LOG_OBJECTID);
4948  
4949  	if (btrfs_is_data_reloc_root(root) && is_fstree(root->relocation_src_root))
4950  		generic_ref.owning_root = root->relocation_src_root;
4951  
4952  	btrfs_init_data_ref(&generic_ref, owner, offset, 0, false);
4953  	btrfs_ref_tree_mod(root->fs_info, &generic_ref);
4954  
4955  	return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes);
4956  }
4957  
4958  /*
4959   * this is used by the tree logging recovery code.  It records that
4960   * an extent has been allocated and makes sure to clear the free
4961   * space cache bits as well.
4962   */
4963  int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
4964  				   u64 root_objectid, u64 owner, u64 offset,
4965  				   struct btrfs_key *ins)
4966  {
4967  	struct btrfs_fs_info *fs_info = trans->fs_info;
4968  	int ret;
4969  	struct btrfs_block_group *block_group;
4970  	struct btrfs_space_info *space_info;
4971  	struct btrfs_squota_delta delta = {
4972  		.root = root_objectid,
4973  		.num_bytes = ins->offset,
4974  		.generation = trans->transid,
4975  		.is_data = true,
4976  		.is_inc = true,
4977  	};
4978  
4979  	/*
4980  	 * Mixed block groups will exclude before processing the log so we only
4981  	 * need to do the exclude dance if this fs isn't mixed.
4982  	 */
4983  	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
4984  		ret = __exclude_logged_extent(fs_info, ins->objectid,
4985  					      ins->offset);
4986  		if (ret)
4987  			return ret;
4988  	}
4989  
4990  	block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
4991  	if (!block_group)
4992  		return -EINVAL;
4993  
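	/*
	 * Log replay bypasses the normal allocator, so account the reservation
	 * for this extent by hand before inserting its extent item.
	 */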
4994  	space_info = block_group->space_info;
4995  	spin_lock(&space_info->lock);
4996  	spin_lock(&block_group->lock);
4997  	space_info->bytes_reserved += ins->offset;
4998  	block_group->reserved += ins->offset;
4999  	spin_unlock(&block_group->lock);
5000  	spin_unlock(&space_info->lock);
5001  
5002  	ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
5003  					 offset, ins, 1, root_objectid);
5004  	if (ret)
5005  		btrfs_pin_extent(trans, ins->objectid, ins->offset, 1);
5006  	ret = btrfs_record_squota_delta(fs_info, &delta);
5007  	btrfs_put_block_group(block_group);
5008  	return ret;
5009  }
5010  
5011  #ifdef CONFIG_BTRFS_DEBUG
5012  /*
5013   * Extra safety check in case the extent tree is corrupted and the extent
5014   * allocator chooses to use a tree block which is already used and locked.
5015   */
5016  static bool check_eb_lock_owner(const struct extent_buffer *eb)
5017  {
5018  	if (eb->lock_owner == current->pid) {
5019  		btrfs_err_rl(eb->fs_info,
5020  "tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
5021  			     eb->start, btrfs_header_owner(eb), current->pid);
5022  		return true;
5023  	}
5024  	return false;
5025  }
5026  #else
5027  static bool check_eb_lock_owner(struct extent_buffer *eb)
5028  {
5029  	return false;
5030  }
5031  #endif
5032  
5033  static struct extent_buffer *
5034  btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5035  		      u64 bytenr, int level, u64 owner,
5036  		      enum btrfs_lock_nesting nest)
5037  {
5038  	struct btrfs_fs_info *fs_info = root->fs_info;
5039  	struct extent_buffer *buf;
5040  	u64 lockdep_owner = owner;
5041  
5042  	buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level);
5043  	if (IS_ERR(buf))
5044  		return buf;
5045  
5046  	if (check_eb_lock_owner(buf)) {
5047  		free_extent_buffer(buf);
5048  		return ERR_PTR(-EUCLEAN);
5049  	}
5050  
5051  	/*
5052  	 * The reloc trees are just snapshots, so we need them to appear to be
5053  	 * just like any other fs tree WRT lockdep.
5054  	 *
5055  	 * The exception however is in replace_path() in relocation, where we
5056  	 * hold the lock on the original fs root and then search for the reloc
5057  	 * root.  At that point we need to make sure any reloc root buffers are
5058  	 * set to the BTRFS_TREE_RELOC_OBJECTID lockdep class in order to make
5059  	 * lockdep happy.
5060  	 */
5061  	if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID &&
5062  	    !test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state))
5063  		lockdep_owner = BTRFS_FS_TREE_OBJECTID;
5064  
5065  	/* btrfs_clear_buffer_dirty() accesses generation field. */
5066  	btrfs_set_header_generation(buf, trans->transid);
5067  
5068  	/*
5069  	 * This needs to stay, because we could allocate a freed block from an
5070  	 * old tree into a new tree, so we need to make sure this new block is
5071  	 * set to the appropriate level and owner.
5072  	 */
5073  	btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level);
5074  
5075  	btrfs_tree_lock_nested(buf, nest);
5076  	btrfs_clear_buffer_dirty(trans, buf);
5077  	clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
5078  	clear_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &buf->bflags);
5079  
5080  	set_extent_buffer_uptodate(buf);
5081  
5082  	memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
5083  	btrfs_set_header_level(buf, level);
5084  	btrfs_set_header_bytenr(buf, buf->start);
5085  	btrfs_set_header_generation(buf, trans->transid);
5086  	btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
5087  	btrfs_set_header_owner(buf, owner);
5088  	write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
5089  	write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
5090  	if (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID) {
5091  		buf->log_index = root->log_transid % 2;
5092  		/*
5093  		 * We allow two log transactions at a time, so use different
5094  		 * EXTENT bits to differentiate dirty pages.
5095  		 */
5096  		if (buf->log_index == 0)
5097  			set_extent_bit(&root->dirty_log_pages, buf->start,
5098  				       buf->start + buf->len - 1,
5099  				       EXTENT_DIRTY, NULL);
5100  		else
5101  			set_extent_bit(&root->dirty_log_pages, buf->start,
5102  				       buf->start + buf->len - 1,
5103  				       EXTENT_NEW, NULL);
5104  	} else {
5105  		buf->log_index = -1;
5106  		set_extent_bit(&trans->transaction->dirty_pages, buf->start,
5107  			       buf->start + buf->len - 1, EXTENT_DIRTY, NULL);
5108  	}
5109  	/* this returns a buffer locked for blocking */
5110  	return buf;
5111  }
5112  
5113  /*
5114   * Finds a free extent and does all the dirty work required for allocation.
5115   * Returns the tree buffer or an ERR_PTR on error.
5116   */
5117  struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
5118  					     struct btrfs_root *root,
5119  					     u64 parent, u64 root_objectid,
5120  					     const struct btrfs_disk_key *key,
5121  					     int level, u64 hint,
5122  					     u64 empty_size,
5123  					     u64 reloc_src_root,
5124  					     enum btrfs_lock_nesting nest)
5125  {
5126  	struct btrfs_fs_info *fs_info = root->fs_info;
5127  	struct btrfs_key ins;
5128  	struct btrfs_block_rsv *block_rsv;
5129  	struct extent_buffer *buf;
5130  	u64 flags = 0;
5131  	int ret;
5132  	u32 blocksize = fs_info->nodesize;
5133  	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
5134  	u64 owning_root;
5135  
5136  #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
5137  	if (btrfs_is_testing(fs_info)) {
5138  		buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
5139  					    level, root_objectid, nest);
5140  		if (!IS_ERR(buf))
5141  			root->alloc_bytenr += blocksize;
5142  		return buf;
5143  	}
5144  #endif
5145  
5146  	block_rsv = btrfs_use_block_rsv(trans, root, blocksize);
5147  	if (IS_ERR(block_rsv))
5148  		return ERR_CAST(block_rsv);
5149  
5150  	ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
5151  				   empty_size, hint, &ins, 0, 0);
5152  	if (ret)
5153  		goto out_unuse;
5154  
5155  	buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
5156  				    root_objectid, nest);
5157  	if (IS_ERR(buf)) {
5158  		ret = PTR_ERR(buf);
5159  		goto out_free_reserved;
5160  	}
5161  	owning_root = btrfs_header_owner(buf);
5162  
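	/*
	 * Blocks allocated for the relocation tree use full backrefs keyed by
	 * the parent block and keep the source root as the owning root.
	 */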
5163  	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
5164  		if (parent == 0)
5165  			parent = ins.objectid;
5166  		flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
5167  		owning_root = reloc_src_root;
5168  	} else
5169  		BUG_ON(parent > 0);
5170  
5171  	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
5172  		struct btrfs_delayed_extent_op *extent_op;
5173  		struct btrfs_ref generic_ref = {
5174  			.action = BTRFS_ADD_DELAYED_EXTENT,
5175  			.bytenr = ins.objectid,
5176  			.num_bytes = ins.offset,
5177  			.parent = parent,
5178  			.owning_root = owning_root,
5179  			.ref_root = root_objectid,
5180  		};
5181  
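		/*
		 * A delayed extent op is only needed when the key must be
		 * recorded (non-skinny metadata) or extent flags must be set.
		 */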
5182  		if (!skinny_metadata || flags != 0) {
5183  			extent_op = btrfs_alloc_delayed_extent_op();
5184  			if (!extent_op) {
5185  				ret = -ENOMEM;
5186  				goto out_free_buf;
5187  			}
5188  			if (key)
5189  				memcpy(&extent_op->key, key, sizeof(extent_op->key));
5190  			else
5191  				memset(&extent_op->key, 0, sizeof(extent_op->key));
5192  			extent_op->flags_to_set = flags;
5193  			extent_op->update_key = (skinny_metadata ? false : true);
5194  			extent_op->update_flags = (flags != 0);
5195  		} else {
5196  			extent_op = NULL;
5197  		}
5198  
5199  		btrfs_init_tree_ref(&generic_ref, level, btrfs_root_id(root), false);
5200  		btrfs_ref_tree_mod(fs_info, &generic_ref);
5201  		ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op);
5202  		if (ret) {
5203  			btrfs_free_delayed_extent_op(extent_op);
5204  			goto out_free_buf;
5205  		}
5206  	}
5207  	return buf;
5208  
5209  out_free_buf:
5210  	btrfs_tree_unlock(buf);
5211  	free_extent_buffer(buf);
5212  out_free_reserved:
5213  	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
5214  out_unuse:
5215  	btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize);
5216  	return ERR_PTR(ret);
5217  }
5218  
5219  struct walk_control {
5220  	u64 refs[BTRFS_MAX_LEVEL];
5221  	u64 flags[BTRFS_MAX_LEVEL];
5222  	struct btrfs_key update_progress;
5223  	struct btrfs_key drop_progress;
5224  	int drop_level;
5225  	int stage;
5226  	int level;
5227  	int shared_level;
5228  	int update_ref;
5229  	int keep_locks;
5230  	int reada_slot;
5231  	int reada_count;
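	/* Set when resuming a drop of a dead root; forces ref existence checks. */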
5232  	int restarted;
5233  	/* Indicate that extent info needs to be looked up when walking the tree. */
5234  	int lookup_info;
5235  };
5236  
5237  /*
5238   * This is our normal stage.  We are traversing blocks the current snapshot owns
5239   * and we are dropping any of our references to any children we are able to, and
5240   * then freeing the block once we've processed all of the children.
5241   */
5242  #define DROP_REFERENCE	1
5243  
5244  /*
5245   * We enter this stage when we have to walk into a child block (meaning we can't
5246   * simply drop our reference to it from our current parent node) and there are
5247   * simply drop our reference to it from our current parent node) and there is
5248   * blocks from the current parent node then we have to do the FULL_BACKREF dance
5249   * on them in order to drop our normal ref and add the shared ref.
5250   */
5251  #define UPDATE_BACKREF	2
5252  
5253  /*
5254   * Decide if we need to walk down into this node to adjust the references.
5255   *
5256   * @root:	the root we are currently deleting
5257   * @wc:		the walk control for this deletion
5258   * @eb:		the parent eb that we're currently visiting
5259   * @refs:	the number of refs for wc->level - 1
5260   * @flags:	the flags for wc->level - 1
5261   * @slot:	the slot in the eb that we're currently checking
5262   *
5263   * This is meant to be called when we're evaluating if a node we point to at
5264   * wc->level should be read and walked into, or if we can simply delete our
5265   * reference to it.  We return true if we should walk into the node, false if we
5266   * can skip it.
5267   *
5268   * We have assertions in here to make sure this is called correctly.  We assume
5269   * that sanity checking on the blocks read to this point has been done, so any
5270   * corrupted file systems must have been caught before calling this function.
5271   */
5272  static bool visit_node_for_delete(struct btrfs_root *root, struct walk_control *wc,
5273  				  struct extent_buffer *eb, u64 refs, u64 flags, int slot)
5274  {
5275  	struct btrfs_key key;
5276  	u64 generation;
5277  	int level = wc->level;
5278  
5279  	ASSERT(level > 0);
5280  	ASSERT(wc->refs[level - 1] > 0);
5281  
5282  	/*
5283  	 * The update backref stage we only want to skip if we already have
5284  	 * FULL_BACKREF set, otherwise we need to read.
5285  	 */
5286  	if (wc->stage == UPDATE_BACKREF) {
5287  		if (level == 1 && flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5288  			return false;
5289  		return true;
5290  	}
5291  
5292  	/*
5293  	 * We hold the last ref on this block, so we must walk into it and process
5294  	 * any refs it's pointing at.
5295  	 */
5296  	if (wc->refs[level - 1] == 1)
5297  		return true;
5298  
5299  	/*
5300  	 * If we're already FULL_BACKREF then we know we can just drop our
5301  	 * current reference.
5302  	 */
5303  	if (level == 1 && flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5304  		return false;
5305  
5306  	/*
5307  	 * This block is older than our creation generation, we can drop our
5308  	 * reference to it.
5309  	 */
5310  	generation = btrfs_node_ptr_generation(eb, slot);
5311  	if (!wc->update_ref || generation <= root->root_key.offset)
5312  		return false;
5313  
5314  	/*
5315  	 * This block was processed from a previous snapshot deletion run, we
5316  	 * can skip it.
5317  	 */
5318  	btrfs_node_key_to_cpu(eb, &key, slot);
5319  	if (btrfs_comp_cpu_keys(&key, &wc->update_progress) < 0)
5320  		return false;
5321  
5322  	/* All other cases we need to wander into the node. */
5323  	return true;
5324  }
5325  
5326  static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
5327  				     struct btrfs_root *root,
5328  				     struct walk_control *wc,
5329  				     struct btrfs_path *path)
5330  {
5331  	struct btrfs_fs_info *fs_info = root->fs_info;
5332  	u64 bytenr;
5333  	u64 generation;
5334  	u64 refs;
5335  	u64 flags;
5336  	u32 nritems;
5337  	struct extent_buffer *eb;
5338  	int ret;
5339  	int slot;
5340  	int nread = 0;
5341  
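	/*
	 * Shrink the readahead window while we are still inside the range we
	 * already read ahead, grow it as we make forward progress.
	 */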
5342  	if (path->slots[wc->level] < wc->reada_slot) {
5343  		wc->reada_count = wc->reada_count * 2 / 3;
5344  		wc->reada_count = max(wc->reada_count, 2);
5345  	} else {
5346  		wc->reada_count = wc->reada_count * 3 / 2;
5347  		wc->reada_count = min_t(int, wc->reada_count,
5348  					BTRFS_NODEPTRS_PER_BLOCK(fs_info));
5349  	}
5350  
5351  	eb = path->nodes[wc->level];
5352  	nritems = btrfs_header_nritems(eb);
5353  
5354  	for (slot = path->slots[wc->level]; slot < nritems; slot++) {
5355  		if (nread >= wc->reada_count)
5356  			break;
5357  
5358  		cond_resched();
5359  		bytenr = btrfs_node_blockptr(eb, slot);
5360  		generation = btrfs_node_ptr_generation(eb, slot);
5361  
5362  		if (slot == path->slots[wc->level])
5363  			goto reada;
5364  
5365  		if (wc->stage == UPDATE_BACKREF &&
5366  		    generation <= root->root_key.offset)
5367  			continue;
5368  
5369  		/* We don't lock the tree block, it's OK to be racy here */
5370  		ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
5371  					       wc->level - 1, 1, &refs,
5372  					       &flags, NULL);
5373  		/* We don't care about errors in readahead. */
5374  		if (ret < 0)
5375  			continue;
5376  
5377  		/*
5378  		 * This could be racy; it's conceivable that we raced and ended
5379  		 * up with a bogus refs count.  If that's the case just skip it;
5380  		 * if we are actually corrupt we will notice when we look up
5381  		 * everything again with our locks.
5382  		 */
5383  		if (refs == 0)
5384  			continue;
5385  
5386  		/* If we don't need to visit this node don't reada. */
5387  		if (!visit_node_for_delete(root, wc, eb, refs, flags, slot))
5388  			continue;
5389  reada:
5390  		btrfs_readahead_node_child(eb, slot);
5391  		nread++;
5392  	}
5393  	wc->reada_slot = slot;
5394  }
5395  
5396  /*
5397   * helper to process tree block while walking down the tree.
5398   *
5399   * when wc->stage == UPDATE_BACKREF, this function updates
5400   * back refs for pointers in the block.
5401   *
5402   * NOTE: return value 1 means we should stop walking down.
5403   */
5404  static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
5405  				   struct btrfs_root *root,
5406  				   struct btrfs_path *path,
5407  				   struct walk_control *wc)
5408  {
5409  	struct btrfs_fs_info *fs_info = root->fs_info;
5410  	int level = wc->level;
5411  	struct extent_buffer *eb = path->nodes[level];
5412  	u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
5413  	int ret;
5414  
5415  	if (wc->stage == UPDATE_BACKREF && btrfs_header_owner(eb) != btrfs_root_id(root))
5416  		return 1;
5417  
5418  	/*
5419  	 * When the reference count of a tree block is 1, it won't increase
5420  	 * again. Once the full backref flag is set, we never clear it.
5421  	 */
5422  	if (wc->lookup_info &&
5423  	    ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
5424  	     (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
5425  		ASSERT(path->locks[level]);
5426  		ret = btrfs_lookup_extent_info(trans, fs_info,
5427  					       eb->start, level, 1,
5428  					       &wc->refs[level],
5429  					       &wc->flags[level],
5430  					       NULL);
5431  		if (ret)
5432  			return ret;
5433  		if (unlikely(wc->refs[level] == 0)) {
5434  			btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0",
5435  				  eb->start);
5436  			return -EUCLEAN;
5437  		}
5438  	}
5439  
5440  	if (wc->stage == DROP_REFERENCE) {
5441  		if (wc->refs[level] > 1)
5442  			return 1;
5443  
5444  		if (path->locks[level] && !wc->keep_locks) {
5445  			btrfs_tree_unlock_rw(eb, path->locks[level]);
5446  			path->locks[level] = 0;
5447  		}
5448  		return 0;
5449  	}
5450  
5451  	/* wc->stage == UPDATE_BACKREF */
5452  	if (!(wc->flags[level] & flag)) {
5453  		ASSERT(path->locks[level]);
5454  		ret = btrfs_inc_ref(trans, root, eb, 1);
5455  		if (ret) {
5456  			btrfs_abort_transaction(trans, ret);
5457  			return ret;
5458  		}
5459  		ret = btrfs_dec_ref(trans, root, eb, 0);
5460  		if (ret) {
5461  			btrfs_abort_transaction(trans, ret);
5462  			return ret;
5463  		}
5464  		ret = btrfs_set_disk_extent_flags(trans, eb, flag);
5465  		if (ret) {
5466  			btrfs_abort_transaction(trans, ret);
5467  			return ret;
5468  		}
5469  		wc->flags[level] |= flag;
5470  	}
5471  
5472  	/*
5473  	 * the block is shared by multiple trees, so it's not good to
5474  	 * keep the tree lock
5475  	 */
5476  	if (path->locks[level] && level > 0) {
5477  		btrfs_tree_unlock_rw(eb, path->locks[level]);
5478  		path->locks[level] = 0;
5479  	}
5480  	return 0;
5481  }
5482  
5483  /*
5484   * This is used to verify a ref exists for this root to deal with a bug where we
5485   * would have a drop_progress key that hadn't been updated properly.
5486   */
5487  static int check_ref_exists(struct btrfs_trans_handle *trans,
5488  			    struct btrfs_root *root, u64 bytenr, u64 parent,
5489  			    int level)
5490  {
5491  	struct btrfs_delayed_ref_root *delayed_refs;
5492  	struct btrfs_delayed_ref_head *head;
5493  	struct btrfs_path *path;
5494  	struct btrfs_extent_inline_ref *iref;
5495  	int ret;
5496  	bool exists = false;
5497  
5498  	path = btrfs_alloc_path();
5499  	if (!path)
5500  		return -ENOMEM;
5501  again:
5502  	ret = lookup_extent_backref(trans, path, &iref, bytenr,
5503  				    root->fs_info->nodesize, parent,
5504  				    btrfs_root_id(root), level, 0);
5505  	if (ret != -ENOENT) {
5506  		/*
5507  		 * If we got 0 then we found our reference, so return 1.
5508  		 * Otherwise return the error, as it's not -ENOENT.
5509  		 */
5510  		btrfs_free_path(path);
5511  		return (ret < 0) ? ret : 1;
5512  	}
5513  
5514  	/*
5515  	 * We could have a delayed ref with this reference, so look it up while
5516  	 * we're holding the path open to make sure we don't race with the
5517  	 * delayed ref running.
5518  	 */
5519  	delayed_refs = &trans->transaction->delayed_refs;
5520  	spin_lock(&delayed_refs->lock);
5521  	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
5522  	if (!head)
5523  		goto out;
5524  	if (!mutex_trylock(&head->mutex)) {
5525  		/*
5526  		 * We're contended, means that the delayed ref is running, get a
5527  		 * reference and wait for the ref head to be complete and then
5528  		 * try again.
5529  		 */
5530  		refcount_inc(&head->refs);
5531  		spin_unlock(&delayed_refs->lock);
5532  
5533  		btrfs_release_path(path);
5534  
5535  		mutex_lock(&head->mutex);
5536  		mutex_unlock(&head->mutex);
5537  		btrfs_put_delayed_ref_head(head);
5538  		goto again;
5539  	}
5540  
5541  	exists = btrfs_find_delayed_tree_ref(head, root->root_key.objectid, parent);
5542  	mutex_unlock(&head->mutex);
5543  out:
5544  	spin_unlock(&delayed_refs->lock);
5545  	btrfs_free_path(path);
5546  	return exists ? 1 : 0;
5547  }
5548  
5549  /*
5550   * We may not have an uptodate block, so if we are going to walk down into this
5551   * block we need to drop the lock, read it off the disk, re-lock it and
5552   * return to continue dropping the snapshot.
5553   */
5554  static int check_next_block_uptodate(struct btrfs_trans_handle *trans,
5555  				     struct btrfs_root *root,
5556  				     struct btrfs_path *path,
5557  				     struct walk_control *wc,
5558  				     struct extent_buffer *next)
5559  {
5560  	struct btrfs_tree_parent_check check = { 0 };
5561  	u64 generation;
5562  	int level = wc->level;
5563  	int ret;
5564  
5565  	btrfs_assert_tree_write_locked(next);
5566  
5567  	generation = btrfs_node_ptr_generation(path->nodes[level], path->slots[level]);
5568  
5569  	if (btrfs_buffer_uptodate(next, generation, 0))
5570  		return 0;
5571  
5572  	check.level = level - 1;
5573  	check.transid = generation;
5574  	check.owner_root = btrfs_root_id(root);
5575  	check.has_first_key = true;
5576  	btrfs_node_key_to_cpu(path->nodes[level], &check.first_key, path->slots[level]);
5577  
5578  	btrfs_tree_unlock(next);
5579  	if (level == 1)
5580  		reada_walk_down(trans, root, wc, path);
5581  	ret = btrfs_read_extent_buffer(next, &check);
5582  	if (ret) {
5583  		free_extent_buffer(next);
5584  		return ret;
5585  	}
5586  	btrfs_tree_lock(next);
5587  	wc->lookup_info = 1;
5588  	return 0;
5589  }
5590  
5591  /*
5592   * If we determine that we don't have to visit wc->level - 1 then we need to
5593   * determine if we can drop our reference.
5594   *
5595   * If we are UPDATE_BACKREF then we will not, we need to update our backrefs.
5596   *
5597   * If we are DROP_REFERENCE this will figure out if we need to drop our current
5598   * reference, skipping it if we dropped it from a previous incomplete drop, or
5599   * dropping it if we still have a reference to it.
5600   */
5601  static int maybe_drop_reference(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5602  				struct btrfs_path *path, struct walk_control *wc,
5603  				struct extent_buffer *next, u64 owner_root)
5604  {
5605  	struct btrfs_ref ref = {
5606  		.action = BTRFS_DROP_DELAYED_REF,
5607  		.bytenr = next->start,
5608  		.num_bytes = root->fs_info->nodesize,
5609  		.owning_root = owner_root,
5610  		.ref_root = btrfs_root_id(root),
5611  	};
5612  	int level = wc->level;
5613  	int ret;
5614  
5615  	/* We are UPDATE_BACKREF, we're not dropping anything. */
5616  	if (wc->stage == UPDATE_BACKREF)
5617  		return 0;
5618  
5619  	if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
5620  		ref.parent = path->nodes[level]->start;
5621  	} else {
5622  		ASSERT(btrfs_root_id(root) == btrfs_header_owner(path->nodes[level]));
5623  		if (btrfs_root_id(root) != btrfs_header_owner(path->nodes[level])) {
5624  			btrfs_err(root->fs_info, "mismatched block owner");
5625  			return -EIO;
5626  		}
5627  	}
5628  
5629  	/*
5630  	 * If we had a drop_progress we need to verify the refs are set as
5631  	 * expected.  If we find our ref then we know that from here on out
5632  	 * everything should be correct, and we can clear the
5633  	 * ->restarted flag.
5634  	 */
5635  	if (wc->restarted) {
5636  		ret = check_ref_exists(trans, root, next->start, ref.parent,
5637  				       level - 1);
5638  		if (ret <= 0)
5639  			return ret;
5640  		ret = 0;
5641  		wc->restarted = 0;
5642  	}
5643  
5644  	/*
5645  	 * Reloc tree doesn't contribute to qgroup numbers, and we have already
5646  	 * accounted them at merge time (replace_path), thus we could skip
5647  	 * expensive subtree trace here.
5648  	 */
5649  	if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID &&
5650  	    wc->refs[level - 1] > 1) {
5651  		u64 generation = btrfs_node_ptr_generation(path->nodes[level],
5652  							   path->slots[level]);
5653  
5654  		ret = btrfs_qgroup_trace_subtree(trans, next, generation, level - 1);
5655  		if (ret) {
5656  			btrfs_err_rl(root->fs_info,
5657  "error %d accounting shared subtree, quota is out of sync, rescan required",
5658  				     ret);
5659  		}
5660  	}
5661  
5662  	/*
5663  	 * We need to update the next key in our walk control so we can update
5664  	 * the drop_progress key accordingly.  We don't care if find_next_key
5665  	 * doesn't find a key because that means we're at the end and are going
5666  	 * to clean up now.
5667  	 */
5668  	wc->drop_level = level;
5669  	find_next_key(path, level, &wc->drop_progress);
5670  
5671  	btrfs_init_tree_ref(&ref, level - 1, 0, false);
5672  	return btrfs_free_extent(trans, &ref);
5673  }
5674  
5675  /*
5676   * helper to process tree block pointer.
5677   *
5678   * when wc->stage == DROP_REFERENCE, this function checks
5679   * reference count of the block pointed to. if the block
5680   * is shared and we need update back refs for the subtree
5681   * rooted at the block, this function changes wc->stage to
5682   * UPDATE_BACKREF. if the block is shared and there is no
5683   * need to update back, this function drops the reference
5684   * to the block.
5685   *
5686   * NOTE: return value 1 means we should stop walking down.
5687   */
5688  static noinline int do_walk_down(struct btrfs_trans_handle *trans,
5689  				 struct btrfs_root *root,
5690  				 struct btrfs_path *path,
5691  				 struct walk_control *wc)
5692  {
5693  	struct btrfs_fs_info *fs_info = root->fs_info;
5694  	u64 bytenr;
5695  	u64 generation;
5696  	u64 owner_root = 0;
5697  	struct extent_buffer *next;
5698  	int level = wc->level;
5699  	int ret = 0;
5700  
5701  	generation = btrfs_node_ptr_generation(path->nodes[level],
5702  					       path->slots[level]);
5703  	/*
5704  	 * if the lower level block was created before the snapshot
5705  	 * was created, we know there is no need to update back refs
5706  	 * for the subtree
5707  	 */
5708  	if (wc->stage == UPDATE_BACKREF &&
5709  	    generation <= root->root_key.offset) {
5710  		wc->lookup_info = 1;
5711  		return 1;
5712  	}
5713  
5714  	bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
5715  
5716  	next = btrfs_find_create_tree_block(fs_info, bytenr, btrfs_root_id(root),
5717  					    level - 1);
5718  	if (IS_ERR(next))
5719  		return PTR_ERR(next);
5720  
5721  	btrfs_tree_lock(next);
5722  
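	/*
	 * Look up the refcount and flags of the child block so we can decide
	 * whether to walk into it or just drop our reference to it.
	 */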
5723  	ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
5724  				       &wc->refs[level - 1],
5725  				       &wc->flags[level - 1],
5726  				       &owner_root);
5727  	if (ret < 0)
5728  		goto out_unlock;
5729  
5730  	if (unlikely(wc->refs[level - 1] == 0)) {
5731  		btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0",
5732  			  bytenr);
5733  		ret = -EUCLEAN;
5734  		goto out_unlock;
5735  	}
5736  	wc->lookup_info = 0;
5737  
5738  	/* If we don't have to walk into this node skip it. */
5739  	if (!visit_node_for_delete(root, wc, path->nodes[level],
5740  				   wc->refs[level - 1], wc->flags[level - 1],
5741  				   path->slots[level]))
5742  		goto skip;
5743  
5744  	/*
5745  	 * We have to walk down into this node, and if we're currently at the
5746  	 * DROP_REFERENCE stage and this block is shared then we need to switch
5747  	 * to the UPDATE_BACKREF stage in order to convert to FULL_BACKREF.
5748  	 */
5749  	if (wc->stage == DROP_REFERENCE && wc->refs[level - 1] > 1) {
5750  		wc->stage = UPDATE_BACKREF;
5751  		wc->shared_level = level - 1;
5752  	}
5753  
5754  	ret = check_next_block_uptodate(trans, root, path, wc, next);
5755  	if (ret)
5756  		return ret;
5757  
5758  	level--;
5759  	ASSERT(level == btrfs_header_level(next));
5760  	if (level != btrfs_header_level(next)) {
5761  		btrfs_err(root->fs_info, "mismatched level");
5762  		ret = -EIO;
5763  		goto out_unlock;
5764  	}
5765  	path->nodes[level] = next;
5766  	path->slots[level] = 0;
5767  	path->locks[level] = BTRFS_WRITE_LOCK;
5768  	wc->level = level;
5769  	if (wc->level == 1)
5770  		wc->reada_slot = 0;
5771  	return 0;
5772  skip:
5773  	ret = maybe_drop_reference(trans, root, path, wc, next, owner_root);
5774  	if (ret)
5775  		goto out_unlock;
5776  	wc->refs[level - 1] = 0;
5777  	wc->flags[level - 1] = 0;
5778  	wc->lookup_info = 1;
5779  	ret = 1;
5780  
5781  out_unlock:
5782  	btrfs_tree_unlock(next);
5783  	free_extent_buffer(next);
5784  
5785  	return ret;
5786  }
5787  
5788  /*
5789   * helper to process tree block while walking up the tree.
5790   *
5791   * when wc->stage == DROP_REFERENCE, this function drops
5792   * reference count on the block.
5793   *
5794   * when wc->stage == UPDATE_BACKREF, this function changes
5795   * wc->stage back to DROP_REFERENCE if we changed wc->stage
5796   * to UPDATE_BACKREF previously while processing the block.
5797   *
5798   * NOTE: return value 1 means we should stop walking up.
5799   */
5800  static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5801  				 struct btrfs_root *root,
5802  				 struct btrfs_path *path,
5803  				 struct walk_control *wc)
5804  {
5805  	struct btrfs_fs_info *fs_info = root->fs_info;
5806  	int ret = 0;
5807  	int level = wc->level;
5808  	struct extent_buffer *eb = path->nodes[level];
5809  	u64 parent = 0;
5810  
5811  	if (wc->stage == UPDATE_BACKREF) {
5812  		ASSERT(wc->shared_level >= level);
5813  		if (level < wc->shared_level)
5814  			goto out;
5815  
5816  		ret = find_next_key(path, level + 1, &wc->update_progress);
5817  		if (ret > 0)
5818  			wc->update_ref = 0;
5819  
5820  		wc->stage = DROP_REFERENCE;
5821  		wc->shared_level = -1;
5822  		path->slots[level] = 0;
5823  
5824  		/*
5825  		 * Check the reference count again if the block isn't locked.
5826  		 * We should start walking down the tree again if the reference
5827  		 * count is one.
5828  		 */
5829  		if (!path->locks[level]) {
5830  			ASSERT(level > 0);
5831  			btrfs_tree_lock(eb);
5832  			path->locks[level] = BTRFS_WRITE_LOCK;
5833  
5834  			ret = btrfs_lookup_extent_info(trans, fs_info,
5835  						       eb->start, level, 1,
5836  						       &wc->refs[level],
5837  						       &wc->flags[level],
5838  						       NULL);
5839  			if (ret < 0) {
5840  				btrfs_tree_unlock_rw(eb, path->locks[level]);
5841  				path->locks[level] = 0;
5842  				return ret;
5843  			}
5844  			if (unlikely(wc->refs[level] == 0)) {
5845  				btrfs_tree_unlock_rw(eb, path->locks[level]);
5846  				btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0",
5847  					  eb->start);
5848  				return -EUCLEAN;
5849  			}
5850  			if (wc->refs[level] == 1) {
5851  				btrfs_tree_unlock_rw(eb, path->locks[level]);
5852  				path->locks[level] = 0;
5853  				return 1;
5854  			}
5855  		}
5856  	}
5857  
5858  	/* wc->stage == DROP_REFERENCE */
5859  	ASSERT(path->locks[level] || wc->refs[level] == 1);
5860  
5861  	if (wc->refs[level] == 1) {
5862  		if (level == 0) {
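			/*
			 * Drop the refs this leaf holds on its file extents,
			 * passing full_backref when the leaf itself carries
			 * the FULL_BACKREF flag.
			 */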
5863  			if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5864  				ret = btrfs_dec_ref(trans, root, eb, 1);
5865  			else
5866  				ret = btrfs_dec_ref(trans, root, eb, 0);
5867  			if (ret) {
5868  				btrfs_abort_transaction(trans, ret);
5869  				return ret;
5870  			}
5871  			if (is_fstree(btrfs_root_id(root))) {
5872  				ret = btrfs_qgroup_trace_leaf_items(trans, eb);
5873  				if (ret) {
5874  					btrfs_err_rl(fs_info,
5875  	"error %d accounting leaf items, quota is out of sync, rescan required",
5876  					     ret);
5877  				}
5878  			}
5879  		}
5880  		/* Make block locked assertion in btrfs_clear_buffer_dirty happy. */
5881  		if (!path->locks[level]) {
5882  			btrfs_tree_lock(eb);
5883  			path->locks[level] = BTRFS_WRITE_LOCK;
5884  		}
5885  		btrfs_clear_buffer_dirty(trans, eb);
5886  	}
5887  
5888  	if (eb == root->node) {
5889  		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5890  			parent = eb->start;
5891  		else if (btrfs_root_id(root) != btrfs_header_owner(eb))
5892  			goto owner_mismatch;
5893  	} else {
5894  		if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5895  			parent = path->nodes[level + 1]->start;
5896  		else if (btrfs_root_id(root) !=
5897  			 btrfs_header_owner(path->nodes[level + 1]))
5898  			goto owner_mismatch;
5899  	}
5900  
5901  	ret = btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent,
5902  				    wc->refs[level] == 1);
5903  	if (ret < 0)
5904  		btrfs_abort_transaction(trans, ret);
5905  out:
5906  	wc->refs[level] = 0;
5907  	wc->flags[level] = 0;
5908  	return ret;
5909  
5910  owner_mismatch:
5911  	btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
5912  		     btrfs_header_owner(eb), btrfs_root_id(root));
5913  	return -EUCLEAN;
5914  }
5915  
5916  /*
5917   * walk_down_tree consists of two steps.
5918   *
5919   * walk_down_proc().  Look up the reference count and reference of our current
5920   * wc->level.  At this point path->nodes[wc->level] should be populated and
5921   * uptodate, and in most cases should already be locked.  If we are in
5922   * DROP_REFERENCE and our refcount is > 1 then we've entered a shared node and
5923   * we can walk back up the tree.  If we are UPDATE_BACKREF we have to set
5924   * FULL_BACKREF on this node if it's not already set, and then do the
5925   * FULL_BACKREF conversion dance, which is to drop the root reference and add
5926   * the shared reference to all of this nodes children.
5927   *
5928   * do_walk_down().  This is where we actually start iterating on the children of
5929   * our current path->nodes[wc->level].  For DROP_REFERENCE that means dropping
5930   * our reference to the children that return false from visit_node_for_delete(),
5931   * which has various conditions where we know we can just drop our reference
5932   * without visiting the node.  For UPDATE_BACKREF we will skip any children that
5933   * visit_node_for_delete() returns false for, only walking down when necessary.
5934   * The bulk of the work for UPDATE_BACKREF occurs in the walk_up_tree() part of
5935   * snapshot deletion.
5936   */
5937  static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
5938  				   struct btrfs_root *root,
5939  				   struct btrfs_path *path,
5940  				   struct walk_control *wc)
5941  {
5942  	int level = wc->level;
5943  	int ret = 0;
5944  
5945  	wc->lookup_info = 1;
5946  	while (level >= 0) {
5947  		ret = walk_down_proc(trans, root, path, wc);
5948  		if (ret)
5949  			break;
5950  
5951  		if (level == 0)
5952  			break;
5953  
5954  		if (path->slots[level] >=
5955  		    btrfs_header_nritems(path->nodes[level]))
5956  			break;
5957  
5958  		ret = do_walk_down(trans, root, path, wc);
5959  		if (ret > 0) {
5960  			path->slots[level]++;
5961  			continue;
5962  		} else if (ret < 0)
5963  			break;
5964  		level = wc->level;
5965  	}
5966  	return (ret == 1) ? 0 : ret;
5967  }
5968  
5969  /*
5970   * walk_up_tree() is responsible for making sure we visit every slot on our
5971   * current node, and if we're at the end of that node then we call
5972   * walk_up_proc() on our current node which will do one of a few things based on
5973   * our stage.
5974   *
5975   * UPDATE_BACKREF.  If our wc->level is currently less than our wc->shared_level
5976   * then we need to walk back up the tree, and then going back down into the
5977   * other slots via walk_down_tree to update any other children from our original
5978   * wc->shared_level.  Once we're at or above our wc->shared_level we can switch
5979   * back to DROP_REFERENCE, lookup the current nodes refs and flags, and carry on.
5980   *
5981   * DROP_REFERENCE. If our refs == 1 then we're going to free this tree block.
5982   * If we're level 0 then we need to btrfs_dec_ref() on all of the data extents
5983   * in our current leaf.  After that we call btrfs_free_tree_block() on the
5984   * current node and walk up to the next node to walk down the next slot.
5985   */
5986  static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
5987  				 struct btrfs_root *root,
5988  				 struct btrfs_path *path,
5989  				 struct walk_control *wc, int max_level)
5990  {
5991  	int level = wc->level;
5992  	int ret;
5993  
5994  	path->slots[level] = btrfs_header_nritems(path->nodes[level]);
5995  	while (level < max_level && path->nodes[level]) {
5996  		wc->level = level;
5997  		if (path->slots[level] + 1 <
5998  		    btrfs_header_nritems(path->nodes[level])) {
5999  			path->slots[level]++;
6000  			return 0;
6001  		} else {
6002  			ret = walk_up_proc(trans, root, path, wc);
6003  			if (ret > 0)
6004  				return 0;
6005  			if (ret < 0)
6006  				return ret;
6007  
6008  			if (path->locks[level]) {
6009  				btrfs_tree_unlock_rw(path->nodes[level],
6010  						     path->locks[level]);
6011  				path->locks[level] = 0;
6012  			}
6013  			free_extent_buffer(path->nodes[level]);
6014  			path->nodes[level] = NULL;
6015  			level++;
6016  		}
6017  	}
6018  	return 1;
6019  }
6020  
6021  /*
6022   * drop a subvolume tree.
6023   *
6024   * this function traverses the tree freeing any blocks that are only
6025   * referenced by the tree.
6026   *
6027   * when a shared tree block is found, this function decreases its
6028   * reference count by one. if update_ref is true, this function
6029   * also makes sure backrefs for the shared block and all lower level
6030   * blocks are properly updated.
6031   *
6032   * If called with for_reloc == 0, may exit early with -EAGAIN
6033   */
6034  int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
6035  {
6036  	const bool is_reloc_root = (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID);
6037  	struct btrfs_fs_info *fs_info = root->fs_info;
6038  	struct btrfs_path *path;
6039  	struct btrfs_trans_handle *trans;
6040  	struct btrfs_root *tree_root = fs_info->tree_root;
6041  	struct btrfs_root_item *root_item = &root->root_item;
6042  	struct walk_control *wc;
6043  	struct btrfs_key key;
6044  	const u64 rootid = btrfs_root_id(root);
6045  	int ret = 0;
6046  	int level;
6047  	bool root_dropped = false;
6048  	bool unfinished_drop = false;
6049  
6050  	btrfs_debug(fs_info, "Drop subvolume %llu", btrfs_root_id(root));
6051  
6052  	path = btrfs_alloc_path();
6053  	if (!path) {
6054  		ret = -ENOMEM;
6055  		goto out;
6056  	}
6057  
6058  	wc = kzalloc(sizeof(*wc), GFP_NOFS);
6059  	if (!wc) {
6060  		btrfs_free_path(path);
6061  		ret = -ENOMEM;
6062  		goto out;
6063  	}
6064  
6065  	/*
6066  	 * Use join to avoid potential EINTR from transaction start. See
6067  	 * wait_reserve_ticket and the whole reservation callchain.
6068  	 */
6069  	if (for_reloc)
6070  		trans = btrfs_join_transaction(tree_root);
6071  	else
6072  		trans = btrfs_start_transaction(tree_root, 0);
6073  	if (IS_ERR(trans)) {
6074  		ret = PTR_ERR(trans);
6075  		goto out_free;
6076  	}
6077  
6078  	ret = btrfs_run_delayed_items(trans);
6079  	if (ret)
6080  		goto out_end_trans;
6081  
6082  	/*
6083  	 * This will help us catch people modifying the fs tree while we're
6084  	 * dropping it.  It is unsafe to mess with the fs tree while it's being
6085  	 * dropped as we unlock the root node and parent nodes as we walk down
6086  	 * the tree, assuming nothing will change.  If something does change
6087  	 * then we'll have stale information and drop references to blocks we've
6088  	 * already dropped.
6089  	 */
6090  	set_bit(BTRFS_ROOT_DELETING, &root->state);
6091  	unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);
6092  
6093  	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
6094  		level = btrfs_header_level(root->node);
6095  		path->nodes[level] = btrfs_lock_root_node(root);
6096  		path->slots[level] = 0;
6097  		path->locks[level] = BTRFS_WRITE_LOCK;
6098  		memset(&wc->update_progress, 0,
6099  		       sizeof(wc->update_progress));
6100  	} else {
6101  		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
6102  		memcpy(&wc->update_progress, &key,
6103  		       sizeof(wc->update_progress));
6104  
6105  		level = btrfs_root_drop_level(root_item);
6106  		BUG_ON(level == 0);
6107  		path->lowest_level = level;
6108  		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6109  		path->lowest_level = 0;
6110  		if (ret < 0)
6111  			goto out_end_trans;
6112  
6113  		WARN_ON(ret > 0);
6114  		ret = 0;
6115  
6116  		/*
6117  		 * unlock our path, this is safe because only this
6118  		 * function is allowed to delete this snapshot
6119  		 */
6120  		btrfs_unlock_up_safe(path, 0);
6121  
6122  		level = btrfs_header_level(root->node);
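		/*
		 * Walk from the root down to the recorded drop level, locking
		 * each node and refreshing its ref count and flags as we go.
		 */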
6123  		while (1) {
6124  			btrfs_tree_lock(path->nodes[level]);
6125  			path->locks[level] = BTRFS_WRITE_LOCK;
6126  
6127  			/*
6128  			 * btrfs_lookup_extent_info() returns 0 for success,
6129  			 * or < 0 for error.
6130  			 */
6131  			ret = btrfs_lookup_extent_info(trans, fs_info,
6132  						path->nodes[level]->start,
6133  						level, 1, &wc->refs[level],
6134  						&wc->flags[level], NULL);
6135  			if (ret < 0)
6136  				goto out_end_trans;
6137  
6138  			BUG_ON(wc->refs[level] == 0);
6139  
6140  			if (level == btrfs_root_drop_level(root_item))
6141  				break;
6142  
6143  			btrfs_tree_unlock(path->nodes[level]);
6144  			path->locks[level] = 0;
6145  			WARN_ON(wc->refs[level] != 1);
6146  			level--;
6147  		}
6148  	}
6149  
6150  	wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
6151  	wc->level = level;
6152  	wc->shared_level = -1;
6153  	wc->stage = DROP_REFERENCE;
6154  	wc->update_ref = update_ref;
6155  	wc->keep_locks = 0;
6156  	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
6157  
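	/*
	 * Alternate between walking down the tree (dropping refs) and walking
	 * back up (freeing nodes), recording drop_progress in the root item so
	 * an interrupted drop can be resumed later.
	 */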
6158  	while (1) {
6159  
6160  		ret = walk_down_tree(trans, root, path, wc);
6161  		if (ret < 0) {
6162  			btrfs_abort_transaction(trans, ret);
6163  			break;
6164  		}
6165  
6166  		ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
6167  		if (ret < 0) {
6168  			btrfs_abort_transaction(trans, ret);
6169  			break;
6170  		}
6171  
6172  		if (ret > 0) {
6173  			BUG_ON(wc->stage != DROP_REFERENCE);
6174  			ret = 0;
6175  			break;
6176  		}
6177  
6178  		if (wc->stage == DROP_REFERENCE) {
6179  			wc->drop_level = wc->level;
6180  			btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
6181  					      &wc->drop_progress,
6182  					      path->slots[wc->drop_level]);
6183  		}
6184  		btrfs_cpu_key_to_disk(&root_item->drop_progress,
6185  				      &wc->drop_progress);
6186  		btrfs_set_root_drop_level(root_item, wc->drop_level);
6187  
6188  		BUG_ON(wc->level == 0);
6189  		if (btrfs_should_end_transaction(trans) ||
6190  		    (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
6191  			ret = btrfs_update_root(trans, tree_root,
6192  						&root->root_key,
6193  						root_item);
6194  			if (ret) {
6195  				btrfs_abort_transaction(trans, ret);
6196  				goto out_end_trans;
6197  			}
6198  
6199  			if (!is_reloc_root)
6200  				btrfs_set_last_root_drop_gen(fs_info, trans->transid);
6201  
6202  			btrfs_end_transaction_throttle(trans);
6203  			if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
6204  				btrfs_debug(fs_info,
6205  					    "drop snapshot early exit");
6206  				ret = -EAGAIN;
6207  				goto out_free;
6208  			}
6209  
6210  		       /*
6211  			* Use join to avoid potential EINTR from transaction
6212  			* start. See wait_reserve_ticket and the whole
6213  			* reservation callchain.
6214  			*/
6215  			if (for_reloc)
6216  				trans = btrfs_join_transaction(tree_root);
6217  			else
6218  				trans = btrfs_start_transaction(tree_root, 0);
6219  			if (IS_ERR(trans)) {
6220  				ret = PTR_ERR(trans);
6221  				goto out_free;
6222  			}
6223  		}
6224  	}
6225  	btrfs_release_path(path);
6226  	if (ret)
6227  		goto out_end_trans;
6228  
6229  	ret = btrfs_del_root(trans, &root->root_key);
6230  	if (ret) {
6231  		btrfs_abort_transaction(trans, ret);
6232  		goto out_end_trans;
6233  	}
6234  
6235  	if (!is_reloc_root) {
6236  		ret = btrfs_find_root(tree_root, &root->root_key, path,
6237  				      NULL, NULL);
6238  		if (ret < 0) {
6239  			btrfs_abort_transaction(trans, ret);
6240  			goto out_end_trans;
6241  		} else if (ret > 0) {
6242  			ret = 0;
6243  			/*
6244  			 * If we fail to delete the orphan item this time
6245  			 * around, it'll get picked up the next time.
6246  			 *
6247  			 * The most common failure here is just -ENOENT.
6248  			 */
6249  			btrfs_del_orphan_item(trans, tree_root, btrfs_root_id(root));
6250  		}
6251  	}
6252  
6253  	/*
6254  	 * This subvolume is going to be completely dropped, and won't be
6255  	 * recorded as dirty roots, thus pertrans meta rsv will not be freed at
6256  	 * commit transaction time.  So free it here manually.
6257  	 */
6258  	btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
6259  	btrfs_qgroup_free_meta_all_pertrans(root);
6260  
6261  	if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
6262  		btrfs_add_dropped_root(trans, root);
6263  	else
6264  		btrfs_put_root(root);
6265  	root_dropped = true;
6266  out_end_trans:
6267  	if (!is_reloc_root)
6268  		btrfs_set_last_root_drop_gen(fs_info, trans->transid);
6269  
6270  	btrfs_end_transaction_throttle(trans);
6271  out_free:
6272  	kfree(wc);
6273  	btrfs_free_path(path);
6274  out:
6275  	if (!ret && root_dropped) {
6276  		ret = btrfs_qgroup_cleanup_dropped_subvolume(fs_info, rootid);
6277  		if (ret < 0)
6278  			btrfs_warn_rl(fs_info,
6279  				      "failed to cleanup qgroup 0/%llu: %d",
6280  				      rootid, ret);
6281  		ret = 0;
6282  	}
6283  	/*
6284  	 * We were an unfinished drop root, check to see if there are any
6285  	 * pending, and if not clear and wake up any waiters.
6286  	 */
6287  	if (!ret && unfinished_drop)
6288  		btrfs_maybe_wake_unfinished_drop(fs_info);
6289  
6290  	/*
6291  	 * So if we need to stop dropping the snapshot for whatever reason we
6292  	 * need to make sure to add it back to the dead root list so that we
6293  	 * keep trying to do the work later.  This also cleans up roots if we
6294  	 * don't have them in the radix (like when we recover after a power fail
6295  	 * or unmount) so we don't leak memory.
6296  	 */
6297  	if (!for_reloc && !root_dropped)
6298  		btrfs_add_dead_root(root);
6299  	return ret;
6300  }
6301  
6302  /*
6303   * Drop the subtree rooted at tree block 'node'.
6304   *
6305   * NOTE: this function will unlock and release tree block 'node'.
6306   * It is only used by the relocation code.
6307   */
6308  int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
6309  			struct btrfs_root *root,
6310  			struct extent_buffer *node,
6311  			struct extent_buffer *parent)
6312  {
6313  	struct btrfs_fs_info *fs_info = root->fs_info;
6314  	struct btrfs_path *path;
6315  	struct walk_control *wc;
6316  	int level;
6317  	int parent_level;
6318  	int ret = 0;
6319  
6320  	BUG_ON(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID);
6321  
6322  	path = btrfs_alloc_path();
6323  	if (!path)
6324  		return -ENOMEM;
6325  
6326  	wc = kzalloc(sizeof(*wc), GFP_NOFS);
6327  	if (!wc) {
6328  		btrfs_free_path(path);
6329  		return -ENOMEM;
6330  	}
6331  
6332  	btrfs_assert_tree_write_locked(parent);
6333  	parent_level = btrfs_header_level(parent);
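	/*
	 * Pin @parent with an extra reference: releasing the path at the end
	 * of this function drops one reference per node stored in it, and the
	 * caller still holds its own reference to @parent.
	 */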
6334  	atomic_inc(&parent->refs);
6335  	path->nodes[parent_level] = parent;
6336  	path->slots[parent_level] = btrfs_header_nritems(parent);
6337  
6338  	btrfs_assert_tree_write_locked(node);
6339  	level = btrfs_header_level(node);
6340  	path->nodes[level] = node;
6341  	path->slots[level] = 0;
6342  	path->locks[level] = BTRFS_WRITE_LOCK;
6343  
6344  	wc->refs[parent_level] = 1;
6345  	wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
6346  	wc->level = level;
6347  	wc->shared_level = -1;
6348  	wc->stage = DROP_REFERENCE;
6349  	wc->update_ref = 0;
6350  	wc->keep_locks = 1;
6351  	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
6352  
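	/*
	 * Walk the subtree with the same machinery used for dropping a whole
	 * snapshot: walk_down_tree() descends and drops references, while
	 * walk_up_tree() climbs back up and returns > 0 once it has walked all
	 * the way back up to @parent_level, which means we're done.
	 */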
6353  	while (1) {
6354  		ret = walk_down_tree(trans, root, path, wc);
6355  		if (ret < 0)
6356  			break;
6357  
6358  		ret = walk_up_tree(trans, root, path, wc, parent_level);
6359  		if (ret) {
6360  			if (ret > 0)
6361  				ret = 0;
6362  			break;
6363  		}
6364  	}
6365  
6366  	kfree(wc);
6367  	btrfs_free_path(path);
6368  	return ret;
6369  }
6370  
6371  /*
6372   * Unpin the extent range in an error context and don't add the space back.
6373   * Errors are not propagated further.
6374   */
6375  void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end)
6376  {
6377  	unpin_extent_range(fs_info, start, end, false);
6378  }
6379  
6380  /*
6381   * It used to be that old block groups would be left around forever.
6382   * Iterating over them would be enough to trim unused space.  Since we
6383   * now automatically remove them, we also need to iterate over unallocated
6384   * space.
6385   *
6386   * We don't want a transaction for this since the discard may take a
6387   * substantial amount of time.  We don't require that a transaction be
6388   * running, but we do need to take a running transaction into account
6389   * to ensure that we're not discarding chunks that were released or
6390   * allocated in the current transaction.
6391   *
6392   * Holding the chunks lock will prevent other threads from allocating
6393   * or releasing chunks, but it won't prevent a running transaction
6394   * from committing and releasing the memory that the pending chunks
6395   * list head uses.  For that, we need to take a reference to the
6396   * transaction and hold the commit root sem.  We only need to hold
6397   * it while performing the free space search since we have already
6398   * held back allocations.
6399   */
6400  static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
6401  {
6402  	u64 start = BTRFS_DEVICE_RANGE_RESERVED, len = 0, end = 0;
6403  	int ret;
6404  
6405  	*trimmed = 0;
6406  
6407  	/* Discard not supported = nothing to do. */
6408  	if (!bdev_max_discard_sectors(device->bdev))
6409  		return 0;
6410  
6411  	/* Not writable = nothing to do. */
6412  	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
6413  		return 0;
6414  
6415  	/* No free space = nothing to do. */
6416  	if (device->total_bytes <= device->bytes_used)
6417  		return 0;
6418  
6419  	ret = 0;
6420  
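	/*
	 * Walk the device's allocation state: under chunk_mutex, find the next
	 * range that is neither CHUNK_ALLOCATED nor CHUNK_TRIMMED, discard it,
	 * mark it CHUNK_TRIMMED and advance to the next range.
	 */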
6421  	while (1) {
6422  		struct btrfs_fs_info *fs_info = device->fs_info;
6423  		u64 bytes;
6424  
6425  		ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
6426  		if (ret)
6427  			break;
6428  
6429  		find_first_clear_extent_bit(&device->alloc_state, start,
6430  					    &start, &end,
6431  					    CHUNK_TRIMMED | CHUNK_ALLOCATED);
6432  
6433  		/* Check if there are any CHUNK_* bits left */
6434  		if (start > device->total_bytes) {
6435  			WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
6436  			btrfs_warn_in_rcu(fs_info,
6437  "ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu",
6438  					  start, end - start + 1,
6439  					  btrfs_dev_name(device),
6440  					  device->total_bytes);
6441  			mutex_unlock(&fs_info->chunk_mutex);
6442  			ret = 0;
6443  			break;
6444  		}
6445  
6446  		/* Ensure we skip the reserved space on each device. */
6447  		start = max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED);
6448  
6449  		/*
6450  		 * If find_first_clear_extent_bit finds a range that spans the
6451  		 * end of the device, it will set end to -1. In this case it's
6452  		 * up to the caller to trim the value to the size of the device.
6453  		 */
6454  		end = min(end, device->total_bytes - 1);
6455  
6456  		len = end - start + 1;
6457  
6458  		/* We didn't find any extents */
6459  		if (!len) {
6460  			mutex_unlock(&fs_info->chunk_mutex);
6461  			ret = 0;
6462  			break;
6463  		}
6464  
6465  		ret = btrfs_issue_discard(device->bdev, start, len,
6466  					  &bytes);
6467  		if (!ret)
6468  			set_extent_bit(&device->alloc_state, start,
6469  				       start + bytes - 1, CHUNK_TRIMMED, NULL);
6470  		mutex_unlock(&fs_info->chunk_mutex);
6471  
6472  		if (ret)
6473  			break;
6474  
6475  		start += len;
6476  		*trimmed += bytes;
6477  
6478  		if (btrfs_trim_interrupted()) {
6479  			ret = -ERESTARTSYS;
6480  			break;
6481  		}
6482  
6483  		cond_resched();
6484  	}
6485  
6486  	return ret;
6487  }
6488  
6489  /*
6490   * Trim the whole filesystem by:
6491   * 1) trimming the free space in each block group
6492   * 2) trimming the unallocated space on each device
6493   *
6494   * This will also continue trimming even if a block group or device encounters
6495   * an error.  The return value will be the last error, or 0 if nothing bad
6496   * happens.
6497   */
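/*
 * Illustrative sketch (not kernel code): this is normally reached from user
 * space via the FITRIM ioctl, which hands a struct fstrim_range to this
 * function and copies the updated length back out.  Roughly:
 *
 *	struct fstrim_range range = {
 *		.start = 0,
 *		.len = ULLONG_MAX,	// default: trim the whole filesystem
 *		.minlen = 0,
 *	};
 *	ioctl(fd, FITRIM, &range);	// fd: any open file/dir on the mount
 *	// On success, range.len is set to the number of bytes trimmed.
 */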
6498  int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
6499  {
6500  	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6501  	struct btrfs_block_group *cache = NULL;
6502  	struct btrfs_device *device;
6503  	u64 group_trimmed;
6504  	u64 range_end = U64_MAX;
6505  	u64 start;
6506  	u64 end;
6507  	u64 trimmed = 0;
6508  	u64 bg_failed = 0;
6509  	u64 dev_failed = 0;
6510  	int bg_ret = 0;
6511  	int dev_ret = 0;
6512  	int ret = 0;
6513  
6514  	if (range->start == U64_MAX)
6515  		return -EINVAL;
6516  
6517  	/*
6518  	 * Check range overflow if range->len is set.
6519  	 * The default range->len is U64_MAX.
6520  	 */
6521  	if (range->len != U64_MAX &&
6522  	    check_add_overflow(range->start, range->len, &range_end))
6523  		return -EINVAL;
6524  
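	/*
	 * First pass: trim the free space of every block group overlapping
	 * [range->start, range_end), caching a block group's free space first
	 * if that hasn't happened yet.
	 */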
6525  	cache = btrfs_lookup_first_block_group(fs_info, range->start);
6526  	for (; cache; cache = btrfs_next_block_group(cache)) {
6527  		if (cache->start >= range_end) {
6528  			btrfs_put_block_group(cache);
6529  			break;
6530  		}
6531  
6532  		start = max(range->start, cache->start);
6533  		end = min(range_end, cache->start + cache->length);
6534  
6535  		if (end - start >= range->minlen) {
6536  			if (!btrfs_block_group_done(cache)) {
6537  				ret = btrfs_cache_block_group(cache, true);
6538  				if (ret) {
6539  					bg_failed++;
6540  					bg_ret = ret;
6541  					continue;
6542  				}
6543  			}
6544  			ret = btrfs_trim_block_group(cache,
6545  						     &group_trimmed,
6546  						     start,
6547  						     end,
6548  						     range->minlen);
6549  
6550  			trimmed += group_trimmed;
6551  			if (ret) {
6552  				bg_failed++;
6553  				bg_ret = ret;
6554  				continue;
6555  			}
6556  		}
6557  	}
6558  
6559  	if (bg_failed)
6560  		btrfs_warn(fs_info,
6561  			"failed to trim %llu block group(s), last error %d",
6562  			bg_failed, bg_ret);
6563  
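	/*
	 * Second pass: trim the unallocated space of each present device.
	 * Missing devices are skipped here; read-only and discard-incapable
	 * devices are handled inside btrfs_trim_free_extents().
	 */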
6564  	mutex_lock(&fs_devices->device_list_mutex);
6565  	list_for_each_entry(device, &fs_devices->devices, dev_list) {
6566  		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
6567  			continue;
6568  
6569  		ret = btrfs_trim_free_extents(device, &group_trimmed);
6570  
6571  		trimmed += group_trimmed;
6572  		if (ret) {
6573  			dev_failed++;
6574  			dev_ret = ret;
6575  			break;
6576  		}
6577  	}
6578  	mutex_unlock(&fs_devices->device_list_mutex);
6579  
6580  	if (dev_failed)
6581  		btrfs_warn(fs_info,
6582  			"failed to trim %llu device(s), last error %d",
6583  			dev_failed, dev_ret);
6584  	range->len = trimmed;
6585  	if (bg_ret)
6586  		return bg_ret;
6587  	return dev_ret;
6588  }
6589