// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2009 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/error-injection.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "volumes.h"
#include "locking.h"
#include "btrfs_inode.h"
#include "async-thread.h"
#include "free-space-cache.h"
#include "qgroup.h"
#include "print-tree.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "backref.h"
#include "misc.h"
#include "subpage.h"
#include "zoned.h"
#include "inode-item.h"
#include "space-info.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
#include "file-item.h"
#include "relocation.h"
#include "super.h"
#include "tree-checker.h"
#include "raid-stripe-tree.h"

/*
 * Relocation overview
 *
 * [What does relocation do]
 *
 * The objective of relocation is to relocate all extents of the target block
 * group to other block groups.
 * This is used by resize (shrink only), profile conversion, space compaction,
 * and the balance routine to spread chunks over devices.
 *
 * 		Before		|		After
 * ------------------------------------------------------------------
 *  BG A: 10 data extents	| BG A: deleted
 *  BG B:  2 data extents	| BG B: 10 data extents (2 old + 8 relocated)
 *  BG C:  1 data extent	| BG C:  3 data extents (1 old + 2 relocated)
 *
 * [How does relocation work]
 *
 * 1.   Mark the target block group read-only
 *      New extents won't be allocated from the target block group.
 *
 * 2.1  Record each extent in the target block group
 *      To build a proper map of extents to be relocated.
 *
 * 2.2  Build data reloc tree and reloc trees
 *      Data reloc tree will contain an inode, recording all newly relocated
 *      data extents.
 *      There will be only one data reloc tree for one data block group.
 *
 *      Reloc tree will be a special snapshot of its source tree, containing
 *      relocated tree blocks.
 *      Each tree referring to a tree block in the target block group will get
 *      its reloc tree built.
 *
 * 2.3  Swap source tree with its corresponding reloc tree
 *      Each involved tree only refers to new extents after swap.
 *
 * 3.   Cleanup reloc trees and data reloc tree.
 *      As old extents in the target block group are still referenced by reloc
 *      trees, we need to clean them up before really freeing the target block
 *      group.
 *
 * The main complexity is in steps 2.2 and 2.3.
 *
 * The entry point of relocation is the relocate_block_group() function.
 */

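/*
 * Rough shape of one relocation pass, as a simplified sketch (the real driver
 * is btrfs_relocate_block_group(), which also marks the block group read-only,
 * i.e. step 1 above; error handling and retries are omitted here):
 *
 *	relocate_block_group(rc);	// steps 2.1 and 2.2
 *	  -> merge_reloc_roots(rc);	// step 2.3
 *	  -> clean_dirty_subvols(rc);	// step 3
 */

/*
 * Number of tree blocks worth of metadata reserved up front for relocation;
 * used when sizing the relocation block reservation.
 */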
#define RELOCATION_RESERVED_NODES	256
/*
 * Map address of tree root to tree.
 */
struct mapping_node {
	struct {
		struct rb_node rb_node;
		u64 bytenr;
	}; /* Use rb_simple_node for search/insert */
	void *data;
};

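/* Rb-tree of the mapping_nodes above, protected by @lock. */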
struct mapping_tree {
	struct rb_root rb_root;
	spinlock_t lock;
};

/*
 * Represents a tree block to process.
 */
struct tree_block {
	struct {
		struct rb_node rb_node;
		u64 bytenr;
	}; /* Use rb_simple_node for search/insert */
	u64 owner;
	struct btrfs_key key;
	u8 level;
	bool key_ready;
};

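/* Maximum number of data extent boundaries a single cluster keeps track of. */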
#define MAX_EXTENTS 128

struct file_extent_cluster {
	u64 start;
	u64 end;
	u64 boundary[MAX_EXTENTS];
	unsigned int nr;
	u64 owning_root;
};

/* Stages of data relocation. */
enum reloc_stage {
	MOVE_DATA_EXTENTS,
	UPDATE_DATA_PTRS
};

struct reloc_control {
	/* block group to relocate */
	struct btrfs_block_group *block_group;
	/* extent tree */
	struct btrfs_root *extent_root;
	/* inode for moving data */
	struct inode *data_inode;

	struct btrfs_block_rsv *block_rsv;

	struct btrfs_backref_cache backref_cache;

	struct file_extent_cluster cluster;
	/* tree blocks that have been processed */
	struct extent_io_tree processed_blocks;
	/* map start of tree root to corresponding reloc tree */
	struct mapping_tree reloc_root_tree;
	/* list of reloc trees */
	struct list_head reloc_roots;
	/* list of subvolume trees that get relocated */
	struct list_head dirty_subvol_roots;
	/* size of metadata reservation for merging reloc trees */
	u64 merging_rsv_size;
	/* size of relocated tree nodes */
	u64 nodes_relocated;
	/* reserved size for block group relocation */
	u64 reserved_bytes;

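	/* Progress tracking: where the extent search resumes and how many were found. */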
	u64 search_start;
	u64 extents_found;

	enum reloc_stage stage;
	bool create_reloc_tree;
	bool merge_reloc_tree;
	bool found_file_extent;
};

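/*
 * Mark the tree block of a backref node as processed.  The block is recorded
 * in the processed_blocks io tree if it is a leaf or lies inside the target
 * block group; the node itself is always flagged as processed.
 */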
static void mark_block_processed(struct reloc_control *rc,
				 struct btrfs_backref_node *node)
{
	u32 blocksize;

	if (node->level == 0 ||
	    in_range(node->bytenr, rc->block_group->start,
		     rc->block_group->length)) {
		blocksize = rc->extent_root->fs_info->nodesize;
		set_extent_bit(&rc->processed_blocks, node->bytenr,
			       node->bytenr + blocksize - 1, EXTENT_DIRTY, NULL);
	}
	node->processed = 1;
}

/*
 * Walk up backref nodes until we reach the node that represents the tree root.
 */
static struct btrfs_backref_node *walk_up_backref(
		struct btrfs_backref_node *node,
		struct btrfs_backref_edge *edges[], int *index)
{
	struct btrfs_backref_edge *edge;
	int idx = *index;

	while (!list_empty(&node->upper)) {
		edge = list_entry(node->upper.next,
				  struct btrfs_backref_edge, list[LOWER]);
		edges[idx++] = edge;
		node = edge->node[UPPER];
	}
	BUG_ON(node->detached);
	*index = idx;
	return node;
}

/*
 * Walk down backref nodes to find the start of the next reference path.
 */
static struct btrfs_backref_node *walk_down_backref(
		struct btrfs_backref_edge *edges[], int *index)
{
	struct btrfs_backref_edge *edge;
	struct btrfs_backref_node *lower;
	int idx = *index;

	while (idx > 0) {
		edge = edges[idx - 1];
		lower = edge->node[LOWER];
		if (list_is_last(&edge->list[LOWER], &lower->upper)) {
			idx--;
			continue;
		}
		edge = list_entry(edge->list[LOWER].next,
				  struct btrfs_backref_edge, list[LOWER]);
		edges[idx - 1] = edge;
		*index = idx;
		return edge->node[UPPER];
	}
	*index = 0;
	return NULL;
}

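/*
 * Check if the subvolume's reloc root has been marked dead, i.e. its tree
 * blocks have already been swapped with the subvolume tree.
 */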
static bool reloc_root_is_dead(const struct btrfs_root *root)
{
	/*
	 * Pair with set_bit/clear_bit in clean_dirty_subvols and
	 * btrfs_update_reloc_root. We need to see the updated bit before
	 * trying to access reloc_root.
	 */
	smp_rmb();
	if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state))
		return true;
	return false;
}

/*
 * Check if this subvolume tree has a valid reloc tree.
 *
 * A reloc tree after the swap is considered dead, and thus not valid.
 * This is enough for most callers, as they don't distinguish a dead reloc
 * root from no reloc root.  But btrfs_should_ignore_reloc_root() below is a
 * special case.
 */
static bool have_reloc_root(const struct btrfs_root *root)
{
	if (reloc_root_is_dead(root))
		return false;
	if (!root->reloc_root)
		return false;
	return true;
}

bool btrfs_should_ignore_reloc_root(const struct btrfs_root *root)
{
	struct btrfs_root *reloc_root;

	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
		return false;

	/* This root has been merged with its reloc tree, we can ignore it */
	if (reloc_root_is_dead(root))
		return true;

	reloc_root = root->reloc_root;
	if (!reloc_root)
		return false;

	if (btrfs_header_generation(reloc_root->commit_root) ==
	    root->fs_info->running_transaction->transid)
		return false;
	/*
	 * If there is a reloc tree and it was created in a previous
	 * transaction, the backref lookup can find the reloc tree, so the
	 * backref node for the fs tree root is useless for relocation.
	 */
	return true;
}

/*
 * Find reloc tree by address of tree root.
 */
struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct reloc_control *rc = fs_info->reloc_ctl;
	struct rb_node *rb_node;
	struct mapping_node *node;
	struct btrfs_root *root = NULL;

	ASSERT(rc);
	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, bytenr);
	if (rb_node) {
		node = rb_entry(rb_node, struct mapping_node, rb_node);
		root = node->data;
	}
	spin_unlock(&rc->reloc_root_tree.lock);
	return btrfs_grab_root(root);
}

/*
 * For useless nodes, do two major clean ups:
 *
 * - Cleanup the children edges and nodes
 *   If a child node is also orphan (no parent) during cleanup, then the child
 *   node will also be cleaned up.
 *
 * - Freeing up leaves (level 0), keeping nodes detached
 *   For nodes, the node is still cached as "detached"
 *
 * Return false if @node is not in the @useless_nodes list.
 * Return true if @node is in the @useless_nodes list.
 */
static bool handle_useless_nodes(struct reloc_control *rc,
				 struct btrfs_backref_node *node)
{
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	struct list_head *useless_node = &cache->useless_node;
	bool ret = false;

	while (!list_empty(useless_node)) {
		struct btrfs_backref_node *cur;

		cur = list_first_entry(useless_node, struct btrfs_backref_node,
				 list);
		list_del_init(&cur->list);

		/* Only tree root nodes can be added to @useless_nodes */
		ASSERT(list_empty(&cur->upper));

		if (cur == node)
			ret = true;

		/* The node is the lowest node */
		if (cur->lowest) {
			list_del_init(&cur->lower);
			cur->lowest = 0;
		}

		/* Cleanup the lower edges */
		while (!list_empty(&cur->lower)) {
			struct btrfs_backref_edge *edge;
			struct btrfs_backref_node *lower;

			edge = list_entry(cur->lower.next,
					struct btrfs_backref_edge, list[UPPER]);
			list_del(&edge->list[UPPER]);
			list_del(&edge->list[LOWER]);
			lower = edge->node[LOWER];
			btrfs_backref_free_edge(cache, edge);

			/* Child node is also orphan, queue for cleanup */
			if (list_empty(&lower->upper))
				list_add(&lower->list, useless_node);
		}
		/* Mark this block processed for relocation */
		mark_block_processed(rc, cur);

		/*
		 * Backref nodes for tree leaves are deleted from the cache.
		 * Backref nodes for upper level tree blocks are left in the
		 * cache to avoid unnecessary backref lookup.
		 */
		if (cur->level > 0) {
			list_add(&cur->list, &cache->detached);
			cur->detached = 1;
		} else {
			rb_erase(&cur->rb_node, &cache->rb_root);
			btrfs_backref_free_node(cache, cur);
		}
	}
	return ret;
}

/*
 * Build backref tree for a given tree block. Root of the backref tree
 * corresponds to the tree block, leaves of the backref tree correspond to
 * roots of b-trees that reference the tree block.
 *
 * The basic idea of this function is to check backrefs of a given block to
 * find upper level blocks that reference the block, and then check backrefs
 * of these upper level blocks recursively. The recursion stops when the tree
 * root is reached or backrefs for the block are cached.
 *
 * NOTE: if we find that backrefs for a block are cached, we know backrefs for
 * all upper level blocks that directly/indirectly reference the block are also
 * cached.
 */
static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
			struct btrfs_trans_handle *trans,
			struct reloc_control *rc, struct btrfs_key *node_key,
			int level, u64 bytenr)
{
	struct btrfs_backref_iter *iter;
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	/* For searching parent of TREE_BLOCK_REF */
	struct btrfs_path *path;
	struct btrfs_backref_node *cur;
	struct btrfs_backref_node *node = NULL;
	struct btrfs_backref_edge *edge;
	int ret;

	iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info);
	if (!iter)
		return ERR_PTR(-ENOMEM);
	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	node = btrfs_backref_alloc_node(cache, bytenr, level);
	if (!node) {
		ret = -ENOMEM;
		goto out;
	}

	node->lowest = 1;
	cur = node;

	/* Breadth-first search to build backref cache */
	do {
		ret = btrfs_backref_add_tree_node(trans, cache, path, iter,
						  node_key, cur);
		if (ret < 0)
			goto out;

		edge = list_first_entry_or_null(&cache->pending_edge,
				struct btrfs_backref_edge, list[UPPER]);
		/*
		 * The pending list isn't empty, take the first block to
		 * process.
		 */
		if (edge) {
			list_del_init(&edge->list[UPPER]);
			cur = edge->node[UPPER];
		}
	} while (edge);

	/* Finish the upper linkage of newly added edges/nodes */
	ret = btrfs_backref_finish_upper_links(cache, node);
	if (ret < 0)
		goto out;

	if (handle_useless_nodes(rc, node))
		node = NULL;
out:
	btrfs_free_path(iter->path);
	kfree(iter);
	btrfs_free_path(path);
	if (ret) {
		btrfs_backref_error_cleanup(cache, node);
		return ERR_PTR(ret);
	}
	ASSERT(!node || !node->detached);
	ASSERT(list_empty(&cache->useless_node) &&
	       list_empty(&cache->pending_edge));
	return node;
}

/*
 * Helper to add a backref node for the newly created snapshot.
 * The backref node is created by cloning the backref node that corresponds
 * to the root of the source tree.
 */
static int clone_backref_node(struct btrfs_trans_handle *trans,
			      struct reloc_control *rc,
			      const struct btrfs_root *src,
			      struct btrfs_root *dest)
{
	struct btrfs_root *reloc_root = src->reloc_root;
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	struct btrfs_backref_node *node = NULL;
	struct btrfs_backref_node *new_node;
	struct btrfs_backref_edge *edge;
	struct btrfs_backref_edge *new_edge;
	struct rb_node *rb_node;

	rb_node = rb_simple_search(&cache->rb_root, src->commit_root->start);
	if (rb_node) {
		node = rb_entry(rb_node, struct btrfs_backref_node, rb_node);
		if (node->detached)
			node = NULL;
		else
			BUG_ON(node->new_bytenr != reloc_root->node->start);
	}

	if (!node) {
		rb_node = rb_simple_search(&cache->rb_root,
					   reloc_root->commit_root->start);
		if (rb_node) {
			node = rb_entry(rb_node, struct btrfs_backref_node,
					rb_node);
			BUG_ON(node->detached);
		}
	}

	if (!node)
		return 0;

	new_node = btrfs_backref_alloc_node(cache, dest->node->start,
					    node->level);
	if (!new_node)
		return -ENOMEM;

	new_node->lowest = node->lowest;
	new_node->checked = 1;
	new_node->root = btrfs_grab_root(dest);
	ASSERT(new_node->root);

	if (!node->lowest) {
		list_for_each_entry(edge, &node->lower, list[UPPER]) {
			new_edge = btrfs_backref_alloc_edge(cache);
			if (!new_edge)
				goto fail;

			btrfs_backref_link_edge(new_edge, edge->node[LOWER],
						new_node, LINK_UPPER);
		}
	} else {
		list_add_tail(&new_node->lower, &cache->leaves);
	}

	rb_node = rb_simple_insert(&cache->rb_root, new_node->bytenr,
				   &new_node->rb_node);
	if (rb_node)
		btrfs_backref_panic(trans->fs_info, new_node->bytenr, -EEXIST);

	if (!new_node->lowest) {
		list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
			list_add_tail(&new_edge->list[LOWER],
				      &new_edge->node[LOWER]->upper);
		}
	}
	return 0;
fail:
	while (!list_empty(&new_node->lower)) {
		new_edge = list_entry(new_node->lower.next,
				      struct btrfs_backref_edge, list[UPPER]);
		list_del(&new_edge->list[UPPER]);
		btrfs_backref_free_edge(cache, new_edge);
	}
	btrfs_backref_free_node(cache, new_node);
	return -ENOMEM;
}

/*
 * Helper to add the 'address of tree root -> reloc tree' mapping.
 */
static int __add_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node;
	struct reloc_control *rc = fs_info->reloc_ctl;

	node = kmalloc(sizeof(*node), GFP_NOFS);
	if (!node)
		return -ENOMEM;

	node->bytenr = root->commit_root->start;
	node->data = root;

	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
				   node->bytenr, &node->rb_node);
	spin_unlock(&rc->reloc_root_tree.lock);
	if (rb_node) {
		btrfs_err(fs_info,
			    "Duplicate root found for start=%llu while inserting into relocation tree",
			    node->bytenr);
		return -EEXIST;
	}

	list_add_tail(&root->root_list, &rc->reloc_roots);
	return 0;
}

/*
 * Helper to delete the 'address of tree root -> reloc tree'
 * mapping.
 */
static void __del_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node = NULL;
	struct reloc_control *rc = fs_info->reloc_ctl;
	bool put_ref = false;

	if (rc && root->node) {
		spin_lock(&rc->reloc_root_tree.lock);
		rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
					   root->commit_root->start);
		if (rb_node) {
			node = rb_entry(rb_node, struct mapping_node, rb_node);
			rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
			RB_CLEAR_NODE(&node->rb_node);
		}
		spin_unlock(&rc->reloc_root_tree.lock);
		ASSERT(!node || (struct btrfs_root *)node->data == root);
	}

	/*
	 * We only put the reloc root here if it's on the list.  There's a lot
	 * of places where the pattern is to splice the rc->reloc_roots, process
	 * the reloc roots, and then add the reloc root back onto
	 * rc->reloc_roots.  If we call __del_reloc_root while it's off of the
	 * list we don't want the reference being dropped, because the guy
	 * messing with the list is in charge of the reference.
	 */
	spin_lock(&fs_info->trans_lock);
	if (!list_empty(&root->root_list)) {
		put_ref = true;
		list_del_init(&root->root_list);
	}
	spin_unlock(&fs_info->trans_lock);
	if (put_ref)
		btrfs_put_root(root);
	kfree(node);
}

/*
 * Helper to update the 'address of tree root -> reloc tree'
 * mapping.
 */
static int __update_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node = NULL;
	struct reloc_control *rc = fs_info->reloc_ctl;

	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
				   root->commit_root->start);
	if (rb_node) {
		node = rb_entry(rb_node, struct mapping_node, rb_node);
		rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
	}
	spin_unlock(&rc->reloc_root_tree.lock);

	if (!node)
		return 0;
	BUG_ON((struct btrfs_root *)node->data != root);

	spin_lock(&rc->reloc_root_tree.lock);
	node->bytenr = root->node->start;
	rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
				   node->bytenr, &node->rb_node);
	spin_unlock(&rc->reloc_root_tree.lock);
	if (rb_node)
		btrfs_backref_panic(fs_info, node->bytenr, -EEXIST);
	return 0;
}

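/*
 * Create a reloc root for @root, using @objectid as the root_key offset.  The
 * tree is produced by copying either the commit root (when creating the reloc
 * root of the tree being relocated) or the current root node (for the
 * post-snapshot case), and inserting a new root item for it.
 */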
static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
					struct btrfs_root *root, u64 objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct extent_buffer *eb;
	struct btrfs_root_item *root_item;
	struct btrfs_key root_key;
	int ret = 0;
	bool must_abort = false;

	root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
	if (!root_item)
		return ERR_PTR(-ENOMEM);

	root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
	root_key.type = BTRFS_ROOT_ITEM_KEY;
	root_key.offset = objectid;

	if (btrfs_root_id(root) == objectid) {
		u64 commit_root_gen;

		/* Called by btrfs_init_reloc_root() */
		ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
				      BTRFS_TREE_RELOC_OBJECTID);
		if (ret)
			goto fail;

		/*
		 * Set the last_snapshot field to the generation of the commit
		 * root - like this ctree.c:btrfs_block_can_be_shared() behaves
		 * correctly (returns true) when the relocation root is created
		 * either inside the critical section of a transaction commit
		 * (through transaction.c:qgroup_account_snapshot()) and when
		 * it's created before the transaction commit is started.
		 */
		commit_root_gen = btrfs_header_generation(root->commit_root);
		btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen);
	} else {
		/*
		 * Called by btrfs_reloc_post_snapshot_hook().
		 * The source tree is a reloc tree, all tree blocks
		 * modified after it was created have the RELOC flag
		 * set in their headers. So it's OK to not update
		 * the 'last_snapshot'.
		 */
		ret = btrfs_copy_root(trans, root, root->node, &eb,
				      BTRFS_TREE_RELOC_OBJECTID);
		if (ret)
			goto fail;
	}

	/*
	 * We have changed references at this point, we must abort the
	 * transaction if anything fails.
	 */
	must_abort = true;

	memcpy(root_item, &root->root_item, sizeof(*root_item));
	btrfs_set_root_bytenr(root_item, eb->start);
	btrfs_set_root_level(root_item, btrfs_header_level(eb));
	btrfs_set_root_generation(root_item, trans->transid);

	if (btrfs_root_id(root) == objectid) {
		btrfs_set_root_refs(root_item, 0);
		memset(&root_item->drop_progress, 0,
		       sizeof(struct btrfs_disk_key));
		btrfs_set_root_drop_level(root_item, 0);
	}

	btrfs_tree_unlock(eb);
	free_extent_buffer(eb);

	ret = btrfs_insert_root(trans, fs_info->tree_root,
				&root_key, root_item);
	if (ret)
		goto fail;

	kfree(root_item);

	reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key);
	if (IS_ERR(reloc_root)) {
		ret = PTR_ERR(reloc_root);
		goto abort;
	}
	set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
	btrfs_set_root_last_trans(reloc_root, trans->transid);
	return reloc_root;
fail:
	kfree(root_item);
abort:
	if (must_abort)
		btrfs_abort_transaction(trans, ret);
	return ERR_PTR(ret);
}

/*
 * Create a reloc tree for a given fs tree. The reloc tree is just a
 * snapshot of the fs tree with a special root objectid.
 *
 * The reloc_root comes out of here with two references, one for
 * root->reloc_root, and another for being on the rc->reloc_roots list.
 */
int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct reloc_control *rc = fs_info->reloc_ctl;
	struct btrfs_block_rsv *rsv;
	int clear_rsv = 0;
	int ret;

	if (!rc)
		return 0;

	/*
	 * The subvolume has a reloc tree but the swap is finished, no need to
	 * create/update the dead reloc tree.
	 */
	if (reloc_root_is_dead(root))
		return 0;

	/*
	 * This is subtle but important.  We do not do
	 * record_root_in_transaction for reloc roots, instead we record their
	 * corresponding fs root, and then here we update the last trans for the
	 * reloc root.  This means that we have to do this for the entire life
	 * of the reloc root, regardless of which stage of the relocation we are
	 * in.
	 */
	if (root->reloc_root) {
		reloc_root = root->reloc_root;
		btrfs_set_root_last_trans(reloc_root, trans->transid);
		return 0;
	}

	/*
	 * We are merging reloc roots, we do not need new reloc trees.  Also
	 * reloc trees never need their own reloc tree.
	 */
	if (!rc->create_reloc_tree || btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
		return 0;

	if (!trans->reloc_reserved) {
		rsv = trans->block_rsv;
		trans->block_rsv = rc->block_rsv;
		clear_rsv = 1;
	}
	reloc_root = create_reloc_root(trans, root, btrfs_root_id(root));
	if (clear_rsv)
		trans->block_rsv = rsv;
	if (IS_ERR(reloc_root))
		return PTR_ERR(reloc_root);

	ret = __add_reloc_root(reloc_root);
	ASSERT(ret != -EEXIST);
	if (ret) {
		/* Pairs with create_reloc_root */
		btrfs_put_root(reloc_root);
		return ret;
	}
	root->reloc_root = btrfs_grab_root(reloc_root);
	return 0;
}

/*
 * Update the root item of a reloc tree.
 */
int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct btrfs_root_item *root_item;
	int ret;

	if (!have_reloc_root(root))
		return 0;

	reloc_root = root->reloc_root;
	root_item = &reloc_root->root_item;

	/*
	 * We are probably ok here, but __del_reloc_root() will drop its ref of
	 * the root.  We have the ref for root->reloc_root, but just in case
	 * hold it while we update the reloc root.
	 */
	btrfs_grab_root(reloc_root);

	/* root->reloc_root will stay until the current relocation finishes */
	if (fs_info->reloc_ctl && fs_info->reloc_ctl->merge_reloc_tree &&
	    btrfs_root_refs(root_item) == 0) {
		set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
		/*
		 * Mark the tree as dead before we change reloc_root so
		 * have_reloc_root will not touch it from now on.
		 */
		smp_wmb();
		__del_reloc_root(reloc_root);
	}

	if (reloc_root->commit_root != reloc_root->node) {
		__update_reloc_root(reloc_root);
		btrfs_set_root_node(root_item, reloc_root->node);
		free_extent_buffer(reloc_root->commit_root);
		reloc_root->commit_root = btrfs_root_node(reloc_root);
	}

	ret = btrfs_update_root(trans, fs_info->tree_root,
				&reloc_root->root_key, root_item);
	btrfs_put_root(reloc_root);
	return ret;
}

/*
 * Get the new location of the data.
 */
static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
			    u64 bytenr, u64 num_bytes)
{
	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *fi;
	struct extent_buffer *leaf;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	bytenr -= BTRFS_I(reloc_inode)->reloc_block_group_start;
	ret = btrfs_lookup_file_extent(NULL, root, path,
			btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	leaf = path->nodes[0];
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);

	BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
	       btrfs_file_extent_compression(leaf, fi) ||
	       btrfs_file_extent_encryption(leaf, fi) ||
	       btrfs_file_extent_other_encoding(leaf, fi));

	if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
		ret = -EINVAL;
		goto out;
	}

	*new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Update file extent items in the tree leaf to point to
 * the new locations.
 */
static noinline_for_stack
int replace_file_extents(struct btrfs_trans_handle *trans,
			 struct reloc_control *rc,
			 struct btrfs_root *root,
			 struct extent_buffer *leaf)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	struct btrfs_file_extent_item *fi;
	struct btrfs_inode *inode = NULL;
	u64 parent;
	u64 bytenr;
	u64 new_bytenr = 0;
	u64 num_bytes;
	u64 end;
	u32 nritems;
	u32 i;
	int ret = 0;
	int first = 1;
	int dirty = 0;

	if (rc->stage != UPDATE_DATA_PTRS)
		return 0;

	/* reloc trees always use full backref */
	if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
		parent = leaf->start;
	else
		parent = 0;

	nritems = btrfs_header_nritems(leaf);
	for (i = 0; i < nritems; i++) {
		struct btrfs_ref ref = { 0 };

		cond_resched();
		btrfs_item_key_to_cpu(leaf, &key, i);
		if (key.type != BTRFS_EXTENT_DATA_KEY)
			continue;
		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
		if (btrfs_file_extent_type(leaf, fi) ==
		    BTRFS_FILE_EXTENT_INLINE)
			continue;
		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
		if (bytenr == 0)
			continue;
		if (!in_range(bytenr, rc->block_group->start,
			      rc->block_group->length))
			continue;

		/*
		 * If we are modifying a block in the fs tree, wait for
		 * read_folio to complete and drop the extent cache.
		 */
		if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID) {
			if (first) {
				inode = btrfs_find_first_inode(root, key.objectid);
				first = 0;
			} else if (inode && btrfs_ino(inode) < key.objectid) {
				btrfs_add_delayed_iput(inode);
				inode = btrfs_find_first_inode(root, key.objectid);
			}
			if (inode && btrfs_ino(inode) == key.objectid) {
				struct extent_state *cached_state = NULL;

				end = key.offset +
				      btrfs_file_extent_num_bytes(leaf, fi);
				WARN_ON(!IS_ALIGNED(key.offset,
						    fs_info->sectorsize));
				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
				end--;
				/* Take mmap lock to serialize with reflinks. */
				if (!down_read_trylock(&inode->i_mmap_lock))
					continue;
				ret = try_lock_extent(&inode->io_tree, key.offset,
						      end, &cached_state);
				if (!ret) {
					up_read(&inode->i_mmap_lock);
					continue;
				}

				btrfs_drop_extent_map_range(inode, key.offset, end, true);
				unlock_extent(&inode->io_tree, key.offset, end,
					      &cached_state);
				up_read(&inode->i_mmap_lock);
			}
		}

		ret = get_new_location(rc->data_inode, &new_bytenr,
				       bytenr, num_bytes);
		if (ret) {
			/*
			 * Don't have to abort since we've not changed anything
			 * in the file extent yet.
			 */
			break;
		}

		btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
		dirty = 1;

		key.offset -= btrfs_file_extent_offset(leaf, fi);
		ref.action = BTRFS_ADD_DELAYED_REF;
		ref.bytenr = new_bytenr;
		ref.num_bytes = num_bytes;
		ref.parent = parent;
		ref.owning_root = btrfs_root_id(root);
		ref.ref_root = btrfs_header_owner(leaf);
		btrfs_init_data_ref(&ref, key.objectid, key.offset,
				    btrfs_root_id(root), false);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		ref.action = BTRFS_DROP_DELAYED_REF;
		ref.bytenr = bytenr;
		ref.num_bytes = num_bytes;
		ref.parent = parent;
		ref.owning_root = btrfs_root_id(root);
		ref.ref_root = btrfs_header_owner(leaf);
		btrfs_init_data_ref(&ref, key.objectid, key.offset,
				    btrfs_root_id(root), false);
		ret = btrfs_free_extent(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}
	}
	if (dirty)
		btrfs_mark_buffer_dirty(trans, leaf);
	if (inode)
		btrfs_add_delayed_iput(inode);
	return ret;
}

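/*
 * Compare the key at @slot of @eb with the key at the current slot of @path
 * at @level.  Returns 0 when they match.
 */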
static noinline_for_stack int memcmp_node_keys(const struct extent_buffer *eb,
					       int slot, const struct btrfs_path *path,
					       int level)
{
	struct btrfs_disk_key key1;
	struct btrfs_disk_key key2;
	btrfs_node_key(eb, &key1, slot);
	btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
	return memcmp(&key1, &key2, sizeof(key1));
}

/*
 * Try to replace tree blocks in the fs tree with the new blocks
 * in the reloc tree. Tree blocks that haven't been modified since the
 * reloc tree was created can be replaced.
 *
 * If a block was replaced, the level of the block + 1 is returned.
 * If no block got replaced, 0 is returned. If there are other
 * errors, a negative error number is returned.
 */
static noinline_for_stack
int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
		 struct btrfs_root *dest, struct btrfs_root *src,
		 struct btrfs_path *path, struct btrfs_key *next_key,
		 int lowest_level, int max_level)
{
	struct btrfs_fs_info *fs_info = dest->fs_info;
	struct extent_buffer *eb;
	struct extent_buffer *parent;
	struct btrfs_ref ref = { 0 };
	struct btrfs_key key;
	u64 old_bytenr;
	u64 new_bytenr;
	u64 old_ptr_gen;
	u64 new_ptr_gen;
	u64 last_snapshot;
	u32 blocksize;
	int cow = 0;
	int level;
	int ret;
	int slot;

	ASSERT(btrfs_root_id(src) == BTRFS_TREE_RELOC_OBJECTID);
	ASSERT(btrfs_root_id(dest) != BTRFS_TREE_RELOC_OBJECTID);

	last_snapshot = btrfs_root_last_snapshot(&src->root_item);
again:
	slot = path->slots[lowest_level];
	btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);

	eb = btrfs_lock_root_node(dest);
	level = btrfs_header_level(eb);

	if (level < lowest_level) {
		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
		return 0;
	}

	if (cow) {
		ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb,
				      BTRFS_NESTING_COW);
		if (ret) {
			btrfs_tree_unlock(eb);
			free_extent_buffer(eb);
			return ret;
		}
	}

	if (next_key) {
		next_key->objectid = (u64)-1;
		next_key->type = (u8)-1;
		next_key->offset = (u64)-1;
	}

	parent = eb;
	while (1) {
		level = btrfs_header_level(parent);
		ASSERT(level >= lowest_level);

		ret = btrfs_bin_search(parent, 0, &key, &slot);
		if (ret < 0)
			break;
		if (ret && slot > 0)
			slot--;

		if (next_key && slot + 1 < btrfs_header_nritems(parent))
			btrfs_node_key_to_cpu(parent, next_key, slot + 1);

		old_bytenr = btrfs_node_blockptr(parent, slot);
		blocksize = fs_info->nodesize;
		old_ptr_gen = btrfs_node_ptr_generation(parent, slot);

		if (level <= max_level) {
			eb = path->nodes[level];
			new_bytenr = btrfs_node_blockptr(eb,
							path->slots[level]);
			new_ptr_gen = btrfs_node_ptr_generation(eb,
							path->slots[level]);
		} else {
			new_bytenr = 0;
			new_ptr_gen = 0;
		}

		if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
			ret = level;
			break;
		}

		if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
		    memcmp_node_keys(parent, slot, path, level)) {
			if (level <= lowest_level) {
				ret = 0;
				break;
			}

			eb = btrfs_read_node_slot(parent, slot);
			if (IS_ERR(eb)) {
				ret = PTR_ERR(eb);
				break;
			}
			btrfs_tree_lock(eb);
			if (cow) {
				ret = btrfs_cow_block(trans, dest, eb, parent,
						      slot, &eb,
						      BTRFS_NESTING_COW);
				if (ret) {
					btrfs_tree_unlock(eb);
					free_extent_buffer(eb);
					break;
				}
			}

			btrfs_tree_unlock(parent);
			free_extent_buffer(parent);

			parent = eb;
			continue;
		}

		if (!cow) {
			btrfs_tree_unlock(parent);
			free_extent_buffer(parent);
			cow = 1;
			goto again;
		}

		btrfs_node_key_to_cpu(path->nodes[level], &key,
				      path->slots[level]);
		btrfs_release_path(path);

		path->lowest_level = level;
		set_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state);
		ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
		clear_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state);
		path->lowest_level = 0;
		if (ret) {
			if (ret > 0)
				ret = -ENOENT;
			break;
		}

		/*
		 * Inform qgroup to trace both subtrees.
		 *
		 * We must trace both trees.
		 * 1) Tree reloc subtree
		 *    If not traced, we will leak data numbers
		 * 2) Fs subtree
		 *    If not traced, we will double count old data
		 *
		 * We don't scan the subtree right now, but only record
		 * the swapped tree blocks.
		 * The real subtree rescan is delayed until we have new
		 * CoW on the subtree root node before transaction commit.
		 */
		ret = btrfs_qgroup_add_swapped_blocks(trans, dest,
				rc->block_group, parent, slot,
				path->nodes[level], path->slots[level],
				last_snapshot);
		if (ret < 0)
			break;
		/*
		 * Swap blocks in the fs tree and reloc tree.
		 */
		btrfs_set_node_blockptr(parent, slot, new_bytenr);
		btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
		btrfs_mark_buffer_dirty(trans, parent);

		btrfs_set_node_blockptr(path->nodes[level],
					path->slots[level], old_bytenr);
		btrfs_set_node_ptr_generation(path->nodes[level],
					      path->slots[level], old_ptr_gen);
		btrfs_mark_buffer_dirty(trans, path->nodes[level]);

		ref.action = BTRFS_ADD_DELAYED_REF;
		ref.bytenr = old_bytenr;
		ref.num_bytes = blocksize;
		ref.parent = path->nodes[level]->start;
		ref.owning_root = btrfs_root_id(src);
		ref.ref_root = btrfs_root_id(src);
		btrfs_init_tree_ref(&ref, level - 1, 0, true);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		ref.action = BTRFS_ADD_DELAYED_REF;
		ref.bytenr = new_bytenr;
		ref.num_bytes = blocksize;
		ref.parent = 0;
		ref.owning_root = btrfs_root_id(dest);
		ref.ref_root = btrfs_root_id(dest);
		btrfs_init_tree_ref(&ref, level - 1, 0, true);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		/* We don't know the real owning_root, use 0. */
		ref.action = BTRFS_DROP_DELAYED_REF;
		ref.bytenr = new_bytenr;
		ref.num_bytes = blocksize;
		ref.parent = path->nodes[level]->start;
		ref.owning_root = 0;
		ref.ref_root = btrfs_root_id(src);
		btrfs_init_tree_ref(&ref, level - 1, 0, true);
		ret = btrfs_free_extent(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		/* We don't know the real owning_root, use 0. */
		ref.action = BTRFS_DROP_DELAYED_REF;
		ref.bytenr = old_bytenr;
		ref.num_bytes = blocksize;
		ref.parent = 0;
		ref.owning_root = 0;
		ref.ref_root = btrfs_root_id(dest);
		btrfs_init_tree_ref(&ref, level - 1, 0, true);
		ret = btrfs_free_extent(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		btrfs_unlock_up_safe(path, 0);

		ret = level;
		break;
	}
	btrfs_tree_unlock(parent);
	free_extent_buffer(parent);
	return ret;
}

/*
 * Helper to find the next relocated block in the reloc tree.
 */
static noinline_for_stack
int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
		       int *level)
{
	struct extent_buffer *eb;
	int i;
	u64 last_snapshot;
	u32 nritems;

	last_snapshot = btrfs_root_last_snapshot(&root->root_item);

	for (i = 0; i < *level; i++) {
		free_extent_buffer(path->nodes[i]);
		path->nodes[i] = NULL;
	}

	for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
		eb = path->nodes[i];
		nritems = btrfs_header_nritems(eb);
		while (path->slots[i] + 1 < nritems) {
			path->slots[i]++;
			if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
			    last_snapshot)
				continue;

			*level = i;
			return 0;
		}
		free_extent_buffer(path->nodes[i]);
		path->nodes[i] = NULL;
	}
	return 1;
}

/*
 * Walk down the reloc tree to find the relocated block of the lowest level.
 */
static noinline_for_stack
int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
			 int *level)
{
	struct extent_buffer *eb = NULL;
	int i;
	u64 ptr_gen = 0;
	u64 last_snapshot;
	u32 nritems;

	last_snapshot = btrfs_root_last_snapshot(&root->root_item);

	for (i = *level; i > 0; i--) {
		eb = path->nodes[i];
		nritems = btrfs_header_nritems(eb);
		while (path->slots[i] < nritems) {
			ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
			if (ptr_gen > last_snapshot)
				break;
			path->slots[i]++;
		}
		if (path->slots[i] >= nritems) {
			if (i == *level)
				break;
			*level = i + 1;
			return 0;
		}
		if (i == 1) {
			*level = i;
			return 0;
		}

		eb = btrfs_read_node_slot(eb, path->slots[i]);
		if (IS_ERR(eb))
			return PTR_ERR(eb);
		BUG_ON(btrfs_header_level(eb) != i - 1);
		path->nodes[i - 1] = eb;
		path->slots[i - 1] = 0;
	}
	return 1;
}

/*
 * Invalidate the extent cache for file extents whose keys are in the range
 * [min_key, max_key).
 */
static int invalidate_extent_cache(struct btrfs_root *root,
				   const struct btrfs_key *min_key,
				   const struct btrfs_key *max_key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_inode *inode = NULL;
	u64 objectid;
	u64 start, end;
	u64 ino;

	objectid = min_key->objectid;
	while (1) {
		struct extent_state *cached_state = NULL;

		cond_resched();
		if (inode)
			iput(&inode->vfs_inode);

		if (objectid > max_key->objectid)
			break;

		inode = btrfs_find_first_inode(root, objectid);
		if (!inode)
			break;
		ino = btrfs_ino(inode);

		if (ino > max_key->objectid) {
			iput(&inode->vfs_inode);
			break;
		}

		objectid = ino + 1;
		if (!S_ISREG(inode->vfs_inode.i_mode))
			continue;

		if (unlikely(min_key->objectid == ino)) {
			if (min_key->type > BTRFS_EXTENT_DATA_KEY)
				continue;
			if (min_key->type < BTRFS_EXTENT_DATA_KEY)
				start = 0;
			else {
				start = min_key->offset;
				WARN_ON(!IS_ALIGNED(start, fs_info->sectorsize));
			}
		} else {
			start = 0;
		}

		if (unlikely(max_key->objectid == ino)) {
			if (max_key->type < BTRFS_EXTENT_DATA_KEY)
				continue;
			if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
				end = (u64)-1;
			} else {
				if (max_key->offset == 0)
					continue;
				end = max_key->offset;
				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
				end--;
			}
		} else {
			end = (u64)-1;
		}

		/* the lock_extent waits for read_folio to complete */
		lock_extent(&inode->io_tree, start, end, &cached_state);
		btrfs_drop_extent_map_range(inode, start, end, true);
		unlock_extent(&inode->io_tree, start, end, &cached_state);
	}
	return 0;
}

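/*
 * Find the key following the current position of @path, searching upwards
 * from @level.  Returns 0 and fills @key on success, 1 if there is none.
 */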
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key)
{
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level])
			break;
		if (path->slots[level] + 1 <
		    btrfs_header_nritems(path->nodes[level])) {
			btrfs_node_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
			return 0;
		}
		level++;
	}
	return 1;
}

/*
 * Insert the current subvolume into reloc_control::dirty_subvol_roots.
 */
static int insert_dirty_subvol(struct btrfs_trans_handle *trans,
			       struct reloc_control *rc,
			       struct btrfs_root *root)
{
	struct btrfs_root *reloc_root = root->reloc_root;
	struct btrfs_root_item *reloc_root_item;
	int ret;

	/* @root must be a subvolume tree root with a valid reloc tree */
	ASSERT(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID);
	ASSERT(reloc_root);

	reloc_root_item = &reloc_root->root_item;
	memset(&reloc_root_item->drop_progress, 0,
		sizeof(reloc_root_item->drop_progress));
	btrfs_set_root_drop_level(reloc_root_item, 0);
	btrfs_set_root_refs(reloc_root_item, 0);
	ret = btrfs_update_reloc_root(trans, root);
	if (ret)
		return ret;

	if (list_empty(&root->reloc_dirty_list)) {
		btrfs_grab_root(root);
		list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
	}

	return 0;
}

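/*
 * Process the dirty subvolume list: drop the reloc root of every merged
 * subvolume, and drop orphan reloc trees directly.
 */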
clean_dirty_subvols(struct reloc_control * rc)1538  static int clean_dirty_subvols(struct reloc_control *rc)
1539  {
1540  	struct btrfs_root *root;
1541  	struct btrfs_root *next;
1542  	int ret = 0;
1543  	int ret2;
1544  
1545  	list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots,
1546  				 reloc_dirty_list) {
1547  		if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID) {
1548  			/* Merged subvolume, cleanup its reloc root */
1549  			struct btrfs_root *reloc_root = root->reloc_root;
1550  
1551  			list_del_init(&root->reloc_dirty_list);
1552  			root->reloc_root = NULL;
1553  			/*
1554  			 * Need barrier to ensure clear_bit() only happens after
1555  			 * root->reloc_root = NULL. Pairs with have_reloc_root.
1556  			 */
1557  			smp_wmb();
1558  			clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
1559  			if (reloc_root) {
1560  				/*
1561  				 * btrfs_drop_snapshot drops our ref we hold for
1562  				 * ->reloc_root.  If it fails however we must
1563  				 * drop the ref ourselves.
1564  				 */
1565  				ret2 = btrfs_drop_snapshot(reloc_root, 0, 1);
1566  				if (ret2 < 0) {
1567  					btrfs_put_root(reloc_root);
1568  					if (!ret)
1569  						ret = ret2;
1570  				}
1571  			}
1572  			btrfs_put_root(root);
1573  		} else {
1574  			/* Orphan reloc tree, just clean it up */
1575  			ret2 = btrfs_drop_snapshot(root, 0, 1);
1576  			if (ret2 < 0) {
1577  				btrfs_put_root(root);
1578  				if (!ret)
1579  					ret = ret2;
1580  			}
1581  		}
1582  	}
1583  	return ret;
1584  }
1585  
1586  /*
1587   * merge the relocated tree blocks in reloc tree with corresponding
1588   * fs tree.
1589   */
merge_reloc_root(struct reloc_control * rc,struct btrfs_root * root)1590  static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1591  					       struct btrfs_root *root)
1592  {
1593  	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
1594  	struct btrfs_key key;
1595  	struct btrfs_key next_key;
1596  	struct btrfs_trans_handle *trans = NULL;
1597  	struct btrfs_root *reloc_root;
1598  	struct btrfs_root_item *root_item;
1599  	struct btrfs_path *path;
1600  	struct extent_buffer *leaf;
1601  	int reserve_level;
1602  	int level;
1603  	int max_level;
1604  	int replaced = 0;
1605  	int ret = 0;
1606  	u32 min_reserved;
1607  
1608  	path = btrfs_alloc_path();
1609  	if (!path)
1610  		return -ENOMEM;
1611  	path->reada = READA_FORWARD;
1612  
1613  	reloc_root = root->reloc_root;
1614  	root_item = &reloc_root->root_item;
1615  
1616  	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
1617  		level = btrfs_root_level(root_item);
1618  		atomic_inc(&reloc_root->node->refs);
1619  		path->nodes[level] = reloc_root->node;
1620  		path->slots[level] = 0;
1621  	} else {
1622  		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
1623  
1624  		level = btrfs_root_drop_level(root_item);
1625  		BUG_ON(level == 0);
1626  		path->lowest_level = level;
1627  		ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
1628  		path->lowest_level = 0;
1629  		if (ret < 0) {
1630  			btrfs_free_path(path);
1631  			return ret;
1632  		}
1633  
1634  		btrfs_node_key_to_cpu(path->nodes[level], &next_key,
1635  				      path->slots[level]);
1636  		WARN_ON(memcmp(&key, &next_key, sizeof(key)));
1637  
1638  		btrfs_unlock_up_safe(path, 0);
1639  	}
1640  
1641  	/*
1642  	 * In merge_reloc_root(), we modify the upper level pointer to swap the
1643  	 * tree blocks between reloc tree and subvolume tree.  Thus for tree
1644  	 * block COW, we COW at most from level 1 to root level for each tree.
1645  	 *
1646  	 * Thus the needed metadata size is at most root_level * nodesize,
1647  	 * and * 2 since we have two trees to COW.
1648  	 */
1649  	reserve_level = max_t(int, 1, btrfs_root_level(root_item));
1650  	min_reserved = fs_info->nodesize * reserve_level * 2;
1651  	memset(&next_key, 0, sizeof(next_key));
1652  
1653  	while (1) {
1654  		ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv,
1655  					     min_reserved,
1656  					     BTRFS_RESERVE_FLUSH_LIMIT);
1657  		if (ret)
1658  			goto out;
1659  		trans = btrfs_start_transaction(root, 0);
1660  		if (IS_ERR(trans)) {
1661  			ret = PTR_ERR(trans);
1662  			trans = NULL;
1663  			goto out;
1664  		}
1665  
1666  		/*
1667  		 * At this point we no longer have a reloc_control, so we can't
1668  		 * depend on btrfs_init_reloc_root to update our last_trans.
1669  		 *
1670  		 * But that's ok, we started the trans handle on our
1671  		 * corresponding fs_root, which means it's been added to the
1672  		 * dirty list.  At commit time we'll still call
1673  		 * btrfs_update_reloc_root() and update our root item
1674  		 * appropriately.
1675  		 */
1676  		btrfs_set_root_last_trans(reloc_root, trans->transid);
1677  		trans->block_rsv = rc->block_rsv;
1678  
1679  		replaced = 0;
1680  		max_level = level;
1681  
1682  		ret = walk_down_reloc_tree(reloc_root, path, &level);
1683  		if (ret < 0)
1684  			goto out;
1685  		if (ret > 0)
1686  			break;
1687  
1688  		if (!find_next_key(path, level, &key) &&
1689  		    btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
1690  			ret = 0;
1691  		} else {
1692  			ret = replace_path(trans, rc, root, reloc_root, path,
1693  					   &next_key, level, max_level);
1694  		}
1695  		if (ret < 0)
1696  			goto out;
1697  		if (ret > 0) {
1698  			level = ret;
1699  			btrfs_node_key_to_cpu(path->nodes[level], &key,
1700  					      path->slots[level]);
1701  			replaced = 1;
1702  		}
1703  
1704  		ret = walk_up_reloc_tree(reloc_root, path, &level);
1705  		if (ret > 0)
1706  			break;
1707  
1708  		BUG_ON(level == 0);
1709  		/*
1710  		 * save the merging progress in the drop_progress.
1711  		 * this is OK since root refs == 1 in this case.
1712  		 */
1713  		btrfs_node_key(path->nodes[level], &root_item->drop_progress,
1714  			       path->slots[level]);
1715  		btrfs_set_root_drop_level(root_item, level);
1716  
1717  		btrfs_end_transaction_throttle(trans);
1718  		trans = NULL;
1719  
1720  		btrfs_btree_balance_dirty(fs_info);
1721  
1722  		if (replaced && rc->stage == UPDATE_DATA_PTRS)
1723  			invalidate_extent_cache(root, &key, &next_key);
1724  	}
1725  
1726  	/*
1727  	 * Handle the case where only one block in the fs tree needs to be
1728  	 * relocated and that block is the tree root.
1729  	 */
1730  	leaf = btrfs_lock_root_node(root);
1731  	ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf,
1732  			      BTRFS_NESTING_COW);
1733  	btrfs_tree_unlock(leaf);
1734  	free_extent_buffer(leaf);
1735  out:
1736  	btrfs_free_path(path);
1737  
1738  	if (ret == 0) {
1739  		ret = insert_dirty_subvol(trans, rc, root);
1740  		if (ret)
1741  			btrfs_abort_transaction(trans, ret);
1742  	}
1743  
1744  	if (trans)
1745  		btrfs_end_transaction_throttle(trans);
1746  
1747  	btrfs_btree_balance_dirty(fs_info);
1748  
1749  	if (replaced && rc->stage == UPDATE_DATA_PTRS)
1750  		invalidate_extent_cache(root, &key, &next_key);
1751  
1752  	return ret;
1753  }
1754  
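/*
 * Prepare for merging reloc trees back into their subvolume trees.
 *
 * Reserve metadata space for the merge, set root refs to 1 in every reloc
 * root item so that btrfs_recover_relocation() knows merging must be
 * resumed after a crash, and write the updated reloc root items out with
 * a transaction commit.
 */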
1755  static noinline_for_stack
1756  int prepare_to_merge(struct reloc_control *rc, int err)
1757  {
1758  	struct btrfs_root *root = rc->extent_root;
1759  	struct btrfs_fs_info *fs_info = root->fs_info;
1760  	struct btrfs_root *reloc_root;
1761  	struct btrfs_trans_handle *trans;
1762  	LIST_HEAD(reloc_roots);
1763  	u64 num_bytes = 0;
1764  	int ret;
1765  
1766  	mutex_lock(&fs_info->reloc_mutex);
1767  	rc->merging_rsv_size += fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
1768  	rc->merging_rsv_size += rc->nodes_relocated * 2;
1769  	mutex_unlock(&fs_info->reloc_mutex);
1770  
1771  again:
1772  	if (!err) {
1773  		num_bytes = rc->merging_rsv_size;
1774  		ret = btrfs_block_rsv_add(fs_info, rc->block_rsv, num_bytes,
1775  					  BTRFS_RESERVE_FLUSH_ALL);
1776  		if (ret)
1777  			err = ret;
1778  	}
1779  
1780  	trans = btrfs_join_transaction(rc->extent_root);
1781  	if (IS_ERR(trans)) {
1782  		if (!err)
1783  			btrfs_block_rsv_release(fs_info, rc->block_rsv,
1784  						num_bytes, NULL);
1785  		return PTR_ERR(trans);
1786  	}
1787  
1788  	if (!err) {
1789  		if (num_bytes != rc->merging_rsv_size) {
1790  			btrfs_end_transaction(trans);
1791  			btrfs_block_rsv_release(fs_info, rc->block_rsv,
1792  						num_bytes, NULL);
1793  			goto again;
1794  		}
1795  	}
1796  
1797  	rc->merge_reloc_tree = true;
1798  
1799  	while (!list_empty(&rc->reloc_roots)) {
1800  		reloc_root = list_entry(rc->reloc_roots.next,
1801  					struct btrfs_root, root_list);
1802  		list_del_init(&reloc_root->root_list);
1803  
1804  		root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
1805  				false);
1806  		if (IS_ERR(root)) {
1807  			/*
1808  			 * Even if we have an error we need this reloc root
1809  			 * back on our list so we can clean up properly.
1810  			 */
1811  			list_add(&reloc_root->root_list, &reloc_roots);
1812  			btrfs_abort_transaction(trans, (int)PTR_ERR(root));
1813  			if (!err)
1814  				err = PTR_ERR(root);
1815  			break;
1816  		}
1817  
1818  		if (unlikely(root->reloc_root != reloc_root)) {
1819  			if (root->reloc_root) {
1820  				btrfs_err(fs_info,
1821  "reloc tree mismatch, root %lld has reloc root key (%lld %u %llu) gen %llu, expect reloc root key (%lld %u %llu) gen %llu",
1822  					  btrfs_root_id(root),
1823  					  btrfs_root_id(root->reloc_root),
1824  					  root->reloc_root->root_key.type,
1825  					  root->reloc_root->root_key.offset,
1826  					  btrfs_root_generation(
1827  						  &root->reloc_root->root_item),
1828  					  btrfs_root_id(reloc_root),
1829  					  reloc_root->root_key.type,
1830  					  reloc_root->root_key.offset,
1831  					  btrfs_root_generation(
1832  						  &reloc_root->root_item));
1833  			} else {
1834  				btrfs_err(fs_info,
1835  "reloc tree mismatch, root %lld has no reloc root, expect reloc root key (%lld %u %llu) gen %llu",
1836  					  btrfs_root_id(root),
1837  					  btrfs_root_id(reloc_root),
1838  					  reloc_root->root_key.type,
1839  					  reloc_root->root_key.offset,
1840  					  btrfs_root_generation(
1841  						  &reloc_root->root_item));
1842  			}
1843  			list_add(&reloc_root->root_list, &reloc_roots);
1844  			btrfs_put_root(root);
1845  			btrfs_abort_transaction(trans, -EUCLEAN);
1846  			if (!err)
1847  				err = -EUCLEAN;
1848  			break;
1849  		}
1850  
1851  		/*
1852  		 * Set the reference count to 1, so btrfs_recover_relocation()
1853  		 * knows it should resume merging.
1854  		 */
1855  		if (!err)
1856  			btrfs_set_root_refs(&reloc_root->root_item, 1);
1857  		ret = btrfs_update_reloc_root(trans, root);
1858  
1859  		/*
1860  		 * Even if we have an error we need this reloc root back on our
1861  		 * list so we can clean up properly.
1862  		 */
1863  		list_add(&reloc_root->root_list, &reloc_roots);
1864  		btrfs_put_root(root);
1865  
1866  		if (ret) {
1867  			btrfs_abort_transaction(trans, ret);
1868  			if (!err)
1869  				err = ret;
1870  			break;
1871  		}
1872  	}
1873  
1874  	list_splice(&reloc_roots, &rc->reloc_roots);
1875  
1876  	if (!err)
1877  		err = btrfs_commit_transaction(trans);
1878  	else
1879  		btrfs_end_transaction(trans);
1880  	return err;
1881  }
1882  
1883  static noinline_for_stack
1884  void free_reloc_roots(struct list_head *list)
1885  {
1886  	struct btrfs_root *reloc_root, *tmp;
1887  
1888  	list_for_each_entry_safe(reloc_root, tmp, list, root_list)
1889  		__del_reloc_root(reloc_root);
1890  }
1891  
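/*
 * Merge all reloc trees into their subvolume trees.
 *
 * Reloc roots with positive root refs are merged through merge_reloc_root();
 * the rest (e.g. garbage roots left over from a failed recovery) are detached
 * from their subvolume roots and queued on ->dirty_subvol_roots for cleanup.
 */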
1892  static noinline_for_stack
1893  void merge_reloc_roots(struct reloc_control *rc)
1894  {
1895  	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
1896  	struct btrfs_root *root;
1897  	struct btrfs_root *reloc_root;
1898  	LIST_HEAD(reloc_roots);
1899  	int found = 0;
1900  	int ret = 0;
1901  again:
1902  	root = rc->extent_root;
1903  
1904  	/*
1905  	 * This serializes us with btrfs_record_root_in_transaction();
1906  	 * we have to make sure nobody is in the middle of
1907  	 * adding their roots to the list while we are
1908  	 * doing this splice.
1909  	 */
1910  	mutex_lock(&fs_info->reloc_mutex);
1911  	list_splice_init(&rc->reloc_roots, &reloc_roots);
1912  	mutex_unlock(&fs_info->reloc_mutex);
1913  
1914  	while (!list_empty(&reloc_roots)) {
1915  		found = 1;
1916  		reloc_root = list_entry(reloc_roots.next,
1917  					struct btrfs_root, root_list);
1918  
1919  		root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
1920  					 false);
1921  		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
1922  			if (WARN_ON(IS_ERR(root))) {
1923  				/*
1924  				 * For recovery we read the fs roots on mount,
1925  				 * and if we didn't find the root then we marked
1926  				 * the reloc root as a garbage root.  For normal
1927  				 * relocation obviously the root should exist in
1928  				 * memory.  However there's no reason we can't
1929  				 * handle the error properly here just in case.
1930  				 */
1931  				ret = PTR_ERR(root);
1932  				goto out;
1933  			}
1934  			if (WARN_ON(root->reloc_root != reloc_root)) {
1935  				/*
1936  				 * This can happen if on-disk metadata has some
1937  				 * corruption, e.g. bad reloc tree key offset.
1938  				 */
1939  				ret = -EINVAL;
1940  				goto out;
1941  			}
1942  			ret = merge_reloc_root(rc, root);
1943  			btrfs_put_root(root);
1944  			if (ret) {
1945  				if (list_empty(&reloc_root->root_list))
1946  					list_add_tail(&reloc_root->root_list,
1947  						      &reloc_roots);
1948  				goto out;
1949  			}
1950  		} else {
1951  			if (!IS_ERR(root)) {
1952  				if (root->reloc_root == reloc_root) {
1953  					root->reloc_root = NULL;
1954  					btrfs_put_root(reloc_root);
1955  				}
1956  				clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE,
1957  					  &root->state);
1958  				btrfs_put_root(root);
1959  			}
1960  
1961  			list_del_init(&reloc_root->root_list);
1962  			/* Don't forget to queue this reloc root for cleanup */
1963  			list_add_tail(&reloc_root->reloc_dirty_list,
1964  				      &rc->dirty_subvol_roots);
1965  		}
1966  	}
1967  
1968  	if (found) {
1969  		found = 0;
1970  		goto again;
1971  	}
1972  out:
1973  	if (ret) {
1974  		btrfs_handle_fs_error(fs_info, ret, NULL);
1975  		free_reloc_roots(&reloc_roots);
1976  
1977  		/* new reloc root may be added */
1978  		mutex_lock(&fs_info->reloc_mutex);
1979  		list_splice_init(&rc->reloc_roots, &reloc_roots);
1980  		mutex_unlock(&fs_info->reloc_mutex);
1981  		free_reloc_roots(&reloc_roots);
1982  	}
1983  
1984  	/*
1985  	 * We used to have
1986  	 *
1987  	 * BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
1988  	 *
1989  	 * here, but it's wrong.  If we fail to start the transaction in
1990  	 * prepare_to_merge() we will have only 0 ref reloc roots, none of which
1991  	 * have actually been removed from the reloc_root_tree rb tree.  This is
1992  	 * fine because we're bailing here, and we hold a reference on the root
1993  	 * for the list that holds it, so these roots will be cleaned up when we
1994  	 * do the reloc_dirty_list afterwards.  Meanwhile the root->reloc_root
1995  	 * will be cleaned up on unmount.
1996  	 *
1997  	 * The remaining nodes will be cleaned up by free_reloc_control.
1998  	 */
1999  }
2000  
2001  static void free_block_list(struct rb_root *blocks)
2002  {
2003  	struct tree_block *block;
2004  	struct rb_node *rb_node;
2005  	while ((rb_node = rb_first(blocks))) {
2006  		block = rb_entry(rb_node, struct tree_block, rb_node);
2007  		rb_erase(rb_node, blocks);
2008  		kfree(block);
2009  	}
2010  }
2011  
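/*
 * Record the subvolume root owning @reloc_root in the current transaction,
 * validating along the way that the reloc root actually belongs to it.
 */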
2012  static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
2013  				      struct btrfs_root *reloc_root)
2014  {
2015  	struct btrfs_fs_info *fs_info = reloc_root->fs_info;
2016  	struct btrfs_root *root;
2017  	int ret;
2018  
2019  	if (btrfs_get_root_last_trans(reloc_root) == trans->transid)
2020  		return 0;
2021  
2022  	root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false);
2023  
2024  	/*
2025  	 * This should succeed, since we can't have a reloc root without having
2026  	 * already looked up the actual root and created the reloc root for this
2027  	 * root.
2028  	 *
2029  	 * However if there's some sort of corruption where we have a ref to a
2030  	 * reloc root without a corresponding root this could return ENOENT.
2031  	 */
2032  	if (IS_ERR(root)) {
2033  		ASSERT(0);
2034  		return PTR_ERR(root);
2035  	}
2036  	if (root->reloc_root != reloc_root) {
2037  		ASSERT(0);
2038  		btrfs_err(fs_info,
2039  			  "root %llu has two reloc roots associated with it",
2040  			  reloc_root->root_key.offset);
2041  		btrfs_put_root(root);
2042  		return -EUCLEAN;
2043  	}
2044  	ret = btrfs_record_root_in_trans(trans, root);
2045  	btrfs_put_root(root);
2046  
2047  	return ret;
2048  }
2049  
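/*
 * Walk up the backref tree from @node to find the tree that the block
 * belongs to, record that tree in the transaction and return its reloc
 * root.  The backref path walked is also recorded in
 * rc->backref_cache.path so that btrfs_reloc_cow_block() can find the
 * backref node for each level.
 */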
2050  static noinline_for_stack
2051  struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
2052  				     struct reloc_control *rc,
2053  				     struct btrfs_backref_node *node,
2054  				     struct btrfs_backref_edge *edges[])
2055  {
2056  	struct btrfs_backref_node *next;
2057  	struct btrfs_root *root;
2058  	int index = 0;
2059  	int ret;
2060  
2061  	next = node;
2062  	while (1) {
2063  		cond_resched();
2064  		next = walk_up_backref(next, edges, &index);
2065  		root = next->root;
2066  
2067  		/*
2068  		 * If there is no root, then our references for this block are
2069  		 * incomplete, as we should be able to walk all the way up to a
2070  		 * block that is owned by a root.
2071  		 *
2072  		 * This path is only for SHAREABLE roots, so if we come upon a
2073  		 * non-SHAREABLE root then we have backrefs that resolve
2074  		 * improperly.
2075  		 *
2076  		 * Both of these cases indicate file system corruption, or a bug
2077  		 * in the backref walking code.
2078  		 */
2079  		if (!root) {
2080  			ASSERT(0);
2081  			btrfs_err(trans->fs_info,
2082  		"bytenr %llu doesn't have a backref path ending in a root",
2083  				  node->bytenr);
2084  			return ERR_PTR(-EUCLEAN);
2085  		}
2086  		if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
2087  			ASSERT(0);
2088  			btrfs_err(trans->fs_info,
2089  	"bytenr %llu has multiple refs with one ending in a non-shareable root",
2090  				  node->bytenr);
2091  			return ERR_PTR(-EUCLEAN);
2092  		}
2093  
2094  		if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) {
2095  			ret = record_reloc_root_in_trans(trans, root);
2096  			if (ret)
2097  				return ERR_PTR(ret);
2098  			break;
2099  		}
2100  
2101  		ret = btrfs_record_root_in_trans(trans, root);
2102  		if (ret)
2103  			return ERR_PTR(ret);
2104  		root = root->reloc_root;
2105  
2106  		/*
2107  		 * We could have raced with another thread which failed, so
2108  		 * root->reloc_root may not be set, return ENOENT in this case.
2109  		 */
2110  		if (!root)
2111  			return ERR_PTR(-ENOENT);
2112  
2113  		if (next->new_bytenr != root->node->start) {
2114  			/*
2115  			 * We just created the reloc root, so we shouldn't have
2116  			 * ->new_bytenr set and this shouldn't be in the changed
2117  			 * list.  If it is then we have multiple roots pointing
2118  			 * at the same bytenr which indicates corruption, or
2119  			 * we've made a mistake in the backref walking code.
2120  			 */
2121  			ASSERT(next->new_bytenr == 0);
2122  			ASSERT(list_empty(&next->list));
2123  			if (next->new_bytenr || !list_empty(&next->list)) {
2124  				btrfs_err(trans->fs_info,
2125  	"bytenr %llu possibly has multiple roots pointing at the same bytenr %llu",
2126  					  node->bytenr, next->bytenr);
2127  				return ERR_PTR(-EUCLEAN);
2128  			}
2129  
2130  			next->new_bytenr = root->node->start;
2131  			btrfs_put_root(next->root);
2132  			next->root = btrfs_grab_root(root);
2133  			ASSERT(next->root);
2134  			list_add_tail(&next->list,
2135  				      &rc->backref_cache.changed);
2136  			mark_block_processed(rc, next);
2137  			break;
2138  		}
2139  
2140  		WARN_ON(1);
2141  		root = NULL;
2142  		next = walk_down_backref(edges, &index);
2143  		if (!next || next->level <= node->level)
2144  			break;
2145  	}
2146  	if (!root) {
2147  		/*
2148  		 * This can happen if there's fs corruption or if there's a bug
2149  		 * in the backref lookup code.
2150  		 */
2151  		ASSERT(0);
2152  		return ERR_PTR(-ENOENT);
2153  	}
2154  
2155  	next = node;
2156  	/* setup backref node path for btrfs_reloc_cow_block */
2157  	while (1) {
2158  		rc->backref_cache.path[next->level] = next;
2159  		if (--index < 0)
2160  			break;
2161  		next = edges[index]->node[UPPER];
2162  	}
2163  	return root;
2164  }
2165  
2166  /*
2167   * Select a tree root for relocation.
2168   *
2169   * Return NULL if the block is referenced by upper level blocks of a
2170   * shareable tree; do_relocation() must be used in this case.
2171   * Return the owning root if the block belongs to a non-shareable tree.
2172   * Return the fs root if the block is the root block of a shareable tree.
2173   * Return -ENOENT if the block is only the root of a reloc tree.
2174   */
2175  static noinline_for_stack
2176  struct btrfs_root *select_one_root(struct btrfs_backref_node *node)
2177  {
2178  	struct btrfs_backref_node *next;
2179  	struct btrfs_root *root;
2180  	struct btrfs_root *fs_root = NULL;
2181  	struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2182  	int index = 0;
2183  
2184  	next = node;
2185  	while (1) {
2186  		cond_resched();
2187  		next = walk_up_backref(next, edges, &index);
2188  		root = next->root;
2189  
2190  		/*
2191  		 * This can occur if we have incomplete extent refs leading all
2192  		 * the way up a particular path, in this case return -EUCLEAN.
2193  		 */
2194  		if (!root)
2195  			return ERR_PTR(-EUCLEAN);
2196  
2197  		/* No other choice for non-shareable tree */
2198  		if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
2199  			return root;
2200  
2201  		if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID)
2202  			fs_root = root;
2203  
2204  		if (next != node)
2205  			return NULL;
2206  
2207  		next = walk_down_backref(edges, &index);
2208  		if (!next || next->level <= node->level)
2209  			break;
2210  	}
2211  
2212  	if (!fs_root)
2213  		return ERR_PTR(-ENOENT);
2214  	return fs_root;
2215  }
2216  
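/*
 * Estimate the metadata space needed to relocate @node: one nodesize for
 * every not-yet-processed node reachable by walking up the backref edges
 * from @node, as each of those blocks may have to be COWed.
 */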
2217  static noinline_for_stack u64 calcu_metadata_size(struct reloc_control *rc,
2218  						  struct btrfs_backref_node *node)
2219  {
2220  	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2221  	struct btrfs_backref_node *next = node;
2222  	struct btrfs_backref_edge *edge;
2223  	struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2224  	u64 num_bytes = 0;
2225  	int index = 0;
2226  
2227  	BUG_ON(node->processed);
2228  
2229  	while (next) {
2230  		cond_resched();
2231  		while (1) {
2232  			if (next->processed)
2233  				break;
2234  
2235  			num_bytes += fs_info->nodesize;
2236  
2237  			if (list_empty(&next->upper))
2238  				break;
2239  
2240  			edge = list_entry(next->upper.next,
2241  					struct btrfs_backref_edge, list[LOWER]);
2242  			edges[index++] = edge;
2243  			next = edge->node[UPPER];
2244  		}
2245  		next = walk_down_backref(edges, &index);
2246  	}
2247  	return num_bytes;
2248  }
2249  
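/*
 * Reserve metadata space for relocating @node.  The estimate is doubled
 * since both the reloc tree and the subvolume tree may be COWed.  If the
 * limited flushing allowed inside a transaction is not enough, enlarge
 * the block rsv target size and return -EAGAIN so the caller can restart
 * with a fresh transaction.
 */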
2250  static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2251  				  struct reloc_control *rc,
2252  				  struct btrfs_backref_node *node)
2253  {
2254  	struct btrfs_root *root = rc->extent_root;
2255  	struct btrfs_fs_info *fs_info = root->fs_info;
2256  	u64 num_bytes;
2257  	int ret;
2258  	u64 tmp;
2259  
2260  	num_bytes = calcu_metadata_size(rc, node) * 2;
2261  
2262  	trans->block_rsv = rc->block_rsv;
2263  	rc->reserved_bytes += num_bytes;
2264  
2265  	/*
2266  	 * We are under a transaction here so we can only do limited flushing.
2267  	 * If we get an enospc just kick back -EAGAIN so we know to drop the
2268  	 * transaction and try to refill when we can flush all the things.
2269  	 */
2270  	ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, num_bytes,
2271  				     BTRFS_RESERVE_FLUSH_LIMIT);
2272  	if (ret) {
2273  		tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES;
2274  		while (tmp <= rc->reserved_bytes)
2275  			tmp <<= 1;
2276  		/*
2277  		 * Only one thread can access block_rsv at this point, so we
2278  		 * don't need a lock to protect it.  We expand the
2279  		 * reservation size here to allow enough space for
2280  		 * relocation, and we will return early in the
2281  		 * ENOSPC case.
2282  		 */
2283  		rc->block_rsv->size = tmp + fs_info->nodesize *
2284  				      RELOCATION_RESERVED_NODES;
2285  		return -EAGAIN;
2286  	}
2287  
2288  	return 0;
2289  }
2290  
2291  /*
2292   * Relocate a tree block, and then update pointers in upper level
2293   * blocks that reference the block to point to the new location.
2294   *
2295   * If called by link_to_upper, the block has already been relocated.
2296   * In that case this function just updates pointers.
2297   */
2298  static int do_relocation(struct btrfs_trans_handle *trans,
2299  			 struct reloc_control *rc,
2300  			 struct btrfs_backref_node *node,
2301  			 struct btrfs_key *key,
2302  			 struct btrfs_path *path, int lowest)
2303  {
2304  	struct btrfs_backref_node *upper;
2305  	struct btrfs_backref_edge *edge;
2306  	struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2307  	struct btrfs_root *root;
2308  	struct extent_buffer *eb;
2309  	u32 blocksize;
2310  	u64 bytenr;
2311  	int slot;
2312  	int ret = 0;
2313  
2314  	/*
2315  	 * If we are lowest then this is the first time we're processing this
2316  	 * block, and thus shouldn't have an eb associated with it yet.
2317  	 */
2318  	ASSERT(!lowest || !node->eb);
2319  
2320  	path->lowest_level = node->level + 1;
2321  	rc->backref_cache.path[node->level] = node;
2322  	list_for_each_entry(edge, &node->upper, list[LOWER]) {
2323  		cond_resched();
2324  
2325  		upper = edge->node[UPPER];
2326  		root = select_reloc_root(trans, rc, upper, edges);
2327  		if (IS_ERR(root)) {
2328  			ret = PTR_ERR(root);
2329  			goto next;
2330  		}
2331  
2332  		if (upper->eb && !upper->locked) {
2333  			if (!lowest) {
2334  				ret = btrfs_bin_search(upper->eb, 0, key, &slot);
2335  				if (ret < 0)
2336  					goto next;
2337  				BUG_ON(ret);
2338  				bytenr = btrfs_node_blockptr(upper->eb, slot);
2339  				if (node->eb->start == bytenr)
2340  					goto next;
2341  			}
2342  			btrfs_backref_drop_node_buffer(upper);
2343  		}
2344  
2345  		if (!upper->eb) {
2346  			ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2347  			if (ret) {
2348  				if (ret > 0)
2349  					ret = -ENOENT;
2350  
2351  				btrfs_release_path(path);
2352  				break;
2353  			}
2354  
2355  			if (!upper->eb) {
2356  				upper->eb = path->nodes[upper->level];
2357  				path->nodes[upper->level] = NULL;
2358  			} else {
2359  				BUG_ON(upper->eb != path->nodes[upper->level]);
2360  			}
2361  
2362  			upper->locked = 1;
2363  			path->locks[upper->level] = 0;
2364  
2365  			slot = path->slots[upper->level];
2366  			btrfs_release_path(path);
2367  		} else {
2368  			ret = btrfs_bin_search(upper->eb, 0, key, &slot);
2369  			if (ret < 0)
2370  				goto next;
2371  			BUG_ON(ret);
2372  		}
2373  
2374  		bytenr = btrfs_node_blockptr(upper->eb, slot);
2375  		if (lowest) {
2376  			if (bytenr != node->bytenr) {
2377  				btrfs_err(root->fs_info,
2378  		"lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu",
2379  					  bytenr, node->bytenr, slot,
2380  					  upper->eb->start);
2381  				ret = -EIO;
2382  				goto next;
2383  			}
2384  		} else {
2385  			if (node->eb->start == bytenr)
2386  				goto next;
2387  		}
2388  
2389  		blocksize = root->fs_info->nodesize;
2390  		eb = btrfs_read_node_slot(upper->eb, slot);
2391  		if (IS_ERR(eb)) {
2392  			ret = PTR_ERR(eb);
2393  			goto next;
2394  		}
2395  		btrfs_tree_lock(eb);
2396  
2397  		if (!node->eb) {
2398  			ret = btrfs_cow_block(trans, root, eb, upper->eb,
2399  					      slot, &eb, BTRFS_NESTING_COW);
2400  			btrfs_tree_unlock(eb);
2401  			free_extent_buffer(eb);
2402  			if (ret < 0)
2403  				goto next;
2404  			/*
2405  			 * We've just COWed this block; it should have updated
2406  			 * the correct backref node entry.
2407  			 */
2408  			ASSERT(node->eb == eb);
2409  		} else {
2410  			struct btrfs_ref ref = {
2411  				.action = BTRFS_ADD_DELAYED_REF,
2412  				.bytenr = node->eb->start,
2413  				.num_bytes = blocksize,
2414  				.parent = upper->eb->start,
2415  				.owning_root = btrfs_header_owner(upper->eb),
2416  				.ref_root = btrfs_header_owner(upper->eb),
2417  			};
2418  
2419  			btrfs_set_node_blockptr(upper->eb, slot,
2420  						node->eb->start);
2421  			btrfs_set_node_ptr_generation(upper->eb, slot,
2422  						      trans->transid);
2423  			btrfs_mark_buffer_dirty(trans, upper->eb);
2424  
2425  			btrfs_init_tree_ref(&ref, node->level,
2426  					    btrfs_root_id(root), false);
2427  			ret = btrfs_inc_extent_ref(trans, &ref);
2428  			if (!ret)
2429  				ret = btrfs_drop_subtree(trans, root, eb,
2430  							 upper->eb);
2431  			if (ret)
2432  				btrfs_abort_transaction(trans, ret);
2433  		}
2434  next:
2435  		if (!upper->pending)
2436  			btrfs_backref_drop_node_buffer(upper);
2437  		else
2438  			btrfs_backref_unlock_node_buffer(upper);
2439  		if (ret)
2440  			break;
2441  	}
2442  
2443  	if (!ret && node->pending) {
2444  		btrfs_backref_drop_node_buffer(node);
2445  		list_move_tail(&node->list, &rc->backref_cache.changed);
2446  		node->pending = 0;
2447  	}
2448  
2449  	path->lowest_level = 0;
2450  
2451  	/*
2452  	 * We should have allocated all of our space in the block rsv and thus
2453  	 * shouldn't ENOSPC.
2454  	 */
2455  	ASSERT(ret != -ENOSPC);
2456  	return ret;
2457  }
2458  
2459  static int link_to_upper(struct btrfs_trans_handle *trans,
2460  			 struct reloc_control *rc,
2461  			 struct btrfs_backref_node *node,
2462  			 struct btrfs_path *path)
2463  {
2464  	struct btrfs_key key;
2465  
2466  	btrfs_node_key_to_cpu(node->eb, &key, 0);
2467  	return do_relocation(trans, rc, node, &key, path, 0);
2468  }
2469  
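/*
 * Walk the pending lists level by level and update the upper level
 * pointers of every pending node via link_to_upper().  The walk continues
 * past errors so that all nodes end up back on their pending lists, and
 * the first error encountered is returned.
 */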
2470  static int finish_pending_nodes(struct btrfs_trans_handle *trans,
2471  				struct reloc_control *rc,
2472  				struct btrfs_path *path, int err)
2473  {
2474  	LIST_HEAD(list);
2475  	struct btrfs_backref_cache *cache = &rc->backref_cache;
2476  	struct btrfs_backref_node *node;
2477  	int level;
2478  	int ret;
2479  
2480  	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2481  		while (!list_empty(&cache->pending[level])) {
2482  			node = list_entry(cache->pending[level].next,
2483  					  struct btrfs_backref_node, list);
2484  			list_move_tail(&node->list, &list);
2485  			BUG_ON(!node->pending);
2486  
2487  			if (!err) {
2488  				ret = link_to_upper(trans, rc, node, path);
2489  				if (ret < 0)
2490  					err = ret;
2491  			}
2492  		}
2493  		list_splice_init(&list, &cache->pending[level]);
2494  	}
2495  	return err;
2496  }
2497  
2498  /*
2499   * Mark a block and all blocks that directly/indirectly reference it
2500   * as processed.
2501   */
2502  static void update_processed_blocks(struct reloc_control *rc,
2503  				    struct btrfs_backref_node *node)
2504  {
2505  	struct btrfs_backref_node *next = node;
2506  	struct btrfs_backref_edge *edge;
2507  	struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2508  	int index = 0;
2509  
2510  	while (next) {
2511  		cond_resched();
2512  		while (1) {
2513  			if (next->processed)
2514  				break;
2515  
2516  			mark_block_processed(rc, next);
2517  
2518  			if (list_empty(&next->upper))
2519  				break;
2520  
2521  			edge = list_entry(next->upper.next,
2522  					struct btrfs_backref_edge, list[LOWER]);
2523  			edges[index++] = edge;
2524  			next = edge->node[UPPER];
2525  		}
2526  		next = walk_down_backref(edges, &index);
2527  	}
2528  }
2529  
2530  static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
2531  {
2532  	u32 blocksize = rc->extent_root->fs_info->nodesize;
2533  
2534  	if (test_range_bit(&rc->processed_blocks, bytenr,
2535  			   bytenr + blocksize - 1, EXTENT_DIRTY, NULL))
2536  		return 1;
2537  	return 0;
2538  }
2539  
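/*
 * Read the tree block described by @block and cache its first key in
 * @block->key, for building the backref tree later.  The level, owner and
 * transid recorded when the block was added are used to verify what is
 * read from disk.
 */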
2540  static int get_tree_block_key(struct btrfs_fs_info *fs_info,
2541  			      struct tree_block *block)
2542  {
2543  	struct btrfs_tree_parent_check check = {
2544  		.level = block->level,
2545  		.owner_root = block->owner,
2546  		.transid = block->key.offset
2547  	};
2548  	struct extent_buffer *eb;
2549  
2550  	eb = read_tree_block(fs_info, block->bytenr, &check);
2551  	if (IS_ERR(eb))
2552  		return PTR_ERR(eb);
2553  	if (!extent_buffer_uptodate(eb)) {
2554  		free_extent_buffer(eb);
2555  		return -EIO;
2556  	}
2557  	if (block->level == 0)
2558  		btrfs_item_key_to_cpu(eb, &block->key, 0);
2559  	else
2560  		btrfs_node_key_to_cpu(eb, &block->key, 0);
2561  	free_extent_buffer(eb);
2562  	block->key_ready = true;
2563  	return 0;
2564  }
2565  
2566  /*
2567   * helper function to relocate a tree block
2568   */
2569  static int relocate_tree_block(struct btrfs_trans_handle *trans,
2570  				struct reloc_control *rc,
2571  				struct btrfs_backref_node *node,
2572  				struct btrfs_key *key,
2573  				struct btrfs_path *path)
2574  {
2575  	struct btrfs_root *root;
2576  	int ret = 0;
2577  
2578  	if (!node)
2579  		return 0;
2580  
2581  	/*
2582  	 * If we fail here we want to drop our backref_node because we are going
2583  	 * to start over and regenerate the tree for it.
2584  	 */
2585  	ret = reserve_metadata_space(trans, rc, node);
2586  	if (ret)
2587  		goto out;
2588  
2589  	BUG_ON(node->processed);
2590  	root = select_one_root(node);
2591  	if (IS_ERR(root)) {
2592  		ret = PTR_ERR(root);
2593  
2594  		/* See explanation in select_one_root for the -EUCLEAN case. */
2595  		ASSERT(ret == -ENOENT);
2596  		if (ret == -ENOENT) {
2597  			ret = 0;
2598  			update_processed_blocks(rc, node);
2599  		}
2600  		goto out;
2601  	}
2602  
2603  	if (root) {
2604  		if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
2605  			/*
2606  			 * This block was the root block of a root, and this is
2607  			 * the first time we're processing the block and thus it
2608  			 * should not have had the ->new_bytenr modified and
2609  			 * should have not been included on the changed list.
2610  			 *
2611  			 * However in the case of corruption we could have
2612  			 * multiple refs pointing to the same block improperly,
2613  			 * and thus we would trip over these checks.  ASSERT()
2614  			 * for the developer case, because it could indicate a
2615  			 * bug in the backref code, however error out for a
2616  			 * normal user in the case of corruption.
2617  			 */
2618  			ASSERT(node->new_bytenr == 0);
2619  			ASSERT(list_empty(&node->list));
2620  			if (node->new_bytenr || !list_empty(&node->list)) {
2621  				btrfs_err(root->fs_info,
2622  				  "bytenr %llu has improper references to it",
2623  					  node->bytenr);
2624  				ret = -EUCLEAN;
2625  				goto out;
2626  			}
2627  			ret = btrfs_record_root_in_trans(trans, root);
2628  			if (ret)
2629  				goto out;
2630  			/*
2631  			 * Another thread could have failed, need to check if we
2632  			 * have reloc_root actually set.
2633  			 */
2634  			if (!root->reloc_root) {
2635  				ret = -ENOENT;
2636  				goto out;
2637  			}
2638  			root = root->reloc_root;
2639  			node->new_bytenr = root->node->start;
2640  			btrfs_put_root(node->root);
2641  			node->root = btrfs_grab_root(root);
2642  			ASSERT(node->root);
2643  			list_add_tail(&node->list, &rc->backref_cache.changed);
2644  		} else {
2645  			path->lowest_level = node->level;
2646  			if (root == root->fs_info->chunk_root)
2647  				btrfs_reserve_chunk_metadata(trans, false);
2648  			ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2649  			btrfs_release_path(path);
2650  			if (root == root->fs_info->chunk_root)
2651  				btrfs_trans_release_chunk_metadata(trans);
2652  			if (ret > 0)
2653  				ret = 0;
2654  		}
2655  		if (!ret)
2656  			update_processed_blocks(rc, node);
2657  	} else {
2658  		ret = do_relocation(trans, rc, node, key, path, 1);
2659  	}
2660  out:
2661  	if (ret || node->level == 0 || node->cowonly)
2662  		btrfs_backref_cleanup_node(&rc->backref_cache, node);
2663  	return ret;
2664  }
2665  
2666  /*
2667   * relocate a list of blocks
2668   */
2669  static noinline_for_stack
2670  int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2671  			 struct reloc_control *rc, struct rb_root *blocks)
2672  {
2673  	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2674  	struct btrfs_backref_node *node;
2675  	struct btrfs_path *path;
2676  	struct tree_block *block;
2677  	struct tree_block *next;
2678  	int ret = 0;
2679  
2680  	path = btrfs_alloc_path();
2681  	if (!path) {
2682  		ret = -ENOMEM;
2683  		goto out_free_blocks;
2684  	}
2685  
2686  	/* Kick in readahead for tree blocks with missing keys */
2687  	rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
2688  		if (!block->key_ready)
2689  			btrfs_readahead_tree_block(fs_info, block->bytenr,
2690  						   block->owner, 0,
2691  						   block->level);
2692  	}
2693  
2694  	/* Get first keys */
2695  	rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
2696  		if (!block->key_ready) {
2697  			ret = get_tree_block_key(fs_info, block);
2698  			if (ret)
2699  				goto out_free_path;
2700  		}
2701  	}
2702  
2703  	/* Do tree relocation */
2704  	rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
2705  		node = build_backref_tree(trans, rc, &block->key,
2706  					  block->level, block->bytenr);
2707  		if (IS_ERR(node)) {
2708  			ret = PTR_ERR(node);
2709  			goto out;
2710  		}
2711  
2712  		ret = relocate_tree_block(trans, rc, node, &block->key,
2713  					  path);
2714  		if (ret < 0)
2715  			break;
2716  	}
2717  out:
2718  	ret = finish_pending_nodes(trans, rc, path, ret);
2719  
2720  out_free_path:
2721  	btrfs_free_path(path);
2722  out_free_blocks:
2723  	free_block_list(blocks);
2724  	return ret;
2725  }
2726  
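/*
 * Preallocate file extents in the data reloc inode covering the current
 * cluster, one range per cluster boundary, so that the relocated data is
 * written into extents of the same size as the source extents.
 */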
2727  static noinline_for_stack int prealloc_file_extent_cluster(struct reloc_control *rc)
2728  {
2729  	const struct file_extent_cluster *cluster = &rc->cluster;
2730  	struct btrfs_inode *inode = BTRFS_I(rc->data_inode);
2731  	u64 alloc_hint = 0;
2732  	u64 start;
2733  	u64 end;
2734  	u64 offset = inode->reloc_block_group_start;
2735  	u64 num_bytes;
2736  	int nr;
2737  	int ret = 0;
2738  	u64 i_size = i_size_read(&inode->vfs_inode);
2739  	u64 prealloc_start = cluster->start - offset;
2740  	u64 prealloc_end = cluster->end - offset;
2741  	u64 cur_offset = prealloc_start;
2742  
2743  	/*
2744  	 * For the subpage case, the previous i_size may not be aligned to
2745  	 * PAGE_SIZE.  This means the range [i_size, PAGE_END + 1) is filled
2746  	 * with zeros by the btrfs_do_readpage() call on the previously
2747  	 * relocated file cluster.
2748  	 *
2749  	 * If the current cluster starts in the above range, btrfs_do_readpage()
2750  	 * will skip the read, and relocate_one_folio() will later write back
2751  	 * the padding zeros as new data, causing data corruption.
2752  	 * Here we have to manually invalidate the range [i_size, PAGE_END + 1).
2753  	 */
2754  	if (!PAGE_ALIGNED(i_size)) {
2755  		struct address_space *mapping = inode->vfs_inode.i_mapping;
2756  		struct btrfs_fs_info *fs_info = inode->root->fs_info;
2757  		const u32 sectorsize = fs_info->sectorsize;
2758  		struct folio *folio;
2759  
2760  		ASSERT(sectorsize < PAGE_SIZE);
2761  		ASSERT(IS_ALIGNED(i_size, sectorsize));
2762  
2763  		/*
2764  		 * Subpage can't handle page with DIRTY but without UPTODATE
2765  		 * bit as it can lead to the following deadlock:
2766  		 *
2767  		 * btrfs_read_folio()
2768  		 * | Page already *locked*
2769  		 * |- btrfs_lock_and_flush_ordered_range()
2770  		 *    |- btrfs_start_ordered_extent()
2771  		 *       |- extent_write_cache_pages()
2772  		 *          |- lock_page()
2773  		 *             We try to lock the page we already hold.
2774  		 *
2775  		 * Here we just write back the whole data reloc inode, so that
2776  		 * we are ensured to have no dirty range in the page, and are
2777  		 * safe to clear the uptodate bits.
2778  		 *
2779  		 * This shouldn't cause too much overhead, as we need to write
2780  		 * the data back anyway.
2781  		 */
2782  		ret = filemap_write_and_wait(mapping);
2783  		if (ret < 0)
2784  			return ret;
2785  
2786  		clear_extent_bits(&inode->io_tree, i_size,
2787  				  round_up(i_size, PAGE_SIZE) - 1,
2788  				  EXTENT_UPTODATE);
2789  		folio = filemap_lock_folio(mapping, i_size >> PAGE_SHIFT);
2790  		/*
2791  		 * If the folio is freed we don't need to do anything, as we
2792  		 * will re-read the whole folio anyway.
2793  		 */
2794  		if (!IS_ERR(folio)) {
2795  			btrfs_subpage_clear_uptodate(fs_info, folio, i_size,
2796  					round_up(i_size, PAGE_SIZE) - i_size);
2797  			folio_unlock(folio);
2798  			folio_put(folio);
2799  		}
2800  	}
2801  
2802  	BUG_ON(cluster->start != cluster->boundary[0]);
2803  	ret = btrfs_alloc_data_chunk_ondemand(inode,
2804  					      prealloc_end + 1 - prealloc_start);
2805  	if (ret)
2806  		return ret;
2807  
2808  	btrfs_inode_lock(inode, 0);
2809  	for (nr = 0; nr < cluster->nr; nr++) {
2810  		struct extent_state *cached_state = NULL;
2811  
2812  		start = cluster->boundary[nr] - offset;
2813  		if (nr + 1 < cluster->nr)
2814  			end = cluster->boundary[nr + 1] - 1 - offset;
2815  		else
2816  			end = cluster->end - offset;
2817  
2818  		lock_extent(&inode->io_tree, start, end, &cached_state);
2819  		num_bytes = end + 1 - start;
2820  		ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start,
2821  						num_bytes, num_bytes,
2822  						end + 1, &alloc_hint);
2823  		cur_offset = end + 1;
2824  		unlock_extent(&inode->io_tree, start, end, &cached_state);
2825  		if (ret)
2826  			break;
2827  	}
2828  	btrfs_inode_unlock(inode, 0);
2829  
2830  	if (cur_offset < prealloc_end)
2831  		btrfs_free_reserved_data_space_noquota(inode->root->fs_info,
2832  					       prealloc_end + 1 - cur_offset);
2833  	return ret;
2834  }
2835  
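/*
 * Insert a pinned extent map pointing the cluster's file range in the
 * data reloc inode at the old location of the data extents, so that
 * reading the inode pulls in the data that is about to be relocated.
 */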
2836  static noinline_for_stack int setup_relocation_extent_mapping(struct reloc_control *rc)
2837  {
2838  	struct btrfs_inode *inode = BTRFS_I(rc->data_inode);
2839  	struct extent_map *em;
2840  	struct extent_state *cached_state = NULL;
2841  	u64 offset = inode->reloc_block_group_start;
2842  	u64 start = rc->cluster.start - offset;
2843  	u64 end = rc->cluster.end - offset;
2844  	int ret = 0;
2845  
2846  	em = alloc_extent_map();
2847  	if (!em)
2848  		return -ENOMEM;
2849  
2850  	em->start = start;
2851  	em->len = end + 1 - start;
2852  	em->disk_bytenr = rc->cluster.start;
2853  	em->disk_num_bytes = em->len;
2854  	em->ram_bytes = em->len;
2855  	em->flags |= EXTENT_FLAG_PINNED;
2856  
2857  	lock_extent(&inode->io_tree, start, end, &cached_state);
2858  	ret = btrfs_replace_extent_map_range(inode, em, false);
2859  	unlock_extent(&inode->io_tree, start, end, &cached_state);
2860  	free_extent_map(em);
2861  
2862  	return ret;
2863  }
2864  
2865  /*
2866   * Allow error injection to test balance/relocation cancellation
2867   */
2868  noinline int btrfs_should_cancel_balance(const struct btrfs_fs_info *fs_info)
2869  {
2870  	return atomic_read(&fs_info->balance_cancel_req) ||
2871  		atomic_read(&fs_info->reloc_cancel_req) ||
2872  		fatal_signal_pending(current);
2873  }
2874  ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE);
2875  
2876  static u64 get_cluster_boundary_end(const struct file_extent_cluster *cluster,
2877  				    int cluster_nr)
2878  {
2879  	/* Last extent, use cluster end directly */
2880  	if (cluster_nr >= cluster->nr - 1)
2881  		return cluster->end;
2882  
2883  	/* Use next boundary start */
2884  	return cluster->boundary[cluster_nr + 1] - 1;
2885  }
2886  
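/*
 * Relocate the part of the cluster covered by the folio at @index: read
 * the old data in through the pinned extent map, mark it delalloc and
 * dirty, and set the EXTENT_BOUNDARY bit at extent boundaries so that
 * writeback allocates new extents of the same size as the source extents.
 */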
2887  static int relocate_one_folio(struct reloc_control *rc,
2888  			      struct file_ra_state *ra,
2889  			      int *cluster_nr, unsigned long index)
2890  {
2891  	const struct file_extent_cluster *cluster = &rc->cluster;
2892  	struct inode *inode = rc->data_inode;
2893  	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
2894  	u64 offset = BTRFS_I(inode)->reloc_block_group_start;
2895  	const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT;
2896  	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
2897  	struct folio *folio;
2898  	u64 folio_start;
2899  	u64 folio_end;
2900  	u64 cur;
2901  	int ret;
2902  	const bool use_rst = btrfs_need_stripe_tree_update(fs_info, rc->block_group->flags);
2903  
2904  	ASSERT(index <= last_index);
2905  	folio = filemap_lock_folio(inode->i_mapping, index);
2906  	if (IS_ERR(folio)) {
2908  		/*
2909  		 * On relocation we're doing readahead on the relocation inode,
2910  		 * but if the filesystem is backed by a RAID stripe tree we can
2911  		 * get ENOENT (e.g. due to preallocated extents not being
2912  		 * mapped in the RST) from the lookup.
2913  		 *
2914  		 * But readahead doesn't handle the error and submits invalid
2915  		 * reads to the device, causing assertion failures.
2916  		 */
2917  		if (!use_rst)
2918  			page_cache_sync_readahead(inode->i_mapping, ra, NULL,
2919  						  index, last_index + 1 - index);
2920  		folio = __filemap_get_folio(inode->i_mapping, index,
2921  					    FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
2922  					    mask);
2923  		if (IS_ERR(folio))
2924  			return PTR_ERR(folio);
2925  	}
2926  
2927  	WARN_ON(folio_order(folio));
2928  
2929  	if (folio_test_readahead(folio) && !use_rst)
2930  		page_cache_async_readahead(inode->i_mapping, ra, NULL,
2931  					   folio, last_index + 1 - index);
2932  
2933  	if (!folio_test_uptodate(folio)) {
2934  		btrfs_read_folio(NULL, folio);
2935  		folio_lock(folio);
2936  		if (!folio_test_uptodate(folio)) {
2937  			ret = -EIO;
2938  			goto release_folio;
2939  		}
2940  	}
2941  
2942  	/*
2943  	 * We could have lost folio private when we dropped the lock to read
2944  	 * the folio above, so make sure we call set_folio_extent_mapped() here
2945  	 * to have all of the subpage blocksize state we need in place.
2946  	 */
2947  	ret = set_folio_extent_mapped(folio);
2948  	if (ret < 0)
2949  		goto release_folio;
2950  
2951  	folio_start = folio_pos(folio);
2952  	folio_end = folio_start + PAGE_SIZE - 1;
2953  
2954  	/*
2955  	 * Start from the cluster boundary, as in the subpage case the
2956  	 * cluster can start inside the folio.
2957  	 */
2958  	cur = max(folio_start, cluster->boundary[*cluster_nr] - offset);
2959  	while (cur <= folio_end) {
2960  		struct extent_state *cached_state = NULL;
2961  		u64 extent_start = cluster->boundary[*cluster_nr] - offset;
2962  		u64 extent_end = get_cluster_boundary_end(cluster,
2963  						*cluster_nr) - offset;
2964  		u64 clamped_start = max(folio_start, extent_start);
2965  		u64 clamped_end = min(folio_end, extent_end);
2966  		u32 clamped_len = clamped_end + 1 - clamped_start;
2967  
2968  		/* Reserve metadata for this range */
2969  		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
2970  						      clamped_len, clamped_len,
2971  						      false);
2972  		if (ret)
2973  			goto release_folio;
2974  
2975  		/* Mark the range delalloc and dirty for later writeback */
2976  		lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end,
2977  			    &cached_state);
2978  		ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start,
2979  						clamped_end, 0, &cached_state);
2980  		if (ret) {
2981  			clear_extent_bit(&BTRFS_I(inode)->io_tree,
2982  					 clamped_start, clamped_end,
2983  					 EXTENT_LOCKED | EXTENT_BOUNDARY,
2984  					 &cached_state);
2985  			btrfs_delalloc_release_metadata(BTRFS_I(inode),
2986  							clamped_len, true);
2987  			btrfs_delalloc_release_extents(BTRFS_I(inode),
2988  						       clamped_len);
2989  			goto release_folio;
2990  		}
2991  		btrfs_folio_set_dirty(fs_info, folio, clamped_start, clamped_len);
2992  
2993  		/*
2994  		 * Set the boundary if it's inside the folio.
2995  		 * Data relocation requires the destination extents to have the
2996  		 * same size as the source.
2997  		 * EXTENT_BOUNDARY bit prevents current extent from being merged
2998  		 * with previous extent.
2999  		 */
3000  		if (in_range(cluster->boundary[*cluster_nr] - offset, folio_start, PAGE_SIZE)) {
3001  			u64 boundary_start = cluster->boundary[*cluster_nr] -
3002  						offset;
3003  			u64 boundary_end = boundary_start +
3004  					   fs_info->sectorsize - 1;
3005  
3006  			set_extent_bit(&BTRFS_I(inode)->io_tree,
3007  				       boundary_start, boundary_end,
3008  				       EXTENT_BOUNDARY, NULL);
3009  		}
3010  		unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end,
3011  			      &cached_state);
3012  		btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len);
3013  		cur += clamped_len;
3014  
3015  		/* Crossed extent end, go to next extent */
3016  		if (cur >= extent_end) {
3017  			(*cluster_nr)++;
3018  			/* Just finished the last extent of the cluster, exit. */
3019  			if (*cluster_nr >= cluster->nr)
3020  				break;
3021  		}
3022  	}
3023  	folio_unlock(folio);
3024  	folio_put(folio);
3025  
3026  	balance_dirty_pages_ratelimited(inode->i_mapping);
3027  	btrfs_throttle(fs_info);
3028  	if (btrfs_should_cancel_balance(fs_info))
3029  		ret = -ECANCELED;
3030  	return ret;
3031  
3032  release_folio:
3033  	folio_unlock(folio);
3034  	folio_put(folio);
3035  	return ret;
3036  }
3037  
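/*
 * Relocate all data extents of the current cluster: preallocate the
 * destination space, map the file range to the old data location, then
 * read and dirty every folio so that writeback copies the data into the
 * new extents.
 */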
3038  static int relocate_file_extent_cluster(struct reloc_control *rc)
3039  {
3040  	struct inode *inode = rc->data_inode;
3041  	const struct file_extent_cluster *cluster = &rc->cluster;
3042  	u64 offset = BTRFS_I(inode)->reloc_block_group_start;
3043  	unsigned long index;
3044  	unsigned long last_index;
3045  	struct file_ra_state *ra;
3046  	int cluster_nr = 0;
3047  	int ret = 0;
3048  
3049  	if (!cluster->nr)
3050  		return 0;
3051  
3052  	ra = kzalloc(sizeof(*ra), GFP_NOFS);
3053  	if (!ra)
3054  		return -ENOMEM;
3055  
3056  	ret = prealloc_file_extent_cluster(rc);
3057  	if (ret)
3058  		goto out;
3059  
3060  	file_ra_state_init(ra, inode->i_mapping);
3061  
3062  	ret = setup_relocation_extent_mapping(rc);
3063  	if (ret)
3064  		goto out;
3065  
3066  	last_index = (cluster->end - offset) >> PAGE_SHIFT;
3067  	for (index = (cluster->start - offset) >> PAGE_SHIFT;
3068  	     index <= last_index && !ret; index++)
3069  		ret = relocate_one_folio(rc, ra, &cluster_nr, index);
3070  	if (ret == 0)
3071  		WARN_ON(cluster_nr != cluster->nr);
3072  out:
3073  	kfree(ra);
3074  	return ret;
3075  }
3076  
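/*
 * Add one data extent to the cluster being built, flushing the cluster
 * first if the new extent is not adjacent to it, if the owning root
 * changed (simple quotas), or if the cluster is already full.
 */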
3077  static noinline_for_stack int relocate_data_extent(struct reloc_control *rc,
3078  					   const struct btrfs_key *extent_key)
3079  {
3080  	struct inode *inode = rc->data_inode;
3081  	struct file_extent_cluster *cluster = &rc->cluster;
3082  	int ret;
3083  	struct btrfs_root *root = BTRFS_I(inode)->root;
3084  
3085  	if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
3086  		ret = relocate_file_extent_cluster(rc);
3087  		if (ret)
3088  			return ret;
3089  		cluster->nr = 0;
3090  	}
3091  
3092  	/*
3093  	 * Under simple quotas, we set root->relocation_src_root when we find
3094  	 * the extent. If adjacent extents have different owners, we can't merge
3095  	 * them while relocating. Handle this by storing the owning root that
3096  	 * started a cluster and if we see an extent from a different root break
3097  	 * cluster formation (just like the above case of non-adjacent extents).
3098  	 *
3099  	 * Without simple quotas, relocation_src_root is always 0, so we should
3100  	 * never see a mismatch, and it should have no effect on relocation
3101  	 * clusters.
3102  	 */
3103  	if (cluster->nr > 0 && cluster->owning_root != root->relocation_src_root) {
3104  		u64 tmp = root->relocation_src_root;
3105  
3106  		/*
3107  		 * root->relocation_src_root is the state that actually affects
3108  		 * the preallocation we do here, so set it to the root owning
3109  		 * the cluster we need to relocate.
3110  		 */
3111  		root->relocation_src_root = cluster->owning_root;
3112  		ret = relocate_file_extent_cluster(rc);
3113  		if (ret)
3114  			return ret;
3115  		cluster->nr = 0;
3116  		/* And reset it back for the current extent's owning root. */
3117  		root->relocation_src_root = tmp;
3118  	}
3119  
3120  	if (!cluster->nr) {
3121  		cluster->start = extent_key->objectid;
3122  		cluster->owning_root = root->relocation_src_root;
3123  	} else {
3124  		BUG_ON(cluster->nr >= MAX_EXTENTS);
3125  	}
3126  	cluster->end = extent_key->objectid + extent_key->offset - 1;
3127  	cluster->boundary[cluster->nr] = extent_key->objectid;
3128  	cluster->nr++;
3129  
3130  	if (cluster->nr >= MAX_EXTENTS) {
3131  		ret = relocate_file_extent_cluster(rc);
3132  		if (ret)
3133  			return ret;
3134  		cluster->nr = 0;
3135  	}
3136  	return 0;
3137  }
3138  
3139  /*
3140   * helper to add a tree block to the list.
3141   * the major work is getting the generation and level of the block
3142   */
3143  static int add_tree_block(struct reloc_control *rc,
3144  			  const struct btrfs_key *extent_key,
3145  			  struct btrfs_path *path,
3146  			  struct rb_root *blocks)
3147  {
3148  	struct extent_buffer *eb;
3149  	struct btrfs_extent_item *ei;
3150  	struct btrfs_tree_block_info *bi;
3151  	struct tree_block *block;
3152  	struct rb_node *rb_node;
3153  	u32 item_size;
3154  	int level = -1;
3155  	u64 generation;
3156  	u64 owner = 0;
3157  
3158  	eb = path->nodes[0];
3159  	item_size = btrfs_item_size(eb, path->slots[0]);
3160  
3161  	if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
3162  	    item_size >= sizeof(*ei) + sizeof(*bi)) {
3163  		unsigned long ptr = 0, end;
3164  
3165  		ei = btrfs_item_ptr(eb, path->slots[0],
3166  				struct btrfs_extent_item);
3167  		end = (unsigned long)ei + item_size;
3168  		if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) {
3169  			bi = (struct btrfs_tree_block_info *)(ei + 1);
3170  			level = btrfs_tree_block_level(eb, bi);
3171  			ptr = (unsigned long)(bi + 1);
3172  		} else {
3173  			level = (int)extent_key->offset;
3174  			ptr = (unsigned long)(ei + 1);
3175  		}
3176  		generation = btrfs_extent_generation(eb, ei);
3177  
3178  		/*
3179  		 * We're reading random blocks without knowing their owner ahead
3180  		 * of time.  This is ok most of the time, as all reloc roots and
3181  		 * fs roots have the same lock type.  However normal trees do
3182  		 * not, and the only way to know ahead of time is to read the
3183  		 * inline ref offset.  We know it's an fs root if
3184  		 *
3185  		 * 1. There's more than one ref.
3186  		 * 2. There's a SHARED_DATA_REF_KEY set.
3187  		 * 3. FULL_BACKREF is set on the flags.
3188  		 *
3189  		 * Otherwise it's safe to assume that the ref offset == the
3190  		 * owner of this block, so we can use that when calling
3191  		 * read_tree_block.
3192  		 */
3193  		if (btrfs_extent_refs(eb, ei) == 1 &&
3194  		    !(btrfs_extent_flags(eb, ei) &
3195  		      BTRFS_BLOCK_FLAG_FULL_BACKREF) &&
3196  		    ptr < end) {
3197  			struct btrfs_extent_inline_ref *iref;
3198  			int type;
3199  
3200  			iref = (struct btrfs_extent_inline_ref *)ptr;
3201  			type = btrfs_get_extent_inline_ref_type(eb, iref,
3202  							BTRFS_REF_TYPE_BLOCK);
3203  			if (type == BTRFS_REF_TYPE_INVALID)
3204  				return -EINVAL;
3205  			if (type == BTRFS_TREE_BLOCK_REF_KEY)
3206  				owner = btrfs_extent_inline_ref_offset(eb, iref);
3207  		}
3208  	} else {
3209  		btrfs_print_leaf(eb);
3210  		btrfs_err(rc->block_group->fs_info,
3211  			  "unrecognized tree backref at tree block %llu slot %u",
3212  			  eb->start, path->slots[0]);
3213  		btrfs_release_path(path);
3214  		return -EUCLEAN;
3215  	}
3216  
3217  	btrfs_release_path(path);
3218  
3219  	BUG_ON(level == -1);
3220  
3221  	block = kmalloc(sizeof(*block), GFP_NOFS);
3222  	if (!block)
3223  		return -ENOMEM;
3224  
3225  	block->bytenr = extent_key->objectid;
3226  	block->key.objectid = rc->extent_root->fs_info->nodesize;
3227  	block->key.offset = generation;
3228  	block->level = level;
3229  	block->key_ready = false;
3230  	block->owner = owner;
3231  
3232  	rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node);
3233  	if (rb_node)
3234  		btrfs_backref_panic(rc->extent_root->fs_info, block->bytenr,
3235  				    -EEXIST);
3236  
3237  	return 0;
3238  }
3239  
3240  /*
3241   * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY
3242   */
3243  static int __add_tree_block(struct reloc_control *rc,
3244  			    u64 bytenr, u32 blocksize,
3245  			    struct rb_root *blocks)
3246  {
3247  	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3248  	struct btrfs_path *path;
3249  	struct btrfs_key key;
3250  	int ret;
3251  	bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
3252  
3253  	if (tree_block_processed(bytenr, rc))
3254  		return 0;
3255  
3256  	if (rb_simple_search(blocks, bytenr))
3257  		return 0;
3258  
3259  	path = btrfs_alloc_path();
3260  	if (!path)
3261  		return -ENOMEM;
3262  again:
3263  	key.objectid = bytenr;
3264  	if (skinny) {
3265  		key.type = BTRFS_METADATA_ITEM_KEY;
3266  		key.offset = (u64)-1;
3267  	} else {
3268  		key.type = BTRFS_EXTENT_ITEM_KEY;
3269  		key.offset = blocksize;
3270  	}
3271  
3272  	path->search_commit_root = 1;
3273  	path->skip_locking = 1;
3274  	ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
3275  	if (ret < 0)
3276  		goto out;
3277  
3278  	if (ret > 0 && skinny) {
3279  		if (path->slots[0]) {
3280  			path->slots[0]--;
3281  			btrfs_item_key_to_cpu(path->nodes[0], &key,
3282  					      path->slots[0]);
3283  			if (key.objectid == bytenr &&
3284  			    (key.type == BTRFS_METADATA_ITEM_KEY ||
3285  			     (key.type == BTRFS_EXTENT_ITEM_KEY &&
3286  			      key.offset == blocksize)))
3287  				ret = 0;
3288  		}
3289  
3290  		if (ret) {
3291  			skinny = false;
3292  			btrfs_release_path(path);
3293  			goto again;
3294  		}
3295  	}
3296  	if (ret) {
3297  		ASSERT(ret == 1);
3298  		btrfs_print_leaf(path->nodes[0]);
3299  		btrfs_err(fs_info,
3300  	     "tree block extent item (%llu) is not found in extent tree",
3301  		     bytenr);
3302  		WARN_ON(1);
3303  		ret = -EINVAL;
3304  		goto out;
3305  	}
3306  
3307  	ret = add_tree_block(rc, &key, path, blocks);
3308  out:
3309  	btrfs_free_path(path);
3310  	return ret;
3311  }
3312  
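/*
 * Truncate the free space cache inode used by @block_group (looked up via
 * @ino when @inode is NULL), so that the cache's data extent no longer
 * blocks relocation.
 */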
3313  static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
3314  				    struct btrfs_block_group *block_group,
3315  				    struct inode *inode,
3316  				    u64 ino)
3317  {
3318  	struct btrfs_root *root = fs_info->tree_root;
3319  	struct btrfs_trans_handle *trans;
3320  	int ret = 0;
3321  
3322  	if (inode)
3323  		goto truncate;
3324  
3325  	inode = btrfs_iget(ino, root);
3326  	if (IS_ERR(inode))
3327  		return -ENOENT;
3328  
3329  truncate:
3330  	ret = btrfs_check_trunc_cache_free_space(fs_info,
3331  						 &fs_info->global_block_rsv);
3332  	if (ret)
3333  		goto out;
3334  
3335  	trans = btrfs_join_transaction(root);
3336  	if (IS_ERR(trans)) {
3337  		ret = PTR_ERR(trans);
3338  		goto out;
3339  	}
3340  
3341  	ret = btrfs_truncate_free_space_cache(trans, block_group, inode);
3342  
3343  	btrfs_end_transaction(trans);
3344  	btrfs_btree_balance_dirty(fs_info);
3345  out:
3346  	iput(inode);
3347  	return ret;
3348  }
3349  
3350  /*
3351   * Locate the free space cache EXTENT_DATA in the root tree leaf and delete
3352   * the cache inode, so that its data extent does not block data relocation.
3353   */
3354  static int delete_v1_space_cache(struct extent_buffer *leaf,
3355  				 struct btrfs_block_group *block_group,
3356  				 u64 data_bytenr)
3357  {
3358  	u64 space_cache_ino;
3359  	struct btrfs_file_extent_item *ei;
3360  	struct btrfs_key key;
3361  	bool found = false;
3362  	int i;
3363  	int ret;
3364  
3365  	if (btrfs_header_owner(leaf) != BTRFS_ROOT_TREE_OBJECTID)
3366  		return 0;
3367  
3368  	for (i = 0; i < btrfs_header_nritems(leaf); i++) {
3369  		u8 type;
3370  
3371  		btrfs_item_key_to_cpu(leaf, &key, i);
3372  		if (key.type != BTRFS_EXTENT_DATA_KEY)
3373  			continue;
3374  		ei = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
3375  		type = btrfs_file_extent_type(leaf, ei);
3376  
3377  		if ((type == BTRFS_FILE_EXTENT_REG ||
3378  		     type == BTRFS_FILE_EXTENT_PREALLOC) &&
3379  		    btrfs_file_extent_disk_bytenr(leaf, ei) == data_bytenr) {
3380  			found = true;
3381  			space_cache_ino = key.objectid;
3382  			break;
3383  		}
3384  	}
3385  	if (!found)
3386  		return -ENOENT;
3387  	ret = delete_block_group_cache(leaf->fs_info, block_group, NULL,
3388  					space_cache_ino);
3389  	return ret;
3390  }
3391  
3392  /*
3393   * Helper to find all tree blocks that reference a given data extent.
3394   */
3395  static noinline_for_stack int add_data_references(struct reloc_control *rc,
3396  						  const struct btrfs_key *extent_key,
3397  						  struct btrfs_path *path,
3398  						  struct rb_root *blocks)
3399  {
3400  	struct btrfs_backref_walk_ctx ctx = { 0 };
3401  	struct ulist_iterator leaf_uiter;
3402  	struct ulist_node *ref_node = NULL;
3403  	const u32 blocksize = rc->extent_root->fs_info->nodesize;
3404  	int ret = 0;
3405  
3406  	btrfs_release_path(path);
3407  
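	/* Walk backrefs to collect every tree leaf referring to this data extent. */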
3408  	ctx.bytenr = extent_key->objectid;
3409  	ctx.skip_inode_ref_list = true;
3410  	ctx.fs_info = rc->extent_root->fs_info;
3411  
3412  	ret = btrfs_find_all_leafs(&ctx);
3413  	if (ret < 0)
3414  		return ret;
3415  
3416  	ULIST_ITER_INIT(&leaf_uiter);
3417  	while ((ref_node = ulist_next(ctx.refs, &leaf_uiter))) {
3418  		struct btrfs_tree_parent_check check = { 0 };
3419  		struct extent_buffer *eb;
3420  
3421  		eb = read_tree_block(ctx.fs_info, ref_node->val, &check);
3422  		if (IS_ERR(eb)) {
3423  			ret = PTR_ERR(eb);
3424  			break;
3425  		}
3426  		ret = delete_v1_space_cache(eb, rc->block_group,
3427  					    extent_key->objectid);
3428  		free_extent_buffer(eb);
3429  		if (ret < 0)
3430  			break;
3431  		ret = __add_tree_block(rc, ref_node->val, blocksize, blocks);
3432  		if (ret < 0)
3433  			break;
3434  	}
3435  	if (ret < 0)
3436  		free_block_list(blocks);
3437  	ulist_free(ctx.refs);
3438  	return ret;
3439  }
3440  
3441  /*
3442   * Helper to find the next unprocessed extent.
3443   */
3444  static noinline_for_stack
3445  int find_next_extent(struct reloc_control *rc, struct btrfs_path *path,
3446  		     struct btrfs_key *extent_key)
3447  {
3448  	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3449  	struct btrfs_key key;
3450  	struct extent_buffer *leaf;
3451  	u64 start, end, last;
3452  	int ret;
3453  
3454  	last = rc->block_group->start + rc->block_group->length;
3455  	while (1) {
3456  		bool block_found;
3457  
3458  		cond_resched();
3459  		if (rc->search_start >= last) {
3460  			ret = 1;
3461  			break;
3462  		}
3463  
3464  		key.objectid = rc->search_start;
3465  		key.type = BTRFS_EXTENT_ITEM_KEY;
3466  		key.offset = 0;
3467  
3468  		path->search_commit_root = 1;
3469  		path->skip_locking = 1;
3470  		ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
3471  					0, 0);
3472  		if (ret < 0)
3473  			break;
3474  next:
3475  		leaf = path->nodes[0];
3476  		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3477  			ret = btrfs_next_leaf(rc->extent_root, path);
3478  			if (ret != 0)
3479  				break;
3480  			leaf = path->nodes[0];
3481  		}
3482  
3483  		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3484  		if (key.objectid >= last) {
3485  			ret = 1;
3486  			break;
3487  		}
3488  
3489  		if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3490  		    key.type != BTRFS_METADATA_ITEM_KEY) {
3491  			path->slots[0]++;
3492  			goto next;
3493  		}
3494  
3495  		if (key.type == BTRFS_EXTENT_ITEM_KEY &&
3496  		    key.objectid + key.offset <= rc->search_start) {
3497  			path->slots[0]++;
3498  			goto next;
3499  		}
3500  
3501  		if (key.type == BTRFS_METADATA_ITEM_KEY &&
3502  		    key.objectid + fs_info->nodesize <=
3503  		    rc->search_start) {
3504  			path->slots[0]++;
3505  			goto next;
3506  		}
3507  
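		/*
		 * If this extent was already processed, skip the whole
		 * processed range and continue the search right after it.
		 */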
3508  		block_found = find_first_extent_bit(&rc->processed_blocks,
3509  						    key.objectid, &start, &end,
3510  						    EXTENT_DIRTY, NULL);
3511  
3512  		if (block_found && start <= key.objectid) {
3513  			btrfs_release_path(path);
3514  			rc->search_start = end + 1;
3515  		} else {
3516  			if (key.type == BTRFS_EXTENT_ITEM_KEY)
3517  				rc->search_start = key.objectid + key.offset;
3518  			else
3519  				rc->search_start = key.objectid +
3520  					fs_info->nodesize;
3521  			memcpy(extent_key, &key, sizeof(key));
3522  			return 0;
3523  		}
3524  	}
3525  	btrfs_release_path(path);
3526  	return ret;
3527  }
3528  
3529  static void set_reloc_control(struct reloc_control *rc)
3530  {
3531  	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3532  
3533  	mutex_lock(&fs_info->reloc_mutex);
3534  	fs_info->reloc_ctl = rc;
3535  	mutex_unlock(&fs_info->reloc_mutex);
3536  }
3537  
3538  static void unset_reloc_control(struct reloc_control *rc)
3539  {
3540  	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3541  
3542  	mutex_lock(&fs_info->reloc_mutex);
3543  	fs_info->reloc_ctl = NULL;
3544  	mutex_unlock(&fs_info->reloc_mutex);
3545  }
3546  
3547  static noinline_for_stack
3548  int prepare_to_relocate(struct reloc_control *rc)
3549  {
3550  	struct btrfs_trans_handle *trans;
3551  	int ret;
3552  
3553  	rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root->fs_info,
3554  					      BTRFS_BLOCK_RSV_TEMP);
3555  	if (!rc->block_rsv)
3556  		return -ENOMEM;
3557  
3558  	memset(&rc->cluster, 0, sizeof(rc->cluster));
3559  	rc->search_start = rc->block_group->start;
3560  	rc->extents_found = 0;
3561  	rc->nodes_relocated = 0;
3562  	rc->merging_rsv_size = 0;
3563  	rc->reserved_bytes = 0;
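	/*
	 * Reserve enough metadata space to COW up to
	 * RELOCATION_RESERVED_NODES tree blocks.
	 */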
3564  	rc->block_rsv->size = rc->extent_root->fs_info->nodesize *
3565  			      RELOCATION_RESERVED_NODES;
3566  	ret = btrfs_block_rsv_refill(rc->extent_root->fs_info,
3567  				     rc->block_rsv, rc->block_rsv->size,
3568  				     BTRFS_RESERVE_FLUSH_ALL);
3569  	if (ret)
3570  		return ret;
3571  
3572  	rc->create_reloc_tree = true;
3573  	set_reloc_control(rc);
3574  
3575  	trans = btrfs_join_transaction(rc->extent_root);
3576  	if (IS_ERR(trans)) {
3577  		unset_reloc_control(rc);
3578  		/*
3579  		 * The extent tree is not a ref_cow tree and has no reloc_root
3580  		 * to clean up.  Callers are responsible for freeing the block
3581  		 * rsv allocated above.
3582  		 */
3583  		return PTR_ERR(trans);
3584  	}
3585  
3586  	ret = btrfs_commit_transaction(trans);
3587  	if (ret)
3588  		unset_reloc_control(rc);
3589  
3590  	return ret;
3591  }
3592  
3593  static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3594  {
3595  	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3596  	struct rb_root blocks = RB_ROOT;
3597  	struct btrfs_key key;
3598  	struct btrfs_trans_handle *trans = NULL;
3599  	struct btrfs_path *path;
3600  	struct btrfs_extent_item *ei;
3601  	u64 flags;
3602  	int ret;
3603  	int err = 0;
3604  	int progress = 0;
3605  
3606  	path = btrfs_alloc_path();
3607  	if (!path)
3608  		return -ENOMEM;
3609  	path->reada = READA_FORWARD;
3610  
3611  	ret = prepare_to_relocate(rc);
3612  	if (ret) {
3613  		err = ret;
3614  		goto out_free;
3615  	}
3616  
3617  	while (1) {
3618  		rc->reserved_bytes = 0;
3619  		ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv,
3620  					     rc->block_rsv->size,
3621  					     BTRFS_RESERVE_FLUSH_ALL);
3622  		if (ret) {
3623  			err = ret;
3624  			break;
3625  		}
3626  		progress++;
3627  		trans = btrfs_start_transaction(rc->extent_root, 0);
3628  		if (IS_ERR(trans)) {
3629  			err = PTR_ERR(trans);
3630  			trans = NULL;
3631  			break;
3632  		}
3633  restart:
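		/*
		 * The backref cache is only valid within a single transaction;
		 * drop it if it was built in an earlier one.
		 */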
3634  		if (rc->backref_cache.last_trans != trans->transid)
3635  			btrfs_backref_release_cache(&rc->backref_cache);
3636  		rc->backref_cache.last_trans = trans->transid;
3637  
3638  		ret = find_next_extent(rc, path, &key);
3639  		if (ret < 0)
3640  			err = ret;
3641  		if (ret != 0)
3642  			break;
3643  
3644  		rc->extents_found++;
3645  
3646  		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3647  				    struct btrfs_extent_item);
3648  		flags = btrfs_extent_flags(path->nodes[0], ei);
3649  
3650  		/*
3651  		 * If we are relocating a simple quota owned extent item, we
3652  		 * need to note the owner on the reloc data root so that when
3653  		 * we allocate the replacement item, we can attribute it to the
3654  		 * correct eventual owner (rather than the reloc data root).
3655  		 */
3656  		if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) {
3657  			struct btrfs_root *root = BTRFS_I(rc->data_inode)->root;
3658  			u64 owning_root_id = btrfs_get_extent_owner_root(fs_info,
3659  								 path->nodes[0],
3660  								 path->slots[0]);
3661  
3662  			root->relocation_src_root = owning_root_id;
3663  		}
3664  
3665  		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
3666  			ret = add_tree_block(rc, &key, path, &blocks);
3667  		} else if (rc->stage == UPDATE_DATA_PTRS &&
3668  			   (flags & BTRFS_EXTENT_FLAG_DATA)) {
3669  			ret = add_data_references(rc, &key, path, &blocks);
3670  		} else {
3671  			btrfs_release_path(path);
3672  			ret = 0;
3673  		}
3674  		if (ret < 0) {
3675  			err = ret;
3676  			break;
3677  		}
3678  
3679  		if (!RB_EMPTY_ROOT(&blocks)) {
3680  			ret = relocate_tree_blocks(trans, rc, &blocks);
3681  			if (ret < 0) {
3682  				if (ret != -EAGAIN) {
3683  					err = ret;
3684  					break;
3685  				}
3686  				rc->extents_found--;
3687  				rc->search_start = key.objectid;
3688  			}
3689  		}
3690  
3691  		btrfs_end_transaction_throttle(trans);
3692  		btrfs_btree_balance_dirty(fs_info);
3693  		trans = NULL;
3694  
3695  		if (rc->stage == MOVE_DATA_EXTENTS &&
3696  		    (flags & BTRFS_EXTENT_FLAG_DATA)) {
3697  			rc->found_file_extent = true;
3698  			ret = relocate_data_extent(rc, &key);
3699  			if (ret < 0) {
3700  				err = ret;
3701  				break;
3702  			}
3703  		}
3704  		if (btrfs_should_cancel_balance(fs_info)) {
3705  			err = -ECANCELED;
3706  			break;
3707  		}
3708  	}
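	/*
	 * If we made progress but hit ENOSPC, try to force allocation of a new
	 * chunk of the same type and restart from the last search position.
	 */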
3709  	if (trans && progress && err == -ENOSPC) {
3710  		ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags);
3711  		if (ret == 1) {
3712  			err = 0;
3713  			progress = 0;
3714  			goto restart;
3715  		}
3716  	}
3717  
3718  	btrfs_release_path(path);
3719  	clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY);
3720  
3721  	if (trans) {
3722  		btrfs_end_transaction_throttle(trans);
3723  		btrfs_btree_balance_dirty(fs_info);
3724  	}
3725  
3726  	if (!err) {
3727  		ret = relocate_file_extent_cluster(rc);
3728  		if (ret < 0)
3729  			err = ret;
3730  	}
3731  
3732  	rc->create_reloc_tree = false;
3733  	set_reloc_control(rc);
3734  
3735  	btrfs_backref_release_cache(&rc->backref_cache);
3736  	btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);
3737  
3738  	/*
3739  	 * Even when the relocation is cancelled, we must still go through
3740  	 * prepare_to_merge() and merge_reloc_roots().
3741  	 *
3742  	 * On error (including a cancelled balance), prepare_to_merge() will
3743  	 * mark all reloc trees orphan, then queue them for cleanup in
3744  	 * merge_reloc_roots().
3745  	 */
3746  	err = prepare_to_merge(rc, err);
3747  
3748  	merge_reloc_roots(rc);
3749  
3750  	rc->merge_reloc_tree = false;
3751  	unset_reloc_control(rc);
3752  	btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);
3753  
3754  	/* get rid of pinned extents */
3755  	trans = btrfs_join_transaction(rc->extent_root);
3756  	if (IS_ERR(trans)) {
3757  		err = PTR_ERR(trans);
3758  		goto out_free;
3759  	}
3760  	ret = btrfs_commit_transaction(trans);
3761  	if (ret && !err)
3762  		err = ret;
3763  out_free:
3764  	ret = clean_dirty_subvols(rc);
3765  	if (ret < 0 && !err)
3766  		err = ret;
3767  	btrfs_free_block_rsv(fs_info, rc->block_rsv);
3768  	btrfs_free_path(path);
3769  	return err;
3770  }
3771  
3772  static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
3773  				 struct btrfs_root *root, u64 objectid)
3774  {
3775  	struct btrfs_path *path;
3776  	struct btrfs_inode_item *item;
3777  	struct extent_buffer *leaf;
3778  	int ret;
3779  
3780  	path = btrfs_alloc_path();
3781  	if (!path)
3782  		return -ENOMEM;
3783  
3784  	ret = btrfs_insert_empty_inode(trans, root, path, objectid);
3785  	if (ret)
3786  		goto out;
3787  
3788  	leaf = path->nodes[0];
3789  	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
3790  	memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
3791  	btrfs_set_inode_generation(leaf, item, 1);
3792  	btrfs_set_inode_size(leaf, item, 0);
3793  	btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
3794  	btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
3795  					  BTRFS_INODE_PREALLOC);
3796  	btrfs_mark_buffer_dirty(trans, leaf);
3797  out:
3798  	btrfs_free_path(path);
3799  	return ret;
3800  }
3801  
3802  static void delete_orphan_inode(struct btrfs_trans_handle *trans,
3803  				struct btrfs_root *root, u64 objectid)
3804  {
3805  	struct btrfs_path *path;
3806  	struct btrfs_key key;
3807  	int ret = 0;
3808  
3809  	path = btrfs_alloc_path();
3810  	if (!path) {
3811  		ret = -ENOMEM;
3812  		goto out;
3813  	}
3814  
3815  	key.objectid = objectid;
3816  	key.type = BTRFS_INODE_ITEM_KEY;
3817  	key.offset = 0;
3818  	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3819  	if (ret) {
3820  		if (ret > 0)
3821  			ret = -ENOENT;
3822  		goto out;
3823  	}
3824  	ret = btrfs_del_item(trans, root, path);
3825  out:
3826  	if (ret)
3827  		btrfs_abort_transaction(trans, ret);
3828  	btrfs_free_path(path);
3829  }
3830  
3831  /*
3832   * Helper to create an inode for data relocation.
3833   * The inode lives in the data relocation tree and its link count is 0.
3834   */
3835  static noinline_for_stack struct inode *create_reloc_inode(
3836  					struct btrfs_fs_info *fs_info,
3837  					const struct btrfs_block_group *group)
3838  {
3839  	struct inode *inode = NULL;
3840  	struct btrfs_trans_handle *trans;
3841  	struct btrfs_root *root;
3842  	u64 objectid;
3843  	int ret = 0;
3844  
3845  	root = btrfs_grab_root(fs_info->data_reloc_root);
3846  	trans = btrfs_start_transaction(root, 6);
3847  	if (IS_ERR(trans)) {
3848  		btrfs_put_root(root);
3849  		return ERR_CAST(trans);
3850  	}
3851  
3852  	ret = btrfs_get_free_objectid(root, &objectid);
3853  	if (ret)
3854  		goto out;
3855  
3856  	ret = __insert_orphan_inode(trans, root, objectid);
3857  	if (ret)
3858  		goto out;
3859  
3860  	inode = btrfs_iget(objectid, root);
3861  	if (IS_ERR(inode)) {
3862  		delete_orphan_inode(trans, root, objectid);
3863  		ret = PTR_ERR(inode);
3864  		inode = NULL;
3865  		goto out;
3866  	}
3867  	BTRFS_I(inode)->reloc_block_group_start = group->start;
3868  
3869  	ret = btrfs_orphan_add(trans, BTRFS_I(inode));
3870  out:
3871  	btrfs_put_root(root);
3872  	btrfs_end_transaction(trans);
3873  	btrfs_btree_balance_dirty(fs_info);
3874  	if (ret) {
3875  		iput(inode);
3876  		inode = ERR_PTR(ret);
3877  	}
3878  	return inode;
3879  }
3880  
3881  /*
3882   * Mark the start of a cancellable chunk relocation. Check whether cancellation
3883   * has been requested in the meantime and don't start in that case.
3884   *
3885   * Return:
3886   *   0             success
3887   *   -EINPROGRESS  operation is already in progress, that's probably a bug
3888   *   -ECANCELED    cancellation request was set before the operation started
3889   */
3890  static int reloc_chunk_start(struct btrfs_fs_info *fs_info)
3891  {
3892  	if (test_and_set_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) {
3893  		/* This should not happen */
3894  		btrfs_err(fs_info, "reloc already running, cannot start");
3895  		return -EINPROGRESS;
3896  	}
3897  
3898  	if (atomic_read(&fs_info->reloc_cancel_req) > 0) {
3899  		btrfs_info(fs_info, "chunk relocation canceled on start");
3900  		/*
3901  		 * On cancel, clear all requests but let the caller mark
3902  		 * the end after cleanup operations.
3903  		 */
3904  		atomic_set(&fs_info->reloc_cancel_req, 0);
3905  		return -ECANCELED;
3906  	}
3907  	return 0;
3908  }
3909  
3910  /*
3911   * Mark end of chunk relocation that is cancellable and wake any waiters.
3912   */
3913  static void reloc_chunk_end(struct btrfs_fs_info *fs_info)
3914  {
3915  	/* Cancellation requested after start; clear the bit first so waiters can continue. */
3916  	if (atomic_read(&fs_info->reloc_cancel_req) > 0)
3917  		btrfs_info(fs_info, "chunk relocation canceled during operation");
3918  	clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags);
3919  	atomic_set(&fs_info->reloc_cancel_req, 0);
3920  }
3921  
3922  static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
3923  {
3924  	struct reloc_control *rc;
3925  
3926  	rc = kzalloc(sizeof(*rc), GFP_NOFS);
3927  	if (!rc)
3928  		return NULL;
3929  
3930  	INIT_LIST_HEAD(&rc->reloc_roots);
3931  	INIT_LIST_HEAD(&rc->dirty_subvol_roots);
3932  	btrfs_backref_init_cache(fs_info, &rc->backref_cache, true);
3933  	rc->reloc_root_tree.rb_root = RB_ROOT;
3934  	spin_lock_init(&rc->reloc_root_tree.lock);
3935  	extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS);
3936  	return rc;
3937  }
3938  
3939  static void free_reloc_control(struct reloc_control *rc)
3940  {
3941  	struct mapping_node *node, *tmp;
3942  
3943  	free_reloc_roots(&rc->reloc_roots);
3944  	rbtree_postorder_for_each_entry_safe(node, tmp,
3945  			&rc->reloc_root_tree.rb_root, rb_node)
3946  		kfree(node);
3947  
3948  	kfree(rc);
3949  }
3950  
3951  /*
3952   * Print the block group being relocated
3953   */
3954  static void describe_relocation(struct btrfs_block_group *block_group)
3955  {
3956  	char buf[128] = {'\0'};
3957  
3958  	btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf));
3959  
3960  	btrfs_info(block_group->fs_info, "relocating block group %llu flags %s",
3961  		   block_group->start, buf);
3962  }
3963  
3964  static const char *stage_to_string(enum reloc_stage stage)
3965  {
3966  	if (stage == MOVE_DATA_EXTENTS)
3967  		return "move data extents";
3968  	if (stage == UPDATE_DATA_PTRS)
3969  		return "update data pointers";
3970  	return "unknown";
3971  }
3972  
3973  /*
3974   * Relocate all extents in a block group.
3975   */
3976  int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
3977  {
3978  	struct btrfs_block_group *bg;
3979  	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, group_start);
3980  	struct reloc_control *rc;
3981  	struct inode *inode;
3982  	struct btrfs_path *path;
3983  	int ret;
3984  	int rw = 0;
3985  	int err = 0;
3986  
3987  	/*
3988  	 * This only gets set if we had a half-deleted snapshot on mount.  We
3989  	 * cannot allow relocation to start while we're still trying to clean up
3990  	 * these pending deletions.
3991  	 */
3992  	ret = wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS, TASK_INTERRUPTIBLE);
3993  	if (ret)
3994  		return ret;
3995  
3996  	/* We may have been woken up by close_ctree, so bail if we're closing. */
3997  	if (btrfs_fs_closing(fs_info))
3998  		return -EINTR;
3999  
4000  	bg = btrfs_lookup_block_group(fs_info, group_start);
4001  	if (!bg)
4002  		return -ENOENT;
4003  
4004  	/*
4005  	 * Relocation of a data block group creates ordered extents.  Without
4006  	 * sb_start_write(), we can freeze the filesystem while unfinished
4007  	 * ordered extents are left. Such ordered extents can cause a deadlock
4008  	 * e.g. when syncfs() is waiting for their completion but they can't
4009  	 * finish because they block when joining a transaction, due to the
4010  	 * fact that the freeze locks are being held in write mode.
4011  	 */
4012  	if (bg->flags & BTRFS_BLOCK_GROUP_DATA)
4013  		ASSERT(sb_write_started(fs_info->sb));
4014  
4015  	if (btrfs_pinned_by_swapfile(fs_info, bg)) {
4016  		btrfs_put_block_group(bg);
4017  		return -ETXTBSY;
4018  	}
4019  
4020  	rc = alloc_reloc_control(fs_info);
4021  	if (!rc) {
4022  		btrfs_put_block_group(bg);
4023  		return -ENOMEM;
4024  	}
4025  
4026  	ret = reloc_chunk_start(fs_info);
4027  	if (ret < 0) {
4028  		err = ret;
4029  		goto out_put_bg;
4030  	}
4031  
4032  	rc->extent_root = extent_root;
4033  	rc->block_group = bg;
4034  
4035  	ret = btrfs_inc_block_group_ro(rc->block_group, true);
4036  	if (ret) {
4037  		err = ret;
4038  		goto out;
4039  	}
4040  	rw = 1;
4041  
4042  	path = btrfs_alloc_path();
4043  	if (!path) {
4044  		err = -ENOMEM;
4045  		goto out;
4046  	}
4047  
4048  	inode = lookup_free_space_inode(rc->block_group, path);
4049  	btrfs_free_path(path);
4050  
4051  	if (!IS_ERR(inode))
4052  		ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
4053  	else
4054  		ret = PTR_ERR(inode);
4055  
4056  	if (ret && ret != -ENOENT) {
4057  		err = ret;
4058  		goto out;
4059  	}
4060  
4061  	rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
4062  	if (IS_ERR(rc->data_inode)) {
4063  		err = PTR_ERR(rc->data_inode);
4064  		rc->data_inode = NULL;
4065  		goto out;
4066  	}
4067  
4068  	describe_relocation(rc->block_group);
4069  
4070  	btrfs_wait_block_group_reservations(rc->block_group);
4071  	btrfs_wait_nocow_writers(rc->block_group);
4072  	btrfs_wait_ordered_roots(fs_info, U64_MAX, rc->block_group);
4073  
4074  	ret = btrfs_zone_finish(rc->block_group);
4075  	WARN_ON(ret && ret != -EAGAIN);
4076  
4077  	while (1) {
4078  		enum reloc_stage finishes_stage;
4079  
4080  		mutex_lock(&fs_info->cleaner_mutex);
4081  		ret = relocate_block_group(rc);
4082  		mutex_unlock(&fs_info->cleaner_mutex);
4083  		if (ret < 0)
4084  			err = ret;
4085  
4086  		finishes_stage = rc->stage;
4087  		/*
4088  		 * We may have gotten ENOSPC after we already dirtied some
4089  		 * extents.  If writeout happens while we're relocating a
4090  		 * different block group we could end up hitting the
4091  		 * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in
4092  		 * btrfs_reloc_cow_block.  Make sure we write everything out
4093  		 * properly so we don't trip over this problem, and then break
4094  		 * out of the loop if we hit an error.
4095  		 */
4096  		if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
4097  			ret = btrfs_wait_ordered_range(BTRFS_I(rc->data_inode), 0,
4098  						       (u64)-1);
4099  			if (ret)
4100  				err = ret;
4101  			invalidate_mapping_pages(rc->data_inode->i_mapping,
4102  						 0, -1);
4103  			rc->stage = UPDATE_DATA_PTRS;
4104  		}
4105  
4106  		if (err < 0)
4107  			goto out;
4108  
4109  		if (rc->extents_found == 0)
4110  			break;
4111  
4112  		btrfs_info(fs_info, "found %llu extents, stage: %s",
4113  			   rc->extents_found, stage_to_string(finishes_stage));
4114  	}
4115  
4116  	WARN_ON(rc->block_group->pinned > 0);
4117  	WARN_ON(rc->block_group->reserved > 0);
4118  	WARN_ON(rc->block_group->used > 0);
4119  out:
4120  	if (err && rw)
4121  		btrfs_dec_block_group_ro(rc->block_group);
4122  	iput(rc->data_inode);
4123  out_put_bg:
4124  	btrfs_put_block_group(bg);
4125  	reloc_chunk_end(fs_info);
4126  	free_reloc_control(rc);
4127  	return err;
4128  }
4129  
4130  static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
4131  {
4132  	struct btrfs_fs_info *fs_info = root->fs_info;
4133  	struct btrfs_trans_handle *trans;
4134  	int ret, err;
4135  
4136  	trans = btrfs_start_transaction(fs_info->tree_root, 0);
4137  	if (IS_ERR(trans))
4138  		return PTR_ERR(trans);
4139  
4140  	memset(&root->root_item.drop_progress, 0,
4141  		sizeof(root->root_item.drop_progress));
4142  	btrfs_set_root_drop_level(&root->root_item, 0);
4143  	btrfs_set_root_refs(&root->root_item, 0);
4144  	ret = btrfs_update_root(trans, fs_info->tree_root,
4145  				&root->root_key, &root->root_item);
4146  
4147  	err = btrfs_end_transaction(trans);
4148  	if (err)
4149  		return err;
4150  	return ret;
4151  }
4152  
4153  /*
4154   * Recover a relocation interrupted by a system crash.
4155   *
4156   * This function resumes merging reloc trees with their corresponding fs
4157   * trees, which is important for preserving the sharing of tree blocks.
4158   */
4159  int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
4160  {
4161  	LIST_HEAD(reloc_roots);
4162  	struct btrfs_key key;
4163  	struct btrfs_root *fs_root;
4164  	struct btrfs_root *reloc_root;
4165  	struct btrfs_path *path;
4166  	struct extent_buffer *leaf;
4167  	struct reloc_control *rc = NULL;
4168  	struct btrfs_trans_handle *trans;
4169  	int ret2;
4170  	int ret = 0;
4171  
4172  	path = btrfs_alloc_path();
4173  	if (!path)
4174  		return -ENOMEM;
4175  	path->reada = READA_BACK;
4176  
4177  	key.objectid = BTRFS_TREE_RELOC_OBJECTID;
4178  	key.type = BTRFS_ROOT_ITEM_KEY;
4179  	key.offset = (u64)-1;
4180  
4181  	while (1) {
4182  		ret = btrfs_search_slot(NULL, fs_info->tree_root, &key,
4183  					path, 0, 0);
4184  		if (ret < 0)
4185  			goto out;
4186  		if (ret > 0) {
4187  			if (path->slots[0] == 0)
4188  				break;
4189  			path->slots[0]--;
4190  		}
4191  		ret = 0;
4192  		leaf = path->nodes[0];
4193  		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4194  		btrfs_release_path(path);
4195  
4196  		if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
4197  		    key.type != BTRFS_ROOT_ITEM_KEY)
4198  			break;
4199  
4200  		reloc_root = btrfs_read_tree_root(fs_info->tree_root, &key);
4201  		if (IS_ERR(reloc_root)) {
4202  			ret = PTR_ERR(reloc_root);
4203  			goto out;
4204  		}
4205  
4206  		set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
4207  		list_add(&reloc_root->root_list, &reloc_roots);
4208  
4209  		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
4210  			fs_root = btrfs_get_fs_root(fs_info,
4211  					reloc_root->root_key.offset, false);
4212  			if (IS_ERR(fs_root)) {
4213  				ret = PTR_ERR(fs_root);
4214  				if (ret != -ENOENT)
4215  					goto out;
4216  				ret = mark_garbage_root(reloc_root);
4217  				if (ret < 0)
4218  					goto out;
4219  				ret = 0;
4220  			} else {
4221  				btrfs_put_root(fs_root);
4222  			}
4223  		}
4224  
4225  		if (key.offset == 0)
4226  			break;
4227  
4228  		key.offset--;
4229  	}
4230  	btrfs_release_path(path);
4231  
4232  	if (list_empty(&reloc_roots))
4233  		goto out;
4234  
4235  	rc = alloc_reloc_control(fs_info);
4236  	if (!rc) {
4237  		ret = -ENOMEM;
4238  		goto out;
4239  	}
4240  
4241  	ret = reloc_chunk_start(fs_info);
4242  	if (ret < 0)
4243  		goto out_end;
4244  
4245  	rc->extent_root = btrfs_extent_root(fs_info, 0);
4246  
4247  	set_reloc_control(rc);
4248  
4249  	trans = btrfs_join_transaction(rc->extent_root);
4250  	if (IS_ERR(trans)) {
4251  		ret = PTR_ERR(trans);
4252  		goto out_unset;
4253  	}
4254  
4255  	rc->merge_reloc_tree = true;
4256  
4257  	while (!list_empty(&reloc_roots)) {
4258  		reloc_root = list_entry(reloc_roots.next,
4259  					struct btrfs_root, root_list);
4260  		list_del(&reloc_root->root_list);
4261  
4262  		if (btrfs_root_refs(&reloc_root->root_item) == 0) {
4263  			list_add_tail(&reloc_root->root_list,
4264  				      &rc->reloc_roots);
4265  			continue;
4266  		}
4267  
4268  		fs_root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
4269  					    false);
4270  		if (IS_ERR(fs_root)) {
4271  			ret = PTR_ERR(fs_root);
4272  			list_add_tail(&reloc_root->root_list, &reloc_roots);
4273  			btrfs_end_transaction(trans);
4274  			goto out_unset;
4275  		}
4276  
4277  		ret = __add_reloc_root(reloc_root);
4278  		ASSERT(ret != -EEXIST);
4279  		if (ret) {
4280  			list_add_tail(&reloc_root->root_list, &reloc_roots);
4281  			btrfs_put_root(fs_root);
4282  			btrfs_end_transaction(trans);
4283  			goto out_unset;
4284  		}
4285  		fs_root->reloc_root = btrfs_grab_root(reloc_root);
4286  		btrfs_put_root(fs_root);
4287  	}
4288  
4289  	ret = btrfs_commit_transaction(trans);
4290  	if (ret)
4291  		goto out_unset;
4292  
4293  	merge_reloc_roots(rc);
4294  
4295  	unset_reloc_control(rc);
4296  
4297  	trans = btrfs_join_transaction(rc->extent_root);
4298  	if (IS_ERR(trans)) {
4299  		ret = PTR_ERR(trans);
4300  		goto out_clean;
4301  	}
4302  	ret = btrfs_commit_transaction(trans);
4303  out_clean:
4304  	ret2 = clean_dirty_subvols(rc);
4305  	if (ret2 < 0 && !ret)
4306  		ret = ret2;
4307  out_unset:
4308  	unset_reloc_control(rc);
4309  out_end:
4310  	reloc_chunk_end(fs_info);
4311  	free_reloc_control(rc);
4312  out:
4313  	free_reloc_roots(&reloc_roots);
4314  
4315  	btrfs_free_path(path);
4316  
4317  	if (ret == 0) {
4318  		/* cleanup orphan inode in data relocation tree */
4319  		fs_root = btrfs_grab_root(fs_info->data_reloc_root);
4320  		ASSERT(fs_root);
4321  		ret = btrfs_orphan_cleanup(fs_root);
4322  		btrfs_put_root(fs_root);
4323  	}
4324  	return ret;
4325  }
4326  
4327  /*
4328   * Helper to add ordered checksums for data relocation.
4329   *
4330   * Cloning the checksums properly handles nodatasum extents and also saves
4331   * the CPU time of recalculating them.
4332   */
4333  int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered)
4334  {
4335  	struct btrfs_inode *inode = ordered->inode;
4336  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
4337  	u64 disk_bytenr = ordered->file_offset + inode->reloc_block_group_start;
4338  	struct btrfs_root *csum_root = btrfs_csum_root(fs_info, disk_bytenr);
4339  	LIST_HEAD(list);
4340  	int ret;
4341  
4342  	ret = btrfs_lookup_csums_list(csum_root, disk_bytenr,
4343  				      disk_bytenr + ordered->num_bytes - 1,
4344  				      &list, false);
4345  	if (ret < 0) {
4346  		btrfs_mark_ordered_extent_error(ordered);
4347  		return ret;
4348  	}
4349  
4350  	while (!list_empty(&list)) {
4351  		struct btrfs_ordered_sum *sums =
4352  			list_entry(list.next, struct btrfs_ordered_sum, list);
4353  
4354  		list_del_init(&sums->list);
4355  
4356  		/*
4357  		 * We need to offset the new_bytenr based on where the csum is.
4358  		 * We need to do this because we will read in entire prealloc
4359  		 * extents but we may have written to, say, the middle of the
4360  		 * prealloc extent, so we need to make sure the csum goes with
4361  		 * the right disk offset.
4362  		 *
4363  		 * We can do this because the data reloc inode refers strictly
4364  		 * to the on disk bytes, so we don't have to worry about
4365  		 * disk_len vs real len like with real inodes since it's all
4366  		 * disk length.
4367  		 */
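		/*
		 * For illustration with hypothetical numbers: if the csums
		 * were looked up starting at disk_bytenr 1M and the relocated
		 * copy starts at ordered->disk_bytenr 9M, a csum at logical
		 * 1M + 64K is rebased to 9M + (1M + 64K) - 1M = 9M + 64K.
		 */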
4368  		sums->logical = ordered->disk_bytenr + sums->logical - disk_bytenr;
4369  		btrfs_add_ordered_sum(ordered, sums);
4370  	}
4371  
4372  	return 0;
4373  }
4374  
4375  int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
4376  			  struct btrfs_root *root,
4377  			  const struct extent_buffer *buf,
4378  			  struct extent_buffer *cow)
4379  {
4380  	struct btrfs_fs_info *fs_info = root->fs_info;
4381  	struct reloc_control *rc;
4382  	struct btrfs_backref_node *node;
4383  	int first_cow = 0;
4384  	int level;
4385  	int ret = 0;
4386  
4387  	rc = fs_info->reloc_ctl;
4388  	if (!rc)
4389  		return 0;
4390  
4391  	BUG_ON(rc->stage == UPDATE_DATA_PTRS && btrfs_is_data_reloc_root(root));
4392  
4393  	level = btrfs_header_level(buf);
4394  	if (btrfs_header_generation(buf) <=
4395  	    btrfs_root_last_snapshot(&root->root_item))
4396  		first_cow = 1;
4397  
4398  	if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID && rc->create_reloc_tree) {
4399  		WARN_ON(!first_cow && level == 0);
4400  
4401  		node = rc->backref_cache.path[level];
4402  		BUG_ON(node->bytenr != buf->start &&
4403  		       node->new_bytenr != buf->start);
4404  
4405  		btrfs_backref_drop_node_buffer(node);
4406  		atomic_inc(&cow->refs);
4407  		node->eb = cow;
4408  		node->new_bytenr = cow->start;
4409  
4410  		if (!node->pending) {
4411  			list_move_tail(&node->list,
4412  				       &rc->backref_cache.pending[level]);
4413  			node->pending = 1;
4414  		}
4415  
4416  		if (first_cow)
4417  			mark_block_processed(rc, node);
4418  
4419  		if (first_cow && level > 0)
4420  			rc->nodes_relocated += buf->len;
4421  	}
4422  
4423  	if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
4424  		ret = replace_file_extents(trans, rc, root, cow);
4425  	return ret;
4426  }
4427  
4428  /*
4429   * Called before creating a snapshot.  It calculates the metadata reservation
4430   * required for relocating tree blocks in the snapshot.
4431   */
4432  void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
4433  			      u64 *bytes_to_reserve)
4434  {
4435  	struct btrfs_root *root = pending->root;
4436  	struct reloc_control *rc = root->fs_info->reloc_ctl;
4437  
4438  	if (!rc || !have_reloc_root(root))
4439  		return;
4440  
4441  	if (!rc->merge_reloc_tree)
4442  		return;
4443  
4444  	root = root->reloc_root;
4445  	BUG_ON(btrfs_root_refs(&root->root_item) == 0);
4446  	/*
4447  	 * Relocation is in the stage of merging trees.  In the worst
4448  	 * case the space used by merging a reloc tree is twice the
4449  	 * size of the relocated tree nodes: half for COWing the reloc
4450  	 * tree and half for COWing the fs tree.  The space used by
4451  	 * COWing the reloc tree will be freed after the tree is
4452  	 * dropped.  If we create a snapshot, COWing the fs tree may
4453  	 * use more space than it frees, so we need to reserve extra
4454  	 * space.
4455  	 */
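	/*
	 * For example (hypothetical numbers): after relocating 64 tree blocks
	 * of 16K each, nodes_relocated is 1M, so creating a snapshot while the
	 * merge is pending reserves an extra 1M.
	 */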
4456  	*bytes_to_reserve += rc->nodes_relocated;
4457  }
4458  
4459  /*
4460   * Called after the snapshot is created.  Migrate the block reservation
4461   * and create a reloc root for the newly created snapshot.
4462   *
4463   * This is similar to btrfs_init_reloc_root(), we come out of here with two
4464   * references held on the reloc_root, one for root->reloc_root and one for
4465   * rc->reloc_roots.
4466   */
4467  int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
4468  			       struct btrfs_pending_snapshot *pending)
4469  {
4470  	struct btrfs_root *root = pending->root;
4471  	struct btrfs_root *reloc_root;
4472  	struct btrfs_root *new_root;
4473  	struct reloc_control *rc = root->fs_info->reloc_ctl;
4474  	int ret;
4475  
4476  	if (!rc || !have_reloc_root(root))
4477  		return 0;
4478  
4479  	rc = root->fs_info->reloc_ctl;
4480  	rc->merging_rsv_size += rc->nodes_relocated;
4481  
4482  	if (rc->merge_reloc_tree) {
4483  		ret = btrfs_block_rsv_migrate(&pending->block_rsv,
4484  					      rc->block_rsv,
4485  					      rc->nodes_relocated, true);
4486  		if (ret)
4487  			return ret;
4488  	}
4489  
4490  	new_root = pending->snap;
4491  	reloc_root = create_reloc_root(trans, root->reloc_root, btrfs_root_id(new_root));
4492  	if (IS_ERR(reloc_root))
4493  		return PTR_ERR(reloc_root);
4494  
4495  	ret = __add_reloc_root(reloc_root);
4496  	ASSERT(ret != -EEXIST);
4497  	if (ret) {
4498  		/* Pairs with create_reloc_root */
4499  		btrfs_put_root(reloc_root);
4500  		return ret;
4501  	}
4502  	new_root->reloc_root = btrfs_grab_root(reloc_root);
4503  
4504  	if (rc->create_reloc_tree)
4505  		ret = clone_backref_node(trans, rc, root, reloc_root);
4506  	return ret;
4507  }
4508  
4509  /*
4510   * Get the current bytenr for the block group which is being relocated.
4511   *
4512   * Return U64_MAX if no running relocation.
4513   */
4514  u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info)
4515  {
4516  	u64 logical = U64_MAX;
4517  
4518  	lockdep_assert_held(&fs_info->reloc_mutex);
4519  
4520  	if (fs_info->reloc_ctl && fs_info->reloc_ctl->block_group)
4521  		logical = fs_info->reloc_ctl->block_group->start;
4522  	return logical;
4523  }
4524