1  // SPDX-License-Identifier: GPL-2.0
2  /*
3   * Copyright (C) 2007 Oracle.  All rights reserved.
4   */
5  
6  #include <linux/fs.h>
7  #include <linux/slab.h>
8  #include <linux/sched.h>
9  #include <linux/sched/mm.h>
10  #include <linux/writeback.h>
11  #include <linux/pagemap.h>
12  #include <linux/blkdev.h>
13  #include <linux/uuid.h>
14  #include <linux/timekeeping.h>
15  #include "misc.h"
16  #include "ctree.h"
17  #include "disk-io.h"
18  #include "transaction.h"
19  #include "locking.h"
20  #include "tree-log.h"
21  #include "volumes.h"
22  #include "dev-replace.h"
23  #include "qgroup.h"
24  #include "block-group.h"
25  #include "space-info.h"
26  #include "fs.h"
27  #include "accessors.h"
28  #include "extent-tree.h"
29  #include "root-tree.h"
30  #include "dir-item.h"
31  #include "uuid-tree.h"
32  #include "ioctl.h"
33  #include "relocation.h"
34  #include "scrub.h"
35  
36  static struct kmem_cache *btrfs_trans_handle_cachep;
37  
38  /*
39   * Transaction states and transitions
40   *
41   * No running transaction (fs tree blocks are not modified)
42   * |
43   * | To next stage:
44   * |  Call start_transaction() variants. Except btrfs_join_transaction_nostart().
45   * V
46   * Transaction N [[TRANS_STATE_RUNNING]]
47   * |
48   * | New trans handles can be attached to transaction N by calling all
49   * | start_transaction() variants.
50   * |
51   * | To next stage:
52   * |  Call btrfs_commit_transaction() on any trans handle attached to
53   * |  transaction N
54   * V
55   * Transaction N [[TRANS_STATE_COMMIT_PREP]]
56   * |
57   * | If there are simultaneous calls to btrfs_commit_transaction() one will win
58   * | the race and the rest will wait for the winner to commit the transaction.
59   * |
60   * | The winner will wait for previous running transaction to completely finish
61   * | if there is one.
62   * |
63   * Transaction N [[TRANS_STATE_COMMIT_START]]
64   * |
65   * | Then one of the following happens:
66   * | - Wait for all other trans handle holders to release.
67   * |   The btrfs_commit_transaction() caller will do the commit work.
68   * | - Wait for current transaction to be committed by others.
69   * |   Other btrfs_commit_transaction() caller will do the commit work.
70   * |
71   * | At this stage, only btrfs_join_transaction*() variants can attach
72   * | to this running transaction.
73   * | All other variants will wait for current one to finish and attach to
74   * | transaction N+1.
75   * |
76   * | To next stage:
77   * |  Caller is chosen to commit transaction N, and all other trans handles
78   * |  have been released.
79   * V
80   * Transaction N [[TRANS_STATE_COMMIT_DOING]]
81   * |
82   * | The heavy lifting transaction work is started.
83   * | From running delayed refs (modifying extent tree) to creating pending
84   * | snapshots, running qgroups.
85   * | In short, modify supporting trees to reflect modifications of subvolume
86   * | trees.
87   * |
88   * | At this stage, all start_transaction() calls will wait for this
89   * | transaction to finish and attach to transaction N+1.
90   * |
91   * | To next stage:
92   * |  Until all supporting trees are updated.
93   * V
94   * Transaction N [[TRANS_STATE_UNBLOCKED]]
95   * |						    Transaction N+1
96   * | All needed trees are modified, thus we only    [[TRANS_STATE_RUNNING]]
97   * | need to write them back to disk and update	    |
98   * | super blocks.				    |
99   * |						    |
100   * | At this stage, new transaction is allowed to   |
101   * | start.					    |
102   * | All new start_transaction() calls will be	    |
103   * | attached to transid N+1.			    |
104   * |						    |
105   * | To next stage:				    |
106   * |  Until all tree blocks and super blocks are    |
107   * |  written to block devices			    |
108   * V						    |
109   * Transaction N [[TRANS_STATE_COMPLETED]]	    V
110   *   All tree blocks and super blocks are written.  Transaction N+1
111   *   This transaction is finished and all its	    [[TRANS_STATE_COMMIT_START]]
112   *   data structures will be cleaned up.	    | Life goes on
113   */
114  static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
115  	[TRANS_STATE_RUNNING]		= 0U,
116  	[TRANS_STATE_COMMIT_PREP]	= 0U,
117  	[TRANS_STATE_COMMIT_START]	= (__TRANS_START | __TRANS_ATTACH),
118  	[TRANS_STATE_COMMIT_DOING]	= (__TRANS_START |
119  					   __TRANS_ATTACH |
120  					   __TRANS_JOIN |
121  					   __TRANS_JOIN_NOSTART),
122  	[TRANS_STATE_UNBLOCKED]		= (__TRANS_START |
123  					   __TRANS_ATTACH |
124  					   __TRANS_JOIN |
125  					   __TRANS_JOIN_NOLOCK |
126  					   __TRANS_JOIN_NOSTART),
127  	[TRANS_STATE_SUPER_COMMITTED]	= (__TRANS_START |
128  					   __TRANS_ATTACH |
129  					   __TRANS_JOIN |
130  					   __TRANS_JOIN_NOLOCK |
131  					   __TRANS_JOIN_NOSTART),
132  	[TRANS_STATE_COMPLETED]		= (__TRANS_START |
133  					   __TRANS_ATTACH |
134  					   __TRANS_JOIN |
135  					   __TRANS_JOIN_NOLOCK |
136  					   __TRANS_JOIN_NOSTART),
137  };
138  
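/*
 * Illustrative sketch of how a typical caller drives the state machine
 * documented above: reserve space for a known number of items, attach a
 * handle to the running transaction, modify trees, then either release the
 * handle (a later commit persists the changes) or commit explicitly.  The
 * function name and the item count are made up for the example.
 */
static int __maybe_unused example_transaction_user(struct btrfs_root *root,
						   bool commit_now)
{
	struct btrfs_trans_handle *trans;

	/* Reserve metadata space for (hypothetically) two item insertions. */
	trans = btrfs_start_transaction(root, 2);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	/* ... modify btrees here using @trans ... */

	if (commit_now)
		/* Drives COMMIT_PREP -> ... -> COMPLETED for transaction N. */
		return btrfs_commit_transaction(trans);

	/* Drop our handle only; the transaction kthread will commit later. */
	return btrfs_end_transaction(trans);
}
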
139  void btrfs_put_transaction(struct btrfs_transaction *transaction)
140  {
141  	WARN_ON(refcount_read(&transaction->use_count) == 0);
142  	if (refcount_dec_and_test(&transaction->use_count)) {
143  		BUG_ON(!list_empty(&transaction->list));
144  		WARN_ON(!RB_EMPTY_ROOT(
145  				&transaction->delayed_refs.href_root.rb_root));
146  		WARN_ON(!xa_empty(&transaction->delayed_refs.dirty_extents));
147  		if (transaction->delayed_refs.pending_csums)
148  			btrfs_err(transaction->fs_info,
149  				  "pending csums is %llu",
150  				  transaction->delayed_refs.pending_csums);
151  		/*
152  		 * If any block groups are found in ->deleted_bgs then it's
153  		 * because the transaction was aborted and a commit did not
154  		 * happen (things failed before writing the new superblock
155  		 * and calling btrfs_finish_extent_commit()), so we can not
156  		 * discard the physical locations of the block groups.
157  		 */
158  		while (!list_empty(&transaction->deleted_bgs)) {
159  			struct btrfs_block_group *cache;
160  
161  			cache = list_first_entry(&transaction->deleted_bgs,
162  						 struct btrfs_block_group,
163  						 bg_list);
164  			list_del_init(&cache->bg_list);
165  			btrfs_unfreeze_block_group(cache);
166  			btrfs_put_block_group(cache);
167  		}
168  		WARN_ON(!list_empty(&transaction->dev_update_list));
169  		kfree(transaction);
170  	}
171  }
172  
173  static noinline void switch_commit_roots(struct btrfs_trans_handle *trans)
174  {
175  	struct btrfs_transaction *cur_trans = trans->transaction;
176  	struct btrfs_fs_info *fs_info = trans->fs_info;
177  	struct btrfs_root *root, *tmp;
178  
179  	/*
180  	 * At this point no one can be using this transaction to modify any tree
181  	 * and no one can start another transaction to modify any tree either.
182  	 */
183  	ASSERT(cur_trans->state == TRANS_STATE_COMMIT_DOING);
184  
185  	down_write(&fs_info->commit_root_sem);
186  
187  	if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags))
188  		fs_info->last_reloc_trans = trans->transid;
189  
190  	list_for_each_entry_safe(root, tmp, &cur_trans->switch_commits,
191  				 dirty_list) {
192  		list_del_init(&root->dirty_list);
193  		free_extent_buffer(root->commit_root);
194  		root->commit_root = btrfs_root_node(root);
195  		extent_io_tree_release(&root->dirty_log_pages);
196  		btrfs_qgroup_clean_swapped_blocks(root);
197  	}
198  
199  	/* We can free old roots now. */
200  	spin_lock(&cur_trans->dropped_roots_lock);
201  	while (!list_empty(&cur_trans->dropped_roots)) {
202  		root = list_first_entry(&cur_trans->dropped_roots,
203  					struct btrfs_root, root_list);
204  		list_del_init(&root->root_list);
205  		spin_unlock(&cur_trans->dropped_roots_lock);
206  		btrfs_free_log(trans, root);
207  		btrfs_drop_and_free_fs_root(fs_info, root);
208  		spin_lock(&cur_trans->dropped_roots_lock);
209  	}
210  	spin_unlock(&cur_trans->dropped_roots_lock);
211  
212  	up_write(&fs_info->commit_root_sem);
213  }
214  
215  static inline void extwriter_counter_inc(struct btrfs_transaction *trans,
216  					 unsigned int type)
217  {
218  	if (type & TRANS_EXTWRITERS)
219  		atomic_inc(&trans->num_extwriters);
220  }
221  
222  static inline void extwriter_counter_dec(struct btrfs_transaction *trans,
223  					 unsigned int type)
224  {
225  	if (type & TRANS_EXTWRITERS)
226  		atomic_dec(&trans->num_extwriters);
227  }
228  
229  static inline void extwriter_counter_init(struct btrfs_transaction *trans,
230  					  unsigned int type)
231  {
232  	atomic_set(&trans->num_extwriters, ((type & TRANS_EXTWRITERS) ? 1 : 0));
233  }
234  
235  static inline int extwriter_counter_read(struct btrfs_transaction *trans)
236  {
237  	return atomic_read(&trans->num_extwriters);
238  }
239  
240  /*
241   * To be called after doing the chunk btree updates right after allocating a new
242   * chunk (after btrfs_chunk_alloc_add_chunk_item() is called), when removing a
243   * chunk after all chunk btree updates and after finishing the second phase of
244   * chunk allocation (btrfs_create_pending_block_groups()) in case some block
245   * group had its chunk item insertion delayed to the second phase.
246   */
247  void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
248  {
249  	struct btrfs_fs_info *fs_info = trans->fs_info;
250  
251  	if (!trans->chunk_bytes_reserved)
252  		return;
253  
254  	btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv,
255  				trans->chunk_bytes_reserved, NULL);
256  	trans->chunk_bytes_reserved = 0;
257  }
258  
259  /*
260   * either allocate a new transaction or hop into the existing one
261   */
262  static noinline int join_transaction(struct btrfs_fs_info *fs_info,
263  				     unsigned int type)
264  {
265  	struct btrfs_transaction *cur_trans;
266  
267  	spin_lock(&fs_info->trans_lock);
268  loop:
269  	/* The file system has been taken offline. No new transactions. */
270  	if (BTRFS_FS_ERROR(fs_info)) {
271  		spin_unlock(&fs_info->trans_lock);
272  		return -EROFS;
273  	}
274  
275  	cur_trans = fs_info->running_transaction;
276  	if (cur_trans) {
277  		if (TRANS_ABORTED(cur_trans)) {
278  			spin_unlock(&fs_info->trans_lock);
279  			return cur_trans->aborted;
280  		}
281  		if (btrfs_blocked_trans_types[cur_trans->state] & type) {
282  			spin_unlock(&fs_info->trans_lock);
283  			return -EBUSY;
284  		}
285  		refcount_inc(&cur_trans->use_count);
286  		atomic_inc(&cur_trans->num_writers);
287  		extwriter_counter_inc(cur_trans, type);
288  		spin_unlock(&fs_info->trans_lock);
289  		btrfs_lockdep_acquire(fs_info, btrfs_trans_num_writers);
290  		btrfs_lockdep_acquire(fs_info, btrfs_trans_num_extwriters);
291  		return 0;
292  	}
293  	spin_unlock(&fs_info->trans_lock);
294  
295  	/*
296  	 * If we are ATTACH or TRANS_JOIN_NOSTART, we just want to catch the
297  	 * current transaction, and commit it. If there is no transaction, just
298  	 * return ENOENT.
299  	 */
300  	if (type == TRANS_ATTACH || type == TRANS_JOIN_NOSTART)
301  		return -ENOENT;
302  
303  	/*
304  	 * JOIN_NOLOCK only happens during the transaction commit, so
305  	 * it is impossible that ->running_transaction is NULL
306  	 */
307  	BUG_ON(type == TRANS_JOIN_NOLOCK);
308  
309  	cur_trans = kmalloc(sizeof(*cur_trans), GFP_NOFS);
310  	if (!cur_trans)
311  		return -ENOMEM;
312  
313  	btrfs_lockdep_acquire(fs_info, btrfs_trans_num_writers);
314  	btrfs_lockdep_acquire(fs_info, btrfs_trans_num_extwriters);
315  
316  	spin_lock(&fs_info->trans_lock);
317  	if (fs_info->running_transaction) {
318  		/*
319  		 * someone started a transaction after we unlocked.  Make sure
320  		 * to redo the checks above
321  		 */
322  		btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters);
323  		btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
324  		kfree(cur_trans);
325  		goto loop;
326  	} else if (BTRFS_FS_ERROR(fs_info)) {
327  		spin_unlock(&fs_info->trans_lock);
328  		btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters);
329  		btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
330  		kfree(cur_trans);
331  		return -EROFS;
332  	}
333  
334  	cur_trans->fs_info = fs_info;
335  	atomic_set(&cur_trans->pending_ordered, 0);
336  	init_waitqueue_head(&cur_trans->pending_wait);
337  	atomic_set(&cur_trans->num_writers, 1);
338  	extwriter_counter_init(cur_trans, type);
339  	init_waitqueue_head(&cur_trans->writer_wait);
340  	init_waitqueue_head(&cur_trans->commit_wait);
341  	cur_trans->state = TRANS_STATE_RUNNING;
342  	/*
343  	 * One for this trans handle, one so it will live on until we
344  	 * commit the transaction.
345  	 */
346  	refcount_set(&cur_trans->use_count, 2);
347  	cur_trans->flags = 0;
348  	cur_trans->start_time = ktime_get_seconds();
349  
350  	memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs));
351  
352  	cur_trans->delayed_refs.href_root = RB_ROOT_CACHED;
353  	xa_init(&cur_trans->delayed_refs.dirty_extents);
354  	atomic_set(&cur_trans->delayed_refs.num_entries, 0);
355  
356  	/*
357  	 * although the tree mod log is per file system and not per transaction,
358  	 * the log must never go across transaction boundaries.
359  	 */
360  	smp_mb();
361  	if (!list_empty(&fs_info->tree_mod_seq_list))
362  		WARN(1, KERN_ERR "BTRFS: tree_mod_seq_list not empty when creating a fresh transaction\n");
363  	if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log))
364  		WARN(1, KERN_ERR "BTRFS: tree_mod_log rb tree not empty when creating a fresh transaction\n");
365  	atomic64_set(&fs_info->tree_mod_seq, 0);
366  
367  	spin_lock_init(&cur_trans->delayed_refs.lock);
368  
369  	INIT_LIST_HEAD(&cur_trans->pending_snapshots);
370  	INIT_LIST_HEAD(&cur_trans->dev_update_list);
371  	INIT_LIST_HEAD(&cur_trans->switch_commits);
372  	INIT_LIST_HEAD(&cur_trans->dirty_bgs);
373  	INIT_LIST_HEAD(&cur_trans->io_bgs);
374  	INIT_LIST_HEAD(&cur_trans->dropped_roots);
375  	mutex_init(&cur_trans->cache_write_mutex);
376  	spin_lock_init(&cur_trans->dirty_bgs_lock);
377  	INIT_LIST_HEAD(&cur_trans->deleted_bgs);
378  	spin_lock_init(&cur_trans->dropped_roots_lock);
379  	list_add_tail(&cur_trans->list, &fs_info->trans_list);
380  	extent_io_tree_init(fs_info, &cur_trans->dirty_pages,
381  			IO_TREE_TRANS_DIRTY_PAGES);
382  	extent_io_tree_init(fs_info, &cur_trans->pinned_extents,
383  			IO_TREE_FS_PINNED_EXTENTS);
384  	btrfs_set_fs_generation(fs_info, fs_info->generation + 1);
385  	cur_trans->transid = fs_info->generation;
386  	fs_info->running_transaction = cur_trans;
387  	cur_trans->aborted = 0;
388  	spin_unlock(&fs_info->trans_lock);
389  
390  	return 0;
391  }
392  
393  /*
394   * This does all the record keeping required to make sure that a shareable root
395   * is properly recorded in a given transaction.  This is required to make sure
396   * the old root from before we joined the transaction is deleted when the
397   * transaction commits.
398   */
399  static int record_root_in_trans(struct btrfs_trans_handle *trans,
400  			       struct btrfs_root *root,
401  			       int force)
402  {
403  	struct btrfs_fs_info *fs_info = root->fs_info;
404  	int ret = 0;
405  
406  	if ((test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
407  	    btrfs_get_root_last_trans(root) < trans->transid) || force) {
408  		WARN_ON(!force && root->commit_root != root->node);
409  
410  		/*
411  		 * see below for IN_TRANS_SETUP usage rules
412  		 * we have the reloc mutex held now, so there
413  		 * is only one writer in this function
414  		 */
415  		set_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
416  
417  		/* make sure readers find IN_TRANS_SETUP before
418  		 * they find our root->last_trans update
419  		 */
420  		smp_wmb();
421  
422  		spin_lock(&fs_info->fs_roots_radix_lock);
423  		if (btrfs_get_root_last_trans(root) == trans->transid && !force) {
424  			spin_unlock(&fs_info->fs_roots_radix_lock);
425  			return 0;
426  		}
427  		radix_tree_tag_set(&fs_info->fs_roots_radix,
428  				   (unsigned long)btrfs_root_id(root),
429  				   BTRFS_ROOT_TRANS_TAG);
430  		spin_unlock(&fs_info->fs_roots_radix_lock);
431  		btrfs_set_root_last_trans(root, trans->transid);
432  
433  		/* this is pretty tricky.  We don't want to
434  		 * take the relocation lock in btrfs_record_root_in_trans
435  		 * unless we're really doing the first setup for this root in
436  		 * this transaction.
437  		 *
438  		 * Normally we'd use root->last_trans as a flag to decide
439  		 * if we want to take the expensive mutex.
440  		 *
441  		 * But, we have to set root->last_trans before we
442  		 * init the relocation root, otherwise, we trip over warnings
443  		 * in ctree.c.  The solution used here is to flag ourselves
444  		 * with root IN_TRANS_SETUP.  When this is 1, we're still
445  		 * fixing up the reloc trees and everyone must wait.
446  		 *
447  		 * When this is zero, they can trust root->last_trans and fly
448  		 * through btrfs_record_root_in_trans without having to take the
449  		 * lock.  smp_wmb() makes sure that all the writes above are
450  		 * done before we pop in the zero below
451  		 */
452  		ret = btrfs_init_reloc_root(trans, root);
453  		smp_mb__before_atomic();
454  		clear_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
455  	}
456  	return ret;
457  }
458  
459  
460  void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
461  			    struct btrfs_root *root)
462  {
463  	struct btrfs_fs_info *fs_info = root->fs_info;
464  	struct btrfs_transaction *cur_trans = trans->transaction;
465  
466  	/* Add ourselves to the transaction dropped list */
467  	spin_lock(&cur_trans->dropped_roots_lock);
468  	list_add_tail(&root->root_list, &cur_trans->dropped_roots);
469  	spin_unlock(&cur_trans->dropped_roots_lock);
470  
471  	/* Make sure we don't try to update the root at commit time */
472  	spin_lock(&fs_info->fs_roots_radix_lock);
473  	radix_tree_tag_clear(&fs_info->fs_roots_radix,
474  			     (unsigned long)btrfs_root_id(root),
475  			     BTRFS_ROOT_TRANS_TAG);
476  	spin_unlock(&fs_info->fs_roots_radix_lock);
477  }
478  
479  int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
480  			       struct btrfs_root *root)
481  {
482  	struct btrfs_fs_info *fs_info = root->fs_info;
483  	int ret;
484  
485  	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
486  		return 0;
487  
488  	/*
489  	 * see record_root_in_trans for comments about IN_TRANS_SETUP usage
490  	 * and barriers
491  	 */
492  	smp_rmb();
493  	if (btrfs_get_root_last_trans(root) == trans->transid &&
494  	    !test_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state))
495  		return 0;
496  
497  	mutex_lock(&fs_info->reloc_mutex);
498  	ret = record_root_in_trans(trans, root, 0);
499  	mutex_unlock(&fs_info->reloc_mutex);
500  
501  	return ret;
502  }
503  
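/*
 * Illustrative sketch (hypothetical caller): any path about to dirty a
 * shareable root is expected to record it in the current transaction first,
 * so that the commit knows to update its root item and relocation root.  The
 * helper below is invented purely for the example.
 */
static int __maybe_unused example_dirty_subvolume(struct btrfs_trans_handle *trans,
						  struct btrfs_root *root)
{
	int ret;

	ret = btrfs_record_root_in_trans(trans, root);
	if (ret)
		return ret;

	/* ... now it is safe to COW blocks and insert/update items in @root ... */
	return 0;
}
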
504  static inline int is_transaction_blocked(struct btrfs_transaction *trans)
505  {
506  	return (trans->state >= TRANS_STATE_COMMIT_START &&
507  		trans->state < TRANS_STATE_UNBLOCKED &&
508  		!TRANS_ABORTED(trans));
509  }
510  
511  /* Wait for commit against the current transaction to become unblocked.
512   * When this is done, it is safe to start a new transaction, but the current
513   * transaction might not be fully on disk.
514   */
515  static void wait_current_trans(struct btrfs_fs_info *fs_info)
516  {
517  	struct btrfs_transaction *cur_trans;
518  
519  	spin_lock(&fs_info->trans_lock);
520  	cur_trans = fs_info->running_transaction;
521  	if (cur_trans && is_transaction_blocked(cur_trans)) {
522  		refcount_inc(&cur_trans->use_count);
523  		spin_unlock(&fs_info->trans_lock);
524  
525  		btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
526  		wait_event(fs_info->transaction_wait,
527  			   cur_trans->state >= TRANS_STATE_UNBLOCKED ||
528  			   TRANS_ABORTED(cur_trans));
529  		btrfs_put_transaction(cur_trans);
530  	} else {
531  		spin_unlock(&fs_info->trans_lock);
532  	}
533  }
534  
535  static int may_wait_transaction(struct btrfs_fs_info *fs_info, int type)
536  {
537  	if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
538  		return 0;
539  
540  	if (type == TRANS_START)
541  		return 1;
542  
543  	return 0;
544  }
545  
546  static inline bool need_reserve_reloc_root(struct btrfs_root *root)
547  {
548  	struct btrfs_fs_info *fs_info = root->fs_info;
549  
550  	if (!fs_info->reloc_ctl ||
551  	    !test_bit(BTRFS_ROOT_SHAREABLE, &root->state) ||
552  	    btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID ||
553  	    root->reloc_root)
554  		return false;
555  
556  	return true;
557  }
558  
559  static int btrfs_reserve_trans_metadata(struct btrfs_fs_info *fs_info,
560  					enum btrfs_reserve_flush_enum flush,
561  					u64 num_bytes,
562  					u64 *delayed_refs_bytes)
563  {
564  	struct btrfs_space_info *si = fs_info->trans_block_rsv.space_info;
565  	u64 bytes = num_bytes + *delayed_refs_bytes;
566  	int ret;
567  
568  	/*
569  	 * We want to reserve all the bytes we may need all at once, so we only
570  	 * do 1 enospc flushing cycle per transaction start.
571  	 */
572  	ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
573  
574  	/*
575  	 * If we are an emergency flush, which can steal from the global block
576  	 * reserve, then attempt to not reserve space for the delayed refs, as
577  	 * we will consume space for them from the global block reserve.
578  	 */
579  	if (ret && flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
580  		bytes -= *delayed_refs_bytes;
581  		*delayed_refs_bytes = 0;
582  		ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
583  	}
584  
585  	return ret;
586  }
587  
588  static struct btrfs_trans_handle *
589  start_transaction(struct btrfs_root *root, unsigned int num_items,
590  		  unsigned int type, enum btrfs_reserve_flush_enum flush,
591  		  bool enforce_qgroups)
592  {
593  	struct btrfs_fs_info *fs_info = root->fs_info;
594  	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
595  	struct btrfs_block_rsv *trans_rsv = &fs_info->trans_block_rsv;
596  	struct btrfs_trans_handle *h;
597  	struct btrfs_transaction *cur_trans;
598  	u64 num_bytes = 0;
599  	u64 qgroup_reserved = 0;
600  	u64 delayed_refs_bytes = 0;
601  	bool reloc_reserved = false;
602  	bool do_chunk_alloc = false;
603  	int ret;
604  
605  	if (BTRFS_FS_ERROR(fs_info))
606  		return ERR_PTR(-EROFS);
607  
608  	if (current->journal_info) {
609  		WARN_ON(type & TRANS_EXTWRITERS);
610  		h = current->journal_info;
611  		refcount_inc(&h->use_count);
612  		WARN_ON(refcount_read(&h->use_count) > 2);
613  		h->orig_rsv = h->block_rsv;
614  		h->block_rsv = NULL;
615  		goto got_it;
616  	}
617  
618  	/*
619  	 * Do the reservation before we join the transaction so we can do all
620  	 * the appropriate flushing if need be.
621  	 */
622  	if (num_items && root != fs_info->chunk_root) {
623  		qgroup_reserved = num_items * fs_info->nodesize;
624  		/*
625  		 * Use prealloc for now, as there might be a currently running
626  		 * transaction that could free this reserved space prematurely
627  		 * by committing.
628  		 */
629  		ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserved,
630  							 enforce_qgroups, false);
631  		if (ret)
632  			return ERR_PTR(ret);
633  
634  		num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items);
635  		/*
636  		 * If we plan to insert/update/delete "num_items" from a btree,
637  		 * we will also generate delayed refs for extent buffers in the
638  		 * respective btree paths, so reserve space for the delayed refs
639  		 * that will be generated by the caller as it modifies btrees.
640  		 * Try to reserve them to avoid excessive use of the global
641  		 * block reserve.
642  		 */
643  		delayed_refs_bytes = btrfs_calc_delayed_ref_bytes(fs_info, num_items);
644  
645  		/*
646  		 * Do the reservation for the relocation root creation
647  		 */
648  		if (need_reserve_reloc_root(root)) {
649  			num_bytes += fs_info->nodesize;
650  			reloc_reserved = true;
651  		}
652  
653  		ret = btrfs_reserve_trans_metadata(fs_info, flush, num_bytes,
654  						   &delayed_refs_bytes);
655  		if (ret)
656  			goto reserve_fail;
657  
658  		btrfs_block_rsv_add_bytes(trans_rsv, num_bytes, true);
659  
660  		if (trans_rsv->space_info->force_alloc)
661  			do_chunk_alloc = true;
662  	} else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL &&
663  		   !btrfs_block_rsv_full(delayed_refs_rsv)) {
664  		/*
665  		 * Some people call with btrfs_start_transaction(root, 0)
666  		 * because they can be throttled, but have some other mechanism
667  		 * for reserving space.  We still want these guys to refill the
668  		 * delayed block_rsv so just add one item's worth of reservation
669  		 * here.
670  		 */
671  		ret = btrfs_delayed_refs_rsv_refill(fs_info, flush);
672  		if (ret)
673  			goto reserve_fail;
674  	}
675  again:
676  	h = kmem_cache_zalloc(btrfs_trans_handle_cachep, GFP_NOFS);
677  	if (!h) {
678  		ret = -ENOMEM;
679  		goto alloc_fail;
680  	}
681  
682  	/*
683  	 * If we are JOIN_NOLOCK we're already committing a transaction and
684  	 * waiting on this guy, so we don't need to do the sb_start_intwrite
685  	 * because we're already holding a ref.  We need this because we could
686  	 * have raced in and did an fsync() on a file which can kick a commit
687  	 * and then we deadlock with somebody doing a freeze.
688  	 *
689  	 * If we are ATTACH, it means we just want to catch the current
690  	 * transaction and commit it, so we needn't do sb_start_intwrite().
691  	 */
692  	if (type & __TRANS_FREEZABLE)
693  		sb_start_intwrite(fs_info->sb);
694  
695  	if (may_wait_transaction(fs_info, type))
696  		wait_current_trans(fs_info);
697  
698  	do {
699  		ret = join_transaction(fs_info, type);
700  		if (ret == -EBUSY) {
701  			wait_current_trans(fs_info);
702  			if (unlikely(type == TRANS_ATTACH ||
703  				     type == TRANS_JOIN_NOSTART))
704  				ret = -ENOENT;
705  		}
706  	} while (ret == -EBUSY);
707  
708  	if (ret < 0)
709  		goto join_fail;
710  
711  	cur_trans = fs_info->running_transaction;
712  
713  	h->transid = cur_trans->transid;
714  	h->transaction = cur_trans;
715  	refcount_set(&h->use_count, 1);
716  	h->fs_info = root->fs_info;
717  
718  	h->type = type;
719  	INIT_LIST_HEAD(&h->new_bgs);
720  	btrfs_init_metadata_block_rsv(fs_info, &h->delayed_rsv, BTRFS_BLOCK_RSV_DELOPS);
721  
722  	smp_mb();
723  	if (cur_trans->state >= TRANS_STATE_COMMIT_START &&
724  	    may_wait_transaction(fs_info, type)) {
725  		current->journal_info = h;
726  		btrfs_commit_transaction(h);
727  		goto again;
728  	}
729  
730  	if (num_bytes) {
731  		trace_btrfs_space_reservation(fs_info, "transaction",
732  					      h->transid, num_bytes, 1);
733  		h->block_rsv = trans_rsv;
734  		h->bytes_reserved = num_bytes;
735  		if (delayed_refs_bytes > 0) {
736  			trace_btrfs_space_reservation(fs_info,
737  						      "local_delayed_refs_rsv",
738  						      h->transid,
739  						      delayed_refs_bytes, 1);
740  			h->delayed_refs_bytes_reserved = delayed_refs_bytes;
741  			btrfs_block_rsv_add_bytes(&h->delayed_rsv, delayed_refs_bytes, true);
742  			delayed_refs_bytes = 0;
743  		}
744  		h->reloc_reserved = reloc_reserved;
745  	}
746  
747  got_it:
748  	if (!current->journal_info)
749  		current->journal_info = h;
750  
751  	/*
752  	 * If the space_info is marked ALLOC_FORCE then we'll get upgraded to
753  	 * ALLOC_FORCE the first run through, and then we won't allocate for
754  	 * anybody else who races in later.  We don't care about the return
755  	 * value here.
756  	 */
757  	if (do_chunk_alloc && num_bytes) {
758  		u64 flags = h->block_rsv->space_info->flags;
759  
760  		btrfs_chunk_alloc(h, btrfs_get_alloc_profile(fs_info, flags),
761  				  CHUNK_ALLOC_NO_FORCE);
762  	}
763  
764  	/*
765  	 * btrfs_record_root_in_trans() needs to alloc new extents, and may
766  	 * call btrfs_join_transaction() while we're also starting a
767  	 * transaction.
768  	 *
769  	 * Thus it needs to be called after current->journal_info is initialized,
770  	 * or we can deadlock.
771  	 */
772  	ret = btrfs_record_root_in_trans(h, root);
773  	if (ret) {
774  		/*
775  		 * The transaction handle is fully initialized and linked with
776  		 * other structures so it needs to be ended in case of errors,
777  		 * not just freed.
778  		 */
779  		btrfs_end_transaction(h);
780  		goto reserve_fail;
781  	}
782  	/*
783  	 * Now that we have found a transaction to be a part of, convert the
784  	 * qgroup reservation from prealloc to pertrans. A different transaction
785  	 * can't race in and free our pertrans out from under us.
786  	 */
787  	if (qgroup_reserved)
788  		btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
789  
790  	return h;
791  
792  join_fail:
793  	if (type & __TRANS_FREEZABLE)
794  		sb_end_intwrite(fs_info->sb);
795  	kmem_cache_free(btrfs_trans_handle_cachep, h);
796  alloc_fail:
797  	if (num_bytes)
798  		btrfs_block_rsv_release(fs_info, trans_rsv, num_bytes, NULL);
799  	if (delayed_refs_bytes)
800  		btrfs_space_info_free_bytes_may_use(fs_info, trans_rsv->space_info,
801  						    delayed_refs_bytes);
802  reserve_fail:
803  	btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
804  	return ERR_PTR(ret);
805  }
806  
807  struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
808  						   unsigned int num_items)
809  {
810  	return start_transaction(root, num_items, TRANS_START,
811  				 BTRFS_RESERVE_FLUSH_ALL, true);
812  }
813  
814  struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
815  					struct btrfs_root *root,
816  					unsigned int num_items)
817  {
818  	return start_transaction(root, num_items, TRANS_START,
819  				 BTRFS_RESERVE_FLUSH_ALL_STEAL, false);
820  }
821  
822  struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
823  {
824  	return start_transaction(root, 0, TRANS_JOIN, BTRFS_RESERVE_NO_FLUSH,
825  				 true);
826  }
827  
828  struct btrfs_trans_handle *btrfs_join_transaction_spacecache(struct btrfs_root *root)
829  {
830  	return start_transaction(root, 0, TRANS_JOIN_NOLOCK,
831  				 BTRFS_RESERVE_NO_FLUSH, true);
832  }
833  
834  /*
835   * Similar to regular join but it never starts a transaction when none is
836   * running or when there's a running one at a state >= TRANS_STATE_UNBLOCKED.
837   * This is similar to btrfs_attach_transaction() but it allows the join to
838   * happen if the transaction commit already started but it's not yet in the
839   * "doing" phase (the state is < TRANS_STATE_COMMIT_DOING).
840   */
841  struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *root)
842  {
843  	return start_transaction(root, 0, TRANS_JOIN_NOSTART,
844  				 BTRFS_RESERVE_NO_FLUSH, true);
845  }
846  
847  /*
848   * Catch the running transaction.
849   *
850   * It is used when we want to commit the current transaction, but
851   * don't want to start a new one.
852   *
853   * Note: If this function returns -ENOENT, it just means there is no
854   * running transaction. But it is possible that the inactive transaction
855   * is still in memory, not fully on disk. If you hope there is no
856   * inactive transaction in the fs when -ENOENT is returned, you should
857   * invoke
858   *     btrfs_attach_transaction_barrier()
859   */
860  struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
861  {
862  	return start_transaction(root, 0, TRANS_ATTACH,
863  				 BTRFS_RESERVE_NO_FLUSH, true);
864  }
865  
866  /*
867   * Catch the running transaction.
868   *
869   * It is similar to the above function, the difference being that this
870   * one will wait for all inactive transactions until they fully
871   * complete.
872   */
873  struct btrfs_trans_handle *
874  btrfs_attach_transaction_barrier(struct btrfs_root *root)
875  {
876  	struct btrfs_trans_handle *trans;
877  
878  	trans = start_transaction(root, 0, TRANS_ATTACH,
879  				  BTRFS_RESERVE_NO_FLUSH, true);
880  	if (trans == ERR_PTR(-ENOENT)) {
881  		int ret;
882  
883  		ret = btrfs_wait_for_commit(root->fs_info, 0);
884  		if (ret)
885  			return ERR_PTR(ret);
886  	}
887  
888  	return trans;
889  }
890  
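/*
 * Illustrative sketch of how the two attach variants differ, in the spirit
 * of a sync-like path.  Plain attach returns -ENOENT when nothing is
 * running, which such a caller can treat as "nothing to commit"; the barrier
 * variant additionally waits for any previous, not yet fully completed
 * transaction.  The function below is hypothetical.
 */
static int __maybe_unused example_sync_fs(struct btrfs_root *root, bool wait)
{
	struct btrfs_trans_handle *trans;

	trans = wait ? btrfs_attach_transaction_barrier(root) :
		       btrfs_attach_transaction(root);
	if (IS_ERR(trans)) {
		/* No running transaction, so there is nothing to commit. */
		if (PTR_ERR(trans) == -ENOENT)
			return 0;
		return PTR_ERR(trans);
	}

	return btrfs_commit_transaction(trans);
}
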
891  /* Wait for a transaction commit to reach at least the given state. */
892  static noinline void wait_for_commit(struct btrfs_transaction *commit,
893  				     const enum btrfs_trans_state min_state)
894  {
895  	struct btrfs_fs_info *fs_info = commit->fs_info;
896  	u64 transid = commit->transid;
897  	bool put = false;
898  
899  	/*
900  	 * At the moment this function is called with min_state either being
901  	 * TRANS_STATE_COMPLETED or TRANS_STATE_SUPER_COMMITTED.
902  	 */
903  	if (min_state == TRANS_STATE_COMPLETED)
904  		btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED);
905  	else
906  		btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
907  
908  	while (1) {
909  		wait_event(commit->commit_wait, commit->state >= min_state);
910  		if (put)
911  			btrfs_put_transaction(commit);
912  
913  		if (min_state < TRANS_STATE_COMPLETED)
914  			break;
915  
916  		/*
917  		 * A transaction isn't really completed until all of the
918  		 * previous transactions are completed, but with fsync we can
919  		 * end up with SUPER_COMMITTED transactions before a COMPLETED
920  		 * transaction. Wait for those.
921  		 */
922  
923  		spin_lock(&fs_info->trans_lock);
924  		commit = list_first_entry_or_null(&fs_info->trans_list,
925  						  struct btrfs_transaction,
926  						  list);
927  		if (!commit || commit->transid > transid) {
928  			spin_unlock(&fs_info->trans_lock);
929  			break;
930  		}
931  		refcount_inc(&commit->use_count);
932  		put = true;
933  		spin_unlock(&fs_info->trans_lock);
934  	}
935  }
936  
937  int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
938  {
939  	struct btrfs_transaction *cur_trans = NULL, *t;
940  	int ret = 0;
941  
942  	if (transid) {
943  		if (transid <= btrfs_get_last_trans_committed(fs_info))
944  			goto out;
945  
946  		/* find specified transaction */
947  		spin_lock(&fs_info->trans_lock);
948  		list_for_each_entry(t, &fs_info->trans_list, list) {
949  			if (t->transid == transid) {
950  				cur_trans = t;
951  				refcount_inc(&cur_trans->use_count);
952  				ret = 0;
953  				break;
954  			}
955  			if (t->transid > transid) {
956  				ret = 0;
957  				break;
958  			}
959  		}
960  		spin_unlock(&fs_info->trans_lock);
961  
962  		/*
963  		 * The specified transaction doesn't exist, or we
964  		 * raced with btrfs_commit_transaction
965  		 */
966  		if (!cur_trans) {
967  			if (transid > btrfs_get_last_trans_committed(fs_info))
968  				ret = -EINVAL;
969  			goto out;
970  		}
971  	} else {
972  		/* find newest transaction that is committing | committed */
973  		spin_lock(&fs_info->trans_lock);
974  		list_for_each_entry_reverse(t, &fs_info->trans_list,
975  					    list) {
976  			if (t->state >= TRANS_STATE_COMMIT_START) {
977  				if (t->state == TRANS_STATE_COMPLETED)
978  					break;
979  				cur_trans = t;
980  				refcount_inc(&cur_trans->use_count);
981  				break;
982  			}
983  		}
984  		spin_unlock(&fs_info->trans_lock);
985  		if (!cur_trans)
986  			goto out;  /* nothing committing|committed */
987  	}
988  
989  	wait_for_commit(cur_trans, TRANS_STATE_COMPLETED);
990  	ret = cur_trans->aborted;
991  	btrfs_put_transaction(cur_trans);
992  out:
993  	return ret;
994  }
995  
996  void btrfs_throttle(struct btrfs_fs_info *fs_info)
997  {
998  	wait_current_trans(fs_info);
999  }
1000  
1001  bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
1002  {
1003  	struct btrfs_transaction *cur_trans = trans->transaction;
1004  
1005  	if (cur_trans->state >= TRANS_STATE_COMMIT_START ||
1006  	    test_bit(BTRFS_DELAYED_REFS_FLUSHING, &cur_trans->delayed_refs.flags))
1007  		return true;
1008  
1009  	if (btrfs_check_space_for_delayed_refs(trans->fs_info))
1010  		return true;
1011  
1012  	return !!btrfs_block_rsv_check(&trans->fs_info->global_block_rsv, 50);
1013  }
1014  
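/*
 * Illustrative sketch of how a long running operation is expected to use the
 * helper above: periodically check whether the handle should be ended (for
 * example because a commit started or delayed refs piled up), end it, and
 * start a fresh one before continuing.  The loop body and item count are
 * hypothetical.
 */
static int __maybe_unused example_long_running_op(struct btrfs_root *root)
{
	struct btrfs_trans_handle *trans;
	bool done = false;
	int ret;

	trans = btrfs_start_transaction(root, 1);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	while (!done) {
		/* ... do one unit of work, setting @done once finished ... */

		if (!done && btrfs_should_end_transaction(trans)) {
			ret = btrfs_end_transaction(trans);
			if (ret)
				return ret;
			trans = btrfs_start_transaction(root, 1);
			if (IS_ERR(trans))
				return PTR_ERR(trans);
		}
	}

	return btrfs_end_transaction(trans);
}
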
1015  static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans)
1016  
1017  {
1018  	struct btrfs_fs_info *fs_info = trans->fs_info;
1019  
1020  	if (!trans->block_rsv) {
1021  		ASSERT(!trans->bytes_reserved);
1022  		ASSERT(!trans->delayed_refs_bytes_reserved);
1023  		return;
1024  	}
1025  
1026  	if (!trans->bytes_reserved) {
1027  		ASSERT(!trans->delayed_refs_bytes_reserved);
1028  		return;
1029  	}
1030  
1031  	ASSERT(trans->block_rsv == &fs_info->trans_block_rsv);
1032  	trace_btrfs_space_reservation(fs_info, "transaction",
1033  				      trans->transid, trans->bytes_reserved, 0);
1034  	btrfs_block_rsv_release(fs_info, trans->block_rsv,
1035  				trans->bytes_reserved, NULL);
1036  	trans->bytes_reserved = 0;
1037  
1038  	if (!trans->delayed_refs_bytes_reserved)
1039  		return;
1040  
1041  	trace_btrfs_space_reservation(fs_info, "local_delayed_refs_rsv",
1042  				      trans->transid,
1043  				      trans->delayed_refs_bytes_reserved, 0);
1044  	btrfs_block_rsv_release(fs_info, &trans->delayed_rsv,
1045  				trans->delayed_refs_bytes_reserved, NULL);
1046  	trans->delayed_refs_bytes_reserved = 0;
1047  }
1048  
1049  static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
1050  				   int throttle)
1051  {
1052  	struct btrfs_fs_info *info = trans->fs_info;
1053  	struct btrfs_transaction *cur_trans = trans->transaction;
1054  	int ret = 0;
1055  
1056  	if (refcount_read(&trans->use_count) > 1) {
1057  		refcount_dec(&trans->use_count);
1058  		trans->block_rsv = trans->orig_rsv;
1059  		return 0;
1060  	}
1061  
1062  	btrfs_trans_release_metadata(trans);
1063  	trans->block_rsv = NULL;
1064  
1065  	btrfs_create_pending_block_groups(trans);
1066  
1067  	btrfs_trans_release_chunk_metadata(trans);
1068  
1069  	if (trans->type & __TRANS_FREEZABLE)
1070  		sb_end_intwrite(info->sb);
1071  
1072  	WARN_ON(cur_trans != info->running_transaction);
1073  	WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
1074  	atomic_dec(&cur_trans->num_writers);
1075  	extwriter_counter_dec(cur_trans, trans->type);
1076  
1077  	cond_wake_up(&cur_trans->writer_wait);
1078  
1079  	btrfs_lockdep_release(info, btrfs_trans_num_extwriters);
1080  	btrfs_lockdep_release(info, btrfs_trans_num_writers);
1081  
1082  	btrfs_put_transaction(cur_trans);
1083  
1084  	if (current->journal_info == trans)
1085  		current->journal_info = NULL;
1086  
1087  	if (throttle)
1088  		btrfs_run_delayed_iputs(info);
1089  
1090  	if (TRANS_ABORTED(trans) || BTRFS_FS_ERROR(info)) {
1091  		wake_up_process(info->transaction_kthread);
1092  		if (TRANS_ABORTED(trans))
1093  			ret = trans->aborted;
1094  		else
1095  			ret = -EROFS;
1096  	}
1097  
1098  	kmem_cache_free(btrfs_trans_handle_cachep, trans);
1099  	return ret;
1100  }
1101  
1102  int btrfs_end_transaction(struct btrfs_trans_handle *trans)
1103  {
1104  	return __btrfs_end_transaction(trans, 0);
1105  }
1106  
1107  int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans)
1108  {
1109  	return __btrfs_end_transaction(trans, 1);
1110  }
1111  
1112  /*
1113   * when btree blocks are allocated, they have some corresponding bits set for
1114   * them in one of two extent_io trees.  This is used to make sure all of
1115   * those extents are sent to disk but does not wait on them
1116   */
1117  int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
1118  			       struct extent_io_tree *dirty_pages, int mark)
1119  {
1120  	int ret = 0;
1121  	struct address_space *mapping = fs_info->btree_inode->i_mapping;
1122  	struct extent_state *cached_state = NULL;
1123  	u64 start = 0;
1124  	u64 end;
1125  
1126  	while (find_first_extent_bit(dirty_pages, start, &start, &end,
1127  				     mark, &cached_state)) {
1128  		bool wait_writeback = false;
1129  
1130  		ret = convert_extent_bit(dirty_pages, start, end,
1131  					 EXTENT_NEED_WAIT,
1132  					 mark, &cached_state);
1133  		/*
1134  		 * convert_extent_bit can return -ENOMEM, which is most of the
1135  		 * time a temporary error. So when it happens, ignore the error
1136  		 * and wait for writeback of this range to finish - because we
1137  		 * failed to set the bit EXTENT_NEED_WAIT for the range, a call
1138  		 * to __btrfs_wait_marked_extents() would not know that
1139  		 * writeback for this range started and therefore wouldn't
1140  		 * wait for it to finish - we don't want to commit a
1141  		 * superblock that points to btree nodes/leafs for which
1142  		 * writeback hasn't finished yet (and without errors).
1143  		 * We cleanup any entries left in the io tree when committing
1144  		 * the transaction (through extent_io_tree_release()).
1145  		 */
1146  		if (ret == -ENOMEM) {
1147  			ret = 0;
1148  			wait_writeback = true;
1149  		}
1150  		if (!ret)
1151  			ret = filemap_fdatawrite_range(mapping, start, end);
1152  		if (!ret && wait_writeback)
1153  			ret = filemap_fdatawait_range(mapping, start, end);
1154  		free_extent_state(cached_state);
1155  		if (ret)
1156  			break;
1157  		cached_state = NULL;
1158  		cond_resched();
1159  		start = end + 1;
1160  	}
1161  	return ret;
1162  }
1163  
1164  /*
1165   * when btree blocks are allocated, they have some corresponding bits set for
1166   * them in one of two extent_io trees.  This is used to make sure all of
1167   * those extents are on disk for transaction or log commit.  We wait
1168   * on all the pages and clear them from the dirty pages state tree
1169   */
1170  static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info,
1171  				       struct extent_io_tree *dirty_pages)
1172  {
1173  	struct address_space *mapping = fs_info->btree_inode->i_mapping;
1174  	struct extent_state *cached_state = NULL;
1175  	u64 start = 0;
1176  	u64 end;
1177  	int ret = 0;
1178  
1179  	while (find_first_extent_bit(dirty_pages, start, &start, &end,
1180  				     EXTENT_NEED_WAIT, &cached_state)) {
1181  		/*
1182  		 * Ignore -ENOMEM errors returned by clear_extent_bit().
1183  		 * When committing the transaction, we'll remove any entries
1184  		 * left in the io tree. For a log commit, we don't remove them
1185  		 * after committing the log because the tree can be accessed
1186  		 * concurrently - we do it only at transaction commit time when
1187  		 * it's safe to do it (through extent_io_tree_release()).
1188  		 */
1189  		ret = clear_extent_bit(dirty_pages, start, end,
1190  				       EXTENT_NEED_WAIT, &cached_state);
1191  		if (ret == -ENOMEM)
1192  			ret = 0;
1193  		if (!ret)
1194  			ret = filemap_fdatawait_range(mapping, start, end);
1195  		free_extent_state(cached_state);
1196  		if (ret)
1197  			break;
1198  		cached_state = NULL;
1199  		cond_resched();
1200  		start = end + 1;
1201  	}
1202  	return ret;
1203  }
1204  
1205  static int btrfs_wait_extents(struct btrfs_fs_info *fs_info,
1206  		       struct extent_io_tree *dirty_pages)
1207  {
1208  	bool errors = false;
1209  	int err;
1210  
1211  	err = __btrfs_wait_marked_extents(fs_info, dirty_pages);
1212  	if (test_and_clear_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags))
1213  		errors = true;
1214  
1215  	if (errors && !err)
1216  		err = -EIO;
1217  	return err;
1218  }
1219  
1220  int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark)
1221  {
1222  	struct btrfs_fs_info *fs_info = log_root->fs_info;
1223  	struct extent_io_tree *dirty_pages = &log_root->dirty_log_pages;
1224  	bool errors = false;
1225  	int err;
1226  
1227  	ASSERT(btrfs_root_id(log_root) == BTRFS_TREE_LOG_OBJECTID);
1228  
1229  	err = __btrfs_wait_marked_extents(fs_info, dirty_pages);
1230  	if ((mark & EXTENT_DIRTY) &&
1231  	    test_and_clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags))
1232  		errors = true;
1233  
1234  	if ((mark & EXTENT_NEW) &&
1235  	    test_and_clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags))
1236  		errors = true;
1237  
1238  	if (errors && !err)
1239  		err = -EIO;
1240  	return err;
1241  }
1242  
1243  /*
1244   * When btree blocks are allocated the corresponding extents are marked dirty.
1245   * This function ensures such extents are persisted on disk for transaction or
1246   * log commit.
1247   *
1248   * @trans: transaction whose dirty pages we'd like to write
1249   */
1250  static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans)
1251  {
1252  	int ret;
1253  	int ret2;
1254  	struct extent_io_tree *dirty_pages = &trans->transaction->dirty_pages;
1255  	struct btrfs_fs_info *fs_info = trans->fs_info;
1256  	struct blk_plug plug;
1257  
1258  	blk_start_plug(&plug);
1259  	ret = btrfs_write_marked_extents(fs_info, dirty_pages, EXTENT_DIRTY);
1260  	blk_finish_plug(&plug);
1261  	ret2 = btrfs_wait_extents(fs_info, dirty_pages);
1262  
1263  	extent_io_tree_release(&trans->transaction->dirty_pages);
1264  
1265  	if (ret)
1266  		return ret;
1267  	else if (ret2)
1268  		return ret2;
1269  	else
1270  		return 0;
1271  }
1272  
1273  /*
1274   * this is used to update the root pointer in the tree of tree roots.
1275   *
1276   * But, in the case of the extent allocation tree, updating the root
1277   * pointer may allocate blocks which may change the root of the extent
1278   * allocation tree.
1279   *
1280   * So, this loops and repeats and makes sure the cowonly root didn't
1281   * change while the root pointer was being updated in the metadata.
1282   */
1283  static int update_cowonly_root(struct btrfs_trans_handle *trans,
1284  			       struct btrfs_root *root)
1285  {
1286  	int ret;
1287  	u64 old_root_bytenr;
1288  	u64 old_root_used;
1289  	struct btrfs_fs_info *fs_info = root->fs_info;
1290  	struct btrfs_root *tree_root = fs_info->tree_root;
1291  
1292  	old_root_used = btrfs_root_used(&root->root_item);
1293  
1294  	while (1) {
1295  		old_root_bytenr = btrfs_root_bytenr(&root->root_item);
1296  		if (old_root_bytenr == root->node->start &&
1297  		    old_root_used == btrfs_root_used(&root->root_item))
1298  			break;
1299  
1300  		btrfs_set_root_node(&root->root_item, root->node);
1301  		ret = btrfs_update_root(trans, tree_root,
1302  					&root->root_key,
1303  					&root->root_item);
1304  		if (ret)
1305  			return ret;
1306  
1307  		old_root_used = btrfs_root_used(&root->root_item);
1308  	}
1309  
1310  	return 0;
1311  }
1312  
1313  /*
1314   * update all the cowonly tree roots on disk
1315   *
1316   * The error handling in this function may not be obvious. Any of the
1317   * failures will cause the file system to go offline. We still need
1318   * to clean up the delayed refs.
1319   */
1320  static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans)
1321  {
1322  	struct btrfs_fs_info *fs_info = trans->fs_info;
1323  	struct list_head *dirty_bgs = &trans->transaction->dirty_bgs;
1324  	struct list_head *io_bgs = &trans->transaction->io_bgs;
1325  	struct list_head *next;
1326  	struct extent_buffer *eb;
1327  	int ret;
1328  
1329  	/*
1330  	 * At this point no one can be using this transaction to modify any tree
1331  	 * and no one can start another transaction to modify any tree either.
1332  	 */
1333  	ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING);
1334  
1335  	eb = btrfs_lock_root_node(fs_info->tree_root);
1336  	ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
1337  			      0, &eb, BTRFS_NESTING_COW);
1338  	btrfs_tree_unlock(eb);
1339  	free_extent_buffer(eb);
1340  
1341  	if (ret)
1342  		return ret;
1343  
1344  	ret = btrfs_run_dev_stats(trans);
1345  	if (ret)
1346  		return ret;
1347  	ret = btrfs_run_dev_replace(trans);
1348  	if (ret)
1349  		return ret;
1350  	ret = btrfs_run_qgroups(trans);
1351  	if (ret)
1352  		return ret;
1353  
1354  	ret = btrfs_setup_space_cache(trans);
1355  	if (ret)
1356  		return ret;
1357  
1358  again:
1359  	while (!list_empty(&fs_info->dirty_cowonly_roots)) {
1360  		struct btrfs_root *root;
1361  		next = fs_info->dirty_cowonly_roots.next;
1362  		list_del_init(next);
1363  		root = list_entry(next, struct btrfs_root, dirty_list);
1364  		clear_bit(BTRFS_ROOT_DIRTY, &root->state);
1365  
1366  		list_add_tail(&root->dirty_list,
1367  			      &trans->transaction->switch_commits);
1368  		ret = update_cowonly_root(trans, root);
1369  		if (ret)
1370  			return ret;
1371  	}
1372  
1373  	/* Now flush any delayed refs generated by updating all of the roots */
1374  	ret = btrfs_run_delayed_refs(trans, U64_MAX);
1375  	if (ret)
1376  		return ret;
1377  
1378  	while (!list_empty(dirty_bgs) || !list_empty(io_bgs)) {
1379  		ret = btrfs_write_dirty_block_groups(trans);
1380  		if (ret)
1381  			return ret;
1382  
1383  		/*
1384  		 * We're writing the dirty block groups, which could generate
1385  		 * delayed refs, which could generate more dirty block groups,
1386  		 * so we want to keep this flushing in this loop to make sure
1387  		 * everything gets run.
1388  		 */
1389  		ret = btrfs_run_delayed_refs(trans, U64_MAX);
1390  		if (ret)
1391  			return ret;
1392  	}
1393  
1394  	if (!list_empty(&fs_info->dirty_cowonly_roots))
1395  		goto again;
1396  
1397  	/* Update dev-replace pointer once everything is committed */
1398  	fs_info->dev_replace.committed_cursor_left =
1399  		fs_info->dev_replace.cursor_left_last_write_of_item;
1400  
1401  	return 0;
1402  }
1403  
1404  /*
1405   * If we had a pending drop we need to see if there are any others left in our
1406   * dead roots list, and if not clear our bit and wake any waiters.
1407   */
1408  void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
1409  {
1410  	/*
1411  	 * We put the drop in progress roots at the front of the list, so if the
1412  	 * first entry doesn't have UNFINISHED_DROP set we can wake everybody
1413  	 * up.
1414  	 */
1415  	spin_lock(&fs_info->trans_lock);
1416  	if (!list_empty(&fs_info->dead_roots)) {
1417  		struct btrfs_root *root = list_first_entry(&fs_info->dead_roots,
1418  							   struct btrfs_root,
1419  							   root_list);
1420  		if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state)) {
1421  			spin_unlock(&fs_info->trans_lock);
1422  			return;
1423  		}
1424  	}
1425  	spin_unlock(&fs_info->trans_lock);
1426  
1427  	btrfs_wake_unfinished_drop(fs_info);
1428  }
1429  
1430  /*
1431   * Dead roots are old snapshots that need to be deleted.  This grabs a
1432   * reference on the root and adds it to the list of dead roots that need to
1433   * be deleted.
1434   */
1435  void btrfs_add_dead_root(struct btrfs_root *root)
1436  {
1437  	struct btrfs_fs_info *fs_info = root->fs_info;
1438  
1439  	spin_lock(&fs_info->trans_lock);
1440  	if (list_empty(&root->root_list)) {
1441  		btrfs_grab_root(root);
1442  
1443  		/* We want to process the partially complete drops first. */
1444  		if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state))
1445  			list_add(&root->root_list, &fs_info->dead_roots);
1446  		else
1447  			list_add_tail(&root->root_list, &fs_info->dead_roots);
1448  	}
1449  	spin_unlock(&fs_info->trans_lock);
1450  }
1451  
1452  /*
1453   * Update each subvolume root and its relocation root, if it exists, in the tree
1454   * of tree roots. Also free log roots if they exist.
1455   */
1456  static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
1457  {
1458  	struct btrfs_fs_info *fs_info = trans->fs_info;
1459  	struct btrfs_root *gang[8];
1460  	int i;
1461  	int ret;
1462  
1463  	/*
1464  	 * At this point no one can be using this transaction to modify any tree
1465  	 * and no one can start another transaction to modify any tree either.
1466  	 */
1467  	ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING);
1468  
1469  	spin_lock(&fs_info->fs_roots_radix_lock);
1470  	while (1) {
1471  		ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
1472  						 (void **)gang, 0,
1473  						 ARRAY_SIZE(gang),
1474  						 BTRFS_ROOT_TRANS_TAG);
1475  		if (ret == 0)
1476  			break;
1477  		for (i = 0; i < ret; i++) {
1478  			struct btrfs_root *root = gang[i];
1479  			int ret2;
1480  
1481  			/*
1482  			 * At this point we can neither have tasks logging inodes
1483  			 * from a root nor trying to commit a log tree.
1484  			 */
1485  			ASSERT(atomic_read(&root->log_writers) == 0);
1486  			ASSERT(atomic_read(&root->log_commit[0]) == 0);
1487  			ASSERT(atomic_read(&root->log_commit[1]) == 0);
1488  
1489  			radix_tree_tag_clear(&fs_info->fs_roots_radix,
1490  					(unsigned long)btrfs_root_id(root),
1491  					BTRFS_ROOT_TRANS_TAG);
1492  			btrfs_qgroup_free_meta_all_pertrans(root);
1493  			spin_unlock(&fs_info->fs_roots_radix_lock);
1494  
1495  			btrfs_free_log(trans, root);
1496  			ret2 = btrfs_update_reloc_root(trans, root);
1497  			if (ret2)
1498  				return ret2;
1499  
1500  			/* see comments in should_cow_block() */
1501  			clear_bit(BTRFS_ROOT_FORCE_COW, &root->state);
1502  			smp_mb__after_atomic();
1503  
1504  			if (root->commit_root != root->node) {
1505  				list_add_tail(&root->dirty_list,
1506  					&trans->transaction->switch_commits);
1507  				btrfs_set_root_node(&root->root_item,
1508  						    root->node);
1509  			}
1510  
1511  			ret2 = btrfs_update_root(trans, fs_info->tree_root,
1512  						&root->root_key,
1513  						&root->root_item);
1514  			if (ret2)
1515  				return ret2;
1516  			spin_lock(&fs_info->fs_roots_radix_lock);
1517  		}
1518  	}
1519  	spin_unlock(&fs_info->fs_roots_radix_lock);
1520  	return 0;
1521  }
1522  
1523  /*
1524   * Do all the special snapshot related qgroup dirty hacks.
1525   *
1526   * Will do all needed qgroup inherit and dirty hacks, like switching commit
1527   * roots inside one transaction and writing all btrees to disk, to make
1528   * qgroups work.
1529   */
1530  static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
1531  				   struct btrfs_root *src,
1532  				   struct btrfs_root *parent,
1533  				   struct btrfs_qgroup_inherit *inherit,
1534  				   u64 dst_objectid)
1535  {
1536  	struct btrfs_fs_info *fs_info = src->fs_info;
1537  	int ret;
1538  
1539  	/*
1540  	 * Save some performance in the case that qgroups are not enabled. If
1541  	 * this check races with the ioctl, rescan will kick in anyway.
1542  	 */
1543  	if (!btrfs_qgroup_full_accounting(fs_info))
1544  		return 0;
1545  
1546  	/*
1547  	 * Ensure dirty @src will be committed.  Otherwise, after the upcoming
1548  	 * commit_fs_roots() and switch_commit_roots(), any dirty but not
1549  	 * recorded root will never be updated again, leaving an outdated root
1550  	 * item.
1551  	 */
1552  	ret = record_root_in_trans(trans, src, 1);
1553  	if (ret)
1554  		return ret;
1555  
1556  	/*
1557  	 * btrfs_qgroup_inherit relies on a consistent view of the usage for the
1558  	 * src root, so we must run the delayed refs here.
1559  	 *
1560  	 * However this isn't particularly foolproof, because there's no
1561  	 * synchronization keeping us from changing the tree after this point
1562  	 * before we do the qgroup_inherit, or even from making changes while
1563  	 * we're doing the qgroup_inherit.  But that's a problem for the future,
1564  	 * for now flush the delayed refs to narrow the race window where the
1565  	 * qgroup counters could end up wrong.
1566  	 */
1567  	ret = btrfs_run_delayed_refs(trans, U64_MAX);
1568  	if (ret) {
1569  		btrfs_abort_transaction(trans, ret);
1570  		return ret;
1571  	}
1572  
1573  	ret = commit_fs_roots(trans);
1574  	if (ret)
1575  		goto out;
1576  	ret = btrfs_qgroup_account_extents(trans);
1577  	if (ret < 0)
1578  		goto out;
1579  
1580  	/* Now the qgroups are all updated, we can inherit them to the new qgroup */
1581  	ret = btrfs_qgroup_inherit(trans, btrfs_root_id(src), dst_objectid,
1582  				   btrfs_root_id(parent), inherit);
1583  	if (ret < 0)
1584  		goto out;
1585  
1586  	/*
1587  	 * Now we do a simplified commit transaction, which will:
1588  	 * 1) commit all subvolume and extent trees
1589  	 *    To ensure all subvolume and extent trees have a valid commit_root
1590  	 *    for accounting the later insert_dir_item()
1591  	 * 2) write all btree blocks onto disk
1592  	 *    This is to make sure later btree modifications will be COWed,
1593  	 *    otherwise the commit_root could be repopulated and give wrong
1594  	 *    qgroup numbers
1595  	 * We don't care about other trees (chunk, root) here as they don't affect
1596  	 * qgroups, and we don't write the super, to avoid a half committed state.
1597  	 */
1598  	ret = commit_cowonly_roots(trans);
1599  	if (ret)
1600  		goto out;
1601  	switch_commit_roots(trans);
1602  	ret = btrfs_write_and_wait_transaction(trans);
1603  	if (ret)
1604  		btrfs_handle_fs_error(fs_info, ret,
1605  			"Error while writing out transaction for qgroup");
1606  
1607  out:
1608  	/*
1609  	 * Force the parent root to be updated, as we recorded it before so its
1610  	 * last_trans == cur_transid.
1611  	 * Otherwise it won't be committed to disk again after the later
1612  	 * insert_dir_item().
1613  	 */
1614  	if (!ret)
1615  		ret = record_root_in_trans(trans, parent, 1);
1616  	return ret;
1617  }
1618  
1619  /*
1620   * new snapshots need to be created at a very specific time in the
1621   * transaction commit.  This does the actual creation.
1622   *
1623   * Note:
1624   * If an error happens that may affect the commit of the current transaction,
1625   * we return that error. If an error only affects the creation of the pending
1626   * snapshots, we just return 0.
1627   */
1628  static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1629  				   struct btrfs_pending_snapshot *pending)
1630  {
1631  
1632  	struct btrfs_fs_info *fs_info = trans->fs_info;
1633  	struct btrfs_key key;
1634  	struct btrfs_root_item *new_root_item;
1635  	struct btrfs_root *tree_root = fs_info->tree_root;
1636  	struct btrfs_root *root = pending->root;
1637  	struct btrfs_root *parent_root;
1638  	struct btrfs_block_rsv *rsv;
1639  	struct inode *parent_inode = &pending->dir->vfs_inode;
1640  	struct btrfs_path *path;
1641  	struct btrfs_dir_item *dir_item;
1642  	struct extent_buffer *tmp;
1643  	struct extent_buffer *old;
1644  	struct timespec64 cur_time;
1645  	int ret = 0;
1646  	u64 to_reserve = 0;
1647  	u64 index = 0;
1648  	u64 objectid;
1649  	u64 root_flags;
1650  	unsigned int nofs_flags;
1651  	struct fscrypt_name fname;
1652  
1653  	ASSERT(pending->path);
1654  	path = pending->path;
1655  
1656  	ASSERT(pending->root_item);
1657  	new_root_item = pending->root_item;
1658  
1659  	/*
1660  	 * We're inside a transaction and must make sure that any potential
1661  	 * allocations with GFP_KERNEL in fscrypt won't recurse back to
1662  	 * filesystem.
1663  	 */
1664  	nofs_flags = memalloc_nofs_save();
1665  	pending->error = fscrypt_setup_filename(parent_inode,
1666  						&pending->dentry->d_name, 0,
1667  						&fname);
1668  	memalloc_nofs_restore(nofs_flags);
1669  	if (pending->error)
1670  		goto free_pending;
1671  
1672  	pending->error = btrfs_get_free_objectid(tree_root, &objectid);
1673  	if (pending->error)
1674  		goto free_fname;
1675  
1676  	/*
1677  	 * Make the qgroup code skip the new snapshot's qgroupid, as it is
1678  	 * accounted for by the later btrfs_qgroup_inherit().
1679  	 */
1680  	btrfs_set_skip_qgroup(trans, objectid);
1681  
1682  	btrfs_reloc_pre_snapshot(pending, &to_reserve);
1683  
1684  	if (to_reserve > 0) {
1685  		pending->error = btrfs_block_rsv_add(fs_info,
1686  						     &pending->block_rsv,
1687  						     to_reserve,
1688  						     BTRFS_RESERVE_NO_FLUSH);
1689  		if (pending->error)
1690  			goto clear_skip_qgroup;
1691  	}
1692  
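	/*
	 * Key for the new root item; the offset is adjusted later (transid for
	 * the root item insertion, -1 for the directory entry).
	 */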
1693  	key.objectid = objectid;
1694  	key.offset = (u64)-1;
1695  	key.type = BTRFS_ROOT_ITEM_KEY;
1696  
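	/*
	 * Charge metadata allocations for the snapshot creation to the pending
	 * snapshot's block reserve; the original reserve is restored in the
	 * exit paths below.
	 */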
1697  	rsv = trans->block_rsv;
1698  	trans->block_rsv = &pending->block_rsv;
1699  	trans->bytes_reserved = trans->block_rsv->reserved;
1700  	trace_btrfs_space_reservation(fs_info, "transaction",
1701  				      trans->transid,
1702  				      trans->bytes_reserved, 1);
1703  	parent_root = BTRFS_I(parent_inode)->root;
1704  	ret = record_root_in_trans(trans, parent_root, 0);
1705  	if (ret)
1706  		goto fail;
1707  	cur_time = current_time(parent_inode);
1708  
1709  	/*
1710  	 * insert the directory item
1711  	 */
1712  	ret = btrfs_set_inode_index(BTRFS_I(parent_inode), &index);
1713  	if (ret) {
1714  		btrfs_abort_transaction(trans, ret);
1715  		goto fail;
1716  	}
1717  
1718  	/* check if there is a file/dir which has the same name. */
1719  	dir_item = btrfs_lookup_dir_item(NULL, parent_root, path,
1720  					 btrfs_ino(BTRFS_I(parent_inode)),
1721  					 &fname.disk_name, 0);
1722  	if (dir_item != NULL && !IS_ERR(dir_item)) {
1723  		pending->error = -EEXIST;
1724  		goto dir_item_existed;
1725  	} else if (IS_ERR(dir_item)) {
1726  		ret = PTR_ERR(dir_item);
1727  		btrfs_abort_transaction(trans, ret);
1728  		goto fail;
1729  	}
1730  	btrfs_release_path(path);
1731  
1732  	ret = btrfs_create_qgroup(trans, objectid);
1733  	if (ret && ret != -EEXIST) {
1734  		btrfs_abort_transaction(trans, ret);
1735  		goto fail;
1736  	}
1737  
1738  	/*
1739  	 * pull in the delayed directory update
1740  	 * and the delayed inode item
1741  	 * otherwise we corrupt the FS during
1742  	 * snapshot
1743  	 */
1744  	ret = btrfs_run_delayed_items(trans);
1745  	if (ret) {	/* Transaction aborted */
1746  		btrfs_abort_transaction(trans, ret);
1747  		goto fail;
1748  	}
1749  
1750  	ret = record_root_in_trans(trans, root, 0);
1751  	if (ret) {
1752  		btrfs_abort_transaction(trans, ret);
1753  		goto fail;
1754  	}
1755  	btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
1756  	memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
1757  	btrfs_check_and_init_root_item(new_root_item);
1758  
1759  	root_flags = btrfs_root_flags(new_root_item);
1760  	if (pending->readonly)
1761  		root_flags |= BTRFS_ROOT_SUBVOL_RDONLY;
1762  	else
1763  		root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
1764  	btrfs_set_root_flags(new_root_item, root_flags);
1765  
1766  	btrfs_set_root_generation_v2(new_root_item,
1767  			trans->transid);
1768  	generate_random_guid(new_root_item->uuid);
1769  	memcpy(new_root_item->parent_uuid, root->root_item.uuid,
1770  			BTRFS_UUID_SIZE);
1771  	if (!(root_flags & BTRFS_ROOT_SUBVOL_RDONLY)) {
1772  		memset(new_root_item->received_uuid, 0,
1773  		       sizeof(new_root_item->received_uuid));
1774  		memset(&new_root_item->stime, 0, sizeof(new_root_item->stime));
1775  		memset(&new_root_item->rtime, 0, sizeof(new_root_item->rtime));
1776  		btrfs_set_root_stransid(new_root_item, 0);
1777  		btrfs_set_root_rtransid(new_root_item, 0);
1778  	}
1779  	btrfs_set_stack_timespec_sec(&new_root_item->otime, cur_time.tv_sec);
1780  	btrfs_set_stack_timespec_nsec(&new_root_item->otime, cur_time.tv_nsec);
1781  	btrfs_set_root_otransid(new_root_item, trans->transid);
1782  
1783  	old = btrfs_lock_root_node(root);
1784  	ret = btrfs_cow_block(trans, root, old, NULL, 0, &old,
1785  			      BTRFS_NESTING_COW);
1786  	if (ret) {
1787  		btrfs_tree_unlock(old);
1788  		free_extent_buffer(old);
1789  		btrfs_abort_transaction(trans, ret);
1790  		goto fail;
1791  	}
1792  
1793  	ret = btrfs_copy_root(trans, root, old, &tmp, objectid);
1794  	/* clean up in any case */
1795  	btrfs_tree_unlock(old);
1796  	free_extent_buffer(old);
1797  	if (ret) {
1798  		btrfs_abort_transaction(trans, ret);
1799  		goto fail;
1800  	}
1801  	/* see comments in should_cow_block() */
1802  	set_bit(BTRFS_ROOT_FORCE_COW, &root->state);
1803  	smp_wmb();
1804  
1805  	btrfs_set_root_node(new_root_item, tmp);
1806  	/* record when the snapshot was created in key.offset */
1807  	key.offset = trans->transid;
1808  	ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
1809  	btrfs_tree_unlock(tmp);
1810  	free_extent_buffer(tmp);
1811  	if (ret) {
1812  		btrfs_abort_transaction(trans, ret);
1813  		goto fail;
1814  	}
1815  
1816  	/*
1817  	 * insert root back/forward references
1818  	 */
1819  	ret = btrfs_add_root_ref(trans, objectid,
1820  				 btrfs_root_id(parent_root),
1821  				 btrfs_ino(BTRFS_I(parent_inode)), index,
1822  				 &fname.disk_name);
1823  	if (ret) {
1824  		btrfs_abort_transaction(trans, ret);
1825  		goto fail;
1826  	}
1827  
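	/*
	 * Use -1 as the root item offset in the key so the new directory entry
	 * points at the latest version of the root.
	 */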
1828  	key.offset = (u64)-1;
1829  	pending->snap = btrfs_get_new_fs_root(fs_info, objectid, &pending->anon_dev);
1830  	if (IS_ERR(pending->snap)) {
1831  		ret = PTR_ERR(pending->snap);
1832  		pending->snap = NULL;
1833  		btrfs_abort_transaction(trans, ret);
1834  		goto fail;
1835  	}
1836  
1837  	ret = btrfs_reloc_post_snapshot(trans, pending);
1838  	if (ret) {
1839  		btrfs_abort_transaction(trans, ret);
1840  		goto fail;
1841  	}
1842  
1843  	/*
1844  	 * Do the special qgroup accounting for this snapshot, as we use a
1845  	 * qgroup hack to make snapshot creation fast.
1846  	 * We have to cooperate with that hack here, otherwise the snapshot
1847  	 * would be greatly slowed down by a subtree qgroup rescan.
1848  	 */
1849  	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL)
1850  		ret = qgroup_account_snapshot(trans, root, parent_root,
1851  					      pending->inherit, objectid);
1852  	else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
1853  		ret = btrfs_qgroup_inherit(trans, btrfs_root_id(root), objectid,
1854  					   btrfs_root_id(parent_root), pending->inherit);
1855  	if (ret < 0)
1856  		goto fail;
1857  
1858  	ret = btrfs_insert_dir_item(trans, &fname.disk_name,
1859  				    BTRFS_I(parent_inode), &key, BTRFS_FT_DIR,
1860  				    index);
1861  	if (ret) {
1862  		btrfs_abort_transaction(trans, ret);
1863  		goto fail;
1864  	}
1865  
1866  	btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size +
1867  						  fname.disk_name.len * 2);
1868  	inode_set_mtime_to_ts(parent_inode,
1869  			      inode_set_ctime_current(parent_inode));
1870  	ret = btrfs_update_inode_fallback(trans, BTRFS_I(parent_inode));
1871  	if (ret) {
1872  		btrfs_abort_transaction(trans, ret);
1873  		goto fail;
1874  	}
1875  	ret = btrfs_uuid_tree_add(trans, new_root_item->uuid,
1876  				  BTRFS_UUID_KEY_SUBVOL,
1877  				  objectid);
1878  	if (ret) {
1879  		btrfs_abort_transaction(trans, ret);
1880  		goto fail;
1881  	}
1882  	if (!btrfs_is_empty_uuid(new_root_item->received_uuid)) {
1883  		ret = btrfs_uuid_tree_add(trans, new_root_item->received_uuid,
1884  					  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
1885  					  objectid);
1886  		if (ret && ret != -EEXIST) {
1887  			btrfs_abort_transaction(trans, ret);
1888  			goto fail;
1889  		}
1890  	}
1891  
1892  fail:
1893  	pending->error = ret;
1894  dir_item_existed:
1895  	trans->block_rsv = rsv;
1896  	trans->bytes_reserved = 0;
1897  clear_skip_qgroup:
1898  	btrfs_clear_skip_qgroup(trans);
1899  free_fname:
1900  	fscrypt_free_filename(&fname);
1901  free_pending:
1902  	kfree(new_root_item);
1903  	pending->root_item = NULL;
1904  	btrfs_free_path(path);
1905  	pending->path = NULL;
1906  
1907  	return ret;
1908  }
1909  
1910  /*
1911   * create all the snapshots we've scheduled for creation
1912   */
1913  static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans)
1914  {
1915  	struct btrfs_pending_snapshot *pending, *next;
1916  	struct list_head *head = &trans->transaction->pending_snapshots;
1917  	int ret = 0;
1918  
1919  	list_for_each_entry_safe(pending, next, head, list) {
1920  		list_del(&pending->list);
1921  		ret = create_pending_snapshot(trans, pending);
1922  		if (ret)
1923  			break;
1924  	}
1925  	return ret;
1926  }
1927  
1928  static void update_super_roots(struct btrfs_fs_info *fs_info)
1929  {
1930  	struct btrfs_root_item *root_item;
1931  	struct btrfs_super_block *super;
1932  
1933  	super = fs_info->super_copy;
1934  
1935  	root_item = &fs_info->chunk_root->root_item;
1936  	super->chunk_root = root_item->bytenr;
1937  	super->chunk_root_generation = root_item->generation;
1938  	super->chunk_root_level = root_item->level;
1939  
1940  	root_item = &fs_info->tree_root->root_item;
1941  	super->root = root_item->bytenr;
1942  	super->generation = root_item->generation;
1943  	super->root_level = root_item->level;
1944  	if (btrfs_test_opt(fs_info, SPACE_CACHE))
1945  		super->cache_generation = root_item->generation;
1946  	else if (test_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags))
1947  		super->cache_generation = 0;
1948  	if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags))
1949  		super->uuid_tree_generation = root_item->generation;
1950  }
1951  
1952  int btrfs_transaction_blocked(struct btrfs_fs_info *info)
1953  {
1954  	struct btrfs_transaction *trans;
1955  	int ret = 0;
1956  
1957  	spin_lock(&info->trans_lock);
1958  	trans = info->running_transaction;
1959  	if (trans)
1960  		ret = is_transaction_blocked(trans);
1961  	spin_unlock(&info->trans_lock);
1962  	return ret;
1963  }
1964  
1965  void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans)
1966  {
1967  	struct btrfs_fs_info *fs_info = trans->fs_info;
1968  	struct btrfs_transaction *cur_trans;
1969  
1970  	/* Kick the transaction kthread. */
1971  	set_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags);
1972  	wake_up_process(fs_info->transaction_kthread);
1973  
1974  	/* take transaction reference */
1975  	cur_trans = trans->transaction;
1976  	refcount_inc(&cur_trans->use_count);
1977  
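	/*
	 * End our handle so the commit can make progress, then wait below for
	 * the commit to at least reach the COMMIT_START state (or abort).
	 */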
1978  	btrfs_end_transaction(trans);
1979  
1980  	/*
1981  	 * Wait for the current transaction commit to start and block
1982  	 * subsequent transaction joins
1983  	 */
1984  	btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP);
1985  	wait_event(fs_info->transaction_blocked_wait,
1986  		   cur_trans->state >= TRANS_STATE_COMMIT_START ||
1987  		   TRANS_ABORTED(cur_trans));
1988  	btrfs_put_transaction(cur_trans);
1989  }
1990  
1991  /*
1992   * If there is a running transaction commit it or if it's already committing,
1993   * wait for its commit to complete. Does not start and commit a new transaction
1994   * if there isn't any running.
1995   */
1996  int btrfs_commit_current_transaction(struct btrfs_root *root)
1997  {
1998  	struct btrfs_trans_handle *trans;
1999  
2000  	trans = btrfs_attach_transaction_barrier(root);
2001  	if (IS_ERR(trans)) {
2002  		int ret = PTR_ERR(trans);
2003  
2004  		return (ret == -ENOENT) ? 0 : ret;
2005  	}
2006  
2007  	return btrfs_commit_transaction(trans);
2008  }
2009  
2010  static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
2011  {
2012  	struct btrfs_fs_info *fs_info = trans->fs_info;
2013  	struct btrfs_transaction *cur_trans = trans->transaction;
2014  
2015  	WARN_ON(refcount_read(&trans->use_count) > 1);
2016  
2017  	btrfs_abort_transaction(trans, err);
2018  
2019  	spin_lock(&fs_info->trans_lock);
2020  
2021  	/*
2022  	 * If the transaction is removed from the list, it means this
2023  	 * transaction has been committed successfully, so it is impossible
2024  	 * to call the cleanup function.
2025  	 */
2026  	BUG_ON(list_empty(&cur_trans->list));
2027  
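	/*
	 * If this is still the running transaction, mark it as committing and
	 * wait until we are the only attached writer before tearing it down.
	 */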
2028  	if (cur_trans == fs_info->running_transaction) {
2029  		cur_trans->state = TRANS_STATE_COMMIT_DOING;
2030  		spin_unlock(&fs_info->trans_lock);
2031  
2032  		/*
2033  		 * The thread has already released the lockdep map as reader
2034  		 * already in btrfs_commit_transaction().
2035  		 */
2036  		btrfs_might_wait_for_event(fs_info, btrfs_trans_num_writers);
2037  		wait_event(cur_trans->writer_wait,
2038  			   atomic_read(&cur_trans->num_writers) == 1);
2039  
2040  		spin_lock(&fs_info->trans_lock);
2041  	}
2042  
2043  	/*
2044  	 * Now that we know no one else is still using the transaction we can
2045  	 * remove the transaction from the list of transactions. This prevents
2046  	 * the transaction kthread from cleaning up the transaction while some
2047  	 * other task is still using it, which could result in a use-after-free
2048  	 * on things like log trees, as it forces the transaction kthread to
2049  	 * wait for this transaction to be cleaned up by us.
2050  	 */
2051  	list_del_init(&cur_trans->list);
2052  
2053  	spin_unlock(&fs_info->trans_lock);
2054  
2055  	btrfs_cleanup_one_transaction(trans->transaction, fs_info);
2056  
2057  	spin_lock(&fs_info->trans_lock);
2058  	if (cur_trans == fs_info->running_transaction)
2059  		fs_info->running_transaction = NULL;
2060  	spin_unlock(&fs_info->trans_lock);
2061  
2062  	if (trans->type & __TRANS_FREEZABLE)
2063  		sb_end_intwrite(fs_info->sb);
2064  	btrfs_put_transaction(cur_trans);
2065  	btrfs_put_transaction(cur_trans);
2066  
2067  	trace_btrfs_transaction_commit(fs_info);
2068  
2069  	if (current->journal_info == trans)
2070  		current->journal_info = NULL;
2071  
2072  	/*
2073  	 * If relocation is running, we can't cancel scrub because that will
2074  	 * result in a deadlock. Before relocating a block group, relocation
2075  	 * pauses scrub, then starts and commits a transaction before unpausing
2076  	 * scrub. If the transaction commit is being done by the relocation
2077  	 * task or triggered by another task and the relocation task is waiting
2078  	 * for the commit, and we end up here due to an error in the commit
2079  	 * path, then calling btrfs_scrub_cancel() will deadlock, as we are
2080  	 * asking for scrub to stop while having it asked to be paused higher
2081  	 * above in relocation code.
2082  	 */
2083  	if (!test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags))
2084  		btrfs_scrub_cancel(fs_info);
2085  
2086  	kmem_cache_free(btrfs_trans_handle_cachep, trans);
2087  }
2088  
2089  /*
2090   * Release reserved delayed ref space of all pending block groups of the
2091   * transaction and remove them from the list
2092   */
2093  static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans)
2094  {
2095  	struct btrfs_fs_info *fs_info = trans->fs_info;
2096  	struct btrfs_block_group *block_group, *tmp;
2097  
2098  	list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
2099  		btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
2100  		list_del_init(&block_group->bg_list);
2101  	}
2102  }
2103  
2104  static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
2105  {
2106  	/*
2107  	 * We use try_to_writeback_inodes_sb() here because if we used
2108  	 * btrfs_start_delalloc_roots we would deadlock with fs freeze.
2109  	 * Currently we are holding the fs freeze lock, if we do an async flush
2110  	 * we'll do btrfs_join_transaction() and deadlock because we need to
2111  	 * wait for the fs freeze lock.  Using the direct flushing we benefit
2112  	 * from already being in a transaction and our join_transaction doesn't
2113  	 * have to re-take the fs freeze lock.
2114  	 *
2115  	 * Note that try_to_writeback_inodes_sb() will only trigger writeback
2116  	 * if it can read lock sb->s_umount. It will always be able to lock it,
2117  	 * except when the filesystem is being unmounted or being frozen, but in
2118  	 * those cases sync_filesystem() is called, which results in calling
2119  	 * writeback_inodes_sb() while holding a write lock on sb->s_umount.
2120  	 * Note that we don't call writeback_inodes_sb() directly, because it
2121  	 * will emit a warning if sb->s_umount is not locked.
2122  	 */
2123  	if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
2124  		try_to_writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC);
2125  	return 0;
2126  }
2127  
2128  static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
2129  {
2130  	if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
2131  		btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL);
2132  }
2133  
2134  /*
2135   * Add a pending snapshot associated with the given transaction handle to the
2136   * respective handle. This must be called after the transaction commit started
2137   * and while holding fs_info->trans_lock.
2138   * This serves to guarantee a caller of btrfs_commit_transaction() that it can
2139   * safely free the pending snapshot pointer in case btrfs_commit_transaction()
2140   * returns an error.
2141   */
2142  static void add_pending_snapshot(struct btrfs_trans_handle *trans)
2143  {
2144  	struct btrfs_transaction *cur_trans = trans->transaction;
2145  
2146  	if (!trans->pending_snapshot)
2147  		return;
2148  
2149  	lockdep_assert_held(&trans->fs_info->trans_lock);
2150  	ASSERT(cur_trans->state >= TRANS_STATE_COMMIT_PREP);
2151  
2152  	list_add(&trans->pending_snapshot->list, &cur_trans->pending_snapshots);
2153  }
2154  
2155  static void update_commit_stats(struct btrfs_fs_info *fs_info, ktime_t interval)
2156  {
2157  	fs_info->commit_stats.commit_count++;
2158  	fs_info->commit_stats.last_commit_dur = interval;
2159  	fs_info->commit_stats.max_commit_dur =
2160  			max_t(u64, fs_info->commit_stats.max_commit_dur, interval);
2161  	fs_info->commit_stats.total_commit_dur += interval;
2162  }
2163  
2164  int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
2165  {
2166  	struct btrfs_fs_info *fs_info = trans->fs_info;
2167  	struct btrfs_transaction *cur_trans = trans->transaction;
2168  	struct btrfs_transaction *prev_trans = NULL;
2169  	int ret;
2170  	ktime_t start_time;
2171  	ktime_t interval;
2172  
2173  	ASSERT(refcount_read(&trans->use_count) == 1);
2174  	btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP);
2175  
2176  	clear_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags);
2177  
2178  	/* Stop the commit early if ->aborted is set */
2179  	if (TRANS_ABORTED(cur_trans)) {
2180  		ret = cur_trans->aborted;
2181  		goto lockdep_trans_commit_start_release;
2182  	}
2183  
2184  	btrfs_trans_release_metadata(trans);
2185  	trans->block_rsv = NULL;
2186  
2187  	/*
2188  	 * We only want one transaction commit doing the flushing so we do not
2189  	 * waste a bunch of time on lock contention on the extent root node.
2190  	 */
2191  	if (!test_and_set_bit(BTRFS_DELAYED_REFS_FLUSHING,
2192  			      &cur_trans->delayed_refs.flags)) {
2193  		/*
2194  		 * Make a pass through all the delayed refs we have so far.
2195  		 * Any running threads may add more while we are here.
2196  		 */
2197  		ret = btrfs_run_delayed_refs(trans, 0);
2198  		if (ret)
2199  			goto lockdep_trans_commit_start_release;
2200  	}
2201  
2202  	btrfs_create_pending_block_groups(trans);
2203  
2204  	if (!test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) {
2205  		int run_it = 0;
2206  
2207  		/* this mutex is also taken before trying to set
2208  		 * block groups readonly.  We need to make sure
2209  		 * that nobody has set a block group readonly
2210  		 * after extents from that block group have been
2211  		 * allocated for cache files.  btrfs_set_block_group_ro
2212  		 * will wait for the transaction to commit if it
2213  		 * finds BTRFS_TRANS_DIRTY_BG_RUN set.
2214  		 *
2215  		 * The BTRFS_TRANS_DIRTY_BG_RUN flag is also used to make sure
2216  		 * only one process starts all the block group IO.  It wouldn't
2217  		 * hurt to have more than one go through, but there's no
2218  		 * real advantage to it either.
2219  		 */
2220  		mutex_lock(&fs_info->ro_block_group_mutex);
2221  		if (!test_and_set_bit(BTRFS_TRANS_DIRTY_BG_RUN,
2222  				      &cur_trans->flags))
2223  			run_it = 1;
2224  		mutex_unlock(&fs_info->ro_block_group_mutex);
2225  
2226  		if (run_it) {
2227  			ret = btrfs_start_dirty_block_groups(trans);
2228  			if (ret)
2229  				goto lockdep_trans_commit_start_release;
2230  		}
2231  	}
2232  
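	/*
	 * If someone else already started committing this transaction, attach
	 * our pending snapshot (if any), drop our handle and just wait for the
	 * existing commit to reach the state we need.
	 */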
2233  	spin_lock(&fs_info->trans_lock);
2234  	if (cur_trans->state >= TRANS_STATE_COMMIT_PREP) {
2235  		enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED;
2236  
2237  		add_pending_snapshot(trans);
2238  
2239  		spin_unlock(&fs_info->trans_lock);
2240  		refcount_inc(&cur_trans->use_count);
2241  
2242  		if (trans->in_fsync)
2243  			want_state = TRANS_STATE_SUPER_COMMITTED;
2244  
2245  		btrfs_trans_state_lockdep_release(fs_info,
2246  						  BTRFS_LOCKDEP_TRANS_COMMIT_PREP);
2247  		ret = btrfs_end_transaction(trans);
2248  		wait_for_commit(cur_trans, want_state);
2249  
2250  		if (TRANS_ABORTED(cur_trans))
2251  			ret = cur_trans->aborted;
2252  
2253  		btrfs_put_transaction(cur_trans);
2254  
2255  		return ret;
2256  	}
2257  
2258  	cur_trans->state = TRANS_STATE_COMMIT_PREP;
2259  	wake_up(&fs_info->transaction_blocked_wait);
2260  	btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP);
2261  
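	/*
	 * If there is a previous, not yet fully committed transaction, wait
	 * for it to reach the required state so commits stay ordered on disk.
	 */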
2262  	if (cur_trans->list.prev != &fs_info->trans_list) {
2263  		enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED;
2264  
2265  		if (trans->in_fsync)
2266  			want_state = TRANS_STATE_SUPER_COMMITTED;
2267  
2268  		prev_trans = list_entry(cur_trans->list.prev,
2269  					struct btrfs_transaction, list);
2270  		if (prev_trans->state < want_state) {
2271  			refcount_inc(&prev_trans->use_count);
2272  			spin_unlock(&fs_info->trans_lock);
2273  
2274  			wait_for_commit(prev_trans, want_state);
2275  
2276  			ret = READ_ONCE(prev_trans->aborted);
2277  
2278  			btrfs_put_transaction(prev_trans);
2279  			if (ret)
2280  				goto lockdep_release;
2281  			spin_lock(&fs_info->trans_lock);
2282  		}
2283  	} else {
2284  		/*
2285  		 * The previous transaction was aborted and was already removed
2286  		 * from the list of transactions at fs_info->trans_list. So we
2287  		 * abort to prevent writing a new superblock that reflects a
2288  		 * corrupt state (pointing to trees with unwritten nodes/leaves).
2289  		 */
2290  		if (BTRFS_FS_ERROR(fs_info)) {
2291  			spin_unlock(&fs_info->trans_lock);
2292  			ret = -EROFS;
2293  			goto lockdep_release;
2294  		}
2295  	}
2296  
2297  	cur_trans->state = TRANS_STATE_COMMIT_START;
2298  	wake_up(&fs_info->transaction_blocked_wait);
2299  	spin_unlock(&fs_info->trans_lock);
2300  
2301  	/*
2302  	 * Get the time spent on the work done by the commit thread and not
2303  	 * the time spent waiting on a previous commit
2304  	 */
2305  	start_time = ktime_get_ns();
2306  
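	/*
	 * From this point we no longer count as an external writer: flush
	 * delalloc and delayed items, then wait for the remaining external
	 * writers to detach from this transaction.
	 */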
2307  	extwriter_counter_dec(cur_trans, trans->type);
2308  
2309  	ret = btrfs_start_delalloc_flush(fs_info);
2310  	if (ret)
2311  		goto lockdep_release;
2312  
2313  	ret = btrfs_run_delayed_items(trans);
2314  	if (ret)
2315  		goto lockdep_release;
2316  
2317  	/*
2318  	 * The thread has started/joined the transaction thus it holds the
2319  	 * lockdep map as a reader. It has to release it before acquiring the
2320  	 * lockdep map as a writer.
2321  	 */
2322  	btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters);
2323  	btrfs_might_wait_for_event(fs_info, btrfs_trans_num_extwriters);
2324  	wait_event(cur_trans->writer_wait,
2325  		   extwriter_counter_read(cur_trans) == 0);
2326  
2327  	/* Some pending items might be added after the previous flush. */
2328  	ret = btrfs_run_delayed_items(trans);
2329  	if (ret) {
2330  		btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
2331  		goto cleanup_transaction;
2332  	}
2333  
2334  	btrfs_wait_delalloc_flush(fs_info);
2335  
2336  	/*
2337  	 * Wait for all ordered extents started by a fast fsync that joined this
2338  	 * transaction. Otherwise if this transaction commits before the ordered
2339  	 * extents complete we lose logged data after a power failure.
2340  	 */
2341  	btrfs_might_wait_for_event(fs_info, btrfs_trans_pending_ordered);
2342  	wait_event(cur_trans->pending_wait,
2343  		   atomic_read(&cur_trans->pending_ordered) == 0);
2344  
2345  	btrfs_scrub_pause(fs_info);
2346  	/*
2347  	 * Ok now we need to make sure to block out any other joins while we
2348  	 * commit the transaction.  We could have started a join before setting
2349  	 * COMMIT_DOING so make sure to wait for num_writers to be 1 again.
2350  	 */
2351  	spin_lock(&fs_info->trans_lock);
2352  	add_pending_snapshot(trans);
2353  	cur_trans->state = TRANS_STATE_COMMIT_DOING;
2354  	spin_unlock(&fs_info->trans_lock);
2355  
2356  	/*
2357  	 * The thread has started/joined the transaction thus it holds the
2358  	 * lockdep map as a reader. It has to release it before acquiring the
2359  	 * lockdep map as a writer.
2360  	 */
2361  	btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
2362  	btrfs_might_wait_for_event(fs_info, btrfs_trans_num_writers);
2363  	wait_event(cur_trans->writer_wait,
2364  		   atomic_read(&cur_trans->num_writers) == 1);
2365  
2366  	/*
2367  	 * Make lockdep happy by acquiring the state locks after
2368  	 * btrfs_trans_num_writers is released. If we acquired the state locks
2369  	 * before releasing the btrfs_trans_num_writers lock then lockdep would
2370  	 * complain because we did not follow the reverse order unlocking rule.
2371  	 */
2372  	btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED);
2373  	btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
2374  	btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
2375  
2376  	/*
2377  	 * We've started the commit, clear the flag in case we were triggered to
2378  	 * do an async commit but somebody else started before the transaction
2379  	 * kthread could do the work.
2380  	 */
2381  	clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags);
2382  
2383  	if (TRANS_ABORTED(cur_trans)) {
2384  		ret = cur_trans->aborted;
2385  		btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
2386  		goto scrub_continue;
2387  	}
2388  	/*
2389  	 * the reloc mutex makes sure that we stop
2390  	 * the balancing code from coming in and moving
2391  	 * extents around in the middle of the commit
2392  	 */
2393  	mutex_lock(&fs_info->reloc_mutex);
2394  
2395  	/*
2396  	 * We needn't worry about the delayed items because we will
2397  	 * deal with them in create_pending_snapshot(), which is the
2398  	 * core function of the snapshot creation.
2399  	 */
2400  	ret = create_pending_snapshots(trans);
2401  	if (ret)
2402  		goto unlock_reloc;
2403  
2404  	/*
2405  	 * We insert the dir indexes of the snapshots and update the inode
2406  	 * of the snapshots' parents after the snapshot creation, so there
2407  	 * are some delayed items which are not dealt with. Now deal with
2408  	 * them.
2409  	 *
2410  	 * We needn't worry that this operation will corrupt the snapshots,
2411  	 * because all the trees which are snapshotted will be forced to COW
2412  	 * the nodes and leaves.
2413  	 */
2414  	ret = btrfs_run_delayed_items(trans);
2415  	if (ret)
2416  		goto unlock_reloc;
2417  
2418  	ret = btrfs_run_delayed_refs(trans, U64_MAX);
2419  	if (ret)
2420  		goto unlock_reloc;
2421  
2422  	/*
2423  	 * make sure none of the code above managed to slip in a
2424  	 * delayed item
2425  	 */
2426  	btrfs_assert_delayed_root_empty(fs_info);
2427  
2428  	WARN_ON(cur_trans != trans->transaction);
2429  
2430  	ret = commit_fs_roots(trans);
2431  	if (ret)
2432  		goto unlock_reloc;
2433  
2434  	/* commit_fs_roots() gets rid of all the tree log roots, it is now
2435  	 * safe to free the log root tree.
2436  	 */
2437  	btrfs_free_log_root_tree(trans, fs_info);
2438  
2439  	/*
2440  	 * Since fs roots are all committed, we can get a quite accurate
2441  	 * new_roots. So let's do quota accounting.
2442  	 */
2443  	ret = btrfs_qgroup_account_extents(trans);
2444  	if (ret < 0)
2445  		goto unlock_reloc;
2446  
2447  	ret = commit_cowonly_roots(trans);
2448  	if (ret)
2449  		goto unlock_reloc;
2450  
2451  	/*
2452  	 * The tasks which save the space cache and inode cache may also
2453  	 * update ->aborted, check it.
2454  	 */
2455  	if (TRANS_ABORTED(cur_trans)) {
2456  		ret = cur_trans->aborted;
2457  		goto unlock_reloc;
2458  	}
2459  
2460  	cur_trans = fs_info->running_transaction;
2461  
2462  	btrfs_set_root_node(&fs_info->tree_root->root_item,
2463  			    fs_info->tree_root->node);
2464  	list_add_tail(&fs_info->tree_root->dirty_list,
2465  		      &cur_trans->switch_commits);
2466  
2467  	btrfs_set_root_node(&fs_info->chunk_root->root_item,
2468  			    fs_info->chunk_root->node);
2469  	list_add_tail(&fs_info->chunk_root->dirty_list,
2470  		      &cur_trans->switch_commits);
2471  
2472  	if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
2473  		btrfs_set_root_node(&fs_info->block_group_root->root_item,
2474  				    fs_info->block_group_root->node);
2475  		list_add_tail(&fs_info->block_group_root->dirty_list,
2476  			      &cur_trans->switch_commits);
2477  	}
2478  
2479  	switch_commit_roots(trans);
2480  
2481  	ASSERT(list_empty(&cur_trans->dirty_bgs));
2482  	ASSERT(list_empty(&cur_trans->io_bgs));
2483  	update_super_roots(fs_info);
2484  
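	/*
	 * Clear the log tree pointers in the superblock (the log roots were
	 * freed above) and snapshot super_copy into super_for_commit, the copy
	 * that write_all_supers() writes to disk.
	 */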
2485  	btrfs_set_super_log_root(fs_info->super_copy, 0);
2486  	btrfs_set_super_log_root_level(fs_info->super_copy, 0);
2487  	memcpy(fs_info->super_for_commit, fs_info->super_copy,
2488  	       sizeof(*fs_info->super_copy));
2489  
2490  	btrfs_commit_device_sizes(cur_trans);
2491  
2492  	clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags);
2493  	clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags);
2494  
2495  	btrfs_trans_release_chunk_metadata(trans);
2496  
2497  	/*
2498  	 * Before changing the transaction state to TRANS_STATE_UNBLOCKED and
2499  	 * setting fs_info->running_transaction to NULL, lock tree_log_mutex to
2500  	 * make sure that before we commit our superblock, no other task can
2501  	 * start a new transaction and commit a log tree before we commit our
2502  	 * superblock. Anyone trying to commit a log tree locks this mutex before
2503  	 * writing its superblock.
2504  	 */
2505  	mutex_lock(&fs_info->tree_log_mutex);
2506  
2507  	spin_lock(&fs_info->trans_lock);
2508  	cur_trans->state = TRANS_STATE_UNBLOCKED;
2509  	fs_info->running_transaction = NULL;
2510  	spin_unlock(&fs_info->trans_lock);
2511  	mutex_unlock(&fs_info->reloc_mutex);
2512  
2513  	wake_up(&fs_info->transaction_wait);
2514  	btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
2515  
2516  	/* If we have features changed, wake up the cleaner to update sysfs. */
2517  	if (test_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags) &&
2518  	    fs_info->cleaner_kthread)
2519  		wake_up_process(fs_info->cleaner_kthread);
2520  
2521  	ret = btrfs_write_and_wait_transaction(trans);
2522  	if (ret) {
2523  		btrfs_handle_fs_error(fs_info, ret,
2524  				      "Error while writing out transaction");
2525  		mutex_unlock(&fs_info->tree_log_mutex);
2526  		goto scrub_continue;
2527  	}
2528  
2529  	ret = write_all_supers(fs_info, 0);
2530  	/*
2531  	 * the super is written, we can safely allow the tree-loggers
2532  	 * to go about their business
2533  	 */
2534  	mutex_unlock(&fs_info->tree_log_mutex);
2535  	if (ret)
2536  		goto scrub_continue;
2537  
2538  	/*
2539  	 * We needn't acquire the lock here because there is no other task
2540  	 * which can change it.
2541  	 */
2542  	cur_trans->state = TRANS_STATE_SUPER_COMMITTED;
2543  	wake_up(&cur_trans->commit_wait);
2544  	btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
2545  
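	/*
	 * The new super is on disk, so the space pinned by extents freed in
	 * this transaction can now be unpinned and reused.
	 */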
2546  	btrfs_finish_extent_commit(trans);
2547  
2548  	if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags))
2549  		btrfs_clear_space_info_full(fs_info);
2550  
2551  	btrfs_set_last_trans_committed(fs_info, cur_trans->transid);
2552  	/*
2553  	 * We needn't acquire the lock here because there is no other task
2554  	 * which can change it.
2555  	 */
2556  	cur_trans->state = TRANS_STATE_COMPLETED;
2557  	wake_up(&cur_trans->commit_wait);
2558  	btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED);
2559  
2560  	spin_lock(&fs_info->trans_lock);
2561  	list_del_init(&cur_trans->list);
2562  	spin_unlock(&fs_info->trans_lock);
2563  
2564  	btrfs_put_transaction(cur_trans);
2565  	btrfs_put_transaction(cur_trans);
2566  
2567  	if (trans->type & __TRANS_FREEZABLE)
2568  		sb_end_intwrite(fs_info->sb);
2569  
2570  	trace_btrfs_transaction_commit(fs_info);
2571  
2572  	interval = ktime_get_ns() - start_time;
2573  
2574  	btrfs_scrub_continue(fs_info);
2575  
2576  	if (current->journal_info == trans)
2577  		current->journal_info = NULL;
2578  
2579  	kmem_cache_free(btrfs_trans_handle_cachep, trans);
2580  
2581  	update_commit_stats(fs_info, interval);
2582  
2583  	return ret;
2584  
2585  unlock_reloc:
2586  	mutex_unlock(&fs_info->reloc_mutex);
2587  	btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
2588  scrub_continue:
2589  	btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
2590  	btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED);
2591  	btrfs_scrub_continue(fs_info);
2592  cleanup_transaction:
2593  	btrfs_trans_release_metadata(trans);
2594  	btrfs_cleanup_pending_block_groups(trans);
2595  	btrfs_trans_release_chunk_metadata(trans);
2596  	trans->block_rsv = NULL;
2597  	btrfs_warn(fs_info, "Skipping commit of aborted transaction.");
2598  	if (current->journal_info == trans)
2599  		current->journal_info = NULL;
2600  	cleanup_transaction(trans, ret);
2601  
2602  	return ret;
2603  
2604  lockdep_release:
2605  	btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters);
2606  	btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
2607  	goto cleanup_transaction;
2608  
2609  lockdep_trans_commit_start_release:
2610  	btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP);
2611  	btrfs_end_transaction(trans);
2612  	return ret;
2613  }
2614  
2615  /*
2616   * Return < 0 on error,
2617   * 0 if there are no more dead roots at the time of the call,
2618   * 1 if there are more to be processed, call me again.
2619   *
2620   * The return value indicates there are certainly more snapshots to delete,
2621   * but if a new one appears during processing, it may return 0. We don't
2622   * mind, because btrfs_commit_super() will poke the cleaner thread and it
2623   * will process the new snapshot a few seconds later.
2624   */
2625  int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info)
2626  {
2627  	struct btrfs_root *root;
2628  	int ret;
2629  
2630  	spin_lock(&fs_info->trans_lock);
2631  	if (list_empty(&fs_info->dead_roots)) {
2632  		spin_unlock(&fs_info->trans_lock);
2633  		return 0;
2634  	}
2635  	root = list_first_entry(&fs_info->dead_roots,
2636  			struct btrfs_root, root_list);
2637  	list_del_init(&root->root_list);
2638  	spin_unlock(&fs_info->trans_lock);
2639  
2640  	btrfs_debug(fs_info, "cleaner removing %llu", btrfs_root_id(root));
2641  
2642  	btrfs_kill_all_delayed_nodes(root);
2643  
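	/*
	 * Roots with the old (pre mixed backref) format are dropped without
	 * updating references (update_ref == 0), newer roots with update_ref
	 * set to 1.
	 */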
2644  	if (btrfs_header_backref_rev(root->node) <
2645  			BTRFS_MIXED_BACKREF_REV)
2646  		ret = btrfs_drop_snapshot(root, 0, 0);
2647  	else
2648  		ret = btrfs_drop_snapshot(root, 1, 0);
2649  
2650  	btrfs_put_root(root);
2651  	return (ret < 0) ? 0 : 1;
2652  }
2653  
2654  /*
2655   * We only mark the transaction aborted and then set the file system read-only.
2656   * This will prevent new transactions from starting or trying to join this
2657   * one.
2658   *
2659   * This means that error recovery at the call site is limited to freeing
2660   * any local memory allocations and passing the error code up without
2661   * further cleanup. The transaction should complete as it normally would
2662   * in the call path but will return -EIO.
2663   *
2664   * We'll complete the cleanup in btrfs_end_transaction and
2665   * btrfs_commit_transaction.
2666   */
2667  void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
2668  				      const char *function,
2669  				      unsigned int line, int error, bool first_hit)
2670  {
2671  	struct btrfs_fs_info *fs_info = trans->fs_info;
2672  
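	/*
	 * Record the error in both the handle and the transaction so that
	 * concurrent tasks checking TRANS_ABORTED() see it.
	 */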
2673  	WRITE_ONCE(trans->aborted, error);
2674  	WRITE_ONCE(trans->transaction->aborted, error);
2675  	if (first_hit && error == -ENOSPC)
2676  		btrfs_dump_space_info_for_trans_abort(fs_info);
2677  	/* Wake up anybody who may be waiting on this transaction */
2678  	wake_up(&fs_info->transaction_wait);
2679  	wake_up(&fs_info->transaction_blocked_wait);
2680  	__btrfs_handle_fs_error(fs_info, function, line, error, NULL);
2681  }
2682  
2683  int __init btrfs_transaction_init(void)
2684  {
2685  	btrfs_trans_handle_cachep = KMEM_CACHE(btrfs_trans_handle, SLAB_TEMPORARY);
2686  	if (!btrfs_trans_handle_cachep)
2687  		return -ENOMEM;
2688  	return 0;
2689  }
2690  
2691  void __cold btrfs_transaction_exit(void)
2692  {
2693  	kmem_cache_destroy(btrfs_trans_handle_cachep);
2694  }
2695