1  // SPDX-License-Identifier: GPL-2.0
2  /*
3   * Copyright (C) 2008 Oracle.  All rights reserved.
4   */
5  
6  #include <linux/sched.h>
7  #include <linux/slab.h>
8  #include <linux/blkdev.h>
9  #include <linux/list_sort.h>
10  #include <linux/iversion.h>
11  #include "misc.h"
12  #include "ctree.h"
13  #include "tree-log.h"
14  #include "disk-io.h"
15  #include "locking.h"
16  #include "backref.h"
17  #include "compression.h"
18  #include "qgroup.h"
19  #include "block-group.h"
20  #include "space-info.h"
21  #include "inode-item.h"
22  #include "fs.h"
23  #include "accessors.h"
24  #include "extent-tree.h"
25  #include "root-tree.h"
26  #include "dir-item.h"
27  #include "file-item.h"
28  #include "file.h"
29  #include "orphan.h"
30  #include "tree-checker.h"
31  
32  #define MAX_CONFLICT_INODES 10
33  
34  /* magic values for the inode_only field in btrfs_log_inode:
35   *
36   * LOG_INODE_ALL means to log everything
37   * LOG_INODE_EXISTS means to log just enough to recreate the inode
38   * during log replay
39   */
40  enum {
41  	LOG_INODE_ALL,
42  	LOG_INODE_EXISTS,
43  };
44  
45  /*
46   * directory trouble cases
47   *
48   * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
49   * log, we must force a full commit before doing an fsync of the directory
50   * where the unlink was done.
51   * ---> record transid of last unlink/rename per directory
52   *
53   * mkdir foo/some_dir
54   * normal commit
55   * rename foo/some_dir foo2/some_dir
56   * mkdir foo/some_dir
57   * fsync foo/some_dir/some_file
58   *
59   * The fsync above will unlink the original some_dir without recording
60   * it in its new location (foo2).  After a crash, some_dir will be gone
61   * unless the fsync of some_file forces a full commit
62   *
63   * 2) we must log any new names for any file or dir that is in the fsync
64   * log. ---> check inode while renaming/linking.
65   *
66   * 2a) we must log any new names for any file or dir during rename
67   * when the directory they are being removed from was logged.
68   * ---> check inode and old parent dir during rename
69   *
70   *  2a is actually the more important variant.  Without the extra logging
71   *  a crash might unlink the old name without recreating the new one
72   *
73   * 3) after a crash, we must go through any directories with a link count
74   * of zero and redo the rm -rf
75   *
76   * mkdir f1/foo
77   * normal commit
78   * rm -rf f1/foo
79   * fsync(f1)
80   *
81   * The directory f1 was fully removed from the FS, but fsync was never
82   * called on f1, only its parent dir.  After a crash the rm -rf must
83   * be replayed.  This must be able to recurse down the entire
84   * directory tree.  The inode link count fixup code takes care of the
85   * ugly details.
86   */
87  
88  /*
89   * Stages for the tree walking.  The first stage (0) only pins down the
90   * blocks we find.  The second stage (1) makes sure that all the inodes
91   * we find in the log are created in the subvolume, and the next stage
92   * replays directory index items.
93   *
94   * The last stage deals with directories and links and extents
95   * and all the other fun semantics.
96   */
97  enum {
98  	LOG_WALK_PIN_ONLY,
99  	LOG_WALK_REPLAY_INODES,
100  	LOG_WALK_REPLAY_DIR_INDEX,
101  	LOG_WALK_REPLAY_ALL,
102  };
103  
104  static int btrfs_log_inode(struct btrfs_trans_handle *trans,
105  			   struct btrfs_inode *inode,
106  			   int inode_only,
107  			   struct btrfs_log_ctx *ctx);
108  static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
109  			     struct btrfs_root *root,
110  			     struct btrfs_path *path, u64 objectid);
111  static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
112  				       struct btrfs_root *root,
113  				       struct btrfs_root *log,
114  				       struct btrfs_path *path,
115  				       u64 dirid, int del_all);
116  static void wait_log_commit(struct btrfs_root *root, int transid);
117  
118  /*
119   * tree logging is a special write ahead log used to make sure that
120   * fsyncs and O_SYNCs can happen without doing full tree commits.
121   *
122   * Full tree commits are expensive because they require commonly
123   * modified blocks to be recowed, creating many dirty pages in the
124   * extent tree and a 4x-6x higher write load than ext3.
125   *
126   * Instead of doing a tree commit on every fsync, we use the
127   * key ranges and transaction ids to find items for a given file or directory
128   * that have changed in this transaction.  Those items are copied into
129   * a special tree (one per subvolume root), that tree is written to disk
130   * and then the fsync is considered complete.
131   *
132   * After a crash, items are copied out of the log-tree back into the
133   * subvolume tree.  Any file data extents found are recorded in the extent
134   * allocation tree, and the log-tree freed.
135   *
136   * The log tree is read three times: once to pin down all the extents it is
137   * using in ram, once to create all the inodes logged in the tree
138   * and once to do all the other items.
139   */
140  
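/*
 * Look up an inode while logging or replaying a log tree.  A thin wrapper
 * around btrfs_iget() that enforces NOFS allocation semantics; see the
 * comment in the body for why that matters while holding a transaction
 * handle.
 */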
141  static struct inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root)
142  {
143  	unsigned int nofs_flag;
144  	struct inode *inode;
145  
146  	/*
147  	 * We're holding a transaction handle whether we are logging or
148  	 * replaying a log tree, so we must make sure NOFS semantics apply
149  	 * because btrfs_alloc_inode() may be triggered and it uses GFP_KERNEL
150  	 * to allocate an inode, which can recurse back into the filesystem and
151  	 * attempt a transaction commit, resulting in a deadlock.
152  	 */
153  	nofs_flag = memalloc_nofs_save();
154  	inode = btrfs_iget(objectid, root);
155  	memalloc_nofs_restore(nofs_flag);
156  
157  	return inode;
158  }
159  
160  /*
161   * start a sub transaction and setup the log tree
162   * this increments the log tree writer count to make the people
163   * syncing the tree wait for us to finish
164   */
165  static int start_log_trans(struct btrfs_trans_handle *trans,
166  			   struct btrfs_root *root,
167  			   struct btrfs_log_ctx *ctx)
168  {
169  	struct btrfs_fs_info *fs_info = root->fs_info;
170  	struct btrfs_root *tree_root = fs_info->tree_root;
171  	const bool zoned = btrfs_is_zoned(fs_info);
172  	int ret = 0;
173  	bool created = false;
174  
175  	/*
176  	 * First check if the log root tree was already created. If not, create
177  	 * it before locking the root's log_mutex, just to keep lockdep happy.
178  	 */
179  	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) {
180  		mutex_lock(&tree_root->log_mutex);
181  		if (!fs_info->log_root_tree) {
182  			ret = btrfs_init_log_root_tree(trans, fs_info);
183  			if (!ret) {
184  				set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state);
185  				created = true;
186  			}
187  		}
188  		mutex_unlock(&tree_root->log_mutex);
189  		if (ret)
190  			return ret;
191  	}
192  
193  	mutex_lock(&root->log_mutex);
194  
195  again:
196  	if (root->log_root) {
197  		int index = (root->log_transid + 1) % 2;
198  
199  		if (btrfs_need_log_full_commit(trans)) {
200  			ret = BTRFS_LOG_FORCE_COMMIT;
201  			goto out;
202  		}
203  
204  		if (zoned && atomic_read(&root->log_commit[index])) {
205  			wait_log_commit(root, root->log_transid - 1);
206  			goto again;
207  		}
208  
209  		if (!root->log_start_pid) {
210  			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
211  			root->log_start_pid = current->pid;
212  		} else if (root->log_start_pid != current->pid) {
213  			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
214  		}
215  	} else {
216  		/*
217  		 * This means fs_info->log_root_tree was already created
218  		 * for some other FS trees. Do the full commit not to mix
219  		 * nodes from multiple log transactions to do sequential
220  		 * writing.
221  		 */
222  		if (zoned && !created) {
223  			ret = BTRFS_LOG_FORCE_COMMIT;
224  			goto out;
225  		}
226  
227  		ret = btrfs_add_log_tree(trans, root);
228  		if (ret)
229  			goto out;
230  
231  		set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
232  		clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
233  		root->log_start_pid = current->pid;
234  	}
235  
236  	atomic_inc(&root->log_writers);
237  	if (!ctx->logging_new_name) {
238  		int index = root->log_transid % 2;
239  		list_add_tail(&ctx->list, &root->log_ctxs[index]);
240  		ctx->log_transid = root->log_transid;
241  	}
242  
243  out:
244  	mutex_unlock(&root->log_mutex);
245  	return ret;
246  }
247  
248  /*
249   * returns 0 if there was a log transaction running and we were able
250   * to join, or returns -ENOENT if there was no transaction
251   * in progress
252   */
253  static int join_running_log_trans(struct btrfs_root *root)
254  {
255  	const bool zoned = btrfs_is_zoned(root->fs_info);
256  	int ret = -ENOENT;
257  
258  	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
259  		return ret;
260  
261  	mutex_lock(&root->log_mutex);
262  again:
263  	if (root->log_root) {
264  		int index = (root->log_transid + 1) % 2;
265  
266  		ret = 0;
267  		if (zoned && atomic_read(&root->log_commit[index])) {
268  			wait_log_commit(root, root->log_transid - 1);
269  			goto again;
270  		}
271  		atomic_inc(&root->log_writers);
272  	}
273  	mutex_unlock(&root->log_mutex);
274  	return ret;
275  }
276  
277  /*
278   * This either makes the current running log transaction wait
279   * until you call btrfs_end_log_trans() or it makes any future
280   * log transactions wait until you call btrfs_end_log_trans()
281   */
282  void btrfs_pin_log_trans(struct btrfs_root *root)
283  {
284  	atomic_inc(&root->log_writers);
285  }
286  
287  /*
288   * indicate we're done making changes to the log tree
289   * and wake up anyone waiting to do a sync
290   */
291  void btrfs_end_log_trans(struct btrfs_root *root)
292  {
293  	if (atomic_dec_and_test(&root->log_writers)) {
294  		/* atomic_dec_and_test implies a barrier */
295  		cond_wake_up_nomb(&root->log_writer_wait);
296  	}
297  }
298  
299  /*
300   * the walk control struct is used to pass state down the chain when
301   * processing the log tree.  The stage field tells us which part
302   * of the log tree processing we are currently doing.  The others
303   * are state fields used for that specific part
304   */
305  struct walk_control {
306  	/* should we free the extent on disk when done?  This is used
307  	 * at transaction commit time while freeing a log tree
308  	 */
309  	int free;
310  
311  	/* pin only walk, we record which extents on disk belong to the
312  	 * log trees
313  	 */
314  	int pin;
315  
316  	/* what stage of the replay code we're currently in */
317  	int stage;
318  
319  	/*
320  	 * Ignore any items from the inode currently being processed. Needs
321  	 * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
322  	 * the LOG_WALK_REPLAY_INODES stage.
323  	 */
324  	bool ignore_cur_inode;
325  
326  	/* the root we are currently replaying */
327  	struct btrfs_root *replay_dest;
328  
329  	/* the trans handle for the current replay */
330  	struct btrfs_trans_handle *trans;
331  
332  	/* the function that gets used to process blocks we find in the
333  	 * tree.  Note the extent_buffer might not be up to date when it is
334  	 * passed in, and it must be checked or read if you need the data
335  	 * inside it
336  	 */
337  	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
338  			    struct walk_control *wc, u64 gen, int level);
339  };
340  
341  /*
342   * process_func used to pin down extents, write them or wait on them
343   */
344  static int process_one_buffer(struct btrfs_root *log,
345  			      struct extent_buffer *eb,
346  			      struct walk_control *wc, u64 gen, int level)
347  {
348  	struct btrfs_fs_info *fs_info = log->fs_info;
349  	int ret = 0;
350  
351  	/*
352  	 * If this fs is mixed then we need to be able to process the leaves to
353  	 * pin down any logged extents, so we have to read the block.
354  	 */
355  	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
356  		struct btrfs_tree_parent_check check = {
357  			.level = level,
358  			.transid = gen
359  		};
360  
361  		ret = btrfs_read_extent_buffer(eb, &check);
362  		if (ret)
363  			return ret;
364  	}
365  
366  	if (wc->pin) {
367  		ret = btrfs_pin_extent_for_log_replay(wc->trans, eb);
368  		if (ret)
369  			return ret;
370  
371  		if (btrfs_buffer_uptodate(eb, gen, 0) &&
372  		    btrfs_header_level(eb) == 0)
373  			ret = btrfs_exclude_logged_extents(eb);
374  	}
375  	return ret;
376  }
377  
378  /*
379   * Item overwrite used by replay and tree logging.  eb, slot and key all refer
380   * to the src data we are copying out.
381   *
382   * root is the tree we are copying into, and path is a scratch
383   * path for use in this function (it should be released on entry and
384   * will be released on exit).
385   *
386   * If the key is already in the destination tree the existing item is
387   * overwritten.  If the existing item isn't big enough, it is extended.
388   * If it is too large, it is truncated.
389   *
390   * If the key isn't in the destination yet, a new item is inserted.
391   */
392  static int overwrite_item(struct btrfs_trans_handle *trans,
393  			  struct btrfs_root *root,
394  			  struct btrfs_path *path,
395  			  struct extent_buffer *eb, int slot,
396  			  struct btrfs_key *key)
397  {
398  	int ret;
399  	u32 item_size;
400  	u64 saved_i_size = 0;
401  	int save_old_i_size = 0;
402  	unsigned long src_ptr;
403  	unsigned long dst_ptr;
404  	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
405  
406  	/*
407  	 * This is only used during log replay, so the root is always from a
408  	 * fs/subvolume tree. In case we ever need to support a log root, then
409  	 * we'll have to clone the leaf in the path, release the path and use
410  	 * the leaf before writing into the log tree. See the comments at
411  	 * copy_items() for more details.
412  	 */
413  	ASSERT(btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID);
414  
415  	item_size = btrfs_item_size(eb, slot);
416  	src_ptr = btrfs_item_ptr_offset(eb, slot);
417  
418  	/* Look for the key in the destination tree. */
419  	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
420  	if (ret < 0)
421  		return ret;
422  
423  	if (ret == 0) {
424  		char *src_copy;
425  		char *dst_copy;
426  		u32 dst_size = btrfs_item_size(path->nodes[0],
427  						  path->slots[0]);
428  		if (dst_size != item_size)
429  			goto insert;
430  
431  		if (item_size == 0) {
432  			btrfs_release_path(path);
433  			return 0;
434  		}
435  		dst_copy = kmalloc(item_size, GFP_NOFS);
436  		src_copy = kmalloc(item_size, GFP_NOFS);
437  		if (!dst_copy || !src_copy) {
438  			btrfs_release_path(path);
439  			kfree(dst_copy);
440  			kfree(src_copy);
441  			return -ENOMEM;
442  		}
443  
444  		read_extent_buffer(eb, src_copy, src_ptr, item_size);
445  
446  		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
447  		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
448  				   item_size);
449  		ret = memcmp(dst_copy, src_copy, item_size);
450  
451  		kfree(dst_copy);
452  		kfree(src_copy);
453  		/*
454  		 * they have the same contents, just return, this saves
455  		 * us from cowing blocks in the destination tree and doing
456  		 * extra writes that may not have been done by a previous
457  		 * sync
458  		 */
459  		if (ret == 0) {
460  			btrfs_release_path(path);
461  			return 0;
462  		}
463  
464  		/*
465  		 * We need to load the old nbytes into the inode so when we
466  		 * replay the extents we've logged we get the right nbytes.
467  		 */
468  		if (inode_item) {
469  			struct btrfs_inode_item *item;
470  			u64 nbytes;
471  			u32 mode;
472  
473  			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
474  					      struct btrfs_inode_item);
475  			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
476  			item = btrfs_item_ptr(eb, slot,
477  					      struct btrfs_inode_item);
478  			btrfs_set_inode_nbytes(eb, item, nbytes);
479  
480  			/*
481  			 * If this is a directory we need to reset the i_size to
482  			 * 0 so that we can set it up properly when replaying
483  			 * the rest of the items in this log.
484  			 */
485  			mode = btrfs_inode_mode(eb, item);
486  			if (S_ISDIR(mode))
487  				btrfs_set_inode_size(eb, item, 0);
488  		}
489  	} else if (inode_item) {
490  		struct btrfs_inode_item *item;
491  		u32 mode;
492  
493  		/*
494  		 * New inode, set nbytes to 0 so that the nbytes comes out
495  		 * properly when we replay the extents.
496  		 */
497  		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
498  		btrfs_set_inode_nbytes(eb, item, 0);
499  
500  		/*
501  		 * If this is a directory we need to reset the i_size to 0 so
502  		 * that we can set it up properly when replaying the rest of
503  		 * the items in this log.
504  		 */
505  		mode = btrfs_inode_mode(eb, item);
506  		if (S_ISDIR(mode))
507  			btrfs_set_inode_size(eb, item, 0);
508  	}
509  insert:
510  	btrfs_release_path(path);
511  	/* try to insert the key into the destination tree */
512  	path->skip_release_on_error = 1;
513  	ret = btrfs_insert_empty_item(trans, root, path,
514  				      key, item_size);
515  	path->skip_release_on_error = 0;
516  
517  	/* make sure any existing item is the correct size */
518  	if (ret == -EEXIST || ret == -EOVERFLOW) {
519  		u32 found_size;
520  		found_size = btrfs_item_size(path->nodes[0],
521  						path->slots[0]);
522  		if (found_size > item_size)
523  			btrfs_truncate_item(trans, path, item_size, 1);
524  		else if (found_size < item_size)
525  			btrfs_extend_item(trans, path, item_size - found_size);
526  	} else if (ret) {
527  		return ret;
528  	}
529  	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
530  					path->slots[0]);
531  
532  	/* don't overwrite an existing inode if the generation number
533  	 * was logged as zero.  This is done when the tree logging code
534  	 * is just logging an inode to make sure it exists after recovery.
535  	 *
536  	 * Also, don't overwrite i_size on directories during replay.
537  	 * log replay inserts and removes directory items based on the
538  	 * state of the tree found in the subvolume, and i_size is modified
539  	 * as it goes
540  	 */
541  	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
542  		struct btrfs_inode_item *src_item;
543  		struct btrfs_inode_item *dst_item;
544  
545  		src_item = (struct btrfs_inode_item *)src_ptr;
546  		dst_item = (struct btrfs_inode_item *)dst_ptr;
547  
548  		if (btrfs_inode_generation(eb, src_item) == 0) {
549  			struct extent_buffer *dst_eb = path->nodes[0];
550  			const u64 ino_size = btrfs_inode_size(eb, src_item);
551  
552  			/*
553  			 * For regular files an ino_size == 0 is used only when
554  			 * logging that an inode exists, as part of a directory
555  			 * fsync, and the inode wasn't fsynced before. In this
556  			 * case don't set the size of the inode in the fs/subvol
557  			 * tree, otherwise we would be throwing valid data away.
558  			 */
559  			if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
560  			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
561  			    ino_size != 0)
562  				btrfs_set_inode_size(dst_eb, dst_item, ino_size);
563  			goto no_copy;
564  		}
565  
566  		if (S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
567  		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
568  			save_old_i_size = 1;
569  			saved_i_size = btrfs_inode_size(path->nodes[0],
570  							dst_item);
571  		}
572  	}
573  
574  	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
575  			   src_ptr, item_size);
576  
577  	if (save_old_i_size) {
578  		struct btrfs_inode_item *dst_item;
579  		dst_item = (struct btrfs_inode_item *)dst_ptr;
580  		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
581  	}
582  
583  	/* make sure the generation is filled in */
584  	if (key->type == BTRFS_INODE_ITEM_KEY) {
585  		struct btrfs_inode_item *dst_item;
586  		dst_item = (struct btrfs_inode_item *)dst_ptr;
587  		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
588  			btrfs_set_inode_generation(path->nodes[0], dst_item,
589  						   trans->transid);
590  		}
591  	}
592  no_copy:
593  	btrfs_mark_buffer_dirty(trans, path->nodes[0]);
594  	btrfs_release_path(path);
595  	return 0;
596  }
597  
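/*
 * Copy 'len' bytes of a name out of the leaf 'eb', starting at offset
 * 'start', into a newly allocated buffer returned through 'name'.  The
 * caller is responsible for freeing name->name.
 */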
598  static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len,
599  			       struct fscrypt_str *name)
600  {
601  	char *buf;
602  
603  	buf = kmalloc(len, GFP_NOFS);
604  	if (!buf)
605  		return -ENOMEM;
606  
607  	read_extent_buffer(eb, buf, (unsigned long)start, len);
608  	name->name = buf;
609  	name->len = len;
610  	return 0;
611  }
612  
613  /*
614   * simple helper to read an inode off the disk from a given root.
615   * This can only be called for subvolume roots and not for the log.
616   */
617  static noinline struct inode *read_one_inode(struct btrfs_root *root,
618  					     u64 objectid)
619  {
620  	struct inode *inode;
621  
622  	inode = btrfs_iget_logging(objectid, root);
623  	if (IS_ERR(inode))
624  		inode = NULL;
625  	return inode;
626  }
627  
628  /* replays a single extent in 'eb' at 'slot' with 'key' into the
629   * subvolume 'root'.  path is released on entry and should be released
630   * on exit.
631   *
632   * extents in the log tree have not been allocated out of the extent
633   * tree yet.  So, this completes the allocation, taking a reference
634   * as required if the extent already exists or creating a new extent
635   * if it isn't in the extent allocation tree yet.
636   *
637   * The extent is inserted into the file, dropping any existing extents
638   * from the file that overlap the new one.
639   */
640  static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
641  				      struct btrfs_root *root,
642  				      struct btrfs_path *path,
643  				      struct extent_buffer *eb, int slot,
644  				      struct btrfs_key *key)
645  {
646  	struct btrfs_drop_extents_args drop_args = { 0 };
647  	struct btrfs_fs_info *fs_info = root->fs_info;
648  	int found_type;
649  	u64 extent_end;
650  	u64 start = key->offset;
651  	u64 nbytes = 0;
652  	struct btrfs_file_extent_item *item;
653  	struct inode *inode = NULL;
654  	unsigned long size;
655  	int ret = 0;
656  
657  	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
658  	found_type = btrfs_file_extent_type(eb, item);
659  
660  	if (found_type == BTRFS_FILE_EXTENT_REG ||
661  	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
662  		nbytes = btrfs_file_extent_num_bytes(eb, item);
663  		extent_end = start + nbytes;
664  
665  		/*
666  		 * We don't add to the inodes nbytes if we are prealloc or a
667  		 * hole.
668  		 */
669  		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
670  			nbytes = 0;
671  	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
672  		size = btrfs_file_extent_ram_bytes(eb, item);
673  		nbytes = btrfs_file_extent_ram_bytes(eb, item);
674  		extent_end = ALIGN(start + size,
675  				   fs_info->sectorsize);
676  	} else {
677  		ret = 0;
678  		goto out;
679  	}
680  
681  	inode = read_one_inode(root, key->objectid);
682  	if (!inode) {
683  		ret = -EIO;
684  		goto out;
685  	}
686  
687  	/*
688  	 * first check to see if we already have this extent in the
689  	 * file.  This must be done before the btrfs_drop_extents run
690  	 * so we don't try to drop this extent.
691  	 */
692  	ret = btrfs_lookup_file_extent(trans, root, path,
693  			btrfs_ino(BTRFS_I(inode)), start, 0);
694  
695  	if (ret == 0 &&
696  	    (found_type == BTRFS_FILE_EXTENT_REG ||
697  	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
698  		struct btrfs_file_extent_item cmp1;
699  		struct btrfs_file_extent_item cmp2;
700  		struct btrfs_file_extent_item *existing;
701  		struct extent_buffer *leaf;
702  
703  		leaf = path->nodes[0];
704  		existing = btrfs_item_ptr(leaf, path->slots[0],
705  					  struct btrfs_file_extent_item);
706  
707  		read_extent_buffer(eb, &cmp1, (unsigned long)item,
708  				   sizeof(cmp1));
709  		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
710  				   sizeof(cmp2));
711  
712  		/*
713  		 * we already have a pointer to this exact extent,
714  		 * we don't have to do anything
715  		 */
716  		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
717  			btrfs_release_path(path);
718  			goto out;
719  		}
720  	}
721  	btrfs_release_path(path);
722  
723  	/* drop any overlapping extents */
724  	drop_args.start = start;
725  	drop_args.end = extent_end;
726  	drop_args.drop_cache = true;
727  	ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args);
728  	if (ret)
729  		goto out;
730  
731  	if (found_type == BTRFS_FILE_EXTENT_REG ||
732  	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
733  		u64 offset;
734  		unsigned long dest_offset;
735  		struct btrfs_key ins;
736  
737  		if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
738  		    btrfs_fs_incompat(fs_info, NO_HOLES))
739  			goto update_inode;
740  
741  		ret = btrfs_insert_empty_item(trans, root, path, key,
742  					      sizeof(*item));
743  		if (ret)
744  			goto out;
745  		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
746  						    path->slots[0]);
747  		copy_extent_buffer(path->nodes[0], eb, dest_offset,
748  				(unsigned long)item,  sizeof(*item));
749  
750  		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
751  		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
752  		ins.type = BTRFS_EXTENT_ITEM_KEY;
753  		offset = key->offset - btrfs_file_extent_offset(eb, item);
754  
755  		/*
756  		 * Manually record the dirty extent: we did a shallow copy of
757  		 * the file extent item and skipped the normal backref update,
758  		 * modifying the extent tree all by ourselves.
759  		 * Qgroup therefore needs the dirty extent recorded manually,
760  		 * as the owner of the file extent changed from the log tree
761  		 * (doesn't affect qgroup) to the fs/file tree (affects qgroup).
762  		 */
763  		ret = btrfs_qgroup_trace_extent(trans,
764  				btrfs_file_extent_disk_bytenr(eb, item),
765  				btrfs_file_extent_disk_num_bytes(eb, item));
766  		if (ret < 0)
767  			goto out;
768  
769  		if (ins.objectid > 0) {
770  			u64 csum_start;
771  			u64 csum_end;
772  			LIST_HEAD(ordered_sums);
773  
774  			/*
775  			 * is this extent already allocated in the extent
776  			 * allocation tree?  If so, just add a reference
777  			 */
778  			ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
779  						ins.offset);
780  			if (ret < 0) {
781  				goto out;
782  			} else if (ret == 0) {
783  				struct btrfs_ref ref = {
784  					.action = BTRFS_ADD_DELAYED_REF,
785  					.bytenr = ins.objectid,
786  					.num_bytes = ins.offset,
787  					.owning_root = btrfs_root_id(root),
788  					.ref_root = btrfs_root_id(root),
789  				};
790  				btrfs_init_data_ref(&ref, key->objectid, offset,
791  						    0, false);
792  				ret = btrfs_inc_extent_ref(trans, &ref);
793  				if (ret)
794  					goto out;
795  			} else {
796  				/*
797  				 * insert the extent pointer in the extent
798  				 * allocation tree
799  				 */
800  				ret = btrfs_alloc_logged_file_extent(trans,
801  						btrfs_root_id(root),
802  						key->objectid, offset, &ins);
803  				if (ret)
804  					goto out;
805  			}
806  			btrfs_release_path(path);
807  
808  			if (btrfs_file_extent_compression(eb, item)) {
809  				csum_start = ins.objectid;
810  				csum_end = csum_start + ins.offset;
811  			} else {
812  				csum_start = ins.objectid +
813  					btrfs_file_extent_offset(eb, item);
814  				csum_end = csum_start +
815  					btrfs_file_extent_num_bytes(eb, item);
816  			}
817  
818  			ret = btrfs_lookup_csums_list(root->log_root,
819  						csum_start, csum_end - 1,
820  						&ordered_sums, false);
821  			if (ret < 0)
822  				goto out;
823  			ret = 0;
824  			/*
825  			 * Now delete all existing csums in the csum root that
826  			 * cover our range. We do this because we can have an
827  			 * extent that is completely referenced by one file
828  			 * extent item and partially referenced by another
829  			 * file extent item (like after using the clone or
830  			 * extent_same ioctls). In this case if we end up doing
831  			 * the replay of the one that partially references the
832  			 * extent first, and we do not do the csum deletion
833  			 * below, we can get 2 csum items in the csum tree that
834  			 * overlap each other. For example, imagine our log has
835  			 * the two following file extent items:
836  			 *
837  			 * key (257 EXTENT_DATA 409600)
838  			 *     extent data disk byte 12845056 nr 102400
839  			 *     extent data offset 20480 nr 20480 ram 102400
840  			 *
841  			 * key (257 EXTENT_DATA 819200)
842  			 *     extent data disk byte 12845056 nr 102400
843  			 *     extent data offset 0 nr 102400 ram 102400
844  			 *
845  			 * Where the second one fully references the 100K extent
846  			 * that starts at disk byte 12845056, and the log tree
847  			 * has a single csum item that covers the entire range
848  			 * of the extent:
849  			 *
850  			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
851  			 *
852  			 * After the first file extent item is replayed, the
853  			 * csum tree gets the following csum item:
854  			 *
855  			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
856  			 *
857  			 * Which covers the 20K sub-range starting at offset 20K
858  			 * of our extent. Now when we replay the second file
859  			 * extent item, if we do not delete existing csum items
860  			 * that cover any of its blocks, we end up getting two
861  			 * csum items in our csum tree that overlap each other:
862  			 *
863  			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
864  			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
865  			 *
866  			 * Which is a problem, because after this anyone trying
867  			 * to look up the checksum of any block of our
868  			 * extent starting at an offset of 40K or higher, will
869  			 * end up looking at the second csum item only, which
870  			 * does not contain the checksum for any block starting
871  			 * at offset 40K or higher of our extent.
872  			 */
873  			while (!list_empty(&ordered_sums)) {
874  				struct btrfs_ordered_sum *sums;
875  				struct btrfs_root *csum_root;
876  
877  				sums = list_entry(ordered_sums.next,
878  						struct btrfs_ordered_sum,
879  						list);
880  				csum_root = btrfs_csum_root(fs_info,
881  							    sums->logical);
882  				if (!ret)
883  					ret = btrfs_del_csums(trans, csum_root,
884  							      sums->logical,
885  							      sums->len);
886  				if (!ret)
887  					ret = btrfs_csum_file_blocks(trans,
888  								     csum_root,
889  								     sums);
890  				list_del(&sums->list);
891  				kfree(sums);
892  			}
893  			if (ret)
894  				goto out;
895  		} else {
896  			btrfs_release_path(path);
897  		}
898  	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
899  		/* inline extents are easy, we just overwrite them */
900  		ret = overwrite_item(trans, root, path, eb, slot, key);
901  		if (ret)
902  			goto out;
903  	}
904  
905  	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
906  						extent_end - start);
907  	if (ret)
908  		goto out;
909  
910  update_inode:
911  	btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found);
912  	ret = btrfs_update_inode(trans, BTRFS_I(inode));
913  out:
914  	iput(inode);
915  	return ret;
916  }
917  
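/*
 * Unlink a name from a directory during log replay and then run delayed
 * items, so that later name lookups against the fs/subvolume tree no longer
 * see the entry (see the comment in the body).
 */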
918  static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans,
919  				       struct btrfs_inode *dir,
920  				       struct btrfs_inode *inode,
921  				       const struct fscrypt_str *name)
922  {
923  	int ret;
924  
925  	ret = btrfs_unlink_inode(trans, dir, inode, name);
926  	if (ret)
927  		return ret;
928  	/*
929  	 * Whenever we need to check if a name exists or not, we check the
930  	 * fs/subvolume tree. So after an unlink we must run delayed items, so
931  	 * that future checks for a name during log replay see that the name
932  	 * does not exist anymore.
933  	 */
934  	return btrfs_run_delayed_items(trans);
935  }
936  
937  /*
938   * when cleaning up conflicts between the directory names in the
939   * subvolume, directory names in the log and directory names in the
940   * inode back references, we may have to unlink inodes from directories.
941   *
942   * This is a helper function to do the unlink of a specific directory
943   * item
944   */
945  static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
946  				      struct btrfs_path *path,
947  				      struct btrfs_inode *dir,
948  				      struct btrfs_dir_item *di)
949  {
950  	struct btrfs_root *root = dir->root;
951  	struct inode *inode;
952  	struct fscrypt_str name;
953  	struct extent_buffer *leaf;
954  	struct btrfs_key location;
955  	int ret;
956  
957  	leaf = path->nodes[0];
958  
959  	btrfs_dir_item_key_to_cpu(leaf, di, &location);
960  	ret = read_alloc_one_name(leaf, di + 1, btrfs_dir_name_len(leaf, di), &name);
961  	if (ret)
962  		return -ENOMEM;
963  
964  	btrfs_release_path(path);
965  
966  	inode = read_one_inode(root, location.objectid);
967  	if (!inode) {
968  		ret = -EIO;
969  		goto out;
970  	}
971  
972  	ret = link_to_fixup_dir(trans, root, path, location.objectid);
973  	if (ret)
974  		goto out;
975  
976  	ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), &name);
977  out:
978  	kfree(name.name);
979  	iput(inode);
980  	return ret;
981  }
982  
983  /*
984   * See if a given name and sequence number found in an inode back reference are
985   * already in a directory and correctly point to this inode.
986   *
987   * Returns: < 0 on error, 0 if the directory entry does not exist and 1 if it
988   * exists.
989   */
990  static noinline int inode_in_dir(struct btrfs_root *root,
991  				 struct btrfs_path *path,
992  				 u64 dirid, u64 objectid, u64 index,
993  				 struct fscrypt_str *name)
994  {
995  	struct btrfs_dir_item *di;
996  	struct btrfs_key location;
997  	int ret = 0;
998  
999  	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
1000  					 index, name, 0);
1001  	if (IS_ERR(di)) {
1002  		ret = PTR_ERR(di);
1003  		goto out;
1004  	} else if (di) {
1005  		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
1006  		if (location.objectid != objectid)
1007  			goto out;
1008  	} else {
1009  		goto out;
1010  	}
1011  
1012  	btrfs_release_path(path);
1013  	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, 0);
1014  	if (IS_ERR(di)) {
1015  		ret = PTR_ERR(di);
1016  		goto out;
1017  	} else if (di) {
1018  		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
1019  		if (location.objectid == objectid)
1020  			ret = 1;
1021  	}
1022  out:
1023  	btrfs_release_path(path);
1024  	return ret;
1025  }
1026  
1027  /*
1028   * helper function to check a log tree for a named back reference in
1029   * an inode.  This is used to decide if a back reference that is
1030   * found in the subvolume conflicts with what we find in the log.
1031   *
1032   * inode backreferences may have multiple refs in a single item,
1033   * during replay we process one reference at a time, and we don't
1034   * want to delete valid links to a file from the subvolume if that
1035   * link is also in the log.
1036   */
1037  static noinline int backref_in_log(struct btrfs_root *log,
1038  				   struct btrfs_key *key,
1039  				   u64 ref_objectid,
1040  				   const struct fscrypt_str *name)
1041  {
1042  	struct btrfs_path *path;
1043  	int ret;
1044  
1045  	path = btrfs_alloc_path();
1046  	if (!path)
1047  		return -ENOMEM;
1048  
1049  	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
1050  	if (ret < 0) {
1051  		goto out;
1052  	} else if (ret == 1) {
1053  		ret = 0;
1054  		goto out;
1055  	}
1056  
1057  	if (key->type == BTRFS_INODE_EXTREF_KEY)
1058  		ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
1059  						       path->slots[0],
1060  						       ref_objectid, name);
1061  	else
1062  		ret = !!btrfs_find_name_in_backref(path->nodes[0],
1063  						   path->slots[0], name);
1064  out:
1065  	btrfs_free_path(path);
1066  	return ret;
1067  }
1068  
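/*
 * Resolve conflicts before adding a back reference during replay: any name
 * on this inode that points at 'parent_objectid' (old style ref or extref)
 * but is not present in the log gets unlinked, and so does any dir item or
 * dir index entry that clashes with the name or index being replayed.
 * Returns 1 when the back ref being processed is for the root directory
 * itself, 0 on success and a negative errno on failure.
 */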
1069  static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
1070  				  struct btrfs_root *root,
1071  				  struct btrfs_path *path,
1072  				  struct btrfs_root *log_root,
1073  				  struct btrfs_inode *dir,
1074  				  struct btrfs_inode *inode,
1075  				  u64 inode_objectid, u64 parent_objectid,
1076  				  u64 ref_index, struct fscrypt_str *name)
1077  {
1078  	int ret;
1079  	struct extent_buffer *leaf;
1080  	struct btrfs_dir_item *di;
1081  	struct btrfs_key search_key;
1082  	struct btrfs_inode_extref *extref;
1083  
1084  again:
1085  	/* Search old style refs */
1086  	search_key.objectid = inode_objectid;
1087  	search_key.type = BTRFS_INODE_REF_KEY;
1088  	search_key.offset = parent_objectid;
1089  	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
1090  	if (ret == 0) {
1091  		struct btrfs_inode_ref *victim_ref;
1092  		unsigned long ptr;
1093  		unsigned long ptr_end;
1094  
1095  		leaf = path->nodes[0];
1096  
1097  		/* are we trying to overwrite a back ref for the root directory?
1098  		 * if so, just jump out, we're done
1099  		 */
1100  		if (search_key.objectid == search_key.offset)
1101  			return 1;
1102  
1103  		/* check all the names in this back reference to see
1104  		 * if they are in the log.  if so, we allow them to stay
1105  		 * otherwise they must be unlinked as a conflict
1106  		 */
1107  		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1108  		ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]);
1109  		while (ptr < ptr_end) {
1110  			struct fscrypt_str victim_name;
1111  
1112  			victim_ref = (struct btrfs_inode_ref *)ptr;
1113  			ret = read_alloc_one_name(leaf, (victim_ref + 1),
1114  				 btrfs_inode_ref_name_len(leaf, victim_ref),
1115  				 &victim_name);
1116  			if (ret)
1117  				return ret;
1118  
1119  			ret = backref_in_log(log_root, &search_key,
1120  					     parent_objectid, &victim_name);
1121  			if (ret < 0) {
1122  				kfree(victim_name.name);
1123  				return ret;
1124  			} else if (!ret) {
1125  				inc_nlink(&inode->vfs_inode);
1126  				btrfs_release_path(path);
1127  
1128  				ret = unlink_inode_for_log_replay(trans, dir, inode,
1129  						&victim_name);
1130  				kfree(victim_name.name);
1131  				if (ret)
1132  					return ret;
1133  				goto again;
1134  			}
1135  			kfree(victim_name.name);
1136  
1137  			ptr = (unsigned long)(victim_ref + 1) + victim_name.len;
1138  		}
1139  	}
1140  	btrfs_release_path(path);
1141  
1142  	/* Same search but for extended refs */
1143  	extref = btrfs_lookup_inode_extref(NULL, root, path, name,
1144  					   inode_objectid, parent_objectid, 0,
1145  					   0);
1146  	if (IS_ERR(extref)) {
1147  		return PTR_ERR(extref);
1148  	} else if (extref) {
1149  		u32 item_size;
1150  		u32 cur_offset = 0;
1151  		unsigned long base;
1152  		struct inode *victim_parent;
1153  
1154  		leaf = path->nodes[0];
1155  
1156  		item_size = btrfs_item_size(leaf, path->slots[0]);
1157  		base = btrfs_item_ptr_offset(leaf, path->slots[0]);
1158  
1159  		while (cur_offset < item_size) {
1160  			struct fscrypt_str victim_name;
1161  
1162  			extref = (struct btrfs_inode_extref *)(base + cur_offset);
1163  
1164  			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
1165  				goto next;
1166  
1167  			ret = read_alloc_one_name(leaf, &extref->name,
1168  				 btrfs_inode_extref_name_len(leaf, extref),
1169  				 &victim_name);
1170  			if (ret)
1171  				return ret;
1172  
1173  			search_key.objectid = inode_objectid;
1174  			search_key.type = BTRFS_INODE_EXTREF_KEY;
1175  			search_key.offset = btrfs_extref_hash(parent_objectid,
1176  							      victim_name.name,
1177  							      victim_name.len);
1178  			ret = backref_in_log(log_root, &search_key,
1179  					     parent_objectid, &victim_name);
1180  			if (ret < 0) {
1181  				kfree(victim_name.name);
1182  				return ret;
1183  			} else if (!ret) {
1184  				ret = -ENOENT;
1185  				victim_parent = read_one_inode(root,
1186  						parent_objectid);
1187  				if (victim_parent) {
1188  					inc_nlink(&inode->vfs_inode);
1189  					btrfs_release_path(path);
1190  
1191  					ret = unlink_inode_for_log_replay(trans,
1192  							BTRFS_I(victim_parent),
1193  							inode, &victim_name);
1194  				}
1195  				iput(victim_parent);
1196  				kfree(victim_name.name);
1197  				if (ret)
1198  					return ret;
1199  				goto again;
1200  			}
1201  			kfree(victim_name.name);
1202  next:
1203  			cur_offset += victim_name.len + sizeof(*extref);
1204  		}
1205  	}
1206  	btrfs_release_path(path);
1207  
1208  	/* look for a conflicting sequence number */
1209  	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
1210  					 ref_index, name, 0);
1211  	if (IS_ERR(di)) {
1212  		return PTR_ERR(di);
1213  	} else if (di) {
1214  		ret = drop_one_dir_item(trans, path, dir, di);
1215  		if (ret)
1216  			return ret;
1217  	}
1218  	btrfs_release_path(path);
1219  
1220  	/* look for a conflicting name */
1221  	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, 0);
1222  	if (IS_ERR(di)) {
1223  		return PTR_ERR(di);
1224  	} else if (di) {
1225  		ret = drop_one_dir_item(trans, path, dir, di);
1226  		if (ret)
1227  			return ret;
1228  	}
1229  	btrfs_release_path(path);
1230  
1231  	return 0;
1232  }
1233  
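/*
 * Decode one btrfs_inode_extref entry located at 'ref_ptr' inside 'eb':
 * copy out its name and optionally return the dir index and the parent
 * directory's objectid.
 */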
1234  static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1235  			     struct fscrypt_str *name, u64 *index,
1236  			     u64 *parent_objectid)
1237  {
1238  	struct btrfs_inode_extref *extref;
1239  	int ret;
1240  
1241  	extref = (struct btrfs_inode_extref *)ref_ptr;
1242  
1243  	ret = read_alloc_one_name(eb, &extref->name,
1244  				  btrfs_inode_extref_name_len(eb, extref), name);
1245  	if (ret)
1246  		return ret;
1247  
1248  	if (index)
1249  		*index = btrfs_inode_extref_index(eb, extref);
1250  	if (parent_objectid)
1251  		*parent_objectid = btrfs_inode_extref_parent(eb, extref);
1252  
1253  	return 0;
1254  }
1255  
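/*
 * Same as extref_get_fields() but for an old style btrfs_inode_ref entry,
 * which does not carry the parent objectid (that is the key offset instead).
 */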
1256  static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1257  			  struct fscrypt_str *name, u64 *index)
1258  {
1259  	struct btrfs_inode_ref *ref;
1260  	int ret;
1261  
1262  	ref = (struct btrfs_inode_ref *)ref_ptr;
1263  
1264  	ret = read_alloc_one_name(eb, ref + 1, btrfs_inode_ref_name_len(eb, ref),
1265  				  name);
1266  	if (ret)
1267  		return ret;
1268  
1269  	if (index)
1270  		*index = btrfs_inode_ref_index(eb, ref);
1271  
1272  	return 0;
1273  }
1274  
1275  /*
1276   * Take an inode reference item from the log tree and iterate all names from the
1277   * inode reference item in the subvolume tree with the same key (if it exists).
1278   * For any name that is not in the inode reference item from the log tree, do a
1279   * proper unlink of that name (that is, remove its entry from the inode
1280   * reference item and both dir index keys).
1281   */
1282  static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
1283  				 struct btrfs_root *root,
1284  				 struct btrfs_path *path,
1285  				 struct btrfs_inode *inode,
1286  				 struct extent_buffer *log_eb,
1287  				 int log_slot,
1288  				 struct btrfs_key *key)
1289  {
1290  	int ret;
1291  	unsigned long ref_ptr;
1292  	unsigned long ref_end;
1293  	struct extent_buffer *eb;
1294  
1295  again:
1296  	btrfs_release_path(path);
1297  	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
1298  	if (ret > 0) {
1299  		ret = 0;
1300  		goto out;
1301  	}
1302  	if (ret < 0)
1303  		goto out;
1304  
1305  	eb = path->nodes[0];
1306  	ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
1307  	ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]);
1308  	while (ref_ptr < ref_end) {
1309  		struct fscrypt_str name;
1310  		u64 parent_id;
1311  
1312  		if (key->type == BTRFS_INODE_EXTREF_KEY) {
1313  			ret = extref_get_fields(eb, ref_ptr, &name,
1314  						NULL, &parent_id);
1315  		} else {
1316  			parent_id = key->offset;
1317  			ret = ref_get_fields(eb, ref_ptr, &name, NULL);
1318  		}
1319  		if (ret)
1320  			goto out;
1321  
1322  		if (key->type == BTRFS_INODE_EXTREF_KEY)
1323  			ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
1324  							       parent_id, &name);
1325  		else
1326  			ret = !!btrfs_find_name_in_backref(log_eb, log_slot, &name);
1327  
1328  		if (!ret) {
1329  			struct inode *dir;
1330  
1331  			btrfs_release_path(path);
1332  			dir = read_one_inode(root, parent_id);
1333  			if (!dir) {
1334  				ret = -ENOENT;
1335  				kfree(name.name);
1336  				goto out;
1337  			}
1338  			ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir),
1339  						 inode, &name);
1340  			kfree(name.name);
1341  			iput(dir);
1342  			if (ret)
1343  				goto out;
1344  			goto again;
1345  		}
1346  
1347  		kfree(name.name);
1348  		ref_ptr += name.len;
1349  		if (key->type == BTRFS_INODE_EXTREF_KEY)
1350  			ref_ptr += sizeof(struct btrfs_inode_extref);
1351  		else
1352  			ref_ptr += sizeof(struct btrfs_inode_ref);
1353  	}
1354  	ret = 0;
1355   out:
1356  	btrfs_release_path(path);
1357  	return ret;
1358  }
1359  
1360  /*
1361   * replay one inode back reference item found in the log tree.
1362   * eb, slot and key refer to the buffer and key found in the log tree.
1363   * root is the destination we are replaying into, and path is for temp
1364   * use by this function.  (it should be released on return).
1365   */
1366  static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
1367  				  struct btrfs_root *root,
1368  				  struct btrfs_root *log,
1369  				  struct btrfs_path *path,
1370  				  struct extent_buffer *eb, int slot,
1371  				  struct btrfs_key *key)
1372  {
1373  	struct inode *dir = NULL;
1374  	struct inode *inode = NULL;
1375  	unsigned long ref_ptr;
1376  	unsigned long ref_end;
1377  	struct fscrypt_str name = { 0 };
1378  	int ret;
1379  	int log_ref_ver = 0;
1380  	u64 parent_objectid;
1381  	u64 inode_objectid;
1382  	u64 ref_index = 0;
1383  	int ref_struct_size;
1384  
1385  	ref_ptr = btrfs_item_ptr_offset(eb, slot);
1386  	ref_end = ref_ptr + btrfs_item_size(eb, slot);
1387  
1388  	if (key->type == BTRFS_INODE_EXTREF_KEY) {
1389  		struct btrfs_inode_extref *r;
1390  
1391  		ref_struct_size = sizeof(struct btrfs_inode_extref);
1392  		log_ref_ver = 1;
1393  		r = (struct btrfs_inode_extref *)ref_ptr;
1394  		parent_objectid = btrfs_inode_extref_parent(eb, r);
1395  	} else {
1396  		ref_struct_size = sizeof(struct btrfs_inode_ref);
1397  		parent_objectid = key->offset;
1398  	}
1399  	inode_objectid = key->objectid;
1400  
1401  	/*
1402  	 * it is possible that we didn't log all the parent directories
1403  	 * for a given inode.  If we don't find the dir, just don't
1404  	 * copy the back ref in.  The link count fixup code will take
1405  	 * care of the rest
1406  	 */
1407  	dir = read_one_inode(root, parent_objectid);
1408  	if (!dir) {
1409  		ret = -ENOENT;
1410  		goto out;
1411  	}
1412  
1413  	inode = read_one_inode(root, inode_objectid);
1414  	if (!inode) {
1415  		ret = -EIO;
1416  		goto out;
1417  	}
1418  
1419  	while (ref_ptr < ref_end) {
1420  		if (log_ref_ver) {
1421  			ret = extref_get_fields(eb, ref_ptr, &name,
1422  						&ref_index, &parent_objectid);
1423  			/*
1424  			 * parent object can change from one array
1425  			 * item to another.
1426  			 */
1427  			if (!dir)
1428  				dir = read_one_inode(root, parent_objectid);
1429  			if (!dir) {
1430  				ret = -ENOENT;
1431  				goto out;
1432  			}
1433  		} else {
1434  			ret = ref_get_fields(eb, ref_ptr, &name, &ref_index);
1435  		}
1436  		if (ret)
1437  			goto out;
1438  
1439  		ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
1440  				   btrfs_ino(BTRFS_I(inode)), ref_index, &name);
1441  		if (ret < 0) {
1442  			goto out;
1443  		} else if (ret == 0) {
1444  			/*
1445  			 * look for a conflicting back reference in the
1446  			 * metadata. if we find one we have to unlink that name
1447  			 * of the file before we add our new link.  Later on, we
1448  			 * overwrite any existing back reference, and we don't
1449  			 * want to create dangling pointers in the directory.
1450  			 */
1451  			ret = __add_inode_ref(trans, root, path, log,
1452  					      BTRFS_I(dir), BTRFS_I(inode),
1453  					      inode_objectid, parent_objectid,
1454  					      ref_index, &name);
1455  			if (ret) {
1456  				if (ret == 1)
1457  					ret = 0;
1458  				goto out;
1459  			}
1460  
1461  			/* insert our name */
1462  			ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
1463  					     &name, 0, ref_index);
1464  			if (ret)
1465  				goto out;
1466  
1467  			ret = btrfs_update_inode(trans, BTRFS_I(inode));
1468  			if (ret)
1469  				goto out;
1470  		}
1471  		/* Else, ret == 1, we already have a perfect match, we're done. */
1472  
1473  		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + name.len;
1474  		kfree(name.name);
1475  		name.name = NULL;
1476  		if (log_ref_ver) {
1477  			iput(dir);
1478  			dir = NULL;
1479  		}
1480  	}
1481  
1482  	/*
1483  	 * Before we overwrite the inode reference item in the subvolume tree
1484  	 * with the item from the log tree, we must unlink all names from the
1485  	 * parent directory that are in the subvolume's tree inode reference
1486  	 * item, otherwise we end up with an inconsistent subvolume tree where
1487  	 * dir index entries exist for a name but there is no inode reference
1488  	 * item with the same name.
1489  	 */
1490  	ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
1491  				    key);
1492  	if (ret)
1493  		goto out;
1494  
1495  	/* finally write the back reference in the inode */
1496  	ret = overwrite_item(trans, root, path, eb, slot, key);
1497  out:
1498  	btrfs_release_path(path);
1499  	kfree(name.name);
1500  	iput(dir);
1501  	iput(inode);
1502  	return ret;
1503  }
1504  
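/*
 * Count how many names (links) the inode has recorded in INODE_EXTREF items.
 * Returns the number found, or a negative errno; -ENOENT from the extref
 * lookup simply means there are no more extended refs and the count gathered
 * so far is returned.
 */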
1505  static int count_inode_extrefs(struct btrfs_inode *inode, struct btrfs_path *path)
1506  {
1507  	int ret = 0;
1508  	int name_len;
1509  	unsigned int nlink = 0;
1510  	u32 item_size;
1511  	u32 cur_offset = 0;
1512  	u64 inode_objectid = btrfs_ino(inode);
1513  	u64 offset = 0;
1514  	unsigned long ptr;
1515  	struct btrfs_inode_extref *extref;
1516  	struct extent_buffer *leaf;
1517  
1518  	while (1) {
1519  		ret = btrfs_find_one_extref(inode->root, inode_objectid, offset,
1520  					    path, &extref, &offset);
1521  		if (ret)
1522  			break;
1523  
1524  		leaf = path->nodes[0];
1525  		item_size = btrfs_item_size(leaf, path->slots[0]);
1526  		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1527  		cur_offset = 0;
1528  
1529  		while (cur_offset < item_size) {
1530  			extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
1531  			name_len = btrfs_inode_extref_name_len(leaf, extref);
1532  
1533  			nlink++;
1534  
1535  			cur_offset += name_len + sizeof(*extref);
1536  		}
1537  
1538  		offset++;
1539  		btrfs_release_path(path);
1540  	}
1541  	btrfs_release_path(path);
1542  
1543  	if (ret < 0 && ret != -ENOENT)
1544  		return ret;
1545  	return nlink;
1546  }
1547  
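/*
 * Count how many names the inode has recorded in INODE_REF items, walking
 * the items from the highest key offset downwards.
 */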
1548  static int count_inode_refs(struct btrfs_inode *inode, struct btrfs_path *path)
1549  {
1550  	int ret;
1551  	struct btrfs_key key;
1552  	unsigned int nlink = 0;
1553  	unsigned long ptr;
1554  	unsigned long ptr_end;
1555  	int name_len;
1556  	u64 ino = btrfs_ino(inode);
1557  
1558  	key.objectid = ino;
1559  	key.type = BTRFS_INODE_REF_KEY;
1560  	key.offset = (u64)-1;
1561  
1562  	while (1) {
1563  		ret = btrfs_search_slot(NULL, inode->root, &key, path, 0, 0);
1564  		if (ret < 0)
1565  			break;
1566  		if (ret > 0) {
1567  			if (path->slots[0] == 0)
1568  				break;
1569  			path->slots[0]--;
1570  		}
1571  process_slot:
1572  		btrfs_item_key_to_cpu(path->nodes[0], &key,
1573  				      path->slots[0]);
1574  		if (key.objectid != ino ||
1575  		    key.type != BTRFS_INODE_REF_KEY)
1576  			break;
1577  		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
1578  		ptr_end = ptr + btrfs_item_size(path->nodes[0],
1579  						   path->slots[0]);
1580  		while (ptr < ptr_end) {
1581  			struct btrfs_inode_ref *ref;
1582  
1583  			ref = (struct btrfs_inode_ref *)ptr;
1584  			name_len = btrfs_inode_ref_name_len(path->nodes[0],
1585  							    ref);
1586  			ptr = (unsigned long)(ref + 1) + name_len;
1587  			nlink++;
1588  		}
1589  
1590  		if (key.offset == 0)
1591  			break;
1592  		if (path->slots[0] > 0) {
1593  			path->slots[0]--;
1594  			goto process_slot;
1595  		}
1596  		key.offset--;
1597  		btrfs_release_path(path);
1598  	}
1599  	btrfs_release_path(path);
1600  
1601  	return nlink;
1602  }
1603  
1604  /*
1605   * There are a few corners where the link count of the file can't
1606   * be properly maintained during replay.  So, instead of adding
1607   * lots of complexity to the log code, we just scan the backrefs
1608   * for any file that has been through replay.
1609   *
1610   * The scan will update the link count on the inode to reflect the
1611   * number of back refs found.  If it goes down to zero, the iput
1612   * will free the inode.
1613   */
1614  static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1615  					   struct inode *inode)
1616  {
1617  	struct btrfs_root *root = BTRFS_I(inode)->root;
1618  	struct btrfs_path *path;
1619  	int ret;
1620  	u64 nlink = 0;
1621  	u64 ino = btrfs_ino(BTRFS_I(inode));
1622  
1623  	path = btrfs_alloc_path();
1624  	if (!path)
1625  		return -ENOMEM;
1626  
1627  	ret = count_inode_refs(BTRFS_I(inode), path);
1628  	if (ret < 0)
1629  		goto out;
1630  
1631  	nlink = ret;
1632  
1633  	ret = count_inode_extrefs(BTRFS_I(inode), path);
1634  	if (ret < 0)
1635  		goto out;
1636  
1637  	nlink += ret;
1638  
1639  	ret = 0;
1640  
1641  	if (nlink != inode->i_nlink) {
1642  		set_nlink(inode, nlink);
1643  		ret = btrfs_update_inode(trans, BTRFS_I(inode));
1644  		if (ret)
1645  			goto out;
1646  	}
1647  	if (S_ISDIR(inode->i_mode))
1648  		BTRFS_I(inode)->index_cnt = (u64)-1;
1649  
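	/*
	 * No back references were found: replay any pending directory
	 * deletions (for directories) and record an orphan item so the
	 * unreferenced inode gets removed.
	 */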
1650  	if (inode->i_nlink == 0) {
1651  		if (S_ISDIR(inode->i_mode)) {
1652  			ret = replay_dir_deletes(trans, root, NULL, path,
1653  						 ino, 1);
1654  			if (ret)
1655  				goto out;
1656  		}
1657  		ret = btrfs_insert_orphan_item(trans, root, ino);
1658  		if (ret == -EEXIST)
1659  			ret = 0;
1660  	}
1661  
1662  out:
1663  	btrfs_free_path(path);
1664  	return ret;
1665  }
1666  
1667  static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1668  					    struct btrfs_root *root,
1669  					    struct btrfs_path *path)
1670  {
1671  	int ret;
1672  	struct btrfs_key key;
1673  	struct inode *inode;
1674  
1675  	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1676  	key.type = BTRFS_ORPHAN_ITEM_KEY;
1677  	key.offset = (u64)-1;
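	/*
	 * Walk all fixup (orphan) items under BTRFS_TREE_LOG_FIXUP_OBJECTID
	 * from the highest offset down, deleting each one and recounting the
	 * links of the inode it refers to.
	 */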
1678  	while (1) {
1679  		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1680  		if (ret < 0)
1681  			break;
1682  
1683  		if (ret == 1) {
1684  			ret = 0;
1685  			if (path->slots[0] == 0)
1686  				break;
1687  			path->slots[0]--;
1688  		}
1689  
1690  		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1691  		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1692  		    key.type != BTRFS_ORPHAN_ITEM_KEY)
1693  			break;
1694  
1695  		ret = btrfs_del_item(trans, root, path);
1696  		if (ret)
1697  			break;
1698  
1699  		btrfs_release_path(path);
1700  		inode = read_one_inode(root, key.offset);
1701  		if (!inode) {
1702  			ret = -EIO;
1703  			break;
1704  		}
1705  
1706  		ret = fixup_inode_link_count(trans, inode);
1707  		iput(inode);
1708  		if (ret)
1709  			break;
1710  
1711  		/*
1712  		 * fixup on a directory may create new entries,
1713  		 * make sure we always look for the highest possible
1714  		 * offset
1715  		 */
1716  		key.offset = (u64)-1;
1717  	}
1718  	btrfs_release_path(path);
1719  	return ret;
1720  }
1721  
1722  
1723  /*
1724   * record a given inode in the fixup dir so we can check its link
1725   * count when replay is done.  The link count is incremented here
1726   * so the inode won't go away until we check it
1727   */
1728  static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1729  				      struct btrfs_root *root,
1730  				      struct btrfs_path *path,
1731  				      u64 objectid)
1732  {
1733  	struct btrfs_key key;
1734  	int ret = 0;
1735  	struct inode *inode;
1736  
1737  	inode = read_one_inode(root, objectid);
1738  	if (!inode)
1739  		return -EIO;
1740  
1741  	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1742  	key.type = BTRFS_ORPHAN_ITEM_KEY;
1743  	key.offset = objectid;
1744  
1745  	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1746  
1747  	btrfs_release_path(path);
1748  	if (ret == 0) {
1749  		if (!inode->i_nlink)
1750  			set_nlink(inode, 1);
1751  		else
1752  			inc_nlink(inode);
1753  		ret = btrfs_update_inode(trans, BTRFS_I(inode));
1754  	} else if (ret == -EEXIST) {
1755  		ret = 0;
1756  	}
1757  	iput(inode);
1758  
1759  	return ret;
1760  }
1761  
1762  /*
1763   * when replaying the log for a directory, we only insert names
1764   * for inodes that actually exist.  This means an fsync on a directory
1765   * does not implicitly fsync all the new files in it
1766   */
1767  static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1768  				    struct btrfs_root *root,
1769  				    u64 dirid, u64 index,
1770  				    const struct fscrypt_str *name,
1771  				    struct btrfs_key *location)
1772  {
1773  	struct inode *inode;
1774  	struct inode *dir;
1775  	int ret;
1776  
1777  	inode = read_one_inode(root, location->objectid);
1778  	if (!inode)
1779  		return -ENOENT;
1780  
1781  	dir = read_one_inode(root, dirid);
1782  	if (!dir) {
1783  		iput(inode);
1784  		return -EIO;
1785  	}
1786  
1787  	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
1788  			     1, index);
1789  
1790  	/* FIXME, put inode into FIXUP list */
1791  
1792  	iput(inode);
1793  	iput(dir);
1794  	return ret;
1795  }
1796  
1797  static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans,
1798  					struct btrfs_inode *dir,
1799  					struct btrfs_path *path,
1800  					struct btrfs_dir_item *dst_di,
1801  					const struct btrfs_key *log_key,
1802  					u8 log_flags,
1803  					bool exists)
1804  {
1805  	struct btrfs_key found_key;
1806  
1807  	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1808  	/* The existing dentry points to the same inode, don't delete it. */
1809  	if (found_key.objectid == log_key->objectid &&
1810  	    found_key.type == log_key->type &&
1811  	    found_key.offset == log_key->offset &&
1812  	    btrfs_dir_flags(path->nodes[0], dst_di) == log_flags)
1813  		return 1;
1814  
1815  	/*
1816  	 * Don't drop the conflicting directory entry if the inode for the new
1817  	 * entry doesn't exist.
1818  	 */
1819  	if (!exists)
1820  		return 0;
1821  
1822  	return drop_one_dir_item(trans, path, dir, dst_di);
1823  }
1824  
1825  /*
1826   * take a single entry in a log directory item and replay it into
1827   * the subvolume.
1828   *
1829   * if a conflicting item exists in the subdirectory already,
1830   * the inode it points to is unlinked and put into the link count
1831   * fix up tree.
1832   *
1833   * If a name from the log points to a file or directory that does
1834   * not exist in the FS, it is skipped.  fsyncs on directories
1835   * do not force down inodes inside that directory, just changes to the
1836   * names or unlinks in a directory.
1837   *
1838   * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
1839   * non-existing inode) and 1 if the name was replayed.
1840   */
1841  static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1842  				    struct btrfs_root *root,
1843  				    struct btrfs_path *path,
1844  				    struct extent_buffer *eb,
1845  				    struct btrfs_dir_item *di,
1846  				    struct btrfs_key *key)
1847  {
1848  	struct fscrypt_str name = { 0 };
1849  	struct btrfs_dir_item *dir_dst_di;
1850  	struct btrfs_dir_item *index_dst_di;
1851  	bool dir_dst_matches = false;
1852  	bool index_dst_matches = false;
1853  	struct btrfs_key log_key;
1854  	struct btrfs_key search_key;
1855  	struct inode *dir;
1856  	u8 log_flags;
1857  	bool exists;
1858  	int ret;
1859  	bool update_size = true;
1860  	bool name_added = false;
1861  
1862  	dir = read_one_inode(root, key->objectid);
1863  	if (!dir)
1864  		return -EIO;
1865  
1866  	ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
1867  	if (ret)
1868  		goto out;
1869  
1870  	log_flags = btrfs_dir_flags(eb, di);
1871  	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
1872  	ret = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1873  	btrfs_release_path(path);
1874  	if (ret < 0)
1875  		goto out;
1876  	exists = (ret == 0);
1877  	ret = 0;
1878  
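	/*
	 * Look for an existing entry with the same name in the directory and,
	 * if it conflicts with the logged one, drop it (the inode it points
	 * to goes into the link count fixup tree).
	 */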
1879  	dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1880  					   &name, 1);
1881  	if (IS_ERR(dir_dst_di)) {
1882  		ret = PTR_ERR(dir_dst_di);
1883  		goto out;
1884  	} else if (dir_dst_di) {
1885  		ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
1886  						   dir_dst_di, &log_key,
1887  						   log_flags, exists);
1888  		if (ret < 0)
1889  			goto out;
1890  		dir_dst_matches = (ret == 1);
1891  	}
1892  
1893  	btrfs_release_path(path);
1894  
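	/* Repeat the check for a conflicting entry at this dir index number. */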
1895  	index_dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1896  						   key->objectid, key->offset,
1897  						   &name, 1);
1898  	if (IS_ERR(index_dst_di)) {
1899  		ret = PTR_ERR(index_dst_di);
1900  		goto out;
1901  	} else if (index_dst_di) {
1902  		ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
1903  						   index_dst_di, &log_key,
1904  						   log_flags, exists);
1905  		if (ret < 0)
1906  			goto out;
1907  		index_dst_matches = (ret == 1);
1908  	}
1909  
1910  	btrfs_release_path(path);
1911  
1912  	if (dir_dst_matches && index_dst_matches) {
1913  		ret = 0;
1914  		update_size = false;
1915  		goto out;
1916  	}
1917  
1918  	/*
1919  	 * Check if the inode reference exists in the log for the given name,
1920  	 * inode and parent inode
1921  	 */
1922  	search_key.objectid = log_key.objectid;
1923  	search_key.type = BTRFS_INODE_REF_KEY;
1924  	search_key.offset = key->objectid;
1925  	ret = backref_in_log(root->log_root, &search_key, 0, &name);
1926  	if (ret < 0) {
1927  	        goto out;
1928  	} else if (ret) {
1929  	        /* The dentry will be added later. */
1930  	        ret = 0;
1931  	        update_size = false;
1932  	        goto out;
1933  	}
1934  
1935  	search_key.objectid = log_key.objectid;
1936  	search_key.type = BTRFS_INODE_EXTREF_KEY;
1937  	search_key.offset = key->objectid;
1938  	ret = backref_in_log(root->log_root, &search_key, key->objectid, &name);
1939  	if (ret < 0) {
1940  		goto out;
1941  	} else if (ret) {
1942  		/* The dentry will be added later. */
1943  		ret = 0;
1944  		update_size = false;
1945  		goto out;
1946  	}
1947  	btrfs_release_path(path);
1948  	ret = insert_one_name(trans, root, key->objectid, key->offset,
1949  			      &name, &log_key);
1950  	if (ret && ret != -ENOENT && ret != -EEXIST)
1951  		goto out;
1952  	if (!ret)
1953  		name_added = true;
1954  	update_size = false;
1955  	ret = 0;
1956  
1957  out:
1958  	if (!ret && update_size) {
1959  		btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name.len * 2);
1960  		ret = btrfs_update_inode(trans, BTRFS_I(dir));
1961  	}
1962  	kfree(name.name);
1963  	iput(dir);
1964  	if (!ret && name_added)
1965  		ret = 1;
1966  	return ret;
1967  }
1968  
1969  /* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */
1970  static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1971  					struct btrfs_root *root,
1972  					struct btrfs_path *path,
1973  					struct extent_buffer *eb, int slot,
1974  					struct btrfs_key *key)
1975  {
1976  	int ret;
1977  	struct btrfs_dir_item *di;
1978  
1979  	/* We only log dir index keys, which only contain a single dir item. */
1980  	ASSERT(key->type == BTRFS_DIR_INDEX_KEY);
1981  
1982  	di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
1983  	ret = replay_one_name(trans, root, path, eb, di, key);
1984  	if (ret < 0)
1985  		return ret;
1986  
1987  	/*
1988  	 * If this entry refers to a non-directory (directories can not have a
1989  	 * link count > 1) and it was added in the transaction that was not
1990  	 * committed, make sure we fixup the link count of the inode the entry
1991  	 * points to. Otherwise something like the following would result in a
1992  	 * directory pointing to an inode with a wrong link count that does not account
1993  	 * for this dir entry:
1994  	 *
1995  	 * mkdir testdir
1996  	 * touch testdir/foo
1997  	 * touch testdir/bar
1998  	 * sync
1999  	 *
2000  	 * ln testdir/bar testdir/bar_link
2001  	 * ln testdir/foo testdir/foo_link
2002  	 * xfs_io -c "fsync" testdir/bar
2003  	 *
2004  	 * <power failure>
2005  	 *
2006  	 * mount fs, log replay happens
2007  	 *
2008  	 * File foo would remain with a link count of 1 when it has two entries
2009  	 * pointing to it in the directory testdir. This would make it impossible
2010  	 * to ever delete the parent directory as it would result in stale
2011  	 * dentries that can never be deleted.
2012  	 */
2013  	if (ret == 1 && btrfs_dir_ftype(eb, di) != BTRFS_FT_DIR) {
2014  		struct btrfs_path *fixup_path;
2015  		struct btrfs_key di_key;
2016  
2017  		fixup_path = btrfs_alloc_path();
2018  		if (!fixup_path)
2019  			return -ENOMEM;
2020  
2021  		btrfs_dir_item_key_to_cpu(eb, di, &di_key);
2022  		ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid);
2023  		btrfs_free_path(fixup_path);
2024  	}
2025  
2026  	return ret;
2027  }
2028  
2029  /*
2030   * directory replay has two parts.  There are the standard directory
2031   * items in the log copied from the subvolume, and range items
2032   * created in the log while the subvolume was logged.
2033   *
2034   * The range items tell us which parts of the key space the log
2035   * is authoritative for.  During replay, if a key in the subvolume
2036   * directory is in a logged range item, but not actually in the log
2037   * that means it was deleted from the directory before the fsync
2038   * and should be removed.
2039   */
2040  static noinline int find_dir_range(struct btrfs_root *root,
2041  				   struct btrfs_path *path,
2042  				   u64 dirid,
2043  				   u64 *start_ret, u64 *end_ret)
2044  {
2045  	struct btrfs_key key;
2046  	u64 found_end;
2047  	struct btrfs_dir_log_item *item;
2048  	int ret;
2049  	int nritems;
2050  
2051  	if (*start_ret == (u64)-1)
2052  		return 1;
2053  
2054  	key.objectid = dirid;
2055  	key.type = BTRFS_DIR_LOG_INDEX_KEY;
2056  	key.offset = *start_ret;
2057  
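	/*
	 * Search for the DIR_LOG_INDEX item at or before *start_ret; on an
	 * inexact match (ret > 0) step back one slot to the preceding item.
	 */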
2058  	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2059  	if (ret < 0)
2060  		goto out;
2061  	if (ret > 0) {
2062  		if (path->slots[0] == 0)
2063  			goto out;
2064  		path->slots[0]--;
2065  	}
2066  	if (ret != 0)
2067  		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2068  
2069  	if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
2070  		ret = 1;
2071  		goto next;
2072  	}
2073  	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2074  			      struct btrfs_dir_log_item);
2075  	found_end = btrfs_dir_log_end(path->nodes[0], item);
2076  
2077  	if (*start_ret >= key.offset && *start_ret <= found_end) {
2078  		ret = 0;
2079  		*start_ret = key.offset;
2080  		*end_ret = found_end;
2081  		goto out;
2082  	}
2083  	ret = 1;
2084  next:
2085  	/* check the next slot in the tree to see if it is a valid item */
2086  	nritems = btrfs_header_nritems(path->nodes[0]);
2087  	path->slots[0]++;
2088  	if (path->slots[0] >= nritems) {
2089  		ret = btrfs_next_leaf(root, path);
2090  		if (ret)
2091  			goto out;
2092  	}
2093  
2094  	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2095  
2096  	if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
2097  		ret = 1;
2098  		goto out;
2099  	}
2100  	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2101  			      struct btrfs_dir_log_item);
2102  	found_end = btrfs_dir_log_end(path->nodes[0], item);
2103  	*start_ret = key.offset;
2104  	*end_ret = found_end;
2105  	ret = 0;
2106  out:
2107  	btrfs_release_path(path);
2108  	return ret;
2109  }
2110  
2111  /*
2112   * this looks for a given directory item in the log.  If the directory
2113   * item is not in the log, the item is removed and the inode it points
2114   * to is unlinked
2115   */
2116  static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
2117  				      struct btrfs_root *log,
2118  				      struct btrfs_path *path,
2119  				      struct btrfs_path *log_path,
2120  				      struct inode *dir,
2121  				      struct btrfs_key *dir_key)
2122  {
2123  	struct btrfs_root *root = BTRFS_I(dir)->root;
2124  	int ret;
2125  	struct extent_buffer *eb;
2126  	int slot;
2127  	struct btrfs_dir_item *di;
2128  	struct fscrypt_str name = { 0 };
2129  	struct inode *inode = NULL;
2130  	struct btrfs_key location;
2131  
2132  	/*
2133  	 * Currently we only log dir index keys. Even if we replay a log created
2134  	 * by an older kernel that logged both dir index and dir item keys, all
2135  	 * we need to do is process the dir index keys, we (and our caller) can
2136  	 * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY).
2137  	 */
2138  	ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY);
2139  
2140  	eb = path->nodes[0];
2141  	slot = path->slots[0];
2142  	di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
2143  	ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
2144  	if (ret)
2145  		goto out;
2146  
2147  	if (log) {
2148  		struct btrfs_dir_item *log_di;
2149  
2150  		log_di = btrfs_lookup_dir_index_item(trans, log, log_path,
2151  						     dir_key->objectid,
2152  						     dir_key->offset, &name, 0);
2153  		if (IS_ERR(log_di)) {
2154  			ret = PTR_ERR(log_di);
2155  			goto out;
2156  		} else if (log_di) {
2157  			/* The dentry exists in the log, we have nothing to do. */
2158  			ret = 0;
2159  			goto out;
2160  		}
2161  	}
2162  
2163  	btrfs_dir_item_key_to_cpu(eb, di, &location);
2164  	btrfs_release_path(path);
2165  	btrfs_release_path(log_path);
2166  	inode = read_one_inode(root, location.objectid);
2167  	if (!inode) {
2168  		ret = -EIO;
2169  		goto out;
2170  	}
2171  
2172  	ret = link_to_fixup_dir(trans, root, path, location.objectid);
2173  	if (ret)
2174  		goto out;
2175  
2176  	inc_nlink(inode);
2177  	ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode),
2178  					  &name);
2179  	/*
2180  	 * Unlike dir item keys, dir index keys can only have one name (entry) in
2181  	 * them, as there are no key collisions since each key has a unique offset
2182  	 * (an index number), so we're done.
2183  	 */
2184  out:
2185  	btrfs_release_path(path);
2186  	btrfs_release_path(log_path);
2187  	kfree(name.name);
2188  	iput(inode);
2189  	return ret;
2190  }
2191  
2192  static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
2193  			      struct btrfs_root *root,
2194  			      struct btrfs_root *log,
2195  			      struct btrfs_path *path,
2196  			      const u64 ino)
2197  {
2198  	struct btrfs_key search_key;
2199  	struct btrfs_path *log_path;
2200  	int i;
2201  	int nritems;
2202  	int ret;
2203  
2204  	log_path = btrfs_alloc_path();
2205  	if (!log_path)
2206  		return -ENOMEM;
2207  
2208  	search_key.objectid = ino;
2209  	search_key.type = BTRFS_XATTR_ITEM_KEY;
2210  	search_key.offset = 0;
2211  again:
2212  	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
2213  	if (ret < 0)
2214  		goto out;
2215  process_leaf:
2216  	nritems = btrfs_header_nritems(path->nodes[0]);
2217  	for (i = path->slots[0]; i < nritems; i++) {
2218  		struct btrfs_key key;
2219  		struct btrfs_dir_item *di;
2220  		struct btrfs_dir_item *log_di;
2221  		u32 total_size;
2222  		u32 cur;
2223  
2224  		btrfs_item_key_to_cpu(path->nodes[0], &key, i);
2225  		if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
2226  			ret = 0;
2227  			goto out;
2228  		}
2229  
2230  		di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
2231  		total_size = btrfs_item_size(path->nodes[0], i);
2232  		cur = 0;
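		/*
		 * An xattr item can pack several entries (one per xattr name);
		 * check each name against the log tree and delete those that
		 * are no longer present there.
		 */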
2233  		while (cur < total_size) {
2234  			u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
2235  			u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
2236  			u32 this_len = sizeof(*di) + name_len + data_len;
2237  			char *name;
2238  
2239  			name = kmalloc(name_len, GFP_NOFS);
2240  			if (!name) {
2241  				ret = -ENOMEM;
2242  				goto out;
2243  			}
2244  			read_extent_buffer(path->nodes[0], name,
2245  					   (unsigned long)(di + 1), name_len);
2246  
2247  			log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
2248  						    name, name_len, 0);
2249  			btrfs_release_path(log_path);
2250  			if (!log_di) {
2251  				/* Doesn't exist in log tree, so delete it. */
2252  				btrfs_release_path(path);
2253  				di = btrfs_lookup_xattr(trans, root, path, ino,
2254  							name, name_len, -1);
2255  				kfree(name);
2256  				if (IS_ERR(di)) {
2257  					ret = PTR_ERR(di);
2258  					goto out;
2259  				}
2260  				ASSERT(di);
2261  				ret = btrfs_delete_one_dir_name(trans, root,
2262  								path, di);
2263  				if (ret)
2264  					goto out;
2265  				btrfs_release_path(path);
2266  				search_key = key;
2267  				goto again;
2268  			}
2269  			kfree(name);
2270  			if (IS_ERR(log_di)) {
2271  				ret = PTR_ERR(log_di);
2272  				goto out;
2273  			}
2274  			cur += this_len;
2275  			di = (struct btrfs_dir_item *)((char *)di + this_len);
2276  		}
2277  	}
2278  	ret = btrfs_next_leaf(root, path);
2279  	if (ret > 0)
2280  		ret = 0;
2281  	else if (ret == 0)
2282  		goto process_leaf;
2283  out:
2284  	btrfs_free_path(log_path);
2285  	btrfs_release_path(path);
2286  	return ret;
2287  }
2288  
2289  
2290  /*
2291   * deletion replay happens before we copy any new directory items
2292   * out of the log or out of backreferences from inodes.  It
2293   * scans the log to find ranges of keys that the log is authoritative for,
2294   * and then scans the directory to find items in those ranges that are
2295   * not present in the log.
2296   *
2297   * Anything we don't find in the log is unlinked and removed from the
2298   * directory.
2299   */
2300  static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
2301  				       struct btrfs_root *root,
2302  				       struct btrfs_root *log,
2303  				       struct btrfs_path *path,
2304  				       u64 dirid, int del_all)
2305  {
2306  	u64 range_start;
2307  	u64 range_end;
2308  	int ret = 0;
2309  	struct btrfs_key dir_key;
2310  	struct btrfs_key found_key;
2311  	struct btrfs_path *log_path;
2312  	struct inode *dir;
2313  
2314  	dir_key.objectid = dirid;
2315  	dir_key.type = BTRFS_DIR_INDEX_KEY;
2316  	log_path = btrfs_alloc_path();
2317  	if (!log_path)
2318  		return -ENOMEM;
2319  
2320  	dir = read_one_inode(root, dirid);
2321  	/* it isn't an error if the inode isn't there, that can happen
2322  	 * because we replay the deletes before we copy in the inode item
2323  	 * from the log
2324  	 */
2325  	if (!dir) {
2326  		btrfs_free_path(log_path);
2327  		return 0;
2328  	}
2329  
2330  	range_start = 0;
2331  	range_end = 0;
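	/*
	 * For each index range the log is authoritative for, walk the
	 * subvolume's dir index items within that range and unlink every
	 * entry that does not exist in the log.
	 */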
2332  	while (1) {
2333  		if (del_all)
2334  			range_end = (u64)-1;
2335  		else {
2336  			ret = find_dir_range(log, path, dirid,
2337  					     &range_start, &range_end);
2338  			if (ret < 0)
2339  				goto out;
2340  			else if (ret > 0)
2341  				break;
2342  		}
2343  
2344  		dir_key.offset = range_start;
2345  		while (1) {
2346  			int nritems;
2347  			ret = btrfs_search_slot(NULL, root, &dir_key, path,
2348  						0, 0);
2349  			if (ret < 0)
2350  				goto out;
2351  
2352  			nritems = btrfs_header_nritems(path->nodes[0]);
2353  			if (path->slots[0] >= nritems) {
2354  				ret = btrfs_next_leaf(root, path);
2355  				if (ret == 1)
2356  					break;
2357  				else if (ret < 0)
2358  					goto out;
2359  			}
2360  			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2361  					      path->slots[0]);
2362  			if (found_key.objectid != dirid ||
2363  			    found_key.type != dir_key.type) {
2364  				ret = 0;
2365  				goto out;
2366  			}
2367  
2368  			if (found_key.offset > range_end)
2369  				break;
2370  
2371  			ret = check_item_in_log(trans, log, path,
2372  						log_path, dir,
2373  						&found_key);
2374  			if (ret)
2375  				goto out;
2376  			if (found_key.offset == (u64)-1)
2377  				break;
2378  			dir_key.offset = found_key.offset + 1;
2379  		}
2380  		btrfs_release_path(path);
2381  		if (range_end == (u64)-1)
2382  			break;
2383  		range_start = range_end + 1;
2384  	}
2385  	ret = 0;
2386  out:
2387  	btrfs_release_path(path);
2388  	btrfs_free_path(log_path);
2389  	iput(dir);
2390  	return ret;
2391  }
2392  
2393  /*
2394   * the process_func used to replay items from the log tree.  This
2395   * gets called in two different stages.  The first stage just looks
2396   * for inodes and makes sure they are all copied into the subvolume.
2397   *
2398   * The second stage copies all the other item types from the log into
2399   * the subvolume.  The two stage approach is slower, but gets rid of
2400   * lots of complexity around inodes referencing other inodes that exist
2401   * only in the log (references come from either directory items or inode
2402   * back refs).
2403   */
2404  static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
2405  			     struct walk_control *wc, u64 gen, int level)
2406  {
2407  	int nritems;
2408  	struct btrfs_tree_parent_check check = {
2409  		.transid = gen,
2410  		.level = level
2411  	};
2412  	struct btrfs_path *path;
2413  	struct btrfs_root *root = wc->replay_dest;
2414  	struct btrfs_key key;
2415  	int i;
2416  	int ret;
2417  
2418  	ret = btrfs_read_extent_buffer(eb, &check);
2419  	if (ret)
2420  		return ret;
2421  
2422  	level = btrfs_header_level(eb);
2423  
2424  	if (level != 0)
2425  		return 0;
2426  
2427  	path = btrfs_alloc_path();
2428  	if (!path)
2429  		return -ENOMEM;
2430  
2431  	nritems = btrfs_header_nritems(eb);
2432  	for (i = 0; i < nritems; i++) {
2433  		btrfs_item_key_to_cpu(eb, &key, i);
2434  
2435  		/* inode keys are done during the first stage */
2436  		if (key.type == BTRFS_INODE_ITEM_KEY &&
2437  		    wc->stage == LOG_WALK_REPLAY_INODES) {
2438  			struct btrfs_inode_item *inode_item;
2439  			u32 mode;
2440  
2441  			inode_item = btrfs_item_ptr(eb, i,
2442  					    struct btrfs_inode_item);
2443  			/*
2444  			 * If we have a tmpfile (O_TMPFILE) that got fsync'ed
2445  			 * and never got linked before the fsync, skip it, as
2446  			 * replaying it is pointless since it would be deleted
2447  			 * later. We skip logging tmpfiles, but it's always
2448  			 * possible we are replaying a log created with a kernel
2449  			 * that used to log tmpfiles.
2450  			 */
2451  			if (btrfs_inode_nlink(eb, inode_item) == 0) {
2452  				wc->ignore_cur_inode = true;
2453  				continue;
2454  			} else {
2455  				wc->ignore_cur_inode = false;
2456  			}
2457  			ret = replay_xattr_deletes(wc->trans, root, log,
2458  						   path, key.objectid);
2459  			if (ret)
2460  				break;
2461  			mode = btrfs_inode_mode(eb, inode_item);
2462  			if (S_ISDIR(mode)) {
2463  				ret = replay_dir_deletes(wc->trans,
2464  					 root, log, path, key.objectid, 0);
2465  				if (ret)
2466  					break;
2467  			}
2468  			ret = overwrite_item(wc->trans, root, path,
2469  					     eb, i, &key);
2470  			if (ret)
2471  				break;
2472  
2473  			/*
2474  			 * Before replaying extents, truncate the inode to its
2475  			 * size. We need to do it now and not after log replay
2476  			 * because before an fsync we can have prealloc extents
2477  			 * added beyond the inode's i_size. If we did it after,
2478  			 * through orphan cleanup for example, we would drop
2479  			 * those prealloc extents just after replaying them.
2480  			 */
2481  			if (S_ISREG(mode)) {
2482  				struct btrfs_drop_extents_args drop_args = { 0 };
2483  				struct inode *inode;
2484  				u64 from;
2485  
2486  				inode = read_one_inode(root, key.objectid);
2487  				if (!inode) {
2488  					ret = -EIO;
2489  					break;
2490  				}
2491  				from = ALIGN(i_size_read(inode),
2492  					     root->fs_info->sectorsize);
2493  				drop_args.start = from;
2494  				drop_args.end = (u64)-1;
2495  				drop_args.drop_cache = true;
2496  				ret = btrfs_drop_extents(wc->trans, root,
2497  							 BTRFS_I(inode),
2498  							 &drop_args);
2499  				if (!ret) {
2500  					inode_sub_bytes(inode,
2501  							drop_args.bytes_found);
2502  					/* Update the inode's nbytes. */
2503  					ret = btrfs_update_inode(wc->trans,
2504  								 BTRFS_I(inode));
2505  				}
2506  				iput(inode);
2507  				if (ret)
2508  					break;
2509  			}
2510  
2511  			ret = link_to_fixup_dir(wc->trans, root,
2512  						path, key.objectid);
2513  			if (ret)
2514  				break;
2515  		}
2516  
2517  		if (wc->ignore_cur_inode)
2518  			continue;
2519  
2520  		if (key.type == BTRFS_DIR_INDEX_KEY &&
2521  		    wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
2522  			ret = replay_one_dir_item(wc->trans, root, path,
2523  						  eb, i, &key);
2524  			if (ret)
2525  				break;
2526  		}
2527  
2528  		if (wc->stage < LOG_WALK_REPLAY_ALL)
2529  			continue;
2530  
2531  		/* these keys are simply copied */
2532  		if (key.type == BTRFS_XATTR_ITEM_KEY) {
2533  			ret = overwrite_item(wc->trans, root, path,
2534  					     eb, i, &key);
2535  			if (ret)
2536  				break;
2537  		} else if (key.type == BTRFS_INODE_REF_KEY ||
2538  			   key.type == BTRFS_INODE_EXTREF_KEY) {
2539  			ret = add_inode_ref(wc->trans, root, log, path,
2540  					    eb, i, &key);
2541  			if (ret && ret != -ENOENT)
2542  				break;
2543  			ret = 0;
2544  		} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
2545  			ret = replay_one_extent(wc->trans, root, path,
2546  						eb, i, &key);
2547  			if (ret)
2548  				break;
2549  		}
2550  		/*
2551  		 * We don't log BTRFS_DIR_ITEM_KEY keys anymore, only the
2552  		 * BTRFS_DIR_INDEX_KEY items which we use to derive the
2553  		 * BTRFS_DIR_ITEM_KEY items. If we are replaying a log from an
2554  		 * older kernel with such keys, ignore them.
2555  		 */
2556  	}
2557  	btrfs_free_path(path);
2558  	return ret;
2559  }
2560  
2561  /*
2562   * Correctly adjust the reserved bytes occupied by a log tree extent buffer
2563   */
2564  static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
2565  {
2566  	struct btrfs_block_group *cache;
2567  
2568  	cache = btrfs_lookup_block_group(fs_info, start);
2569  	if (!cache) {
2570  		btrfs_err(fs_info, "unable to find block group for %llu", start);
2571  		return;
2572  	}
2573  
2574  	spin_lock(&cache->space_info->lock);
2575  	spin_lock(&cache->lock);
2576  	cache->reserved -= fs_info->nodesize;
2577  	cache->space_info->bytes_reserved -= fs_info->nodesize;
2578  	spin_unlock(&cache->lock);
2579  	spin_unlock(&cache->space_info->lock);
2580  
2581  	btrfs_put_block_group(cache);
2582  }
2583  
2584  static int clean_log_buffer(struct btrfs_trans_handle *trans,
2585  			    struct extent_buffer *eb)
2586  {
2587  	int ret;
2588  
2589  	btrfs_tree_lock(eb);
2590  	btrfs_clear_buffer_dirty(trans, eb);
2591  	wait_on_extent_buffer_writeback(eb);
2592  	btrfs_tree_unlock(eb);
2593  
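	/*
	 * With a transaction, pin the extent so its space is reclaimed when
	 * the transaction commits; without one, only the block group's
	 * reserved byte accounting needs to be adjusted.
	 */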
2594  	if (trans) {
2595  		ret = btrfs_pin_reserved_extent(trans, eb);
2596  		if (ret)
2597  			return ret;
2598  	} else {
2599  		unaccount_log_buffer(eb->fs_info, eb->start);
2600  	}
2601  
2602  	return 0;
2603  }
2604  
2605  static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
2606  				   struct btrfs_root *root,
2607  				   struct btrfs_path *path, int *level,
2608  				   struct walk_control *wc)
2609  {
2610  	struct btrfs_fs_info *fs_info = root->fs_info;
2611  	u64 bytenr;
2612  	u64 ptr_gen;
2613  	struct extent_buffer *next;
2614  	struct extent_buffer *cur;
2615  	int ret = 0;
2616  
2617  	while (*level > 0) {
2618  		struct btrfs_tree_parent_check check = { 0 };
2619  
2620  		cur = path->nodes[*level];
2621  
2622  		WARN_ON(btrfs_header_level(cur) != *level);
2623  
2624  		if (path->slots[*level] >=
2625  		    btrfs_header_nritems(cur))
2626  			break;
2627  
2628  		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2629  		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
2630  		check.transid = ptr_gen;
2631  		check.level = *level - 1;
2632  		check.has_first_key = true;
2633  		btrfs_node_key_to_cpu(cur, &check.first_key, path->slots[*level]);
2634  
2635  		next = btrfs_find_create_tree_block(fs_info, bytenr,
2636  						    btrfs_header_owner(cur),
2637  						    *level - 1);
2638  		if (IS_ERR(next))
2639  			return PTR_ERR(next);
2640  
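		/*
		 * At level 1 the child blocks are leaves: hand each one to
		 * process_func (pin or replay) and, when freeing the log,
		 * read it and release its space, without descending further.
		 */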
2641  		if (*level == 1) {
2642  			ret = wc->process_func(root, next, wc, ptr_gen,
2643  					       *level - 1);
2644  			if (ret) {
2645  				free_extent_buffer(next);
2646  				return ret;
2647  			}
2648  
2649  			path->slots[*level]++;
2650  			if (wc->free) {
2651  				ret = btrfs_read_extent_buffer(next, &check);
2652  				if (ret) {
2653  					free_extent_buffer(next);
2654  					return ret;
2655  				}
2656  
2657  				ret = clean_log_buffer(trans, next);
2658  				if (ret) {
2659  					free_extent_buffer(next);
2660  					return ret;
2661  				}
2662  			}
2663  			free_extent_buffer(next);
2664  			continue;
2665  		}
2666  		ret = btrfs_read_extent_buffer(next, &check);
2667  		if (ret) {
2668  			free_extent_buffer(next);
2669  			return ret;
2670  		}
2671  
2672  		if (path->nodes[*level-1])
2673  			free_extent_buffer(path->nodes[*level-1]);
2674  		path->nodes[*level-1] = next;
2675  		*level = btrfs_header_level(next);
2676  		path->slots[*level] = 0;
2677  		cond_resched();
2678  	}
2679  	path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
2680  
2681  	cond_resched();
2682  	return 0;
2683  }
2684  
2685  static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
2686  				 struct btrfs_root *root,
2687  				 struct btrfs_path *path, int *level,
2688  				 struct walk_control *wc)
2689  {
2690  	int i;
2691  	int slot;
2692  	int ret;
2693  
2694  	for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
2695  		slot = path->slots[i];
2696  		if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
2697  			path->slots[i]++;
2698  			*level = i;
2699  			WARN_ON(*level == 0);
2700  			return 0;
2701  		} else {
2702  			ret = wc->process_func(root, path->nodes[*level], wc,
2703  				 btrfs_header_generation(path->nodes[*level]),
2704  				 *level);
2705  			if (ret)
2706  				return ret;
2707  
2708  			if (wc->free) {
2709  				ret = clean_log_buffer(trans, path->nodes[*level]);
2710  				if (ret)
2711  					return ret;
2712  			}
2713  			free_extent_buffer(path->nodes[*level]);
2714  			path->nodes[*level] = NULL;
2715  			*level = i + 1;
2716  		}
2717  	}
2718  	return 1;
2719  }
2720  
2721  /*
2722   * drop the reference count on the tree rooted at 'log'.  This traverses
2723   * the tree freeing any blocks that have a ref count of zero after being
2724   * decremented.
2725   */
2726  static int walk_log_tree(struct btrfs_trans_handle *trans,
2727  			 struct btrfs_root *log, struct walk_control *wc)
2728  {
2729  	int ret = 0;
2730  	int wret;
2731  	int level;
2732  	struct btrfs_path *path;
2733  	int orig_level;
2734  
2735  	path = btrfs_alloc_path();
2736  	if (!path)
2737  		return -ENOMEM;
2738  
2739  	level = btrfs_header_level(log->node);
2740  	orig_level = level;
2741  	path->nodes[level] = log->node;
2742  	atomic_inc(&log->node->refs);
2743  	path->slots[level] = 0;
2744  
2745  	while (1) {
2746  		wret = walk_down_log_tree(trans, log, path, &level, wc);
2747  		if (wret > 0)
2748  			break;
2749  		if (wret < 0) {
2750  			ret = wret;
2751  			goto out;
2752  		}
2753  
2754  		wret = walk_up_log_tree(trans, log, path, &level, wc);
2755  		if (wret > 0)
2756  			break;
2757  		if (wret < 0) {
2758  			ret = wret;
2759  			goto out;
2760  		}
2761  	}
2762  
2763  	/* was the root node processed? if not, catch it here */
2764  	if (path->nodes[orig_level]) {
2765  		ret = wc->process_func(log, path->nodes[orig_level], wc,
2766  			 btrfs_header_generation(path->nodes[orig_level]),
2767  			 orig_level);
2768  		if (ret)
2769  			goto out;
2770  		if (wc->free)
2771  			ret = clean_log_buffer(trans, path->nodes[orig_level]);
2772  	}
2773  
2774  out:
2775  	btrfs_free_path(path);
2776  	return ret;
2777  }
2778  
2779  /*
2780   * helper function to update the item for a given subvolumes log root
2781   * in the tree of log roots
2782   */
2783  static int update_log_root(struct btrfs_trans_handle *trans,
2784  			   struct btrfs_root *log,
2785  			   struct btrfs_root_item *root_item)
2786  {
2787  	struct btrfs_fs_info *fs_info = log->fs_info;
2788  	int ret;
2789  
2790  	if (log->log_transid == 1) {
2791  		/* insert root item on the first sync */
2792  		ret = btrfs_insert_root(trans, fs_info->log_root_tree,
2793  				&log->root_key, root_item);
2794  	} else {
2795  		ret = btrfs_update_root(trans, fs_info->log_root_tree,
2796  				&log->root_key, root_item);
2797  	}
2798  	return ret;
2799  }
2800  
2801  static void wait_log_commit(struct btrfs_root *root, int transid)
2802  {
2803  	DEFINE_WAIT(wait);
2804  	int index = transid % 2;
2805  
2806  	/*
2807  	 * we only allow two pending log transactions at a time,
2808  	 * so we know that if ours is more than 2 older than the
2809  	 * current transaction, we're done
2810  	 */
2811  	for (;;) {
2812  		prepare_to_wait(&root->log_commit_wait[index],
2813  				&wait, TASK_UNINTERRUPTIBLE);
2814  
2815  		if (!(root->log_transid_committed < transid &&
2816  		      atomic_read(&root->log_commit[index])))
2817  			break;
2818  
2819  		mutex_unlock(&root->log_mutex);
2820  		schedule();
2821  		mutex_lock(&root->log_mutex);
2822  	}
2823  	finish_wait(&root->log_commit_wait[index], &wait);
2824  }
2825  
2826  static void wait_for_writer(struct btrfs_root *root)
2827  {
2828  	DEFINE_WAIT(wait);
2829  
2830  	for (;;) {
2831  		prepare_to_wait(&root->log_writer_wait, &wait,
2832  				TASK_UNINTERRUPTIBLE);
2833  		if (!atomic_read(&root->log_writers))
2834  			break;
2835  
2836  		mutex_unlock(&root->log_mutex);
2837  		schedule();
2838  		mutex_lock(&root->log_mutex);
2839  	}
2840  	finish_wait(&root->log_writer_wait, &wait);
2841  }
2842  
2843  void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, struct btrfs_inode *inode)
2844  {
2845  	ctx->log_ret = 0;
2846  	ctx->log_transid = 0;
2847  	ctx->log_new_dentries = false;
2848  	ctx->logging_new_name = false;
2849  	ctx->logging_new_delayed_dentries = false;
2850  	ctx->logged_before = false;
2851  	ctx->inode = inode;
2852  	INIT_LIST_HEAD(&ctx->list);
2853  	INIT_LIST_HEAD(&ctx->ordered_extents);
2854  	INIT_LIST_HEAD(&ctx->conflict_inodes);
2855  	ctx->num_conflict_inodes = 0;
2856  	ctx->logging_conflict_inodes = false;
2857  	ctx->scratch_eb = NULL;
2858  }
2859  
2860  void btrfs_init_log_ctx_scratch_eb(struct btrfs_log_ctx *ctx)
2861  {
2862  	struct btrfs_inode *inode = ctx->inode;
2863  
2864  	if (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
2865  	    !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
2866  		return;
2867  
2868  	/*
2869  	 * Don't care about allocation failure. This is just for optimization,
2870  	 * if we fail to allocate here, we will try again later if needed.
2871  	 */
2872  	ctx->scratch_eb = alloc_dummy_extent_buffer(inode->root->fs_info, 0);
2873  }
2874  
2875  void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx)
2876  {
2877  	struct btrfs_ordered_extent *ordered;
2878  	struct btrfs_ordered_extent *tmp;
2879  
2880  	btrfs_assert_inode_locked(ctx->inode);
2881  
2882  	list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
2883  		list_del_init(&ordered->log_list);
2884  		btrfs_put_ordered_extent(ordered);
2885  	}
2886  }
2887  
2888  
2889  static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
2890  					struct btrfs_log_ctx *ctx)
2891  {
2892  	mutex_lock(&root->log_mutex);
2893  	list_del_init(&ctx->list);
2894  	mutex_unlock(&root->log_mutex);
2895  }
2896  
2897  /*
2898   * Invoked in log mutex context, or be sure there is no other task which
2899   * can access the list.
2900   */
2901  static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
2902  					     int index, int error)
2903  {
2904  	struct btrfs_log_ctx *ctx;
2905  	struct btrfs_log_ctx *safe;
2906  
2907  	list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
2908  		list_del_init(&ctx->list);
2909  		ctx->log_ret = error;
2910  	}
2911  }
2912  
2913  /*
2914   * Sends a given tree log down to the disk and updates the super blocks to
2915   * record it.  When this call is done, you know that any inodes previously
2916   * logged are safely on disk only if it returns 0.
2917   *
2918   * Any other return value means you need to call btrfs_commit_transaction.
2919   * Some of the edge cases for fsyncing directories that have had unlinks
2920   * or renames done in the past mean that sometimes the only safe
2921   * fsync is to commit the whole FS.  When btrfs_sync_log returns
2922   * BTRFS_LOG_FORCE_COMMIT, that has happened.
2923   */
2924  int btrfs_sync_log(struct btrfs_trans_handle *trans,
2925  		   struct btrfs_root *root, struct btrfs_log_ctx *ctx)
2926  {
2927  	int index1;
2928  	int index2;
2929  	int mark;
2930  	int ret;
2931  	struct btrfs_fs_info *fs_info = root->fs_info;
2932  	struct btrfs_root *log = root->log_root;
2933  	struct btrfs_root *log_root_tree = fs_info->log_root_tree;
2934  	struct btrfs_root_item new_root_item;
2935  	int log_transid = 0;
2936  	struct btrfs_log_ctx root_log_ctx;
2937  	struct blk_plug plug;
2938  	u64 log_root_start;
2939  	u64 log_root_level;
2940  
2941  	mutex_lock(&root->log_mutex);
2942  	log_transid = ctx->log_transid;
2943  	if (root->log_transid_committed >= log_transid) {
2944  		mutex_unlock(&root->log_mutex);
2945  		return ctx->log_ret;
2946  	}
2947  
2948  	index1 = log_transid % 2;
2949  	if (atomic_read(&root->log_commit[index1])) {
2950  		wait_log_commit(root, log_transid);
2951  		mutex_unlock(&root->log_mutex);
2952  		return ctx->log_ret;
2953  	}
2954  	ASSERT(log_transid == root->log_transid);
2955  	atomic_set(&root->log_commit[index1], 1);
2956  
2957  	/* wait for previous tree log sync to complete */
2958  	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2959  		wait_log_commit(root, log_transid - 1);
2960  
2961  	while (1) {
2962  		int batch = atomic_read(&root->log_batch);
2963  		/* when we're on an ssd, just kick the log commit out */
2964  		if (!btrfs_test_opt(fs_info, SSD) &&
2965  		    test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
2966  			mutex_unlock(&root->log_mutex);
2967  			schedule_timeout_uninterruptible(1);
2968  			mutex_lock(&root->log_mutex);
2969  		}
2970  		wait_for_writer(root);
2971  		if (batch == atomic_read(&root->log_batch))
2972  			break;
2973  	}
2974  
2975  	/* bail out if we need to do a full commit */
2976  	if (btrfs_need_log_full_commit(trans)) {
2977  		ret = BTRFS_LOG_FORCE_COMMIT;
2978  		mutex_unlock(&root->log_mutex);
2979  		goto out;
2980  	}
2981  
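	/*
	 * Even and odd log transactions use different extent bits (DIRTY vs
	 * NEW) to track their dirty log pages, since two log transactions can
	 * be pending at the same time.
	 */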
2982  	if (log_transid % 2 == 0)
2983  		mark = EXTENT_DIRTY;
2984  	else
2985  		mark = EXTENT_NEW;
2986  
2987  	/* we start IO on all the marked extents here, but we don't actually
2988  	 * wait for them until later.
2989  	 */
2990  	blk_start_plug(&plug);
2991  	ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
2992  	/*
2993  	 * -EAGAIN happens when someone, e.g., a concurrent transaction
2994  	 *  commit, writes a dirty extent in this tree-log commit. This
2995  	 *  concurrent write will create a hole writing out the extents,
2996  	 *  and we cannot proceed on a zoned filesystem, requiring
2997  	 *  sequential writing. While we could bail out to a full commit
2998  	 *  here, we can instead continue, hoping the concurrent writing
2999  	 *  fills the hole.
3000  	 */
3001  	if (ret == -EAGAIN && btrfs_is_zoned(fs_info))
3002  		ret = 0;
3003  	if (ret) {
3004  		blk_finish_plug(&plug);
3005  		btrfs_set_log_full_commit(trans);
3006  		mutex_unlock(&root->log_mutex);
3007  		goto out;
3008  	}
3009  
3010  	/*
3011  	 * We _must_ update under the root->log_mutex in order to make sure we
3012  	 * have a consistent view of the log root we are trying to commit at
3013  	 * this moment.
3014  	 *
3015  	 * We _must_ copy this into a local copy, because we are not holding the
3016  	 * log_root_tree->log_mutex yet.  This is important because when we
3017  	 * commit the log_root_tree we must have a consistent view of the
3018  	 * log_root_tree when we update the super block to point at the
3019  	 * log_root_tree bytenr.  If we update the log_root_tree here we'll race
3020  	 * with the commit and possibly point at the new block which we may not
3021  	 * have written out.
3022  	 */
3023  	btrfs_set_root_node(&log->root_item, log->node);
3024  	memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
3025  
3026  	btrfs_set_root_log_transid(root, root->log_transid + 1);
3027  	log->log_transid = root->log_transid;
3028  	root->log_start_pid = 0;
3029  	/*
3030  	 * IO has been started, blocks of the log tree have WRITTEN flag set
3031  	 * in their headers. New modifications of the log will be written to
3032  	 * new positions, so it's safe to allow log writers to go in.
3033  	 */
3034  	mutex_unlock(&root->log_mutex);
3035  
3036  	if (btrfs_is_zoned(fs_info)) {
3037  		mutex_lock(&fs_info->tree_root->log_mutex);
3038  		if (!log_root_tree->node) {
3039  			ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
3040  			if (ret) {
3041  				mutex_unlock(&fs_info->tree_root->log_mutex);
3042  				blk_finish_plug(&plug);
3043  				goto out;
3044  			}
3045  		}
3046  		mutex_unlock(&fs_info->tree_root->log_mutex);
3047  	}
3048  
3049  	btrfs_init_log_ctx(&root_log_ctx, NULL);
3050  
3051  	mutex_lock(&log_root_tree->log_mutex);
3052  
3053  	index2 = log_root_tree->log_transid % 2;
3054  	list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
3055  	root_log_ctx.log_transid = log_root_tree->log_transid;
3056  
3057  	/*
3058  	 * Now we are safe to update the log_root_tree because we're under the
3059  	 * log_mutex, and we're a current writer so we're holding the commit
3060  	 * open until we drop the log_mutex.
3061  	 */
3062  	ret = update_log_root(trans, log, &new_root_item);
3063  	if (ret) {
3064  		list_del_init(&root_log_ctx.list);
3065  		blk_finish_plug(&plug);
3066  		btrfs_set_log_full_commit(trans);
3067  		if (ret != -ENOSPC)
3068  			btrfs_err(fs_info,
3069  				  "failed to update log for root %llu ret %d",
3070  				  btrfs_root_id(root), ret);
3071  		btrfs_wait_tree_log_extents(log, mark);
3072  		mutex_unlock(&log_root_tree->log_mutex);
3073  		goto out;
3074  	}
3075  
3076  	if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
3077  		blk_finish_plug(&plug);
3078  		list_del_init(&root_log_ctx.list);
3079  		mutex_unlock(&log_root_tree->log_mutex);
3080  		ret = root_log_ctx.log_ret;
3081  		goto out;
3082  	}
3083  
3084  	if (atomic_read(&log_root_tree->log_commit[index2])) {
3085  		blk_finish_plug(&plug);
3086  		ret = btrfs_wait_tree_log_extents(log, mark);
3087  		wait_log_commit(log_root_tree,
3088  				root_log_ctx.log_transid);
3089  		mutex_unlock(&log_root_tree->log_mutex);
3090  		if (!ret)
3091  			ret = root_log_ctx.log_ret;
3092  		goto out;
3093  	}
3094  	ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
3095  	atomic_set(&log_root_tree->log_commit[index2], 1);
3096  
3097  	if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
3098  		wait_log_commit(log_root_tree,
3099  				root_log_ctx.log_transid - 1);
3100  	}
3101  
3102  	/*
3103  	 * now that we've moved on to the tree of log tree roots,
3104  	 * check the full commit flag again
3105  	 */
3106  	if (btrfs_need_log_full_commit(trans)) {
3107  		blk_finish_plug(&plug);
3108  		btrfs_wait_tree_log_extents(log, mark);
3109  		mutex_unlock(&log_root_tree->log_mutex);
3110  		ret = BTRFS_LOG_FORCE_COMMIT;
3111  		goto out_wake_log_root;
3112  	}
3113  
3114  	ret = btrfs_write_marked_extents(fs_info,
3115  					 &log_root_tree->dirty_log_pages,
3116  					 EXTENT_DIRTY | EXTENT_NEW);
3117  	blk_finish_plug(&plug);
3118  	/*
3119  	 * As described above, -EAGAIN indicates a hole in the extents. We
3120  	 * cannot wait for these writeouts since waiting for them would cause
3121  	 * a deadlock. Bail out to the full commit instead.
3122  	 */
3123  	if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) {
3124  		btrfs_set_log_full_commit(trans);
3125  		btrfs_wait_tree_log_extents(log, mark);
3126  		mutex_unlock(&log_root_tree->log_mutex);
3127  		goto out_wake_log_root;
3128  	} else if (ret) {
3129  		btrfs_set_log_full_commit(trans);
3130  		mutex_unlock(&log_root_tree->log_mutex);
3131  		goto out_wake_log_root;
3132  	}
3133  	ret = btrfs_wait_tree_log_extents(log, mark);
3134  	if (!ret)
3135  		ret = btrfs_wait_tree_log_extents(log_root_tree,
3136  						  EXTENT_NEW | EXTENT_DIRTY);
3137  	if (ret) {
3138  		btrfs_set_log_full_commit(trans);
3139  		mutex_unlock(&log_root_tree->log_mutex);
3140  		goto out_wake_log_root;
3141  	}
3142  
3143  	log_root_start = log_root_tree->node->start;
3144  	log_root_level = btrfs_header_level(log_root_tree->node);
3145  	log_root_tree->log_transid++;
3146  	mutex_unlock(&log_root_tree->log_mutex);
3147  
3148  	/*
3149  	 * Here we are guaranteed that nobody is going to write the superblock
3150  	 * for the current transaction before us, and that we do not write
3151  	 * our superblock before the previous transaction finishes its commit
3152  	 * and writes its superblock, because:
3153  	 *
3154  	 * 1) We are holding a handle on the current transaction, so nobody
3155  	 *    can commit it until we release the handle;
3156  	 *
3157  	 * 2) Before writing our superblock we acquire the tree_log_mutex, so
3158  	 *    if the previous transaction is still committing, and hasn't yet
3159  	 *    written its superblock, we wait for it to do it, because a
3160  	 *    transaction commit acquires the tree_log_mutex when the commit
3161  	 *    begins and releases it only after writing its superblock.
3162  	 */
3163  	mutex_lock(&fs_info->tree_log_mutex);
3164  
3165  	/*
3166  	 * The previous transaction writeout phase could have failed, and thus
3167  	 * marked the fs in an error state.  We must not commit here, as we
3168  	 * could have updated our generation in the super_for_commit and
3169  	 * writing the super here would result in transid mismatches.  If there
3170  	 * is an error here just bail.
3171  	 */
3172  	if (BTRFS_FS_ERROR(fs_info)) {
3173  		ret = -EIO;
3174  		btrfs_set_log_full_commit(trans);
3175  		btrfs_abort_transaction(trans, ret);
3176  		mutex_unlock(&fs_info->tree_log_mutex);
3177  		goto out_wake_log_root;
3178  	}
3179  
3180  	btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start);
3181  	btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level);
3182  	ret = write_all_supers(fs_info, 1);
3183  	mutex_unlock(&fs_info->tree_log_mutex);
3184  	if (ret) {
3185  		btrfs_set_log_full_commit(trans);
3186  		btrfs_abort_transaction(trans, ret);
3187  		goto out_wake_log_root;
3188  	}
3189  
3190  	/*
3191  	 * We know there can only be one task here, since we have not yet set
3192  	 * root->log_commit[index1] to 0 and any task attempting to sync the
3193  	 * log must wait for the previous log transaction to commit if it's
3194  	 * still in progress or wait for the current log transaction commit if
3195  	 * someone else already started it. We use <= and not < because the
3196  	 * first log transaction has an ID of 0.
3197  	 */
3198  	ASSERT(btrfs_get_root_last_log_commit(root) <= log_transid);
3199  	btrfs_set_root_last_log_commit(root, log_transid);
3200  
3201  out_wake_log_root:
3202  	mutex_lock(&log_root_tree->log_mutex);
3203  	btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
3204  
3205  	log_root_tree->log_transid_committed++;
3206  	atomic_set(&log_root_tree->log_commit[index2], 0);
3207  	mutex_unlock(&log_root_tree->log_mutex);
3208  
3209  	/*
3210  	 * The barrier before waitqueue_active (in cond_wake_up) is needed so
3211  	 * all the updates above are seen by the woken threads. It might not be
3212  	 * necessary, but proving that seems to be hard.
3213  	 */
3214  	cond_wake_up(&log_root_tree->log_commit_wait[index2]);
3215  out:
3216  	mutex_lock(&root->log_mutex);
3217  	btrfs_remove_all_log_ctxs(root, index1, ret);
3218  	root->log_transid_committed++;
3219  	atomic_set(&root->log_commit[index1], 0);
3220  	mutex_unlock(&root->log_mutex);
3221  
3222  	/*
3223  	 * The barrier before waitqueue_active (in cond_wake_up) is needed so
3224  	 * all the updates above are seen by the woken threads. It might not be
3225  	 * necessary, but proving that seems to be hard.
3226  	 */
3227  	cond_wake_up(&root->log_commit_wait[index1]);
3228  	return ret;
3229  }
3230  
3231  static void free_log_tree(struct btrfs_trans_handle *trans,
3232  			  struct btrfs_root *log)
3233  {
3234  	int ret;
3235  	struct walk_control wc = {
3236  		.free = 1,
3237  		.process_func = process_one_buffer
3238  	};
3239  
3240  	if (log->node) {
3241  		ret = walk_log_tree(trans, log, &wc);
3242  		if (ret) {
3243  			/*
3244  			 * We weren't able to traverse the entire log tree, the
3245  			 * typical scenario is getting an -EIO when reading an
3246  			 * extent buffer of the tree, due to a previous writeback
3247  			 * failure of it.
3248  			 */
3249  			set_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR,
3250  				&log->fs_info->fs_state);
3251  
3252  			/*
3253  			 * Some extent buffers of the log tree may still be dirty
3254  			 * and not yet written back to storage, because we may
3255  			 * have updates to a log tree without syncing a log tree,
3256  			 * such as during rename and link operations. So flush
3257  			 * them out and wait for their writeback to complete, so
3258  			 * that we properly cleanup their state and pages.
3259  			 */
3260  			btrfs_write_marked_extents(log->fs_info,
3261  						   &log->dirty_log_pages,
3262  						   EXTENT_DIRTY | EXTENT_NEW);
3263  			btrfs_wait_tree_log_extents(log,
3264  						    EXTENT_DIRTY | EXTENT_NEW);
3265  
3266  			if (trans)
3267  				btrfs_abort_transaction(trans, ret);
3268  			else
3269  				btrfs_handle_fs_error(log->fs_info, ret, NULL);
3270  		}
3271  	}
3272  
3273  	extent_io_tree_release(&log->dirty_log_pages);
3274  	extent_io_tree_release(&log->log_csum_range);
3275  
3276  	btrfs_put_root(log);
3277  }
3278  
3279  /*
3280   * free all the extents used by the tree log.  This should be called
3281   * at commit time of the full transaction
3282   */
3283  int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
3284  {
3285  	if (root->log_root) {
3286  		free_log_tree(trans, root->log_root);
3287  		root->log_root = NULL;
3288  		clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
3289  	}
3290  	return 0;
3291  }
3292  
3293  int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
3294  			     struct btrfs_fs_info *fs_info)
3295  {
3296  	if (fs_info->log_root_tree) {
3297  		free_log_tree(trans, fs_info->log_root_tree);
3298  		fs_info->log_root_tree = NULL;
3299  		clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state);
3300  	}
3301  	return 0;
3302  }
3303  
3304  /*
3305   * Check if an inode was logged in the current transaction. This correctly deals
3306   * with the case where the inode was logged but has a logged_trans of 0, which
3307   * happens if the inode is evicted and loaded again, as logged_trans is an in
3308   * memory only field (not persisted).
3309   *
3310   * Returns 1 if the inode was logged before in the transaction, 0 if it was not,
3311   * and < 0 on error.
3312   */
3313  static int inode_logged(const struct btrfs_trans_handle *trans,
3314  			struct btrfs_inode *inode,
3315  			struct btrfs_path *path_in)
3316  {
3317  	struct btrfs_path *path = path_in;
3318  	struct btrfs_key key;
3319  	int ret;
3320  
3321  	if (inode->logged_trans == trans->transid)
3322  		return 1;
3323  
3324  	/*
3325  	 * If logged_trans is not 0, then we know the inode was not logged in
3326  	 * this transaction, so we can return false right away.
3327  	 */
3328  	if (inode->logged_trans > 0)
3329  		return 0;
3330  
3331  	/*
3332  	 * If no log tree was created for this root in this transaction, then
3333  	 * the inode can not have been logged in this transaction. In that case
3334  	 * set logged_trans to anything greater than 0 and less than the current
3335  	 * transaction's ID, to avoid the search below in a future call in case
3336  	 * a log tree gets created after this.
3337  	 */
3338  	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) {
3339  		inode->logged_trans = trans->transid - 1;
3340  		return 0;
3341  	}
3342  
3343  	/*
3344  	 * We have a log tree and the inode's logged_trans is 0. We can't tell
3345  	 * for sure if the inode was logged before in this transaction by looking
3346  	 * only at logged_trans. We could be pessimistic and assume it was, but
3347  	 * that can lead to unnecessarily logging an inode during rename and link
3348  	 * operations, and then further updating the log in followup rename and
3349  	 * link operations, especially if it's a directory, which adds latency
3350  	 * visible to applications doing a series of rename or link operations.
3351  	 *
3352  	 * A logged_trans of 0 here can mean several things:
3353  	 *
3354  	 * 1) The inode was never logged since the filesystem was mounted, and may
3355  	 *    or may not have been evicted and loaded again;
3356  	 *
3357  	 * 2) The inode was logged in a previous transaction, then evicted and
3358  	 *    then loaded again;
3359  	 *
3360  	 * 3) The inode was logged in the current transaction, then evicted and
3361  	 *    then loaded again.
3362  	 *
3363  	 * For cases 1) and 2) we don't want to return true, but we need to detect
3364  	 * case 3) and return true. So we do a search in the log root for the inode
3365  	 * item.
3366  	 */
3367  	key.objectid = btrfs_ino(inode);
3368  	key.type = BTRFS_INODE_ITEM_KEY;
3369  	key.offset = 0;
3370  
3371  	if (!path) {
3372  		path = btrfs_alloc_path();
3373  		if (!path)
3374  			return -ENOMEM;
3375  	}
3376  
3377  	ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
3378  
3379  	if (path_in)
3380  		btrfs_release_path(path);
3381  	else
3382  		btrfs_free_path(path);
3383  
3384  	/*
3385  	 * Logging an inode always results in logging its inode item. So if we
3386  	 * did not find the item we know the inode was not logged for sure.
3387  	 */
3388  	if (ret < 0) {
3389  		return ret;
3390  	} else if (ret > 0) {
3391  		/*
3392  		 * Set logged_trans to a value greater than 0 and less than the
3393  		 * current transaction to avoid doing the search in future calls.
3394  		 */
3395  		inode->logged_trans = trans->transid - 1;
3396  		return 0;
3397  	}
3398  
3399  	/*
3400  	 * The inode was previously logged and then evicted, set logged_trans to
3401  	 * the current transaction's ID, to avoid future tree searches as long as
3402  	 * the inode is not evicted again.
3403  	 */
3404  	inode->logged_trans = trans->transid;
3405  
3406  	/*
3407  	 * If it's a directory, then we must set last_dir_index_offset to the
3408  	 * maximum possible value, so that the next attempt to log the inode does
3409  	 * not skip checking if dir index keys found in modified subvolume tree
3410  	 * leaves have been logged before, otherwise it would result in attempts
3411  	 * to insert duplicate dir index keys in the log tree. This must be done
3412  	 * because last_dir_index_offset is an in-memory only field, not persisted
3413  	 * in the inode item or any other on-disk structure, so its value is lost
3414  	 * once the inode is evicted.
3415  	 */
3416  	if (S_ISDIR(inode->vfs_inode.i_mode))
3417  		inode->last_dir_index_offset = (u64)-1;
3418  
3419  	return 1;
3420  }
3421  
3422  /*
3423   * Delete a directory entry from the log if it exists.
3424   *
3425   * Returns < 0 on error
3426   *           1 if the entry does not exist
3427   *           0 if the entry existed and was successfully deleted
3428   */
3429  static int del_logged_dentry(struct btrfs_trans_handle *trans,
3430  			     struct btrfs_root *log,
3431  			     struct btrfs_path *path,
3432  			     u64 dir_ino,
3433  			     const struct fscrypt_str *name,
3434  			     u64 index)
3435  {
3436  	struct btrfs_dir_item *di;
3437  
3438  	/*
3439  	 * We only log dir index items of a directory, so we don't need to look
3440  	 * for dir item keys.
3441  	 */
3442  	di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
3443  					 index, name, -1);
3444  	if (IS_ERR(di))
3445  		return PTR_ERR(di);
3446  	else if (!di)
3447  		return 1;
3448  
3449  	/*
3450  	 * We do not need to update the size field of the directory's
3451  	 * inode item because on log replay we update the field to reflect
3452  	 * all existing entries in the directory (see overwrite_item()).
3453  	 */
3454  	return btrfs_delete_one_dir_name(trans, log, path, di);
3455  }
3456  
3457  /*
3458   * If both a file and directory are logged, and unlinks or renames are
3459   * mixed in, we have a few interesting corners:
3460   *
3461   * create file X in dir Y
3462   * link file X to X.link in dir Y
3463   * fsync file X
3464   * unlink file X but leave X.link
3465   * fsync dir Y
3466   *
3467   * After a crash we would expect only X.link to exist.  But file X
3468   * didn't get fsync'd again so the log has back refs for X and X.link.
3469   *
3470   * We solve this by removing directory entries and inode backrefs from the
3471   * log when a file that was logged in the current transaction is
3472   * unlinked.  Any later fsync will include the updated log entries, and
3473   * we'll be able to reconstruct the proper directory items from backrefs.
3474   *
3475   * This optimization allows us to avoid relogging the entire inode
3476   * or the entire directory.
3477   */
3478  void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
3479  				  struct btrfs_root *root,
3480  				  const struct fscrypt_str *name,
3481  				  struct btrfs_inode *dir, u64 index)
3482  {
3483  	struct btrfs_path *path;
3484  	int ret;
3485  
3486  	ret = inode_logged(trans, dir, NULL);
3487  	if (ret == 0)
3488  		return;
3489  	else if (ret < 0) {
3490  		btrfs_set_log_full_commit(trans);
3491  		return;
3492  	}
3493  
3494  	ret = join_running_log_trans(root);
3495  	if (ret)
3496  		return;
3497  
3498  	mutex_lock(&dir->log_mutex);
3499  
3500  	path = btrfs_alloc_path();
3501  	if (!path) {
3502  		ret = -ENOMEM;
3503  		goto out_unlock;
3504  	}
3505  
3506  	ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir),
3507  				name, index);
3508  	btrfs_free_path(path);
3509  out_unlock:
3510  	mutex_unlock(&dir->log_mutex);
3511  	if (ret < 0)
3512  		btrfs_set_log_full_commit(trans);
3513  	btrfs_end_log_trans(root);
3514  }
3515  
3516  /* see comments for btrfs_del_dir_entries_in_log */
3517  void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
3518  				struct btrfs_root *root,
3519  				const struct fscrypt_str *name,
3520  				struct btrfs_inode *inode, u64 dirid)
3521  {
3522  	struct btrfs_root *log;
3523  	u64 index;
3524  	int ret;
3525  
3526  	ret = inode_logged(trans, inode, NULL);
3527  	if (ret == 0)
3528  		return;
3529  	else if (ret < 0) {
3530  		btrfs_set_log_full_commit(trans);
3531  		return;
3532  	}
3533  
3534  	ret = join_running_log_trans(root);
3535  	if (ret)
3536  		return;
3537  	log = root->log_root;
3538  	mutex_lock(&inode->log_mutex);
3539  
3540  	ret = btrfs_del_inode_ref(trans, log, name, btrfs_ino(inode),
3541  				  dirid, &index);
3542  	mutex_unlock(&inode->log_mutex);
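	/*
	 * A return value of -ENOENT from btrfs_del_inode_ref() just means the
	 * inode ref was not present in the log tree, which is fine and does
	 * not require falling back to a full transaction commit.
	 */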
3543  	if (ret < 0 && ret != -ENOENT)
3544  		btrfs_set_log_full_commit(trans);
3545  	btrfs_end_log_trans(root);
3546  }
3547  
3548  /*
3549   * creates a range item in the log for 'dirid'.  first_offset and
3550   * last_offset tell us which parts of the key space the log should
3551   * be considered authoritative for.
3552   */
3553  static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
3554  				       struct btrfs_root *log,
3555  				       struct btrfs_path *path,
3556  				       u64 dirid,
3557  				       u64 first_offset, u64 last_offset)
3558  {
3559  	int ret;
3560  	struct btrfs_key key;
3561  	struct btrfs_dir_log_item *item;
3562  
3563  	key.objectid = dirid;
3564  	key.offset = first_offset;
3565  	key.type = BTRFS_DIR_LOG_INDEX_KEY;
3566  	ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
3567  	/*
3568  	 * -EEXIST is fine and can happen sporadically when we are logging a
3569  	 * directory and have concurrent insertions in the subvolume's tree for
3570  	 * items from other inodes and that result in pushing off some dir items
3571  	 * from one leaf to another in order to accommodate the new items.
3572  	 * This results in logging the same dir index range key.
3573  	 */
3574  	if (ret && ret != -EEXIST)
3575  		return ret;
3576  
3577  	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3578  			      struct btrfs_dir_log_item);
3579  	if (ret == -EEXIST) {
3580  		const u64 curr_end = btrfs_dir_log_end(path->nodes[0], item);
3581  
3582  		/*
3583  		 * btrfs_del_dir_entries_in_log() might have been called during
3584  		 * an unlink between the initial insertion of this key and the
3585  		 * current update, or we might be logging a single entry deletion
3586  		 * during a rename, so set the new last_offset to the max value.
3587  		 */
3588  		last_offset = max(last_offset, curr_end);
3589  	}
3590  	btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
3591  	btrfs_mark_buffer_dirty(trans, path->nodes[0]);
3592  	btrfs_release_path(path);
3593  	return 0;
3594  }
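
/*
 * For illustration (arbitrary index values): if a directory has dir index
 * keys at offsets 10 and 14 from past transactions and offsets 11-13 were
 * deleted, the directory logging code inserts a BTRFS_DIR_LOG_INDEX_KEY
 * range item covering [11, 13]. At log replay time the log is authoritative
 * for that range, so any dir entry with an index in [11, 13] still present
 * in the subvolume tree is removed, replaying the deletions.
 */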
3595  
3596  static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
3597  				 struct btrfs_inode *inode,
3598  				 struct extent_buffer *src,
3599  				 struct btrfs_path *dst_path,
3600  				 int start_slot,
3601  				 int count)
3602  {
3603  	struct btrfs_root *log = inode->root->log_root;
3604  	char *ins_data = NULL;
3605  	struct btrfs_item_batch batch;
3606  	struct extent_buffer *dst;
3607  	unsigned long src_offset;
3608  	unsigned long dst_offset;
3609  	u64 last_index;
3610  	struct btrfs_key key;
3611  	u32 item_size;
3612  	int ret;
3613  	int i;
3614  
3615  	ASSERT(count > 0);
3616  	batch.nr = count;
3617  
3618  	if (count == 1) {
3619  		btrfs_item_key_to_cpu(src, &key, start_slot);
3620  		item_size = btrfs_item_size(src, start_slot);
3621  		batch.keys = &key;
3622  		batch.data_sizes = &item_size;
3623  		batch.total_data_size = item_size;
3624  	} else {
3625  		struct btrfs_key *ins_keys;
3626  		u32 *ins_sizes;
3627  
3628  		ins_data = kmalloc(count * sizeof(u32) +
3629  				   count * sizeof(struct btrfs_key), GFP_NOFS);
3630  		if (!ins_data)
3631  			return -ENOMEM;
3632  
3633  		ins_sizes = (u32 *)ins_data;
3634  		ins_keys = (struct btrfs_key *)(ins_data + count * sizeof(u32));
3635  		batch.keys = ins_keys;
3636  		batch.data_sizes = ins_sizes;
3637  		batch.total_data_size = 0;
3638  
3639  		for (i = 0; i < count; i++) {
3640  			const int slot = start_slot + i;
3641  
3642  			btrfs_item_key_to_cpu(src, &ins_keys[i], slot);
3643  			ins_sizes[i] = btrfs_item_size(src, slot);
3644  			batch.total_data_size += ins_sizes[i];
3645  		}
3646  	}
3647  
3648  	ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
3649  	if (ret)
3650  		goto out;
3651  
3652  	dst = dst_path->nodes[0];
3653  	/*
3654  	 * Copy all the items in bulk, in a single copy operation. Item data is
3655  	 * organized such that it's placed at the end of a leaf and from right
3656  	 * to left. For example, the data for the second item ends at an offset
3657  	 * that matches the offset where the data for the first item starts, the
3658  	 * data for the third item ends at an offset that matches the offset
3659  	 * where the data of the second item starts, and so on.
3660  	 * Therefore our source and destination start offsets for copy match the
3661  	 * offsets of the last items (highest slots).
3662  	 */
3663  	dst_offset = btrfs_item_ptr_offset(dst, dst_path->slots[0] + count - 1);
3664  	src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1);
3665  	copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size);
3666  	btrfs_release_path(dst_path);
3667  
3668  	last_index = batch.keys[count - 1].offset;
3669  	ASSERT(last_index > inode->last_dir_index_offset);
3670  
3671  	/*
3672  	 * If for some unexpected reason the last item's index is not greater
3673  	 * than the last index we logged, warn and force a transaction commit.
3674  	 */
3675  	if (WARN_ON(last_index <= inode->last_dir_index_offset))
3676  		ret = BTRFS_LOG_FORCE_COMMIT;
3677  	else
3678  		inode->last_dir_index_offset = last_index;
3679  
3680  	if (btrfs_get_first_dir_index_to_log(inode) == 0)
3681  		btrfs_set_first_dir_index_to_log(inode, batch.keys[0].offset);
3682  out:
3683  	kfree(ins_data);
3684  
3685  	return ret;
3686  }
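
/*
 * Rough sketch of the leaf layout relied on by the bulk copy in
 * flush_dir_items_batch() (item headers grow from the front of the leaf,
 * item data grows backwards from the end):
 *
 *   | hdr 0 | hdr 1 | hdr 2 | ... free space ... | data 2 | data 1 | data 0 |
 *
 * The data of consecutive items is contiguous and laid out from right to
 * left, so copying batch.total_data_size bytes starting at the data offset
 * of the last (highest slot) item copies all the batched items in one go.
 */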
3687  
3688  static int clone_leaf(struct btrfs_path *path, struct btrfs_log_ctx *ctx)
3689  {
3690  	const int slot = path->slots[0];
3691  
3692  	if (ctx->scratch_eb) {
3693  		copy_extent_buffer_full(ctx->scratch_eb, path->nodes[0]);
3694  	} else {
3695  		ctx->scratch_eb = btrfs_clone_extent_buffer(path->nodes[0]);
3696  		if (!ctx->scratch_eb)
3697  			return -ENOMEM;
3698  	}
3699  
3700  	btrfs_release_path(path);
3701  	path->nodes[0] = ctx->scratch_eb;
3702  	path->slots[0] = slot;
3703  	/*
3704  	 * Add extra ref to scratch eb so that it is not freed when callers
3705  	 * release the path, so we can reuse it later if needed.
3706  	 */
3707  	atomic_inc(&ctx->scratch_eb->refs);
3708  
3709  	return 0;
3710  }
3711  
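/*
 * Log the dir index items of a leaf that belong to the given directory.
 *
 * Returns 1 if a key beyond the directory's dir index items was found in the
 * leaf (nothing left to log), 0 if the caller should continue with the next
 * leaf, and < 0 on error.
 */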
3712  static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
3713  				  struct btrfs_inode *inode,
3714  				  struct btrfs_path *path,
3715  				  struct btrfs_path *dst_path,
3716  				  struct btrfs_log_ctx *ctx,
3717  				  u64 *last_old_dentry_offset)
3718  {
3719  	struct btrfs_root *log = inode->root->log_root;
3720  	struct extent_buffer *src;
3721  	const int nritems = btrfs_header_nritems(path->nodes[0]);
3722  	const u64 ino = btrfs_ino(inode);
3723  	bool last_found = false;
3724  	int batch_start = 0;
3725  	int batch_size = 0;
3726  	int ret;
3727  
3728  	/*
3729  	 * We need to clone the leaf, release the read lock on it, and use the
3730  	 * clone before modifying the log tree. See the comment at copy_items()
3731  	 * about why we need to do this.
3732  	 */
3733  	ret = clone_leaf(path, ctx);
3734  	if (ret < 0)
3735  		return ret;
3736  
3737  	src = path->nodes[0];
3738  
3739  	for (int i = path->slots[0]; i < nritems; i++) {
3740  		struct btrfs_dir_item *di;
3741  		struct btrfs_key key;
3742  		int ret;
3743  
3744  		btrfs_item_key_to_cpu(src, &key, i);
3745  
3746  		if (key.objectid != ino || key.type != BTRFS_DIR_INDEX_KEY) {
3747  			last_found = true;
3748  			break;
3749  		}
3750  
3751  		di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
3752  
3753  		/*
3754  		 * Skip ranges of items that consist only of dir item keys created
3755  		 * in past transactions. However if we find a gap, we must log a
3756  		 * dir index range item for that gap, so that index keys in that
3757  		 * gap are deleted during log replay.
3758  		 */
3759  		if (btrfs_dir_transid(src, di) < trans->transid) {
3760  			if (key.offset > *last_old_dentry_offset + 1) {
3761  				ret = insert_dir_log_key(trans, log, dst_path,
3762  						 ino, *last_old_dentry_offset + 1,
3763  						 key.offset - 1);
3764  				if (ret < 0)
3765  					return ret;
3766  			}
3767  
3768  			*last_old_dentry_offset = key.offset;
3769  			continue;
3770  		}
3771  
3772  		/* If we logged this dir index item before, we can skip it. */
3773  		if (key.offset <= inode->last_dir_index_offset)
3774  			continue;
3775  
3776  		/*
3777  		 * We must make sure that when we log a directory entry, the
3778  		 * corresponding inode, after log replay, has a matching link
3779  		 * count. For example:
3780  		 *
3781  		 * touch foo
3782  		 * mkdir mydir
3783  		 * sync
3784  		 * ln foo mydir/bar
3785  		 * xfs_io -c "fsync" mydir
3786  		 * <crash>
3787  		 * <mount fs and log replay>
3788  		 *
3789  		 * Would result in a fsync log that when replayed, our file inode
3790  		 * would have a link count of 1, but we get two directory entries
3791  		 * pointing to the same inode. After removing one of the names,
3792  		 * it would not be possible to remove the other name, which
3793  		 * resulted always in stale file handle errors, and would not be
3794  		 * possible to rmdir the parent directory, since its i_size could
3795  		 * never be decremented to the value BTRFS_EMPTY_DIR_SIZE,
3796  		 * resulting in -ENOTEMPTY errors.
3797  		 */
3798  		if (!ctx->log_new_dentries) {
3799  			struct btrfs_key di_key;
3800  
3801  			btrfs_dir_item_key_to_cpu(src, di, &di_key);
3802  			if (di_key.type != BTRFS_ROOT_ITEM_KEY)
3803  				ctx->log_new_dentries = true;
3804  		}
3805  
3806  		if (batch_size == 0)
3807  			batch_start = i;
3808  		batch_size++;
3809  	}
3810  
3811  	if (batch_size > 0) {
3812  		int ret;
3813  
3814  		ret = flush_dir_items_batch(trans, inode, src, dst_path,
3815  					    batch_start, batch_size);
3816  		if (ret < 0)
3817  			return ret;
3818  	}
3819  
3820  	return last_found ? 1 : 0;
3821  }
3822  
3823  /*
3824   * log all the items included in the current transaction for a given
3825   * directory.  This also creates the range items in the log tree required
3826   * to replay anything deleted before the fsync
3827   */
3828  static noinline int log_dir_items(struct btrfs_trans_handle *trans,
3829  			  struct btrfs_inode *inode,
3830  			  struct btrfs_path *path,
3831  			  struct btrfs_path *dst_path,
3832  			  struct btrfs_log_ctx *ctx,
3833  			  u64 min_offset, u64 *last_offset_ret)
3834  {
3835  	struct btrfs_key min_key;
3836  	struct btrfs_root *root = inode->root;
3837  	struct btrfs_root *log = root->log_root;
3838  	int ret;
3839  	u64 last_old_dentry_offset = min_offset - 1;
3840  	u64 last_offset = (u64)-1;
3841  	u64 ino = btrfs_ino(inode);
3842  
3843  	min_key.objectid = ino;
3844  	min_key.type = BTRFS_DIR_INDEX_KEY;
3845  	min_key.offset = min_offset;
3846  
3847  	ret = btrfs_search_forward(root, &min_key, path, trans->transid);
3848  
3849  	/*
3850  	 * we didn't find anything from this transaction, see if there
3851  	 * is anything at all
3852  	 */
3853  	if (ret != 0 || min_key.objectid != ino ||
3854  	    min_key.type != BTRFS_DIR_INDEX_KEY) {
3855  		min_key.objectid = ino;
3856  		min_key.type = BTRFS_DIR_INDEX_KEY;
3857  		min_key.offset = (u64)-1;
3858  		btrfs_release_path(path);
3859  		ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3860  		if (ret < 0) {
3861  			btrfs_release_path(path);
3862  			return ret;
3863  		}
3864  		ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
3865  
3866  		/* if ret == 0 there are items for this type,
3867  		 * create a range to tell us the last key of this type.
3868  		 * otherwise, there are no items in this directory after
3869  		 * *min_offset, and we create a range to indicate that.
3870  		 */
3871  		if (ret == 0) {
3872  			struct btrfs_key tmp;
3873  
3874  			btrfs_item_key_to_cpu(path->nodes[0], &tmp,
3875  					      path->slots[0]);
3876  			if (tmp.type == BTRFS_DIR_INDEX_KEY)
3877  				last_old_dentry_offset = tmp.offset;
3878  		} else if (ret > 0) {
3879  			ret = 0;
3880  		}
3881  
3882  		goto done;
3883  	}
3884  
3885  	/* go backward to find any previous key */
3886  	ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
3887  	if (ret == 0) {
3888  		struct btrfs_key tmp;
3889  
3890  		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3891  		/*
3892  		 * The dir index key before the first one we found that needs to
3893  		 * be logged might be in a previous leaf, and there might be a
3894  		 * gap between these keys, meaning that we had deletions that
3895  		 * happened. So the key range item we log (key type
3896  		 * BTRFS_DIR_LOG_INDEX_KEY) must cover a range that starts at the
3897  		 * previous key's offset plus 1, so that those deletes are replayed.
3898  		 */
3899  		if (tmp.type == BTRFS_DIR_INDEX_KEY)
3900  			last_old_dentry_offset = tmp.offset;
3901  	} else if (ret < 0) {
3902  		goto done;
3903  	}
3904  
3905  	btrfs_release_path(path);
3906  
3907  	/*
3908  	 * Find the first key from this transaction again or the one we were at
3909  	 * in the loop below in case we had to reschedule. We may be logging the
3910  	 * directory without holding its VFS lock, which happens when logging new
3911  	 * dentries (through log_new_dir_dentries()) or in some cases when we
3912  	 * need to log the parent directory of an inode. This means a dir index
3913  	 * key might be deleted from the inode's root, and therefore we may not
3914  	 * find it anymore. If we can't find it, just move to the next key. We
3915  	 * can not bail out and ignore, because if we do that we will simply
3916  	 * not log dir index keys that come after the one that was just deleted
3917  	 * and we can end up logging a dir index range that ends at (u64)-1
3918  	 * (@last_offset is initialized to that), resulting in removing dir
3919  	 * entries we should not remove at log replay time.
3920  	 */
3921  search:
3922  	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3923  	if (ret > 0) {
3924  		ret = btrfs_next_item(root, path);
3925  		if (ret > 0) {
3926  			/* There are no more keys in the inode's root. */
3927  			ret = 0;
3928  			goto done;
3929  		}
3930  	}
3931  	if (ret < 0)
3932  		goto done;
3933  
3934  	/*
3935  	 * we have a block from this transaction, log every item in it
3936  	 * from our directory
3937  	 */
3938  	while (1) {
3939  		ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx,
3940  					     &last_old_dentry_offset);
3941  		if (ret != 0) {
3942  			if (ret > 0)
3943  				ret = 0;
3944  			goto done;
3945  		}
3946  		path->slots[0] = btrfs_header_nritems(path->nodes[0]);
3947  
3948  		/*
3949  		 * look ahead to the next item and see if it is also
3950  		 * from this directory and from this transaction
3951  		 */
3952  		ret = btrfs_next_leaf(root, path);
3953  		if (ret) {
3954  			if (ret == 1) {
3955  				last_offset = (u64)-1;
3956  				ret = 0;
3957  			}
3958  			goto done;
3959  		}
3960  		btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]);
3961  		if (min_key.objectid != ino || min_key.type != BTRFS_DIR_INDEX_KEY) {
3962  			last_offset = (u64)-1;
3963  			goto done;
3964  		}
3965  		if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
3966  			/*
3967  			 * The next leaf was not changed in the current transaction
3968  			 * and has at least one dir index key.
3969  			 * We check for the next key because there might have been
3970  			 * one or more deletions between the last key we logged and
3971  			 * that next key. So the key range item we log (key type
3972  			 * BTRFS_DIR_LOG_INDEX_KEY) must end at the next key's
3973  			 * offset minus 1, so that those deletes are replayed.
3974  			 */
3975  			last_offset = min_key.offset - 1;
3976  			goto done;
3977  		}
3978  		if (need_resched()) {
3979  			btrfs_release_path(path);
3980  			cond_resched();
3981  			goto search;
3982  		}
3983  	}
3984  done:
3985  	btrfs_release_path(path);
3986  	btrfs_release_path(dst_path);
3987  
3988  	if (ret == 0) {
3989  		*last_offset_ret = last_offset;
3990  		/*
3991  		 * In case the leaf was changed in the current transaction but
3992  		 * all its dir items are from a past transaction, the last item
3993  		 * in the leaf is a dir item and there's no gap between that last
3994  		 * dir item and the first one on the next leaf (which did not
3995  		 * change in the current transaction), then we don't need to log
3996  		 * a range, last_old_dentry_offset is == to last_offset.
3997  		 */
3998  		ASSERT(last_old_dentry_offset <= last_offset);
3999  		if (last_old_dentry_offset < last_offset)
4000  			ret = insert_dir_log_key(trans, log, path, ino,
4001  						 last_old_dentry_offset + 1,
4002  						 last_offset);
4003  	}
4004  
4005  	return ret;
4006  }
4007  
4008  /*
4009   * If the inode was logged before and it was evicted, then its
4010   * last_dir_index_offset is (u64)-1, so we don't know the value of the last index
4011   * key offset. If that's the case, search for it and update the inode. This
4012   * is to avoid lookups in the log tree every time we try to insert a dir index
4013   * key from a leaf changed in the current transaction, and to allow us to always
4014   * do batch insertions of dir index keys.
4015   */
4016  static int update_last_dir_index_offset(struct btrfs_inode *inode,
4017  					struct btrfs_path *path,
4018  					const struct btrfs_log_ctx *ctx)
4019  {
4020  	const u64 ino = btrfs_ino(inode);
4021  	struct btrfs_key key;
4022  	int ret;
4023  
4024  	lockdep_assert_held(&inode->log_mutex);
4025  
4026  	if (inode->last_dir_index_offset != (u64)-1)
4027  		return 0;
4028  
4029  	if (!ctx->logged_before) {
4030  		inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1;
4031  		return 0;
4032  	}
4033  
4034  	key.objectid = ino;
4035  	key.type = BTRFS_DIR_INDEX_KEY;
4036  	key.offset = (u64)-1;
4037  
4038  	ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
4039  	/*
4040  	 * An error happened or we actually have an index key with an offset
4041  	 * value of (u64)-1. Bail out, we're done.
4042  	 */
4043  	if (ret <= 0)
4044  		goto out;
4045  
4046  	ret = 0;
4047  	inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1;
4048  
4049  	/*
4050  	 * No dir index items, bail out and leave last_dir_index_offset with
4051  	 * the value right before the first valid index value.
4052  	 */
4053  	if (path->slots[0] == 0)
4054  		goto out;
4055  
4056  	/*
4057  	 * btrfs_search_slot() left us at one slot beyond the slot with the last
4058  	 * index key, or beyond the last key of the directory that is not an
4059  	 * index key. If we have an index key before, set last_dir_index_offset
4060  	 * to its offset value, otherwise leave it with a value right before the
4061  	 * first valid index value, as it means we have an empty directory.
4062  	 */
4063  	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
4064  	if (key.objectid == ino && key.type == BTRFS_DIR_INDEX_KEY)
4065  		inode->last_dir_index_offset = key.offset;
4066  
4067  out:
4068  	btrfs_release_path(path);
4069  
4070  	return ret;
4071  }
4072  
4073  /*
4074   * logging directories is very similar to logging inodes, we find all the items
4075   * from the current transaction and write them to the log.
4076   *
4077   * The recovery code scans the directory in the subvolume, and if it finds a
4078   * key in the range logged that is not present in the log tree, then it means
4079   * that dir entry was unlinked during the transaction.
4080   *
4081   * In order for that scan to work, we must include one key smaller than
4082   * the smallest logged by this transaction and one key larger than the largest
4083   * key logged by this transaction.
4084   */
4085  static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
4086  			  struct btrfs_inode *inode,
4087  			  struct btrfs_path *path,
4088  			  struct btrfs_path *dst_path,
4089  			  struct btrfs_log_ctx *ctx)
4090  {
4091  	u64 min_key;
4092  	u64 max_key;
4093  	int ret;
4094  
4095  	ret = update_last_dir_index_offset(inode, path, ctx);
4096  	if (ret)
4097  		return ret;
4098  
4099  	min_key = BTRFS_DIR_START_INDEX;
4100  	max_key = 0;
4101  
4102  	while (1) {
4103  		ret = log_dir_items(trans, inode, path, dst_path,
4104  				ctx, min_key, &max_key);
4105  		if (ret)
4106  			return ret;
4107  		if (max_key == (u64)-1)
4108  			break;
4109  		min_key = max_key + 1;
4110  	}
4111  
4112  	return 0;
4113  }
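
/*
 * The loop in log_directory_changes() walks the directory's index key space
 * in chunks, for example:
 *
 *   [BTRFS_DIR_START_INDEX, max1], [max1 + 1, max2], ..., [maxN + 1, (u64)-1]
 *
 * Each log_dir_items() call logs the items found starting at min_key and
 * returns, through last_offset_ret, the end of the range the log is now
 * authoritative for; the walk stops once that end reaches (u64)-1.
 */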
4114  
4115  /*
4116   * a helper function to drop items from the log before we relog an
4117   * inode.  max_key_type indicates the highest item type to remove.
4118   * This cannot be run for file data extents because it does not
4119   * free the extents they point to.
4120   */
4121  static int drop_inode_items(struct btrfs_trans_handle *trans,
4122  				  struct btrfs_root *log,
4123  				  struct btrfs_path *path,
4124  				  struct btrfs_inode *inode,
4125  				  int max_key_type)
4126  {
4127  	int ret;
4128  	struct btrfs_key key;
4129  	struct btrfs_key found_key;
4130  	int start_slot;
4131  
4132  	key.objectid = btrfs_ino(inode);
4133  	key.type = max_key_type;
4134  	key.offset = (u64)-1;
4135  
4136  	while (1) {
4137  		ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
4138  		if (ret < 0) {
4139  			break;
4140  		} else if (ret > 0) {
4141  			if (path->slots[0] == 0)
4142  				break;
4143  			path->slots[0]--;
4144  		}
4145  
4146  		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
4147  				      path->slots[0]);
4148  
4149  		if (found_key.objectid != key.objectid)
4150  			break;
4151  
4152  		found_key.offset = 0;
4153  		found_key.type = 0;
4154  		ret = btrfs_bin_search(path->nodes[0], 0, &found_key, &start_slot);
4155  		if (ret < 0)
4156  			break;
4157  
4158  		ret = btrfs_del_items(trans, log, path, start_slot,
4159  				      path->slots[0] - start_slot + 1);
4160  		/*
4161  		 * If start slot isn't 0 then we don't need to re-search, we've
4162  		 * found the last guy with the objectid in this tree.
4163  		 */
4164  		if (ret || start_slot != 0)
4165  			break;
4166  		btrfs_release_path(path);
4167  	}
4168  	btrfs_release_path(path);
4169  	if (ret > 0)
4170  		ret = 0;
4171  	return ret;
4172  }
4173  
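/*
 * Truncate items of an inode in the log tree. Extent reference updates are
 * skipped (skip_ref_updates) since the log tree does not itself hold
 * references on data extents, those are tracked through the subvolume tree,
 * so dropping file extent items from the log must not touch them.
 */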
4174  static int truncate_inode_items(struct btrfs_trans_handle *trans,
4175  				struct btrfs_root *log_root,
4176  				struct btrfs_inode *inode,
4177  				u64 new_size, u32 min_type)
4178  {
4179  	struct btrfs_truncate_control control = {
4180  		.new_size = new_size,
4181  		.ino = btrfs_ino(inode),
4182  		.min_type = min_type,
4183  		.skip_ref_updates = true,
4184  	};
4185  
4186  	return btrfs_truncate_inode_items(trans, log_root, &control);
4187  }
4188  
4189  static void fill_inode_item(struct btrfs_trans_handle *trans,
4190  			    struct extent_buffer *leaf,
4191  			    struct btrfs_inode_item *item,
4192  			    struct inode *inode, int log_inode_only,
4193  			    u64 logged_isize)
4194  {
4195  	struct btrfs_map_token token;
4196  	u64 flags;
4197  
4198  	btrfs_init_map_token(&token, leaf);
4199  
4200  	if (log_inode_only) {
4201  		/* set the generation to zero so the recovery code
4202  		 * can tell the difference between a logging
4203  		 * just to say 'this inode exists' and a logging
4204  		 * to say 'update this inode with these values'
4205  		 */
4206  		btrfs_set_token_inode_generation(&token, item, 0);
4207  		btrfs_set_token_inode_size(&token, item, logged_isize);
4208  	} else {
4209  		btrfs_set_token_inode_generation(&token, item,
4210  						 BTRFS_I(inode)->generation);
4211  		btrfs_set_token_inode_size(&token, item, inode->i_size);
4212  	}
4213  
4214  	btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
4215  	btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
4216  	btrfs_set_token_inode_mode(&token, item, inode->i_mode);
4217  	btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
4218  
4219  	btrfs_set_token_timespec_sec(&token, &item->atime,
4220  				     inode_get_atime_sec(inode));
4221  	btrfs_set_token_timespec_nsec(&token, &item->atime,
4222  				      inode_get_atime_nsec(inode));
4223  
4224  	btrfs_set_token_timespec_sec(&token, &item->mtime,
4225  				     inode_get_mtime_sec(inode));
4226  	btrfs_set_token_timespec_nsec(&token, &item->mtime,
4227  				      inode_get_mtime_nsec(inode));
4228  
4229  	btrfs_set_token_timespec_sec(&token, &item->ctime,
4230  				     inode_get_ctime_sec(inode));
4231  	btrfs_set_token_timespec_nsec(&token, &item->ctime,
4232  				      inode_get_ctime_nsec(inode));
4233  
4234  	/*
4235  	 * We do not need to set the nbytes field, in fact during a fast fsync
4236  	 * its value may not even be correct, since a fast fsync does not wait
4237  	 * for ordered extent completion, which is where we update nbytes; it
4238  	 * only waits for writeback to complete. During log replay as we find
4239  	 * file extent items and replay them, we adjust the nbytes field of the
4240  	 * inode item in subvolume tree as needed (see overwrite_item()).
4241  	 */
4242  
4243  	btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
4244  	btrfs_set_token_inode_transid(&token, item, trans->transid);
4245  	btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
4246  	flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
4247  					  BTRFS_I(inode)->ro_flags);
4248  	btrfs_set_token_inode_flags(&token, item, flags);
4249  	btrfs_set_token_inode_block_group(&token, item, 0);
4250  }
4251  
4252  static int log_inode_item(struct btrfs_trans_handle *trans,
4253  			  struct btrfs_root *log, struct btrfs_path *path,
4254  			  struct btrfs_inode *inode, bool inode_item_dropped)
4255  {
4256  	struct btrfs_inode_item *inode_item;
4257  	struct btrfs_key key;
4258  	int ret;
4259  
4260  	btrfs_get_inode_key(inode, &key);
4261  	/*
4262  	 * If we are doing a fast fsync and the inode was logged before in the
4263  	 * current transaction, then we know the inode was previously logged and
4264  	 * it exists in the log tree. For performance reasons, in this case use
4265  	 * btrfs_search_slot() directly with ins_len set to 0 so that we never
4266  	 * attempt a write lock on the leaf's parent, which adds unnecessary lock
4267  	 * contention in case there are concurrent fsyncs for other inodes of the
4268  	 * same subvolume. Using btrfs_insert_empty_item() when the inode item
4269  	 * already exists can also result in unnecessarily splitting a leaf.
4270  	 */
4271  	if (!inode_item_dropped && inode->logged_trans == trans->transid) {
4272  		ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
4273  		ASSERT(ret <= 0);
4274  		if (ret > 0)
4275  			ret = -ENOENT;
4276  	} else {
4277  		/*
4278  		 * This means it is the first fsync in the current transaction,
4279  		 * so the inode item is not in the log and we need to insert it.
4280  		 * We can never get -EEXIST because we are only called for a fast
4281  		 * fsync and in case an inode eviction happens after the inode was
4282  		 * logged before in the current transaction, when we load again
4283  		 * the inode, we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime
4284  		 * flags and set ->logged_trans to 0.
4285  		 */
4286  		ret = btrfs_insert_empty_item(trans, log, path, &key,
4287  					      sizeof(*inode_item));
4288  		ASSERT(ret != -EEXIST);
4289  	}
4290  	if (ret)
4291  		return ret;
4292  	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4293  				    struct btrfs_inode_item);
4294  	fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode,
4295  			0, 0);
4296  	btrfs_release_path(path);
4297  	return 0;
4298  }
4299  
4300  static int log_csums(struct btrfs_trans_handle *trans,
4301  		     struct btrfs_inode *inode,
4302  		     struct btrfs_root *log_root,
4303  		     struct btrfs_ordered_sum *sums)
4304  {
4305  	const u64 lock_end = sums->logical + sums->len - 1;
4306  	struct extent_state *cached_state = NULL;
4307  	int ret;
4308  
4309  	/*
4310  	 * If this inode was not used for reflink operations in the current
4311  	 * transaction with new extents, then do the fast path, no need to
4312  	 * worry about logging checksum items with overlapping ranges.
4313  	 */
4314  	if (inode->last_reflink_trans < trans->transid)
4315  		return btrfs_csum_file_blocks(trans, log_root, sums);
4316  
4317  	/*
4318  	 * Serialize logging for checksums. This is to avoid racing with the
4319  	 * same checksum being logged by another task that is logging another
4320  	 * file which happens to refer to the same extent as well. Such races
4321  	 * can leave checksum items in the log with overlapping ranges.
4322  	 */
4323  	ret = lock_extent(&log_root->log_csum_range, sums->logical, lock_end,
4324  			  &cached_state);
4325  	if (ret)
4326  		return ret;
4327  	/*
4328  	 * Due to extent cloning, we might have logged a csum item that covers a
4329  	 * subrange of a cloned extent, and later we can end up logging a csum
4330  	 * item for a larger subrange of the same extent or the entire range.
4331  	 * This would leave csum items in the log tree that cover the same range
4332  	 * and break the searches for checksums in the log tree, resulting in
4333  	 * some checksums missing in the fs/subvolume tree. So just delete (or
4334  	 * trim and adjust) any existing csum items in the log for this range.
4335  	 */
4336  	ret = btrfs_del_csums(trans, log_root, sums->logical, sums->len);
4337  	if (!ret)
4338  		ret = btrfs_csum_file_blocks(trans, log_root, sums);
4339  
4340  	unlock_extent(&log_root->log_csum_range, sums->logical, lock_end,
4341  		      &cached_state);
4342  
4343  	return ret;
4344  }
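
/*
 * Example of the overlap handled above (hypothetical offsets): an extent is
 * cloned into this inode and an earlier fsync logged a csum item covering
 * only part of it, say the first 16K, while the current fsync needs to log
 * csums for a larger range of the same extent, say 64K. Without deleting or
 * trimming the existing csum item first, the log would end up with two csum
 * items whose ranges overlap, breaking csum lookups during log replay.
 */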
4345  
4346  static noinline int copy_items(struct btrfs_trans_handle *trans,
4347  			       struct btrfs_inode *inode,
4348  			       struct btrfs_path *dst_path,
4349  			       struct btrfs_path *src_path,
4350  			       int start_slot, int nr, int inode_only,
4351  			       u64 logged_isize, struct btrfs_log_ctx *ctx)
4352  {
4353  	struct btrfs_root *log = inode->root->log_root;
4354  	struct btrfs_file_extent_item *extent;
4355  	struct extent_buffer *src;
4356  	int ret;
4357  	struct btrfs_key *ins_keys;
4358  	u32 *ins_sizes;
4359  	struct btrfs_item_batch batch;
4360  	char *ins_data;
4361  	int dst_index;
4362  	const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM);
4363  	const u64 i_size = i_size_read(&inode->vfs_inode);
4364  
4365  	/*
4366  	 * To keep lockdep happy and avoid deadlocks, clone the source leaf and
4367  	 * use the clone. This is because otherwise we would be changing the log
4368  	 * tree, to insert items from the subvolume tree or insert csum items,
4369  	 * while holding a read lock on a leaf from the subvolume tree, which
4370  	 * creates a nasty lock dependency when COWing log tree nodes/leaves:
4371  	 *
4372  	 * 1) Modifying the log tree triggers an extent buffer allocation while
4373  	 *    holding a write lock on a parent extent buffer from the log tree.
4374  	 *    Allocating the pages for an extent buffer, or the extent buffer
4375  	 *    struct, can trigger inode eviction and finally the inode eviction
4376  	 *    will trigger a release/remove of a delayed node, which requires
4377  	 *    taking the delayed node's mutex;
4378  	 *
4379  	 * 2) Allocating a metadata extent for a log tree can trigger the async
4380  	 *    reclaim thread and make us wait for it to release enough space and
4381  	 *    unblock our reservation ticket. The reclaim thread can start
4382  	 *    flushing delayed items, and that in turn results in the need to
4383  	 *    lock delayed node mutexes and in the need to write lock extent
4384  	 *    buffers of a subvolume tree - all this while holding a write lock
4385  	 *    on the parent extent buffer in the log tree.
4386  	 *
4387  	 * So one task in scenario 1) running in parallel with another task in
4388  	 * scenario 2) could lead to a deadlock, one wanting to lock a delayed
4389  	 * node mutex while having a read lock on a leaf from the subvolume,
4390  	 * while the other is holding the delayed node's mutex and wants to
4391  	 * write lock the same subvolume leaf for flushing delayed items.
4392  	 */
4393  	ret = clone_leaf(src_path, ctx);
4394  	if (ret < 0)
4395  		return ret;
4396  
4397  	src = src_path->nodes[0];
4398  
4399  	ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
4400  			   nr * sizeof(u32), GFP_NOFS);
4401  	if (!ins_data)
4402  		return -ENOMEM;
4403  
4404  	ins_sizes = (u32 *)ins_data;
4405  	ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
4406  	batch.keys = ins_keys;
4407  	batch.data_sizes = ins_sizes;
4408  	batch.total_data_size = 0;
4409  	batch.nr = 0;
4410  
4411  	dst_index = 0;
4412  	for (int i = 0; i < nr; i++) {
4413  		const int src_slot = start_slot + i;
4414  		struct btrfs_root *csum_root;
4415  		struct btrfs_ordered_sum *sums;
4416  		struct btrfs_ordered_sum *sums_next;
4417  		LIST_HEAD(ordered_sums);
4418  		u64 disk_bytenr;
4419  		u64 disk_num_bytes;
4420  		u64 extent_offset;
4421  		u64 extent_num_bytes;
4422  		bool is_old_extent;
4423  
4424  		btrfs_item_key_to_cpu(src, &ins_keys[dst_index], src_slot);
4425  
4426  		if (ins_keys[dst_index].type != BTRFS_EXTENT_DATA_KEY)
4427  			goto add_to_batch;
4428  
4429  		extent = btrfs_item_ptr(src, src_slot,
4430  					struct btrfs_file_extent_item);
4431  
4432  		is_old_extent = (btrfs_file_extent_generation(src, extent) <
4433  				 trans->transid);
4434  
4435  		/*
4436  		 * Don't copy extents from past generations. That would make us
4437  		 * log a lot more metadata for common cases like doing only a
4438  		 * few random writes into a file and then fsync it for the first
4439  		 * time or after the full sync flag is set on the inode. We can
4440  		 * get leaves full of extent items, most of which are from past
4441  		 * generations, so we can skip them - as long as the inode has
4442  		 * not been the target of a reflink operation in this transaction,
4443  		 * as in that case it might have had file extent items with old
4444  		 * generations copied into it. We also must always log prealloc
4445  		 * extents that start at or beyond eof, otherwise we would lose
4446  		 * them on log replay.
4447  		 */
4448  		if (is_old_extent &&
4449  		    ins_keys[dst_index].offset < i_size &&
4450  		    inode->last_reflink_trans < trans->transid)
4451  			continue;
4452  
4453  		if (skip_csum)
4454  			goto add_to_batch;
4455  
4456  		/* Only regular extents have checksums. */
4457  		if (btrfs_file_extent_type(src, extent) != BTRFS_FILE_EXTENT_REG)
4458  			goto add_to_batch;
4459  
4460  		/*
4461  		 * If it's an extent created in a past transaction, then its
4462  		 * checksums are already accessible from the committed csum tree,
4463  		 * no need to log them.
4464  		 */
4465  		if (is_old_extent)
4466  			goto add_to_batch;
4467  
4468  		disk_bytenr = btrfs_file_extent_disk_bytenr(src, extent);
4469  		/* If it's an explicit hole, there are no checksums. */
4470  		if (disk_bytenr == 0)
4471  			goto add_to_batch;
4472  
4473  		disk_num_bytes = btrfs_file_extent_disk_num_bytes(src, extent);
4474  
4475  		if (btrfs_file_extent_compression(src, extent)) {
4476  			extent_offset = 0;
4477  			extent_num_bytes = disk_num_bytes;
4478  		} else {
4479  			extent_offset = btrfs_file_extent_offset(src, extent);
4480  			extent_num_bytes = btrfs_file_extent_num_bytes(src, extent);
4481  		}
4482  
4483  		csum_root = btrfs_csum_root(trans->fs_info, disk_bytenr);
4484  		disk_bytenr += extent_offset;
4485  		ret = btrfs_lookup_csums_list(csum_root, disk_bytenr,
4486  					      disk_bytenr + extent_num_bytes - 1,
4487  					      &ordered_sums, false);
4488  		if (ret < 0)
4489  			goto out;
4490  		ret = 0;
4491  
4492  		list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) {
4493  			if (!ret)
4494  				ret = log_csums(trans, inode, log, sums);
4495  			list_del(&sums->list);
4496  			kfree(sums);
4497  		}
4498  		if (ret)
4499  			goto out;
4500  
4501  add_to_batch:
4502  		ins_sizes[dst_index] = btrfs_item_size(src, src_slot);
4503  		batch.total_data_size += ins_sizes[dst_index];
4504  		batch.nr++;
4505  		dst_index++;
4506  	}
4507  
4508  	/*
4509  	 * We have a leaf full of old extent items that don't need to be logged,
4510  	 * so we don't need to do anything.
4511  	 */
4512  	if (batch.nr == 0)
4513  		goto out;
4514  
4515  	ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
4516  	if (ret)
4517  		goto out;
4518  
4519  	dst_index = 0;
4520  	for (int i = 0; i < nr; i++) {
4521  		const int src_slot = start_slot + i;
4522  		const int dst_slot = dst_path->slots[0] + dst_index;
4523  		struct btrfs_key key;
4524  		unsigned long src_offset;
4525  		unsigned long dst_offset;
4526  
4527  		/*
4528  		 * We're done, all the remaining items in the source leaf
4529  		 * correspond to old file extent items.
4530  		 */
4531  		if (dst_index >= batch.nr)
4532  			break;
4533  
4534  		btrfs_item_key_to_cpu(src, &key, src_slot);
4535  
4536  		if (key.type != BTRFS_EXTENT_DATA_KEY)
4537  			goto copy_item;
4538  
4539  		extent = btrfs_item_ptr(src, src_slot,
4540  					struct btrfs_file_extent_item);
4541  
4542  		/* See the comment in the previous loop, same logic. */
4543  		if (btrfs_file_extent_generation(src, extent) < trans->transid &&
4544  		    key.offset < i_size &&
4545  		    inode->last_reflink_trans < trans->transid)
4546  			continue;
4547  
4548  copy_item:
4549  		dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], dst_slot);
4550  		src_offset = btrfs_item_ptr_offset(src, src_slot);
4551  
4552  		if (key.type == BTRFS_INODE_ITEM_KEY) {
4553  			struct btrfs_inode_item *inode_item;
4554  
4555  			inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_slot,
4556  						    struct btrfs_inode_item);
4557  			fill_inode_item(trans, dst_path->nodes[0], inode_item,
4558  					&inode->vfs_inode,
4559  					inode_only == LOG_INODE_EXISTS,
4560  					logged_isize);
4561  		} else {
4562  			copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
4563  					   src_offset, ins_sizes[dst_index]);
4564  		}
4565  
4566  		dst_index++;
4567  	}
4568  
4569  	btrfs_mark_buffer_dirty(trans, dst_path->nodes[0]);
4570  	btrfs_release_path(dst_path);
4571  out:
4572  	kfree(ins_data);
4573  
4574  	return ret;
4575  }
4576  
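/*
 * Comparator used with list_sort() to order the list of modified extent maps
 * by file offset before logging them.
 */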
4577  static int extent_cmp(void *priv, const struct list_head *a,
4578  		      const struct list_head *b)
4579  {
4580  	const struct extent_map *em1, *em2;
4581  
4582  	em1 = list_entry(a, struct extent_map, list);
4583  	em2 = list_entry(b, struct extent_map, list);
4584  
4585  	if (em1->start < em2->start)
4586  		return -1;
4587  	else if (em1->start > em2->start)
4588  		return 1;
4589  	return 0;
4590  }
4591  
4592  static int log_extent_csums(struct btrfs_trans_handle *trans,
4593  			    struct btrfs_inode *inode,
4594  			    struct btrfs_root *log_root,
4595  			    const struct extent_map *em,
4596  			    struct btrfs_log_ctx *ctx)
4597  {
4598  	struct btrfs_ordered_extent *ordered;
4599  	struct btrfs_root *csum_root;
4600  	u64 block_start;
4601  	u64 csum_offset;
4602  	u64 csum_len;
4603  	u64 mod_start = em->start;
4604  	u64 mod_len = em->len;
4605  	LIST_HEAD(ordered_sums);
4606  	int ret = 0;
4607  
4608  	if (inode->flags & BTRFS_INODE_NODATASUM ||
4609  	    (em->flags & EXTENT_FLAG_PREALLOC) ||
4610  	    em->disk_bytenr == EXTENT_MAP_HOLE)
4611  		return 0;
4612  
4613  	list_for_each_entry(ordered, &ctx->ordered_extents, log_list) {
4614  		const u64 ordered_end = ordered->file_offset + ordered->num_bytes;
4615  		const u64 mod_end = mod_start + mod_len;
4616  		struct btrfs_ordered_sum *sums;
4617  
4618  		if (mod_len == 0)
4619  			break;
4620  
4621  		if (ordered_end <= mod_start)
4622  			continue;
4623  		if (mod_end <= ordered->file_offset)
4624  			break;
4625  
4626  		/*
4627  		 * We are going to copy all the csums on this ordered extent, so
4628  		 * go ahead and adjust mod_start and mod_len in case this ordered
4629  		 * extent has already been logged.
4630  		 */
4631  		if (ordered->file_offset > mod_start) {
4632  			if (ordered_end >= mod_end)
4633  				mod_len = ordered->file_offset - mod_start;
4634  			/*
4635  			 * If we have this case
4636  			 *
4637  			 * |--------- logged extent ---------|
4638  			 *       |----- ordered extent ----|
4639  			 *
4640  			 * Just don't mess with mod_start and mod_len, we'll
4641  			 * just end up logging more csums than we need and it
4642  			 * will be ok.
4643  			 */
4644  		} else {
4645  			if (ordered_end < mod_end) {
4646  				mod_len = mod_end - ordered_end;
4647  				mod_start = ordered_end;
4648  			} else {
4649  				mod_len = 0;
4650  			}
4651  		}
4652  
4653  		/*
4654  		 * To keep us from looping for the above case of an ordered
4655  		 * extent that falls inside of the logged extent.
4656  		 */
4657  		if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags))
4658  			continue;
4659  
4660  		list_for_each_entry(sums, &ordered->list, list) {
4661  			ret = log_csums(trans, inode, log_root, sums);
4662  			if (ret)
4663  				return ret;
4664  		}
4665  	}
4666  
4667  	/* We're done, found all csums in the ordered extents. */
4668  	if (mod_len == 0)
4669  		return 0;
4670  
4671  	/* If we're compressed we have to save the entire range of csums. */
4672  	if (extent_map_is_compressed(em)) {
4673  		csum_offset = 0;
4674  		csum_len = em->disk_num_bytes;
4675  	} else {
4676  		csum_offset = mod_start - em->start;
4677  		csum_len = mod_len;
4678  	}
4679  
4680  	/* block start is already adjusted for the file extent offset. */
4681  	block_start = extent_map_block_start(em);
4682  	csum_root = btrfs_csum_root(trans->fs_info, block_start);
4683  	ret = btrfs_lookup_csums_list(csum_root, block_start + csum_offset,
4684  				      block_start + csum_offset + csum_len - 1,
4685  				      &ordered_sums, false);
4686  	if (ret < 0)
4687  		return ret;
4688  	ret = 0;
4689  
4690  	while (!list_empty(&ordered_sums)) {
4691  		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
4692  						   struct btrfs_ordered_sum,
4693  						   list);
4694  		if (!ret)
4695  			ret = log_csums(trans, inode, log_root, sums);
4696  		list_del(&sums->list);
4697  		kfree(sums);
4698  	}
4699  
4700  	return ret;
4701  }
4702  
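/*
 * Log a single extent for an inode: build the file extent item from the
 * extent map, log the checksums of the extent's data (for new, regular,
 * non-prealloc extents) and insert the item in the log tree, replacing any
 * extent items in the log that overlap the range when the inode was already
 * logged in the current transaction.
 */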
4703  static int log_one_extent(struct btrfs_trans_handle *trans,
4704  			  struct btrfs_inode *inode,
4705  			  const struct extent_map *em,
4706  			  struct btrfs_path *path,
4707  			  struct btrfs_log_ctx *ctx)
4708  {
4709  	struct btrfs_drop_extents_args drop_args = { 0 };
4710  	struct btrfs_root *log = inode->root->log_root;
4711  	struct btrfs_file_extent_item fi = { 0 };
4712  	struct extent_buffer *leaf;
4713  	struct btrfs_key key;
4714  	enum btrfs_compression_type compress_type;
4715  	u64 extent_offset = em->offset;
4716  	u64 block_start = extent_map_block_start(em);
4717  	u64 block_len;
4718  	int ret;
4719  
4720  	btrfs_set_stack_file_extent_generation(&fi, trans->transid);
4721  	if (em->flags & EXTENT_FLAG_PREALLOC)
4722  		btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC);
4723  	else
4724  		btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG);
4725  
4726  	block_len = em->disk_num_bytes;
4727  	compress_type = extent_map_compression(em);
4728  	if (compress_type != BTRFS_COMPRESS_NONE) {
4729  		btrfs_set_stack_file_extent_disk_bytenr(&fi, block_start);
4730  		btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
4731  	} else if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) {
4732  		btrfs_set_stack_file_extent_disk_bytenr(&fi, block_start - extent_offset);
4733  		btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
4734  	}
4735  
4736  	btrfs_set_stack_file_extent_offset(&fi, extent_offset);
4737  	btrfs_set_stack_file_extent_num_bytes(&fi, em->len);
4738  	btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes);
4739  	btrfs_set_stack_file_extent_compression(&fi, compress_type);
4740  
4741  	ret = log_extent_csums(trans, inode, log, em, ctx);
4742  	if (ret)
4743  		return ret;
4744  
4745  	/*
4746  	 * If this is the first time we are logging the inode in the current
4747  	 * transaction, we can avoid btrfs_drop_extents(), which is expensive
4748  	 * because it does a deletion search, which always acquires write locks
4749  	 * for extent buffers at levels 2, 1 and 0. This not only wastes time
4750  	 * but also adds significant contention in a log tree, since log trees
4751  	 * are small, with a root at level 2 or 3 at most, due to their short
4752  	 * life span.
4753  	 */
4754  	if (ctx->logged_before) {
4755  		drop_args.path = path;
4756  		drop_args.start = em->start;
4757  		drop_args.end = em->start + em->len;
4758  		drop_args.replace_extent = true;
4759  		drop_args.extent_item_size = sizeof(fi);
4760  		ret = btrfs_drop_extents(trans, log, inode, &drop_args);
4761  		if (ret)
4762  			return ret;
4763  	}
4764  
4765  	if (!drop_args.extent_inserted) {
4766  		key.objectid = btrfs_ino(inode);
4767  		key.type = BTRFS_EXTENT_DATA_KEY;
4768  		key.offset = em->start;
4769  
4770  		ret = btrfs_insert_empty_item(trans, log, path, &key,
4771  					      sizeof(fi));
4772  		if (ret)
4773  			return ret;
4774  	}
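	/*
	 * Copy the stack file extent item into the slot that was either left
	 * by btrfs_drop_extents() (replace_extent) or created by the explicit
	 * insertion above.
	 */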
4775  	leaf = path->nodes[0];
4776  	write_extent_buffer(leaf, &fi,
4777  			    btrfs_item_ptr_offset(leaf, path->slots[0]),
4778  			    sizeof(fi));
4779  	btrfs_mark_buffer_dirty(trans, leaf);
4780  
4781  	btrfs_release_path(path);
4782  
4783  	return ret;
4784  }
4785  
4786  /*
4787   * Log all prealloc extents beyond the inode's i_size to make sure we do not
4788   * lose them after doing a full/fast fsync and replaying the log. We scan the
4789   * subvolume's root instead of iterating the inode's extent map tree because
4790   * otherwise we can log incorrect extent items based on extent map conversion.
4791   * That can happen due to the fact that extent maps are merged when they
4792   * are not in the extent map tree's list of modified extents.
4793   */
4794  static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
4795  				      struct btrfs_inode *inode,
4796  				      struct btrfs_path *path,
4797  				      struct btrfs_log_ctx *ctx)
4798  {
4799  	struct btrfs_root *root = inode->root;
4800  	struct btrfs_key key;
4801  	const u64 i_size = i_size_read(&inode->vfs_inode);
4802  	const u64 ino = btrfs_ino(inode);
4803  	struct btrfs_path *dst_path = NULL;
4804  	bool dropped_extents = false;
4805  	u64 truncate_offset = i_size;
4806  	struct extent_buffer *leaf;
4807  	int slot;
4808  	int ins_nr = 0;
4809  	int start_slot = 0;
4810  	int ret;
4811  
4812  	if (!(inode->flags & BTRFS_INODE_PREALLOC))
4813  		return 0;
4814  
4815  	key.objectid = ino;
4816  	key.type = BTRFS_EXTENT_DATA_KEY;
4817  	key.offset = i_size;
4818  	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4819  	if (ret < 0)
4820  		goto out;
4821  
4822  	/*
4823  	 * We must check if there is a prealloc extent that starts before the
4824  	 * i_size and crosses the i_size boundary. This is to ensure later we
4825  	 * truncate down to the end of that extent and not to the i_size, as
4826  	 * otherwise we end up losing part of the prealloc extent after a log
4827  	 * replay and with an implicit hole if there is another prealloc extent
4828  	 * that starts at an offset beyond i_size.
4829  	 */
4830  	ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
4831  	if (ret < 0)
4832  		goto out;
4833  
4834  	if (ret == 0) {
4835  		struct btrfs_file_extent_item *ei;
4836  
4837  		leaf = path->nodes[0];
4838  		slot = path->slots[0];
4839  		ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
4840  
4841  		if (btrfs_file_extent_type(leaf, ei) ==
4842  		    BTRFS_FILE_EXTENT_PREALLOC) {
4843  			u64 extent_end;
4844  
4845  			btrfs_item_key_to_cpu(leaf, &key, slot);
4846  			extent_end = key.offset +
4847  				btrfs_file_extent_num_bytes(leaf, ei);
4848  
4849  			if (extent_end > i_size)
4850  				truncate_offset = extent_end;
4851  		}
4852  	} else {
4853  		ret = 0;
4854  	}
4855  
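	/*
	 * Walk the file extent items starting at i_size, batching contiguous
	 * leaf slots and copying each batch to the log tree. Before copying we
	 * drop any overlapping items left in the log by a previous fsync (see
	 * the truncate_inode_items() call below).
	 */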
4856  	while (true) {
4857  		leaf = path->nodes[0];
4858  		slot = path->slots[0];
4859  
4860  		if (slot >= btrfs_header_nritems(leaf)) {
4861  			if (ins_nr > 0) {
4862  				ret = copy_items(trans, inode, dst_path, path,
4863  						 start_slot, ins_nr, 1, 0, ctx);
4864  				if (ret < 0)
4865  					goto out;
4866  				ins_nr = 0;
4867  			}
4868  			ret = btrfs_next_leaf(root, path);
4869  			if (ret < 0)
4870  				goto out;
4871  			if (ret > 0) {
4872  				ret = 0;
4873  				break;
4874  			}
4875  			continue;
4876  		}
4877  
4878  		btrfs_item_key_to_cpu(leaf, &key, slot);
4879  		if (key.objectid > ino)
4880  			break;
4881  		if (WARN_ON_ONCE(key.objectid < ino) ||
4882  		    key.type < BTRFS_EXTENT_DATA_KEY ||
4883  		    key.offset < i_size) {
4884  			path->slots[0]++;
4885  			continue;
4886  		}
4887  		/*
4888  		 * Avoid overlapping items in the log tree. The first time we
4889  		 * get here, get rid of everything from a past fsync. After
4890  		 * that, if the current extent starts before the end of the last
4891  		 * extent we copied, truncate the last one. This can happen if
4892  		 * an ordered extent completion modifies the subvolume tree
4893  		 * while btrfs_next_leaf() has the tree unlocked.
4894  		 */
4895  		if (!dropped_extents || key.offset < truncate_offset) {
4896  			ret = truncate_inode_items(trans, root->log_root, inode,
4897  						   min(key.offset, truncate_offset),
4898  						   BTRFS_EXTENT_DATA_KEY);
4899  			if (ret)
4900  				goto out;
4901  			dropped_extents = true;
4902  		}
4903  		truncate_offset = btrfs_file_extent_end(path);
4904  		if (ins_nr == 0)
4905  			start_slot = slot;
4906  		ins_nr++;
4907  		path->slots[0]++;
4908  		if (!dst_path) {
4909  			dst_path = btrfs_alloc_path();
4910  			if (!dst_path) {
4911  				ret = -ENOMEM;
4912  				goto out;
4913  			}
4914  		}
4915  	}
4916  	if (ins_nr > 0)
4917  		ret = copy_items(trans, inode, dst_path, path,
4918  				 start_slot, ins_nr, 1, 0, ctx);
4919  out:
4920  	btrfs_release_path(path);
4921  	btrfs_free_path(dst_path);
4922  	return ret;
4923  }
4924  
4925  static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
4926  				     struct btrfs_inode *inode,
4927  				     struct btrfs_path *path,
4928  				     struct btrfs_log_ctx *ctx)
4929  {
4930  	struct btrfs_ordered_extent *ordered;
4931  	struct btrfs_ordered_extent *tmp;
4932  	struct extent_map *em, *n;
4933  	LIST_HEAD(extents);
4934  	struct extent_map_tree *tree = &inode->extent_tree;
4935  	int ret = 0;
4936  	int num = 0;
4937  
4938  	write_lock(&tree->lock);
4939  
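	/*
	 * Collect the extent maps modified in this transaction while holding
	 * the tree lock, taking an extra reference and setting the LOGGING
	 * flag on each so they remain valid while we drop the lock to log
	 * them one by one below.
	 */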
4940  	list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
4941  		list_del_init(&em->list);
4942  		/*
4943  		 * The limit is an arbitrary number. Logging gets really CPU
4944  		 * intensive once we have a lot of extents, and once we have
4945  		 * that many a transaction commit is faster anyway, so bail
4946  		 * out with -EFBIG.
4947  		 */
4948  		if (++num > 32768) {
4949  			list_del_init(&tree->modified_extents);
4950  			ret = -EFBIG;
4951  			goto process;
4952  		}
4953  
4954  		if (em->generation < trans->transid)
4955  			continue;
4956  
4957  		/* We log prealloc extents beyond eof later. */
4958  		if ((em->flags & EXTENT_FLAG_PREALLOC) &&
4959  		    em->start >= i_size_read(&inode->vfs_inode))
4960  			continue;
4961  
4962  		/* Need a ref to keep it from getting evicted from cache */
4963  		refcount_inc(&em->refs);
4964  		em->flags |= EXTENT_FLAG_LOGGING;
4965  		list_add_tail(&em->list, &extents);
4966  		num++;
4967  	}
4968  
4969  	list_sort(NULL, &extents, extent_cmp);
4970  process:
4971  	while (!list_empty(&extents)) {
4972  		em = list_entry(extents.next, struct extent_map, list);
4973  
4974  		list_del_init(&em->list);
4975  
4976  		/*
4977  		 * If we had an error we just need to delete everybody from our
4978  		 * private list.
4979  		 */
4980  		if (ret) {
4981  			clear_em_logging(inode, em);
4982  			free_extent_map(em);
4983  			continue;
4984  		}
4985  
4986  		write_unlock(&tree->lock);
4987  
4988  		ret = log_one_extent(trans, inode, em, path, ctx);
4989  		write_lock(&tree->lock);
4990  		clear_em_logging(inode, em);
4991  		free_extent_map(em);
4992  	}
4993  	WARN_ON(!list_empty(&extents));
4994  	write_unlock(&tree->lock);
4995  
4996  	if (!ret)
4997  		ret = btrfs_log_prealloc_extents(trans, inode, path, ctx);
4998  	if (ret)
4999  		return ret;
5000  
5001  	/*
5002  	 * We have logged all extents successfully, now make sure the commit of
5003  	 * the current transaction waits for the ordered extents to complete
5004  	 * before it commits and wipes out the log trees, otherwise we would
5005  	 * lose data if an ordered extent completes after the transaction
5006  	 * commits and a power failure happens after the transaction commit.
5007  	 */
5008  	list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
5009  		list_del_init(&ordered->log_list);
5010  		set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);
5011  
5012  		if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
5013  			spin_lock_irq(&inode->ordered_tree_lock);
5014  			if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
5015  				set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
5016  				atomic_inc(&trans->transaction->pending_ordered);
5017  			}
5018  			spin_unlock_irq(&inode->ordered_tree_lock);
5019  		}
5020  		btrfs_put_ordered_extent(ordered);
5021  	}
5022  
5023  	return 0;
5024  }
5025  
5026  static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
5027  			     struct btrfs_path *path, u64 *size_ret)
5028  {
5029  	struct btrfs_key key;
5030  	int ret;
5031  
5032  	key.objectid = btrfs_ino(inode);
5033  	key.type = BTRFS_INODE_ITEM_KEY;
5034  	key.offset = 0;
5035  
5036  	ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
5037  	if (ret < 0) {
5038  		return ret;
5039  	} else if (ret > 0) {
5040  		*size_ret = 0;
5041  	} else {
5042  		struct btrfs_inode_item *item;
5043  
5044  		item = btrfs_item_ptr(path->nodes[0], path->slots[0],
5045  				      struct btrfs_inode_item);
5046  		*size_ret = btrfs_inode_size(path->nodes[0], item);
5047  		/*
5048  		 * If the in-memory inode's i_size is smaller than the inode
5049  		 * size stored in the btree, return the inode's i_size, so
5050  		 * that we get a correct inode size after replaying the log
5051  		 * when before a power failure we had a shrinking truncate
5052  		 * followed by addition of a new name (rename / new hard link).
5053  		 * Otherwise return the inode size from the btree, to avoid
5054  		 * data loss when replaying a log due to previously doing a
5055  		 * write that expands the inode's size and logging a new name
5056  		 * immediately after.
5057  		 */
5058  		if (*size_ret > inode->vfs_inode.i_size)
5059  			*size_ret = inode->vfs_inode.i_size;
5060  	}
5061  
5062  	btrfs_release_path(path);
5063  	return 0;
5064  }
5065  
5066  /*
5067   * At the moment we always log all xattrs. This is to figure out at log replay
5068   * time which xattrs must have their deletion replayed. If an xattr is missing
5069   * from the log tree but exists in the fs/subvol tree, we delete it. This is
5070   * because if an xattr is deleted, the inode is fsynced and then a power
5071   * failure happens, the log gets replayed on the next mount and we want the
5072   * xattr to no longer exist (same behaviour as other filesystems with a
5073   * journal: ext3/4, xfs, f2fs, etc).
5074   */
5075  static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
5076  				struct btrfs_inode *inode,
5077  				struct btrfs_path *path,
5078  				struct btrfs_path *dst_path,
5079  				struct btrfs_log_ctx *ctx)
5080  {
5081  	struct btrfs_root *root = inode->root;
5082  	int ret;
5083  	struct btrfs_key key;
5084  	const u64 ino = btrfs_ino(inode);
5085  	int ins_nr = 0;
5086  	int start_slot = 0;
5087  	bool found_xattrs = false;
5088  
5089  	if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags))
5090  		return 0;
5091  
5092  	key.objectid = ino;
5093  	key.type = BTRFS_XATTR_ITEM_KEY;
5094  	key.offset = 0;
5095  
5096  	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5097  	if (ret < 0)
5098  		return ret;
5099  
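	/*
	 * Iterate over all of the inode's xattr items, batching contiguous
	 * leaf slots and copying each batch to the log tree with copy_items().
	 */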
5100  	while (true) {
5101  		int slot = path->slots[0];
5102  		struct extent_buffer *leaf = path->nodes[0];
5103  		int nritems = btrfs_header_nritems(leaf);
5104  
5105  		if (slot >= nritems) {
5106  			if (ins_nr > 0) {
5107  				ret = copy_items(trans, inode, dst_path, path,
5108  						 start_slot, ins_nr, 1, 0, ctx);
5109  				if (ret < 0)
5110  					return ret;
5111  				ins_nr = 0;
5112  			}
5113  			ret = btrfs_next_leaf(root, path);
5114  			if (ret < 0)
5115  				return ret;
5116  			else if (ret > 0)
5117  				break;
5118  			continue;
5119  		}
5120  
5121  		btrfs_item_key_to_cpu(leaf, &key, slot);
5122  		if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
5123  			break;
5124  
5125  		if (ins_nr == 0)
5126  			start_slot = slot;
5127  		ins_nr++;
5128  		path->slots[0]++;
5129  		found_xattrs = true;
5130  		cond_resched();
5131  	}
5132  	if (ins_nr > 0) {
5133  		ret = copy_items(trans, inode, dst_path, path,
5134  				 start_slot, ins_nr, 1, 0, ctx);
5135  		if (ret < 0)
5136  			return ret;
5137  	}
5138  
5139  	if (!found_xattrs)
5140  		set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags);
5141  
5142  	return 0;
5143  }
5144  
5145  /*
5146   * When using the NO_HOLES feature, if we punched a hole that causes the
5147   * deletion of entire leaves, or of all the extent items of the first leaf
5148   * (the one that contains the inode item and references), we may end up not
5149   * processing any extents, because there are no leaves with a generation
5150   * matching the current transaction that have extent items for our inode. So
5151   * we need to find out whether any holes exist and then log them. We also need
5152   * to log holes after any truncate operation that changes the inode's size.
5153   */
5154  static int btrfs_log_holes(struct btrfs_trans_handle *trans,
5155  			   struct btrfs_inode *inode,
5156  			   struct btrfs_path *path)
5157  {
5158  	struct btrfs_root *root = inode->root;
5159  	struct btrfs_fs_info *fs_info = root->fs_info;
5160  	struct btrfs_key key;
5161  	const u64 ino = btrfs_ino(inode);
5162  	const u64 i_size = i_size_read(&inode->vfs_inode);
5163  	u64 prev_extent_end = 0;
5164  	int ret;
5165  
5166  	if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0)
5167  		return 0;
5168  
5169  	key.objectid = ino;
5170  	key.type = BTRFS_EXTENT_DATA_KEY;
5171  	key.offset = 0;
5172  
5173  	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5174  	if (ret < 0)
5175  		return ret;
5176  
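	/*
	 * Walk the file extent items in order. Whenever the next item starts
	 * beyond the end of the previous one we found a hole, so insert a
	 * matching hole extent item in the log root.
	 */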
5177  	while (true) {
5178  		struct extent_buffer *leaf = path->nodes[0];
5179  
5180  		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
5181  			ret = btrfs_next_leaf(root, path);
5182  			if (ret < 0)
5183  				return ret;
5184  			if (ret > 0) {
5185  				ret = 0;
5186  				break;
5187  			}
5188  			leaf = path->nodes[0];
5189  		}
5190  
5191  		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
5192  		if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
5193  			break;
5194  
5195  		/* We have a hole, log it. */
5196  		if (prev_extent_end < key.offset) {
5197  			const u64 hole_len = key.offset - prev_extent_end;
5198  
5199  			/*
5200  			 * Release the path to avoid deadlocks with other code
5201  			 * paths that search the root while holding locks on
5202  			 * leaves from the log root.
5203  			 */
5204  			btrfs_release_path(path);
5205  			ret = btrfs_insert_hole_extent(trans, root->log_root,
5206  						       ino, prev_extent_end,
5207  						       hole_len);
5208  			if (ret < 0)
5209  				return ret;
5210  
5211  			/*
5212  			 * Search for the same key again in the root. Since it's
5213  			 * an extent item and we are holding the inode lock, the
5214  			 * key must still exist. If it doesn't, just emit a
5215  			 * warning and return an error to fall back to a
5216  			 * transaction commit.
5217  			 */
5218  			ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5219  			if (ret < 0)
5220  				return ret;
5221  			if (WARN_ON(ret > 0))
5222  				return -ENOENT;
5223  			leaf = path->nodes[0];
5224  		}
5225  
5226  		prev_extent_end = btrfs_file_extent_end(path);
5227  		path->slots[0]++;
5228  		cond_resched();
5229  	}
5230  
5231  	if (prev_extent_end < i_size) {
5232  		u64 hole_len;
5233  
5234  		btrfs_release_path(path);
5235  		hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
5236  		ret = btrfs_insert_hole_extent(trans, root->log_root, ino,
5237  					       prev_extent_end, hole_len);
5238  		if (ret < 0)
5239  			return ret;
5240  	}
5241  
5242  	return 0;
5243  }
5244  
5245  /*
5246   * When we are logging a new inode X, check if it doesn't have a reference that
5247   * matches the reference from some other inode Y created in a past transaction
5248   * and that was renamed in the current transaction. If we don't do this, then at
5249   * log replay time we can lose inode Y (and all its files if it's a directory):
5250   *
5251   * mkdir /mnt/x
5252   * echo "hello world" > /mnt/x/foobar
5253   * sync
5254   * mv /mnt/x /mnt/y
5255   * mkdir /mnt/x                 # or touch /mnt/x
5256   * xfs_io -c fsync /mnt/x
5257   * <power fail>
5258   * mount fs, trigger log replay
5259   *
5260   * After the log replay procedure, we would lose the first directory and all its
5261   * files (file foobar).
5262   * For the case where inode Y is not a directory we simply end up losing it:
5263   *
5264   * echo "123" > /mnt/foo
5265   * sync
5266   * mv /mnt/foo /mnt/bar
5267   * echo "abc" > /mnt/foo
5268   * xfs_io -c fsync /mnt/foo
5269   * <power fail>
5270   *
5271   * We also need this for cases where a snapshot entry is replaced by some other
5272   * entry (file or directory) otherwise we end up with an unreplayable log due to
5273   * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
5274   * if it were a regular entry:
5275   *
5276   * mkdir /mnt/x
5277   * btrfs subvolume snapshot /mnt /mnt/x/snap
5278   * btrfs subvolume delete /mnt/x/snap
5279   * rmdir /mnt/x
5280   * mkdir /mnt/x
5281   * fsync /mnt/x or fsync some new file inside it
5282   * <power fail>
5283   *
5284   * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
5285   * the same transaction.
5286   */
5287  static int btrfs_check_ref_name_override(struct extent_buffer *eb,
5288  					 const int slot,
5289  					 const struct btrfs_key *key,
5290  					 struct btrfs_inode *inode,
5291  					 u64 *other_ino, u64 *other_parent)
5292  {
5293  	int ret;
5294  	struct btrfs_path *search_path;
5295  	char *name = NULL;
5296  	u32 name_len = 0;
5297  	u32 item_size = btrfs_item_size(eb, slot);
5298  	u32 cur_offset = 0;
5299  	unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
5300  
5301  	search_path = btrfs_alloc_path();
5302  	if (!search_path)
5303  		return -ENOMEM;
5304  	search_path->search_commit_root = 1;
5305  	search_path->skip_locking = 1;
5306  
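	/*
	 * Walk every name packed into this INODE_REF/INODE_EXTREF item and
	 * look it up in the commit root of the subvolume. If a name resolves
	 * to a different inode, report a conflict (return 1) so the caller can
	 * deal with the other inode before logging this one.
	 */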
5307  	while (cur_offset < item_size) {
5308  		u64 parent;
5309  		u32 this_name_len;
5310  		u32 this_len;
5311  		unsigned long name_ptr;
5312  		struct btrfs_dir_item *di;
5313  		struct fscrypt_str name_str;
5314  
5315  		if (key->type == BTRFS_INODE_REF_KEY) {
5316  			struct btrfs_inode_ref *iref;
5317  
5318  			iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
5319  			parent = key->offset;
5320  			this_name_len = btrfs_inode_ref_name_len(eb, iref);
5321  			name_ptr = (unsigned long)(iref + 1);
5322  			this_len = sizeof(*iref) + this_name_len;
5323  		} else {
5324  			struct btrfs_inode_extref *extref;
5325  
5326  			extref = (struct btrfs_inode_extref *)(ptr +
5327  							       cur_offset);
5328  			parent = btrfs_inode_extref_parent(eb, extref);
5329  			this_name_len = btrfs_inode_extref_name_len(eb, extref);
5330  			name_ptr = (unsigned long)&extref->name;
5331  			this_len = sizeof(*extref) + this_name_len;
5332  		}
5333  
5334  		if (this_name_len > name_len) {
5335  			char *new_name;
5336  
5337  			new_name = krealloc(name, this_name_len, GFP_NOFS);
5338  			if (!new_name) {
5339  				ret = -ENOMEM;
5340  				goto out;
5341  			}
5342  			name_len = this_name_len;
5343  			name = new_name;
5344  		}
5345  
5346  		read_extent_buffer(eb, name, name_ptr, this_name_len);
5347  
5348  		name_str.name = name;
5349  		name_str.len = this_name_len;
5350  		di = btrfs_lookup_dir_item(NULL, inode->root, search_path,
5351  				parent, &name_str, 0);
5352  		if (di && !IS_ERR(di)) {
5353  			struct btrfs_key di_key;
5354  
5355  			btrfs_dir_item_key_to_cpu(search_path->nodes[0],
5356  						  di, &di_key);
5357  			if (di_key.type == BTRFS_INODE_ITEM_KEY) {
5358  				if (di_key.objectid != key->objectid) {
5359  					ret = 1;
5360  					*other_ino = di_key.objectid;
5361  					*other_parent = parent;
5362  				} else {
5363  					ret = 0;
5364  				}
5365  			} else {
5366  				ret = -EAGAIN;
5367  			}
5368  			goto out;
5369  		} else if (IS_ERR(di)) {
5370  			ret = PTR_ERR(di);
5371  			goto out;
5372  		}
5373  		btrfs_release_path(search_path);
5374  
5375  		cur_offset += this_len;
5376  	}
5377  	ret = 0;
5378  out:
5379  	btrfs_free_path(search_path);
5380  	kfree(name);
5381  	return ret;
5382  }
5383  
5384  /*
5385   * Check if we need to log an inode. This is used in contexts where while
5386   * logging an inode we need to log another inode (either that it exists or in
5387   * full mode). This is used instead of btrfs_inode_in_log() because the latter
5388   * requires the inode to be in the log and have the log transaction committed,
5389   * while here we do not care if the log transaction was already committed - our
5390   * caller will commit the log later - and we want to avoid logging an inode
5391   * multiple times when multiple tasks have joined the same log transaction.
5392   */
5393  static bool need_log_inode(const struct btrfs_trans_handle *trans,
5394  			   struct btrfs_inode *inode)
5395  {
5396  	/*
5397  	 * If a directory was not modified (no dentries added or removed), we
5398  	 * can and should avoid logging it.
5399  	 */
5400  	if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid)
5401  		return false;
5402  
5403  	/*
5404  	 * If this inode does not have new/updated/deleted xattrs since the last
5405  	 * time it was logged and is flagged as logged in the current transaction,
5406  	 * we can skip logging it. As for new/deleted names, those are updated in
5407  	 * the log by link/unlink/rename operations.
5408  	 * In case the inode was logged and then evicted and reloaded, its
5409  	 * logged_trans will be 0, in which case we have to fully log it since
5410  	 * logged_trans is a transient field, not persisted.
5411  	 */
5412  	if (inode_logged(trans, inode, NULL) == 1 &&
5413  	    !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
5414  		return false;
5415  
5416  	return true;
5417  }
5418  
5419  struct btrfs_dir_list {
5420  	u64 ino;
5421  	struct list_head list;
5422  };
5423  
5424  /*
5425   * Log the inodes of the new dentries of a directory.
5426   * See process_dir_items_leaf() for details about why it is needed.
5427   * This is a recursive operation - if an existing dentry corresponds to a
5428   * directory, that directory's new entries are logged too (same behaviour as
5429   * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
5430   * the dentries point to, we do not acquire their VFS lock; otherwise lockdep
5431   * complains about the following circular lock dependency / possible deadlock:
5432   *
5433   *        CPU0                                        CPU1
5434   *        ----                                        ----
5435   * lock(&type->i_mutex_dir_key#3/2);
5436   *                                            lock(sb_internal#2);
5437   *                                            lock(&type->i_mutex_dir_key#3/2);
5438   * lock(&sb->s_type->i_mutex_key#14);
5439   *
5440   * Where sb_internal is the lock (a counter that works as a lock) acquired by
5441   * sb_start_intwrite() in btrfs_start_transaction().
5442   * Not acquiring the VFS lock of the inodes is still safe because:
5443   *
5444   * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
5445   *    that while logging the inode new references (names) are added or removed
5446   *    from the inode, leaving the logged inode item with a link count that does
5447   *    not match the number of logged inode reference items. This is fine because
5448   *    at log replay time we compute the real number of links and correct the
5449   *    link count in the inode item (see replay_one_buffer() and
5450   *    link_to_fixup_dir());
5451   *
5452   * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
5453   *    while logging the inode's items new index items (key type
5454   *    BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item
5455   *    has a size that doesn't match the sum of the lengths of all the logged
5456   *    names - this is ok, not a problem, because at log replay time we set the
5457   *    directory's i_size to the correct value (see replay_one_name() and
5458   *    overwrite_item()).
5459   */
5460  static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
5461  				struct btrfs_inode *start_inode,
5462  				struct btrfs_log_ctx *ctx)
5463  {
5464  	struct btrfs_root *root = start_inode->root;
5465  	struct btrfs_path *path;
5466  	LIST_HEAD(dir_list);
5467  	struct btrfs_dir_list *dir_elem;
5468  	u64 ino = btrfs_ino(start_inode);
5469  	struct btrfs_inode *curr_inode = start_inode;
5470  	int ret = 0;
5471  
5472  	/*
5473  	 * If we are logging a new name, as part of a link or rename operation,
5474  	 * don't bother logging new dentries, as we only want to log the
5475  	 * inode's names and make sure any new parents exist.
5476  	 */
5477  	if (ctx->logging_new_name)
5478  		return 0;
5479  
5480  	path = btrfs_alloc_path();
5481  	if (!path)
5482  		return -ENOMEM;
5483  
5484  	/* Pairs with btrfs_add_delayed_iput below. */
5485  	ihold(&curr_inode->vfs_inode);
5486  
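	/*
	 * Breadth-first walk: log the inode of every new dentry found in the
	 * log tree for the current directory, queueing any child directories
	 * in dir_list so their new dentries are processed in later iterations.
	 */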
5487  	while (true) {
5488  		struct inode *vfs_inode;
5489  		struct btrfs_key key;
5490  		struct btrfs_key found_key;
5491  		u64 next_index;
5492  		bool continue_curr_inode = true;
5493  		int iter_ret;
5494  
5495  		key.objectid = ino;
5496  		key.type = BTRFS_DIR_INDEX_KEY;
5497  		key.offset = btrfs_get_first_dir_index_to_log(curr_inode);
5498  		next_index = key.offset;
5499  again:
5500  		btrfs_for_each_slot(root->log_root, &key, &found_key, path, iter_ret) {
5501  			struct extent_buffer *leaf = path->nodes[0];
5502  			struct btrfs_dir_item *di;
5503  			struct btrfs_key di_key;
5504  			struct inode *di_inode;
5505  			int log_mode = LOG_INODE_EXISTS;
5506  			int type;
5507  
5508  			if (found_key.objectid != ino ||
5509  			    found_key.type != BTRFS_DIR_INDEX_KEY) {
5510  				continue_curr_inode = false;
5511  				break;
5512  			}
5513  
5514  			next_index = found_key.offset + 1;
5515  
5516  			di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
5517  			type = btrfs_dir_ftype(leaf, di);
5518  			if (btrfs_dir_transid(leaf, di) < trans->transid)
5519  				continue;
5520  			btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
5521  			if (di_key.type == BTRFS_ROOT_ITEM_KEY)
5522  				continue;
5523  
5524  			btrfs_release_path(path);
5525  			di_inode = btrfs_iget_logging(di_key.objectid, root);
5526  			if (IS_ERR(di_inode)) {
5527  				ret = PTR_ERR(di_inode);
5528  				goto out;
5529  			}
5530  
5531  			if (!need_log_inode(trans, BTRFS_I(di_inode))) {
5532  				btrfs_add_delayed_iput(BTRFS_I(di_inode));
5533  				break;
5534  			}
5535  
5536  			ctx->log_new_dentries = false;
5537  			if (type == BTRFS_FT_DIR)
5538  				log_mode = LOG_INODE_ALL;
5539  			ret = btrfs_log_inode(trans, BTRFS_I(di_inode),
5540  					      log_mode, ctx);
5541  			btrfs_add_delayed_iput(BTRFS_I(di_inode));
5542  			if (ret)
5543  				goto out;
5544  			if (ctx->log_new_dentries) {
5545  				dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
5546  				if (!dir_elem) {
5547  					ret = -ENOMEM;
5548  					goto out;
5549  				}
5550  				dir_elem->ino = di_key.objectid;
5551  				list_add_tail(&dir_elem->list, &dir_list);
5552  			}
5553  			break;
5554  		}
5555  
5556  		btrfs_release_path(path);
5557  
5558  		if (iter_ret < 0) {
5559  			ret = iter_ret;
5560  			goto out;
5561  		} else if (iter_ret > 0) {
5562  			continue_curr_inode = false;
5563  		} else {
5564  			key = found_key;
5565  		}
5566  
5567  		if (continue_curr_inode && key.offset < (u64)-1) {
5568  			key.offset++;
5569  			goto again;
5570  		}
5571  
5572  		btrfs_set_first_dir_index_to_log(curr_inode, next_index);
5573  
5574  		if (list_empty(&dir_list))
5575  			break;
5576  
5577  		dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, list);
5578  		ino = dir_elem->ino;
5579  		list_del(&dir_elem->list);
5580  		kfree(dir_elem);
5581  
5582  		btrfs_add_delayed_iput(curr_inode);
5583  		curr_inode = NULL;
5584  
5585  		vfs_inode = btrfs_iget_logging(ino, root);
5586  		if (IS_ERR(vfs_inode)) {
5587  			ret = PTR_ERR(vfs_inode);
5588  			break;
5589  		}
5590  		curr_inode = BTRFS_I(vfs_inode);
5591  	}
5592  out:
5593  	btrfs_free_path(path);
5594  	if (curr_inode)
5595  		btrfs_add_delayed_iput(curr_inode);
5596  
5597  	if (ret) {
5598  		struct btrfs_dir_list *next;
5599  
5600  		list_for_each_entry_safe(dir_elem, next, &dir_list, list)
5601  			kfree(dir_elem);
5602  	}
5603  
5604  	return ret;
5605  }
5606  
5607  struct btrfs_ino_list {
5608  	u64 ino;
5609  	u64 parent;
5610  	struct list_head list;
5611  };
5612  
5613  static void free_conflicting_inodes(struct btrfs_log_ctx *ctx)
5614  {
5615  	struct btrfs_ino_list *curr;
5616  	struct btrfs_ino_list *next;
5617  
5618  	list_for_each_entry_safe(curr, next, &ctx->conflict_inodes, list) {
5619  		list_del(&curr->list);
5620  		kfree(curr);
5621  	}
5622  }
5623  
5624  static int conflicting_inode_is_dir(struct btrfs_root *root, u64 ino,
5625  				    struct btrfs_path *path)
5626  {
5627  	struct btrfs_key key;
5628  	int ret;
5629  
5630  	key.objectid = ino;
5631  	key.type = BTRFS_INODE_ITEM_KEY;
5632  	key.offset = 0;
5633  
5634  	path->search_commit_root = 1;
5635  	path->skip_locking = 1;
5636  
5637  	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5638  	if (WARN_ON_ONCE(ret > 0)) {
5639  		/*
5640  		 * We have previously found the inode through the commit root
5641  		 * so this should not happen. If it does, just error out and
5642  		 * fall back to a transaction commit.
5643  		 */
5644  		ret = -ENOENT;
5645  	} else if (ret == 0) {
5646  		struct btrfs_inode_item *item;
5647  
5648  		item = btrfs_item_ptr(path->nodes[0], path->slots[0],
5649  				      struct btrfs_inode_item);
5650  		if (S_ISDIR(btrfs_inode_mode(path->nodes[0], item)))
5651  			ret = 1;
5652  	}
5653  
5654  	btrfs_release_path(path);
5655  	path->search_commit_root = 0;
5656  	path->skip_locking = 0;
5657  
5658  	return ret;
5659  }
5660  
5661  static int add_conflicting_inode(struct btrfs_trans_handle *trans,
5662  				 struct btrfs_root *root,
5663  				 struct btrfs_path *path,
5664  				 u64 ino, u64 parent,
5665  				 struct btrfs_log_ctx *ctx)
5666  {
5667  	struct btrfs_ino_list *ino_elem;
5668  	struct inode *inode;
5669  
5670  	/*
5671  	 * It's rare to have a lot of conflicting inodes, in practice it is not
5672  	 * common to have more than 1 or 2. We don't want to collect too many,
5673  	 * as we could end up logging too many inodes (even if only in
5674  	 * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction
5675  	 * commits.
5676  	 */
5677  	if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES)
5678  		return BTRFS_LOG_FORCE_COMMIT;
5679  
5680  	inode = btrfs_iget_logging(ino, root);
5681  	/*
5682  	 * If the other inode that had a conflicting dir entry was deleted in
5683  	 * the current transaction then we either:
5684  	 *
5685  	 * 1) Log the parent directory (later after adding it to the list) if
5686  	 *    the inode is a directory. This is because it may be a deleted
5687  	 *    subvolume/snapshot or it may be a regular directory that had
5688  	 *    deleted subvolumes/snapshots (or subdirectories that had them),
5689  	 *    and at the moment we can't deal with dropping subvolumes/snapshots
5690  	 *    during log replay. So we just log the parent, which will result in
5691  	 *    a fallback to a transaction commit if we are dealing with those
5692  	 *    cases (last_unlink_trans will match the current transaction);
5693  	 *
5694  	 * 2) Do nothing if it's not a directory. During log replay we simply
5695  	 *    unlink the conflicting dentry from the parent directory and then
5696  	 *    add the dentry for our inode. Like this we can avoid logging the
5697  	 *    parent directory (and maybe fallback to a transaction commit in
5698  	 *    case it has a last_unlink_trans == trans->transid, due to moving
5699  	 *    some inode from it to some other directory).
5700  	 */
5701  	if (IS_ERR(inode)) {
5702  		int ret = PTR_ERR(inode);
5703  
5704  		if (ret != -ENOENT)
5705  			return ret;
5706  
5707  		ret = conflicting_inode_is_dir(root, ino, path);
5708  		/* Not a directory or we got an error. */
5709  		if (ret <= 0)
5710  			return ret;
5711  
5712  		/* Conflicting inode is a directory, so we'll log its parent. */
5713  		ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
5714  		if (!ino_elem)
5715  			return -ENOMEM;
5716  		ino_elem->ino = ino;
5717  		ino_elem->parent = parent;
5718  		list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
5719  		ctx->num_conflict_inodes++;
5720  
5721  		return 0;
5722  	}
5723  
5724  	/*
5725  	 * If the inode was already logged skip it - otherwise we can hit an
5726  	 * infinite loop. Example:
5727  	 *
5728  	 * From the commit root (previous transaction) we have the following
5729  	 * inodes:
5730  	 *
5731  	 * inode 257 a directory
5732  	 * inode 258 with references "zz" and "zz_link" on inode 257
5733  	 * inode 259 with reference "a" on inode 257
5734  	 *
5735  	 * And in the current (uncommitted) transaction we have:
5736  	 *
5737  	 * inode 257 a directory, unchanged
5738  	 * inode 258 with references "a" and "a2" on inode 257
5739  	 * inode 259 with reference "zz_link" on inode 257
5740  	 * inode 261 with reference "zz" on inode 257
5741  	 *
5742  	 * When logging inode 261 the following infinite loop could
5743  	 * happen if we don't skip already logged inodes:
5744  	 *
5745  	 * - we detect inode 258 as a conflicting inode, with inode 261
5746  	 *   on reference "zz", and log it;
5747  	 *
5748  	 * - we detect inode 259 as a conflicting inode, with inode 258
5749  	 *   on reference "a", and log it;
5750  	 *
5751  	 * - we detect inode 258 as a conflicting inode, with inode 259
5752  	 *   on reference "zz_link", and log it - again! After this we
5753  	 *   repeat the above steps forever.
5754  	 *
5755  	 * Here we can use need_log_inode() because we only need to log the
5756  	 * inode in LOG_INODE_EXISTS mode and rename operations update the log,
5757  	 * so that the log ends up with the new name and without the old name.
5758  	 */
5759  	if (!need_log_inode(trans, BTRFS_I(inode))) {
5760  		btrfs_add_delayed_iput(BTRFS_I(inode));
5761  		return 0;
5762  	}
5763  
5764  	btrfs_add_delayed_iput(BTRFS_I(inode));
5765  
5766  	ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
5767  	if (!ino_elem)
5768  		return -ENOMEM;
5769  	ino_elem->ino = ino;
5770  	ino_elem->parent = parent;
5771  	list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
5772  	ctx->num_conflict_inodes++;
5773  
5774  	return 0;
5775  }
5776  
5777  static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
5778  				  struct btrfs_root *root,
5779  				  struct btrfs_log_ctx *ctx)
5780  {
5781  	int ret = 0;
5782  
5783  	/*
5784  	 * Conflicting inodes are logged by the first call to btrfs_log_inode(),
5785  	 * otherwise we could have unbounded recursion of btrfs_log_inode()
5786  	 * calls. This check guarantees we can have only 1 level of recursion.
5787  	 */
5788  	if (ctx->logging_conflict_inodes)
5789  		return 0;
5790  
5791  	ctx->logging_conflict_inodes = true;
5792  
5793  	/*
5794  	 * New conflicting inodes may be found and added to the list while we
5795  	 * are logging a conflicting inode, so keep iterating while the list is
5796  	 * not empty.
5797  	 */
5798  	while (!list_empty(&ctx->conflict_inodes)) {
5799  		struct btrfs_ino_list *curr;
5800  		struct inode *inode;
5801  		u64 ino;
5802  		u64 parent;
5803  
5804  		curr = list_first_entry(&ctx->conflict_inodes,
5805  					struct btrfs_ino_list, list);
5806  		ino = curr->ino;
5807  		parent = curr->parent;
5808  		list_del(&curr->list);
5809  		kfree(curr);
5810  
5811  		inode = btrfs_iget_logging(ino, root);
5812  		/*
5813  		 * If the other inode that had a conflicting dir entry was
5814  		 * deleted in the current transaction, we need to log its parent
5815  		 * directory. See the comment at add_conflicting_inode().
5816  		 */
5817  		if (IS_ERR(inode)) {
5818  			ret = PTR_ERR(inode);
5819  			if (ret != -ENOENT)
5820  				break;
5821  
5822  			inode = btrfs_iget_logging(parent, root);
5823  			if (IS_ERR(inode)) {
5824  				ret = PTR_ERR(inode);
5825  				break;
5826  			}
5827  
5828  			/*
5829  			 * Always log the directory, we cannot make this
5830  			 * conditional on need_log_inode() because the directory
5831  			 * might have been logged in LOG_INODE_EXISTS mode or
5832  			 * the dir index of the conflicting inode is not in a
5833  			 * dir index key range logged for the directory. So we
5834  			 * must make sure the deletion is recorded.
5835  			 */
5836  			ret = btrfs_log_inode(trans, BTRFS_I(inode),
5837  					      LOG_INODE_ALL, ctx);
5838  			btrfs_add_delayed_iput(BTRFS_I(inode));
5839  			if (ret)
5840  				break;
5841  			continue;
5842  		}
5843  
5844  		/*
5845  		 * Here we can use need_log_inode() because we only need to log
5846  		 * the inode in LOG_INODE_EXISTS mode and rename operations
5847  		 * update the log, so that the log ends up with the new name and
5848  		 * without the old name.
5849  		 *
5850  		 * We did this check at add_conflicting_inode(), but here we do
5851  		 * it again because if some other task logged the inode after
5852  		 * that, we can avoid doing it again.
5853  		 */
5854  		if (!need_log_inode(trans, BTRFS_I(inode))) {
5855  			btrfs_add_delayed_iput(BTRFS_I(inode));
5856  			continue;
5857  		}
5858  
5859  		/*
5860  		 * We are safe logging the other inode without acquiring its
5861  		 * lock as long as we log with the LOG_INODE_EXISTS mode. We
5862  		 * are safe against concurrent renames of the other inode as
5863  		 * well because during a rename we pin the log and update the
5864  		 * log with the new name before we unpin it.
5865  		 */
5866  		ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx);
5867  		btrfs_add_delayed_iput(BTRFS_I(inode));
5868  		if (ret)
5869  			break;
5870  	}
5871  
5872  	ctx->logging_conflict_inodes = false;
5873  	if (ret)
5874  		free_conflicting_inodes(ctx);
5875  
5876  	return ret;
5877  }
5878  
5879  static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
5880  				   struct btrfs_inode *inode,
5881  				   struct btrfs_key *min_key,
5882  				   const struct btrfs_key *max_key,
5883  				   struct btrfs_path *path,
5884  				   struct btrfs_path *dst_path,
5885  				   const u64 logged_isize,
5886  				   const int inode_only,
5887  				   struct btrfs_log_ctx *ctx,
5888  				   bool *need_log_inode_item)
5889  {
5890  	const u64 i_size = i_size_read(&inode->vfs_inode);
5891  	struct btrfs_root *root = inode->root;
5892  	int ins_start_slot = 0;
5893  	int ins_nr = 0;
5894  	int ret;
5895  
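	/*
	 * Use btrfs_search_forward() to visit only leaves modified in the
	 * current transaction, batching contiguous slots (ins_start_slot,
	 * ins_nr) and copying them to the log tree with copy_items().
	 */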
5896  	while (1) {
5897  		ret = btrfs_search_forward(root, min_key, path, trans->transid);
5898  		if (ret < 0)
5899  			return ret;
5900  		if (ret > 0) {
5901  			ret = 0;
5902  			break;
5903  		}
5904  again:
5905  		/* Note, ins_nr might be > 0 here, cleanup outside the loop */
5906  		if (min_key->objectid != max_key->objectid)
5907  			break;
5908  		if (min_key->type > max_key->type)
5909  			break;
5910  
5911  		if (min_key->type == BTRFS_INODE_ITEM_KEY) {
5912  			*need_log_inode_item = false;
5913  		} else if (min_key->type == BTRFS_EXTENT_DATA_KEY &&
5914  			   min_key->offset >= i_size) {
5915  			/*
5916  			 * Extents at and beyond eof are logged with
5917  			 * btrfs_log_prealloc_extents().
5918  			 * Only regular files have BTRFS_EXTENT_DATA_KEY keys,
5919  			 * and no keys greater than that, so bail out.
5920  			 */
5921  			break;
5922  		} else if ((min_key->type == BTRFS_INODE_REF_KEY ||
5923  			    min_key->type == BTRFS_INODE_EXTREF_KEY) &&
5924  			   (inode->generation == trans->transid ||
5925  			    ctx->logging_conflict_inodes)) {
5926  			u64 other_ino = 0;
5927  			u64 other_parent = 0;
5928  
5929  			ret = btrfs_check_ref_name_override(path->nodes[0],
5930  					path->slots[0], min_key, inode,
5931  					&other_ino, &other_parent);
5932  			if (ret < 0) {
5933  				return ret;
5934  			} else if (ret > 0 &&
5935  				   other_ino != btrfs_ino(ctx->inode)) {
5936  				if (ins_nr > 0) {
5937  					ins_nr++;
5938  				} else {
5939  					ins_nr = 1;
5940  					ins_start_slot = path->slots[0];
5941  				}
5942  				ret = copy_items(trans, inode, dst_path, path,
5943  						 ins_start_slot, ins_nr,
5944  						 inode_only, logged_isize, ctx);
5945  				if (ret < 0)
5946  					return ret;
5947  				ins_nr = 0;
5948  
5949  				btrfs_release_path(path);
5950  				ret = add_conflicting_inode(trans, root, path,
5951  							    other_ino,
5952  							    other_parent, ctx);
5953  				if (ret)
5954  					return ret;
5955  				goto next_key;
5956  			}
5957  		} else if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
5958  			/* Skip xattrs, logged later with btrfs_log_all_xattrs() */
5959  			if (ins_nr == 0)
5960  				goto next_slot;
5961  			ret = copy_items(trans, inode, dst_path, path,
5962  					 ins_start_slot,
5963  					 ins_nr, inode_only, logged_isize, ctx);
5964  			if (ret < 0)
5965  				return ret;
5966  			ins_nr = 0;
5967  			goto next_slot;
5968  		}
5969  
5970  		if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
5971  			ins_nr++;
5972  			goto next_slot;
5973  		} else if (!ins_nr) {
5974  			ins_start_slot = path->slots[0];
5975  			ins_nr = 1;
5976  			goto next_slot;
5977  		}
5978  
5979  		ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
5980  				 ins_nr, inode_only, logged_isize, ctx);
5981  		if (ret < 0)
5982  			return ret;
5983  		ins_nr = 1;
5984  		ins_start_slot = path->slots[0];
5985  next_slot:
5986  		path->slots[0]++;
5987  		if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
5988  			btrfs_item_key_to_cpu(path->nodes[0], min_key,
5989  					      path->slots[0]);
5990  			goto again;
5991  		}
5992  		if (ins_nr) {
5993  			ret = copy_items(trans, inode, dst_path, path,
5994  					 ins_start_slot, ins_nr, inode_only,
5995  					 logged_isize, ctx);
5996  			if (ret < 0)
5997  				return ret;
5998  			ins_nr = 0;
5999  		}
6000  		btrfs_release_path(path);
6001  next_key:
6002  		if (min_key->offset < (u64)-1) {
6003  			min_key->offset++;
6004  		} else if (min_key->type < max_key->type) {
6005  			min_key->type++;
6006  			min_key->offset = 0;
6007  		} else {
6008  			break;
6009  		}
6010  
6011  		/*
6012  		 * We may process many leaves full of items for our inode, so
6013  		 * avoid monopolizing a CPU for too long by rescheduling while
6014  		 * not holding locks on any tree.
6015  		 */
6016  		cond_resched();
6017  	}
6018  	if (ins_nr) {
6019  		ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
6020  				 ins_nr, inode_only, logged_isize, ctx);
6021  		if (ret)
6022  			return ret;
6023  	}
6024  
6025  	if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) {
6026  		/*
6027  		 * Release the path because otherwise we might attempt to double
6028  		 * lock the same leaf with btrfs_log_prealloc_extents() below.
6029  		 */
6030  		btrfs_release_path(path);
6031  		ret = btrfs_log_prealloc_extents(trans, inode, dst_path, ctx);
6032  	}
6033  
6034  	return ret;
6035  }
6036  
6037  static int insert_delayed_items_batch(struct btrfs_trans_handle *trans,
6038  				      struct btrfs_root *log,
6039  				      struct btrfs_path *path,
6040  				      const struct btrfs_item_batch *batch,
6041  				      const struct btrfs_delayed_item *first_item)
6042  {
6043  	const struct btrfs_delayed_item *curr = first_item;
6044  	int ret;
6045  
6046  	ret = btrfs_insert_empty_items(trans, log, path, batch);
6047  	if (ret)
6048  		return ret;
6049  
6050  	for (int i = 0; i < batch->nr; i++) {
6051  		char *data_ptr;
6052  
6053  		data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char);
6054  		write_extent_buffer(path->nodes[0], &curr->data,
6055  				    (unsigned long)data_ptr, curr->data_len);
6056  		curr = list_next_entry(curr, log_list);
6057  		path->slots[0]++;
6058  	}
6059  
6060  	btrfs_release_path(path);
6061  
6062  	return 0;
6063  }
6064  
6065  static int log_delayed_insertion_items(struct btrfs_trans_handle *trans,
6066  				       struct btrfs_inode *inode,
6067  				       struct btrfs_path *path,
6068  				       const struct list_head *delayed_ins_list,
6069  				       struct btrfs_log_ctx *ctx)
6070  {
6071  	/* 195 (4095 bytes of keys and sizes) fits in a single 4K page. */
6072  	const int max_batch_size = 195;
6073  	const int leaf_data_size = BTRFS_LEAF_DATA_SIZE(trans->fs_info);
6074  	const u64 ino = btrfs_ino(inode);
6075  	struct btrfs_root *log = inode->root->log_root;
6076  	struct btrfs_item_batch batch = {
6077  		.nr = 0,
6078  		.total_data_size = 0,
6079  	};
6080  	const struct btrfs_delayed_item *first = NULL;
6081  	const struct btrfs_delayed_item *curr;
6082  	char *ins_data;
6083  	struct btrfs_key *ins_keys;
6084  	u32 *ins_sizes;
6085  	u64 curr_batch_size = 0;
6086  	int batch_idx = 0;
6087  	int ret;
6088  
6089  	/* We are adding dir index items to the log tree. */
6090  	lockdep_assert_held(&inode->log_mutex);
6091  
6092  	/*
6093  	 * We collect delayed items before copying index keys from the subvolume
6094  	 * to the log tree. However just after we collected them, they may have
6095  	 * been flushed (all of them or just some of them), and therefore we
6096  	 * could have copied them from the subvolume tree to the log tree.
6097  	 * So find the first delayed item that was not yet logged (they are
6098  	 * sorted by index number).
6099  	 */
6100  	list_for_each_entry(curr, delayed_ins_list, log_list) {
6101  		if (curr->index > inode->last_dir_index_offset) {
6102  			first = curr;
6103  			break;
6104  		}
6105  	}
6106  
6107  	/* Empty list or all delayed items were already logged. */
6108  	if (!first)
6109  		return 0;
6110  
6111  	ins_data = kmalloc(max_batch_size * sizeof(u32) +
6112  			   max_batch_size * sizeof(struct btrfs_key), GFP_NOFS);
6113  	if (!ins_data)
6114  		return -ENOMEM;
6115  	ins_sizes = (u32 *)ins_data;
6116  	batch.data_sizes = ins_sizes;
6117  	ins_keys = (struct btrfs_key *)(ins_data + max_batch_size * sizeof(u32));
6118  	batch.keys = ins_keys;
6119  
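	/*
	 * Group the delayed dir index items into batches that fit in a single
	 * leaf and contain at most max_batch_size items, inserting each batch
	 * into the log tree with one insert_delayed_items_batch() call.
	 */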
6120  	curr = first;
6121  	while (!list_entry_is_head(curr, delayed_ins_list, log_list)) {
6122  		const u32 curr_size = curr->data_len + sizeof(struct btrfs_item);
6123  
6124  		if (curr_batch_size + curr_size > leaf_data_size ||
6125  		    batch.nr == max_batch_size) {
6126  			ret = insert_delayed_items_batch(trans, log, path,
6127  							 &batch, first);
6128  			if (ret)
6129  				goto out;
6130  			batch_idx = 0;
6131  			batch.nr = 0;
6132  			batch.total_data_size = 0;
6133  			curr_batch_size = 0;
6134  			first = curr;
6135  		}
6136  
6137  		ins_sizes[batch_idx] = curr->data_len;
6138  		ins_keys[batch_idx].objectid = ino;
6139  		ins_keys[batch_idx].type = BTRFS_DIR_INDEX_KEY;
6140  		ins_keys[batch_idx].offset = curr->index;
6141  		curr_batch_size += curr_size;
6142  		batch.total_data_size += curr->data_len;
6143  		batch.nr++;
6144  		batch_idx++;
6145  		curr = list_next_entry(curr, log_list);
6146  	}
6147  
6148  	ASSERT(batch.nr >= 1);
6149  	ret = insert_delayed_items_batch(trans, log, path, &batch, first);
6150  
6151  	curr = list_last_entry(delayed_ins_list, struct btrfs_delayed_item,
6152  			       log_list);
6153  	inode->last_dir_index_offset = curr->index;
6154  out:
6155  	kfree(ins_data);
6156  
6157  	return ret;
6158  }
6159  
6160  static int log_delayed_deletions_full(struct btrfs_trans_handle *trans,
6161  				      struct btrfs_inode *inode,
6162  				      struct btrfs_path *path,
6163  				      const struct list_head *delayed_del_list,
6164  				      struct btrfs_log_ctx *ctx)
6165  {
6166  	const u64 ino = btrfs_ino(inode);
6167  	const struct btrfs_delayed_item *curr;
6168  
6169  	curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item,
6170  				log_list);
6171  
6172  	while (!list_entry_is_head(curr, delayed_del_list, log_list)) {
6173  		u64 first_dir_index = curr->index;
6174  		u64 last_dir_index;
6175  		const struct btrfs_delayed_item *next;
6176  		int ret;
6177  
6178  		/*
6179  		 * Find a range of consecutive dir index items to delete. Like
6180  		 * this we log a single dir range item spanning several contiguous
6181  		 * dir items instead of logging one range item per dir index item.
6182  		 */
6183  		next = list_next_entry(curr, log_list);
6184  		while (!list_entry_is_head(next, delayed_del_list, log_list)) {
6185  			if (next->index != curr->index + 1)
6186  				break;
6187  			curr = next;
6188  			next = list_next_entry(next, log_list);
6189  		}
6190  
6191  		last_dir_index = curr->index;
6192  		ASSERT(last_dir_index >= first_dir_index);
6193  
6194  		ret = insert_dir_log_key(trans, inode->root->log_root, path,
6195  					 ino, first_dir_index, last_dir_index);
6196  		if (ret)
6197  			return ret;
6198  		curr = list_next_entry(curr, log_list);
6199  	}
6200  
6201  	return 0;
6202  }
6203  
6204  static int batch_delete_dir_index_items(struct btrfs_trans_handle *trans,
6205  					struct btrfs_inode *inode,
6206  					struct btrfs_path *path,
6207  					struct btrfs_log_ctx *ctx,
6208  					const struct list_head *delayed_del_list,
6209  					const struct btrfs_delayed_item *first,
6210  					const struct btrfs_delayed_item **last_ret)
6211  {
6212  	const struct btrfs_delayed_item *next;
6213  	struct extent_buffer *leaf = path->nodes[0];
6214  	const int last_slot = btrfs_header_nritems(leaf) - 1;
6215  	int slot = path->slots[0] + 1;
6216  	const u64 ino = btrfs_ino(inode);
6217  
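	/*
	 * Extend the deletion over consecutive slots of the current leaf as
	 * long as they match the next delayed items, so that a single
	 * btrfs_del_items() call removes the whole run.
	 */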
6218  	next = list_next_entry(first, log_list);
6219  
6220  	while (slot < last_slot &&
6221  	       !list_entry_is_head(next, delayed_del_list, log_list)) {
6222  		struct btrfs_key key;
6223  
6224  		btrfs_item_key_to_cpu(leaf, &key, slot);
6225  		if (key.objectid != ino ||
6226  		    key.type != BTRFS_DIR_INDEX_KEY ||
6227  		    key.offset != next->index)
6228  			break;
6229  
6230  		slot++;
6231  		*last_ret = next;
6232  		next = list_next_entry(next, log_list);
6233  	}
6234  
6235  	return btrfs_del_items(trans, inode->root->log_root, path,
6236  			       path->slots[0], slot - path->slots[0]);
6237  }
6238  
6239  static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans,
6240  					     struct btrfs_inode *inode,
6241  					     struct btrfs_path *path,
6242  					     const struct list_head *delayed_del_list,
6243  					     struct btrfs_log_ctx *ctx)
6244  {
6245  	struct btrfs_root *log = inode->root->log_root;
6246  	const struct btrfs_delayed_item *curr;
6247  	u64 last_range_start = 0;
6248  	u64 last_range_end = 0;
6249  	struct btrfs_key key;
6250  
6251  	key.objectid = btrfs_ino(inode);
6252  	key.type = BTRFS_DIR_INDEX_KEY;
6253  	curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item,
6254  				log_list);
6255  
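	/*
	 * For each run of delayed deletion items: if matching dir index items
	 * still exist in the log tree, delete them in a batch; otherwise log a
	 * dir index range item covering the run so log replay knows those
	 * entries were removed.
	 */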
6256  	while (!list_entry_is_head(curr, delayed_del_list, log_list)) {
6257  		const struct btrfs_delayed_item *last = curr;
6258  		u64 first_dir_index = curr->index;
6259  		u64 last_dir_index;
6260  		bool deleted_items = false;
6261  		int ret;
6262  
6263  		key.offset = curr->index;
6264  		ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
6265  		if (ret < 0) {
6266  			return ret;
6267  		} else if (ret == 0) {
6268  			ret = batch_delete_dir_index_items(trans, inode, path, ctx,
6269  							   delayed_del_list, curr,
6270  							   &last);
6271  			if (ret)
6272  				return ret;
6273  			deleted_items = true;
6274  		}
6275  
6276  		btrfs_release_path(path);
6277  
6278  		/*
6279  		 * If we deleted items from the leaf, it means we have a range
6280  		 * item logging their range, so no need to add one or update an
6281  		 * existing one. Otherwise we have to log a dir range item.
6282  		 */
6283  		if (deleted_items)
6284  			goto next_batch;
6285  
6286  		last_dir_index = last->index;
6287  		ASSERT(last_dir_index >= first_dir_index);
6288  		/*
6289  		 * If this range starts right after where the previous one ends,
6290  		 * then we want to reuse the previous range item and change its
6291  		 * end offset to the end of this range. This is just to minimize
6292  		 * leaf space usage, by avoiding adding a new range item.
6293  		 */
6294  		if (last_range_end != 0 && first_dir_index == last_range_end + 1)
6295  			first_dir_index = last_range_start;
6296  
6297  		ret = insert_dir_log_key(trans, log, path, key.objectid,
6298  					 first_dir_index, last_dir_index);
6299  		if (ret)
6300  			return ret;
6301  
6302  		last_range_start = first_dir_index;
6303  		last_range_end = last_dir_index;
6304  next_batch:
6305  		curr = list_next_entry(last, log_list);
6306  	}
6307  
6308  	return 0;
6309  }
6310  
6311  static int log_delayed_deletion_items(struct btrfs_trans_handle *trans,
6312  				      struct btrfs_inode *inode,
6313  				      struct btrfs_path *path,
6314  				      const struct list_head *delayed_del_list,
6315  				      struct btrfs_log_ctx *ctx)
6316  {
6317  	/*
6318  	 * We are deleting dir index items from the log tree or adding range
6319  	 * items to it.
6320  	 */
6321  	lockdep_assert_held(&inode->log_mutex);
6322  
6323  	if (list_empty(delayed_del_list))
6324  		return 0;
6325  
6326  	if (ctx->logged_before)
6327  		return log_delayed_deletions_incremental(trans, inode, path,
6328  							 delayed_del_list, ctx);
6329  
6330  	return log_delayed_deletions_full(trans, inode, path, delayed_del_list,
6331  					  ctx);
6332  }
6333  
6334  /*
6335   * Similar logic to log_new_dir_dentries(), but it iterates over the delayed
6336   * items instead of the subvolume tree.
6337   */
6338  static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
6339  				    struct btrfs_inode *inode,
6340  				    const struct list_head *delayed_ins_list,
6341  				    struct btrfs_log_ctx *ctx)
6342  {
6343  	const bool orig_log_new_dentries = ctx->log_new_dentries;
6344  	struct btrfs_delayed_item *item;
6345  	int ret = 0;
6346  
6347  	/*
6348  	 * The log mutex is not needed here, and not holding it also avoids
6349  	 * potential deadlocks and lockdep warnings caused by nesting the
6350  	 * delayed inode mutexes with log mutexes.
6351  	 */
6352  	lockdep_assert_not_held(&inode->log_mutex);
6353  
6354  	ASSERT(!ctx->logging_new_delayed_dentries);
6355  	ctx->logging_new_delayed_dentries = true;
6356  
6357  	list_for_each_entry(item, delayed_ins_list, log_list) {
6358  		struct btrfs_dir_item *dir_item;
6359  		struct inode *di_inode;
6360  		struct btrfs_key key;
6361  		int log_mode = LOG_INODE_EXISTS;
6362  
6363  		dir_item = (struct btrfs_dir_item *)item->data;
6364  		btrfs_disk_key_to_cpu(&key, &dir_item->location);
6365  
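		/* Skip entries that point to a subvolume root, we don't log those here. */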
6366  		if (key.type == BTRFS_ROOT_ITEM_KEY)
6367  			continue;
6368  
6369  		di_inode = btrfs_iget_logging(key.objectid, inode->root);
6370  		if (IS_ERR(di_inode)) {
6371  			ret = PTR_ERR(di_inode);
6372  			break;
6373  		}
6374  
6375  		if (!need_log_inode(trans, BTRFS_I(di_inode))) {
6376  			btrfs_add_delayed_iput(BTRFS_I(di_inode));
6377  			continue;
6378  		}
6379  
6380  		if (btrfs_stack_dir_ftype(dir_item) == BTRFS_FT_DIR)
6381  			log_mode = LOG_INODE_ALL;
6382  
6383  		ctx->log_new_dentries = false;
6384  		ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx);
6385  
6386  		if (!ret && ctx->log_new_dentries)
6387  			ret = log_new_dir_dentries(trans, BTRFS_I(di_inode), ctx);
6388  
6389  		btrfs_add_delayed_iput(BTRFS_I(di_inode));
6390  
6391  		if (ret)
6392  			break;
6393  	}
6394  
6395  	ctx->log_new_dentries = orig_log_new_dentries;
6396  	ctx->logging_new_delayed_dentries = false;
6397  
6398  	return ret;
6399  }
6400  
6401  /* log a single inode in the tree log.
6402   * At least one parent directory for this inode must exist in the tree
6403   * or be logged already.
6404   *
6405   * Any items from this inode changed by the current transaction are copied
6406   * to the log tree.  An extra reference is taken on any extents in this
6407   * file, allowing us to avoid a whole pile of corner cases around logging
6408   * blocks that have been removed from the tree.
6409   *
6410   * See LOG_INODE_ALL and related defines for a description of what inode_only
6411   * does.
6412   *
6413   * This handles both files and directories.
6414   */
6415  static int btrfs_log_inode(struct btrfs_trans_handle *trans,
6416  			   struct btrfs_inode *inode,
6417  			   int inode_only,
6418  			   struct btrfs_log_ctx *ctx)
6419  {
6420  	struct btrfs_path *path;
6421  	struct btrfs_path *dst_path;
6422  	struct btrfs_key min_key;
6423  	struct btrfs_key max_key;
6424  	struct btrfs_root *log = inode->root->log_root;
6425  	int ret;
6426  	bool fast_search = false;
6427  	u64 ino = btrfs_ino(inode);
6428  	struct extent_map_tree *em_tree = &inode->extent_tree;
6429  	u64 logged_isize = 0;
6430  	bool need_log_inode_item = true;
6431  	bool xattrs_logged = false;
6432  	bool inode_item_dropped = true;
6433  	bool full_dir_logging = false;
6434  	LIST_HEAD(delayed_ins_list);
6435  	LIST_HEAD(delayed_del_list);
6436  
6437  	path = btrfs_alloc_path();
6438  	if (!path)
6439  		return -ENOMEM;
6440  	dst_path = btrfs_alloc_path();
6441  	if (!dst_path) {
6442  		btrfs_free_path(path);
6443  		return -ENOMEM;
6444  	}
6445  
6446  	min_key.objectid = ino;
6447  	min_key.type = BTRFS_INODE_ITEM_KEY;
6448  	min_key.offset = 0;
6449  
6450  	max_key.objectid = ino;
6451  
6452  
6453  	/* today the code can only do partial logging of directories */
6454  	if (S_ISDIR(inode->vfs_inode.i_mode) ||
6455  	    (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
6456  		       &inode->runtime_flags) &&
6457  	     inode_only >= LOG_INODE_EXISTS))
6458  		max_key.type = BTRFS_XATTR_ITEM_KEY;
6459  	else
6460  		max_key.type = (u8)-1;
6461  	max_key.offset = (u64)-1;
6462  
6463  	if (S_ISDIR(inode->vfs_inode.i_mode) && inode_only == LOG_INODE_ALL)
6464  		full_dir_logging = true;
6465  
6466  	/*
6467  	 * If we are logging a directory while we are logging dentries of the
6468  	 * delayed items of some other inode, then we need to flush the delayed
6469  	 * items of this directory and not log the delayed items directly. This
6470  	 * is to prevent more than one level of recursion into btrfs_log_inode()
6471  	 * by having something like this:
6472  	 *
6473  	 *     $ mkdir -p a/b/c/d/e/f/g/h/...
6474  	 *     $ xfs_io -c "fsync" a
6475  	 *
6476  	 * Where all directories in the path did not exist before and are
6477  	 * created in the current transaction.
6478  	 * So in such a case we directly log the delayed items of the main
6479  	 * directory ("a") without flushing them first, while for each of its
6480  	 * subdirectories we flush their delayed items before logging them.
6481  	 * This prevents a potential unbounded recursion like this:
6482  	 *
6483  	 * btrfs_log_inode()
6484  	 *   log_new_delayed_dentries()
6485  	 *      btrfs_log_inode()
6486  	 *        log_new_delayed_dentries()
6487  	 *          btrfs_log_inode()
6488  	 *            log_new_delayed_dentries()
6489  	 *              (...)
6490  	 *
6491  	 * We have thresholds for the maximum number of delayed items to have in
6492  	 * memory, and once they are hit, the items are flushed asynchronously.
6493  	 * However the limit is quite high, so let's prevent deep levels of
6494  	 * recursion from happening by limiting the maximum depth to 1.
6495  	 */
6496  	if (full_dir_logging && ctx->logging_new_delayed_dentries) {
6497  		ret = btrfs_commit_inode_delayed_items(trans, inode);
6498  		if (ret)
6499  			goto out;
6500  	}
6501  
6502  	mutex_lock(&inode->log_mutex);
6503  
6504  	/*
6505  	 * For symlinks, we must always log their content, which is stored in an
6506  	 * inline extent, otherwise we could end up with an empty symlink after
6507  	 * log replay, which is invalid on linux (symlink(2) returns -ENOENT if
6508  	 * one attempts to create an empty symlink).
6509  	 * We don't need to worry about flushing delalloc, because we create the
6510  	 * inline extent when the symlink is created (we never have delalloc for
6511  	 * symlinks).
6512  	 */
6513  	if (S_ISLNK(inode->vfs_inode.i_mode))
6514  		inode_only = LOG_INODE_ALL;
6515  
6516  	/*
6517  	 * Before logging the inode item, cache the value returned by
6518  	 * inode_logged(), because later we need to know if the inode was
6519  	 * previously logged in this transaction.
6520  	 */
6521  	ret = inode_logged(trans, inode, path);
6522  	if (ret < 0)
6523  		goto out_unlock;
6524  	ctx->logged_before = (ret == 1);
6525  	ret = 0;
6526  
6527  	/*
6528  	 * This is for cases where logging a directory could result in losing a
6529  	 * file after replaying the log. For example, if we move a file from a
6530  	 * directory A to a directory B, then fsync directory A, we have no way
6531  	 * to know the file was moved from A to B, so logging just A would
6532  	 * result in losing the file after a log replay.
6533  	 */
6534  	if (full_dir_logging && inode->last_unlink_trans >= trans->transid) {
6535  		ret = BTRFS_LOG_FORCE_COMMIT;
6536  		goto out_unlock;
6537  	}
6538  
6539  	/*
6540  	 * a brute force approach to making sure we get the most up-to-date
6541  	 * copies of everything.
6542  	 */
6543  	if (S_ISDIR(inode->vfs_inode.i_mode)) {
6544  		clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
6545  		if (ctx->logged_before)
6546  			ret = drop_inode_items(trans, log, path, inode,
6547  					       BTRFS_XATTR_ITEM_KEY);
6548  	} else {
6549  		if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) {
6550  			/*
6551  			 * Make sure the new inode item we write to the log has
6552  			 * the same isize as the current one (if it exists).
6553  			 * This is necessary to prevent data loss after log
6554  			 * replay, and also to prevent doing a wrong expanding
6555  			 * truncate - e.g. create a file, write 4K into offset
6556  			 * 0, fsync, write 4K into offset 4096, add a hard link,
6557  			 * fsync some other file (to sync log), power fail - if
6558  			 * we use the inode's current i_size, after log replay
6559  			 * we get an 8K file, with the last 4K extent as a hole
6560  			 * (zeroes), as if an expanding truncate happened,
6561  			 * instead of getting a file of 4K only.
6562  			 */
6563  			ret = logged_inode_size(log, inode, path, &logged_isize);
6564  			if (ret)
6565  				goto out_unlock;
6566  		}
6567  		if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
6568  			     &inode->runtime_flags)) {
6569  			if (inode_only == LOG_INODE_EXISTS) {
6570  				max_key.type = BTRFS_XATTR_ITEM_KEY;
6571  				if (ctx->logged_before)
6572  					ret = drop_inode_items(trans, log, path,
6573  							       inode, max_key.type);
6574  			} else {
6575  				clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
6576  					  &inode->runtime_flags);
6577  				clear_bit(BTRFS_INODE_COPY_EVERYTHING,
6578  					  &inode->runtime_flags);
6579  				if (ctx->logged_before)
6580  					ret = truncate_inode_items(trans, log,
6581  								   inode, 0, 0);
6582  			}
6583  		} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
6584  					      &inode->runtime_flags) ||
6585  			   inode_only == LOG_INODE_EXISTS) {
6586  			if (inode_only == LOG_INODE_ALL)
6587  				fast_search = true;
6588  			max_key.type = BTRFS_XATTR_ITEM_KEY;
6589  			if (ctx->logged_before)
6590  				ret = drop_inode_items(trans, log, path, inode,
6591  						       max_key.type);
6592  		} else {
6593  			if (inode_only == LOG_INODE_ALL)
6594  				fast_search = true;
6595  			inode_item_dropped = false;
6596  			goto log_extents;
6597  		}
6598  
6599  	}
6600  	if (ret)
6601  		goto out_unlock;
6602  
6603  	/*
6604  	 * If we are logging a directory in full mode, collect the delayed items
6605  	 * before iterating the subvolume tree, so that we don't miss any new
6606  	 * dir index items in case they get flushed while or right after we are
6607  	 * iterating the subvolume tree.
6608  	 */
6609  	if (full_dir_logging && !ctx->logging_new_delayed_dentries)
6610  		btrfs_log_get_delayed_items(inode, &delayed_ins_list,
6611  					    &delayed_del_list);
6612  
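	/*
	 * Copy the inode's items in the [min_key, max_key] range from the
	 * subvolume tree to the log tree.
	 */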
6613  	ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
6614  				      path, dst_path, logged_isize,
6615  				      inode_only, ctx,
6616  				      &need_log_inode_item);
6617  	if (ret)
6618  		goto out_unlock;
6619  
6620  	btrfs_release_path(path);
6621  	btrfs_release_path(dst_path);
6622  	ret = btrfs_log_all_xattrs(trans, inode, path, dst_path, ctx);
6623  	if (ret)
6624  		goto out_unlock;
6625  	xattrs_logged = true;
6626  	if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
6627  		btrfs_release_path(path);
6628  		btrfs_release_path(dst_path);
6629  		ret = btrfs_log_holes(trans, inode, path);
6630  		if (ret)
6631  			goto out_unlock;
6632  	}
6633  log_extents:
6634  	btrfs_release_path(path);
6635  	btrfs_release_path(dst_path);
6636  	if (need_log_inode_item) {
6637  		ret = log_inode_item(trans, log, dst_path, inode, inode_item_dropped);
6638  		if (ret)
6639  			goto out_unlock;
6640  		/*
6641  		 * If we are doing a fast fsync and the inode was logged before
6642  		 * in this transaction, we don't need to log the xattrs because
6643  		 * they were logged before. If xattrs were added, changed or
6644  		 * deleted since the last time we logged the inode, then we have
6645  		 * already logged them because the inode had the runtime flag
6646  		 * BTRFS_INODE_COPY_EVERYTHING set.
6647  		 */
6648  		if (!xattrs_logged && inode->logged_trans < trans->transid) {
6649  			ret = btrfs_log_all_xattrs(trans, inode, path, dst_path, ctx);
6650  			if (ret)
6651  				goto out_unlock;
6652  			btrfs_release_path(path);
6653  		}
6654  	}
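	/*
	 * Fast fsync path: log only the extents in the inode's list of modified
	 * extent maps, together with their checksums.
	 */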
6655  	if (fast_search) {
6656  		ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx);
6657  		if (ret)
6658  			goto out_unlock;
6659  	} else if (inode_only == LOG_INODE_ALL) {
6660  		struct extent_map *em, *n;
6661  
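		/*
		 * We copied all extent items from the subvolume tree above, so
		 * the extent maps tracked for fast fsync are no longer needed
		 * for this logging attempt.
		 */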
6662  		write_lock(&em_tree->lock);
6663  		list_for_each_entry_safe(em, n, &em_tree->modified_extents, list)
6664  			list_del_init(&em->list);
6665  		write_unlock(&em_tree->lock);
6666  	}
6667  
6668  	if (full_dir_logging) {
6669  		ret = log_directory_changes(trans, inode, path, dst_path, ctx);
6670  		if (ret)
6671  			goto out_unlock;
6672  		ret = log_delayed_insertion_items(trans, inode, path,
6673  						  &delayed_ins_list, ctx);
6674  		if (ret)
6675  			goto out_unlock;
6676  		ret = log_delayed_deletion_items(trans, inode, path,
6677  						 &delayed_del_list, ctx);
6678  		if (ret)
6679  			goto out_unlock;
6680  	}
6681  
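	/*
	 * Record that the inode was logged in this transaction, so that later
	 * logging and fsync decisions in the same transaction can see it.
	 */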
6682  	spin_lock(&inode->lock);
6683  	inode->logged_trans = trans->transid;
6684  	/*
6685  	 * Don't update last_log_commit if we logged that an inode exists.
6686  	 * We do this for three reasons:
6687  	 *
6688  	 * 1) We might have had buffered writes to this inode that were
6689  	 *    flushed and had their ordered extents completed in this
6690  	 *    transaction, but we did not previously log the inode with
6691  	 *    LOG_INODE_ALL. Later the inode was evicted and after that
6692  	 *    it was loaded again and this LOG_INODE_EXISTS log operation
6693  	 *    happened. We must make sure that if an explicit fsync against
6694  	 *    the inode is performed later, it logs the new extents, an
6695  	 *    updated inode item, etc, and syncs the log. The same logic
6696  	 *    applies to direct IO writes instead of buffered writes.
6697  	 *
6698  	 * 2) When we log the inode with LOG_INODE_EXISTS, its inode item
6699  	 *    is logged with an i_size of 0 or whatever value was logged
6700  	 *    before. If later the i_size of the inode is increased by a
6701  	 *    truncate operation, the log is synced through an fsync of
6702  	 *    some other inode and then finally an explicit fsync against
6703  	 *    this inode is made, we must make sure this fsync logs the
6704  	 *    inode with the new i_size, the hole between old i_size and
6705  	 *    the new i_size, and syncs the log.
6706  	 *
6707  	 * 3) If we are logging that an ancestor inode exists as part of
6708  	 *    logging a new name from a link or rename operation, don't update
6709  	 *    its last_log_commit - otherwise if an explicit fsync is made
6710  	 *    against an ancestor, the fsync considers the inode in the log
6711  	 *    and doesn't sync the log, resulting in the ancestor missing after
6712  	 *    a power failure unless the log was synced as part of an fsync
6713  	 *    against any other unrelated inode.
6714  	 */
6715  	if (inode_only != LOG_INODE_EXISTS)
6716  		inode->last_log_commit = inode->last_sub_trans;
6717  	spin_unlock(&inode->lock);
6718  
6719  	/*
6720  	 * Reset the last_reflink_trans so that the next fsync does not need to
6721  	 * go through the slower path when logging extents and their checksums.
6722  	 */
6723  	if (inode_only == LOG_INODE_ALL)
6724  		inode->last_reflink_trans = 0;
6725  
6726  out_unlock:
6727  	mutex_unlock(&inode->log_mutex);
6728  out:
6729  	btrfs_free_path(path);
6730  	btrfs_free_path(dst_path);
6731  
6732  	if (ret)
6733  		free_conflicting_inodes(ctx);
6734  	else
6735  		ret = log_conflicting_inodes(trans, inode->root, ctx);
6736  
6737  	if (full_dir_logging && !ctx->logging_new_delayed_dentries) {
6738  		if (!ret)
6739  			ret = log_new_delayed_dentries(trans, inode,
6740  						       &delayed_ins_list, ctx);
6741  
6742  		btrfs_log_put_delayed_items(inode, &delayed_ins_list,
6743  					    &delayed_del_list);
6744  	}
6745  
6746  	return ret;
6747  }
6748  
6749  static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
6750  				 struct btrfs_inode *inode,
6751  				 struct btrfs_log_ctx *ctx)
6752  {
6753  	int ret;
6754  	struct btrfs_path *path;
6755  	struct btrfs_key key;
6756  	struct btrfs_root *root = inode->root;
6757  	const u64 ino = btrfs_ino(inode);
6758  
6759  	path = btrfs_alloc_path();
6760  	if (!path)
6761  		return -ENOMEM;
6762  	path->skip_locking = 1;
6763  	path->search_commit_root = 1;
6764  
6765  	key.objectid = ino;
6766  	key.type = BTRFS_INODE_REF_KEY;
6767  	key.offset = 0;
6768  	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6769  	if (ret < 0)
6770  		goto out;
6771  
6772  	while (true) {
6773  		struct extent_buffer *leaf = path->nodes[0];
6774  		int slot = path->slots[0];
6775  		u32 cur_offset = 0;
6776  		u32 item_size;
6777  		unsigned long ptr;
6778  
6779  		if (slot >= btrfs_header_nritems(leaf)) {
6780  			ret = btrfs_next_leaf(root, path);
6781  			if (ret < 0)
6782  				goto out;
6783  			else if (ret > 0)
6784  				break;
6785  			continue;
6786  		}
6787  
6788  		btrfs_item_key_to_cpu(leaf, &key, slot);
6789  		/* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
6790  		if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
6791  			break;
6792  
6793  		item_size = btrfs_item_size(leaf, slot);
6794  		ptr = btrfs_item_ptr_offset(leaf, slot);
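		/*
		 * For an inode ref item the parent directory is the key offset,
		 * while an extref item may pack several refs, each one recording
		 * its own parent, so walk all of them.
		 */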
6795  		while (cur_offset < item_size) {
6796  			struct btrfs_key inode_key;
6797  			struct inode *dir_inode;
6798  
6799  			inode_key.type = BTRFS_INODE_ITEM_KEY;
6800  			inode_key.offset = 0;
6801  
6802  			if (key.type == BTRFS_INODE_EXTREF_KEY) {
6803  				struct btrfs_inode_extref *extref;
6804  
6805  				extref = (struct btrfs_inode_extref *)
6806  					(ptr + cur_offset);
6807  				inode_key.objectid = btrfs_inode_extref_parent(
6808  					leaf, extref);
6809  				cur_offset += sizeof(*extref);
6810  				cur_offset += btrfs_inode_extref_name_len(leaf,
6811  					extref);
6812  			} else {
6813  				inode_key.objectid = key.offset;
6814  				cur_offset = item_size;
6815  			}
6816  
6817  			dir_inode = btrfs_iget_logging(inode_key.objectid, root);
6818  			/*
6819  			 * If the parent inode was deleted, return an error to
6820  			 * fall back to a transaction commit. This is to prevent
6821  			 * getting an inode that was moved from one parent A to
6822  			 * a parent B, got its former parent A deleted and then
6823  			 * it got fsync'ed, from existing at both parents after
6824  			 * a log replay (and the old parent still existing).
6825  			 * Example:
6826  			 *
6827  			 * mkdir /mnt/A
6828  			 * mkdir /mnt/B
6829  			 * touch /mnt/B/bar
6830  			 * sync
6831  			 * mv /mnt/B/bar /mnt/A/bar
6832  			 * mv -T /mnt/A /mnt/B
6833  			 * fsync /mnt/B/bar
6834  			 * <power fail>
6835  			 *
6836  			 * If we ignore the old parent B which got deleted,
6837  			 * after a log replay we would have file bar linked
6838  			 * at both parents and the old parent B would still
6839  			 * exist.
6840  			 */
6841  			if (IS_ERR(dir_inode)) {
6842  				ret = PTR_ERR(dir_inode);
6843  				goto out;
6844  			}
6845  
6846  			if (!need_log_inode(trans, BTRFS_I(dir_inode))) {
6847  				btrfs_add_delayed_iput(BTRFS_I(dir_inode));
6848  				continue;
6849  			}
6850  
6851  			ctx->log_new_dentries = false;
6852  			ret = btrfs_log_inode(trans, BTRFS_I(dir_inode),
6853  					      LOG_INODE_ALL, ctx);
6854  			if (!ret && ctx->log_new_dentries)
6855  				ret = log_new_dir_dentries(trans,
6856  						   BTRFS_I(dir_inode), ctx);
6857  			btrfs_add_delayed_iput(BTRFS_I(dir_inode));
6858  			if (ret)
6859  				goto out;
6860  		}
6861  		path->slots[0]++;
6862  	}
6863  	ret = 0;
6864  out:
6865  	btrfs_free_path(path);
6866  	return ret;
6867  }
6868  
6869  static int log_new_ancestors(struct btrfs_trans_handle *trans,
6870  			     struct btrfs_root *root,
6871  			     struct btrfs_path *path,
6872  			     struct btrfs_log_ctx *ctx)
6873  {
6874  	struct btrfs_key found_key;
6875  
6876  	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
6877  
6878  	while (true) {
6879  		struct extent_buffer *leaf;
6880  		int slot;
6881  		struct btrfs_key search_key;
6882  		struct inode *inode;
6883  		u64 ino;
6884  		int ret = 0;
6885  
6886  		btrfs_release_path(path);
6887  
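		/* The offset of an inode ref key is the parent directory's objectid. */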
6888  		ino = found_key.offset;
6889  
6890  		search_key.objectid = found_key.offset;
6891  		search_key.type = BTRFS_INODE_ITEM_KEY;
6892  		search_key.offset = 0;
6893  		inode = btrfs_iget_logging(ino, root);
6894  		if (IS_ERR(inode))
6895  			return PTR_ERR(inode);
6896  
6897  		if (BTRFS_I(inode)->generation >= trans->transid &&
6898  		    need_log_inode(trans, BTRFS_I(inode)))
6899  			ret = btrfs_log_inode(trans, BTRFS_I(inode),
6900  					      LOG_INODE_EXISTS, ctx);
6901  		btrfs_add_delayed_iput(BTRFS_I(inode));
6902  		if (ret)
6903  			return ret;
6904  
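		/* Stop once we have reached the subvolume's root directory. */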
6905  		if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID)
6906  			break;
6907  
6908  		search_key.type = BTRFS_INODE_REF_KEY;
6909  		ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
6910  		if (ret < 0)
6911  			return ret;
6912  
6913  		leaf = path->nodes[0];
6914  		slot = path->slots[0];
6915  		if (slot >= btrfs_header_nritems(leaf)) {
6916  			ret = btrfs_next_leaf(root, path);
6917  			if (ret < 0)
6918  				return ret;
6919  			else if (ret > 0)
6920  				return -ENOENT;
6921  			leaf = path->nodes[0];
6922  			slot = path->slots[0];
6923  		}
6924  
6925  		btrfs_item_key_to_cpu(leaf, &found_key, slot);
6926  		if (found_key.objectid != search_key.objectid ||
6927  		    found_key.type != BTRFS_INODE_REF_KEY)
6928  			return -ENOENT;
6929  	}
6930  	return 0;
6931  }
6932  
6933  static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
6934  				  struct btrfs_inode *inode,
6935  				  struct dentry *parent,
6936  				  struct btrfs_log_ctx *ctx)
6937  {
6938  	struct btrfs_root *root = inode->root;
6939  	struct dentry *old_parent = NULL;
6940  	struct super_block *sb = inode->vfs_inode.i_sb;
6941  	int ret = 0;
6942  
6943  	while (true) {
6944  		if (!parent || d_really_is_negative(parent) ||
6945  		    sb != parent->d_sb)
6946  			break;
6947  
6948  		inode = BTRFS_I(d_inode(parent));
6949  		if (root != inode->root)
6950  			break;
6951  
6952  		if (inode->generation >= trans->transid &&
6953  		    need_log_inode(trans, inode)) {
6954  			ret = btrfs_log_inode(trans, inode,
6955  					      LOG_INODE_EXISTS, ctx);
6956  			if (ret)
6957  				break;
6958  		}
6959  		if (IS_ROOT(parent))
6960  			break;
6961  
6962  		parent = dget_parent(parent);
6963  		dput(old_parent);
6964  		old_parent = parent;
6965  	}
6966  	dput(old_parent);
6967  
6968  	return ret;
6969  }
6970  
6971  static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
6972  				 struct btrfs_inode *inode,
6973  				 struct dentry *parent,
6974  				 struct btrfs_log_ctx *ctx)
6975  {
6976  	struct btrfs_root *root = inode->root;
6977  	const u64 ino = btrfs_ino(inode);
6978  	struct btrfs_path *path;
6979  	struct btrfs_key search_key;
6980  	int ret;
6981  
6982  	/*
6983  	 * For a single hard link case, go through a fast path that does not
6984  	 * need to iterate the fs/subvolume tree.
6985  	 */
6986  	if (inode->vfs_inode.i_nlink < 2)
6987  		return log_new_ancestors_fast(trans, inode, parent, ctx);
6988  
6989  	path = btrfs_alloc_path();
6990  	if (!path)
6991  		return -ENOMEM;
6992  
6993  	search_key.objectid = ino;
6994  	search_key.type = BTRFS_INODE_REF_KEY;
6995  	search_key.offset = 0;
6996  again:
6997  	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
6998  	if (ret < 0)
6999  		goto out;
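	/*
	 * An exact match means we are resuming from the last ref key we already
	 * processed, so skip it and continue from the next slot.
	 */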
7000  	if (ret == 0)
7001  		path->slots[0]++;
7002  
7003  	while (true) {
7004  		struct extent_buffer *leaf = path->nodes[0];
7005  		int slot = path->slots[0];
7006  		struct btrfs_key found_key;
7007  
7008  		if (slot >= btrfs_header_nritems(leaf)) {
7009  			ret = btrfs_next_leaf(root, path);
7010  			if (ret < 0)
7011  				goto out;
7012  			else if (ret > 0)
7013  				break;
7014  			continue;
7015  		}
7016  
7017  		btrfs_item_key_to_cpu(leaf, &found_key, slot);
7018  		if (found_key.objectid != ino ||
7019  		    found_key.type > BTRFS_INODE_EXTREF_KEY)
7020  			break;
7021  
7022  		/*
7023  		 * Don't deal with extended references because they are rare
7024  		 * cases and too complex to deal with (we would need to keep
7025  		 * track of which subitem we are processing for each item in
7026  		 * this loop, etc). So just return some error to fall back to
7027  		 * a transaction commit.
7028  		 */
7029  		if (found_key.type == BTRFS_INODE_EXTREF_KEY) {
7030  			ret = -EMLINK;
7031  			goto out;
7032  		}
7033  
7034  		/*
7035  		 * Logging ancestors needs to do more searches on the fs/subvol
7036  		 * tree, so it releases the path as needed to avoid deadlocks.
7037  		 * Keep track of the last inode ref key and resume from that key
7038  		 * after logging all new ancestors for the current hard link.
7039  		 */
7040  		memcpy(&search_key, &found_key, sizeof(search_key));
7041  
7042  		ret = log_new_ancestors(trans, root, path, ctx);
7043  		if (ret)
7044  			goto out;
7045  		btrfs_release_path(path);
7046  		goto again;
7047  	}
7048  	ret = 0;
7049  out:
7050  	btrfs_free_path(path);
7051  	return ret;
7052  }
7053  
7054  /*
7055   * helper function around btrfs_log_inode to make sure newly created
7056   * parent directories also end up in the log. Only minimal inode and
7057   * backref logging is done for any parent directories that are older than
7058   * the last committed transaction.
7059   */
7060  static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
7061  				  struct btrfs_inode *inode,
7062  				  struct dentry *parent,
7063  				  int inode_only,
7064  				  struct btrfs_log_ctx *ctx)
7065  {
7066  	struct btrfs_root *root = inode->root;
7067  	struct btrfs_fs_info *fs_info = root->fs_info;
7068  	int ret = 0;
7069  	bool log_dentries = false;
7070  
7071  	if (btrfs_test_opt(fs_info, NOTREELOG)) {
7072  		ret = BTRFS_LOG_FORCE_COMMIT;
7073  		goto end_no_trans;
7074  	}
7075  
7076  	if (btrfs_root_refs(&root->root_item) == 0) {
7077  		ret = BTRFS_LOG_FORCE_COMMIT;
7078  		goto end_no_trans;
7079  	}
7080  
7081  	/*
7082  	 * If we're logging an inode from a subvolume created in the current
7083  	 * transaction we must force a commit since the root is not persisted.
7084  	 */
7085  	if (btrfs_root_generation(&root->root_item) == trans->transid) {
7086  		ret = BTRFS_LOG_FORCE_COMMIT;
7087  		goto end_no_trans;
7088  	}
7089  
7090  	/*
7091  	 * Skip already logged inodes or inodes corresponding to tmpfiles
7092  	 * (since logging them is pointless, a link count of 0 means they
7093  	 * will never be accessible).
7094  	 */
7095  	if ((btrfs_inode_in_log(inode, trans->transid) &&
7096  	     list_empty(&ctx->ordered_extents)) ||
7097  	    inode->vfs_inode.i_nlink == 0) {
7098  		ret = BTRFS_NO_LOG_SYNC;
7099  		goto end_no_trans;
7100  	}
7101  
7102  	ret = start_log_trans(trans, root, ctx);
7103  	if (ret)
7104  		goto end_no_trans;
7105  
7106  	ret = btrfs_log_inode(trans, inode, inode_only, ctx);
7107  	if (ret)
7108  		goto end_trans;
7109  
7110  	/*
7111  	 * for a regular file, if its inode is already on disk, we don't
7112  	 * have to worry about the parents at all.  This is because
7113  	 * we can use the last_unlink_trans field to record renames
7114  	 * and other fun in this file.
7115  	 */
7116  	if (S_ISREG(inode->vfs_inode.i_mode) &&
7117  	    inode->generation < trans->transid &&
7118  	    inode->last_unlink_trans < trans->transid) {
7119  		ret = 0;
7120  		goto end_trans;
7121  	}
7122  
7123  	if (S_ISDIR(inode->vfs_inode.i_mode) && ctx->log_new_dentries)
7124  		log_dentries = true;
7125  
7126  	/*
7127  	 * On unlink we must make sure all our current and old parent directory
7128  	 * inodes are fully logged. This is to prevent leaving dangling
7129  	 * directory index entries in directories that were our parents but are
7130  	 * not anymore. Not doing this results in the old parent directory being
7131  	 * impossible to delete after log replay (rmdir will always fail with
7132  	 * error -ENOTEMPTY).
7133  	 *
7134  	 * Example 1:
7135  	 *
7136  	 * mkdir testdir
7137  	 * touch testdir/foo
7138  	 * ln testdir/foo testdir/bar
7139  	 * sync
7140  	 * unlink testdir/bar
7141  	 * xfs_io -c fsync testdir/foo
7142  	 * <power failure>
7143  	 * mount fs, triggers log replay
7144  	 *
7145  	 * If we don't log the parent directory (testdir), after log replay the
7146  	 * directory still has an entry pointing to the file inode using the bar
7147  	 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
7148  	 * the file inode has a link count of 1.
7149  	 *
7150  	 * Example 2:
7151  	 *
7152  	 * mkdir testdir
7153  	 * touch foo
7154  	 * ln foo testdir/foo2
7155  	 * ln foo testdir/foo3
7156  	 * sync
7157  	 * unlink testdir/foo3
7158  	 * xfs_io -c fsync foo
7159  	 * <power failure>
7160  	 * mount fs, triggers log replay
7161  	 *
7162  	 * Similar to the first example, after log replay the parent directory
7163  	 * testdir still has an entry pointing to the file inode with name foo3
7164  	 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item
7165  	 * and has a link count of 2.
7166  	 */
7167  	if (inode->last_unlink_trans >= trans->transid) {
7168  		ret = btrfs_log_all_parents(trans, inode, ctx);
7169  		if (ret)
7170  			goto end_trans;
7171  	}
7172  
7173  	ret = log_all_new_ancestors(trans, inode, parent, ctx);
7174  	if (ret)
7175  		goto end_trans;
7176  
7177  	if (log_dentries)
7178  		ret = log_new_dir_dentries(trans, inode, ctx);
7179  	else
7180  		ret = 0;
7181  end_trans:
7182  	if (ret < 0) {
7183  		btrfs_set_log_full_commit(trans);
7184  		ret = BTRFS_LOG_FORCE_COMMIT;
7185  	}
7186  
7187  	if (ret)
7188  		btrfs_remove_log_ctx(root, ctx);
7189  	btrfs_end_log_trans(root);
7190  end_no_trans:
7191  	return ret;
7192  }
7193  
7194  /*
7195   * it is not safe to log a dentry if the chunk root has added new
7196   * chunks.  This returns 0 if the dentry was logged, and 1 otherwise.
7197   * If this returns 1, you must commit the transaction to safely get your
7198   * data on disk.
7199   */
7200  int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
7201  			  struct dentry *dentry,
7202  			  struct btrfs_log_ctx *ctx)
7203  {
7204  	struct dentry *parent = dget_parent(dentry);
7205  	int ret;
7206  
7207  	ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
7208  				     LOG_INODE_ALL, ctx);
7209  	dput(parent);
7210  
7211  	return ret;
7212  }
7213  
7214  /*
7215   * should be called during mount to recover and replay any log trees
7216   * from the FS
7217   */
7218  int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
7219  {
7220  	int ret;
7221  	struct btrfs_path *path;
7222  	struct btrfs_trans_handle *trans;
7223  	struct btrfs_key key;
7224  	struct btrfs_key found_key;
7225  	struct btrfs_root *log;
7226  	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
7227  	struct walk_control wc = {
7228  		.process_func = process_one_buffer,
7229  		.stage = LOG_WALK_PIN_ONLY,
7230  	};
7231  
7232  	path = btrfs_alloc_path();
7233  	if (!path)
7234  		return -ENOMEM;
7235  
7236  	set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
7237  
7238  	trans = btrfs_start_transaction(fs_info->tree_root, 0);
7239  	if (IS_ERR(trans)) {
7240  		ret = PTR_ERR(trans);
7241  		goto error;
7242  	}
7243  
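	/*
	 * First pass (LOG_WALK_PIN_ONLY): pin the blocks of every log tree so
	 * they can't be reallocated before the replay stages read them back.
	 */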
7244  	wc.trans = trans;
7245  	wc.pin = 1;
7246  
7247  	ret = walk_log_tree(trans, log_root_tree, &wc);
7248  	if (ret) {
7249  		btrfs_abort_transaction(trans, ret);
7250  		goto error;
7251  	}
7252  
7253  again:
7254  	key.objectid = BTRFS_TREE_LOG_OBJECTID;
7255  	key.offset = (u64)-1;
7256  	key.type = BTRFS_ROOT_ITEM_KEY;
7257  
7258  	while (1) {
7259  		ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
7260  
7261  		if (ret < 0) {
7262  			btrfs_abort_transaction(trans, ret);
7263  			goto error;
7264  		}
7265  		if (ret > 0) {
7266  			if (path->slots[0] == 0)
7267  				break;
7268  			path->slots[0]--;
7269  		}
7270  		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
7271  				      path->slots[0]);
7272  		btrfs_release_path(path);
7273  		if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
7274  			break;
7275  
7276  		log = btrfs_read_tree_root(log_root_tree, &found_key);
7277  		if (IS_ERR(log)) {
7278  			ret = PTR_ERR(log);
7279  			btrfs_abort_transaction(trans, ret);
7280  			goto error;
7281  		}
7282  
7283  		wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset,
7284  						   true);
7285  		if (IS_ERR(wc.replay_dest)) {
7286  			ret = PTR_ERR(wc.replay_dest);
7287  
7288  			/*
7289  			 * We didn't find the subvol, likely because it was
7290  			 * deleted.  This is ok, simply skip this log and go to
7291  			 * the next one.
7292  			 *
7293  			 * We need to exclude the root because we can't have
7294  			 * other log replays overwriting this log as we'll read
7295  			 * it back in a few more times.  This will keep our
7296  			 * block from being modified, and we'll just bail for
7297  			 * each subsequent pass.
7298  			 */
7299  			if (ret == -ENOENT)
7300  				ret = btrfs_pin_extent_for_log_replay(trans, log->node);
7301  			btrfs_put_root(log);
7302  
7303  			if (!ret)
7304  				goto next;
7305  			btrfs_abort_transaction(trans, ret);
7306  			goto error;
7307  		}
7308  
7309  		wc.replay_dest->log_root = log;
7310  		ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
7311  		if (ret)
7312  			/* The loop needs to continue due to the root refs */
7313  			btrfs_abort_transaction(trans, ret);
7314  		else
7315  			ret = walk_log_tree(trans, log, &wc);
7316  
7317  		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
7318  			ret = fixup_inode_link_counts(trans, wc.replay_dest,
7319  						      path);
7320  			if (ret)
7321  				btrfs_abort_transaction(trans, ret);
7322  		}
7323  
7324  		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
7325  			struct btrfs_root *root = wc.replay_dest;
7326  
7327  			btrfs_release_path(path);
7328  
7329  			/*
7330  			 * We have just replayed everything, and the highest
7331  			 * objectid of fs roots probably has changed in case
7332  			 * some inode_item's got replayed.
7333  			 *
7334  			 * root->objectid_mutex is not acquired as log replay
7335  			 * could only happen during mount.
7336  			 */
7337  			ret = btrfs_init_root_free_objectid(root);
7338  			if (ret)
7339  				btrfs_abort_transaction(trans, ret);
7340  		}
7341  
7342  		wc.replay_dest->log_root = NULL;
7343  		btrfs_put_root(wc.replay_dest);
7344  		btrfs_put_root(log);
7345  
7346  		if (ret)
7347  			goto error;
7348  next:
7349  		if (found_key.offset == 0)
7350  			break;
7351  		key.offset = found_key.offset - 1;
7352  	}
7353  	btrfs_release_path(path);
7354  
7355  	/* step one is to pin it all, step two is to replay just inodes */
7356  	if (wc.pin) {
7357  		wc.pin = 0;
7358  		wc.process_func = replay_one_buffer;
7359  		wc.stage = LOG_WALK_REPLAY_INODES;
7360  		goto again;
7361  	}
7362  	/* step three is to replay everything */
7363  	if (wc.stage < LOG_WALK_REPLAY_ALL) {
7364  		wc.stage++;
7365  		goto again;
7366  	}
7367  
7368  	btrfs_free_path(path);
7369  
7370  	/* step 4: commit the transaction, which also unpins the blocks */
7371  	ret = btrfs_commit_transaction(trans);
7372  	if (ret)
7373  		return ret;
7374  
7375  	log_root_tree->log_root = NULL;
7376  	clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
7377  	btrfs_put_root(log_root_tree);
7378  
7379  	return 0;
7380  error:
7381  	if (wc.trans)
7382  		btrfs_end_transaction(wc.trans);
7383  	clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
7384  	btrfs_free_path(path);
7385  	return ret;
7386  }
7387  
7388  /*
7389   * there are some corner cases where we want to force a full
7390   * commit instead of allowing a directory to be logged.
7391   *
7392   * They revolve around files that were unlinked from the directory, and
7393   * this function updates the parent directory so that a full commit is
7394   * properly done if it is fsync'd later after the unlinks are done.
7395   *
7396   * Must be called before the unlink operations (updates to the subvolume tree,
7397   * inodes, etc) are done.
7398   */
7399  void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
7400  			     struct btrfs_inode *dir, struct btrfs_inode *inode,
7401  			     bool for_rename)
7402  {
7403  	/*
7404  	 * when we're logging a file, if it hasn't been renamed
7405  	 * or unlinked, and its inode is fully committed on disk,
7406  	 * we don't have to worry about walking up the directory chain
7407  	 * to log its parents.
7408  	 *
7409  	 * So, we use the last_unlink_trans field to put this transid
7410  	 * into the file.  When the file is logged we check it and
7411  	 * don't log the parents if the file is fully on disk.
7412  	 */
7413  	mutex_lock(&inode->log_mutex);
7414  	inode->last_unlink_trans = trans->transid;
7415  	mutex_unlock(&inode->log_mutex);
7416  
7417  	if (!for_rename)
7418  		return;
7419  
7420  	/*
7421  	 * If this directory was already logged, any new names will be logged
7422  	 * with btrfs_log_new_name() and old names will be deleted from the log
7423  	 * tree with btrfs_del_dir_entries_in_log() or with
7424  	 * btrfs_del_inode_ref_in_log().
7425  	 */
7426  	if (inode_logged(trans, dir, NULL) == 1)
7427  		return;
7428  
7429  	/*
7430  	 * If the inode we're about to unlink was logged before, the log will be
7431  	 * properly updated with the new name with btrfs_log_new_name() and the
7432  	 * old name removed with btrfs_del_dir_entries_in_log() or with
7433  	 * btrfs_del_inode_ref_in_log().
7434  	 */
7435  	if (inode_logged(trans, inode, NULL) == 1)
7436  		return;
7437  
7438  	/*
7439  	 * when renaming files across directories, if the directory
7440  	 * we're unlinking from gets fsync'd later on, there's
7441  	 * no way to find the destination directory later and fsync it
7442  	 * properly.  So, we have to be conservative and force commits
7443  	 * so the new name gets discovered.
7444  	 */
7445  	mutex_lock(&dir->log_mutex);
7446  	dir->last_unlink_trans = trans->transid;
7447  	mutex_unlock(&dir->log_mutex);
7448  }
7449  
7450  /*
7451   * Make sure that if someone attempts to fsync the parent directory of a deleted
7452   * snapshot, it ends up triggering a transaction commit. This is to guarantee
7453   * that after replaying the log tree of the parent directory's root we will not
7454   * see the snapshot anymore and at log replay time we will not see any log tree
7455   * corresponding to the deleted snapshot's root, which could lead to replaying
7456   * it after replaying the log tree of the parent directory (which would replay
7457   * the snapshot delete operation).
7458   *
7459   * Must be called before the actual snapshot destroy operation (updates to the
7460   * parent root and tree of tree roots trees, etc) are done.
7461   */
7462  void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
7463  				   struct btrfs_inode *dir)
7464  {
7465  	mutex_lock(&dir->log_mutex);
7466  	dir->last_unlink_trans = trans->transid;
7467  	mutex_unlock(&dir->log_mutex);
7468  }
7469  
7470  /*
7471   * Call this when creating a subvolume in a directory.
7472   * Because we don't commit a transaction when creating a subvolume, we can't
7473   * allow the directory pointing to the subvolume to be logged with an entry that
7474   * points to an unpersisted root if we are still in the transaction used to
7475   * create the subvolume, so make any attempt to log the directory result in a
7476   * full log sync.
7477   * Also we don't need to worry about renames, since btrfs_rename() marks the log
7478   * for full commit when renaming a subvolume.
7479   */
7480  void btrfs_record_new_subvolume(const struct btrfs_trans_handle *trans,
7481  				struct btrfs_inode *dir)
7482  {
7483  	mutex_lock(&dir->log_mutex);
7484  	dir->last_unlink_trans = trans->transid;
7485  	mutex_unlock(&dir->log_mutex);
7486  }
7487  
7488  /*
7489   * Update the log after adding a new name for an inode.
7490   *
7491   * @trans:              Transaction handle.
7492   * @old_dentry:         The dentry associated with the old name and the old
7493   *                      parent directory.
7494   * @old_dir:            The inode of the previous parent directory for the case
7495   *                      of a rename. For a link operation, it must be NULL.
7496   * @old_dir_index:      The index number associated with the old name, meaningful
7497   *                      only for rename operations (when @old_dir is not NULL).
7498   *                      Ignored for link operations.
7499   * @parent:             The dentry associated with the directory under which the
7500   *                      new name is located.
7501   *
7502   * Call this after adding a new name for an inode, as a result of a link or
7503   * rename operation, and it will properly update the log to reflect the new name.
7504   */
7505  void btrfs_log_new_name(struct btrfs_trans_handle *trans,
7506  			struct dentry *old_dentry, struct btrfs_inode *old_dir,
7507  			u64 old_dir_index, struct dentry *parent)
7508  {
7509  	struct btrfs_inode *inode = BTRFS_I(d_inode(old_dentry));
7510  	struct btrfs_root *root = inode->root;
7511  	struct btrfs_log_ctx ctx;
7512  	bool log_pinned = false;
7513  	int ret;
7514  
7515  	/*
7516  	 * this will force the logging code to walk the dentry chain
7517  	 * up for the file
7518  	 */
7519  	if (!S_ISDIR(inode->vfs_inode.i_mode))
7520  		inode->last_unlink_trans = trans->transid;
7521  
7522  	/*
7523  	 * if this inode hasn't been logged and the directory we're renaming it
7524  	 * from hasn't been logged, we don't need to log it
7525  	 */
7526  	ret = inode_logged(trans, inode, NULL);
7527  	if (ret < 0) {
7528  		goto out;
7529  	} else if (ret == 0) {
7530  		if (!old_dir)
7531  			return;
7532  		/*
7533  		 * If the inode was not logged and we are doing a rename (old_dir is not
7534  		 * NULL), check if old_dir was logged - if it was not we can return and
7535  		 * do nothing.
7536  		 */
7537  		ret = inode_logged(trans, old_dir, NULL);
7538  		if (ret < 0)
7539  			goto out;
7540  		else if (ret == 0)
7541  			return;
7542  	}
7543  	ret = 0;
7544  
7545  	/*
7546  	 * If we are doing a rename (old_dir is not NULL) from a directory that
7547  	 * was previously logged, make sure that on log replay we get the old
7548  	 * dir entry deleted. This is needed because we will also log the new
7549  	 * name of the renamed inode, so we need to make sure that after log
7550  	 * replay we don't end up with both the new and old dir entries existing.
7551  	 */
7552  	if (old_dir && old_dir->logged_trans == trans->transid) {
7553  		struct btrfs_root *log = old_dir->root->log_root;
7554  		struct btrfs_path *path;
7555  		struct fscrypt_name fname;
7556  
7557  		ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX);
7558  
7559  		ret = fscrypt_setup_filename(&old_dir->vfs_inode,
7560  					     &old_dentry->d_name, 0, &fname);
7561  		if (ret)
7562  			goto out;
7563  		/*
7564  		 * We have two inodes to update in the log, the old directory and
7565  		 * the inode that got renamed, so we must pin the log to prevent
7566  		 * anyone from syncing the log until we have updated both inodes
7567  		 * in the log.
7568  		 */
7569  		ret = join_running_log_trans(root);
7570  		/*
7571  		 * At least one of the inodes was logged before, so this should
7572  		 * not fail, but if it does, it's not serious, just bail out and
7573  		 * mark the log for a full commit.
7574  		 */
7575  		if (WARN_ON_ONCE(ret < 0)) {
7576  			fscrypt_free_filename(&fname);
7577  			goto out;
7578  		}
7579  
7580  		log_pinned = true;
7581  
7582  		path = btrfs_alloc_path();
7583  		if (!path) {
7584  			ret = -ENOMEM;
7585  			fscrypt_free_filename(&fname);
7586  			goto out;
7587  		}
7588  
7589  		/*
7590  		 * Another concurrent task might be logging the old directory,
7591  		 * as that can be triggered when logging another inode that had or
7592  		 * still has a dentry in the old directory. We lock the old
7593  		 * directory's log_mutex to ensure the deletion of the old
7594  		 * name is persisted, because during directory logging we
7595  		 * delete all BTRFS_DIR_LOG_INDEX_KEY keys and the deletion of
7596  		 * the old name's dir index item is in the delayed items, so
7597  		 * it could be missed by an in progress directory logging.
7598  		 */
7599  		mutex_lock(&old_dir->log_mutex);
7600  		ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir),
7601  					&fname.disk_name, old_dir_index);
7602  		if (ret > 0) {
7603  			/*
7604  			 * The dentry does not exist in the log, so record its
7605  			 * deletion.
7606  			 */
7607  			btrfs_release_path(path);
7608  			ret = insert_dir_log_key(trans, log, path,
7609  						 btrfs_ino(old_dir),
7610  						 old_dir_index, old_dir_index);
7611  		}
7612  		mutex_unlock(&old_dir->log_mutex);
7613  
7614  		btrfs_free_path(path);
7615  		fscrypt_free_filename(&fname);
7616  		if (ret < 0)
7617  			goto out;
7618  	}
7619  
7620  	btrfs_init_log_ctx(&ctx, inode);
7621  	ctx.logging_new_name = true;
7622  	btrfs_init_log_ctx_scratch_eb(&ctx);
7623  	/*
7624  	 * We don't care about the return value. If we fail to log the new name
7625  	 * then we know the next attempt to sync the log will fall back to a full
7626  	 * transaction commit (due to a call to btrfs_set_log_full_commit()), so
7627  	 * we don't need to worry about getting a log committed that has an
7628  	 * inconsistent state after a rename operation.
7629  	 */
7630  	btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
7631  	free_extent_buffer(ctx.scratch_eb);
7632  	ASSERT(list_empty(&ctx.conflict_inodes));
7633  out:
7634  	/*
7635  	 * If an error happened mark the log for a full commit because it's not
7636  	 * consistent and up to date or we couldn't find out if one of the
7637  	 * inodes was logged before in this transaction. Do it before unpinning
7638  	 * the log, to avoid any races with someone else trying to commit it.
7639  	 */
7640  	if (ret < 0)
7641  		btrfs_set_log_full_commit(trans);
7642  	if (log_pinned)
7643  		btrfs_end_log_trans(root);
7644  }
7645  
7646