1  // SPDX-License-Identifier: GPL-2.0
2  /*
3   * Copyright (C) 2007 Oracle.  All rights reserved.
4   */
5  
6  #include <crypto/hash.h>
7  #include <linux/kernel.h>
8  #include <linux/bio.h>
9  #include <linux/blk-cgroup.h>
10  #include <linux/file.h>
11  #include <linux/fs.h>
12  #include <linux/pagemap.h>
13  #include <linux/highmem.h>
14  #include <linux/time.h>
15  #include <linux/init.h>
16  #include <linux/string.h>
17  #include <linux/backing-dev.h>
18  #include <linux/writeback.h>
19  #include <linux/compat.h>
20  #include <linux/xattr.h>
21  #include <linux/posix_acl.h>
22  #include <linux/falloc.h>
23  #include <linux/slab.h>
24  #include <linux/ratelimit.h>
25  #include <linux/btrfs.h>
26  #include <linux/blkdev.h>
27  #include <linux/posix_acl_xattr.h>
28  #include <linux/uio.h>
29  #include <linux/magic.h>
30  #include <linux/iversion.h>
31  #include <linux/swap.h>
32  #include <linux/migrate.h>
33  #include <linux/sched/mm.h>
34  #include <linux/iomap.h>
35  #include <linux/unaligned.h>
36  #include <linux/fsverity.h>
37  #include "misc.h"
38  #include "ctree.h"
39  #include "disk-io.h"
40  #include "transaction.h"
41  #include "btrfs_inode.h"
42  #include "ordered-data.h"
43  #include "xattr.h"
44  #include "tree-log.h"
45  #include "bio.h"
46  #include "compression.h"
47  #include "locking.h"
48  #include "props.h"
49  #include "qgroup.h"
50  #include "delalloc-space.h"
51  #include "block-group.h"
52  #include "space-info.h"
53  #include "zoned.h"
54  #include "subpage.h"
55  #include "inode-item.h"
56  #include "fs.h"
57  #include "accessors.h"
58  #include "extent-tree.h"
59  #include "root-tree.h"
60  #include "defrag.h"
61  #include "dir-item.h"
62  #include "file-item.h"
63  #include "uuid-tree.h"
64  #include "ioctl.h"
65  #include "file.h"
66  #include "acl.h"
67  #include "relocation.h"
68  #include "verity.h"
69  #include "super.h"
70  #include "orphan.h"
71  #include "backref.h"
72  #include "raid-stripe-tree.h"
73  #include "fiemap.h"
74  
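/* Arguments for an inode lookup: the inode number and the root it belongs to. */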
75  struct btrfs_iget_args {
76  	u64 ino;
77  	struct btrfs_root *root;
78  };
79  
80  struct btrfs_rename_ctx {
81  	/* Output field. Stores the index number of the old directory entry. */
82  	u64 index;
83  };
84  
85  /*
86   * Used by data_reloc_print_warning_inode() to pass needed info for filename
87   * resolution and output of error message.
88   */
89  struct data_reloc_warn {
90  	struct btrfs_path path;
91  	struct btrfs_fs_info *fs_info;
92  	u64 extent_item_size;
93  	u64 logical;
94  	int mirror_num;
95  };
96  
97  /*
98   * For the file_extent_tree, we want to hold the inode lock when we lookup and
99   * update the disk_i_size, but lockdep will complain because with our io_tree we
100   * hold the tree lock and get the inode lock when setting delalloc. These two
101   * things are unrelated, so make a class for the file_extent_tree so we don't
102   * get the two locking patterns mixed up.
103   */
104  static struct lock_class_key file_extent_tree_class;
105  
106  static const struct inode_operations btrfs_dir_inode_operations;
107  static const struct inode_operations btrfs_symlink_inode_operations;
108  static const struct inode_operations btrfs_special_inode_operations;
109  static const struct inode_operations btrfs_file_inode_operations;
110  static const struct address_space_operations btrfs_aops;
111  static const struct file_operations btrfs_dir_file_operations;
112  
113  static struct kmem_cache *btrfs_inode_cachep;
114  
115  static int btrfs_setsize(struct inode *inode, struct iattr *attr);
116  static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback);
117  
118  static noinline int run_delalloc_cow(struct btrfs_inode *inode,
119  				     struct folio *locked_folio, u64 start,
120  				     u64 end, struct writeback_control *wbc,
121  				     bool pages_dirty);
122  
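/*
 * Backref walk callback: called once for each (root, inode, offset) that
 * references the corrupted extent.  Resolves the file path(s) of that inode
 * and prints a checksum error warning for each of them.
 */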
123  static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
124  					  u64 root, void *warn_ctx)
125  {
126  	struct data_reloc_warn *warn = warn_ctx;
127  	struct btrfs_fs_info *fs_info = warn->fs_info;
128  	struct extent_buffer *eb;
129  	struct btrfs_inode_item *inode_item;
130  	struct inode_fs_paths *ipath = NULL;
131  	struct btrfs_root *local_root;
132  	struct btrfs_key key;
133  	unsigned int nofs_flag;
134  	u32 nlink;
135  	int ret;
136  
137  	local_root = btrfs_get_fs_root(fs_info, root, true);
138  	if (IS_ERR(local_root)) {
139  		ret = PTR_ERR(local_root);
140  		goto err;
141  	}
142  
143  	/* This makes the path point to (inum INODE_ITEM ioff). */
144  	key.objectid = inum;
145  	key.type = BTRFS_INODE_ITEM_KEY;
146  	key.offset = 0;
147  
148  	ret = btrfs_search_slot(NULL, local_root, &key, &warn->path, 0, 0);
149  	if (ret) {
150  		btrfs_put_root(local_root);
151  		btrfs_release_path(&warn->path);
152  		goto err;
153  	}
154  
155  	eb = warn->path.nodes[0];
156  	inode_item = btrfs_item_ptr(eb, warn->path.slots[0], struct btrfs_inode_item);
157  	nlink = btrfs_inode_nlink(eb, inode_item);
158  	btrfs_release_path(&warn->path);
159  
160  	nofs_flag = memalloc_nofs_save();
161  	ipath = init_ipath(4096, local_root, &warn->path);
162  	memalloc_nofs_restore(nofs_flag);
163  	if (IS_ERR(ipath)) {
164  		btrfs_put_root(local_root);
165  		ret = PTR_ERR(ipath);
166  		ipath = NULL;
167  		/*
168  		 * -ENOMEM, not a critical error, just output a generic error
169  		 * without filename.
170  		 */
171  		btrfs_warn(fs_info,
172  "checksum error at logical %llu mirror %u root %llu, inode %llu offset %llu",
173  			   warn->logical, warn->mirror_num, root, inum, offset);
174  		return ret;
175  	}
176  	ret = paths_from_inode(inum, ipath);
177  	if (ret < 0)
178  		goto err;
179  
180  	/*
181  	 * We deliberately ignore the fact that ipath might have been too small to
182  	 * hold all of the paths here.
183  	 */
184  	for (int i = 0; i < ipath->fspath->elem_cnt; i++) {
185  		btrfs_warn(fs_info,
186  "checksum error at logical %llu mirror %u root %llu inode %llu offset %llu length %u links %u (path: %s)",
187  			   warn->logical, warn->mirror_num, root, inum, offset,
188  			   fs_info->sectorsize, nlink,
189  			   (char *)(unsigned long)ipath->fspath->val[i]);
190  	}
191  
192  	btrfs_put_root(local_root);
193  	free_ipath(ipath);
194  	return 0;
195  
196  err:
197  	btrfs_warn(fs_info,
198  "checksum error at logical %llu mirror %u root %llu inode %llu offset %llu, path resolving failed with ret=%d",
199  		   warn->logical, warn->mirror_num, root, inum, offset, ret);
200  
201  	free_ipath(ipath);
202  	return ret;
203  }
204  
205  /*
206   * Do extra user-friendly error output (e.g. lookup all the affected files).
207   *
208   * If the backref lookup fails, fall back to printing the plain checksum error
209   * message without the affected file names.
210   */
211  static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off,
212  				   const u8 *csum, const u8 *csum_expected,
213  				   int mirror_num)
214  {
215  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
216  	struct btrfs_path path = { 0 };
217  	struct btrfs_key found_key = { 0 };
218  	struct extent_buffer *eb;
219  	struct btrfs_extent_item *ei;
220  	const u32 csum_size = fs_info->csum_size;
221  	u64 logical;
222  	u64 flags;
223  	u32 item_size;
224  	int ret;
225  
226  	mutex_lock(&fs_info->reloc_mutex);
227  	logical = btrfs_get_reloc_bg_bytenr(fs_info);
228  	mutex_unlock(&fs_info->reloc_mutex);
229  
230  	if (logical == U64_MAX) {
231  		btrfs_warn_rl(fs_info, "has data reloc tree but no running relocation");
232  		btrfs_warn_rl(fs_info,
233  "csum failed root %lld ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
234  			btrfs_root_id(inode->root), btrfs_ino(inode), file_off,
235  			CSUM_FMT_VALUE(csum_size, csum),
236  			CSUM_FMT_VALUE(csum_size, csum_expected),
237  			mirror_num);
238  		return;
239  	}
240  
241  	logical += file_off;
242  	btrfs_warn_rl(fs_info,
243  "csum failed root %lld ino %llu off %llu logical %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
244  			btrfs_root_id(inode->root),
245  			btrfs_ino(inode), file_off, logical,
246  			CSUM_FMT_VALUE(csum_size, csum),
247  			CSUM_FMT_VALUE(csum_size, csum_expected),
248  			mirror_num);
249  
250  	ret = extent_from_logical(fs_info, logical, &path, &found_key, &flags);
251  	if (ret < 0) {
252  		btrfs_err_rl(fs_info, "failed to lookup extent item for logical %llu: %d",
253  			     logical, ret);
254  		return;
255  	}
256  	eb = path.nodes[0];
257  	ei = btrfs_item_ptr(eb, path.slots[0], struct btrfs_extent_item);
258  	item_size = btrfs_item_size(eb, path.slots[0]);
259  	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
260  		unsigned long ptr = 0;
261  		u64 ref_root;
262  		u8 ref_level;
263  
264  		while (true) {
265  			ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
266  						      item_size, &ref_root,
267  						      &ref_level);
268  			if (ret < 0) {
269  				btrfs_warn_rl(fs_info,
270  				"failed to resolve tree backref for logical %llu: %d",
271  					      logical, ret);
272  				break;
273  			}
274  			if (ret > 0)
275  				break;
276  
277  			btrfs_warn_rl(fs_info,
278  "csum error at logical %llu mirror %u: metadata %s (level %d) in tree %llu",
279  				logical, mirror_num,
280  				(ref_level ? "node" : "leaf"),
281  				ref_level, ref_root);
282  		}
283  		btrfs_release_path(&path);
284  	} else {
285  		struct btrfs_backref_walk_ctx ctx = { 0 };
286  		struct data_reloc_warn reloc_warn = { 0 };
287  
288  		btrfs_release_path(&path);
289  
290  		ctx.bytenr = found_key.objectid;
291  		ctx.extent_item_pos = logical - found_key.objectid;
292  		ctx.fs_info = fs_info;
293  
294  		reloc_warn.logical = logical;
295  		reloc_warn.extent_item_size = found_key.offset;
296  		reloc_warn.mirror_num = mirror_num;
297  		reloc_warn.fs_info = fs_info;
298  
299  		iterate_extent_inodes(&ctx, true,
300  				      data_reloc_print_warning_inode, &reloc_warn);
301  	}
302  }
303  
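/*
 * Print a rate limited warning for a data checksum mismatch.  For the data
 * reloc tree the plain root/ino/offset numbers are not meaningful to the user,
 * so do a backref lookup and report the affected files instead.
 */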
304  static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode,
305  		u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num)
306  {
307  	struct btrfs_root *root = inode->root;
308  	const u32 csum_size = root->fs_info->csum_size;
309  
310  	/* For data reloc tree, it's better to do a backref lookup instead. */
311  	if (btrfs_root_id(root) == BTRFS_DATA_RELOC_TREE_OBJECTID)
312  		return print_data_reloc_error(inode, logical_start, csum,
313  					      csum_expected, mirror_num);
314  
315  	/* Output without objectid, which is more meaningful */
316  	if (btrfs_root_id(root) >= BTRFS_LAST_FREE_OBJECTID) {
317  		btrfs_warn_rl(root->fs_info,
318  "csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
319  			btrfs_root_id(root), btrfs_ino(inode),
320  			logical_start,
321  			CSUM_FMT_VALUE(csum_size, csum),
322  			CSUM_FMT_VALUE(csum_size, csum_expected),
323  			mirror_num);
324  	} else {
325  		btrfs_warn_rl(root->fs_info,
326  "csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
327  			btrfs_root_id(root), btrfs_ino(inode),
328  			logical_start,
329  			CSUM_FMT_VALUE(csum_size, csum),
330  			CSUM_FMT_VALUE(csum_size, csum_expected),
331  			mirror_num);
332  	}
333  }
334  
335  /*
336   * Lock inode i_rwsem based on arguments passed.
337   *
338   * ilock_flags can have the following bits set:
339   *
340   * BTRFS_ILOCK_SHARED - acquire a shared lock on the inode
341   * BTRFS_ILOCK_TRY - try to acquire the lock; if it fails on the first attempt,
342   *		     return -EAGAIN
343   * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock
344   */
345  int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags)
346  {
347  	if (ilock_flags & BTRFS_ILOCK_SHARED) {
348  		if (ilock_flags & BTRFS_ILOCK_TRY) {
349  			if (!inode_trylock_shared(&inode->vfs_inode))
350  				return -EAGAIN;
351  			else
352  				return 0;
353  		}
354  		inode_lock_shared(&inode->vfs_inode);
355  	} else {
356  		if (ilock_flags & BTRFS_ILOCK_TRY) {
357  			if (!inode_trylock(&inode->vfs_inode))
358  				return -EAGAIN;
359  			else
360  				return 0;
361  		}
362  		inode_lock(&inode->vfs_inode);
363  	}
364  	if (ilock_flags & BTRFS_ILOCK_MMAP)
365  		down_write(&inode->i_mmap_lock);
366  	return 0;
367  }
368  
369  /*
370   * Unlock inode i_rwsem.
371   *
372   * ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
373   * to decide whether the lock acquired is shared or exclusive.
374   */
375  void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags)
376  {
377  	if (ilock_flags & BTRFS_ILOCK_MMAP)
378  		up_write(&inode->i_mmap_lock);
379  	if (ilock_flags & BTRFS_ILOCK_SHARED)
380  		inode_unlock_shared(&inode->vfs_inode);
381  	else
382  		inode_unlock(&inode->vfs_inode);
383  }
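/*
 * Typical usage of the two helpers above (a sketch, not taken from a specific
 * call site): the flags passed to btrfs_inode_unlock() must request the same
 * shared/exclusive mode that was used for btrfs_inode_lock(), while
 * BTRFS_ILOCK_TRY only matters on the lock side.
 *
 *	ret = btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED | BTRFS_ILOCK_TRY);
 *	if (ret)
 *		return ret;
 *	...read-only work on the inode...
 *	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
 */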
384  
385  /*
386   * Clean up all submitted ordered extents in the specified range to handle errors
387   * from the btrfs_run_delalloc_range() callback.
388   *
389   * NOTE: caller must ensure that when an error happens, it can not call
390   * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
391   * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
392   * to be released, which we want to happen only when finishing the ordered
393   * extent (btrfs_finish_ordered_io()).
394   */
395  static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
396  						 struct folio *locked_folio,
397  						 u64 offset, u64 bytes)
398  {
399  	unsigned long index = offset >> PAGE_SHIFT;
400  	unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
401  	u64 page_start = 0, page_end = 0;
402  	struct folio *folio;
403  
404  	if (locked_folio) {
405  		page_start = folio_pos(locked_folio);
406  		page_end = page_start + folio_size(locked_folio) - 1;
407  	}
408  
409  	while (index <= end_index) {
410  		/*
411  		 * For the locked page, we will call btrfs_mark_ordered_io_finished()
412  		 * on it
413  		 * in run_delalloc_range() for the error handling, which will
414  		 * clear page Ordered and run the ordered extent accounting.
415  		 *
416  		 * Here we can't just clear the Ordered bit, or
417  		 * btrfs_mark_ordered_io_finished() would skip the accounting
418  		 * for the page range, and the ordered extent will never finish.
419  		 */
420  		if (locked_folio && index == (page_start >> PAGE_SHIFT)) {
421  			index++;
422  			continue;
423  		}
424  		folio = __filemap_get_folio(inode->vfs_inode.i_mapping, index, 0, 0);
425  		index++;
426  		if (IS_ERR(folio))
427  			continue;
428  
429  		/*
430  		 * Here we just clear all Ordered bits for every page in the
431  		 * range, then btrfs_mark_ordered_io_finished() will handle
432  		 * the ordered extent accounting for the range.
433  		 */
434  		btrfs_folio_clamp_clear_ordered(inode->root->fs_info, folio,
435  						offset, bytes);
436  		folio_put(folio);
437  	}
438  
439  	if (locked_folio) {
440  		/* The locked page covers the full range, nothing needs to be done */
441  		if (bytes + offset <= page_start + folio_size(locked_folio))
442  			return;
443  		/*
444  		 * In case this page belongs to the delalloc range being
445  		 * instantiated then skip it, since the first page of a range is
446  		 * going to be properly cleaned up by the caller of
447  		 * run_delalloc_range
448  		 */
449  		if (page_start >= offset && page_end <= (offset + bytes - 1)) {
450  			bytes = offset + bytes - folio_pos(locked_folio) -
451  				folio_size(locked_folio);
452  			offset = folio_pos(locked_folio) + folio_size(locked_folio);
453  		}
454  	}
455  
456  	return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false);
457  }
458  
459  static int btrfs_dirty_inode(struct btrfs_inode *inode);
460  
461  static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
462  				     struct btrfs_new_inode_args *args)
463  {
464  	int err;
465  
466  	if (args->default_acl) {
467  		err = __btrfs_set_acl(trans, args->inode, args->default_acl,
468  				      ACL_TYPE_DEFAULT);
469  		if (err)
470  			return err;
471  	}
472  	if (args->acl) {
473  		err = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS);
474  		if (err)
475  			return err;
476  	}
477  	if (!args->default_acl && !args->acl)
478  		cache_no_acl(args->inode);
479  	return btrfs_xattr_security_init(trans, args->inode, args->dir,
480  					 &args->dentry->d_name);
481  }
482  
483  /*
484   * this does all the hard work for inserting an inline extent into
485   * the btree.  The caller should have done a btrfs_drop_extents so that
486   * no overlapping inline items exist in the btree
487   */
488  static int insert_inline_extent(struct btrfs_trans_handle *trans,
489  				struct btrfs_path *path,
490  				struct btrfs_inode *inode, bool extent_inserted,
491  				size_t size, size_t compressed_size,
492  				int compress_type,
493  				struct folio *compressed_folio,
494  				bool update_i_size)
495  {
496  	struct btrfs_root *root = inode->root;
497  	struct extent_buffer *leaf;
498  	const u32 sectorsize = trans->fs_info->sectorsize;
499  	char *kaddr;
500  	unsigned long ptr;
501  	struct btrfs_file_extent_item *ei;
502  	int ret;
503  	size_t cur_size = size;
504  	u64 i_size;
505  
506  	/*
507  	 * The decompressed size must still be no larger than a sector.  Under
508  	 * heavy race, we can have size == 0 passed in, but that shouldn't be a
509  	 * big deal and we can continue the insertion.
510  	 */
511  	ASSERT(size <= sectorsize);
512  
513  	/*
514  	 * The compressed size also needs to be no larger than a sector.
515  	 * That's also why we only need one page as the parameter.
516  	 */
517  	if (compressed_folio)
518  		ASSERT(compressed_size <= sectorsize);
519  	else
520  		ASSERT(compressed_size == 0);
521  
522  	if (compressed_size && compressed_folio)
523  		cur_size = compressed_size;
524  
525  	if (!extent_inserted) {
526  		struct btrfs_key key;
527  		size_t datasize;
528  
529  		key.objectid = btrfs_ino(inode);
530  		key.offset = 0;
531  		key.type = BTRFS_EXTENT_DATA_KEY;
532  
533  		datasize = btrfs_file_extent_calc_inline_size(cur_size);
534  		ret = btrfs_insert_empty_item(trans, root, path, &key,
535  					      datasize);
536  		if (ret)
537  			goto fail;
538  	}
539  	leaf = path->nodes[0];
540  	ei = btrfs_item_ptr(leaf, path->slots[0],
541  			    struct btrfs_file_extent_item);
542  	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
543  	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
544  	btrfs_set_file_extent_encryption(leaf, ei, 0);
545  	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
546  	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
547  	ptr = btrfs_file_extent_inline_start(ei);
548  
549  	if (compress_type != BTRFS_COMPRESS_NONE) {
550  		kaddr = kmap_local_folio(compressed_folio, 0);
551  		write_extent_buffer(leaf, kaddr, ptr, compressed_size);
552  		kunmap_local(kaddr);
553  
554  		btrfs_set_file_extent_compression(leaf, ei,
555  						  compress_type);
556  	} else {
557  		struct folio *folio;
558  
559  		folio = __filemap_get_folio(inode->vfs_inode.i_mapping,
560  					    0, 0, 0);
561  		ASSERT(!IS_ERR(folio));
562  		btrfs_set_file_extent_compression(leaf, ei, 0);
563  		kaddr = kmap_local_folio(folio, 0);
564  		write_extent_buffer(leaf, kaddr, ptr, size);
565  		kunmap_local(kaddr);
566  		folio_put(folio);
567  	}
568  	btrfs_mark_buffer_dirty(trans, leaf);
569  	btrfs_release_path(path);
570  
571  	/*
572  	 * We align size to sectorsize for inline extents just for simplicity's
573  	 * sake.
574  	 */
575  	ret = btrfs_inode_set_file_extent_range(inode, 0,
576  					ALIGN(size, root->fs_info->sectorsize));
577  	if (ret)
578  		goto fail;
579  
580  	/*
581  	 * We're an inline extent, so nobody can extend the file past i_size
582  	 * without locking a page we already have locked.
583  	 *
584  	 * We must do any i_size and inode updates before we unlock the pages.
585  	 * Otherwise we could end up racing with unlink.
586  	 */
587  	i_size = i_size_read(&inode->vfs_inode);
588  	if (update_i_size && size > i_size) {
589  		i_size_write(&inode->vfs_inode, size);
590  		i_size = size;
591  	}
592  	inode->disk_i_size = i_size;
593  
594  fail:
595  	return ret;
596  }
597  
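/*
 * Check whether the range [offset, offset + size) is allowed to be written out
 * as an inline extent: it must start at file offset 0, fit within one sector
 * and within the max_inline limit, cover the whole in-memory i_size, and the
 * sectorsize must match PAGE_SIZE (no inline extents for subpage setups).
 */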
598  static bool can_cow_file_range_inline(struct btrfs_inode *inode,
599  				      u64 offset, u64 size,
600  				      size_t compressed_size)
601  {
602  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
603  	u64 data_len = (compressed_size ?: size);
604  
605  	/* Inline extents must start at offset 0. */
606  	if (offset != 0)
607  		return false;
608  
609  	/*
610  	 * Due to the page size limit, for subpage we can only trigger the
611  	 * writeback for the dirty sectors of the page, which means data writeback
612  	 * is doing more writeback than what we want.
613  	 *
614  	 * This is especially unexpected for some call sites like fallocate,
615  	 * where we only increase i_size after everything is done.
616  	 * This means we can trigger an inline extent even if we didn't want to.
617  	 * So here we skip inline extent creation completely.
618  	 */
619  	if (fs_info->sectorsize != PAGE_SIZE)
620  		return false;
621  
622  	/* Inline extents are limited to sectorsize. */
623  	if (size > fs_info->sectorsize)
624  		return false;
625  
626  	/* We cannot exceed the maximum inline data size. */
627  	if (data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
628  		return false;
629  
630  	/* We cannot exceed the user specified max_inline size. */
631  	if (data_len > fs_info->max_inline)
632  		return false;
633  
634  	/* Inline extents must be the entirety of the file. */
635  	if (size < i_size_read(&inode->vfs_inode))
636  		return false;
637  
638  	return true;
639  }
640  
641  /*
642   * conditionally insert an inline extent into the file.  This
643   * does the checks required to make sure the data is small enough
644   * to fit as an inline extent.
645   *
646   * If being used directly, you must have already checked we're allowed to cow
647   * the range by getting true from can_cow_file_range_inline().
648   */
649  static noinline int __cow_file_range_inline(struct btrfs_inode *inode, u64 offset,
650  					    u64 size, size_t compressed_size,
651  					    int compress_type,
652  					    struct folio *compressed_folio,
653  					    bool update_i_size)
654  {
655  	struct btrfs_drop_extents_args drop_args = { 0 };
656  	struct btrfs_root *root = inode->root;
657  	struct btrfs_fs_info *fs_info = root->fs_info;
658  	struct btrfs_trans_handle *trans;
659  	u64 data_len = (compressed_size ?: size);
660  	int ret;
661  	struct btrfs_path *path;
662  
663  	path = btrfs_alloc_path();
664  	if (!path)
665  		return -ENOMEM;
666  
667  	trans = btrfs_join_transaction(root);
668  	if (IS_ERR(trans)) {
669  		btrfs_free_path(path);
670  		return PTR_ERR(trans);
671  	}
672  	trans->block_rsv = &inode->block_rsv;
673  
674  	drop_args.path = path;
675  	drop_args.start = 0;
676  	drop_args.end = fs_info->sectorsize;
677  	drop_args.drop_cache = true;
678  	drop_args.replace_extent = true;
679  	drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len);
680  	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
681  	if (ret) {
682  		btrfs_abort_transaction(trans, ret);
683  		goto out;
684  	}
685  
686  	ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted,
687  				   size, compressed_size, compress_type,
688  				   compressed_folio, update_i_size);
689  	if (ret && ret != -ENOSPC) {
690  		btrfs_abort_transaction(trans, ret);
691  		goto out;
692  	} else if (ret == -ENOSPC) {
693  		ret = 1;
694  		goto out;
695  	}
696  
697  	btrfs_update_inode_bytes(inode, size, drop_args.bytes_found);
698  	ret = btrfs_update_inode(trans, inode);
699  	if (ret && ret != -ENOSPC) {
700  		btrfs_abort_transaction(trans, ret);
701  		goto out;
702  	} else if (ret == -ENOSPC) {
703  		ret = 1;
704  		goto out;
705  	}
706  
707  	btrfs_set_inode_full_sync(inode);
708  out:
709  	/*
710  	 * Don't forget to free the reserved space: an inlined extent won't count
711  	 * as a data extent, so free the reservation directly here.
712  	 * And at reserve time, it's always aligned to page size, so
713  	 * just free one page here.
714  	 */
715  	btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE, NULL);
716  	btrfs_free_path(path);
717  	btrfs_end_transaction(trans);
718  	return ret;
719  }
720  
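/*
 * Try to turn the delalloc range [offset, end] into an inline extent.
 *
 * Returns 0 on success, > 0 if the range does not qualify or we ran out of
 * space (the caller then falls back to a regular extent), < 0 on error.  On
 * success and on error the delalloc state of the range is cleared here,
 * including unlocking the folios as needed.
 */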
721  static noinline int cow_file_range_inline(struct btrfs_inode *inode,
722  					  struct folio *locked_folio,
723  					  u64 offset, u64 end,
724  					  size_t compressed_size,
725  					  int compress_type,
726  					  struct folio *compressed_folio,
727  					  bool update_i_size)
728  {
729  	struct extent_state *cached = NULL;
730  	unsigned long clear_flags = EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
731  		EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING | EXTENT_LOCKED;
732  	u64 size = min_t(u64, i_size_read(&inode->vfs_inode), end + 1);
733  	int ret;
734  
735  	if (!can_cow_file_range_inline(inode, offset, size, compressed_size))
736  		return 1;
737  
738  	lock_extent(&inode->io_tree, offset, end, &cached);
739  	ret = __cow_file_range_inline(inode, offset, size, compressed_size,
740  				      compress_type, compressed_folio,
741  				      update_i_size);
742  	if (ret > 0) {
743  		unlock_extent(&inode->io_tree, offset, end, &cached);
744  		return ret;
745  	}
746  
747  	/*
748  	 * In the successful case (ret == 0 here), cow_file_range will return 1.
749  	 *
750  	 * Quite a bit further up the callstack in extent_writepage(), ret == 1
751  	 * is treated as a short circuited success and does not unlock the folio,
752  	 * so we must do it here.
753  	 *
754  	 * In the failure case, the locked_folio does get unlocked by
755  	 * btrfs_folio_end_all_writers, which asserts that it is still locked
756  	 * at that point, so we must *not* unlock it here.
757  	 *
758  	 * The other two callsites in compress_file_range do not have a
759  	 * locked_folio, so they are not relevant to this logic.
760  	 */
761  	if (ret == 0)
762  		locked_folio = NULL;
763  
764  	extent_clear_unlock_delalloc(inode, offset, end, locked_folio, &cached,
765  				     clear_flags, PAGE_UNLOCK |
766  				     PAGE_START_WRITEBACK | PAGE_END_WRITEBACK);
767  	return ret;
768  }
769  
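/*
 * Describes one range produced by the async compression worker: either a
 * compressed extent (with its compressed folios) or an uncompressed range
 * (folios == NULL), queued on the owning async_chunk's extent list.
 */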
770  struct async_extent {
771  	u64 start;
772  	u64 ram_size;
773  	u64 compressed_size;
774  	struct folio **folios;
775  	unsigned long nr_folios;
776  	int compress_type;
777  	struct list_head list;
778  };
779  
780  struct async_chunk {
781  	struct btrfs_inode *inode;
782  	struct folio *locked_folio;
783  	u64 start;
784  	u64 end;
785  	blk_opf_t write_flags;
786  	struct list_head extents;
787  	struct cgroup_subsys_state *blkcg_css;
788  	struct btrfs_work work;
789  	struct async_cow *async_cow;
790  };
791  
792  struct async_cow {
793  	atomic_t num_chunks;
794  	struct async_chunk chunks[];
795  };
796  
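/* Allocate an async_extent describing the given range and queue it on the chunk. */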
797  static noinline int add_async_extent(struct async_chunk *cow,
798  				     u64 start, u64 ram_size,
799  				     u64 compressed_size,
800  				     struct folio **folios,
801  				     unsigned long nr_folios,
802  				     int compress_type)
803  {
804  	struct async_extent *async_extent;
805  
806  	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
807  	if (!async_extent)
808  		return -ENOMEM;
809  	async_extent->start = start;
810  	async_extent->ram_size = ram_size;
811  	async_extent->compressed_size = compressed_size;
812  	async_extent->folios = folios;
813  	async_extent->nr_folios = nr_folios;
814  	async_extent->compress_type = compress_type;
815  	list_add_tail(&async_extent->list, &cow->extents);
816  	return 0;
817  }
818  
819  /*
820   * Check if the inode needs to be submitted to compression, based on mount
821   * options, defragmentation, properties or heuristics.
822   */
823  static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
824  				      u64 end)
825  {
826  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
827  
828  	if (!btrfs_inode_can_compress(inode)) {
829  		WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
830  			KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
831  			btrfs_ino(inode));
832  		return 0;
833  	}
834  	/*
835  	 * Special check for subpage.
836  	 *
837  	 * We lock the full page then run each delalloc range in the page, thus
838  	 * for the following case, we will hit some subpage specific corner case:
839  	 *
840  	 * 0		32K		64K
841  	 * |	|///////|	|///////|
842  	 *		\- A		\- B
843  	 *
844  	 * In the above case, both range A and range B will try to unlock the full
845  	 * page [0, 64K), so whichever range finishes later will find the page
846  	 * already unlocked, triggering various page lock requirement BUG_ON()s.
847  	 *
848  	 * So here we add an artificial limit that subpage compression can only
849  	 * happen if the range is fully page aligned.
850  	 *
851  	 * In theory we only need to ensure the first page is fully covered, but
852  	 * the trailing partial page will be locked until the full compression
853  	 * finishes, delaying the write of other ranges.
854  	 *
855  	 * TODO: Make btrfs_run_delalloc_range() lock all delalloc ranges
856  	 * first to prevent any submitted async extent from unlocking the full page.
857  	 * By this, we can ensure for subpage case that only the last async_cow
858  	 * will unlock the full page.
859  	 */
860  	if (fs_info->sectorsize < PAGE_SIZE) {
861  		if (!PAGE_ALIGNED(start) ||
862  		    !PAGE_ALIGNED(end + 1))
863  			return 0;
864  	}
865  
866  	/* force compress */
867  	if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
868  		return 1;
869  	/* defrag ioctl */
870  	if (inode->defrag_compress)
871  		return 1;
872  	/* bad compression ratios */
873  	if (inode->flags & BTRFS_INODE_NOCOMPRESS)
874  		return 0;
875  	if (btrfs_test_opt(fs_info, COMPRESS) ||
876  	    inode->flags & BTRFS_INODE_COMPRESS ||
877  	    inode->prop_compress)
878  		return btrfs_compress_heuristic(inode, start, end);
879  	return 0;
880  }
881  
882  static inline void inode_should_defrag(struct btrfs_inode *inode,
883  		u64 start, u64 end, u64 num_bytes, u32 small_write)
884  {
885  	/* If this is a small write inside eof, kick off a defrag */
886  	if (num_bytes < small_write &&
887  	    (start > 0 || end + 1 < inode->disk_i_size))
888  		btrfs_add_inode_defrag(inode, small_write);
889  }
890  
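/*
 * Clear the dirty-for-IO state of every folio in the range.  Returns 0 on
 * success or the first folio lookup error encountered (missing folios are
 * reported but the remaining ones are still processed).
 */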
891  static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
892  {
893  	unsigned long end_index = end >> PAGE_SHIFT;
894  	struct folio *folio;
895  	int ret = 0;
896  
897  	for (unsigned long index = start >> PAGE_SHIFT;
898  	     index <= end_index; index++) {
899  		folio = __filemap_get_folio(inode->i_mapping, index, 0, 0);
900  		if (IS_ERR(folio)) {
901  			if (!ret)
902  				ret = PTR_ERR(folio);
903  			continue;
904  		}
905  		folio_clear_dirty_for_io(folio);
906  		folio_put(folio);
907  	}
908  	return ret;
909  }
910  
911  /*
912   * Work queue callback to start compression on a file and pages.
913   *
914   * This is done inside an ordered work queue, and the compression is spread
915   * across many cpus.  The actual IO submission is step two, and the ordered work
916   * queue takes care of making sure that happens in the same order things were
917   * put onto the queue by writepages and friends.
918   *
919   * If this code finds it can't get good compression, it puts an entry onto the
920   * work queue to write the uncompressed bytes.  This makes sure that both
921   * compressed inodes and uncompressed inodes are written in the same order that
922   * the flusher thread sent them down.
923   */
924  static void compress_file_range(struct btrfs_work *work)
925  {
926  	struct async_chunk *async_chunk =
927  		container_of(work, struct async_chunk, work);
928  	struct btrfs_inode *inode = async_chunk->inode;
929  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
930  	struct address_space *mapping = inode->vfs_inode.i_mapping;
931  	u64 blocksize = fs_info->sectorsize;
932  	u64 start = async_chunk->start;
933  	u64 end = async_chunk->end;
934  	u64 actual_end;
935  	u64 i_size;
936  	int ret = 0;
937  	struct folio **folios;
938  	unsigned long nr_folios;
939  	unsigned long total_compressed = 0;
940  	unsigned long total_in = 0;
941  	unsigned int poff;
942  	int i;
943  	int compress_type = fs_info->compress_type;
944  
945  	inode_should_defrag(inode, start, end, end - start + 1, SZ_16K);
946  
947  	/*
948  	 * We need to call clear_page_dirty_for_io on each page in the range.
949  	 * Otherwise applications with the file mmap'd can wander in and change
950  	 * the page contents while we are compressing them.
951  	 */
952  	ret = extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end);
953  
954  	/*
955  	 * All the folios should have been locked thus no failure.
956  	 *
957  	 * And even if some folios are missing, btrfs_compress_folios()
958  	 * would handle them correctly, so here just do an ASSERT() check for
959  	 * early logic errors.
960  	 */
961  	ASSERT(ret == 0);
962  
963  	/*
964  	 * We need to save i_size before now because it could change in between
965  	 * us evaluating the size and assigning it.  This is because we lock and
966  	 * unlock the page in truncate and fallocate, and then modify the i_size
967  	 * later on.
968  	 *
969  	 * The barriers are to emulate READ_ONCE, remove that once i_size_read
970  	 * does that for us.
971  	 */
972  	barrier();
973  	i_size = i_size_read(&inode->vfs_inode);
974  	barrier();
975  	actual_end = min_t(u64, i_size, end + 1);
976  again:
977  	folios = NULL;
978  	nr_folios = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
979  	nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED_PAGES);
980  
981  	/*
982  	 * we don't want to send crud past the end of i_size through
983  	 * compression, that's just a waste of CPU time.  So, if the
984  	 * end of the file is before the start of our current
985  	 * requested range of bytes, we bail out to the uncompressed
986  	 * cleanup code that can deal with all of this.
987  	 *
988  	 * It isn't really the fastest way to fix things, but this is a
989  	 * very uncommon corner.
990  	 */
991  	if (actual_end <= start)
992  		goto cleanup_and_bail_uncompressed;
993  
994  	total_compressed = actual_end - start;
995  
996  	/*
997  	 * Skip compression for a small file range (<= blocksize) that
998  	 * isn't an inline extent, since it doesn't save disk space at all.
999  	 */
1000  	if (total_compressed <= blocksize &&
1001  	   (start > 0 || end + 1 < inode->disk_i_size))
1002  		goto cleanup_and_bail_uncompressed;
1003  
1004  	/*
1005  	 * For subpage case, we require full page alignment for the sector
1006  	 * aligned range.
1007  	 * Thus we must also check against @actual_end, not just @end.
1008  	 */
1009  	if (blocksize < PAGE_SIZE) {
1010  		if (!PAGE_ALIGNED(start) ||
1011  		    !PAGE_ALIGNED(round_up(actual_end, blocksize)))
1012  			goto cleanup_and_bail_uncompressed;
1013  	}
1014  
1015  	total_compressed = min_t(unsigned long, total_compressed,
1016  			BTRFS_MAX_UNCOMPRESSED);
1017  	total_in = 0;
1018  	ret = 0;
1019  
1020  	/*
1021  	 * We do compression for mount -o compress and when the inode has not
1022  	 * been flagged as NOCOMPRESS.  This flag can change at any time if we
1023  	 * discover bad compression ratios.
1024  	 */
1025  	if (!inode_need_compress(inode, start, end))
1026  		goto cleanup_and_bail_uncompressed;
1027  
1028  	folios = kcalloc(nr_folios, sizeof(struct folio *), GFP_NOFS);
1029  	if (!folios) {
1030  		/*
1031  		 * Memory allocation failure is not a fatal error, we can fall
1032  		 * back to uncompressed code.
1033  		 */
1034  		goto cleanup_and_bail_uncompressed;
1035  	}
1036  
1037  	if (inode->defrag_compress)
1038  		compress_type = inode->defrag_compress;
1039  	else if (inode->prop_compress)
1040  		compress_type = inode->prop_compress;
1041  
1042  	/* Compression level is applied here. */
1043  	ret = btrfs_compress_folios(compress_type | (fs_info->compress_level << 4),
1044  				    mapping, start, folios, &nr_folios, &total_in,
1045  				    &total_compressed);
1046  	if (ret)
1047  		goto mark_incompressible;
1048  
1049  	/*
1050  	 * Zero the tail end of the last page, as we might be sending it down
1051  	 * to disk.
1052  	 */
1053  	poff = offset_in_page(total_compressed);
1054  	if (poff)
1055  		folio_zero_range(folios[nr_folios - 1], poff, PAGE_SIZE - poff);
1056  
1057  	/*
1058  	 * Try to create an inline extent.
1059  	 *
1060  	 * If we didn't compress the entire range, try to create an uncompressed
1061  	 * inline extent, else a compressed one.
1062  	 *
1063  	 * Check cow_file_range() for why we don't even try to create inline
1064  	 * extent for the subpage case.
1065  	 */
1066  	if (total_in < actual_end)
1067  		ret = cow_file_range_inline(inode, NULL, start, end, 0,
1068  					    BTRFS_COMPRESS_NONE, NULL, false);
1069  	else
1070  		ret = cow_file_range_inline(inode, NULL, start, end, total_compressed,
1071  					    compress_type, folios[0], false);
1072  	if (ret <= 0) {
1073  		if (ret < 0)
1074  			mapping_set_error(mapping, -EIO);
1075  		goto free_pages;
1076  	}
1077  
1078  	/*
1079  	 * We aren't doing an inline extent. Round the compressed size up to a
1080  	 * block size boundary so the allocator does sane things.
1081  	 */
1082  	total_compressed = ALIGN(total_compressed, blocksize);
1083  
1084  	/*
1085  	 * One last check to make sure the compression is really a win: compare
1086  	 * the page count read with the blocks on disk; compression must free at
1087  	 * least one sector.
1088  	 */
1089  	total_in = round_up(total_in, fs_info->sectorsize);
1090  	if (total_compressed + blocksize > total_in)
1091  		goto mark_incompressible;
1092  
1093  	/*
1094  	 * The async work queues will take care of doing actual allocation on
1095  	 * disk for these compressed pages, and will submit the bios.
1096  	 */
1097  	ret = add_async_extent(async_chunk, start, total_in, total_compressed, folios,
1098  			       nr_folios, compress_type);
1099  	BUG_ON(ret);
1100  	if (start + total_in < end) {
1101  		start += total_in;
1102  		cond_resched();
1103  		goto again;
1104  	}
1105  	return;
1106  
1107  mark_incompressible:
1108  	if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !inode->prop_compress)
1109  		inode->flags |= BTRFS_INODE_NOCOMPRESS;
1110  cleanup_and_bail_uncompressed:
1111  	ret = add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
1112  			       BTRFS_COMPRESS_NONE);
1113  	BUG_ON(ret);
1114  free_pages:
1115  	if (folios) {
1116  		for (i = 0; i < nr_folios; i++) {
1117  			WARN_ON(folios[i]->mapping);
1118  			btrfs_free_compr_folio(folios[i]);
1119  		}
1120  		kfree(folios);
1121  	}
1122  }
1123  
1124  static void free_async_extent_pages(struct async_extent *async_extent)
1125  {
1126  	int i;
1127  
1128  	if (!async_extent->folios)
1129  		return;
1130  
1131  	for (i = 0; i < async_extent->nr_folios; i++) {
1132  		WARN_ON(async_extent->folios[i]->mapping);
1133  		btrfs_free_compr_folio(async_extent->folios[i]);
1134  	}
1135  	kfree(async_extent->folios);
1136  	async_extent->nr_folios = 0;
1137  	async_extent->folios = NULL;
1138  }
1139  
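/*
 * Write out an async extent that was not compressed: run the regular COW path
 * over the range and, if that fails, clean up the ordered extents and
 * finish/unlock the locked folio so writeback can complete with an error.
 */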
1140  static void submit_uncompressed_range(struct btrfs_inode *inode,
1141  				      struct async_extent *async_extent,
1142  				      struct folio *locked_folio)
1143  {
1144  	u64 start = async_extent->start;
1145  	u64 end = async_extent->start + async_extent->ram_size - 1;
1146  	int ret;
1147  	struct writeback_control wbc = {
1148  		.sync_mode		= WB_SYNC_ALL,
1149  		.range_start		= start,
1150  		.range_end		= end,
1151  		.no_cgroup_owner	= 1,
1152  	};
1153  
1154  	wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode);
1155  	ret = run_delalloc_cow(inode, locked_folio, start, end,
1156  			       &wbc, false);
1157  	wbc_detach_inode(&wbc);
1158  	if (ret < 0) {
1159  		btrfs_cleanup_ordered_extents(inode, locked_folio,
1160  					      start, end - start + 1);
1161  		if (locked_folio) {
1162  			const u64 page_start = folio_pos(locked_folio);
1163  
1164  			folio_start_writeback(locked_folio);
1165  			folio_end_writeback(locked_folio);
1166  			btrfs_mark_ordered_io_finished(inode, locked_folio,
1167  						       page_start, PAGE_SIZE,
1168  						       !ret);
1169  			mapping_set_error(locked_folio->mapping, ret);
1170  			folio_unlock(locked_folio);
1171  		}
1172  	}
1173  }
1174  
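/*
 * Allocate disk space for one async extent and submit it.  Compressed extents
 * get a contiguous reservation, an extent map and an ordered extent, and are
 * handed to btrfs_submit_compressed_write().  If the reservation fails, or the
 * extent was never compressed, fall back to the uncompressed path via
 * submit_uncompressed_range().
 */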
1175  static void submit_one_async_extent(struct async_chunk *async_chunk,
1176  				    struct async_extent *async_extent,
1177  				    u64 *alloc_hint)
1178  {
1179  	struct btrfs_inode *inode = async_chunk->inode;
1180  	struct extent_io_tree *io_tree = &inode->io_tree;
1181  	struct btrfs_root *root = inode->root;
1182  	struct btrfs_fs_info *fs_info = root->fs_info;
1183  	struct btrfs_ordered_extent *ordered;
1184  	struct btrfs_file_extent file_extent;
1185  	struct btrfs_key ins;
1186  	struct folio *locked_folio = NULL;
1187  	struct extent_state *cached = NULL;
1188  	struct extent_map *em;
1189  	int ret = 0;
1190  	u64 start = async_extent->start;
1191  	u64 end = async_extent->start + async_extent->ram_size - 1;
1192  
1193  	if (async_chunk->blkcg_css)
1194  		kthread_associate_blkcg(async_chunk->blkcg_css);
1195  
1196  	/*
1197  	 * If async_chunk->locked_folio is in the async_extent range, we need to
1198  	 * handle it.
1199  	 */
1200  	if (async_chunk->locked_folio) {
1201  		u64 locked_folio_start = folio_pos(async_chunk->locked_folio);
1202  		u64 locked_folio_end = locked_folio_start +
1203  			folio_size(async_chunk->locked_folio) - 1;
1204  
1205  		if (!(start >= locked_folio_end || end <= locked_folio_start))
1206  			locked_folio = async_chunk->locked_folio;
1207  	}
1208  
1209  	if (async_extent->compress_type == BTRFS_COMPRESS_NONE) {
1210  		submit_uncompressed_range(inode, async_extent, locked_folio);
1211  		goto done;
1212  	}
1213  
1214  	ret = btrfs_reserve_extent(root, async_extent->ram_size,
1215  				   async_extent->compressed_size,
1216  				   async_extent->compressed_size,
1217  				   0, *alloc_hint, &ins, 1, 1);
1218  	if (ret) {
1219  		/*
1220  		 * We can't reserve contiguous space for the compressed size.
1221  		 * Unlikely, but it's possible that we could have enough
1222  		 * non-contiguous space for the uncompressed size instead.  So
1223  		 * fall back to uncompressed.
1224  		 */
1225  		submit_uncompressed_range(inode, async_extent, locked_folio);
1226  		goto done;
1227  	}
1228  
1229  	lock_extent(io_tree, start, end, &cached);
1230  
1231  	/* Here we're doing allocation and writeback of the compressed pages */
1232  	file_extent.disk_bytenr = ins.objectid;
1233  	file_extent.disk_num_bytes = ins.offset;
1234  	file_extent.ram_bytes = async_extent->ram_size;
1235  	file_extent.num_bytes = async_extent->ram_size;
1236  	file_extent.offset = 0;
1237  	file_extent.compression = async_extent->compress_type;
1238  
1239  	em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED);
1240  	if (IS_ERR(em)) {
1241  		ret = PTR_ERR(em);
1242  		goto out_free_reserve;
1243  	}
1244  	free_extent_map(em);
1245  
1246  	ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
1247  					     1 << BTRFS_ORDERED_COMPRESSED);
1248  	if (IS_ERR(ordered)) {
1249  		btrfs_drop_extent_map_range(inode, start, end, false);
1250  		ret = PTR_ERR(ordered);
1251  		goto out_free_reserve;
1252  	}
1253  	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1254  
1255  	/* Clear dirty, set writeback and unlock the pages. */
1256  	extent_clear_unlock_delalloc(inode, start, end,
1257  			NULL, &cached, EXTENT_LOCKED | EXTENT_DELALLOC,
1258  			PAGE_UNLOCK | PAGE_START_WRITEBACK);
1259  	btrfs_submit_compressed_write(ordered,
1260  			    async_extent->folios,	/* compressed_folios */
1261  			    async_extent->nr_folios,
1262  			    async_chunk->write_flags, true);
1263  	*alloc_hint = ins.objectid + ins.offset;
1264  done:
1265  	if (async_chunk->blkcg_css)
1266  		kthread_associate_blkcg(NULL);
1267  	kfree(async_extent);
1268  	return;
1269  
1270  out_free_reserve:
1271  	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1272  	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
1273  	mapping_set_error(inode->vfs_inode.i_mapping, -EIO);
1274  	extent_clear_unlock_delalloc(inode, start, end,
1275  				     NULL, &cached,
1276  				     EXTENT_LOCKED | EXTENT_DELALLOC |
1277  				     EXTENT_DELALLOC_NEW |
1278  				     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
1279  				     PAGE_UNLOCK | PAGE_START_WRITEBACK |
1280  				     PAGE_END_WRITEBACK);
1281  	free_async_extent_pages(async_extent);
1282  	if (async_chunk->blkcg_css)
1283  		kthread_associate_blkcg(NULL);
1284  	btrfs_debug(fs_info,
1285  "async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d",
1286  		    btrfs_root_id(root), btrfs_ino(inode), start,
1287  		    async_extent->ram_size, ret);
1288  	kfree(async_extent);
1289  }
1290  
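/*
 * Pick a disk block number to use as an allocation hint for new extents of
 * @inode near @start, based on an existing extent mapping.  Returns 0 if no
 * suitable mapping is found.
 */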
1291  u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
1292  				     u64 num_bytes)
1293  {
1294  	struct extent_map_tree *em_tree = &inode->extent_tree;
1295  	struct extent_map *em;
1296  	u64 alloc_hint = 0;
1297  
1298  	read_lock(&em_tree->lock);
1299  	em = search_extent_mapping(em_tree, start, num_bytes);
1300  	if (em) {
1301  		/*
1302  		 * if block start isn't an actual block number then find the
1303  		 * first block in this inode and use that as a hint.  If that
1304  		 * block is also bogus then just don't worry about it.
1305  		 */
1306  		if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
1307  			free_extent_map(em);
1308  			em = search_extent_mapping(em_tree, 0, 0);
1309  			if (em && em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
1310  				alloc_hint = extent_map_block_start(em);
1311  			if (em)
1312  				free_extent_map(em);
1313  		} else {
1314  			alloc_hint = extent_map_block_start(em);
1315  			free_extent_map(em);
1316  		}
1317  	}
1318  	read_unlock(&em_tree->lock);
1319  
1320  	return alloc_hint;
1321  }
1322  
1323  /*
1324   * when extent_io.c finds a delayed allocation range in the file,
1325   * the call backs end up in this code.  The basic idea is to
1326   * allocate extents on disk for the range, and create ordered data structs
1327   * in ram to track those extents.
1328   *
1329   * locked_folio is the folio that writepage had locked already.  We use
1330   * it to make sure we don't do extra locks or unlocks.
1331   *
1332   * When this function fails, it unlocks all pages except @locked_folio.
1333   *
1334   * When this function successfully creates an inline extent, it returns 1 and
1335   * unlocks all pages including locked_folio and starts I/O on them.
1336   * (In reality inline extents are limited to a single page, so locked_folio is
1337   * the only page handled anyway).
1338   *
1339   * When this function succeeds and creates a normal extent, the page locking
1340   * status depends on the passed in flags:
1341   *
1342   * - If @keep_locked is set, all pages are kept locked.
1343   * - Else all pages except for @locked_folio are unlocked.
1344   *
1345   * When a failure happens in the second or later iteration of the
1346   * while-loop, the ordered extents created in previous iterations are kept
1347   * intact. So, the caller must clean them up by calling
1348   * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for
1349   * example.
1350   */
1351  static noinline int cow_file_range(struct btrfs_inode *inode,
1352  				   struct folio *locked_folio, u64 start,
1353  				   u64 end, u64 *done_offset,
1354  				   bool keep_locked, bool no_inline)
1355  {
1356  	struct btrfs_root *root = inode->root;
1357  	struct btrfs_fs_info *fs_info = root->fs_info;
1358  	struct extent_state *cached = NULL;
1359  	u64 alloc_hint = 0;
1360  	u64 orig_start = start;
1361  	u64 num_bytes;
1362  	unsigned long ram_size;
1363  	u64 cur_alloc_size = 0;
1364  	u64 min_alloc_size;
1365  	u64 blocksize = fs_info->sectorsize;
1366  	struct btrfs_key ins;
1367  	struct extent_map *em;
1368  	unsigned clear_bits;
1369  	unsigned long page_ops;
1370  	bool extent_reserved = false;
1371  	int ret = 0;
1372  
1373  	if (btrfs_is_free_space_inode(inode)) {
1374  		ret = -EINVAL;
1375  		goto out_unlock;
1376  	}
1377  
1378  	num_bytes = ALIGN(end - start + 1, blocksize);
1379  	num_bytes = max(blocksize,  num_bytes);
1380  	ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));
1381  
1382  	inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
1383  
1384  	if (!no_inline) {
1385  		/* lets try to make an inline extent */
1386  		ret = cow_file_range_inline(inode, locked_folio, start, end, 0,
1387  					    BTRFS_COMPRESS_NONE, NULL, false);
1388  		if (ret <= 0) {
1389  			/*
1390  			 * We succeeded, return 1 so the caller knows we're done
1391  			 * with this page and already handled the IO.
1392  			 *
1393  			 * If there was an error then cow_file_range_inline() has
1394  			 * already done the cleanup.
1395  			 */
1396  			if (ret == 0)
1397  				ret = 1;
1398  			goto done;
1399  		}
1400  	}
1401  
1402  	alloc_hint = btrfs_get_extent_allocation_hint(inode, start, num_bytes);
1403  
1404  	/*
1405  	 * Relocation relies on the relocated extents to have exactly the same
1406  	 * size as the original extents. Normally writeback for relocation data
1407  	 * extents follows a NOCOW path because relocation preallocates the
1408  	 * extents. However, due to an operation such as scrub turning a block
1409  	 * group to RO mode, it may fall back to COW mode, so we must make sure
1410  	 * an extent allocated during COW has exactly the requested size and can
1411  	 * not be split into smaller extents, otherwise relocation breaks and
1412  	 * fails during the stage where it updates the bytenr of file extent
1413  	 * items.
1414  	 */
1415  	if (btrfs_is_data_reloc_root(root))
1416  		min_alloc_size = num_bytes;
1417  	else
1418  		min_alloc_size = fs_info->sectorsize;
1419  
1420  	while (num_bytes > 0) {
1421  		struct btrfs_ordered_extent *ordered;
1422  		struct btrfs_file_extent file_extent;
1423  
1424  		cur_alloc_size = num_bytes;
1425  		ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
1426  					   min_alloc_size, 0, alloc_hint,
1427  					   &ins, 1, 1);
1428  		if (ret == -EAGAIN) {
1429  			/*
1430  			 * btrfs_reserve_extent only returns -EAGAIN for zoned
1431  			 * file systems, which is an indication that there are
1432  			 * no active zones to allocate from at the moment.
1433  			 *
1434  			 * If this is the first loop iteration, wait for at
1435  			 * least one zone to finish before retrying the
1436  			 * allocation.  Otherwise ask the caller to write out
1437  			 * the already allocated blocks before coming back to
1438  			 * us, or return -ENOSPC if it can't handle retries.
1439  			 */
1440  			ASSERT(btrfs_is_zoned(fs_info));
1441  			if (start == orig_start) {
1442  				wait_on_bit_io(&inode->root->fs_info->flags,
1443  					       BTRFS_FS_NEED_ZONE_FINISH,
1444  					       TASK_UNINTERRUPTIBLE);
1445  				continue;
1446  			}
1447  			if (done_offset) {
1448  				*done_offset = start - 1;
1449  				return 0;
1450  			}
1451  			ret = -ENOSPC;
1452  		}
1453  		if (ret < 0)
1454  			goto out_unlock;
1455  		cur_alloc_size = ins.offset;
1456  		extent_reserved = true;
1457  
1458  		ram_size = ins.offset;
1459  		file_extent.disk_bytenr = ins.objectid;
1460  		file_extent.disk_num_bytes = ins.offset;
1461  		file_extent.num_bytes = ins.offset;
1462  		file_extent.ram_bytes = ins.offset;
1463  		file_extent.offset = 0;
1464  		file_extent.compression = BTRFS_COMPRESS_NONE;
1465  
1466  		lock_extent(&inode->io_tree, start, start + ram_size - 1,
1467  			    &cached);
1468  
1469  		em = btrfs_create_io_em(inode, start, &file_extent,
1470  					BTRFS_ORDERED_REGULAR);
1471  		if (IS_ERR(em)) {
1472  			unlock_extent(&inode->io_tree, start,
1473  				      start + ram_size - 1, &cached);
1474  			ret = PTR_ERR(em);
1475  			goto out_reserve;
1476  		}
1477  		free_extent_map(em);
1478  
1479  		ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
1480  						     1 << BTRFS_ORDERED_REGULAR);
1481  		if (IS_ERR(ordered)) {
1482  			unlock_extent(&inode->io_tree, start,
1483  				      start + ram_size - 1, &cached);
1484  			ret = PTR_ERR(ordered);
1485  			goto out_drop_extent_cache;
1486  		}
1487  
1488  		if (btrfs_is_data_reloc_root(root)) {
1489  			ret = btrfs_reloc_clone_csums(ordered);
1490  
1491  			/*
1492  			 * Only drop cache here, and process as normal.
1493  			 *
1494  			 * We must not allow extent_clear_unlock_delalloc()
1495  			 * at out_unlock label to free meta of this ordered
1496  			 * extent, as its meta should be freed by
1497  			 * btrfs_finish_ordered_io().
1498  			 *
1499  			 * So we must continue until @start is increased to
1500  			 * skip current ordered extent.
1501  			 */
1502  			if (ret)
1503  				btrfs_drop_extent_map_range(inode, start,
1504  							    start + ram_size - 1,
1505  							    false);
1506  		}
1507  		btrfs_put_ordered_extent(ordered);
1508  
1509  		btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1510  
1511  		/*
1512  		 * We're not doing compressed IO, don't unlock the first page
1513  		 * (which the caller expects to stay locked), don't clear any
1514  		 * dirty bits and don't set any writeback bits
1515  		 *
1516  		 * Do set the Ordered (Private2) bit so we know this page was
1517  		 * properly set up for writepage.
1518  		 */
1519  		page_ops = (keep_locked ? 0 : PAGE_UNLOCK);
1520  		page_ops |= PAGE_SET_ORDERED;
1521  
1522  		extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
1523  					     locked_folio, &cached,
1524  					     EXTENT_LOCKED | EXTENT_DELALLOC,
1525  					     page_ops);
1526  		if (num_bytes < cur_alloc_size)
1527  			num_bytes = 0;
1528  		else
1529  			num_bytes -= cur_alloc_size;
1530  		alloc_hint = ins.objectid + ins.offset;
1531  		start += cur_alloc_size;
1532  		extent_reserved = false;
1533  
1534  		/*
1535  		 * On btrfs_reloc_clone_csums() error: since start has been increased,
1536  		 * extent_clear_unlock_delalloc() at the out_unlock label won't
1537  		 * free the metadata of the current ordered extent, so we're OK to exit.
1538  		 */
1539  		if (ret)
1540  			goto out_unlock;
1541  	}
1542  done:
1543  	if (done_offset)
1544  		*done_offset = end;
1545  	return ret;
1546  
1547  out_drop_extent_cache:
1548  	btrfs_drop_extent_map_range(inode, start, start + ram_size - 1, false);
1549  out_reserve:
1550  	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1551  	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
1552  out_unlock:
1553  	/*
1554  	 * Now, we have three regions to clean up:
1555  	 *
1556  	 * |-------(1)----|---(2)---|-------------(3)----------|
1557  	 * `- orig_start  `- start  `- start + cur_alloc_size  `- end
1558  	 *
1559  	 * We process each region below.
1560  	 */
1561  
1562  	clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
1563  		EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
1564  	page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
1565  
1566  	/*
1567  	 * For the range (1). We have already instantiated the ordered extents
1568  	 * for this region. They are cleaned up by
1569  	 * btrfs_cleanup_ordered_extents() in, e.g.,
1570  	 * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are
1571  	 * already cleared in the above loop. And, EXTENT_DELALLOC_NEW |
1572  	 * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup
1573  	 * function.
1574  	 *
1575  	 * However, in case of @keep_locked, we still need to unlock the pages
1576  	 * (except @locked_folio) to ensure all the pages are unlocked.
1577  	 */
1578  	if (keep_locked && orig_start < start) {
1579  		if (!locked_folio)
1580  			mapping_set_error(inode->vfs_inode.i_mapping, ret);
1581  		extent_clear_unlock_delalloc(inode, orig_start, start - 1,
1582  					     locked_folio, NULL, 0, page_ops);
1583  	}
1584  
1585  	/*
1586  	 * At this point we're unlocked, we want to make sure we're only
1587  	 * clearing these flags under the extent lock, so lock the rest of the
1588  	 * range and clear everything up.
1589  	 */
1590  	lock_extent(&inode->io_tree, start, end, NULL);
1591  
1592  	/*
1593  	 * For the range (2). If we reserved an extent for our delalloc range
1594  	 * (or a subrange) and failed to create the respective ordered extent,
1595  	 * then it means that when we reserved the extent we decremented the
1596  	 * extent's size from the data space_info's bytes_may_use counter and
1597  	 * incremented the space_info's bytes_reserved counter by the same
1598  	 * amount. We must make sure extent_clear_unlock_delalloc() does not try
1599  	 * to decrement again the data space_info's bytes_may_use counter,
1600  	 * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV.
1601  	 */
1602  	if (extent_reserved) {
1603  		extent_clear_unlock_delalloc(inode, start,
1604  					     start + cur_alloc_size - 1,
1605  					     locked_folio, &cached, clear_bits,
1606  					     page_ops);
1607  		btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL);
1608  		start += cur_alloc_size;
1609  	}
1610  
1611  	/*
1612  	 * For the range (3). We never touched the region. In addition to the
1613  	 * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data
1614  	 * space_info's bytes_may_use counter, reserved in
1615  	 * btrfs_check_data_free_space().
1616  	 */
1617  	if (start < end) {
1618  		clear_bits |= EXTENT_CLEAR_DATA_RESV;
1619  		extent_clear_unlock_delalloc(inode, start, end, locked_folio,
1620  					     &cached, clear_bits, page_ops);
1621  		btrfs_qgroup_free_data(inode, NULL, start, end - start + 1, NULL);
1622  	}
1623  	return ret;
1624  }
1625  
1626  /*
1627   * Phase two of compressed writeback.  This is the ordered portion of the code,
1628   * which only gets called in the order the work was queued.  We walk all the
1629   * async extents created by compress_file_range and send them down to the disk.
1630   *
1631   * If called with @do_free == true then it'll try to finish the work and free
1632   * the work struct eventually.
1633   */
1634  static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_free)
1635  {
1636  	struct async_chunk *async_chunk = container_of(work, struct async_chunk,
1637  						     work);
1638  	struct btrfs_fs_info *fs_info = btrfs_work_owner(work);
1639  	struct async_extent *async_extent;
1640  	unsigned long nr_pages;
1641  	u64 alloc_hint = 0;
1642  
1643  	if (do_free) {
1644  		struct async_cow *async_cow;
1645  
1646  		btrfs_add_delayed_iput(async_chunk->inode);
1647  		if (async_chunk->blkcg_css)
1648  			css_put(async_chunk->blkcg_css);
1649  
1650  		async_cow = async_chunk->async_cow;
1651  		if (atomic_dec_and_test(&async_cow->num_chunks))
1652  			kvfree(async_cow);
1653  		return;
1654  	}
1655  
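	/*
	 * The chunk range [start, end] is inclusive, so adding PAGE_SIZE
	 * before the shift rounds the length up to a whole number of pages.
	 */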
1656  	nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
1657  		PAGE_SHIFT;
1658  
1659  	while (!list_empty(&async_chunk->extents)) {
1660  		async_extent = list_entry(async_chunk->extents.next,
1661  					  struct async_extent, list);
1662  		list_del(&async_extent->list);
1663  		submit_one_async_extent(async_chunk, async_extent, &alloc_hint);
1664  	}
1665  
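	/*
	 * Drop this chunk's page count from the global async delalloc counter
	 * and wake up anyone throttled on it once we fall below the threshold.
	 */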
1666  	/* atomic_sub_return implies a barrier */
1667  	if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
1668  	    5 * SZ_1M)
1669  		cond_wake_up_nomb(&fs_info->async_submit_wait);
1670  }
1671  
1672  static bool run_delalloc_compressed(struct btrfs_inode *inode,
1673  				    struct folio *locked_folio, u64 start,
1674  				    u64 end, struct writeback_control *wbc)
1675  {
1676  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1677  	struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc);
1678  	struct async_cow *ctx;
1679  	struct async_chunk *async_chunk;
1680  	unsigned long nr_pages;
1681  	u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K);
1682  	int i;
1683  	unsigned nofs_flag;
1684  	const blk_opf_t write_flags = wbc_to_write_flags(wbc);
1685  
1686  	nofs_flag = memalloc_nofs_save();
1687  	ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL);
1688  	memalloc_nofs_restore(nofs_flag);
1689  	if (!ctx)
1690  		return false;
1691  
1692  	set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
1693  
1694  	async_chunk = ctx->chunks;
1695  	atomic_set(&ctx->num_chunks, num_chunks);
1696  
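	/*
	 * Split the delalloc range into SZ_512K sized chunks, each handed to
	 * its own async_chunk work item so compression can proceed in
	 * parallel.
	 */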
1697  	for (i = 0; i < num_chunks; i++) {
1698  		u64 cur_end = min(end, start + SZ_512K - 1);
1699  
1700  		/*
1701  		 * igrab is called higher up in the call chain, take only the
1702  		 * lightweight reference for the callback lifetime
1703  		 */
1704  		ihold(&inode->vfs_inode);
1705  		async_chunk[i].async_cow = ctx;
1706  		async_chunk[i].inode = inode;
1707  		async_chunk[i].start = start;
1708  		async_chunk[i].end = cur_end;
1709  		async_chunk[i].write_flags = write_flags;
1710  		INIT_LIST_HEAD(&async_chunk[i].extents);
1711  
1712  		/*
1713  		 * The locked_folio comes all the way from writepage and it's
1714  		 * the original folio we were actually given.  As we spread
1715  		 * this large delalloc region across multiple async_chunk
1716  		 * structs, only the first struct needs a pointer to
1717  		 * locked_folio.
1718  		 *
1719  		 * This way we don't need racy decisions about who is supposed
1720  		 * to unlock it.
1721  		 */
1722  		if (locked_folio) {
1723  			/*
1724  			 * Depending on the compressibility, the pages might or
1725  			 * might not go through async.  We want all of them to
1726  			 * be accounted against wbc once.  Let's do it here
1727  			 * before the paths diverge.  wbc accounting is used
1728  			 * only for foreign writeback detection and doesn't
1729  			 * need full accuracy.  Just account the whole thing
1730  			 * against the first page.
1731  			 */
1732  			wbc_account_cgroup_owner(wbc, &locked_folio->page,
1733  						 cur_end - start);
1734  			async_chunk[i].locked_folio = locked_folio;
1735  			locked_folio = NULL;
1736  		} else {
1737  			async_chunk[i].locked_folio = NULL;
1738  		}
1739  
1740  		if (blkcg_css != blkcg_root_css) {
1741  			css_get(blkcg_css);
1742  			async_chunk[i].blkcg_css = blkcg_css;
1743  			async_chunk[i].write_flags |= REQ_BTRFS_CGROUP_PUNT;
1744  		} else {
1745  			async_chunk[i].blkcg_css = NULL;
1746  		}
1747  
1748  		btrfs_init_work(&async_chunk[i].work, compress_file_range,
1749  				submit_compressed_extents);
1750  
1751  		nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
1752  		atomic_add(nr_pages, &fs_info->async_delalloc_pages);
1753  
1754  		btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work);
1755  
1756  		start = cur_end + 1;
1757  	}
1758  	return true;
1759  }
1760  
1761  /*
1762   * Run the delalloc range from start to end, and write back any dirty pages
1763   * covered by the range.
1764   */
1765  static noinline int run_delalloc_cow(struct btrfs_inode *inode,
1766  				     struct folio *locked_folio, u64 start,
1767  				     u64 end, struct writeback_control *wbc,
1768  				     bool pages_dirty)
1769  {
1770  	u64 done_offset = end;
1771  	int ret;
1772  
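	/*
	 * cow_file_range() may make only partial progress (e.g. on zoned
	 * devices waiting for a zone finish) and report it via @done_offset.
	 * Write out what was allocated so far, then retry the remainder.
	 */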
1773  	while (start <= end) {
1774  		ret = cow_file_range(inode, locked_folio, start, end,
1775  				     &done_offset, true, false);
1776  		if (ret)
1777  			return ret;
1778  		extent_write_locked_range(&inode->vfs_inode, locked_folio,
1779  					  start, done_offset, wbc, pages_dirty);
1780  		start = done_offset + 1;
1781  	}
1782  
1783  	return 1;
1784  }
1785  
1786  static int fallback_to_cow(struct btrfs_inode *inode,
1787  			   struct folio *locked_folio, const u64 start,
1788  			   const u64 end)
1789  {
1790  	const bool is_space_ino = btrfs_is_free_space_inode(inode);
1791  	const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root);
1792  	const u64 range_bytes = end + 1 - start;
1793  	struct extent_io_tree *io_tree = &inode->io_tree;
1794  	struct extent_state *cached_state = NULL;
1795  	u64 range_start = start;
1796  	u64 count;
1797  	int ret;
1798  
1799  	/*
1800  	 * If EXTENT_NORESERVE is set it means that when the buffered write was
1801  	 * made we did not have enough available data space and therefore did
1802  	 * not reserve data space for it, since we thought we could do NOCOW for
1803  	 * the respective file range (either there is a prealloc extent or the
1804  	 * inode has the NOCOW bit set).
1805  	 *
1806  	 * However when we need to fallback to COW mode (because for example the
1807  	 * block group for the corresponding extent was turned to RO mode by a
1808  	 * scrub or relocation) we need to do the following:
1809  	 *
1810  	 * 1) We increment the bytes_may_use counter of the data space info.
1811  	 *    If COW succeeds, it allocates a new data extent and after doing
1812  	 *    that it decrements the space info's bytes_may_use counter and
1813  	 *    increments its bytes_reserved counter by the same amount (we do
1814  	 *    this at btrfs_add_reserved_bytes()). So we need to increment the
1815  	 *    bytes_may_use counter to compensate (when space is reserved at
1816  	 *    buffered write time, the bytes_may_use counter is incremented);
1817  	 *
1818  	 * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so
1819  	 *    that if the COW path fails for any reason, it decrements (through
1820  	 *    extent_clear_unlock_delalloc()) the bytes_may_use counter of the
1821  	 *    data space info, which we incremented in the step above.
1822  	 *
1823  	 * If we need to fallback to cow and the inode corresponds to a free
1824  	 * space cache inode or an inode of the data relocation tree, we must
1825  	 * also increment bytes_may_use of the data space_info for the same
1826  	 * reason. Space caches and relocated data extents always get a prealloc
1827  	 * extent for them, however scrub or balance may have set the block
1828  	 * group that contains that extent to RO mode and therefore force COW
1829  	 * when starting writeback.
1830  	 */
1831  	lock_extent(io_tree, start, end, &cached_state);
1832  	count = count_range_bits(io_tree, &range_start, end, range_bytes,
1833  				 EXTENT_NORESERVE, 0, NULL);
1834  	if (count > 0 || is_space_ino || is_reloc_ino) {
1835  		u64 bytes = count;
1836  		struct btrfs_fs_info *fs_info = inode->root->fs_info;
1837  		struct btrfs_space_info *sinfo = fs_info->data_sinfo;
1838  
1839  		if (is_space_ino || is_reloc_ino)
1840  			bytes = range_bytes;
1841  
1842  		spin_lock(&sinfo->lock);
1843  		btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes);
1844  		spin_unlock(&sinfo->lock);
1845  
1846  		if (count > 0)
1847  			clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
1848  					 NULL);
1849  	}
1850  	unlock_extent(io_tree, start, end, &cached_state);
1851  
1852  	/*
1853  	 * Don't try to create inline extents, as a mix of inline extent that
1854  	 * is written out and unlocked directly and a normal NOCOW extent
1855  	 * doesn't work.
1856  	 */
1857  	ret = cow_file_range(inode, locked_folio, start, end, NULL, false,
1858  			     true);
1859  	ASSERT(ret != 1);
1860  	return ret;
1861  }
1862  
1863  struct can_nocow_file_extent_args {
1864  	/* Input fields. */
1865  
1866  	/* Start file offset of the range we want to NOCOW. */
1867  	u64 start;
1868  	/* End file offset (inclusive) of the range we want to NOCOW. */
1869  	u64 end;
1870  	bool writeback_path;
1871  	bool strict;
1872  	/*
1873  	 * Free the path passed to can_nocow_file_extent() once it's not needed
1874  	 * anymore.
1875  	 */
1876  	bool free_path;
1877  
1878  	/*
1879  	 * Output fields. Only set when can_nocow_file_extent() returns 1.
1880  	 * The expected file extent for the NOCOW write.
1881  	 */
1882  	struct btrfs_file_extent file_extent;
1883  };
1884  
1885  /*
1886   * Check if we can NOCOW the file extent that the path points to.
1887   * This function may return with the path released, so the caller should check
1888   * if path->nodes[0] is NULL or not if it needs to use the path afterwards.
1889   *
1890   * Returns: < 0 on error
1891   *            0 if we can not NOCOW
1892   *            1 if we can NOCOW
1893   */
1894  static int can_nocow_file_extent(struct btrfs_path *path,
1895  				 struct btrfs_key *key,
1896  				 struct btrfs_inode *inode,
1897  				 struct can_nocow_file_extent_args *args)
1898  {
1899  	const bool is_freespace_inode = btrfs_is_free_space_inode(inode);
1900  	struct extent_buffer *leaf = path->nodes[0];
1901  	struct btrfs_root *root = inode->root;
1902  	struct btrfs_file_extent_item *fi;
1903  	struct btrfs_root *csum_root;
1904  	u64 io_start;
1905  	u64 extent_end;
1906  	u8 extent_type;
1907  	int can_nocow = 0;
1908  	int ret = 0;
1909  	bool nowait = path->nowait;
1910  
1911  	fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
1912  	extent_type = btrfs_file_extent_type(leaf, fi);
1913  
1914  	if (extent_type == BTRFS_FILE_EXTENT_INLINE)
1915  		goto out;
1916  
1917  	if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
1918  	    extent_type == BTRFS_FILE_EXTENT_REG)
1919  		goto out;
1920  
1921  	/*
1922  	 * If the extent was created before the generation where the last snapshot
1923  	 * for its subvolume was created, then this implies the extent is shared,
1924  	 * hence we must COW.
1925  	 */
1926  	if (!args->strict &&
1927  	    btrfs_file_extent_generation(leaf, fi) <=
1928  	    btrfs_root_last_snapshot(&root->root_item))
1929  		goto out;
1930  
1931  	/* An explicit hole, must COW. */
1932  	if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
1933  		goto out;
1934  
1935  	/* Compressed/encrypted/encoded extents must be COWed. */
1936  	if (btrfs_file_extent_compression(leaf, fi) ||
1937  	    btrfs_file_extent_encryption(leaf, fi) ||
1938  	    btrfs_file_extent_other_encoding(leaf, fi))
1939  		goto out;
1940  
1941  	extent_end = btrfs_file_extent_end(path);
1942  
1943  	args->file_extent.disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1944  	args->file_extent.disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1945  	args->file_extent.ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
1946  	args->file_extent.offset = btrfs_file_extent_offset(leaf, fi);
1947  	args->file_extent.compression = btrfs_file_extent_compression(leaf, fi);
1948  
1949  	/*
1950  	 * The following checks can be expensive, as they need to take other
1951  	 * locks and do btree or rbtree searches, so release the path to avoid
1952  	 * blocking other tasks for too long.
1953  	 */
1954  	btrfs_release_path(path);
1955  
1956  	ret = btrfs_cross_ref_exist(root, btrfs_ino(inode),
1957  				    key->offset - args->file_extent.offset,
1958  				    args->file_extent.disk_bytenr, args->strict, path);
1959  	WARN_ON_ONCE(ret > 0 && is_freespace_inode);
1960  	if (ret != 0)
1961  		goto out;
1962  
1963  	if (args->free_path) {
1964  		/*
1965  		 * We don't need the path anymore, plus through the
1966  		 * btrfs_lookup_csums_list() call below we will end up allocating
1967  		 * another path. So free the path to avoid unnecessary extra
1968  		 * memory usage.
1969  		 */
1970  		btrfs_free_path(path);
1971  		path = NULL;
1972  	}
1973  
1974  	/* If there are pending snapshots for this root, we must COW. */
1975  	if (args->writeback_path && !is_freespace_inode &&
1976  	    atomic_read(&root->snapshot_force_cow))
1977  		goto out;
1978  
1979  	args->file_extent.num_bytes = min(args->end + 1, extent_end) - args->start;
1980  	args->file_extent.offset += args->start - key->offset;
1981  	io_start = args->file_extent.disk_bytenr + args->file_extent.offset;
1982  
1983  	/*
1984  	 * Force COW if csums exist in the range. This ensures that csums for a
1985  	 * given extent are either valid or do not exist.
1986  	 */
1987  
1988  	csum_root = btrfs_csum_root(root->fs_info, io_start);
1989  	ret = btrfs_lookup_csums_list(csum_root, io_start,
1990  				      io_start + args->file_extent.num_bytes - 1,
1991  				      NULL, nowait);
1992  	WARN_ON_ONCE(ret > 0 && is_freespace_inode);
1993  	if (ret != 0)
1994  		goto out;
1995  
1996  	can_nocow = 1;
1997   out:
1998  	if (args->free_path && path)
1999  		btrfs_free_path(path);
2000  
2001  	return ret < 0 ? ret : can_nocow;
2002  }
2003  
2004  /*
2005   * Called for NOCOW writeback.  This checks for snapshots or COW copies
2006   * of the extents that exist in the file, and COWs the file as required.
2007   *
2008   * If no COW copies or snapshots exist, we write directly to the existing
2009   * blocks on disk.
2010   */
2011  static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
2012  				       struct folio *locked_folio,
2013  				       const u64 start, const u64 end)
2014  {
2015  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
2016  	struct btrfs_root *root = inode->root;
2017  	struct btrfs_path *path;
2018  	u64 cow_start = (u64)-1;
2019  	u64 cur_offset = start;
2020  	int ret;
2021  	bool check_prev = true;
2022  	u64 ino = btrfs_ino(inode);
2023  	struct can_nocow_file_extent_args nocow_args = { 0 };
2024  
2025  	/*
2026  	 * Normally on a zoned device we're only doing COW writes, but
2027  	 * relocation on a zoned filesystem serializes I/O so that we're only
2028  	 * writing sequentially, and thus we can end up here as well.
2029  	 */
2030  	ASSERT(!btrfs_is_zoned(fs_info) || btrfs_is_data_reloc_root(root));
2031  
2032  	path = btrfs_alloc_path();
2033  	if (!path) {
2034  		ret = -ENOMEM;
2035  		goto error;
2036  	}
2037  
2038  	nocow_args.end = end;
2039  	nocow_args.writeback_path = true;
2040  
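	/*
	 * Walk the file extent items covering [start, end].  Only
	 * nocow_args.start changes per iteration; the rest of nocow_args was
	 * set up once above.
	 */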
2041  	while (cur_offset <= end) {
2042  		struct btrfs_block_group *nocow_bg = NULL;
2043  		struct btrfs_ordered_extent *ordered;
2044  		struct btrfs_key found_key;
2045  		struct btrfs_file_extent_item *fi;
2046  		struct extent_buffer *leaf;
2047  		struct extent_state *cached_state = NULL;
2048  		u64 extent_end;
2049  		u64 nocow_end;
2050  		int extent_type;
2051  		bool is_prealloc;
2052  
2053  		ret = btrfs_lookup_file_extent(NULL, root, path, ino,
2054  					       cur_offset, 0);
2055  		if (ret < 0)
2056  			goto error;
2057  
2058  		/*
2059  		 * If there is no extent for our range when doing the initial
2060  		 * search, then go back to the previous slot as it will be the
2061  		 * one containing the search offset
2062  		 */
2063  		if (ret > 0 && path->slots[0] > 0 && check_prev) {
2064  			leaf = path->nodes[0];
2065  			btrfs_item_key_to_cpu(leaf, &found_key,
2066  					      path->slots[0] - 1);
2067  			if (found_key.objectid == ino &&
2068  			    found_key.type == BTRFS_EXTENT_DATA_KEY)
2069  				path->slots[0]--;
2070  		}
2071  		check_prev = false;
2072  next_slot:
2073  		/* Go to next leaf if we have exhausted the current one */
2074  		leaf = path->nodes[0];
2075  		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2076  			ret = btrfs_next_leaf(root, path);
2077  			if (ret < 0)
2078  				goto error;
2079  			if (ret > 0)
2080  				break;
2081  			leaf = path->nodes[0];
2082  		}
2083  
2084  		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2085  
2086  		/* Didn't find anything for our INO */
2087  		if (found_key.objectid > ino)
2088  			break;
2089  		/*
2090  		 * Keep searching until we find an EXTENT_ITEM or there are no
2091  		 * more extents for this inode
2092  		 */
2093  		if (WARN_ON_ONCE(found_key.objectid < ino) ||
2094  		    found_key.type < BTRFS_EXTENT_DATA_KEY) {
2095  			path->slots[0]++;
2096  			goto next_slot;
2097  		}
2098  
2099  		/* Found key is not EXTENT_DATA_KEY or starts after req range */
2100  		if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
2101  		    found_key.offset > end)
2102  			break;
2103  
2104  		/*
2105  		 * If the found extent starts after requested offset, then
2106  		 * adjust extent_end to be right before this extent begins
2107  		 */
2108  		if (found_key.offset > cur_offset) {
2109  			extent_end = found_key.offset;
2110  			extent_type = 0;
2111  			goto must_cow;
2112  		}
2113  
2114  		/*
2115  		 * Found an extent which begins before our range and potentially
2116  		 * intersects it.
2117  		 */
2118  		fi = btrfs_item_ptr(leaf, path->slots[0],
2119  				    struct btrfs_file_extent_item);
2120  		extent_type = btrfs_file_extent_type(leaf, fi);
2121  		/* If this is triggered then we have a memory corruption. */
2122  		ASSERT(extent_type < BTRFS_NR_FILE_EXTENT_TYPES);
2123  		if (WARN_ON(extent_type >= BTRFS_NR_FILE_EXTENT_TYPES)) {
2124  			ret = -EUCLEAN;
2125  			goto error;
2126  		}
2127  		extent_end = btrfs_file_extent_end(path);
2128  
2129  		/*
2130  		 * If the extent we got ends before our current offset, skip to
2131  		 * the next extent.
2132  		 */
2133  		if (extent_end <= cur_offset) {
2134  			path->slots[0]++;
2135  			goto next_slot;
2136  		}
2137  
2138  		nocow_args.start = cur_offset;
2139  		ret = can_nocow_file_extent(path, &found_key, inode, &nocow_args);
2140  		if (ret < 0)
2141  			goto error;
2142  		if (ret == 0)
2143  			goto must_cow;
2144  
2145  		ret = 0;
2146  		nocow_bg = btrfs_inc_nocow_writers(fs_info,
2147  				nocow_args.file_extent.disk_bytenr +
2148  				nocow_args.file_extent.offset);
2149  		if (!nocow_bg) {
2150  must_cow:
2151  			/*
2152  			 * If we can't perform NOCOW writeback for the range,
2153  			 * then record the beginning of the range that needs to
2154  			 * be COWed.  It will be written out before the next
2155  			 * NOCOW range if we find one, or when exiting this
2156  			 * loop.
2157  			 */
2158  			if (cow_start == (u64)-1)
2159  				cow_start = cur_offset;
2160  			cur_offset = extent_end;
2161  			if (cur_offset > end)
2162  				break;
2163  			if (!path->nodes[0])
2164  				continue;
2165  			path->slots[0]++;
2166  			goto next_slot;
2167  		}
2168  
2169  		/*
2170  		 * COW the range from cow_start to found_key.offset - 1, as the
2171  		 * key marks the start of the first extent that can be NOCOW'ed
2172  		 * and follows a range that needs to be COW'ed.
2173  		 */
2174  		if (cow_start != (u64)-1) {
2175  			ret = fallback_to_cow(inode, locked_folio, cow_start,
2176  					      found_key.offset - 1);
2177  			cow_start = (u64)-1;
2178  			if (ret) {
2179  				btrfs_dec_nocow_writers(nocow_bg);
2180  				goto error;
2181  			}
2182  		}
2183  
2184  		nocow_end = cur_offset + nocow_args.file_extent.num_bytes - 1;
2185  		lock_extent(&inode->io_tree, cur_offset, nocow_end, &cached_state);
2186  
2187  		is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC;
2188  		if (is_prealloc) {
2189  			struct extent_map *em;
2190  
2191  			em = btrfs_create_io_em(inode, cur_offset,
2192  						&nocow_args.file_extent,
2193  						BTRFS_ORDERED_PREALLOC);
2194  			if (IS_ERR(em)) {
2195  				unlock_extent(&inode->io_tree, cur_offset,
2196  					      nocow_end, &cached_state);
2197  				btrfs_dec_nocow_writers(nocow_bg);
2198  				ret = PTR_ERR(em);
2199  				goto error;
2200  			}
2201  			free_extent_map(em);
2202  		}
2203  
2204  		ordered = btrfs_alloc_ordered_extent(inode, cur_offset,
2205  				&nocow_args.file_extent,
2206  				is_prealloc
2207  				? (1 << BTRFS_ORDERED_PREALLOC)
2208  				: (1 << BTRFS_ORDERED_NOCOW));
2209  		btrfs_dec_nocow_writers(nocow_bg);
2210  		if (IS_ERR(ordered)) {
2211  			if (is_prealloc) {
2212  				btrfs_drop_extent_map_range(inode, cur_offset,
2213  							    nocow_end, false);
2214  			}
2215  			unlock_extent(&inode->io_tree, cur_offset,
2216  				      nocow_end, &cached_state);
2217  			ret = PTR_ERR(ordered);
2218  			goto error;
2219  		}
2220  
2221  		if (btrfs_is_data_reloc_root(root))
2222  			/*
2223  			 * Error handled later, as we must prevent
2224  			 * extent_clear_unlock_delalloc() in error handler
2225  			 * from freeing metadata of created ordered extent.
2226  			 */
2227  			ret = btrfs_reloc_clone_csums(ordered);
2228  		btrfs_put_ordered_extent(ordered);
2229  
2230  		extent_clear_unlock_delalloc(inode, cur_offset, nocow_end,
2231  					     locked_folio, &cached_state,
2232  					     EXTENT_LOCKED | EXTENT_DELALLOC |
2233  					     EXTENT_CLEAR_DATA_RESV,
2234  					     PAGE_UNLOCK | PAGE_SET_ORDERED);
2235  
2236  		cur_offset = extent_end;
2237  
2238  		/*
2239  		 * btrfs_reloc_clone_csums() error, now we're OK to call error
2240  		 * handler, as metadata for created ordered extent will only
2241  		 * be freed by btrfs_finish_ordered_io().
2242  		 */
2243  		if (ret)
2244  			goto error;
2245  	}
2246  	btrfs_release_path(path);
2247  
2248  	if (cur_offset <= end && cow_start == (u64)-1)
2249  		cow_start = cur_offset;
2250  
2251  	if (cow_start != (u64)-1) {
2252  		cur_offset = end;
2253  		ret = fallback_to_cow(inode, locked_folio, cow_start, end);
2254  		cow_start = (u64)-1;
2255  		if (ret)
2256  			goto error;
2257  	}
2258  
2259  	btrfs_free_path(path);
2260  	return 0;
2261  
2262  error:
2263  	/*
2264  	 * If an error happened while a COW region is outstanding, cur_offset
2265  	 * needs to be reset to cow_start to ensure the COW region is unlocked
2266  	 * as well.
2267  	 */
2268  	if (cow_start != (u64)-1)
2269  		cur_offset = cow_start;
2270  
2271  	/*
2272  	 * We need to lock the extent here because we're clearing DELALLOC and
2273  	 * we're not locked at this point.
2274  	 */
2275  	if (cur_offset < end) {
2276  		struct extent_state *cached = NULL;
2277  
2278  		lock_extent(&inode->io_tree, cur_offset, end, &cached);
2279  		extent_clear_unlock_delalloc(inode, cur_offset, end,
2280  					     locked_folio, &cached,
2281  					     EXTENT_LOCKED | EXTENT_DELALLOC |
2282  					     EXTENT_DEFRAG |
2283  					     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
2284  					     PAGE_START_WRITEBACK |
2285  					     PAGE_END_WRITEBACK);
2286  		btrfs_qgroup_free_data(inode, NULL, cur_offset, end - cur_offset + 1, NULL);
2287  	}
2288  	btrfs_free_path(path);
2289  	return ret;
2290  }
2291  
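/*
 * NOCOW writeback is only attempted for inodes flagged NODATACOW or PREALLOC,
 * and even then not for ranges marked with EXTENT_DEFRAG, which must be COWed.
 */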
2292  static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
2293  {
2294  	if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) {
2295  		if (inode->defrag_bytes &&
2296  		    test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG))
2297  			return false;
2298  		return true;
2299  	}
2300  	return false;
2301  }
2302  
2303  /*
2304   * Function to process delayed allocation (create CoW) for ranges which are
2305   * being touched for the first time.
2306   */
2307  int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_folio,
2308  			     u64 start, u64 end, struct writeback_control *wbc)
2309  {
2310  	const bool zoned = btrfs_is_zoned(inode->root->fs_info);
2311  	int ret;
2312  
2313  	/*
2314  	 * The range must cover part of the @locked_folio, or a return of 1
2315  	 * can confuse the caller.
2316  	 */
2317  	ASSERT(!(end <= folio_pos(locked_folio) ||
2318  		 start >= folio_pos(locked_folio) + folio_size(locked_folio)));
2319  
2320  	if (should_nocow(inode, start, end)) {
2321  		ret = run_delalloc_nocow(inode, locked_folio, start, end);
2322  		goto out;
2323  	}
2324  
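	/*
	 * Compressed writeback is handled asynchronously.  Returning 1 tells
	 * the caller the whole range, including @locked_folio, has already
	 * been handed off for writeback.
	 */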
2325  	if (btrfs_inode_can_compress(inode) &&
2326  	    inode_need_compress(inode, start, end) &&
2327  	    run_delalloc_compressed(inode, locked_folio, start, end, wbc))
2328  		return 1;
2329  
2330  	if (zoned)
2331  		ret = run_delalloc_cow(inode, locked_folio, start, end, wbc,
2332  				       true);
2333  	else
2334  		ret = cow_file_range(inode, locked_folio, start, end, NULL,
2335  				     false, false);
2336  
2337  out:
2338  	if (ret < 0)
2339  		btrfs_cleanup_ordered_extents(inode, locked_folio, start,
2340  					      end - start + 1);
2341  	return ret;
2342  }
2343  
2344  void btrfs_split_delalloc_extent(struct btrfs_inode *inode,
2345  				 struct extent_state *orig, u64 split)
2346  {
2347  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
2348  	u64 size;
2349  
2350  	lockdep_assert_held(&inode->io_tree.lock);
2351  
2352  	/* not delalloc, ignore it */
2353  	if (!(orig->state & EXTENT_DELALLOC))
2354  		return;
2355  
2356  	size = orig->end - orig->start + 1;
2357  	if (size > fs_info->max_extent_size) {
2358  		u32 num_extents;
2359  		u64 new_size;
2360  
2361  		/*
2362  		 * See the explanation in btrfs_merge_delalloc_extent, the same
2363  		 * applies here, just in reverse.
2364  		 */
2365  		new_size = orig->end - split + 1;
2366  		num_extents = count_max_extents(fs_info, new_size);
2367  		new_size = split - orig->start;
2368  		num_extents += count_max_extents(fs_info, new_size);
2369  		if (count_max_extents(fs_info, size) >= num_extents)
2370  			return;
2371  	}
2372  
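	/*
	 * The split leaves us needing one more outstanding extent than was
	 * accounted for the original range, so add it here.
	 */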
2373  	spin_lock(&inode->lock);
2374  	btrfs_mod_outstanding_extents(inode, 1);
2375  	spin_unlock(&inode->lock);
2376  }
2377  
2378  /*
2379   * Handle merged delayed allocation extents so we can keep track of new extents
2380   * that are just merged onto old extents, such as when we are doing sequential
2381   * writes, so we can properly account for the metadata space we'll need.
2382   */
2383  void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new,
2384  				 struct extent_state *other)
2385  {
2386  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
2387  	u64 new_size, old_size;
2388  	u32 num_extents;
2389  
2390  	lockdep_assert_held(&inode->io_tree.lock);
2391  
2392  	/* not delalloc, ignore it */
2393  	if (!(other->state & EXTENT_DELALLOC))
2394  		return;
2395  
2396  	if (new->start > other->start)
2397  		new_size = new->end - other->start + 1;
2398  	else
2399  		new_size = other->end - new->start + 1;
2400  
2401  	/* we're not bigger than the max, unreserve the space and go */
2402  	if (new_size <= fs_info->max_extent_size) {
2403  		spin_lock(&inode->lock);
2404  		btrfs_mod_outstanding_extents(inode, -1);
2405  		spin_unlock(&inode->lock);
2406  		return;
2407  	}
2408  
2409  	/*
2410  	 * We have to add up either side to figure out how many extents were
2411  	 * accounted for before we merged into one big extent.  If the number of
2412  	 * extents we accounted for is <= the amount we need for the new range
2413  	 * then we can return, otherwise drop.  Think of it like this
2414  	 *
2415  	 * [ 4k][MAX_SIZE]
2416  	 *
2417  	 * So we've grown the extent by a MAX_SIZE extent, this would mean we
2418  	 * need 2 outstanding extents, on one side we have 1 and the other side
2419  	 * we have 1 so they are == and we can return.  But in this case
2420  	 *
2421  	 * [MAX_SIZE+4k][MAX_SIZE+4k]
2422  	 *
2423  	 * Each range on their own accounts for 2 extents, but merged together
2424  	 * they are only 3 extents worth of accounting, so we need to drop in
2425  	 * this case.
2426  	 */
2427  	old_size = other->end - other->start + 1;
2428  	num_extents = count_max_extents(fs_info, old_size);
2429  	old_size = new->end - new->start + 1;
2430  	num_extents += count_max_extents(fs_info, old_size);
2431  	if (count_max_extents(fs_info, new_size) >= num_extents)
2432  		return;
2433  
2434  	spin_lock(&inode->lock);
2435  	btrfs_mod_outstanding_extents(inode, -1);
2436  	spin_unlock(&inode->lock);
2437  }
2438  
2439  static void btrfs_add_delalloc_inode(struct btrfs_inode *inode)
2440  {
2441  	struct btrfs_root *root = inode->root;
2442  	struct btrfs_fs_info *fs_info = root->fs_info;
2443  
2444  	spin_lock(&root->delalloc_lock);
2445  	ASSERT(list_empty(&inode->delalloc_inodes));
2446  	list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes);
2447  	root->nr_delalloc_inodes++;
2448  	if (root->nr_delalloc_inodes == 1) {
2449  		spin_lock(&fs_info->delalloc_root_lock);
2450  		ASSERT(list_empty(&root->delalloc_root));
2451  		list_add_tail(&root->delalloc_root, &fs_info->delalloc_roots);
2452  		spin_unlock(&fs_info->delalloc_root_lock);
2453  	}
2454  	spin_unlock(&root->delalloc_lock);
2455  }
2456  
2457  void btrfs_del_delalloc_inode(struct btrfs_inode *inode)
2458  {
2459  	struct btrfs_root *root = inode->root;
2460  	struct btrfs_fs_info *fs_info = root->fs_info;
2461  
2462  	lockdep_assert_held(&root->delalloc_lock);
2463  
2464  	/*
2465  	 * We may be called after the inode was already deleted from the list,
2466  	 * namely in the transaction abort path btrfs_destroy_delalloc_inodes(),
2467  	 * and then later through btrfs_clear_delalloc_extent() while the inode
2468  	 * still has ->delalloc_bytes > 0.
2469  	 */
2470  	if (!list_empty(&inode->delalloc_inodes)) {
2471  		list_del_init(&inode->delalloc_inodes);
2472  		root->nr_delalloc_inodes--;
2473  		if (!root->nr_delalloc_inodes) {
2474  			ASSERT(list_empty(&root->delalloc_inodes));
2475  			spin_lock(&fs_info->delalloc_root_lock);
2476  			ASSERT(!list_empty(&root->delalloc_root));
2477  			list_del_init(&root->delalloc_root);
2478  			spin_unlock(&fs_info->delalloc_root_lock);
2479  		}
2480  	}
2481  }
2482  
2483  /*
2484   * Properly track delayed allocation bytes in the inode and maintain the
2485   * list of inodes that have pending delalloc work to be done.
2486   */
2487  void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state,
2488  			       u32 bits)
2489  {
2490  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
2491  
2492  	lockdep_assert_held(&inode->io_tree.lock);
2493  
2494  	if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC))
2495  		WARN_ON(1);
2496  	/*
2497  	 * set_bit and clear_bit hooks normally require _irqsave/restore
2498  	 * but in this case, we are only testing for the DELALLOC
2499  	 * bit, which is only set or cleared with irqs on
2500  	 */
2501  	if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
2502  		u64 len = state->end + 1 - state->start;
2503  		u64 prev_delalloc_bytes;
2504  		u32 num_extents = count_max_extents(fs_info, len);
2505  
2506  		spin_lock(&inode->lock);
2507  		btrfs_mod_outstanding_extents(inode, num_extents);
2508  		spin_unlock(&inode->lock);
2509  
2510  		/* For sanity tests */
2511  		if (btrfs_is_testing(fs_info))
2512  			return;
2513  
2514  		percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
2515  					 fs_info->delalloc_batch);
2516  		spin_lock(&inode->lock);
2517  		prev_delalloc_bytes = inode->delalloc_bytes;
2518  		inode->delalloc_bytes += len;
2519  		if (bits & EXTENT_DEFRAG)
2520  			inode->defrag_bytes += len;
2521  		spin_unlock(&inode->lock);
2522  
2523  		/*
2524  		 * We don't need to be under the protection of the inode's lock,
2525  		 * because we are called while holding the inode's io_tree lock
2526  		 * and are therefore protected against concurrent calls of this
2527  		 * function and btrfs_clear_delalloc_extent().
2528  		 */
2529  		if (!btrfs_is_free_space_inode(inode) && prev_delalloc_bytes == 0)
2530  			btrfs_add_delalloc_inode(inode);
2531  	}
2532  
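	/*
	 * Separately track bytes newly marked with EXTENT_DELALLOC_NEW; they
	 * are added to the inode's byte count only when the bit is cleared
	 * together with EXTENT_ADD_INODE_BYTES (see btrfs_clear_delalloc_extent()).
	 */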
2533  	if (!(state->state & EXTENT_DELALLOC_NEW) &&
2534  	    (bits & EXTENT_DELALLOC_NEW)) {
2535  		spin_lock(&inode->lock);
2536  		inode->new_delalloc_bytes += state->end + 1 - state->start;
2537  		spin_unlock(&inode->lock);
2538  	}
2539  }
2540  
2541  /*
2542   * Once a range is no longer delalloc this function ensures that proper
2543   * accounting happens.
2544   */
2545  void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
2546  				 struct extent_state *state, u32 bits)
2547  {
2548  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
2549  	u64 len = state->end + 1 - state->start;
2550  	u32 num_extents = count_max_extents(fs_info, len);
2551  
2552  	lockdep_assert_held(&inode->io_tree.lock);
2553  
2554  	if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) {
2555  		spin_lock(&inode->lock);
2556  		inode->defrag_bytes -= len;
2557  		spin_unlock(&inode->lock);
2558  	}
2559  
2560  	/*
2561  	 * set_bit and clear_bit hooks normally require _irqsave/restore
2562  	 * but in this case, we are only testing for the DELALLOC
2563  	 * bit, which is only set or cleared with irqs on
2564  	 */
2565  	if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
2566  		struct btrfs_root *root = inode->root;
2567  		u64 new_delalloc_bytes;
2568  
2569  		spin_lock(&inode->lock);
2570  		btrfs_mod_outstanding_extents(inode, -num_extents);
2571  		spin_unlock(&inode->lock);
2572  
2573  		/*
2574  		 * We don't reserve metadata space for space cache inodes so we
2575  		 * don't need to call delalloc_release_metadata if there is an
2576  		 * error.
2577  		 */
2578  		if (bits & EXTENT_CLEAR_META_RESV &&
2579  		    root != fs_info->tree_root)
2580  			btrfs_delalloc_release_metadata(inode, len, true);
2581  
2582  		/* For sanity tests. */
2583  		if (btrfs_is_testing(fs_info))
2584  			return;
2585  
2586  		if (!btrfs_is_data_reloc_root(root) &&
2587  		    !btrfs_is_free_space_inode(inode) &&
2588  		    !(state->state & EXTENT_NORESERVE) &&
2589  		    (bits & EXTENT_CLEAR_DATA_RESV))
2590  			btrfs_free_reserved_data_space_noquota(fs_info, len);
2591  
2592  		percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
2593  					 fs_info->delalloc_batch);
2594  		spin_lock(&inode->lock);
2595  		inode->delalloc_bytes -= len;
2596  		new_delalloc_bytes = inode->delalloc_bytes;
2597  		spin_unlock(&inode->lock);
2598  
2599  		/*
2600  		 * We don't need to be under the protection of the inode's lock,
2601  		 * because we are called while holding the inode's io_tree lock
2602  		 * and are therefore protected against concurrent calls of this
2603  		 * function and btrfs_set_delalloc_extent().
2604  		 */
2605  		if (!btrfs_is_free_space_inode(inode) && new_delalloc_bytes == 0) {
2606  			spin_lock(&root->delalloc_lock);
2607  			btrfs_del_delalloc_inode(inode);
2608  			spin_unlock(&root->delalloc_lock);
2609  		}
2610  	}
2611  
2612  	if ((state->state & EXTENT_DELALLOC_NEW) &&
2613  	    (bits & EXTENT_DELALLOC_NEW)) {
2614  		spin_lock(&inode->lock);
2615  		ASSERT(inode->new_delalloc_bytes >= len);
2616  		inode->new_delalloc_bytes -= len;
2617  		if (bits & EXTENT_ADD_INODE_BYTES)
2618  			inode_add_bytes(&inode->vfs_inode, len);
2619  		spin_unlock(&inode->lock);
2620  	}
2621  }
2622  
2623  /*
2624   * Given a list of ordered sums, record them in the inode.  This happens
2625   * at IO completion time based on sums calculated at bio submission time.
2626   */
2627  static int add_pending_csums(struct btrfs_trans_handle *trans,
2628  			     struct list_head *list)
2629  {
2630  	struct btrfs_ordered_sum *sum;
2631  	struct btrfs_root *csum_root = NULL;
2632  	int ret;
2633  
2634  	list_for_each_entry(sum, list, list) {
2635  		trans->adding_csums = true;
2636  		if (!csum_root)
2637  			csum_root = btrfs_csum_root(trans->fs_info,
2638  						    sum->logical);
2639  		ret = btrfs_csum_file_blocks(trans, csum_root, sum);
2640  		trans->adding_csums = false;
2641  		if (ret)
2642  			return ret;
2643  	}
2644  	return 0;
2645  }
2646  
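/*
 * Walk the extent maps covering [start, start + len) and mark any hole ranges
 * found there with the EXTENT_DELALLOC_NEW bit.
 */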
2647  static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
2648  					 const u64 start,
2649  					 const u64 len,
2650  					 struct extent_state **cached_state)
2651  {
2652  	u64 search_start = start;
2653  	const u64 end = start + len - 1;
2654  
2655  	while (search_start < end) {
2656  		const u64 search_len = end - search_start + 1;
2657  		struct extent_map *em;
2658  		u64 em_len;
2659  		int ret = 0;
2660  
2661  		em = btrfs_get_extent(inode, NULL, search_start, search_len);
2662  		if (IS_ERR(em))
2663  			return PTR_ERR(em);
2664  
2665  		if (em->disk_bytenr != EXTENT_MAP_HOLE)
2666  			goto next;
2667  
2668  		em_len = em->len;
2669  		if (em->start < search_start)
2670  			em_len -= search_start - em->start;
2671  		if (em_len > search_len)
2672  			em_len = search_len;
2673  
2674  		ret = set_extent_bit(&inode->io_tree, search_start,
2675  				     search_start + em_len - 1,
2676  				     EXTENT_DELALLOC_NEW, cached_state);
2677  next:
2678  		search_start = extent_map_end(em);
2679  		free_extent_map(em);
2680  		if (ret)
2681  			return ret;
2682  	}
2683  	return 0;
2684  }
2685  
2686  int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
2687  			      unsigned int extra_bits,
2688  			      struct extent_state **cached_state)
2689  {
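	/* @end is an inclusive (last byte) offset, so it must never be page aligned. */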
2690  	WARN_ON(PAGE_ALIGNED(end));
2691  
2692  	if (start >= i_size_read(&inode->vfs_inode) &&
2693  	    !(inode->flags & BTRFS_INODE_PREALLOC)) {
2694  		/*
2695  		 * There can't be any extents following eof in this case so just
2696  		 * set the delalloc new bit for the range directly.
2697  		 */
2698  		extra_bits |= EXTENT_DELALLOC_NEW;
2699  	} else {
2700  		int ret;
2701  
2702  		ret = btrfs_find_new_delalloc_bytes(inode, start,
2703  						    end + 1 - start,
2704  						    cached_state);
2705  		if (ret)
2706  			return ret;
2707  	}
2708  
2709  	return set_extent_bit(&inode->io_tree, start, end,
2710  			      EXTENT_DELALLOC | extra_bits, cached_state);
2711  }
2712  
2713  /* see btrfs_writepage_start_hook for details on why this is required */
2714  struct btrfs_writepage_fixup {
2715  	struct folio *folio;
2716  	struct btrfs_inode *inode;
2717  	struct btrfs_work work;
2718  };
2719  
2720  static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
2721  {
2722  	struct btrfs_writepage_fixup *fixup =
2723  		container_of(work, struct btrfs_writepage_fixup, work);
2724  	struct btrfs_ordered_extent *ordered;
2725  	struct extent_state *cached_state = NULL;
2726  	struct extent_changeset *data_reserved = NULL;
2727  	struct folio *folio = fixup->folio;
2728  	struct btrfs_inode *inode = fixup->inode;
2729  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
2730  	u64 page_start = folio_pos(folio);
2731  	u64 page_end = folio_pos(folio) + folio_size(folio) - 1;
2732  	int ret = 0;
2733  	bool free_delalloc_space = true;
2734  
2735  	/*
2736  	 * This is similar to page_mkwrite, we need to reserve the space before
2737  	 * we take the folio lock.
2738  	 */
2739  	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
2740  					   folio_size(folio));
2741  again:
2742  	folio_lock(folio);
2743  
2744  	/*
2745  	 * Before we queued this fixup, we took a reference on the folio.
2746  	 * folio->mapping may go NULL, but it shouldn't be moved to a different
2747  	 * address space.
2748  	 */
2749  	if (!folio->mapping || !folio_test_dirty(folio) ||
2750  	    !folio_test_checked(folio)) {
2751  		/*
2752  		 * Unfortunately this is a little tricky, either
2753  		 *
2754  		 * 1) We got here and our folio had already been dealt with and
2755  		 *    we reserved our space, thus ret == 0, so we need to just
2756  		 *    drop our space reservation and bail.  This can happen the
2757  		 *    first time we come into the fixup worker, or could happen
2758  		 *    while waiting for the ordered extent.
2759  		 * 2) Our folio was already dealt with, but we happened to get an
2760  		 *    ENOSPC above from the btrfs_delalloc_reserve_space.  In
2761  		 *    this case we obviously don't have anything to release, but
2762  		 *    because the folio was already dealt with we don't want to
2763  		 *    mark the folio with an error, so make sure we're resetting
2764  		 *    ret to 0.  This is why we have this check _before_ the ret
2765  		 *    check, because we do not want to have a surprise ENOSPC
2766  		 *    when the folio was already properly dealt with.
2767  		 */
2768  		if (!ret) {
2769  			btrfs_delalloc_release_extents(inode, folio_size(folio));
2770  			btrfs_delalloc_release_space(inode, data_reserved,
2771  						     page_start, folio_size(folio),
2772  						     true);
2773  		}
2774  		ret = 0;
2775  		goto out_page;
2776  	}
2777  
2778  	/*
2779  	 * We can't mess with the folio state unless it is locked, so now that
2780  	 * it is locked bail if we failed to make our space reservation.
2781  	 */
2782  	if (ret)
2783  		goto out_page;
2784  
2785  	lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
2786  
2787  	/* already ordered? We're done */
2788  	if (folio_test_ordered(folio))
2789  		goto out_reserved;
2790  
2791  	ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
2792  	if (ordered) {
2793  		unlock_extent(&inode->io_tree, page_start, page_end,
2794  			      &cached_state);
2795  		folio_unlock(folio);
2796  		btrfs_start_ordered_extent(ordered);
2797  		btrfs_put_ordered_extent(ordered);
2798  		goto again;
2799  	}
2800  
2801  	ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
2802  					&cached_state);
2803  	if (ret)
2804  		goto out_reserved;
2805  
2806  	/*
2807  	 * Everything went as planned, we're now the owner of a dirty page with
2808  	 * delayed allocation bits set and space reserved for our COW
2809  	 * destination.
2810  	 *
2811  	 * The page was dirty when we started, nothing should have cleaned it.
2812  	 */
2813  	BUG_ON(!folio_test_dirty(folio));
2814  	free_delalloc_space = false;
2815  out_reserved:
2816  	btrfs_delalloc_release_extents(inode, PAGE_SIZE);
2817  	if (free_delalloc_space)
2818  		btrfs_delalloc_release_space(inode, data_reserved, page_start,
2819  					     PAGE_SIZE, true);
2820  	unlock_extent(&inode->io_tree, page_start, page_end, &cached_state);
2821  out_page:
2822  	if (ret) {
2823  		/*
2824  		 * We hit ENOSPC or other errors.  Update the mapping and page
2825  		 * to reflect the errors and clean the page.
2826  		 */
2827  		mapping_set_error(folio->mapping, ret);
2828  		btrfs_mark_ordered_io_finished(inode, folio, page_start,
2829  					       folio_size(folio), !ret);
2830  		folio_clear_dirty_for_io(folio);
2831  	}
2832  	btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE);
2833  	folio_unlock(folio);
2834  	folio_put(folio);
2835  	kfree(fixup);
2836  	extent_changeset_free(data_reserved);
2837  	/*
2838  	 * As a precaution, do a delayed iput in case it would be the last iput
2839  	 * that could need flushing space. Recursing back to fixup worker would
2840  	 * deadlock.
2841  	 */
2842  	btrfs_add_delayed_iput(inode);
2843  }
2844  
2845  /*
2846   * There are a few paths in the higher layers of the kernel that directly
2847   * set the folio dirty bit without asking the filesystem if it is a
2848   * good idea.  This causes problems because we want to make sure COW
2849   * properly happens and the data=ordered rules are followed.
2850   *
2851   * In our case any range that doesn't have the ORDERED bit set
2852   * hasn't been properly setup for IO.  We kick off an async process
2853   * hasn't been properly set up for IO.  We kick off an async process
2854   * the delalloc bit and make it safe to write the folio.
2855   */
2856  int btrfs_writepage_cow_fixup(struct folio *folio)
2857  {
2858  	struct inode *inode = folio->mapping->host;
2859  	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
2860  	struct btrfs_writepage_fixup *fixup;
2861  
2862  	/* This folio has ordered extent covering it already */
2863  	if (folio_test_ordered(folio))
2864  		return 0;
2865  
2866  	/*
2867  	 * folio_checked is set below when we create a fixup worker for this
2868  	 * folio, don't try to create another one if we're already
2869  	 * folio_test_checked.
2870  	 *
2871  	 * The extent_io writepage code will redirty the folio if we send back
2872  	 * EAGAIN.
2873  	 */
2874  	if (folio_test_checked(folio))
2875  		return -EAGAIN;
2876  
2877  	fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
2878  	if (!fixup)
2879  		return -EAGAIN;
2880  
2881  	/*
2882  	 * We are already holding a reference to this inode from
2883  	 * write_cache_pages.  We need to hold it because the space reservation
2884  	 * takes place outside of the folio lock, and we can't trust
2885  	 * page->mapping outside of the folio lock.
2886  	 */
2887  	ihold(inode);
2888  	btrfs_folio_set_checked(fs_info, folio, folio_pos(folio), folio_size(folio));
2889  	folio_get(folio);
2890  	btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL);
2891  	fixup->folio = folio;
2892  	fixup->inode = BTRFS_I(inode);
2893  	btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
2894  
2895  	return -EAGAIN;
2896  }
2897  
2898  static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
2899  				       struct btrfs_inode *inode, u64 file_pos,
2900  				       struct btrfs_file_extent_item *stack_fi,
2901  				       const bool update_inode_bytes,
2902  				       u64 qgroup_reserved)
2903  {
2904  	struct btrfs_root *root = inode->root;
2905  	const u64 sectorsize = root->fs_info->sectorsize;
2906  	struct btrfs_path *path;
2907  	struct extent_buffer *leaf;
2908  	struct btrfs_key ins;
2909  	u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi);
2910  	u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi);
2911  	u64 offset = btrfs_stack_file_extent_offset(stack_fi);
2912  	u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi);
2913  	u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi);
2914  	struct btrfs_drop_extents_args drop_args = { 0 };
2915  	int ret;
2916  
2917  	path = btrfs_alloc_path();
2918  	if (!path)
2919  		return -ENOMEM;
2920  
2921  	/*
2922  	 * We may be replacing one extent in the tree with another.
2923  	 * The new extent is pinned in the extent map, and we don't want
2924  	 * to drop it from the cache until it is completely in the btree.
2925  	 *
2926  	 * So, tell btrfs_drop_extents to leave this extent in the cache.
2927  	 * The caller is expected to unpin it and allow it to be merged
2928  	 * with the others.
2929  	 */
2930  	drop_args.path = path;
2931  	drop_args.start = file_pos;
2932  	drop_args.end = file_pos + num_bytes;
2933  	drop_args.replace_extent = true;
2934  	drop_args.extent_item_size = sizeof(*stack_fi);
2935  	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
2936  	if (ret)
2937  		goto out;
2938  
2939  	if (!drop_args.extent_inserted) {
2940  		ins.objectid = btrfs_ino(inode);
2941  		ins.offset = file_pos;
2942  		ins.type = BTRFS_EXTENT_DATA_KEY;
2943  
2944  		ret = btrfs_insert_empty_item(trans, root, path, &ins,
2945  					      sizeof(*stack_fi));
2946  		if (ret)
2947  			goto out;
2948  	}
2949  	leaf = path->nodes[0];
2950  	btrfs_set_stack_file_extent_generation(stack_fi, trans->transid);
2951  	write_extent_buffer(leaf, stack_fi,
2952  			btrfs_item_ptr_offset(leaf, path->slots[0]),
2953  			sizeof(struct btrfs_file_extent_item));
2954  
2955  	btrfs_mark_buffer_dirty(trans, leaf);
2956  	btrfs_release_path(path);
2957  
2958  	/*
2959  	 * If we dropped an inline extent here, we know the range where it was
2960  	 * located was not marked with the EXTENT_DELALLOC_NEW bit, so we update
2961  	 * the number of bytes only for the range containing the inline extent.
2962  	 * The remainder of the range will be processed when clearing the
2963  	 * EXTENT_DELALLOC bit through the ordered extent completion.
2964  	 */
2965  	if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) {
2966  		u64 inline_size = round_down(drop_args.bytes_found, sectorsize);
2967  
2968  		inline_size = drop_args.bytes_found - inline_size;
2969  		btrfs_update_inode_bytes(inode, sectorsize, inline_size);
2970  		drop_args.bytes_found -= inline_size;
2971  		num_bytes -= sectorsize;
2972  	}
2973  
2974  	if (update_inode_bytes)
2975  		btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found);
2976  
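	/*
	 * Describe the allocated data extent (disk bytenr and disk byte
	 * length) so we can record a backref for it below.
	 */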
2977  	ins.objectid = disk_bytenr;
2978  	ins.offset = disk_num_bytes;
2979  	ins.type = BTRFS_EXTENT_ITEM_KEY;
2980  
2981  	ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes);
2982  	if (ret)
2983  		goto out;
2984  
2985  	ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode),
2986  					       file_pos - offset,
2987  					       qgroup_reserved, &ins);
2988  out:
2989  	btrfs_free_path(path);
2990  
2991  	return ret;
2992  }
2993  
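/*
 * Drop the delalloc byte accounting from the block group that the range
 * [start, start + len) was allocated from.
 */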
2994  static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
2995  					 u64 start, u64 len)
2996  {
2997  	struct btrfs_block_group *cache;
2998  
2999  	cache = btrfs_lookup_block_group(fs_info, start);
3000  	ASSERT(cache);
3001  
3002  	spin_lock(&cache->lock);
3003  	cache->delalloc_bytes -= len;
3004  	spin_unlock(&cache->lock);
3005  
3006  	btrfs_put_block_group(cache);
3007  }
3008  
3009  static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
3010  					     struct btrfs_ordered_extent *oe)
3011  {
3012  	struct btrfs_file_extent_item stack_fi;
3013  	bool update_inode_bytes;
3014  	u64 num_bytes = oe->num_bytes;
3015  	u64 ram_bytes = oe->ram_bytes;
3016  
3017  	memset(&stack_fi, 0, sizeof(stack_fi));
3018  	btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG);
3019  	btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr);
3020  	btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,
3021  						   oe->disk_num_bytes);
3022  	btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset);
3023  	if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags))
3024  		num_bytes = oe->truncated_len;
3025  	btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes);
3026  	btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes);
3027  	btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
3028  	/* Encryption and other encoding is reserved and all 0 */
3029  
3030  	/*
3031  	 * For delalloc, when completing an ordered extent we update the inode's
3032  	 * bytes when clearing the range in the inode's io tree, so pass false
3033  	 * as the argument 'update_inode_bytes' to insert_reserved_file_extent(),
3034  	 * except if the ordered extent was truncated.
3035  	 */
3036  	update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) ||
3037  			     test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) ||
3038  			     test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags);
3039  
3040  	return insert_reserved_file_extent(trans, oe->inode,
3041  					   oe->file_offset, &stack_fi,
3042  					   update_inode_bytes, oe->qgroup_rsv);
3043  }
3044  
3045  /*
3046   * As ordered data IO finishes, this gets called so we can finish
3047   * an ordered extent once the range of bytes in the file it covers has
3048   * been fully written.
3049   */
3050  int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
3051  {
3052  	struct btrfs_inode *inode = ordered_extent->inode;
3053  	struct btrfs_root *root = inode->root;
3054  	struct btrfs_fs_info *fs_info = root->fs_info;
3055  	struct btrfs_trans_handle *trans = NULL;
3056  	struct extent_io_tree *io_tree = &inode->io_tree;
3057  	struct extent_state *cached_state = NULL;
3058  	u64 start, end;
3059  	int compress_type = 0;
3060  	int ret = 0;
3061  	u64 logical_len = ordered_extent->num_bytes;
3062  	bool freespace_inode;
3063  	bool truncated = false;
3064  	bool clear_reserved_extent = true;
3065  	unsigned int clear_bits = EXTENT_DEFRAG;
3066  
3067  	start = ordered_extent->file_offset;
3068  	end = start + ordered_extent->num_bytes - 1;
3069  
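	/*
	 * Only regular buffered COW writes can have EXTENT_DELALLOC_NEW set on
	 * their range; nocow, prealloc, direct and encoded writes never do, so
	 * skip queuing that bit for clearing for those.
	 */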
3070  	if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3071  	    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
3072  	    !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags) &&
3073  	    !test_bit(BTRFS_ORDERED_ENCODED, &ordered_extent->flags))
3074  		clear_bits |= EXTENT_DELALLOC_NEW;
3075  
3076  	freespace_inode = btrfs_is_free_space_inode(inode);
3077  	if (!freespace_inode)
3078  		btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent);
3079  
3080  	if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
3081  		ret = -EIO;
3082  		goto out;
3083  	}
3084  
3085  	if (btrfs_is_zoned(fs_info))
3086  		btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
3087  					ordered_extent->disk_num_bytes);
3088  
3089  	if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
3090  		truncated = true;
3091  		logical_len = ordered_extent->truncated_len;
3092  		/* Truncated the entire extent, don't bother adding */
3093  		if (!logical_len)
3094  			goto out;
3095  	}
3096  
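	/*
	 * Nocow writes land in existing file extent items and carry no new
	 * checksums (asserted below), so all that is left is updating the
	 * inode item and inserting any raid stripe extent.
	 */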
3097  	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
3098  		BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
3099  
3100  		btrfs_inode_safe_disk_i_size_write(inode, 0);
3101  		if (freespace_inode)
3102  			trans = btrfs_join_transaction_spacecache(root);
3103  		else
3104  			trans = btrfs_join_transaction(root);
3105  		if (IS_ERR(trans)) {
3106  			ret = PTR_ERR(trans);
3107  			trans = NULL;
3108  			goto out;
3109  		}
3110  		trans->block_rsv = &inode->block_rsv;
3111  		ret = btrfs_update_inode_fallback(trans, inode);
3112  		if (ret) /* -ENOMEM or corruption */
3113  			btrfs_abort_transaction(trans, ret);
3114  
3115  		ret = btrfs_insert_raid_extent(trans, ordered_extent);
3116  		if (ret)
3117  			btrfs_abort_transaction(trans, ret);
3118  
3119  		goto out;
3120  	}
3121  
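	/*
	 * Lock the file range in the io tree while inserting the file extent
	 * item and checksums. EXTENT_LOCKED is added to clear_bits so the
	 * range is unlocked again at the 'out' label.
	 */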
3122  	clear_bits |= EXTENT_LOCKED;
3123  	lock_extent(io_tree, start, end, &cached_state);
3124  
3125  	if (freespace_inode)
3126  		trans = btrfs_join_transaction_spacecache(root);
3127  	else
3128  		trans = btrfs_join_transaction(root);
3129  	if (IS_ERR(trans)) {
3130  		ret = PTR_ERR(trans);
3131  		trans = NULL;
3132  		goto out;
3133  	}
3134  
3135  	trans->block_rsv = &inode->block_rsv;
3136  
3137  	ret = btrfs_insert_raid_extent(trans, ordered_extent);
3138  	if (ret)
3139  		goto out;
3140  
3141  	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
3142  		compress_type = ordered_extent->compress_type;
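	/*
	 * Writes into preallocated extents only need the existing file extent
	 * item converted from prealloc to a regular (written) extent; all
	 * other writes get a new file extent item inserted.
	 */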
3143  	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
3144  		BUG_ON(compress_type);
3145  		ret = btrfs_mark_extent_written(trans, inode,
3146  						ordered_extent->file_offset,
3147  						ordered_extent->file_offset +
3148  						logical_len);
3149  		btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr,
3150  						  ordered_extent->disk_num_bytes);
3151  	} else {
3152  		BUG_ON(root == fs_info->tree_root);
3153  		ret = insert_ordered_extent_file_extent(trans, ordered_extent);
3154  		if (!ret) {
3155  			clear_reserved_extent = false;
3156  			btrfs_release_delalloc_bytes(fs_info,
3157  						ordered_extent->disk_bytenr,
3158  						ordered_extent->disk_num_bytes);
3159  		}
3160  	}
3161  	if (ret < 0) {
3162  		btrfs_abort_transaction(trans, ret);
3163  		goto out;
3164  	}
3165  
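	/*
	 * Unpin the extent map for this range (it was created pinned when the
	 * ordered extent was set up) so it can now be merged with its
	 * neighbours.
	 */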
3166  	ret = unpin_extent_cache(inode, ordered_extent->file_offset,
3167  				 ordered_extent->num_bytes, trans->transid);
3168  	if (ret < 0) {
3169  		btrfs_abort_transaction(trans, ret);
3170  		goto out;
3171  	}
3172  
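	/* Add the checksums queued on the ordered extent to the csum tree. */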
3173  	ret = add_pending_csums(trans, &ordered_extent->list);
3174  	if (ret) {
3175  		btrfs_abort_transaction(trans, ret);
3176  		goto out;
3177  	}
3178  
3179  	/*
3180  	 * If this is a new delalloc range, clear its new delalloc flag to
3181  	 * update the inode's number of bytes. This needs to be done first
3182  	 * before updating the inode item.
3183  	 */
3184  	if ((clear_bits & EXTENT_DELALLOC_NEW) &&
3185  	    !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags))
3186  		clear_extent_bit(&inode->io_tree, start, end,
3187  				 EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES,
3188  				 &cached_state);
3189  
3190  	btrfs_inode_safe_disk_i_size_write(inode, 0);
3191  	ret = btrfs_update_inode_fallback(trans, inode);
3192  	if (ret) { /* -ENOMEM or corruption */
3193  		btrfs_abort_transaction(trans, ret);
3194  		goto out;
3195  	}
3196  out:
3197  	clear_extent_bit(&inode->io_tree, start, end, clear_bits,
3198  			 &cached_state);
3199  
3200  	if (trans)
3201  		btrfs_end_transaction(trans);
3202  
3203  	if (ret || truncated) {
3204  		u64 unwritten_start = start;
3205  
3206  		/*
3207  		 * If we failed to finish this ordered extent for any reason we
3208  		 * need to make sure BTRFS_ORDERED_IOERR is set on the ordered
3209  		 * extent, and mark the inode with the error if it wasn't
3210  		 * already set.  Any error during writeback would have already
3211  		 * set the mapping error, so we need to set it if we're the ones
3212  		 * marking this ordered extent as failed.
3213  		 */
3214  		if (ret)
3215  			btrfs_mark_ordered_extent_error(ordered_extent);
3216  
3217  		if (truncated)
3218  			unwritten_start += logical_len;
3219  		clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
3220  
3221  		/*
3222  		 * Drop extent maps for the part of the extent we didn't write.
3223  		 *
3224  		 * We have an exception here for the free_space_inode, this is
3225  		 * because when we do btrfs_get_extent() on the free space inode
3226  		 * we will search the commit root.  If this is a new block group
3227  		 * we won't find anything, and we will trip over the assert in
3228  		 * writepage where we do ASSERT(em->block_start !=
3229  		 * EXTENT_MAP_HOLE).
3230  		 *
3231  		 * Theoretically we could also skip this for any NOCOW extent as
3232  		 * we don't mess with the extent map tree in the NOCOW case, but
3233  		 * for now simply skip this if we are the free space inode.
3234  		 */
3235  		if (!btrfs_is_free_space_inode(inode))
3236  			btrfs_drop_extent_map_range(inode, unwritten_start,
3237  						    end, false);
3238  
3239  		/*
3240  		 * If the ordered extent had an IOERR or something else went
3241  		 * wrong we need to return the space for this ordered extent
3242  		 * back to the allocator.  We only free the extent in the
3243  		 * truncated case if we didn't write out the extent at all.
3244  		 *
3245  		 * If we made it past insert_reserved_file_extent before we
3246  		 * errored out then we don't need to do this as the accounting
3247  		 * has already been done.
3248  		 */
3249  		if ((ret || !logical_len) &&
3250  		    clear_reserved_extent &&
3251  		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3252  		    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
3253  			/*
3254  			 * Discard the range before returning it back to the
3255  			 * free space pool
3256  			 */
3257  			if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC))
3258  				btrfs_discard_extent(fs_info,
3259  						ordered_extent->disk_bytenr,
3260  						ordered_extent->disk_num_bytes,
3261  						NULL);
3262  			btrfs_free_reserved_extent(fs_info,
3263  					ordered_extent->disk_bytenr,
3264  					ordered_extent->disk_num_bytes, 1);
3265  			/*
3266  			 * Actually free the qgroup rsv which was released when
3267  			 * the ordered extent was created.
3268  			 */
3269  			btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(inode->root),
3270  						  ordered_extent->qgroup_rsv,
3271  						  BTRFS_QGROUP_RSV_DATA);
3272  		}
3273  	}
3274  
3275  	/*
3276  	 * This needs to be done to make sure anybody waiting knows we are done
3277  	 * updating everything for this ordered extent.
3278  	 */
3279  	btrfs_remove_ordered_extent(inode, ordered_extent);
3280  
3281  	/* once for us */
3282  	btrfs_put_ordered_extent(ordered_extent);
3283  	/* once for the tree */
3284  	btrfs_put_ordered_extent(ordered_extent);
3285  
3286  	return ret;
3287  }
3288  
3289  int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered)
3290  {
3291  	if (btrfs_is_zoned(ordered->inode->root->fs_info) &&
3292  	    !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
3293  	    list_empty(&ordered->bioc_list))
3294  		btrfs_finish_ordered_zoned(ordered);
3295  	return btrfs_finish_one_ordered(ordered);
3296  }
3297  
3298  /*
3299   * Verify the checksum for a single sector without any extra actions that depend
3300   * on the type of I/O.
3301   */
3302  int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
3303  			    u32 pgoff, u8 *csum, const u8 * const csum_expected)
3304  {
3305  	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
3306  	char *kaddr;
3307  
3308  	ASSERT(pgoff + fs_info->sectorsize <= PAGE_SIZE);
3309  
3310  	shash->tfm = fs_info->csum_shash;
3311  
3312  	kaddr = kmap_local_page(page) + pgoff;
3313  	crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
3314  	kunmap_local(kaddr);
3315  
3316  	if (memcmp(csum, csum_expected, fs_info->csum_size))
3317  		return -EIO;
3318  	return 0;
3319  }
3320  
3321  /*
3322   * Verify the checksum of a single data sector.
3323   *
3324   * @bbio:	btrfs_io_bio which contains the csum
3325   * @dev:	device the sector is on
3326   * @bio_offset:	offset to the beginning of the bio (in bytes)
3327   * @bv:		bio_vec to check
3328   *
3329   * Check if the checksum on a data block is valid.  When a checksum mismatch is
3330   * detected, report the error and fill the corrupted range with zero.
3331   *
3332   * Return %true if the sector is ok or had no checksum to start with, else %false.
3333   */
3334  bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
3335  			u32 bio_offset, struct bio_vec *bv)
3336  {
3337  	struct btrfs_inode *inode = bbio->inode;
3338  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
3339  	u64 file_offset = bbio->file_offset + bio_offset;
3340  	u64 end = file_offset + bv->bv_len - 1;
3341  	u8 *csum_expected;
3342  	u8 csum[BTRFS_CSUM_SIZE];
3343  
3344  	ASSERT(bv->bv_len == fs_info->sectorsize);
3345  
3346  	if (!bbio->csum)
3347  		return true;
3348  
3349  	if (btrfs_is_data_reloc_root(inode->root) &&
3350  	    test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM,
3351  			   NULL)) {
3352  		/* Skip the range without csum for data reloc inode */
3353  		clear_extent_bits(&inode->io_tree, file_offset, end,
3354  				  EXTENT_NODATASUM);
3355  		return true;
3356  	}
3357  
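	/* One checksum per sector, indexed by the sector offset within the bio. */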
3358  	csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) *
3359  				fs_info->csum_size;
3360  	if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum,
3361  				    csum_expected))
3362  		goto zeroit;
3363  	return true;
3364  
3365  zeroit:
3366  	btrfs_print_data_csum_error(inode, file_offset, csum, csum_expected,
3367  				    bbio->mirror_num);
3368  	if (dev)
3369  		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
3370  	memzero_bvec(bv);
3371  	return false;
3372  }
3373  
3374  /*
3375   * Perform a delayed iput on @inode.
3376   *
3377   * @inode: The inode we want to perform iput on
3378   *
3379   * This function uses the generic vfs_inode::i_count to track whether we should
3380   * just decrement it (in case it's > 1) or, if this is the last iput, link
3381   * the inode to the delayed iput machinery. Delayed iputs are processed at
3382   * transaction commit time/superblock commit/cleaner kthread.
3383   */
3384  void btrfs_add_delayed_iput(struct btrfs_inode *inode)
3385  {
3386  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
3387  	unsigned long flags;
3388  
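	/* Fast path: if this is not the last reference, just drop it. */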
3389  	if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1))
3390  		return;
3391  
3392  	atomic_inc(&fs_info->nr_delayed_iputs);
3393  	/*
3394  	 * Need to be irq safe here because we can be called from either an irq
3395  	 * context (see bio.c and btrfs_put_ordered_extent()) or a non-irq
3396  	 * context.
3397  	 */
3398  	spin_lock_irqsave(&fs_info->delayed_iput_lock, flags);
3399  	ASSERT(list_empty(&inode->delayed_iput));
3400  	list_add_tail(&inode->delayed_iput, &fs_info->delayed_iputs);
3401  	spin_unlock_irqrestore(&fs_info->delayed_iput_lock, flags);
3402  	if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags))
3403  		wake_up_process(fs_info->cleaner_kthread);
3404  }
3405  
3406  static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info,
3407  				    struct btrfs_inode *inode)
3408  {
3409  	list_del_init(&inode->delayed_iput);
3410  	spin_unlock_irq(&fs_info->delayed_iput_lock);
3411  	iput(&inode->vfs_inode);
3412  	if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
3413  		wake_up(&fs_info->delayed_iputs_wait);
3414  	spin_lock_irq(&fs_info->delayed_iput_lock);
3415  }
3416  
3417  static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info,
3418  				   struct btrfs_inode *inode)
3419  {
3420  	if (!list_empty(&inode->delayed_iput)) {
3421  		spin_lock_irq(&fs_info->delayed_iput_lock);
3422  		if (!list_empty(&inode->delayed_iput))
3423  			run_delayed_iput_locked(fs_info, inode);
3424  		spin_unlock_irq(&fs_info->delayed_iput_lock);
3425  	}
3426  }
3427  
3428  void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
3429  {
3430  	/*
3431  	 * btrfs_put_ordered_extent() can run in irq context (see bio.c), which
3432  	 * calls btrfs_add_delayed_iput() and that needs to lock
3433  	 * fs_info->delayed_iput_lock. So we need to disable irqs here to
3434  	 * prevent a deadlock.
3435  	 */
3436  	spin_lock_irq(&fs_info->delayed_iput_lock);
3437  	while (!list_empty(&fs_info->delayed_iputs)) {
3438  		struct btrfs_inode *inode;
3439  
3440  		inode = list_first_entry(&fs_info->delayed_iputs,
3441  				struct btrfs_inode, delayed_iput);
3442  		run_delayed_iput_locked(fs_info, inode);
3443  		if (need_resched()) {
3444  			spin_unlock_irq(&fs_info->delayed_iput_lock);
3445  			cond_resched();
3446  			spin_lock_irq(&fs_info->delayed_iput_lock);
3447  		}
3448  	}
3449  	spin_unlock_irq(&fs_info->delayed_iput_lock);
3450  }
3451  
3452  /*
3453   * Wait for flushing all delayed iputs
3454   *
3455   * @fs_info:  the filesystem
3456   *
3457   * This will wait on any delayed iputs that are currently running with KILLABLE
3458   * set.  Once they are all done running we will return, unless we are killed in
3459   * which case we return EINTR. This helps in user operations like fallocate etc
3460   * that might get blocked on the iputs.
3461   *
3462   * Return -EINTR if we were killed, 0 if nothing is pending.
3463   */
3464  int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
3465  {
3466  	int ret = wait_event_killable(fs_info->delayed_iputs_wait,
3467  			atomic_read(&fs_info->nr_delayed_iputs) == 0);
3468  	if (ret)
3469  		return -EINTR;
3470  	return 0;
3471  }
3472  
3473  /*
3474   * This creates an orphan entry for the given inode in case something goes wrong
3475   * in the middle of an unlink.
3476   */
3477  int btrfs_orphan_add(struct btrfs_trans_handle *trans,
3478  		     struct btrfs_inode *inode)
3479  {
3480  	int ret;
3481  
3482  	ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode));
3483  	if (ret && ret != -EEXIST) {
3484  		btrfs_abort_transaction(trans, ret);
3485  		return ret;
3486  	}
3487  
3488  	return 0;
3489  }
3490  
3491  /*
3492   * We have done the delete so we can go ahead and remove the orphan item for
3493   * this particular inode.
3494   */
3495  static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
3496  			    struct btrfs_inode *inode)
3497  {
3498  	return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode));
3499  }
3500  
3501  /*
3502   * This cleans up any orphans that may be left on the list from the last use
3503   * of this root.
3504   */
3505  int btrfs_orphan_cleanup(struct btrfs_root *root)
3506  {
3507  	struct btrfs_fs_info *fs_info = root->fs_info;
3508  	struct btrfs_path *path;
3509  	struct extent_buffer *leaf;
3510  	struct btrfs_key key, found_key;
3511  	struct btrfs_trans_handle *trans;
3512  	struct inode *inode;
3513  	u64 last_objectid = 0;
3514  	int ret = 0, nr_unlink = 0;
3515  
3516  	if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state))
3517  		return 0;
3518  
3519  	path = btrfs_alloc_path();
3520  	if (!path) {
3521  		ret = -ENOMEM;
3522  		goto out;
3523  	}
3524  	path->reada = READA_BACK;
3525  
3526  	key.objectid = BTRFS_ORPHAN_OBJECTID;
3527  	key.type = BTRFS_ORPHAN_ITEM_KEY;
3528  	key.offset = (u64)-1;
3529  
3530  	while (1) {
3531  		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3532  		if (ret < 0)
3533  			goto out;
3534  
3535  		/*
3536  		 * ret == 0 means we found what we were searching for, which is
3537  		 * weird but possible, so only adjust the path if we didn't find
3538  		 * the key, and check whether the previous item matches.
3539  		 */
3540  		if (ret > 0) {
3541  			ret = 0;
3542  			if (path->slots[0] == 0)
3543  				break;
3544  			path->slots[0]--;
3545  		}
3546  
3547  		/* pull out the item */
3548  		leaf = path->nodes[0];
3549  		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3550  
3551  		/* make sure the item matches what we want */
3552  		if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
3553  			break;
3554  		if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
3555  			break;
3556  
3557  		/* release the path since we're done with it */
3558  		btrfs_release_path(path);
3559  
3560  		/*
3561  		 * This is basically a btrfs_lookup(), without the root crossing.
3562  		 * The inode number is stored in the offset of the orphan item.
3564  		 */
3565  
3566  		if (found_key.offset == last_objectid) {
3567  			/*
3568  			 * We found the same inode as before. This means we were
3569  			 * not able to remove its items via eviction triggered
3570  			 * by an iput(). A transaction abort may have happened,
3571  			 * due to -ENOSPC for example, so try to grab the error
3572  			 * that led to the transaction abort, if any.
3573  			 */
3574  			btrfs_err(fs_info,
3575  				  "Error removing orphan entry, stopping orphan cleanup");
3576  			ret = BTRFS_FS_ERROR(fs_info) ?: -EINVAL;
3577  			goto out;
3578  		}
3579  
3580  		last_objectid = found_key.offset;
3581  
3582  		found_key.objectid = found_key.offset;
3583  		found_key.type = BTRFS_INODE_ITEM_KEY;
3584  		found_key.offset = 0;
3585  		inode = btrfs_iget(last_objectid, root);
3586  		if (IS_ERR(inode)) {
3587  			ret = PTR_ERR(inode);
3588  			inode = NULL;
3589  			if (ret != -ENOENT)
3590  				goto out;
3591  		}
3592  
3593  		if (!inode && root == fs_info->tree_root) {
3594  			struct btrfs_root *dead_root;
3595  			int is_dead_root = 0;
3596  
3597  			/*
3598  			 * This is an orphan in the tree root. Currently these
3599  			 * could come from 2 sources:
3600  			 *  a) a root (snapshot/subvolume) deletion in progress
3601  			 *  b) a free space cache inode
3602  			 * We need to distinguish those two, as the orphan item
3603  			 * for a root must not get deleted before the deletion
3604  			 * of the snapshot/subvolume's tree completes.
3605  			 *
3606  			 * btrfs_find_orphan_roots() ran before us, which has
3607  			 * found all deleted roots and loaded them into
3608  			 * fs_info->fs_roots_radix. So here we can find if an
3609  			 * orphan item corresponds to a deleted root by looking
3610  			 * up the root from that radix tree.
3611  			 */
3612  
3613  			spin_lock(&fs_info->fs_roots_radix_lock);
3614  			dead_root = radix_tree_lookup(&fs_info->fs_roots_radix,
3615  							 (unsigned long)found_key.objectid);
3616  			if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0)
3617  				is_dead_root = 1;
3618  			spin_unlock(&fs_info->fs_roots_radix_lock);
3619  
3620  			if (is_dead_root) {
3621  				/* prevent this orphan from being found again */
3622  				key.offset = found_key.objectid - 1;
3623  				continue;
3624  			}
3625  
3626  		}
3627  
3628  		/*
3629  		 * If we have an inode with links, there are a couple of
3630  		 * possibilities:
3631  		 *
3632  		 * 1. We were halfway through creating fsverity metadata for the
3633  		 * file. In that case, the orphan item represents incomplete
3634  		 * fsverity metadata which must be cleaned up with
3635  		 * btrfs_drop_verity_items and deleting the orphan item.
3636  		 *
3637  		 * 2. Old kernels (before v3.12) used to create an
3638  		 * orphan item for truncate indicating that there were possibly
3639  		 * extent items past i_size that needed to be deleted. In v3.12,
3640  		 * truncate was changed to update i_size in sync with the extent
3641  		 * items, but the (useless) orphan item was still created. Since
3642  		 * v4.18, we don't create the orphan item for truncate at all.
3643  		 *
3644  		 * So, this item could mean that we need to do a truncate, but
3645  		 * only if this filesystem was last used on a pre-v3.12 kernel
3646  		 * and was not cleanly unmounted. The odds of that are quite
3647  		 * slim, and it's a pain to do the truncate now, so just delete
3648  		 * the orphan item.
3649  		 *
3650  		 * It's also possible that this orphan item was supposed to be
3651  		 * deleted but wasn't. The inode number may have been reused,
3652  		 * but either way, we can delete the orphan item.
3653  		 */
3654  		if (!inode || inode->i_nlink) {
3655  			if (inode) {
3656  				ret = btrfs_drop_verity_items(BTRFS_I(inode));
3657  				iput(inode);
3658  				inode = NULL;
3659  				if (ret)
3660  					goto out;
3661  			}
3662  			trans = btrfs_start_transaction(root, 1);
3663  			if (IS_ERR(trans)) {
3664  				ret = PTR_ERR(trans);
3665  				goto out;
3666  			}
3667  			btrfs_debug(fs_info, "auto deleting %Lu",
3668  				    found_key.objectid);
3669  			ret = btrfs_del_orphan_item(trans, root,
3670  						    found_key.objectid);
3671  			btrfs_end_transaction(trans);
3672  			if (ret)
3673  				goto out;
3674  			continue;
3675  		}
3676  
3677  		nr_unlink++;
3678  
3679  		/* this will do delete_inode and everything for us */
3680  		iput(inode);
3681  	}
3682  	/* release the path since we're done with it */
3683  	btrfs_release_path(path);
3684  
3685  	if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
3686  		trans = btrfs_join_transaction(root);
3687  		if (!IS_ERR(trans))
3688  			btrfs_end_transaction(trans);
3689  	}
3690  
3691  	if (nr_unlink)
3692  		btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink);
3693  
3694  out:
3695  	if (ret)
3696  		btrfs_err(fs_info, "could not do orphan cleanup %d", ret);
3697  	btrfs_free_path(path);
3698  	return ret;
3699  }
3700  
3701  /*
3702   * Very simple check to peek ahead in the leaf looking for xattrs.  If we
3703   * don't find any xattrs, we know there can't be any ACLs.
3704   *
3705   * slot is the slot the inode is in, objectid is the objectid of the inode
3706   */
3707  static noinline int acls_after_inode_item(struct extent_buffer *leaf,
3708  					  int slot, u64 objectid,
3709  					  int *first_xattr_slot)
3710  {
3711  	u32 nritems = btrfs_header_nritems(leaf);
3712  	struct btrfs_key found_key;
3713  	static u64 xattr_access = 0;
3714  	static u64 xattr_default = 0;
3715  	int scanned = 0;
3716  
3717  	if (!xattr_access) {
3718  		xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS,
3719  					strlen(XATTR_NAME_POSIX_ACL_ACCESS));
3720  		xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT,
3721  					strlen(XATTR_NAME_POSIX_ACL_DEFAULT));
3722  	}
3723  
3724  	slot++;
3725  	*first_xattr_slot = -1;
3726  	while (slot < nritems) {
3727  		btrfs_item_key_to_cpu(leaf, &found_key, slot);
3728  
3729  		/* we found a different objectid, there must not be acls */
3730  		if (found_key.objectid != objectid)
3731  			return 0;
3732  
3733  		/* we found an xattr, assume we've got an acl */
3734  		if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
3735  			if (*first_xattr_slot == -1)
3736  				*first_xattr_slot = slot;
3737  			if (found_key.offset == xattr_access ||
3738  			    found_key.offset == xattr_default)
3739  				return 1;
3740  		}
3741  
3742  		/*
3743  		 * we found a key greater than an xattr key, there can't
3744  		 * be any acls later on
3745  		 */
3746  		if (found_key.type > BTRFS_XATTR_ITEM_KEY)
3747  			return 0;
3748  
3749  		slot++;
3750  		scanned++;
3751  
3752  		/*
3753  		 * it goes inode, inode backrefs, xattrs, extents,
3754  		 * so if there are a ton of hard links to an inode there can
3755  		 * be a lot of backrefs.  Don't waste time searching too hard,
3756  		 * this is just an optimization
3757  		 */
3758  		if (scanned >= 8)
3759  			break;
3760  	}
3761  	/*
3762  	 * We hit the end of the leaf before we found an xattr or something
3763  	 * larger than an xattr.  We have to assume the inode has ACLs.
3764  	 */
3765  	if (*first_xattr_slot == -1)
3766  		*first_xattr_slot = slot;
3767  	return 1;
3768  }
3769  
3770  static int btrfs_init_file_extent_tree(struct btrfs_inode *inode)
3771  {
3772  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
3773  
3774  	if (WARN_ON_ONCE(inode->file_extent_tree))
3775  		return 0;
3776  	if (btrfs_fs_incompat(fs_info, NO_HOLES))
3777  		return 0;
3778  	if (!S_ISREG(inode->vfs_inode.i_mode))
3779  		return 0;
3780  	if (btrfs_is_free_space_inode(inode))
3781  		return 0;
3782  
3783  	inode->file_extent_tree = kmalloc(sizeof(struct extent_io_tree), GFP_KERNEL);
3784  	if (!inode->file_extent_tree)
3785  		return -ENOMEM;
3786  
3787  	extent_io_tree_init(fs_info, inode->file_extent_tree, IO_TREE_INODE_FILE_EXTENT);
3788  	/* Lockdep class is set only for the file extent tree. */
3789  	lockdep_set_class(&inode->file_extent_tree->lock, &file_extent_tree_class);
3790  
3791  	return 0;
3792  }
3793  
3794  /*
3795   * Read an inode from the btree into the in-memory inode.
3796   */
3797  static int btrfs_read_locked_inode(struct inode *inode,
3798  				   struct btrfs_path *in_path)
3799  {
3800  	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
3801  	struct btrfs_path *path = in_path;
3802  	struct extent_buffer *leaf;
3803  	struct btrfs_inode_item *inode_item;
3804  	struct btrfs_root *root = BTRFS_I(inode)->root;
3805  	struct btrfs_key location;
3806  	unsigned long ptr;
3807  	int maybe_acls;
3808  	u32 rdev;
3809  	int ret;
3810  	bool filled = false;
3811  	int first_xattr_slot;
3812  
3813  	ret = btrfs_init_file_extent_tree(BTRFS_I(inode));
3814  	if (ret)
3815  		return ret;
3816  
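	/*
	 * Try to fill the inode from its delayed inode item first; if that
	 * works we can skip copying the same fields from the fs tree leaf.
	 */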
3817  	ret = btrfs_fill_inode(inode, &rdev);
3818  	if (!ret)
3819  		filled = true;
3820  
3821  	if (!path) {
3822  		path = btrfs_alloc_path();
3823  		if (!path)
3824  			return -ENOMEM;
3825  	}
3826  
3827  	btrfs_get_inode_key(BTRFS_I(inode), &location);
3828  
3829  	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
3830  	if (ret) {
3831  		if (path != in_path)
3832  			btrfs_free_path(path);
3833  		return ret;
3834  	}
3835  
3836  	leaf = path->nodes[0];
3837  
3838  	if (filled)
3839  		goto cache_index;
3840  
3841  	inode_item = btrfs_item_ptr(leaf, path->slots[0],
3842  				    struct btrfs_inode_item);
3843  	inode->i_mode = btrfs_inode_mode(leaf, inode_item);
3844  	set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
3845  	i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
3846  	i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
3847  	btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item));
3848  	btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
3849  			round_up(i_size_read(inode), fs_info->sectorsize));
3850  
3851  	inode_set_atime(inode, btrfs_timespec_sec(leaf, &inode_item->atime),
3852  			btrfs_timespec_nsec(leaf, &inode_item->atime));
3853  
3854  	inode_set_mtime(inode, btrfs_timespec_sec(leaf, &inode_item->mtime),
3855  			btrfs_timespec_nsec(leaf, &inode_item->mtime));
3856  
3857  	inode_set_ctime(inode, btrfs_timespec_sec(leaf, &inode_item->ctime),
3858  			btrfs_timespec_nsec(leaf, &inode_item->ctime));
3859  
3860  	BTRFS_I(inode)->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime);
3861  	BTRFS_I(inode)->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime);
3862  
3863  	inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
3864  	BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
3865  	BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
3866  
3867  	inode_set_iversion_queried(inode,
3868  				   btrfs_inode_sequence(leaf, inode_item));
3869  	inode->i_generation = BTRFS_I(inode)->generation;
3870  	inode->i_rdev = 0;
3871  	rdev = btrfs_inode_rdev(leaf, inode_item);
3872  
3873  	if (S_ISDIR(inode->i_mode))
3874  		BTRFS_I(inode)->index_cnt = (u64)-1;
3875  
3876  	btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
3877  				&BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
3878  
3879  cache_index:
3880  	/*
3881  	 * If we were modified in the current generation and evicted from memory
3882  	 * and then re-read we need to do a full sync since we don't have any
3883  	 * idea about which extents were modified before we were evicted from
3884  	 * cache.
3885  	 *
3886  	 * This is required for both inode re-read from disk and delayed inode
3887  	 * in the delayed_nodes xarray.
3888  	 */
3889  	if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info))
3890  		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3891  			&BTRFS_I(inode)->runtime_flags);
3892  
3893  	/*
3894  	 * We don't persist the id of the transaction where an unlink operation
3895  	 * against the inode was last made. So here we assume the inode might
3896  	 * have been evicted, and therefore the exact value of last_unlink_trans
3897  	 * lost, and set it to last_trans to avoid metadata inconsistencies
3898  	 * between the inode and its parent if the inode is fsync'ed and the log
3899  	 * replayed. For example, in the scenario:
3900  	 *
3901  	 * touch mydir/foo
3902  	 * ln mydir/foo mydir/bar
3903  	 * sync
3904  	 * unlink mydir/bar
3905  	 * echo 2 > /proc/sys/vm/drop_caches   # evicts inode
3906  	 * xfs_io -c fsync mydir/foo
3907  	 * <power failure>
3908  	 * mount fs, triggers fsync log replay
3909  	 *
3910  	 * We must make sure that when we fsync our inode foo we also log its
3911  	 * parent inode, otherwise after log replay the parent still has the
3912  	 * dentry with the "bar" name but our inode foo has a link count of 1
3913  	 * and doesn't have an inode ref with the name "bar" anymore.
3914  	 *
3915  	 * Setting last_unlink_trans to last_trans is a pessimistic approach,
3916  	 * but it guarantees correctness at the expense of occasional full
3917  	 * transaction commits on fsync if our inode is a directory, or if our
3918  	 * inode is not a directory, logging its parent unnecessarily.
3919  	 */
3920  	BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
3921  
3922  	/*
3923  	 * Same logic as for last_unlink_trans. We don't persist the generation
3924  	 * of the last transaction where this inode was used for a reflink
3925  	 * operation, so after eviction and reloading the inode we must be
3926  	 * pessimistic and assume the last transaction that modified the inode.
3927  	 */
3928  	BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans;
3929  
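	/*
	 * Peek at the next item in the leaf. For an inode with a single link
	 * it is usually its inode ref, which lets us cache the dir index.
	 */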
3930  	path->slots[0]++;
3931  	if (inode->i_nlink != 1 ||
3932  	    path->slots[0] >= btrfs_header_nritems(leaf))
3933  		goto cache_acl;
3934  
3935  	btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
3936  	if (location.objectid != btrfs_ino(BTRFS_I(inode)))
3937  		goto cache_acl;
3938  
3939  	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3940  	if (location.type == BTRFS_INODE_REF_KEY) {
3941  		struct btrfs_inode_ref *ref;
3942  
3943  		ref = (struct btrfs_inode_ref *)ptr;
3944  		BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
3945  	} else if (location.type == BTRFS_INODE_EXTREF_KEY) {
3946  		struct btrfs_inode_extref *extref;
3947  
3948  		extref = (struct btrfs_inode_extref *)ptr;
3949  		BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
3950  								     extref);
3951  	}
3952  cache_acl:
3953  	/*
3954  	 * try to precache a NULL acl entry for files that don't have
3955  	 * any xattrs or acls
3956  	 */
3957  	maybe_acls = acls_after_inode_item(leaf, path->slots[0],
3958  			btrfs_ino(BTRFS_I(inode)), &first_xattr_slot);
3959  	if (first_xattr_slot != -1) {
3960  		path->slots[0] = first_xattr_slot;
3961  		ret = btrfs_load_inode_props(inode, path);
3962  		if (ret)
3963  			btrfs_err(fs_info,
3964  				  "error loading props for ino %llu (root %llu): %d",
3965  				  btrfs_ino(BTRFS_I(inode)),
3966  				  btrfs_root_id(root), ret);
3967  	}
3968  	if (path != in_path)
3969  		btrfs_free_path(path);
3970  
3971  	if (!maybe_acls)
3972  		cache_no_acl(inode);
3973  
3974  	switch (inode->i_mode & S_IFMT) {
3975  	case S_IFREG:
3976  		inode->i_mapping->a_ops = &btrfs_aops;
3977  		inode->i_fop = &btrfs_file_operations;
3978  		inode->i_op = &btrfs_file_inode_operations;
3979  		break;
3980  	case S_IFDIR:
3981  		inode->i_fop = &btrfs_dir_file_operations;
3982  		inode->i_op = &btrfs_dir_inode_operations;
3983  		break;
3984  	case S_IFLNK:
3985  		inode->i_op = &btrfs_symlink_inode_operations;
3986  		inode_nohighmem(inode);
3987  		inode->i_mapping->a_ops = &btrfs_aops;
3988  		break;
3989  	default:
3990  		inode->i_op = &btrfs_special_inode_operations;
3991  		init_special_inode(inode, inode->i_mode, rdev);
3992  		break;
3993  	}
3994  
3995  	btrfs_sync_inode_flags_to_i_flags(inode);
3996  	return 0;
3997  }
3998  
3999  /*
4000   * Given a leaf and an inode, copy the inode fields into the leaf.
4001   */
4002  static void fill_inode_item(struct btrfs_trans_handle *trans,
4003  			    struct extent_buffer *leaf,
4004  			    struct btrfs_inode_item *item,
4005  			    struct inode *inode)
4006  {
4007  	struct btrfs_map_token token;
4008  	u64 flags;
4009  
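	/*
	 * A map token caches the mapped extent buffer page across the many
	 * consecutive field updates below.
	 */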
4010  	btrfs_init_map_token(&token, leaf);
4011  
4012  	btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
4013  	btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
4014  	btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size);
4015  	btrfs_set_token_inode_mode(&token, item, inode->i_mode);
4016  	btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
4017  
4018  	btrfs_set_token_timespec_sec(&token, &item->atime,
4019  				     inode_get_atime_sec(inode));
4020  	btrfs_set_token_timespec_nsec(&token, &item->atime,
4021  				      inode_get_atime_nsec(inode));
4022  
4023  	btrfs_set_token_timespec_sec(&token, &item->mtime,
4024  				     inode_get_mtime_sec(inode));
4025  	btrfs_set_token_timespec_nsec(&token, &item->mtime,
4026  				      inode_get_mtime_nsec(inode));
4027  
4028  	btrfs_set_token_timespec_sec(&token, &item->ctime,
4029  				     inode_get_ctime_sec(inode));
4030  	btrfs_set_token_timespec_nsec(&token, &item->ctime,
4031  				      inode_get_ctime_nsec(inode));
4032  
4033  	btrfs_set_token_timespec_sec(&token, &item->otime, BTRFS_I(inode)->i_otime_sec);
4034  	btrfs_set_token_timespec_nsec(&token, &item->otime, BTRFS_I(inode)->i_otime_nsec);
4035  
4036  	btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
4037  	btrfs_set_token_inode_generation(&token, item,
4038  					 BTRFS_I(inode)->generation);
4039  	btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
4040  	btrfs_set_token_inode_transid(&token, item, trans->transid);
4041  	btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
4042  	flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
4043  					  BTRFS_I(inode)->ro_flags);
4044  	btrfs_set_token_inode_flags(&token, item, flags);
4045  	btrfs_set_token_inode_block_group(&token, item, 0);
4046  }
4047  
4048  /*
4049   * copy everything in the in-memory inode into the btree.
4050   */
4051  static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
4052  					    struct btrfs_inode *inode)
4053  {
4054  	struct btrfs_inode_item *inode_item;
4055  	struct btrfs_path *path;
4056  	struct extent_buffer *leaf;
4057  	struct btrfs_key key;
4058  	int ret;
4059  
4060  	path = btrfs_alloc_path();
4061  	if (!path)
4062  		return -ENOMEM;
4063  
4064  	btrfs_get_inode_key(inode, &key);
4065  	ret = btrfs_lookup_inode(trans, inode->root, path, &key, 1);
4066  	if (ret) {
4067  		if (ret > 0)
4068  			ret = -ENOENT;
4069  		goto failed;
4070  	}
4071  
4072  	leaf = path->nodes[0];
4073  	inode_item = btrfs_item_ptr(leaf, path->slots[0],
4074  				    struct btrfs_inode_item);
4075  
4076  	fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode);
4077  	btrfs_mark_buffer_dirty(trans, leaf);
4078  	btrfs_set_inode_last_trans(trans, inode);
4079  	ret = 0;
4080  failed:
4081  	btrfs_free_path(path);
4082  	return ret;
4083  }
4084  
4085  /*
4086   * copy everything in the in-memory inode into the btree.
4087   */
4088  int btrfs_update_inode(struct btrfs_trans_handle *trans,
4089  		       struct btrfs_inode *inode)
4090  {
4091  	struct btrfs_root *root = inode->root;
4092  	struct btrfs_fs_info *fs_info = root->fs_info;
4093  	int ret;
4094  
4095  	/*
4096  	 * If the inode is a free space inode, we can deadlock during commit
4097  	 * if we put it into the delayed code.
4098  	 *
4099  	 * The data relocation inode should also be directly updated
4100  	 * without delay
4101  	 */
4102  	if (!btrfs_is_free_space_inode(inode)
4103  	    && !btrfs_is_data_reloc_root(root)
4104  	    && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
4105  		btrfs_update_root_times(trans, root);
4106  
4107  		ret = btrfs_delayed_update_inode(trans, inode);
4108  		if (!ret)
4109  			btrfs_set_inode_last_trans(trans, inode);
4110  		return ret;
4111  	}
4112  
4113  	return btrfs_update_inode_item(trans, inode);
4114  }
4115  
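/*
 * Update the inode item, falling back to an in-place btree update if the
 * delayed inode path fails with -ENOSPC because it could not reserve metadata.
 */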
4116  int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
4117  				struct btrfs_inode *inode)
4118  {
4119  	int ret;
4120  
4121  	ret = btrfs_update_inode(trans, inode);
4122  	if (ret == -ENOSPC)
4123  		return btrfs_update_inode_item(trans, inode);
4124  	return ret;
4125  }
4126  
4127  /*
4128   * Unlink helper that gets used here in inode.c and in the tree logging
4129   * recovery code.  It removes a link in a directory with a given name, and
4130   * also drops the back refs in the inode to the directory.
4131   */
4132  static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
4133  				struct btrfs_inode *dir,
4134  				struct btrfs_inode *inode,
4135  				const struct fscrypt_str *name,
4136  				struct btrfs_rename_ctx *rename_ctx)
4137  {
4138  	struct btrfs_root *root = dir->root;
4139  	struct btrfs_fs_info *fs_info = root->fs_info;
4140  	struct btrfs_path *path;
4141  	int ret = 0;
4142  	struct btrfs_dir_item *di;
4143  	u64 index;
4144  	u64 ino = btrfs_ino(inode);
4145  	u64 dir_ino = btrfs_ino(dir);
4146  
4147  	path = btrfs_alloc_path();
4148  	if (!path) {
4149  		ret = -ENOMEM;
4150  		goto out;
4151  	}
4152  
4153  	di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1);
4154  	if (IS_ERR_OR_NULL(di)) {
4155  		ret = di ? PTR_ERR(di) : -ENOENT;
4156  		goto err;
4157  	}
4158  	ret = btrfs_delete_one_dir_name(trans, root, path, di);
4159  	if (ret)
4160  		goto err;
4161  	btrfs_release_path(path);
4162  
4163  	/*
4164  	 * If we don't have the dir index cached, we have to get it by looking
4165  	 * up the inode ref. Since that lookup gives us the inode ref anyway,
4166  	 * remove it directly; there is no need for a delayed deletion.
4167  	 *
4168  	 * But if we do have the dir index cached, there is no need to search
4169  	 * the inode ref to get it. Since the inode ref is close to the inode
4170  	 * item, it is better to delay its deletion and do it when we update
4171  	 * the inode item.
4172  	 */
4173  	if (inode->dir_index) {
4174  		ret = btrfs_delayed_delete_inode_ref(inode);
4175  		if (!ret) {
4176  			index = inode->dir_index;
4177  			goto skip_backref;
4178  		}
4179  	}
4180  
4181  	ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index);
4182  	if (ret) {
4183  		btrfs_info(fs_info,
4184  			"failed to delete reference to %.*s, inode %llu parent %llu",
4185  			name->len, name->name, ino, dir_ino);
4186  		btrfs_abort_transaction(trans, ret);
4187  		goto err;
4188  	}
4189  skip_backref:
4190  	if (rename_ctx)
4191  		rename_ctx->index = index;
4192  
4193  	ret = btrfs_delete_delayed_dir_index(trans, dir, index);
4194  	if (ret) {
4195  		btrfs_abort_transaction(trans, ret);
4196  		goto err;
4197  	}
4198  
4199  	/*
4200  	 * If we are in a rename context, we don't need to update anything in the
4201  	 * log. That will be done later during the rename by btrfs_log_new_name().
4202  	 * Besides that, doing it here would only cause extra unnecessary btree
4203  	 * operations on the log tree, increasing latency for applications.
4204  	 */
4205  	if (!rename_ctx) {
4206  		btrfs_del_inode_ref_in_log(trans, root, name, inode, dir_ino);
4207  		btrfs_del_dir_entries_in_log(trans, root, name, dir, index);
4208  	}
4209  
4210  	/*
4211  	 * If we have a pending delayed iput we could end up with the final iput
4212  	 * being run in btrfs-cleaner context.  If we have enough of these built
4213  	 * up we can end up burning a lot of time in btrfs-cleaner without any
4214  	 * way to throttle the unlinks.  Since we're currently holding a ref on
4215  	 * the inode we can run the delayed iput here without any issues as the
4216  	 * final iput won't be done until after we drop the ref we're currently
4217  	 * holding.
4218  	 */
4219  	btrfs_run_delayed_iput(fs_info, inode);
4220  err:
4221  	btrfs_free_path(path);
4222  	if (ret)
4223  		goto out;
4224  
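	/*
	 * Directory sizes account each name twice: once for the dir item and
	 * once for the dir index item, hence the "* 2".
	 */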
4225  	btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2);
4226  	inode_inc_iversion(&inode->vfs_inode);
4227  	inode_set_ctime_current(&inode->vfs_inode);
4228  	inode_inc_iversion(&dir->vfs_inode);
4229  	inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode));
4230  	ret = btrfs_update_inode(trans, dir);
4231  out:
4232  	return ret;
4233  }
4234  
4235  int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
4236  		       struct btrfs_inode *dir, struct btrfs_inode *inode,
4237  		       const struct fscrypt_str *name)
4238  {
4239  	int ret;
4240  
4241  	ret = __btrfs_unlink_inode(trans, dir, inode, name, NULL);
4242  	if (!ret) {
4243  		drop_nlink(&inode->vfs_inode);
4244  		ret = btrfs_update_inode(trans, inode);
4245  	}
4246  	return ret;
4247  }
4248  
4249  /*
4250   * Helper to start a transaction for unlink and rmdir.
4251   *
4252   * unlink and rmdir are special in btrfs: they do not always free space, so
4253   * if we cannot make our reservations the normal way, try to see if there is
4254   * enough slack room in the global reserve to migrate; otherwise we cannot
4255   * allow the unlink to occur.
4256   */
4257  static struct btrfs_trans_handle *__unlink_start_trans(struct btrfs_inode *dir)
4258  {
4259  	struct btrfs_root *root = dir->root;
4260  
4261  	return btrfs_start_transaction_fallback_global_rsv(root,
4262  						   BTRFS_UNLINK_METADATA_UNITS);
4263  }
4264  
4265  static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
4266  {
4267  	struct btrfs_trans_handle *trans;
4268  	struct inode *inode = d_inode(dentry);
4269  	int ret;
4270  	struct fscrypt_name fname;
4271  
4272  	ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
4273  	if (ret)
4274  		return ret;
4275  
4276  	/* This needs to handle no-key deletions later on */
4277  
4278  	trans = __unlink_start_trans(BTRFS_I(dir));
4279  	if (IS_ERR(trans)) {
4280  		ret = PTR_ERR(trans);
4281  		goto fscrypt_free;
4282  	}
4283  
4284  	btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
4285  				false);
4286  
4287  	ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
4288  				 &fname.disk_name);
4289  	if (ret)
4290  		goto end_trans;
4291  
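	/*
	 * If that was the last link, add an orphan item so the inode's items
	 * are cleaned up on the next mount should we crash before the final
	 * iput evicts it.
	 */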
4292  	if (inode->i_nlink == 0) {
4293  		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
4294  		if (ret)
4295  			goto end_trans;
4296  	}
4297  
4298  end_trans:
4299  	btrfs_end_transaction(trans);
4300  	btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
4301  fscrypt_free:
4302  	fscrypt_free_filename(&fname);
4303  	return ret;
4304  }
4305  
4306  static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
4307  			       struct btrfs_inode *dir, struct dentry *dentry)
4308  {
4309  	struct btrfs_root *root = dir->root;
4310  	struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
4311  	struct btrfs_path *path;
4312  	struct extent_buffer *leaf;
4313  	struct btrfs_dir_item *di;
4314  	struct btrfs_key key;
4315  	u64 index;
4316  	int ret;
4317  	u64 objectid;
4318  	u64 dir_ino = btrfs_ino(dir);
4319  	struct fscrypt_name fname;
4320  
4321  	ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname);
4322  	if (ret)
4323  		return ret;
4324  
4325  	/* This needs to handle no-key deletions later on */
4326  
4327  	if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
4328  		objectid = btrfs_root_id(inode->root);
4329  	} else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
4330  		objectid = inode->ref_root_id;
4331  	} else {
4332  		WARN_ON(1);
4333  		fscrypt_free_filename(&fname);
4334  		return -EINVAL;
4335  	}
4336  
4337  	path = btrfs_alloc_path();
4338  	if (!path) {
4339  		ret = -ENOMEM;
4340  		goto out;
4341  	}
4342  
4343  	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
4344  				   &fname.disk_name, -1);
4345  	if (IS_ERR_OR_NULL(di)) {
4346  		ret = di ? PTR_ERR(di) : -ENOENT;
4347  		goto out;
4348  	}
4349  
4350  	leaf = path->nodes[0];
4351  	btrfs_dir_item_key_to_cpu(leaf, di, &key);
4352  	WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
4353  	ret = btrfs_delete_one_dir_name(trans, root, path, di);
4354  	if (ret) {
4355  		btrfs_abort_transaction(trans, ret);
4356  		goto out;
4357  	}
4358  	btrfs_release_path(path);
4359  
4360  	/*
4361  	 * This is a placeholder inode for a subvolume we didn't have a
4362  	 * reference to at the time of the snapshot creation.  In the meantime
4363  	 * we could have renamed the real subvol link into our snapshot, so
4364  	 * depending on btrfs_del_root_ref to return -ENOENT here is incorrect.
4365  	 * Instead simply lookup the dir_index_item for this entry so we can
4366  	 * remove it.  Otherwise we know we have a ref to the root and we can
4367  	 * call btrfs_del_root_ref, and it _shouldn't_ fail.
4368  	 */
4369  	if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
4370  		di = btrfs_search_dir_index_item(root, path, dir_ino, &fname.disk_name);
4371  		if (IS_ERR(di)) {
4372  			ret = PTR_ERR(di);
4373  			btrfs_abort_transaction(trans, ret);
4374  			goto out;
4375  		}
4376  
4377  		leaf = path->nodes[0];
4378  		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4379  		index = key.offset;
4380  		btrfs_release_path(path);
4381  	} else {
4382  		ret = btrfs_del_root_ref(trans, objectid,
4383  					 btrfs_root_id(root), dir_ino,
4384  					 &index, &fname.disk_name);
4385  		if (ret) {
4386  			btrfs_abort_transaction(trans, ret);
4387  			goto out;
4388  		}
4389  	}
4390  
4391  	ret = btrfs_delete_delayed_dir_index(trans, dir, index);
4392  	if (ret) {
4393  		btrfs_abort_transaction(trans, ret);
4394  		goto out;
4395  	}
4396  
4397  	btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2);
4398  	inode_inc_iversion(&dir->vfs_inode);
4399  	inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode));
4400  	ret = btrfs_update_inode_fallback(trans, dir);
4401  	if (ret)
4402  		btrfs_abort_transaction(trans, ret);
4403  out:
4404  	btrfs_free_path(path);
4405  	fscrypt_free_filename(&fname);
4406  	return ret;
4407  }
4408  
4409  /*
4410   * Helper to check if the subvolume references other subvolumes or if it's
4411   * the default subvolume.
4412   */
4413  static noinline int may_destroy_subvol(struct btrfs_root *root)
4414  {
4415  	struct btrfs_fs_info *fs_info = root->fs_info;
4416  	struct btrfs_path *path;
4417  	struct btrfs_dir_item *di;
4418  	struct btrfs_key key;
4419  	struct fscrypt_str name = FSTR_INIT("default", 7);
4420  	u64 dir_id;
4421  	int ret;
4422  
4423  	path = btrfs_alloc_path();
4424  	if (!path)
4425  		return -ENOMEM;
4426  
4427  	/* Make sure this root isn't set as the default subvol */
4428  	dir_id = btrfs_super_root_dir(fs_info->super_copy);
4429  	di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path,
4430  				   dir_id, &name, 0);
4431  	if (di && !IS_ERR(di)) {
4432  		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
4433  		if (key.objectid == btrfs_root_id(root)) {
4434  			ret = -EPERM;
4435  			btrfs_err(fs_info,
4436  				  "deleting default subvolume %llu is not allowed",
4437  				  key.objectid);
4438  			goto out;
4439  		}
4440  		btrfs_release_path(path);
4441  	}
4442  
4443  	key.objectid = btrfs_root_id(root);
4444  	key.type = BTRFS_ROOT_REF_KEY;
4445  	key.offset = (u64)-1;
4446  
4447  	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
4448  	if (ret < 0)
4449  		goto out;
4450  	if (ret == 0) {
4451  		/*
4452  		 * A key with offset -1 was found, which would require a root
4453  		 * with such an id to exist, but that is outside the valid range.
4454  		 */
4455  		ret = -EUCLEAN;
4456  		goto out;
4457  	}
4458  
4459  	ret = 0;
4460  	if (path->slots[0] > 0) {
4461  		path->slots[0]--;
4462  		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
4463  		if (key.objectid == btrfs_root_id(root) && key.type == BTRFS_ROOT_REF_KEY)
4464  			ret = -ENOTEMPTY;
4465  	}
4466  out:
4467  	btrfs_free_path(path);
4468  	return ret;
4469  }
4470  
4471  /* Delete all dentries for inodes belonging to the root */
4472  static void btrfs_prune_dentries(struct btrfs_root *root)
4473  {
4474  	struct btrfs_fs_info *fs_info = root->fs_info;
4475  	struct btrfs_inode *inode;
4476  	u64 min_ino = 0;
4477  
4478  	if (!BTRFS_FS_ERROR(fs_info))
4479  		WARN_ON(btrfs_root_refs(&root->root_item) != 0);
4480  
4481  	inode = btrfs_find_first_inode(root, min_ino);
4482  	while (inode) {
4483  		if (atomic_read(&inode->vfs_inode.i_count) > 1)
4484  			d_prune_aliases(&inode->vfs_inode);
4485  
4486  		min_ino = btrfs_ino(inode) + 1;
4487  		/*
4488  		 * btrfs_drop_inode() will have it removed from the inode
4489  		 * cache when its usage count hits zero.
4490  		 */
4491  		iput(&inode->vfs_inode);
4492  		cond_resched();
4493  		inode = btrfs_find_first_inode(root, min_ino);
4494  	}
4495  }
4496  
4497  int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
4498  {
4499  	struct btrfs_root *root = dir->root;
4500  	struct btrfs_fs_info *fs_info = root->fs_info;
4501  	struct inode *inode = d_inode(dentry);
4502  	struct btrfs_root *dest = BTRFS_I(inode)->root;
4503  	struct btrfs_trans_handle *trans;
4504  	struct btrfs_block_rsv block_rsv;
4505  	u64 root_flags;
4506  	u64 qgroup_reserved = 0;
4507  	int ret;
4508  
4509  	down_write(&fs_info->subvol_sem);
4510  
4511  	/*
4512  	 * Don't allow to delete a subvolume with send in progress. This is
4513  	 * inside the inode lock so the error handling that has to drop the bit
4514  	 * again is not run concurrently.
4515  	 */
4516  	spin_lock(&dest->root_item_lock);
4517  	if (dest->send_in_progress) {
4518  		spin_unlock(&dest->root_item_lock);
4519  		btrfs_warn(fs_info,
4520  			   "attempt to delete subvolume %llu during send",
4521  			   btrfs_root_id(dest));
4522  		ret = -EPERM;
4523  		goto out_up_write;
4524  	}
4525  	if (atomic_read(&dest->nr_swapfiles)) {
4526  		spin_unlock(&dest->root_item_lock);
4527  		btrfs_warn(fs_info,
4528  			   "attempt to delete subvolume %llu with active swapfile",
4529  			   btrfs_root_id(root));
4530  		ret = -EPERM;
4531  		goto out_up_write;
4532  	}
4533  	root_flags = btrfs_root_flags(&dest->root_item);
4534  	btrfs_set_root_flags(&dest->root_item,
4535  			     root_flags | BTRFS_ROOT_SUBVOL_DEAD);
4536  	spin_unlock(&dest->root_item_lock);
4537  
4538  	ret = may_destroy_subvol(dest);
4539  	if (ret)
4540  		goto out_undead;
4541  
4542  	btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
4543  	/*
4544  	 * One for dir inode,
4545  	 * two for dir entries,
4546  	 * two for root ref/backref.
4547  	 */
4548  	ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
4549  	if (ret)
4550  		goto out_undead;
4551  	qgroup_reserved = block_rsv.qgroup_rsv_reserved;
4552  
4553  	trans = btrfs_start_transaction(root, 0);
4554  	if (IS_ERR(trans)) {
4555  		ret = PTR_ERR(trans);
4556  		goto out_release;
4557  	}
4558  	btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
4559  	qgroup_reserved = 0;
4560  	trans->block_rsv = &block_rsv;
4561  	trans->bytes_reserved = block_rsv.size;
4562  
4563  	btrfs_record_snapshot_destroy(trans, dir);
4564  
4565  	ret = btrfs_unlink_subvol(trans, dir, dentry);
4566  	if (ret) {
4567  		btrfs_abort_transaction(trans, ret);
4568  		goto out_end_trans;
4569  	}
4570  
4571  	ret = btrfs_record_root_in_trans(trans, dest);
4572  	if (ret) {
4573  		btrfs_abort_transaction(trans, ret);
4574  		goto out_end_trans;
4575  	}
4576  
4577  	memset(&dest->root_item.drop_progress, 0,
4578  		sizeof(dest->root_item.drop_progress));
4579  	btrfs_set_root_drop_level(&dest->root_item, 0);
4580  	btrfs_set_root_refs(&dest->root_item, 0);
4581  
4582  	if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
4583  		ret = btrfs_insert_orphan_item(trans,
4584  					fs_info->tree_root,
4585  					btrfs_root_id(dest));
4586  		if (ret) {
4587  			btrfs_abort_transaction(trans, ret);
4588  			goto out_end_trans;
4589  		}
4590  	}
4591  
4592  	ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid,
4593  				     BTRFS_UUID_KEY_SUBVOL, btrfs_root_id(dest));
4594  	if (ret && ret != -ENOENT) {
4595  		btrfs_abort_transaction(trans, ret);
4596  		goto out_end_trans;
4597  	}
4598  	if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
4599  		ret = btrfs_uuid_tree_remove(trans,
4600  					  dest->root_item.received_uuid,
4601  					  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
4602  					  btrfs_root_id(dest));
4603  		if (ret && ret != -ENOENT) {
4604  			btrfs_abort_transaction(trans, ret);
4605  			goto out_end_trans;
4606  		}
4607  	}
4608  
4609  	free_anon_bdev(dest->anon_dev);
4610  	dest->anon_dev = 0;
4611  out_end_trans:
4612  	trans->block_rsv = NULL;
4613  	trans->bytes_reserved = 0;
4614  	ret = btrfs_end_transaction(trans);
4615  	inode->i_flags |= S_DEAD;
4616  out_release:
4617  	btrfs_block_rsv_release(fs_info, &block_rsv, (u64)-1, NULL);
4618  	if (qgroup_reserved)
4619  		btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
4620  out_undead:
4621  	if (ret) {
4622  		spin_lock(&dest->root_item_lock);
4623  		root_flags = btrfs_root_flags(&dest->root_item);
4624  		btrfs_set_root_flags(&dest->root_item,
4625  				root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
4626  		spin_unlock(&dest->root_item_lock);
4627  	}
4628  out_up_write:
4629  	up_write(&fs_info->subvol_sem);
4630  	if (!ret) {
4631  		d_invalidate(dentry);
4632  		btrfs_prune_dentries(dest);
4633  		ASSERT(dest->send_in_progress == 0);
4634  	}
4635  
4636  	return ret;
4637  }
4638  
4639  static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
4640  {
4641  	struct inode *inode = d_inode(dentry);
4642  	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
4643  	int ret = 0;
4644  	struct btrfs_trans_handle *trans;
4645  	u64 last_unlink_trans;
4646  	struct fscrypt_name fname;
4647  
4648  	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
4649  		return -ENOTEMPTY;
4650  	if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) {
4651  		if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) {
4652  			btrfs_err(fs_info,
4653  			"extent tree v2 doesn't support snapshot deletion yet");
4654  			return -EOPNOTSUPP;
4655  		}
4656  		return btrfs_delete_subvolume(BTRFS_I(dir), dentry);
4657  	}
4658  
4659  	ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
4660  	if (ret)
4661  		return ret;
4662  
4663  	/* This needs to handle no-key deletions later on */
4664  
4665  	trans = __unlink_start_trans(BTRFS_I(dir));
4666  	if (IS_ERR(trans)) {
4667  		ret = PTR_ERR(trans);
4668  		goto out_notrans;
4669  	}
4670  
4671  	if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
4672  		ret = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry);
4673  		goto out;
4674  	}
4675  
4676  	ret = btrfs_orphan_add(trans, BTRFS_I(inode));
4677  	if (ret)
4678  		goto out;
4679  
4680  	last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
4681  
4682  	/* now the directory is empty */
4683  	ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
4684  				 &fname.disk_name);
4685  	if (!ret) {
4686  		btrfs_i_size_write(BTRFS_I(inode), 0);
4687  		/*
4688  		 * Propagate the last_unlink_trans value of the deleted dir to
4689  		 * its parent directory. This is to prevent an unrecoverable
4690  		 * log tree in the case we do something like this:
4691  		 * 1) create dir foo
4692  		 * 2) create snapshot under dir foo
4693  		 * 3) delete the snapshot
4694  		 * 4) rmdir foo
4695  		 * 5) mkdir foo
4696  		 * 6) fsync foo or some file inside foo
4697  		 */
4698  		if (last_unlink_trans >= trans->transid)
4699  			BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
4700  	}
4701  out:
4702  	btrfs_end_transaction(trans);
4703  out_notrans:
4704  	btrfs_btree_balance_dirty(fs_info);
4705  	fscrypt_free_filename(&fname);
4706  
4707  	return ret;
4708  }
4709  
4710  /*
4711   * Read, zero a chunk and write a block.
4712   *
4713   * @inode - inode that we're zeroing
4714   * @from - the offset to start zeroing
4715   * @len - the length to zero, 0 to zero the entire range relative to the
4716   *	offset
4717   * @front - zero up to the offset instead of from the offset onwards
4718   *
4719   * This will find the block for the "from" offset, COW it and zero the
4720   * part we want to zero.  This is used with truncate and hole punching.
4721   */
4722  int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
4723  			 int front)
4724  {
4725  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
4726  	struct address_space *mapping = inode->vfs_inode.i_mapping;
4727  	struct extent_io_tree *io_tree = &inode->io_tree;
4728  	struct btrfs_ordered_extent *ordered;
4729  	struct extent_state *cached_state = NULL;
4730  	struct extent_changeset *data_reserved = NULL;
4731  	bool only_release_metadata = false;
4732  	u32 blocksize = fs_info->sectorsize;
4733  	pgoff_t index = from >> PAGE_SHIFT;
4734  	unsigned offset = from & (blocksize - 1);
4735  	struct folio *folio;
4736  	gfp_t mask = btrfs_alloc_write_mask(mapping);
4737  	size_t write_bytes = blocksize;
4738  	int ret = 0;
4739  	u64 block_start;
4740  	u64 block_end;
4741  
4742  	if (IS_ALIGNED(offset, blocksize) &&
4743  	    (!len || IS_ALIGNED(len, blocksize)))
4744  		goto out;
4745  
4746  	block_start = round_down(from, blocksize);
4747  	block_end = block_start + blocksize - 1;
4748  
4749  	ret = btrfs_check_data_free_space(inode, &data_reserved, block_start,
4750  					  blocksize, false);
4751  	if (ret < 0) {
4752  		if (btrfs_check_nocow_lock(inode, block_start, &write_bytes, false) > 0) {
4753  			/* For nocow case, no need to reserve data space */
4754  			only_release_metadata = true;
4755  		} else {
4756  			goto out;
4757  		}
4758  	}
4759  	ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize, false);
4760  	if (ret < 0) {
4761  		if (!only_release_metadata)
4762  			btrfs_free_reserved_data_space(inode, data_reserved,
4763  						       block_start, blocksize);
4764  		goto out;
4765  	}
4766  again:
4767  	folio = __filemap_get_folio(mapping, index,
4768  				    FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask);
4769  	if (IS_ERR(folio)) {
4770  		btrfs_delalloc_release_space(inode, data_reserved, block_start,
4771  					     blocksize, true);
4772  		btrfs_delalloc_release_extents(inode, blocksize);
4773  		ret = -ENOMEM;
4774  		goto out;
4775  	}
4776  
4777  	if (!folio_test_uptodate(folio)) {
4778  		ret = btrfs_read_folio(NULL, folio);
4779  		folio_lock(folio);
4780  		if (folio->mapping != mapping) {
4781  			folio_unlock(folio);
4782  			folio_put(folio);
4783  			goto again;
4784  		}
4785  		if (!folio_test_uptodate(folio)) {
4786  			ret = -EIO;
4787  			goto out_unlock;
4788  		}
4789  	}
4790  
4791  	/*
4792  	 * We unlock the folio after the io is completed and then re-lock it
4793  	 * above.  release_folio() could have come in between that and cleared
4794  	 * folio private, but left the folio in the mapping.  Set the folio
4795  	 * mapped here to make sure it's properly set for the subpage stuff.
4796  	 */
4797  	ret = set_folio_extent_mapped(folio);
4798  	if (ret < 0)
4799  		goto out_unlock;
4800  
4801  	folio_wait_writeback(folio);
4802  
4803  	lock_extent(io_tree, block_start, block_end, &cached_state);
4804  
4805  	ordered = btrfs_lookup_ordered_extent(inode, block_start);
4806  	if (ordered) {
4807  		unlock_extent(io_tree, block_start, block_end, &cached_state);
4808  		folio_unlock(folio);
4809  		folio_put(folio);
4810  		btrfs_start_ordered_extent(ordered);
4811  		btrfs_put_ordered_extent(ordered);
4812  		goto again;
4813  	}
4814  
4815  	clear_extent_bit(&inode->io_tree, block_start, block_end,
4816  			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
4817  			 &cached_state);
4818  
4819  	ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
4820  					&cached_state);
4821  	if (ret) {
4822  		unlock_extent(io_tree, block_start, block_end, &cached_state);
4823  		goto out_unlock;
4824  	}
4825  
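	/*
	 * Zero the requested part of the block in the page cache: the bytes
	 * before @from when @front is set, otherwise @len bytes (or the rest
	 * of the block when @len is 0) starting at @from.
	 */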
4826  	if (offset != blocksize) {
4827  		if (!len)
4828  			len = blocksize - offset;
4829  		if (front)
4830  			folio_zero_range(folio, block_start - folio_pos(folio),
4831  					 offset);
4832  		else
4833  			folio_zero_range(folio,
4834  					 (block_start - folio_pos(folio)) + offset,
4835  					 len);
4836  	}
4837  	btrfs_folio_clear_checked(fs_info, folio, block_start,
4838  				  block_end + 1 - block_start);
4839  	btrfs_folio_set_dirty(fs_info, folio, block_start,
4840  			      block_end + 1 - block_start);
4841  	unlock_extent(io_tree, block_start, block_end, &cached_state);
4842  
4843  	if (only_release_metadata)
4844  		set_extent_bit(&inode->io_tree, block_start, block_end,
4845  			       EXTENT_NORESERVE, NULL);
4846  
4847  out_unlock:
4848  	if (ret) {
4849  		if (only_release_metadata)
4850  			btrfs_delalloc_release_metadata(inode, blocksize, true);
4851  		else
4852  			btrfs_delalloc_release_space(inode, data_reserved,
4853  					block_start, blocksize, true);
4854  	}
4855  	btrfs_delalloc_release_extents(inode, blocksize);
4856  	folio_unlock(folio);
4857  	folio_put(folio);
4858  out:
4859  	if (only_release_metadata)
4860  		btrfs_check_nocow_unlock(inode);
4861  	extent_changeset_free(data_reserved);
4862  	return ret;
4863  }
4864  
4865  static int maybe_insert_hole(struct btrfs_inode *inode, u64 offset, u64 len)
4866  {
4867  	struct btrfs_root *root = inode->root;
4868  	struct btrfs_fs_info *fs_info = root->fs_info;
4869  	struct btrfs_trans_handle *trans;
4870  	struct btrfs_drop_extents_args drop_args = { 0 };
4871  	int ret;
4872  
4873  	/*
4874  	 * If NO_HOLES is enabled, we don't need to do anything.
4875  	 * Later, up in the call chain, either btrfs_set_inode_last_sub_trans()
4876  	 * or btrfs_update_inode() will be called, which guarantee that the next
4877  	 * fsync will know this inode was changed and needs to be logged.
4878  	 */
4879  	if (btrfs_fs_incompat(fs_info, NO_HOLES))
4880  		return 0;
4881  
4882  	/*
4883  	 * 1 - for the one we're dropping
4884  	 * 1 - for the one we're adding
4885  	 * 1 - for updating the inode.
4886  	 */
4887  	trans = btrfs_start_transaction(root, 3);
4888  	if (IS_ERR(trans))
4889  		return PTR_ERR(trans);
4890  
4891  	drop_args.start = offset;
4892  	drop_args.end = offset + len;
4893  	drop_args.drop_cache = true;
4894  
4895  	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
4896  	if (ret) {
4897  		btrfs_abort_transaction(trans, ret);
4898  		btrfs_end_transaction(trans);
4899  		return ret;
4900  	}
4901  
4902  	ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset, len);
4903  	if (ret) {
4904  		btrfs_abort_transaction(trans, ret);
4905  	} else {
4906  		btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found);
4907  		btrfs_update_inode(trans, inode);
4908  	}
4909  	btrfs_end_transaction(trans);
4910  	return ret;
4911  }
4912  
4913  /*
4914   * This function puts in dummy file extents for the area we're creating a hole
4915   * for.  So if we are truncating this file to a larger size we need to insert
4916   * these file extents so that btrfs_get_extent will return an EXTENT_MAP_HOLE
4917   * for the range between oldsize and size.
4918   */
4919  int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
4920  {
4921  	struct btrfs_root *root = inode->root;
4922  	struct btrfs_fs_info *fs_info = root->fs_info;
4923  	struct extent_io_tree *io_tree = &inode->io_tree;
4924  	struct extent_map *em = NULL;
4925  	struct extent_state *cached_state = NULL;
4926  	u64 hole_start = ALIGN(oldsize, fs_info->sectorsize);
4927  	u64 block_end = ALIGN(size, fs_info->sectorsize);
4928  	u64 last_byte;
4929  	u64 cur_offset;
4930  	u64 hole_size;
4931  	int ret = 0;
4932  
4933  	/*
4934  	 * If our size started in the middle of a block we need to zero out the
4935  	 * rest of the block before we expand the i_size, otherwise we could
4936  	 * expose stale data.
4937  	 */
4938  	ret = btrfs_truncate_block(inode, oldsize, 0, 0);
4939  	if (ret)
4940  		return ret;
4941  
4942  	if (size <= hole_start)
4943  		return 0;
4944  
4945  	btrfs_lock_and_flush_ordered_range(inode, hole_start, block_end - 1,
4946  					   &cached_state);
4947  	cur_offset = hole_start;
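	/*
	 * Walk the range one extent map at a time. For every part that is not
	 * a preallocated extent, insert a file hole extent item (unless
	 * NO_HOLES is enabled) and a matching EXTENT_MAP_HOLE extent map.
	 */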
4948  	while (1) {
4949  		em = btrfs_get_extent(inode, NULL, cur_offset, block_end - cur_offset);
4950  		if (IS_ERR(em)) {
4951  			ret = PTR_ERR(em);
4952  			em = NULL;
4953  			break;
4954  		}
4955  		last_byte = min(extent_map_end(em), block_end);
4956  		last_byte = ALIGN(last_byte, fs_info->sectorsize);
4957  		hole_size = last_byte - cur_offset;
4958  
4959  		if (!(em->flags & EXTENT_FLAG_PREALLOC)) {
4960  			struct extent_map *hole_em;
4961  
4962  			ret = maybe_insert_hole(inode, cur_offset, hole_size);
4963  			if (ret)
4964  				break;
4965  
4966  			ret = btrfs_inode_set_file_extent_range(inode,
4967  							cur_offset, hole_size);
4968  			if (ret)
4969  				break;
4970  
4971  			hole_em = alloc_extent_map();
4972  			if (!hole_em) {
4973  				btrfs_drop_extent_map_range(inode, cur_offset,
4974  						    cur_offset + hole_size - 1,
4975  						    false);
4976  				btrfs_set_inode_full_sync(inode);
4977  				goto next;
4978  			}
4979  			hole_em->start = cur_offset;
4980  			hole_em->len = hole_size;
4981  
4982  			hole_em->disk_bytenr = EXTENT_MAP_HOLE;
4983  			hole_em->disk_num_bytes = 0;
4984  			hole_em->ram_bytes = hole_size;
4985  			hole_em->generation = btrfs_get_fs_generation(fs_info);
4986  
4987  			ret = btrfs_replace_extent_map_range(inode, hole_em, true);
4988  			free_extent_map(hole_em);
4989  		} else {
4990  			ret = btrfs_inode_set_file_extent_range(inode,
4991  							cur_offset, hole_size);
4992  			if (ret)
4993  				break;
4994  		}
4995  next:
4996  		free_extent_map(em);
4997  		em = NULL;
4998  		cur_offset = last_byte;
4999  		if (cur_offset >= block_end)
5000  			break;
5001  	}
5002  	free_extent_map(em);
5003  	unlock_extent(io_tree, hole_start, block_end - 1, &cached_state);
5004  	return ret;
5005  }
5006  
5007  static int btrfs_setsize(struct inode *inode, struct iattr *attr)
5008  {
5009  	struct btrfs_root *root = BTRFS_I(inode)->root;
5010  	struct btrfs_trans_handle *trans;
5011  	loff_t oldsize = i_size_read(inode);
5012  	loff_t newsize = attr->ia_size;
5013  	int mask = attr->ia_valid;
5014  	int ret;
5015  
5016  	/*
5017  	 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
5018  	 * special case where we need to update the times despite not having
5019  	 * these flags set.  For all other operations the VFS set these flags
5020  	 * explicitly if it wants a timestamp update.
5021  	 */
5022  	if (newsize != oldsize) {
5023  		inode_inc_iversion(inode);
5024  		if (!(mask & (ATTR_CTIME | ATTR_MTIME))) {
5025  			inode_set_mtime_to_ts(inode,
5026  					      inode_set_ctime_current(inode));
5027  		}
5028  	}
5029  
5030  	if (newsize > oldsize) {
5031  		/*
5032  		 * Don't do an expanding truncate while snapshotting is ongoing.
5033  		 * This is to ensure the snapshot captures a fully consistent
5034  		 * state of this file - if the snapshot captures this expanding
5035  		 * truncation, it must capture all writes that happened before
5036  		 * this truncation.
5037  		 */
5038  		btrfs_drew_write_lock(&root->snapshot_lock);
5039  		ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, newsize);
5040  		if (ret) {
5041  			btrfs_drew_write_unlock(&root->snapshot_lock);
5042  			return ret;
5043  		}
5044  
5045  		trans = btrfs_start_transaction(root, 1);
5046  		if (IS_ERR(trans)) {
5047  			btrfs_drew_write_unlock(&root->snapshot_lock);
5048  			return PTR_ERR(trans);
5049  		}
5050  
5051  		i_size_write(inode, newsize);
5052  		btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
5053  		pagecache_isize_extended(inode, oldsize, newsize);
5054  		ret = btrfs_update_inode(trans, BTRFS_I(inode));
5055  		btrfs_drew_write_unlock(&root->snapshot_lock);
5056  		btrfs_end_transaction(trans);
5057  	} else {
5058  		struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
5059  
5060  		if (btrfs_is_zoned(fs_info)) {
5061  			ret = btrfs_wait_ordered_range(BTRFS_I(inode),
5062  					ALIGN(newsize, fs_info->sectorsize),
5063  					(u64)-1);
5064  			if (ret)
5065  				return ret;
5066  		}
5067  
5068  		/*
5069  		 * We're truncating a file that used to have good data down to
5070  		 * zero. Make sure any new writes to the file get on disk
5071  		 * on close.
5072  		 */
5073  		if (newsize == 0)
5074  			set_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
5075  				&BTRFS_I(inode)->runtime_flags);
5076  
5077  		truncate_setsize(inode, newsize);
5078  
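		/* Wait for any in-flight direct IO before truncating items on disk. */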
5079  		inode_dio_wait(inode);
5080  
5081  		ret = btrfs_truncate(BTRFS_I(inode), newsize == oldsize);
5082  		if (ret && inode->i_nlink) {
5083  			int err;
5084  
5085  			/*
5086  			 * Truncate failed, so fix up the in-memory size. We
5087  			 * adjusted disk_i_size down as we removed extents, so
5088  			 * wait for disk_i_size to be stable and then update the
5089  			 * in-memory size to match.
5090  			 */
5091  			err = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
5092  			if (err)
5093  				return err;
5094  			i_size_write(inode, BTRFS_I(inode)->disk_i_size);
5095  		}
5096  	}
5097  
5098  	return ret;
5099  }
5100  
5101  static int btrfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
5102  			 struct iattr *attr)
5103  {
5104  	struct inode *inode = d_inode(dentry);
5105  	struct btrfs_root *root = BTRFS_I(inode)->root;
5106  	int err;
5107  
5108  	if (btrfs_root_readonly(root))
5109  		return -EROFS;
5110  
5111  	err = setattr_prepare(idmap, dentry, attr);
5112  	if (err)
5113  		return err;
5114  
5115  	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
5116  		err = btrfs_setsize(inode, attr);
5117  		if (err)
5118  			return err;
5119  	}
5120  
5121  	if (attr->ia_valid) {
5122  		setattr_copy(idmap, inode, attr);
5123  		inode_inc_iversion(inode);
5124  		err = btrfs_dirty_inode(BTRFS_I(inode));
5125  
5126  		if (!err && attr->ia_valid & ATTR_MODE)
5127  			err = posix_acl_chmod(idmap, dentry, inode->i_mode);
5128  	}
5129  
5130  	return err;
5131  }
5132  
5133  /*
5134   * While truncating the inode pages during eviction, we get the VFS
5135   * calling btrfs_invalidate_folio() against each folio of the inode. This
5136   * is slow because the calls to btrfs_invalidate_folio() result in a
5137   * huge amount of calls to lock_extent() and clear_extent_bit(),
5138   * which keep merging and splitting extent_state structures over and over,
5139   * wasting lots of time.
5140   *
5141   * Therefore if the inode is being evicted, let btrfs_invalidate_folio()
5142   * skip all those expensive operations on a per folio basis and do only
5143   * the ordered io finishing, while we release here the extent_map and
5144   * extent_state structures, without the excessive merging and splitting.
5145   */
5146  static void evict_inode_truncate_pages(struct inode *inode)
5147  {
5148  	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
5149  	struct rb_node *node;
5150  
5151  	ASSERT(inode->i_state & I_FREEING);
5152  	truncate_inode_pages_final(&inode->i_data);
5153  
5154  	btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
5155  
5156  	/*
5157  	 * Keep looping until we have no more ranges in the io tree.
5158  	 * We can have ongoing bios started by readahead that have
5159  	 * their endio callback (extent_io.c:end_bio_extent_readpage)
5160  	 * still in progress (unlocked the pages in the bio but have not yet
5161  	 * unlocked the ranges in the io tree). Therefore this means some
5162  	 * ranges can still be locked and eviction started because before
5163  	 * submitting those bios, which are executed by a separate task (work
5164  	 * queue kthread), inode references (inode->i_count) were not taken
5165  	 * (which would be dropped in the end io callback of each bio).
5166  	 * Therefore here we effectively end up waiting for those bios and
5167  	 * anyone else holding locked ranges without having bumped the inode's
5168  	 * reference count - if we don't do it, when they access the inode's
5169  	 * io_tree to unlock a range it may be too late, leading to a
5170  	 * use-after-free issue.
5171  	 */
5172  	spin_lock(&io_tree->lock);
5173  	while (!RB_EMPTY_ROOT(&io_tree->state)) {
5174  		struct extent_state *state;
5175  		struct extent_state *cached_state = NULL;
5176  		u64 start;
5177  		u64 end;
5178  		unsigned state_flags;
5179  
5180  		node = rb_first(&io_tree->state);
5181  		state = rb_entry(node, struct extent_state, rb_node);
5182  		start = state->start;
5183  		end = state->end;
5184  		state_flags = state->state;
5185  		spin_unlock(&io_tree->lock);
5186  
5187  		lock_extent(io_tree, start, end, &cached_state);
5188  
5189  		/*
5190  		 * If the range still has the DELALLOC flag, the extent didn't
5191  		 * reach disk, and its reserved space won't be freed by delayed_ref.
5192  		 * So we need to free its reserved space here.
5193  		 * (Refer to comment in btrfs_invalidate_folio, case 2)
5194  		 *
5195  		 * Note, end is the bytenr of last byte, so we need + 1 here.
5196  		 */
5197  		if (state_flags & EXTENT_DELALLOC)
5198  			btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start,
5199  					       end - start + 1, NULL);
5200  
5201  		clear_extent_bit(io_tree, start, end,
5202  				 EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING,
5203  				 &cached_state);
5204  
5205  		cond_resched();
5206  		spin_lock(&io_tree->lock);
5207  	}
5208  	spin_unlock(&io_tree->lock);
5209  }
5210  
5211  static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
5212  							struct btrfs_block_rsv *rsv)
5213  {
5214  	struct btrfs_fs_info *fs_info = root->fs_info;
5215  	struct btrfs_trans_handle *trans;
5216  	u64 delayed_refs_extra = btrfs_calc_delayed_ref_bytes(fs_info, 1);
5217  	int ret;
5218  
5219  	/*
5220  	 * Eviction should be taking place somewhere safe with respect to our
5221  	 * delayed iputs.  However the normal flushing code will run delayed
5222  	 * iputs, so we cannot use FLUSH_ALL, otherwise we'll deadlock.
5223  	 *
5224  	 * We reserve the delayed_refs_extra here again because we can't use
5225  	 * btrfs_start_transaction(root, 0) for the same deadlocky reason as
5226  	 * above.  We reserve our extra bit here because we generate a ton of
5227  	 * delayed refs activity by truncating.
5228  	 *
5229  	 * BTRFS_RESERVE_FLUSH_EVICT will steal from the global_rsv if it can,
5230  	 * if we fail to make this reservation we can re-try without the
5231  	 * delayed_refs_extra so we can make some forward progress.
5232  	 */
5233  	ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size + delayed_refs_extra,
5234  				     BTRFS_RESERVE_FLUSH_EVICT);
5235  	if (ret) {
5236  		ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size,
5237  					     BTRFS_RESERVE_FLUSH_EVICT);
5238  		if (ret) {
5239  			btrfs_warn(fs_info,
5240  				   "could not allocate space for delete; will truncate on mount");
5241  			return ERR_PTR(-ENOSPC);
5242  		}
5243  		delayed_refs_extra = 0;
5244  	}
5245  
5246  	trans = btrfs_join_transaction(root);
5247  	if (IS_ERR(trans))
5248  		return trans;
5249  
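	/*
	 * Hand the extra reservation over to the transaction's block rsv so it
	 * can cover the delayed ref updates generated by the truncation.
	 */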
5250  	if (delayed_refs_extra) {
5251  		trans->block_rsv = &fs_info->trans_block_rsv;
5252  		trans->bytes_reserved = delayed_refs_extra;
5253  		btrfs_block_rsv_migrate(rsv, trans->block_rsv,
5254  					delayed_refs_extra, true);
5255  	}
5256  	return trans;
5257  }
5258  
5259  void btrfs_evict_inode(struct inode *inode)
5260  {
5261  	struct btrfs_fs_info *fs_info;
5262  	struct btrfs_trans_handle *trans;
5263  	struct btrfs_root *root = BTRFS_I(inode)->root;
5264  	struct btrfs_block_rsv *rsv = NULL;
5265  	int ret;
5266  
5267  	trace_btrfs_inode_evict(inode);
5268  
5269  	if (!root) {
5270  		fsverity_cleanup_inode(inode);
5271  		clear_inode(inode);
5272  		return;
5273  	}
5274  
5275  	fs_info = inode_to_fs_info(inode);
5276  	evict_inode_truncate_pages(inode);
5277  
5278  	if (inode->i_nlink &&
5279  	    ((btrfs_root_refs(&root->root_item) != 0 &&
5280  	      btrfs_root_id(root) != BTRFS_ROOT_TREE_OBJECTID) ||
5281  	     btrfs_is_free_space_inode(BTRFS_I(inode))))
5282  		goto out;
5283  
5284  	if (is_bad_inode(inode))
5285  		goto out;
5286  
5287  	if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
5288  		goto out;
5289  
5290  	if (inode->i_nlink > 0) {
5291  		BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
5292  		       btrfs_root_id(root) != BTRFS_ROOT_TREE_OBJECTID);
5293  		goto out;
5294  	}
5295  
5296  	/*
5297  	 * This makes sure the inode item in the tree is uptodate and the
5298  	 * space for the inode update is released.
5299  	 */
5300  	ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
5301  	if (ret)
5302  		goto out;
5303  
5304  	/*
5305  	 * This drops any pending insert or delete operations we have for this
5306  	 * inode.  We could have a delayed dir index deletion queued up, but
5307  	 * we're removing the inode completely so that'll be taken care of in
5308  	 * the truncate.
5309  	 */
5310  	btrfs_kill_delayed_inode_items(BTRFS_I(inode));
5311  
5312  	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
5313  	if (!rsv)
5314  		goto out;
5315  	rsv->size = btrfs_calc_metadata_size(fs_info, 1);
5316  	rsv->failfast = true;
5317  
5318  	btrfs_i_size_write(BTRFS_I(inode), 0);
5319  
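	/*
	 * Truncate away the inode's items in chunks. Each iteration refills
	 * the temporary reservation and joins a transaction; -ENOSPC and
	 * -EAGAIN simply mean we need another pass.
	 */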
5320  	while (1) {
5321  		struct btrfs_truncate_control control = {
5322  			.inode = BTRFS_I(inode),
5323  			.ino = btrfs_ino(BTRFS_I(inode)),
5324  			.new_size = 0,
5325  			.min_type = 0,
5326  		};
5327  
5328  		trans = evict_refill_and_join(root, rsv);
5329  		if (IS_ERR(trans))
5330  			goto out;
5331  
5332  		trans->block_rsv = rsv;
5333  
5334  		ret = btrfs_truncate_inode_items(trans, root, &control);
5335  		trans->block_rsv = &fs_info->trans_block_rsv;
5336  		btrfs_end_transaction(trans);
5337  		/*
5338  		 * We have not added new delayed items for our inode after we
5339  		 * have flushed its delayed items, so no need to throttle on
5340  		 * delayed items. However we have modified extent buffers.
5341  		 */
5342  		btrfs_btree_balance_dirty_nodelay(fs_info);
5343  		if (ret && ret != -ENOSPC && ret != -EAGAIN)
5344  			goto out;
5345  		else if (!ret)
5346  			break;
5347  	}
5348  
5349  	/*
5350  	 * Errors here aren't a big deal, it just means we leave orphan items in
5351  	 * the tree. They will be cleaned up on the next mount. If the inode
5352  	 * number gets reused, cleanup deletes the orphan item without doing
5353  	 * anything, and unlink reuses the existing orphan item.
5354  	 *
5355  	 * If it turns out that we are dropping too many of these, we might want
5356  	 * to add a mechanism for retrying these after a commit.
5357  	 */
5358  	trans = evict_refill_and_join(root, rsv);
5359  	if (!IS_ERR(trans)) {
5360  		trans->block_rsv = rsv;
5361  		btrfs_orphan_del(trans, BTRFS_I(inode));
5362  		trans->block_rsv = &fs_info->trans_block_rsv;
5363  		btrfs_end_transaction(trans);
5364  	}
5365  
5366  out:
5367  	btrfs_free_block_rsv(fs_info, rsv);
5368  	/*
5369  	 * If we didn't successfully delete, the orphan item will still be in
5370  	 * the tree and we'll retry on the next mount. Again, we might also want
5371  	 * to retry these periodically in the future.
5372  	 */
5373  	btrfs_remove_delayed_node(BTRFS_I(inode));
5374  	fsverity_cleanup_inode(inode);
5375  	clear_inode(inode);
5376  }
5377  
5378  /*
5379   * Return the key found in the dir entry in the location pointer, fill @type
5380   * with BTRFS_FT_*, and return 0.
5381   *
5382   * If no dir entries were found, returns -ENOENT.
5383   * If found a corrupted location in dir entry, returns -EUCLEAN.
5384   */
5385  static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry,
5386  			       struct btrfs_key *location, u8 *type)
5387  {
5388  	struct btrfs_dir_item *di;
5389  	struct btrfs_path *path;
5390  	struct btrfs_root *root = dir->root;
5391  	int ret = 0;
5392  	struct fscrypt_name fname;
5393  
5394  	path = btrfs_alloc_path();
5395  	if (!path)
5396  		return -ENOMEM;
5397  
5398  	ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname);
5399  	if (ret < 0)
5400  		goto out;
5401  	/*
5402  	 * fscrypt_setup_filename() should never return a positive value, but
5403  	 * gcc on sparc/parisc thinks it can, so assert that doesn't happen.
5404  	 */
5405  	ASSERT(ret == 0);
5406  
5407  	/* This needs to handle no-key deletions later on */
5408  
5409  	di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir),
5410  				   &fname.disk_name, 0);
5411  	if (IS_ERR_OR_NULL(di)) {
5412  		ret = di ? PTR_ERR(di) : -ENOENT;
5413  		goto out;
5414  	}
5415  
5416  	btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
5417  	if (location->type != BTRFS_INODE_ITEM_KEY &&
5418  	    location->type != BTRFS_ROOT_ITEM_KEY) {
5419  		ret = -EUCLEAN;
5420  		btrfs_warn(root->fs_info,
5421  "%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))",
5422  			   __func__, fname.disk_name.name, btrfs_ino(dir),
5423  			   location->objectid, location->type, location->offset);
5424  	}
5425  	if (!ret)
5426  		*type = btrfs_dir_ftype(path->nodes[0], di);
5427  out:
5428  	fscrypt_free_filename(&fname);
5429  	btrfs_free_path(path);
5430  	return ret;
5431  }
5432  
5433  /*
5434   * When we hit a tree root in a directory, the btrfs part of the inode
5435   * needs to be changed to reflect the root directory of the tree root.  This
5436   * is kind of like crossing a mount point.
5437   */
5438  static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
5439  				    struct btrfs_inode *dir,
5440  				    struct dentry *dentry,
5441  				    struct btrfs_key *location,
5442  				    struct btrfs_root **sub_root)
5443  {
5444  	struct btrfs_path *path;
5445  	struct btrfs_root *new_root;
5446  	struct btrfs_root_ref *ref;
5447  	struct extent_buffer *leaf;
5448  	struct btrfs_key key;
5449  	int ret;
5450  	int err = 0;
5451  	struct fscrypt_name fname;
5452  
5453  	ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 0, &fname);
5454  	if (ret)
5455  		return ret;
5456  
5457  	path = btrfs_alloc_path();
5458  	if (!path) {
5459  		err = -ENOMEM;
5460  		goto out;
5461  	}
5462  
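	/*
	 * Look up the ROOT_REF item that links the parent root to the
	 * subvolume referenced by this dentry, and verify that its dirid and
	 * name match before crossing into the new root.
	 */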
5463  	err = -ENOENT;
5464  	key.objectid = btrfs_root_id(dir->root);
5465  	key.type = BTRFS_ROOT_REF_KEY;
5466  	key.offset = location->objectid;
5467  
5468  	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
5469  	if (ret) {
5470  		if (ret < 0)
5471  			err = ret;
5472  		goto out;
5473  	}
5474  
5475  	leaf = path->nodes[0];
5476  	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
5477  	if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) ||
5478  	    btrfs_root_ref_name_len(leaf, ref) != fname.disk_name.len)
5479  		goto out;
5480  
5481  	ret = memcmp_extent_buffer(leaf, fname.disk_name.name,
5482  				   (unsigned long)(ref + 1), fname.disk_name.len);
5483  	if (ret)
5484  		goto out;
5485  
5486  	btrfs_release_path(path);
5487  
5488  	new_root = btrfs_get_fs_root(fs_info, location->objectid, true);
5489  	if (IS_ERR(new_root)) {
5490  		err = PTR_ERR(new_root);
5491  		goto out;
5492  	}
5493  
5494  	*sub_root = new_root;
5495  	location->objectid = btrfs_root_dirid(&new_root->root_item);
5496  	location->type = BTRFS_INODE_ITEM_KEY;
5497  	location->offset = 0;
5498  	err = 0;
5499  out:
5500  	btrfs_free_path(path);
5501  	fscrypt_free_filename(&fname);
5502  	return err;
5503  }
5504  
5505  static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc)
5506  {
5507  	struct btrfs_root *root = inode->root;
5508  	struct btrfs_inode *existing;
5509  	const u64 ino = btrfs_ino(inode);
5510  	int ret;
5511  
5512  	if (inode_unhashed(&inode->vfs_inode))
5513  		return 0;
5514  
5515  	if (prealloc) {
5516  		ret = xa_reserve(&root->inodes, ino, GFP_NOFS);
5517  		if (ret)
5518  			return ret;
5519  	}
5520  
5521  	existing = xa_store(&root->inodes, ino, inode, GFP_ATOMIC);
5522  
5523  	if (xa_is_err(existing)) {
5524  		ret = xa_err(existing);
5525  		ASSERT(ret != -EINVAL);
5526  		ASSERT(ret != -ENOMEM);
5527  		return ret;
5528  	} else if (existing) {
5529  		WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING)));
5530  	}
5531  
5532  	return 0;
5533  }
5534  
5535  static void btrfs_del_inode_from_root(struct btrfs_inode *inode)
5536  {
5537  	struct btrfs_root *root = inode->root;
5538  	struct btrfs_inode *entry;
5539  	bool empty = false;
5540  
5541  	xa_lock(&root->inodes);
5542  	entry = __xa_erase(&root->inodes, btrfs_ino(inode));
5543  	if (entry == inode)
5544  		empty = xa_empty(&root->inodes);
5545  	xa_unlock(&root->inodes);
5546  
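	/*
	 * If this was the last inode of a root that is no longer referenced,
	 * recheck under the lock and queue the root for cleanup.
	 */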
5547  	if (empty && btrfs_root_refs(&root->root_item) == 0) {
5548  		xa_lock(&root->inodes);
5549  		empty = xa_empty(&root->inodes);
5550  		xa_unlock(&root->inodes);
5551  		if (empty)
5552  			btrfs_add_dead_root(root);
5553  	}
5554  }
5555  
5556  
5557  static int btrfs_init_locked_inode(struct inode *inode, void *p)
5558  {
5559  	struct btrfs_iget_args *args = p;
5560  
5561  	btrfs_set_inode_number(BTRFS_I(inode), args->ino);
5562  	BTRFS_I(inode)->root = btrfs_grab_root(args->root);
5563  
5564  	if (args->root && args->root == args->root->fs_info->tree_root &&
5565  	    args->ino != BTRFS_BTREE_INODE_OBJECTID)
5566  		set_bit(BTRFS_INODE_FREE_SPACE_INODE,
5567  			&BTRFS_I(inode)->runtime_flags);
5568  	return 0;
5569  }
5570  
5571  static int btrfs_find_actor(struct inode *inode, void *opaque)
5572  {
5573  	struct btrfs_iget_args *args = opaque;
5574  
5575  	return args->ino == btrfs_ino(BTRFS_I(inode)) &&
5576  		args->root == BTRFS_I(inode)->root;
5577  }
5578  
5579  static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root)
5580  {
5581  	struct inode *inode;
5582  	struct btrfs_iget_args args;
5583  	unsigned long hashval = btrfs_inode_hash(ino, root);
5584  
5585  	args.ino = ino;
5586  	args.root = root;
5587  
5588  	inode = iget5_locked_rcu(root->fs_info->sb, hashval, btrfs_find_actor,
5589  			     btrfs_init_locked_inode,
5590  			     (void *)&args);
5591  	return inode;
5592  }
5593  
5594  /*
5595   * Get an inode object given its inode number and corresponding root.
5596   * Path can be preallocated to prevent recursing back to iget through the
5597   * allocator. NULL is also valid but may require an additional allocation
5598   * later.
5599   */
5600  struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root,
5601  			      struct btrfs_path *path)
5602  {
5603  	struct inode *inode;
5604  	int ret;
5605  
5606  	inode = btrfs_iget_locked(ino, root);
5607  	if (!inode)
5608  		return ERR_PTR(-ENOMEM);
5609  
5610  	if (!(inode->i_state & I_NEW))
5611  		return inode;
5612  
5613  	ret = btrfs_read_locked_inode(inode, path);
5614  	/*
5615  	 * ret > 0 can come from btrfs_search_slot called by
5616  	 * btrfs_read_locked_inode(), which means the inode item was not found.
5617  	 */
5618  	if (ret > 0)
5619  		ret = -ENOENT;
5620  	if (ret < 0)
5621  		goto error;
5622  
5623  	ret = btrfs_add_inode_to_root(BTRFS_I(inode), true);
5624  	if (ret < 0)
5625  		goto error;
5626  
5627  	unlock_new_inode(inode);
5628  
5629  	return inode;
5630  error:
5631  	iget_failed(inode);
5632  	return ERR_PTR(ret);
5633  }
5634  
5635  struct inode *btrfs_iget(u64 ino, struct btrfs_root *root)
5636  {
5637  	return btrfs_iget_path(ino, root, NULL);
5638  }
5639  
5640  static struct inode *new_simple_dir(struct inode *dir,
5641  				    struct btrfs_key *key,
5642  				    struct btrfs_root *root)
5643  {
5644  	struct timespec64 ts;
5645  	struct inode *inode = new_inode(dir->i_sb);
5646  
5647  	if (!inode)
5648  		return ERR_PTR(-ENOMEM);
5649  
5650  	BTRFS_I(inode)->root = btrfs_grab_root(root);
5651  	BTRFS_I(inode)->ref_root_id = key->objectid;
5652  	set_bit(BTRFS_INODE_ROOT_STUB, &BTRFS_I(inode)->runtime_flags);
5653  	set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
5654  
5655  	btrfs_set_inode_number(BTRFS_I(inode), BTRFS_EMPTY_SUBVOL_DIR_OBJECTID);
5656  	/*
5657  	 * We only need lookup, the rest is read-only and there's no inode
5658  	 * associated with the dentry.
5659  	 */
5660  	inode->i_op = &simple_dir_inode_operations;
5661  	inode->i_opflags &= ~IOP_XATTR;
5662  	inode->i_fop = &simple_dir_operations;
5663  	inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
5664  
5665  	ts = inode_set_ctime_current(inode);
5666  	inode_set_mtime_to_ts(inode, ts);
5667  	inode_set_atime_to_ts(inode, inode_get_atime(dir));
5668  	BTRFS_I(inode)->i_otime_sec = ts.tv_sec;
5669  	BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec;
5670  
5671  	inode->i_uid = dir->i_uid;
5672  	inode->i_gid = dir->i_gid;
5673  
5674  	return inode;
5675  }
5676  
5677  static_assert(BTRFS_FT_UNKNOWN == FT_UNKNOWN);
5678  static_assert(BTRFS_FT_REG_FILE == FT_REG_FILE);
5679  static_assert(BTRFS_FT_DIR == FT_DIR);
5680  static_assert(BTRFS_FT_CHRDEV == FT_CHRDEV);
5681  static_assert(BTRFS_FT_BLKDEV == FT_BLKDEV);
5682  static_assert(BTRFS_FT_FIFO == FT_FIFO);
5683  static_assert(BTRFS_FT_SOCK == FT_SOCK);
5684  static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK);
5685  
5686  static inline u8 btrfs_inode_type(struct inode *inode)
5687  {
5688  	return fs_umode_to_ftype(inode->i_mode);
5689  }
5690  
5691  struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
5692  {
5693  	struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
5694  	struct inode *inode;
5695  	struct btrfs_root *root = BTRFS_I(dir)->root;
5696  	struct btrfs_root *sub_root = root;
5697  	struct btrfs_key location = { 0 };
5698  	u8 di_type = 0;
5699  	int ret = 0;
5700  
5701  	if (dentry->d_name.len > BTRFS_NAME_LEN)
5702  		return ERR_PTR(-ENAMETOOLONG);
5703  
5704  	ret = btrfs_inode_by_name(BTRFS_I(dir), dentry, &location, &di_type);
5705  	if (ret < 0)
5706  		return ERR_PTR(ret);
5707  
5708  	if (location.type == BTRFS_INODE_ITEM_KEY) {
5709  		inode = btrfs_iget(location.objectid, root);
5710  		if (IS_ERR(inode))
5711  			return inode;
5712  
5713  		/* Do extra check against inode mode with di_type */
5714  		if (btrfs_inode_type(inode) != di_type) {
5715  			btrfs_crit(fs_info,
5716  "inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u",
5717  				  inode->i_mode, btrfs_inode_type(inode),
5718  				  di_type);
5719  			iput(inode);
5720  			return ERR_PTR(-EUCLEAN);
5721  		}
5722  		return inode;
5723  	}
5724  
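	/*
	 * The dir item points at a subvolume root (BTRFS_ROOT_ITEM_KEY), so
	 * resolve it to the root directory inode of that subvolume. -ENOENT
	 * from the lookup means the root ref is gone, in which case we hand
	 * back a read-only placeholder directory (new_simple_dir()).
	 */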
5725  	ret = fixup_tree_root_location(fs_info, BTRFS_I(dir), dentry,
5726  				       &location, &sub_root);
5727  	if (ret < 0) {
5728  		if (ret != -ENOENT)
5729  			inode = ERR_PTR(ret);
5730  		else
5731  			inode = new_simple_dir(dir, &location, root);
5732  	} else {
5733  		inode = btrfs_iget(location.objectid, sub_root);
5734  		btrfs_put_root(sub_root);
5735  
5736  		if (IS_ERR(inode))
5737  			return inode;
5738  
5739  		down_read(&fs_info->cleanup_work_sem);
5740  		if (!sb_rdonly(inode->i_sb))
5741  			ret = btrfs_orphan_cleanup(sub_root);
5742  		up_read(&fs_info->cleanup_work_sem);
5743  		if (ret) {
5744  			iput(inode);
5745  			inode = ERR_PTR(ret);
5746  		}
5747  	}
5748  
5749  	return inode;
5750  }
5751  
5752  static int btrfs_dentry_delete(const struct dentry *dentry)
5753  {
5754  	struct btrfs_root *root;
5755  	struct inode *inode = d_inode(dentry);
5756  
5757  	if (!inode && !IS_ROOT(dentry))
5758  		inode = d_inode(dentry->d_parent);
5759  
5760  	if (inode) {
5761  		root = BTRFS_I(inode)->root;
5762  		if (btrfs_root_refs(&root->root_item) == 0)
5763  			return 1;
5764  
5765  		if (btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
5766  			return 1;
5767  	}
5768  	return 0;
5769  }
5770  
5771  static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
5772  				   unsigned int flags)
5773  {
5774  	struct inode *inode = btrfs_lookup_dentry(dir, dentry);
5775  
5776  	if (inode == ERR_PTR(-ENOENT))
5777  		inode = NULL;
5778  	return d_splice_alias(inode, dentry);
5779  }
5780  
5781  /*
5782   * Find the highest existing sequence number in a directory and then set the
5783   * in-memory index_cnt variable to the first free sequence number.
5784   */
5785  static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
5786  {
5787  	struct btrfs_root *root = inode->root;
5788  	struct btrfs_key key, found_key;
5789  	struct btrfs_path *path;
5790  	struct extent_buffer *leaf;
5791  	int ret;
5792  
5793  	key.objectid = btrfs_ino(inode);
5794  	key.type = BTRFS_DIR_INDEX_KEY;
5795  	key.offset = (u64)-1;
5796  
5797  	path = btrfs_alloc_path();
5798  	if (!path)
5799  		return -ENOMEM;
5800  
5801  	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5802  	if (ret < 0)
5803  		goto out;
5804  	/* FIXME: we should be able to handle this */
5805  	if (ret == 0)
5806  		goto out;
5807  	ret = 0;
5808  
5809  	if (path->slots[0] == 0) {
5810  		inode->index_cnt = BTRFS_DIR_START_INDEX;
5811  		goto out;
5812  	}
5813  
5814  	path->slots[0]--;
5815  
5816  	leaf = path->nodes[0];
5817  	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5818  
5819  	if (found_key.objectid != btrfs_ino(inode) ||
5820  	    found_key.type != BTRFS_DIR_INDEX_KEY) {
5821  		inode->index_cnt = BTRFS_DIR_START_INDEX;
5822  		goto out;
5823  	}
5824  
5825  	inode->index_cnt = found_key.offset + 1;
5826  out:
5827  	btrfs_free_path(path);
5828  	return ret;
5829  }
5830  
5831  static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index)
5832  {
5833  	int ret = 0;
5834  
5835  	btrfs_inode_lock(dir, 0);
5836  	if (dir->index_cnt == (u64)-1) {
5837  		ret = btrfs_inode_delayed_dir_index_count(dir);
5838  		if (ret) {
5839  			ret = btrfs_set_inode_index_count(dir);
5840  			if (ret)
5841  				goto out;
5842  		}
5843  	}
5844  
5845  	/* index_cnt is the index number of the next new entry, so decrement it. */
5846  	*index = dir->index_cnt - 1;
5847  out:
5848  	btrfs_inode_unlock(dir, 0);
5849  
5850  	return ret;
5851  }
5852  
5853  /*
5854   * All this infrastructure exists because dir_emit can fault, and we are holding
5855   * the tree lock when doing readdir.  For now just allocate a buffer and copy
5856   * our information into that, and then dir_emit from the buffer.  This is
5857   * similar to what NFS does, only we don't keep the buffer around in pagecache
5858   * because I'm afraid I'll mess that up.  Long term we need to make filldir do
5859   * copy_to_user_inatomic so we don't have to worry about page faulting under the
5860   * tree lock.
5861   */
5862  static int btrfs_opendir(struct inode *inode, struct file *file)
5863  {
5864  	struct btrfs_file_private *private;
5865  	u64 last_index;
5866  	int ret;
5867  
5868  	ret = btrfs_get_dir_last_index(BTRFS_I(inode), &last_index);
5869  	if (ret)
5870  		return ret;
5871  
5872  	private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL);
5873  	if (!private)
5874  		return -ENOMEM;
5875  	private->last_index = last_index;
5876  	private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
5877  	if (!private->filldir_buf) {
5878  		kfree(private);
5879  		return -ENOMEM;
5880  	}
5881  	file->private_data = private;
5882  	return 0;
5883  }
5884  
5885  static loff_t btrfs_dir_llseek(struct file *file, loff_t offset, int whence)
5886  {
5887  	struct btrfs_file_private *private = file->private_data;
5888  	int ret;
5889  
5890  	ret = btrfs_get_dir_last_index(BTRFS_I(file_inode(file)),
5891  				       &private->last_index);
5892  	if (ret)
5893  		return ret;
5894  
5895  	return generic_file_llseek(file, offset, whence);
5896  }
5897  
5898  struct dir_entry {
5899  	u64 ino;
5900  	u64 offset;
5901  	unsigned type;
5902  	int name_len;
5903  };
5904  
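/*
 * Each buffered entry is a struct dir_entry immediately followed by its name.
 * Walk the buffer and emit the entries until dir_emit() refuses one.
 */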
5905  static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx)
5906  {
5907  	while (entries--) {
5908  		struct dir_entry *entry = addr;
5909  		char *name = (char *)(entry + 1);
5910  
5911  		ctx->pos = get_unaligned(&entry->offset);
5912  		if (!dir_emit(ctx, name, get_unaligned(&entry->name_len),
5913  					 get_unaligned(&entry->ino),
5914  					 get_unaligned(&entry->type)))
5915  			return 1;
5916  		addr += sizeof(struct dir_entry) +
5917  			get_unaligned(&entry->name_len);
5918  		ctx->pos++;
5919  	}
5920  	return 0;
5921  }
5922  
5923  static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
5924  {
5925  	struct inode *inode = file_inode(file);
5926  	struct btrfs_root *root = BTRFS_I(inode)->root;
5927  	struct btrfs_file_private *private = file->private_data;
5928  	struct btrfs_dir_item *di;
5929  	struct btrfs_key key;
5930  	struct btrfs_key found_key;
5931  	struct btrfs_path *path;
5932  	void *addr;
5933  	LIST_HEAD(ins_list);
5934  	LIST_HEAD(del_list);
5935  	int ret;
5936  	char *name_ptr;
5937  	int name_len;
5938  	int entries = 0;
5939  	int total_len = 0;
5940  	bool put = false;
5941  	struct btrfs_key location;
5942  
5943  	if (!dir_emit_dots(file, ctx))
5944  		return 0;
5945  
5946  	path = btrfs_alloc_path();
5947  	if (!path)
5948  		return -ENOMEM;
5949  
5950  	addr = private->filldir_buf;
5951  	path->reada = READA_FORWARD;
5952  
5953  	put = btrfs_readdir_get_delayed_items(BTRFS_I(inode), private->last_index,
5954  					      &ins_list, &del_list);
5955  
5956  again:
5957  	key.type = BTRFS_DIR_INDEX_KEY;
5958  	key.offset = ctx->pos;
5959  	key.objectid = btrfs_ino(BTRFS_I(inode));
5960  
5961  	btrfs_for_each_slot(root, &key, &found_key, path, ret) {
5962  		struct dir_entry *entry;
5963  		struct extent_buffer *leaf = path->nodes[0];
5964  		u8 ftype;
5965  
5966  		if (found_key.objectid != key.objectid)
5967  			break;
5968  		if (found_key.type != BTRFS_DIR_INDEX_KEY)
5969  			break;
5970  		if (found_key.offset < ctx->pos)
5971  			continue;
5972  		if (found_key.offset > private->last_index)
5973  			break;
5974  		if (btrfs_should_delete_dir_index(&del_list, found_key.offset))
5975  			continue;
5976  		di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
5977  		name_len = btrfs_dir_name_len(leaf, di);
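		/*
		 * If the next entry would overflow the filldir buffer, flush
		 * the buffered entries to the caller and restart the search
		 * at the current position.
		 */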
5978  		if ((total_len + sizeof(struct dir_entry) + name_len) >=
5979  		    PAGE_SIZE) {
5980  			btrfs_release_path(path);
5981  			ret = btrfs_filldir(private->filldir_buf, entries, ctx);
5982  			if (ret)
5983  				goto nopos;
5984  			addr = private->filldir_buf;
5985  			entries = 0;
5986  			total_len = 0;
5987  			goto again;
5988  		}
5989  
5990  		ftype = btrfs_dir_flags_to_ftype(btrfs_dir_flags(leaf, di));
5991  		entry = addr;
5992  		name_ptr = (char *)(entry + 1);
5993  		read_extent_buffer(leaf, name_ptr,
5994  				   (unsigned long)(di + 1), name_len);
5995  		put_unaligned(name_len, &entry->name_len);
5996  		put_unaligned(fs_ftype_to_dtype(ftype), &entry->type);
5997  		btrfs_dir_item_key_to_cpu(leaf, di, &location);
5998  		put_unaligned(location.objectid, &entry->ino);
5999  		put_unaligned(found_key.offset, &entry->offset);
6000  		entries++;
6001  		addr += sizeof(struct dir_entry) + name_len;
6002  		total_len += sizeof(struct dir_entry) + name_len;
6003  	}
6004  	/* Catch error encountered during iteration */
6005  	if (ret < 0)
6006  		goto err;
6007  
6008  	btrfs_release_path(path);
6009  
6010  	ret = btrfs_filldir(private->filldir_buf, entries, ctx);
6011  	if (ret)
6012  		goto nopos;
6013  
6014  	ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
6015  	if (ret)
6016  		goto nopos;
6017  
6018  	/*
6019  	 * Stop new entries from being returned after we return the last
6020  	 * entry.
6021  	 *
6022  	 * New directory entries are assigned a strictly increasing
6023  	 * offset.  This means that new entries created during readdir
6024  	 * are *guaranteed* to be seen in the future by that readdir.
6025  	 * This has broken buggy programs which operate on names as
6026  	 * they're returned by readdir.  Until we re-use freed offsets
6027  	 * we have this hack to stop new entries from being returned
6028  	 * under the assumption that they'll never reach this huge
6029  	 * offset.
6030  	 *
6031  	 * This is being careful not to overflow 32bit loff_t unless the
6032  	 * last entry requires it because doing so has broken 32bit apps
6033  	 * in the past.
6034  	 */
6035  	if (ctx->pos >= INT_MAX)
6036  		ctx->pos = LLONG_MAX;
6037  	else
6038  		ctx->pos = INT_MAX;
6039  nopos:
6040  	ret = 0;
6041  err:
6042  	if (put)
6043  		btrfs_readdir_put_delayed_items(BTRFS_I(inode), &ins_list, &del_list);
6044  	btrfs_free_path(path);
6045  	return ret;
6046  }
6047  
6048  /*
6049   * This is somewhat expensive, updating the tree every time the
6050   * inode changes.  But, it is most likely to find the inode in cache.
6051   * FIXME: needs more benchmarking... there are no reasons other than performance
6052   * to keep or drop this code.
6053   */
6054  static int btrfs_dirty_inode(struct btrfs_inode *inode)
6055  {
6056  	struct btrfs_root *root = inode->root;
6057  	struct btrfs_fs_info *fs_info = root->fs_info;
6058  	struct btrfs_trans_handle *trans;
6059  	int ret;
6060  
6061  	if (test_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags))
6062  		return 0;
6063  
6064  	trans = btrfs_join_transaction(root);
6065  	if (IS_ERR(trans))
6066  		return PTR_ERR(trans);
6067  
6068  	ret = btrfs_update_inode(trans, inode);
6069  	if (ret == -ENOSPC || ret == -EDQUOT) {
6070  		/* whoops, lets try again with the full transaction */
6071  		btrfs_end_transaction(trans);
6072  		trans = btrfs_start_transaction(root, 1);
6073  		if (IS_ERR(trans))
6074  			return PTR_ERR(trans);
6075  
6076  		ret = btrfs_update_inode(trans, inode);
6077  	}
6078  	btrfs_end_transaction(trans);
6079  	if (inode->delayed_node)
6080  		btrfs_balance_delayed_items(fs_info);
6081  
6082  	return ret;
6083  }
6084  
6085  /*
6086   * This is a copy of file_update_time.  We need this so we can return error on
6087   * ENOSPC for updating the inode in the case of file write and mmap writes.
6088   */
6089  static int btrfs_update_time(struct inode *inode, int flags)
6090  {
6091  	struct btrfs_root *root = BTRFS_I(inode)->root;
6092  	bool dirty;
6093  
6094  	if (btrfs_root_readonly(root))
6095  		return -EROFS;
6096  
6097  	dirty = inode_update_timestamps(inode, flags);
6098  	return dirty ? btrfs_dirty_inode(BTRFS_I(inode)) : 0;
6099  }
6100  
6101  /*
6102   * Helper to find a free sequence number in a given directory.  The current
6103   * code is very simple, later versions will do smarter things in the btree.
6104   */
6105  int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index)
6106  {
6107  	int ret = 0;
6108  
6109  	if (dir->index_cnt == (u64)-1) {
6110  		ret = btrfs_inode_delayed_dir_index_count(dir);
6111  		if (ret) {
6112  			ret = btrfs_set_inode_index_count(dir);
6113  			if (ret)
6114  				return ret;
6115  		}
6116  	}
6117  
6118  	*index = dir->index_cnt;
6119  	dir->index_cnt++;
6120  
6121  	return ret;
6122  }
6123  
6124  static int btrfs_insert_inode_locked(struct inode *inode)
6125  {
6126  	struct btrfs_iget_args args;
6127  
6128  	args.ino = btrfs_ino(BTRFS_I(inode));
6129  	args.root = BTRFS_I(inode)->root;
6130  
6131  	return insert_inode_locked4(inode,
6132  		   btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
6133  		   btrfs_find_actor, &args);
6134  }
6135  
6136  int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args,
6137  			    unsigned int *trans_num_items)
6138  {
6139  	struct inode *dir = args->dir;
6140  	struct inode *inode = args->inode;
6141  	int ret;
6142  
6143  	if (!args->orphan) {
6144  		ret = fscrypt_setup_filename(dir, &args->dentry->d_name, 0,
6145  					     &args->fname);
6146  		if (ret)
6147  			return ret;
6148  	}
6149  
6150  	ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl, &args->acl);
6151  	if (ret) {
6152  		fscrypt_free_filename(&args->fname);
6153  		return ret;
6154  	}
6155  
6156  	/* 1 to add inode item */
6157  	*trans_num_items = 1;
6158  	/* 1 to add compression property */
6159  	if (BTRFS_I(dir)->prop_compress)
6160  		(*trans_num_items)++;
6161  	/* 1 to add default ACL xattr */
6162  	if (args->default_acl)
6163  		(*trans_num_items)++;
6164  	/* 1 to add access ACL xattr */
6165  	if (args->acl)
6166  		(*trans_num_items)++;
6167  #ifdef CONFIG_SECURITY
6168  	/* 1 to add LSM xattr */
6169  	if (dir->i_security)
6170  		(*trans_num_items)++;
6171  #endif
6172  	if (args->orphan) {
6173  		/* 1 to add orphan item */
6174  		(*trans_num_items)++;
6175  	} else {
6176  		/*
6177  		 * 1 to add dir item
6178  		 * 1 to add dir index
6179  		 * 1 to update parent inode item
6180  		 *
6181  		 * No need for 1 unit for the inode ref item because it is
6182  		 * inserted in a batch together with the inode item at
6183  		 * btrfs_create_new_inode().
6184  		 */
6185  		*trans_num_items += 3;
6186  	}
6187  	return 0;
6188  }
6189  
6190  void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args)
6191  {
6192  	posix_acl_release(args->acl);
6193  	posix_acl_release(args->default_acl);
6194  	fscrypt_free_filename(&args->fname);
6195  }
6196  
6197  /*
6198   * Inherit flags from the parent inode.
6199   *
6200   * Currently only the compression flags and the cow flags are inherited.
6201   */
6202  static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode *dir)
6203  {
6204  	unsigned int flags;
6205  
6206  	flags = dir->flags;
6207  
6208  	if (flags & BTRFS_INODE_NOCOMPRESS) {
6209  		inode->flags &= ~BTRFS_INODE_COMPRESS;
6210  		inode->flags |= BTRFS_INODE_NOCOMPRESS;
6211  	} else if (flags & BTRFS_INODE_COMPRESS) {
6212  		inode->flags &= ~BTRFS_INODE_NOCOMPRESS;
6213  		inode->flags |= BTRFS_INODE_COMPRESS;
6214  	}
6215  
6216  	if (flags & BTRFS_INODE_NODATACOW) {
6217  		inode->flags |= BTRFS_INODE_NODATACOW;
6218  		if (S_ISREG(inode->vfs_inode.i_mode))
6219  			inode->flags |= BTRFS_INODE_NODATASUM;
6220  	}
6221  
6222  	btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
6223  }
6224  
6225  int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
6226  			   struct btrfs_new_inode_args *args)
6227  {
6228  	struct timespec64 ts;
6229  	struct inode *dir = args->dir;
6230  	struct inode *inode = args->inode;
6231  	const struct fscrypt_str *name = args->orphan ? NULL : &args->fname.disk_name;
6232  	struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
6233  	struct btrfs_root *root;
6234  	struct btrfs_inode_item *inode_item;
6235  	struct btrfs_path *path;
6236  	u64 objectid;
6237  	struct btrfs_inode_ref *ref;
6238  	struct btrfs_key key[2];
6239  	u32 sizes[2];
6240  	struct btrfs_item_batch batch;
6241  	unsigned long ptr;
6242  	int ret;
6243  	bool xa_reserved = false;
6244  
6245  	path = btrfs_alloc_path();
6246  	if (!path)
6247  		return -ENOMEM;
6248  
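	/*
	 * When creating a subvolume the caller is expected to have already set
	 * the inode's root to the new subvolume's root; otherwise the new
	 * inode belongs to the same root as its parent directory.
	 */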
6249  	if (!args->subvol)
6250  		BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root);
6251  	root = BTRFS_I(inode)->root;
6252  
6253  	ret = btrfs_init_file_extent_tree(BTRFS_I(inode));
6254  	if (ret)
6255  		goto out;
6256  
6257  	ret = btrfs_get_free_objectid(root, &objectid);
6258  	if (ret)
6259  		goto out;
6260  	btrfs_set_inode_number(BTRFS_I(inode), objectid);
6261  
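	/*
	 * Reserve a slot in the root's inode xarray up front so that the later
	 * btrfs_add_inode_to_root() insertion cannot fail with -ENOMEM.
	 */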
6262  	ret = xa_reserve(&root->inodes, objectid, GFP_NOFS);
6263  	if (ret)
6264  		goto out;
6265  	xa_reserved = true;
6266  
6267  	if (args->orphan) {
6268  		/*
6269  		 * O_TMPFILE: set the link count to 0 so that from this point on we
6270  		 * fill in an inode item with the correct link count.
6271  		 */
6272  		set_nlink(inode, 0);
6273  	} else {
6274  		trace_btrfs_inode_request(dir);
6275  
6276  		ret = btrfs_set_inode_index(BTRFS_I(dir), &BTRFS_I(inode)->dir_index);
6277  		if (ret)
6278  			goto out;
6279  	}
6280  
6281  	if (S_ISDIR(inode->i_mode))
6282  		BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX;
6283  
6284  	BTRFS_I(inode)->generation = trans->transid;
6285  	inode->i_generation = BTRFS_I(inode)->generation;
6286  
6287  	/*
6288  	 * We don't have any capability xattrs set here yet, shortcut any
6289  	 * queries for the xattrs here.  If we add them later via the inode
6290  	 * security init path or any other path this flag will be cleared.
6291  	 */
6292  	set_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);
6293  
6294  	/*
6295  	 * Subvolumes don't inherit flags from their parent directory.
6296  	 * Originally this was probably by accident, but we probably can't
6297  	 * change it now without compatibility issues.
6298  	 */
6299  	if (!args->subvol)
6300  		btrfs_inherit_iflags(BTRFS_I(inode), BTRFS_I(dir));
6301  
6302  	if (S_ISREG(inode->i_mode)) {
6303  		if (btrfs_test_opt(fs_info, NODATASUM))
6304  			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
6305  		if (btrfs_test_opt(fs_info, NODATACOW))
6306  			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
6307  				BTRFS_INODE_NODATASUM;
6308  	}
6309  
6310  	ret = btrfs_insert_inode_locked(inode);
6311  	if (ret < 0) {
6312  		if (!args->orphan)
6313  			BTRFS_I(dir)->index_cnt--;
6314  		goto out;
6315  	}
6316  
6317  	/*
6318  	 * We could have gotten an inode number from somebody who was fsynced
6319  	 * and then removed in this same transaction, so let's just set full
6320  	 * sync since it will be a full sync anyway and this will blow away the
6321  	 * old info in the log.
6322  	 */
6323  	btrfs_set_inode_full_sync(BTRFS_I(inode));
6324  
6325  	key[0].objectid = objectid;
6326  	key[0].type = BTRFS_INODE_ITEM_KEY;
6327  	key[0].offset = 0;
6328  
6329  	sizes[0] = sizeof(struct btrfs_inode_item);
6330  
6331  	if (!args->orphan) {
6332  		/*
6333  		 * Start new inodes with an inode_ref. This is slightly more
6334  		 * efficient for small numbers of hard links since they will
6335  		 * be packed into one item. Extended refs will kick in if we
6336  		 * add more hard links than can fit in the ref item.
6337  		 */
6338  		key[1].objectid = objectid;
6339  		key[1].type = BTRFS_INODE_REF_KEY;
6340  		if (args->subvol) {
6341  			key[1].offset = objectid;
6342  			sizes[1] = 2 + sizeof(*ref);
6343  		} else {
6344  			key[1].offset = btrfs_ino(BTRFS_I(dir));
6345  			sizes[1] = name->len + sizeof(*ref);
6346  		}
6347  	}
6348  
6349  	batch.keys = &key[0];
6350  	batch.data_sizes = &sizes[0];
6351  	batch.total_data_size = sizes[0] + (args->orphan ? 0 : sizes[1]);
6352  	batch.nr = args->orphan ? 1 : 2;
6353  	ret = btrfs_insert_empty_items(trans, root, path, &batch);
6354  	if (ret != 0) {
6355  		btrfs_abort_transaction(trans, ret);
6356  		goto discard;
6357  	}
6358  
6359  	ts = simple_inode_init_ts(inode);
6360  	BTRFS_I(inode)->i_otime_sec = ts.tv_sec;
6361  	BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec;
6362  
6363  	/*
6364  	 * We're going to fill the inode item now, so at this point the inode
6365  	 * must be fully initialized.
6366  	 */
6367  
6368  	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
6369  				  struct btrfs_inode_item);
6370  	memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item,
6371  			     sizeof(*inode_item));
6372  	fill_inode_item(trans, path->nodes[0], inode_item, inode);
6373  
6374  	if (!args->orphan) {
6375  		ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
6376  				     struct btrfs_inode_ref);
6377  		ptr = (unsigned long)(ref + 1);
6378  		if (args->subvol) {
6379  			btrfs_set_inode_ref_name_len(path->nodes[0], ref, 2);
6380  			btrfs_set_inode_ref_index(path->nodes[0], ref, 0);
6381  			write_extent_buffer(path->nodes[0], "..", ptr, 2);
6382  		} else {
6383  			btrfs_set_inode_ref_name_len(path->nodes[0], ref,
6384  						     name->len);
6385  			btrfs_set_inode_ref_index(path->nodes[0], ref,
6386  						  BTRFS_I(inode)->dir_index);
6387  			write_extent_buffer(path->nodes[0], name->name, ptr,
6388  					    name->len);
6389  		}
6390  	}
6391  
6392  	btrfs_mark_buffer_dirty(trans, path->nodes[0]);
6393  	/*
6394  	 * We don't need the path anymore. Inheriting properties, adding ACLs,
6395  	 * security xattrs, the orphan item or the link will each result in
6396  	 * allocating yet another path, so just free ours now.
6397  	 */
6398  	btrfs_free_path(path);
6399  	path = NULL;
6400  
6401  	if (args->subvol) {
6402  		struct inode *parent;
6403  
6404  		/*
6405  		 * Subvolumes inherit properties from their parent subvolume,
6406  		 * not the directory they were created in.
6407  		 */
6408  		parent = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, BTRFS_I(dir)->root);
6409  		if (IS_ERR(parent)) {
6410  			ret = PTR_ERR(parent);
6411  		} else {
6412  			ret = btrfs_inode_inherit_props(trans, inode, parent);
6413  			iput(parent);
6414  		}
6415  	} else {
6416  		ret = btrfs_inode_inherit_props(trans, inode, dir);
6417  	}
6418  	if (ret) {
6419  		btrfs_err(fs_info,
6420  			  "error inheriting props for ino %llu (root %llu): %d",
6421  			  btrfs_ino(BTRFS_I(inode)), btrfs_root_id(root), ret);
6422  	}
6423  
6424  	/*
6425  	 * Subvolumes don't inherit ACLs or get passed to the LSM. This is
6426  	 * probably a bug.
6427  	 */
6428  	if (!args->subvol) {
6429  		ret = btrfs_init_inode_security(trans, args);
6430  		if (ret) {
6431  			btrfs_abort_transaction(trans, ret);
6432  			goto discard;
6433  		}
6434  	}
6435  
6436  	ret = btrfs_add_inode_to_root(BTRFS_I(inode), false);
6437  	if (WARN_ON(ret)) {
6438  		/* Shouldn't happen, we used xa_reserve() before. */
6439  		btrfs_abort_transaction(trans, ret);
6440  		goto discard;
6441  	}
6442  
6443  	trace_btrfs_inode_new(inode);
6444  	btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
6445  
6446  	btrfs_update_root_times(trans, root);
6447  
6448  	if (args->orphan) {
6449  		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
6450  	} else {
6451  		ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
6452  				     0, BTRFS_I(inode)->dir_index);
6453  	}
6454  	if (ret) {
6455  		btrfs_abort_transaction(trans, ret);
6456  		goto discard;
6457  	}
6458  
6459  	return 0;
6460  
6461  discard:
6462  	/*
6463  	 * discard_new_inode() calls iput(), but the caller owns the reference
6464  	 * to the inode.
6465  	 */
6466  	ihold(inode);
6467  	discard_new_inode(inode);
6468  out:
6469  	if (xa_reserved)
6470  		xa_release(&root->inodes, objectid);
6471  
6472  	btrfs_free_path(path);
6473  	return ret;
6474  }
6475  
6476  /*
6477   * Utility function to add 'inode' into 'parent_inode' with
6478   * a given name and a given sequence number.
6479   * If 'add_backref' is true, also insert a backref from the
6480   * inode to the parent directory.
6481   */
6482  int btrfs_add_link(struct btrfs_trans_handle *trans,
6483  		   struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
6484  		   const struct fscrypt_str *name, int add_backref, u64 index)
6485  {
6486  	int ret = 0;
6487  	struct btrfs_key key;
6488  	struct btrfs_root *root = parent_inode->root;
6489  	u64 ino = btrfs_ino(inode);
6490  	u64 parent_ino = btrfs_ino(parent_inode);
6491  
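	/*
	 * When linking a subvolume root (ino == BTRFS_FIRST_FREE_OBJECTID),
	 * the directory entry points at the subvolume's root key instead of
	 * an inode item in this root.
	 */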
6492  	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6493  		memcpy(&key, &inode->root->root_key, sizeof(key));
6494  	} else {
6495  		key.objectid = ino;
6496  		key.type = BTRFS_INODE_ITEM_KEY;
6497  		key.offset = 0;
6498  	}
6499  
6500  	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6501  		ret = btrfs_add_root_ref(trans, key.objectid,
6502  					 btrfs_root_id(root), parent_ino,
6503  					 index, name);
6504  	} else if (add_backref) {
6505  		ret = btrfs_insert_inode_ref(trans, root, name,
6506  					     ino, parent_ino, index);
6507  	}
6508  
6509  	/* Nothing to clean up yet */
6510  	if (ret)
6511  		return ret;
6512  
6513  	ret = btrfs_insert_dir_item(trans, name, parent_inode, &key,
6514  				    btrfs_inode_type(&inode->vfs_inode), index);
6515  	if (ret == -EEXIST || ret == -EOVERFLOW)
6516  		goto fail_dir_item;
6517  	else if (ret) {
6518  		btrfs_abort_transaction(trans, ret);
6519  		return ret;
6520  	}
6521  
6522  	btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
6523  			   name->len * 2);
6524  	inode_inc_iversion(&parent_inode->vfs_inode);
6525  	/*
6526  	 * If we are replaying a log tree, we do not want to update the mtime
6527  	 * and ctime of the parent directory with the current time, since the
6528  	 * log replay procedure is responsible for setting them to their correct
6529  	 * values (the ones it had when the fsync was done).
6530  	 */
6531  	if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags))
6532  		inode_set_mtime_to_ts(&parent_inode->vfs_inode,
6533  				      inode_set_ctime_current(&parent_inode->vfs_inode));
6534  
6535  	ret = btrfs_update_inode(trans, parent_inode);
6536  	if (ret)
6537  		btrfs_abort_transaction(trans, ret);
6538  	return ret;
6539  
6540  fail_dir_item:
6541  	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6542  		u64 local_index;
6543  		int err;
6544  		err = btrfs_del_root_ref(trans, key.objectid,
6545  					 btrfs_root_id(root), parent_ino,
6546  					 &local_index, name);
6547  		if (err)
6548  			btrfs_abort_transaction(trans, err);
6549  	} else if (add_backref) {
6550  		u64 local_index;
6551  		int err;
6552  
6553  		err = btrfs_del_inode_ref(trans, root, name, ino, parent_ino,
6554  					  &local_index);
6555  		if (err)
6556  			btrfs_abort_transaction(trans, err);
6557  	}
6558  
6559  	/* Return the original error code */
6560  	return ret;
6561  }
6562  
6563  static int btrfs_create_common(struct inode *dir, struct dentry *dentry,
6564  			       struct inode *inode)
6565  {
6566  	struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
6567  	struct btrfs_root *root = BTRFS_I(dir)->root;
6568  	struct btrfs_new_inode_args new_inode_args = {
6569  		.dir = dir,
6570  		.dentry = dentry,
6571  		.inode = inode,
6572  	};
6573  	unsigned int trans_num_items;
6574  	struct btrfs_trans_handle *trans;
6575  	int err;
6576  
6577  	err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
6578  	if (err)
6579  		goto out_inode;
6580  
6581  	trans = btrfs_start_transaction(root, trans_num_items);
6582  	if (IS_ERR(trans)) {
6583  		err = PTR_ERR(trans);
6584  		goto out_new_inode_args;
6585  	}
6586  
6587  	err = btrfs_create_new_inode(trans, &new_inode_args);
6588  	if (!err)
6589  		d_instantiate_new(dentry, inode);
6590  
6591  	btrfs_end_transaction(trans);
6592  	btrfs_btree_balance_dirty(fs_info);
6593  out_new_inode_args:
6594  	btrfs_new_inode_args_destroy(&new_inode_args);
6595  out_inode:
6596  	if (err)
6597  		iput(inode);
6598  	return err;
6599  }
6600  
6601  static int btrfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
6602  		       struct dentry *dentry, umode_t mode, dev_t rdev)
6603  {
6604  	struct inode *inode;
6605  
6606  	inode = new_inode(dir->i_sb);
6607  	if (!inode)
6608  		return -ENOMEM;
6609  	inode_init_owner(idmap, inode, dir, mode);
6610  	inode->i_op = &btrfs_special_inode_operations;
6611  	init_special_inode(inode, inode->i_mode, rdev);
6612  	return btrfs_create_common(dir, dentry, inode);
6613  }
6614  
6615  static int btrfs_create(struct mnt_idmap *idmap, struct inode *dir,
6616  			struct dentry *dentry, umode_t mode, bool excl)
6617  {
6618  	struct inode *inode;
6619  
6620  	inode = new_inode(dir->i_sb);
6621  	if (!inode)
6622  		return -ENOMEM;
6623  	inode_init_owner(idmap, inode, dir, mode);
6624  	inode->i_fop = &btrfs_file_operations;
6625  	inode->i_op = &btrfs_file_inode_operations;
6626  	inode->i_mapping->a_ops = &btrfs_aops;
6627  	return btrfs_create_common(dir, dentry, inode);
6628  }
6629  
6630  static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
6631  		      struct dentry *dentry)
6632  {
6633  	struct btrfs_trans_handle *trans = NULL;
6634  	struct btrfs_root *root = BTRFS_I(dir)->root;
6635  	struct inode *inode = d_inode(old_dentry);
6636  	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
6637  	struct fscrypt_name fname;
6638  	u64 index;
6639  	int err;
6640  	int drop_inode = 0;
6641  
6642  	/* Do not allow hard links across subvolumes, even on the same device. */
6643  	if (btrfs_root_id(root) != btrfs_root_id(BTRFS_I(inode)->root))
6644  		return -EXDEV;
6645  
6646  	if (inode->i_nlink >= BTRFS_LINK_MAX)
6647  		return -EMLINK;
6648  
6649  	err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname);
6650  	if (err)
6651  		goto fail;
6652  
6653  	err = btrfs_set_inode_index(BTRFS_I(dir), &index);
6654  	if (err)
6655  		goto fail;
6656  
6657  	/*
6658  	 * 2 items for inode and inode ref
6659  	 * 2 items for dir items
6660  	 * 1 item for parent inode
6661  	 * 1 item for orphan item deletion if O_TMPFILE
6662  	 */
6663  	trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6);
6664  	if (IS_ERR(trans)) {
6665  		err = PTR_ERR(trans);
6666  		trans = NULL;
6667  		goto fail;
6668  	}
6669  
6670  	/* There are several dir indexes for this inode, clear the cache. */
6671  	BTRFS_I(inode)->dir_index = 0ULL;
6672  	inc_nlink(inode);
6673  	inode_inc_iversion(inode);
6674  	inode_set_ctime_current(inode);
6675  	ihold(inode);
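	/*
	 * Make sure the next fsync of this inode copies all of its items to
	 * the log, since we are adding a new name for it.
	 */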
6676  	set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
6677  
6678  	err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
6679  			     &fname.disk_name, 1, index);
6680  
6681  	if (err) {
6682  		drop_inode = 1;
6683  	} else {
6684  		struct dentry *parent = dentry->d_parent;
6685  
6686  		err = btrfs_update_inode(trans, BTRFS_I(inode));
6687  		if (err)
6688  			goto fail;
6689  		if (inode->i_nlink == 1) {
6690  			/*
6691  			 * If the new hard link count is 1, it's a file created
6692  			 * with the open(2) O_TMPFILE flag.
6693  			 */
6694  			err = btrfs_orphan_del(trans, BTRFS_I(inode));
6695  			if (err)
6696  				goto fail;
6697  		}
6698  		d_instantiate(dentry, inode);
6699  		btrfs_log_new_name(trans, old_dentry, NULL, 0, parent);
6700  	}
6701  
6702  fail:
6703  	fscrypt_free_filename(&fname);
6704  	if (trans)
6705  		btrfs_end_transaction(trans);
6706  	if (drop_inode) {
6707  		inode_dec_link_count(inode);
6708  		iput(inode);
6709  	}
6710  	btrfs_btree_balance_dirty(fs_info);
6711  	return err;
6712  }
6713  
6714  static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
6715  		       struct dentry *dentry, umode_t mode)
6716  {
6717  	struct inode *inode;
6718  
6719  	inode = new_inode(dir->i_sb);
6720  	if (!inode)
6721  		return -ENOMEM;
6722  	inode_init_owner(idmap, inode, dir, S_IFDIR | mode);
6723  	inode->i_op = &btrfs_dir_inode_operations;
6724  	inode->i_fop = &btrfs_dir_file_operations;
6725  	return btrfs_create_common(dir, dentry, inode);
6726  }
6727  
6728  static noinline int uncompress_inline(struct btrfs_path *path,
6729  				      struct folio *folio,
6730  				      struct btrfs_file_extent_item *item)
6731  {
6732  	int ret;
6733  	struct extent_buffer *leaf = path->nodes[0];
6734  	char *tmp;
6735  	size_t max_size;
6736  	unsigned long inline_size;
6737  	unsigned long ptr;
6738  	int compress_type;
6739  
6740  	compress_type = btrfs_file_extent_compression(leaf, item);
6741  	max_size = btrfs_file_extent_ram_bytes(leaf, item);
6742  	inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]);
6743  	tmp = kmalloc(inline_size, GFP_NOFS);
6744  	if (!tmp)
6745  		return -ENOMEM;
6746  	ptr = btrfs_file_extent_inline_start(item);
6747  
6748  	read_extent_buffer(leaf, tmp, ptr, inline_size);
6749  
6750  	max_size = min_t(unsigned long, PAGE_SIZE, max_size);
6751  	ret = btrfs_decompress(compress_type, tmp, folio, 0, inline_size,
6752  			       max_size);
6753  
6754  	/*
6755  	 * decompression code contains a memset to fill in any space between the end
6756  	 * of the uncompressed data and the end of max_size in case the decompressed
6757  	 * data ends up shorter than ram_bytes.  That doesn't cover the hole between
6758  	 * the end of an inline extent and the beginning of the next block, so we
6759  	 * cover that region here.
6760  	 */
6761  
6762  	if (max_size < PAGE_SIZE)
6763  		folio_zero_range(folio, max_size, PAGE_SIZE - max_size);
6764  	kfree(tmp);
6765  	return ret;
6766  }
6767  
6768  static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path,
6769  			      struct folio *folio)
6770  {
6771  	struct btrfs_file_extent_item *fi;
6772  	void *kaddr;
6773  	size_t copy_size;
6774  
6775  	if (!folio || folio_test_uptodate(folio))
6776  		return 0;
6777  
6778  	ASSERT(folio_pos(folio) == 0);
6779  
6780  	fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
6781  			    struct btrfs_file_extent_item);
6782  	if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE)
6783  		return uncompress_inline(path, folio, fi);
6784  
6785  	copy_size = min_t(u64, PAGE_SIZE,
6786  			  btrfs_file_extent_ram_bytes(path->nodes[0], fi));
6787  	kaddr = kmap_local_folio(folio, 0);
6788  	read_extent_buffer(path->nodes[0], kaddr,
6789  			   btrfs_file_extent_inline_start(fi), copy_size);
6790  	kunmap_local(kaddr);
6791  	if (copy_size < PAGE_SIZE)
6792  		folio_zero_range(folio, copy_size, PAGE_SIZE - copy_size);
6793  	return 0;
6794  }
6795  
6796  /*
6797   * Lookup the first extent overlapping a range in a file.
6798   *
6799   * @inode:	file to search in
6800   * @folio:	folio to read extent data into if the extent is inline
6801   * @start:	file offset
6802   * @len:	length of range starting at @start
6803   *
6804   * Return the first &struct extent_map which overlaps the given range, reading
6805   * it from the B-tree and caching it if necessary. Note that there may be more
6806   * extents which overlap the given range after the returned extent_map.
6807   *
6808   * If @folio is not NULL and the extent is inline, this also reads the extent
6809   * data directly into the folio and marks the extent up to date in the io_tree.
6810   *
6811   * Return: ERR_PTR on error, non-NULL extent_map on success.
6812   */
6813  struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
6814  				    struct folio *folio, u64 start, u64 len)
6815  {
6816  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
6817  	int ret = 0;
6818  	u64 extent_start = 0;
6819  	u64 extent_end = 0;
6820  	u64 objectid = btrfs_ino(inode);
6821  	int extent_type = -1;
6822  	struct btrfs_path *path = NULL;
6823  	struct btrfs_root *root = inode->root;
6824  	struct btrfs_file_extent_item *item;
6825  	struct extent_buffer *leaf;
6826  	struct btrfs_key found_key;
6827  	struct extent_map *em = NULL;
6828  	struct extent_map_tree *em_tree = &inode->extent_tree;
6829  
6830  	read_lock(&em_tree->lock);
6831  	em = lookup_extent_mapping(em_tree, start, len);
6832  	read_unlock(&em_tree->lock);
6833  
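	/*
	 * A cached extent map is only reusable if it covers @start and, for an
	 * inline extent, only if the caller did not ask us to fill a folio
	 * (the inline data may not have been read into it yet).
	 */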
6834  	if (em) {
6835  		if (em->start > start || em->start + em->len <= start)
6836  			free_extent_map(em);
6837  		else if (em->disk_bytenr == EXTENT_MAP_INLINE && folio)
6838  			free_extent_map(em);
6839  		else
6840  			goto out;
6841  	}
6842  	em = alloc_extent_map();
6843  	if (!em) {
6844  		ret = -ENOMEM;
6845  		goto out;
6846  	}
6847  	em->start = EXTENT_MAP_HOLE;
6848  	em->disk_bytenr = EXTENT_MAP_HOLE;
6849  	em->len = (u64)-1;
6850  
6851  	path = btrfs_alloc_path();
6852  	if (!path) {
6853  		ret = -ENOMEM;
6854  		goto out;
6855  	}
6856  
6857  	/* Chances are we'll be called again, so go ahead and do readahead */
6858  	path->reada = READA_FORWARD;
6859  
6860  	/*
6861  	 * The same explanation in load_free_space_cache applies here as well,
6862  	 * we only read when we're loading the free space cache, and at that
6863  	 * point the commit_root has everything we need.
6864  	 */
6865  	if (btrfs_is_free_space_inode(inode)) {
6866  		path->search_commit_root = 1;
6867  		path->skip_locking = 1;
6868  	}
6869  
6870  	ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
6871  	if (ret < 0) {
6872  		goto out;
6873  	} else if (ret > 0) {
6874  		if (path->slots[0] == 0)
6875  			goto not_found;
6876  		path->slots[0]--;
6877  		ret = 0;
6878  	}
6879  
6880  	leaf = path->nodes[0];
6881  	item = btrfs_item_ptr(leaf, path->slots[0],
6882  			      struct btrfs_file_extent_item);
6883  	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6884  	if (found_key.objectid != objectid ||
6885  	    found_key.type != BTRFS_EXTENT_DATA_KEY) {
6886  		/*
6887  		 * If we backed up past the first extent, move forward and see if
6888  		 * there is an extent in front of us; otherwise we would report a
6889  		 * hole for our whole search range, which can cause problems.
6891  		 */
6892  		extent_end = start;
6893  		goto next;
6894  	}
6895  
6896  	extent_type = btrfs_file_extent_type(leaf, item);
6897  	extent_start = found_key.offset;
6898  	extent_end = btrfs_file_extent_end(path);
6899  	if (extent_type == BTRFS_FILE_EXTENT_REG ||
6900  	    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
6901  		/* Only regular file could have regular/prealloc extent */
6902  		if (!S_ISREG(inode->vfs_inode.i_mode)) {
6903  			ret = -EUCLEAN;
6904  			btrfs_crit(fs_info,
6905  		"regular/prealloc extent found for non-regular inode %llu",
6906  				   btrfs_ino(inode));
6907  			goto out;
6908  		}
6909  		trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
6910  						       extent_start);
6911  	} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
6912  		trace_btrfs_get_extent_show_fi_inline(inode, leaf, item,
6913  						      path->slots[0],
6914  						      extent_start);
6915  	}
6916  next:
6917  	if (start >= extent_end) {
6918  		path->slots[0]++;
6919  		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
6920  			ret = btrfs_next_leaf(root, path);
6921  			if (ret < 0)
6922  				goto out;
6923  			else if (ret > 0)
6924  				goto not_found;
6925  
6926  			leaf = path->nodes[0];
6927  		}
6928  		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6929  		if (found_key.objectid != objectid ||
6930  		    found_key.type != BTRFS_EXTENT_DATA_KEY)
6931  			goto not_found;
6932  		if (start + len <= found_key.offset)
6933  			goto not_found;
6934  		if (start > found_key.offset)
6935  			goto next;
6936  
6937  		/* New extent overlaps with existing one */
6938  		em->start = start;
6939  		em->len = found_key.offset - start;
6940  		em->disk_bytenr = EXTENT_MAP_HOLE;
6941  		goto insert;
6942  	}
6943  
6944  	btrfs_extent_item_to_extent_map(inode, path, item, em);
6945  
6946  	if (extent_type == BTRFS_FILE_EXTENT_REG ||
6947  	    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
6948  		goto insert;
6949  	} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
6950  		/*
6951  		 * Inline extent can only exist at file offset 0. This is
6952  		 * ensured by tree-checker and inline extent creation path.
6953  		 * Thus all members representing file offsets should be zero.
6954  		 */
6955  		ASSERT(extent_start == 0);
6956  		ASSERT(em->start == 0);
6957  
6958  		/*
6959  		 * btrfs_extent_item_to_extent_map() should have properly
6960  		 * initialized em members already.
6961  		 *
6962  		 * Other members are not utilized for inline extents.
6963  		 */
6964  		ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE);
6965  		ASSERT(em->len == fs_info->sectorsize);
6966  
6967  		ret = read_inline_extent(inode, path, folio);
6968  		if (ret < 0)
6969  			goto out;
6970  		goto insert;
6971  	}
6972  not_found:
6973  	em->start = start;
6974  	em->len = len;
6975  	em->disk_bytenr = EXTENT_MAP_HOLE;
6976  insert:
6977  	ret = 0;
6978  	btrfs_release_path(path);
6979  	if (em->start > start || extent_map_end(em) <= start) {
6980  		btrfs_err(fs_info,
6981  			  "bad extent! em: [%llu %llu] passed [%llu %llu]",
6982  			  em->start, em->len, start, len);
6983  		ret = -EIO;
6984  		goto out;
6985  	}
6986  
6987  	write_lock(&em_tree->lock);
6988  	ret = btrfs_add_extent_mapping(inode, &em, start, len);
6989  	write_unlock(&em_tree->lock);
6990  out:
6991  	btrfs_free_path(path);
6992  
6993  	trace_btrfs_get_extent(root, inode, em);
6994  
6995  	if (ret) {
6996  		free_extent_map(em);
6997  		return ERR_PTR(ret);
6998  	}
6999  	return em;
7000  }
7001  
7002  static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
7003  {
7004  	struct btrfs_block_group *block_group;
7005  	bool readonly = false;
7006  
7007  	block_group = btrfs_lookup_block_group(fs_info, bytenr);
7008  	if (!block_group || block_group->ro)
7009  		readonly = true;
7010  	if (block_group)
7011  		btrfs_put_block_group(block_group);
7012  	return readonly;
7013  }
7014  
7015  /*
7016   * Check if we can do nocow write into the range [@offset, @offset + @len)
7017   *
7018   * @offset:	File offset
7019   * @len:	The length to write, will be updated to the nocow writeable
7020   *		range
7021   * @file_extent: (optional) If not NULL, return the file extent found, with
7022   *		its disk bytenr, offset and lengths filled in
7023   * @nowait:	do a non-blocking search, failing instead of blocking on locks
7024   * @strict:	if true, omit optimizations that might force us into unnecessary
7025   *		cow. e.g., don't trust generation number.
7026   *
7027   * Return:
7028   * >0	and update @len if we can do nocow write
7029   *  0	if we can't do nocow write
7030   * <0	if error happened
7031   *
7032   * NOTE: This only checks the file extents, caller is responsible to wait for
7033   *	 any ordered extents.
7034   */
7035  noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
7036  			      struct btrfs_file_extent *file_extent,
7037  			      bool nowait, bool strict)
7038  {
7039  	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
7040  	struct can_nocow_file_extent_args nocow_args = { 0 };
7041  	struct btrfs_path *path;
7042  	int ret;
7043  	struct extent_buffer *leaf;
7044  	struct btrfs_root *root = BTRFS_I(inode)->root;
7045  	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
7046  	struct btrfs_file_extent_item *fi;
7047  	struct btrfs_key key;
7048  	int found_type;
7049  
7050  	path = btrfs_alloc_path();
7051  	if (!path)
7052  		return -ENOMEM;
7053  	path->nowait = nowait;
7054  
7055  	ret = btrfs_lookup_file_extent(NULL, root, path,
7056  			btrfs_ino(BTRFS_I(inode)), offset, 0);
7057  	if (ret < 0)
7058  		goto out;
7059  
7060  	if (ret == 1) {
7061  		if (path->slots[0] == 0) {
7062  			/* can't find the item, must cow */
7063  			ret = 0;
7064  			goto out;
7065  		}
7066  		path->slots[0]--;
7067  	}
7068  	ret = 0;
7069  	leaf = path->nodes[0];
7070  	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
7071  	if (key.objectid != btrfs_ino(BTRFS_I(inode)) ||
7072  	    key.type != BTRFS_EXTENT_DATA_KEY) {
7073  		/* not our file or wrong item type, must cow */
7074  		goto out;
7075  	}
7076  
7077  	if (key.offset > offset) {
7078  		/* Wrong offset, must cow */
7079  		goto out;
7080  	}
7081  
7082  	if (btrfs_file_extent_end(path) <= offset)
7083  		goto out;
7084  
7085  	fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
7086  	found_type = btrfs_file_extent_type(leaf, fi);
7087  
7088  	nocow_args.start = offset;
7089  	nocow_args.end = offset + *len - 1;
7090  	nocow_args.strict = strict;
7091  	nocow_args.free_path = true;
7092  
7093  	ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args);
7094  	/* can_nocow_file_extent() has freed the path. */
7095  	path = NULL;
7096  
7097  	if (ret != 1) {
7098  		/* Treat errors as not being able to NOCOW. */
7099  		ret = 0;
7100  		goto out;
7101  	}
7102  
7103  	ret = 0;
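	/*
	 * NOCOW is not possible if the extent sits in a read-only block group
	 * (e.g. one that is currently being relocated or scrubbed).
	 */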
7104  	if (btrfs_extent_readonly(fs_info,
7105  				  nocow_args.file_extent.disk_bytenr +
7106  				  nocow_args.file_extent.offset))
7107  		goto out;
7108  
7109  	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
7110  	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
7111  		u64 range_end;
7112  
7113  		range_end = round_up(offset + nocow_args.file_extent.num_bytes,
7114  				     root->fs_info->sectorsize) - 1;
7115  		ret = test_range_bit_exists(io_tree, offset, range_end, EXTENT_DELALLOC);
7116  		if (ret) {
7117  			ret = -EAGAIN;
7118  			goto out;
7119  		}
7120  	}
7121  
7122  	if (file_extent)
7123  		memcpy(file_extent, &nocow_args.file_extent, sizeof(*file_extent));
7124  
7125  	*len = nocow_args.file_extent.num_bytes;
7126  	ret = 1;
7127  out:
7128  	btrfs_free_path(path);
7129  	return ret;
7130  }
7131  
7132  /* The callers of this must take lock_extent() */
7133  struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start,
7134  				      const struct btrfs_file_extent *file_extent,
7135  				      int type)
7136  {
7137  	struct extent_map *em;
7138  	int ret;
7139  
7140  	/*
7141  	 * Note the missing NOCOW type.
7142  	 *
7143  	 * For pure NOCOW writes, we should not create an io extent map, but
7144  	 * just reuse the existing one.
7145  	 * Only PREALLOC writes (NOCOW write into preallocated range) can
7146  	 * create an io extent map.
7147  	 */
7148  	ASSERT(type == BTRFS_ORDERED_PREALLOC ||
7149  	       type == BTRFS_ORDERED_COMPRESSED ||
7150  	       type == BTRFS_ORDERED_REGULAR);
7151  
7152  	switch (type) {
7153  	case BTRFS_ORDERED_PREALLOC:
7154  		/* We're only referring to part of a larger preallocated extent. */
7155  		ASSERT(file_extent->num_bytes <= file_extent->ram_bytes);
7156  		break;
7157  	case BTRFS_ORDERED_REGULAR:
7158  		/* COW results in a new extent matching our file extent size. */
7159  		ASSERT(file_extent->disk_num_bytes == file_extent->num_bytes);
7160  		ASSERT(file_extent->ram_bytes == file_extent->num_bytes);
7161  
7162  		/* Since it's a new extent, we should not have any offset. */
7163  		ASSERT(file_extent->offset == 0);
7164  		break;
7165  	case BTRFS_ORDERED_COMPRESSED:
7166  		/* Must be compressed. */
7167  		ASSERT(file_extent->compression != BTRFS_COMPRESS_NONE);
7168  
7169  		/*
7170  		 * An encoded write can make us refer to part of the
7171  		 * uncompressed extent.
7172  		 */
7173  		ASSERT(file_extent->num_bytes <= file_extent->ram_bytes);
7174  		break;
7175  	}
7176  
7177  	em = alloc_extent_map();
7178  	if (!em)
7179  		return ERR_PTR(-ENOMEM);
7180  
7181  	em->start = start;
7182  	em->len = file_extent->num_bytes;
7183  	em->disk_bytenr = file_extent->disk_bytenr;
7184  	em->disk_num_bytes = file_extent->disk_num_bytes;
7185  	em->ram_bytes = file_extent->ram_bytes;
7186  	em->generation = -1;
7187  	em->offset = file_extent->offset;
7188  	em->flags |= EXTENT_FLAG_PINNED;
7189  	if (type == BTRFS_ORDERED_COMPRESSED)
7190  		extent_map_set_compression(em, file_extent->compression);
7191  
7192  	ret = btrfs_replace_extent_map_range(inode, em, true);
7193  	if (ret) {
7194  		free_extent_map(em);
7195  		return ERR_PTR(ret);
7196  	}
7197  
7198  	/* The em now has 2 refs, so the caller needs to call free_extent_map() once. */
7199  	return em;
7200  }
7201  
7202  /*
7203   * For release_folio() and invalidate_folio() we have a race window where
7204   * folio_end_writeback() is called but the subpage spinlock is not yet released.
7205   * If we continue to release/invalidate the page, we could cause use-after-free
7206   * If we continued to release/invalidate the folio, we could cause a
7207   * use-after-free of the subpage spinlock.  So this function spins until the
7208   * subpage spinlock has been released.
7209  static void wait_subpage_spinlock(struct folio *folio)
7210  {
7211  	struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
7212  	struct btrfs_subpage *subpage;
7213  
7214  	if (!btrfs_is_subpage(fs_info, folio->mapping))
7215  		return;
7216  
7217  	ASSERT(folio_test_private(folio) && folio_get_private(folio));
7218  	subpage = folio_get_private(folio);
7219  
7220  	/*
7221  	 * This may look insane as we just acquire the spinlock and release it,
7222  	 * without doing anything.  But we just want to make sure no one is
7223  	 * still holding the subpage spinlock.
7224  	 * And since the page is not dirty nor writeback, and we have page
7225  	 * locked, the only possible way to hold a spinlock is from the endio
7226  	 * function to clear page writeback.
7227  	 *
7228  	 * Here we just acquire the spinlock so that all existing callers
7229  	 * should exit and we're safe to release/invalidate the page.
7230  	 */
7231  	spin_lock_irq(&subpage->lock);
7232  	spin_unlock_irq(&subpage->lock);
7233  }
7234  
7235  static int btrfs_launder_folio(struct folio *folio)
7236  {
7237  	return btrfs_qgroup_free_data(folio_to_inode(folio), NULL, folio_pos(folio),
7238  				      PAGE_SIZE, NULL);
7239  }
7240  
7241  static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
7242  {
7243  	if (try_release_extent_mapping(folio, gfp_flags)) {
7244  		wait_subpage_spinlock(folio);
7245  		clear_folio_extent_mapped(folio);
7246  		return true;
7247  	}
7248  	return false;
7249  }
7250  
7251  static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
7252  {
7253  	if (folio_test_writeback(folio) || folio_test_dirty(folio))
7254  		return false;
7255  	return __btrfs_release_folio(folio, gfp_flags);
7256  }
7257  
7258  #ifdef CONFIG_MIGRATION
7259  static int btrfs_migrate_folio(struct address_space *mapping,
7260  			     struct folio *dst, struct folio *src,
7261  			     enum migrate_mode mode)
7262  {
7263  	int ret = filemap_migrate_folio(mapping, dst, src, mode);
7264  
7265  	if (ret != MIGRATEPAGE_SUCCESS)
7266  		return ret;
7267  
7268  	if (folio_test_ordered(src)) {
7269  		folio_clear_ordered(src);
7270  		folio_set_ordered(dst);
7271  	}
7272  
7273  	return MIGRATEPAGE_SUCCESS;
7274  }
7275  #else
7276  #define btrfs_migrate_folio NULL
7277  #endif
7278  
7279  static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
7280  				 size_t length)
7281  {
7282  	struct btrfs_inode *inode = folio_to_inode(folio);
7283  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
7284  	struct extent_io_tree *tree = &inode->io_tree;
7285  	struct extent_state *cached_state = NULL;
7286  	u64 page_start = folio_pos(folio);
7287  	u64 page_end = page_start + folio_size(folio) - 1;
7288  	u64 cur;
7289  	int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
7290  
7291  	/*
7292  	 * We have the folio locked, so no new ordered extent can be created on
7293  	 * it, nor can a bio be submitted for it.
7294  	 *
7295  	 * But already submitted bio can still be finished on this folio.
7296  	 * Furthermore, endio function won't skip folio which has Ordered
7297  	 * (Private2) already cleared, so it's possible for endio and
7298  	 * invalidate_folio to do the same ordered extent accounting twice
7299  	 * on one folio.
7300  	 *
7301  	 * So here we wait for any submitted bios to finish, so that we won't
7302  	 * do double ordered extent accounting on the same folio.
7303  	 */
7304  	folio_wait_writeback(folio);
7305  	wait_subpage_spinlock(folio);
7306  
7307  	/*
7308  	 * For subpage case, we have call sites like
7309  	 * btrfs_punch_hole_lock_range() which passes range not aligned to
7310  	 * sectorsize.
7311  	 * If the range doesn't cover the full folio, we don't need to and
7312  	 * shouldn't clear page extent mapped, as folio->private can still
7313  	 * record subpage dirty bits for other part of the range.
7314  	 *
7315  	 * For cases that invalidate the full folio even the range doesn't
7316  	 * cover the full folio, like invalidating the last folio, we're
7317  	 * still safe to wait for ordered extent to finish.
7318  	 */
7319  	if (!(offset == 0 && length == folio_size(folio))) {
7320  		btrfs_release_folio(folio, GFP_NOFS);
7321  		return;
7322  	}
7323  
7324  	if (!inode_evicting)
7325  		lock_extent(tree, page_start, page_end, &cached_state);
7326  
7327  	cur = page_start;
7328  	while (cur < page_end) {
7329  		struct btrfs_ordered_extent *ordered;
7330  		u64 range_end;
7331  		u32 range_len;
7332  		u32 extra_flags = 0;
7333  
7334  		ordered = btrfs_lookup_first_ordered_range(inode, cur,
7335  							   page_end + 1 - cur);
7336  		if (!ordered) {
7337  			range_end = page_end;
7338  			/*
7339  			 * No ordered extent covering this range, we are safe
7340  			 * to delete all extent states in the range.
7341  			 */
7342  			extra_flags = EXTENT_CLEAR_ALL_BITS;
7343  			goto next;
7344  		}
7345  		if (ordered->file_offset > cur) {
7346  			/*
7347  			 * There is a range between [cur, oe->file_offset) not
7348  			 * covered by any ordered extent.
7349  			 * We are safe to delete all extent states, and handle
7350  			 * the ordered extent in the next iteration.
7351  			 */
7352  			range_end = ordered->file_offset - 1;
7353  			extra_flags = EXTENT_CLEAR_ALL_BITS;
7354  			goto next;
7355  		}
7356  
7357  		range_end = min(ordered->file_offset + ordered->num_bytes - 1,
7358  				page_end);
7359  		ASSERT(range_end + 1 - cur < U32_MAX);
7360  		range_len = range_end + 1 - cur;
7361  		if (!btrfs_folio_test_ordered(fs_info, folio, cur, range_len)) {
7362  			/*
7363  			 * If Ordered (Private2) is cleared, it means endio has
7364  			 * already been executed for the range.
7365  			 * We can't delete the extent states as
7366  			 * btrfs_finish_ordered_io() may still use some of them.
7367  			 */
7368  			goto next;
7369  		}
7370  		btrfs_folio_clear_ordered(fs_info, folio, cur, range_len);
7371  
7372  		/*
7373  		 * IO on this page will never be started, so we need to account
7374  		 * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW
7375  		 * here, must leave that up for the ordered extent completion.
7376  		 *
7377  		 * This will also unlock the range for incoming
7378  		 * btrfs_finish_ordered_io().
7379  		 */
7380  		if (!inode_evicting)
7381  			clear_extent_bit(tree, cur, range_end,
7382  					 EXTENT_DELALLOC |
7383  					 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
7384  					 EXTENT_DEFRAG, &cached_state);
7385  
7386  		spin_lock_irq(&inode->ordered_tree_lock);
7387  		set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
7388  		ordered->truncated_len = min(ordered->truncated_len,
7389  					     cur - ordered->file_offset);
7390  		spin_unlock_irq(&inode->ordered_tree_lock);
7391  
7392  		/*
7393  		 * If the ordered extent has finished, we're safe to delete all
7394  		 * the extent states of the range, otherwise
7395  		 * btrfs_finish_ordered_io() will get executed by endio for
7396  		 * other pages, so we can't delete extent states.
7397  		 */
7398  		if (btrfs_dec_test_ordered_pending(inode, &ordered,
7399  						   cur, range_end + 1 - cur)) {
7400  			btrfs_finish_ordered_io(ordered);
7401  			/*
7402  			 * The ordered extent has finished, now we're again
7403  			 * safe to delete all extent states of the range.
7404  			 */
7405  			extra_flags = EXTENT_CLEAR_ALL_BITS;
7406  		}
7407  next:
7408  		if (ordered)
7409  			btrfs_put_ordered_extent(ordered);
7410  		/*
7411  		 * Qgroup reserved space handler
7412  		 * Sector(s) here will be either:
7413  		 *
7414  		 * 1) Already written to disk or bio already finished
7415  		 *    Then its QGROUP_RESERVED bit in io_tree is already cleared.
7416  		 *    Qgroup will be handled by its qgroup_record then.
7417  		 *    btrfs_qgroup_free_data() call will do nothing here.
7418  		 *
7419  		 * 2) Not written to disk yet
7420  		 *    Then btrfs_qgroup_free_data() call will clear the
7421  		 *    QGROUP_RESERVED bit of its io_tree, and free the qgroup
7422  		 *    reserved data space.
7423  		 *    Since the IO will never happen for this page.
7424  		 */
7425  		btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur, NULL);
7426  		if (!inode_evicting) {
7427  			clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
7428  				 EXTENT_DELALLOC | EXTENT_UPTODATE |
7429  				 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG |
7430  				 extra_flags, &cached_state);
7431  		}
7432  		cur = range_end + 1;
7433  	}
7434  	/*
7435  	 * We have iterated through all ordered extents of the page; the page
7436  	 * should not have Ordered (Private2) anymore, or the above iteration
7437  	 * did something wrong.
7438  	 */
7439  	ASSERT(!folio_test_ordered(folio));
7440  	btrfs_folio_clear_checked(fs_info, folio, folio_pos(folio), folio_size(folio));
7441  	if (!inode_evicting)
7442  		__btrfs_release_folio(folio, GFP_NOFS);
7443  	clear_folio_extent_mapped(folio);
7444  }
7445  
7446  static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
7447  {
7448  	struct btrfs_truncate_control control = {
7449  		.inode = inode,
7450  		.ino = btrfs_ino(inode),
7451  		.min_type = BTRFS_EXTENT_DATA_KEY,
7452  		.clear_extent_range = true,
7453  	};
7454  	struct btrfs_root *root = inode->root;
7455  	struct btrfs_fs_info *fs_info = root->fs_info;
7456  	struct btrfs_block_rsv *rsv;
7457  	int ret;
7458  	struct btrfs_trans_handle *trans;
7459  	u64 mask = fs_info->sectorsize - 1;
7460  	const u64 min_size = btrfs_calc_metadata_size(fs_info, 1);
7461  
7462  	if (!skip_writeback) {
7463  		ret = btrfs_wait_ordered_range(inode,
7464  					       inode->vfs_inode.i_size & (~mask),
7465  					       (u64)-1);
7466  		if (ret)
7467  			return ret;
7468  	}
7469  
7470  	/*
7471  	 * Yes ladies and gentlemen, this is indeed ugly.  We have a couple of
7472  	 * things going on here:
7473  	 *
7474  	 * 1) We need to reserve space to update our inode.
7475  	 *
7476  	 * 2) We need to have something to cache all the space that is going to
7477  	 * be free'd up by the truncate operation, but also have some slack
7478  	 * space reserved in case it uses space during the truncate (thank you
7479  	 * very much snapshotting).
7480  	 *
7481  	 * And we need these to be separate.  The fact is we can use a lot of
7482  	 * space doing the truncate, and we have no earthly idea how much space
7483  	 * we will use, so we need the truncate reservation to be separate so it
7484  	 * doesn't end up using space reserved for updating the inode.  We also
7485  	 * need to be able to stop the transaction and start a new one, which
7486  	 * means we need to be able to update the inode several times, and we
7487  	 * have no way of knowing how many times that will be, so we can't just
7488  	 * reserve 1 item for the entirety of the operation, so that has to be
7489  	 * done separately as well.
7490  	 *
7491  	 * So that leaves us with
7492  	 *
7493  	 * 1) rsv - for the truncate reservation, which we will steal from the
7494  	 * transaction reservation.
7495  	 * 2) fs_info->trans_block_rsv - this will have 1 item's worth left for
7496  	 * updating the inode.
7497  	 */
7498  	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
7499  	if (!rsv)
7500  		return -ENOMEM;
7501  	rsv->size = min_size;
7502  	rsv->failfast = true;
7503  
7504  	/*
7505  	 * 1 for the truncate slack space
7506  	 * 1 for updating the inode.
7507  	 */
7508  	trans = btrfs_start_transaction(root, 2);
7509  	if (IS_ERR(trans)) {
7510  		ret = PTR_ERR(trans);
7511  		goto out;
7512  	}
7513  
7514  	/* Migrate the slack space for the truncate to our reserve */
7515  	ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
7516  				      min_size, false);
7517  	/*
7518  	 * We have reserved 2 metadata units when we started the transaction and
7519  	 * min_size matches 1 unit, so this should never fail, but if it does,
7520  	 * it's not critical; we just fail the truncation.
7521  	 */
7522  	if (WARN_ON(ret)) {
7523  		btrfs_end_transaction(trans);
7524  		goto out;
7525  	}
7526  
7527  	trans->block_rsv = rsv;
7528  
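	/*
	 * Truncate the items in chunks: each pass drops what the reservation
	 * allows, and on -ENOSPC or -EAGAIN the transaction is ended, the
	 * reservation refilled and a new transaction started before retrying.
	 */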
7529  	while (1) {
7530  		struct extent_state *cached_state = NULL;
7531  		const u64 new_size = inode->vfs_inode.i_size;
7532  		const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
7533  
7534  		control.new_size = new_size;
7535  		lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
7536  		/*
7537  		 * We want to drop from the next block forward in case this new
7538  		 * size is not block aligned since we will be keeping the last
7539  		 * block of the extent just the way it is.
7540  		 */
7541  		btrfs_drop_extent_map_range(inode,
7542  					    ALIGN(new_size, fs_info->sectorsize),
7543  					    (u64)-1, false);
7544  
7545  		ret = btrfs_truncate_inode_items(trans, root, &control);
7546  
7547  		inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
7548  		btrfs_inode_safe_disk_i_size_write(inode, control.last_size);
7549  
7550  		unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
7551  
7552  		trans->block_rsv = &fs_info->trans_block_rsv;
7553  		if (ret != -ENOSPC && ret != -EAGAIN)
7554  			break;
7555  
7556  		ret = btrfs_update_inode(trans, inode);
7557  		if (ret)
7558  			break;
7559  
7560  		btrfs_end_transaction(trans);
7561  		btrfs_btree_balance_dirty(fs_info);
7562  
7563  		trans = btrfs_start_transaction(root, 2);
7564  		if (IS_ERR(trans)) {
7565  			ret = PTR_ERR(trans);
7566  			trans = NULL;
7567  			break;
7568  		}
7569  
7570  		btrfs_block_rsv_release(fs_info, rsv, -1, NULL);
7571  		ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
7572  					      rsv, min_size, false);
7573  		/*
7574  		 * We have reserved 2 metadata units when we started the
7575  		 * transaction and min_size matches 1 unit, so this should never
7576  		 * fail, but if it does, it's not critical; we just fail the truncation.
7577  		 */
7578  		if (WARN_ON(ret))
7579  			break;
7580  
7581  		trans->block_rsv = rsv;
7582  	}
7583  
7584  	/*
7585  	 * We can't call btrfs_truncate_block inside a trans handle as we could
7586  	 * deadlock with freeze.  If we got BTRFS_NEED_TRUNCATE_BLOCK then we
7587  	 * know we've truncated everything except the last little bit, and can
7588  	 * do btrfs_truncate_block and then update the disk_i_size.
7589  	 */
7590  	if (ret == BTRFS_NEED_TRUNCATE_BLOCK) {
7591  		btrfs_end_transaction(trans);
7592  		btrfs_btree_balance_dirty(fs_info);
7593  
7594  		ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size, 0, 0);
7595  		if (ret)
7596  			goto out;
7597  		trans = btrfs_start_transaction(root, 1);
7598  		if (IS_ERR(trans)) {
7599  			ret = PTR_ERR(trans);
7600  			goto out;
7601  		}
7602  		btrfs_inode_safe_disk_i_size_write(inode, 0);
7603  	}
7604  
7605  	if (trans) {
7606  		int ret2;
7607  
7608  		trans->block_rsv = &fs_info->trans_block_rsv;
7609  		ret2 = btrfs_update_inode(trans, inode);
7610  		if (ret2 && !ret)
7611  			ret = ret2;
7612  
7613  		ret2 = btrfs_end_transaction(trans);
7614  		if (ret2 && !ret)
7615  			ret = ret2;
7616  		btrfs_btree_balance_dirty(fs_info);
7617  	}
7618  out:
7619  	btrfs_free_block_rsv(fs_info, rsv);
7620  	/*
7621  	 * If we truncate and then write and fsync, we would normally only log
7622  	 * the extents that changed, which is a problem if we need the fsync to
7623  	 * first reflect the truncation of the entire inode.  So set this flag
7624  	 * to write out all of the inode's extents to the log, so we're
7625  	 * completely safe.
7626  	 *
7627  	 * If no extents were dropped or trimmed we don't need to force the next
7628  	 * fsync to truncate all the inode's items from the log and re-log them
7629  	 * all. This means the truncate operation did not change the file size,
7630  	 * or changed it to a smaller size but there was only an implicit hole
7631  	 * between the old i_size and the new i_size, and there were no prealloc
7632  	 * extents beyond i_size to drop.
7633  	 */
7634  	if (control.extents_found > 0)
7635  		btrfs_set_inode_full_sync(inode);
7636  
7637  	return ret;
7638  }
7639  
7640  struct inode *btrfs_new_subvol_inode(struct mnt_idmap *idmap,
7641  				     struct inode *dir)
7642  {
7643  	struct inode *inode;
7644  
7645  	inode = new_inode(dir->i_sb);
7646  	if (inode) {
7647  		/*
7648  		 * Subvolumes don't inherit the sgid bit or the parent's gid if
7649  		 * the parent's sgid bit is set. This is probably a bug.
7650  		 */
7651  		inode_init_owner(idmap, inode, NULL,
7652  				 S_IFDIR | (~current_umask() & S_IRWXUGO));
7653  		inode->i_op = &btrfs_dir_inode_operations;
7654  		inode->i_fop = &btrfs_dir_file_operations;
7655  	}
7656  	return inode;
7657  }
7658  
7659  struct inode *btrfs_alloc_inode(struct super_block *sb)
7660  {
7661  	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
7662  	struct btrfs_inode *ei;
7663  	struct inode *inode;
7664  
7665  	ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL);
7666  	if (!ei)
7667  		return NULL;
7668  
7669  	ei->root = NULL;
7670  	ei->generation = 0;
7671  	ei->last_trans = 0;
7672  	ei->last_sub_trans = 0;
7673  	ei->logged_trans = 0;
7674  	ei->delalloc_bytes = 0;
7675  	ei->new_delalloc_bytes = 0;
7676  	ei->defrag_bytes = 0;
7677  	ei->disk_i_size = 0;
7678  	ei->flags = 0;
7679  	ei->ro_flags = 0;
7680  	/*
7681  	 * ->index_cnt will be properly initialized later when creating a new
7682  	 * inode (btrfs_create_new_inode()) or when reading an existing inode
7683  	 * from disk (btrfs_read_locked_inode()).
7684  	 */
7685  	ei->csum_bytes = 0;
7686  	ei->dir_index = 0;
7687  	ei->last_unlink_trans = 0;
7688  	ei->last_reflink_trans = 0;
7689  	ei->last_log_commit = 0;
7690  
7691  	spin_lock_init(&ei->lock);
7692  	ei->outstanding_extents = 0;
7693  	if (sb->s_magic != BTRFS_TEST_MAGIC)
7694  		btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv,
7695  					      BTRFS_BLOCK_RSV_DELALLOC);
7696  	ei->runtime_flags = 0;
7697  	ei->prop_compress = BTRFS_COMPRESS_NONE;
7698  	ei->defrag_compress = BTRFS_COMPRESS_NONE;
7699  
7700  	ei->delayed_node = NULL;
7701  
7702  	ei->i_otime_sec = 0;
7703  	ei->i_otime_nsec = 0;
7704  
7705  	inode = &ei->vfs_inode;
7706  	extent_map_tree_init(&ei->extent_tree);
7707  
7708  	/* This io tree sets the valid inode. */
7709  	extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO);
7710  	ei->io_tree.inode = ei;
7711  
7712  	ei->file_extent_tree = NULL;
7713  
7714  	mutex_init(&ei->log_mutex);
7715  	spin_lock_init(&ei->ordered_tree_lock);
7716  	ei->ordered_tree = RB_ROOT;
7717  	ei->ordered_tree_last = NULL;
7718  	INIT_LIST_HEAD(&ei->delalloc_inodes);
7719  	INIT_LIST_HEAD(&ei->delayed_iput);
7720  	init_rwsem(&ei->i_mmap_lock);
7721  
7722  	return inode;
7723  }
7724  
7725  #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
7726  void btrfs_test_destroy_inode(struct inode *inode)
7727  {
7728  	btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
7729  	kfree(BTRFS_I(inode)->file_extent_tree);
7730  	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
7731  }
7732  #endif
7733  
7734  void btrfs_free_inode(struct inode *inode)
7735  {
7736  	kfree(BTRFS_I(inode)->file_extent_tree);
7737  	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
7738  }
7739  
7740  void btrfs_destroy_inode(struct inode *vfs_inode)
7741  {
7742  	struct btrfs_ordered_extent *ordered;
7743  	struct btrfs_inode *inode = BTRFS_I(vfs_inode);
7744  	struct btrfs_root *root = inode->root;
7745  	bool freespace_inode;
7746  
7747  	WARN_ON(!hlist_empty(&vfs_inode->i_dentry));
7748  	WARN_ON(vfs_inode->i_data.nrpages);
7749  	WARN_ON(inode->block_rsv.reserved);
7750  	WARN_ON(inode->block_rsv.size);
7751  	WARN_ON(inode->outstanding_extents);
7752  	if (!S_ISDIR(vfs_inode->i_mode)) {
7753  		WARN_ON(inode->delalloc_bytes);
7754  		WARN_ON(inode->new_delalloc_bytes);
7755  		WARN_ON(inode->csum_bytes);
7756  	}
7757  	if (!root || !btrfs_is_data_reloc_root(root))
7758  		WARN_ON(inode->defrag_bytes);
7759  
7760  	/*
7761  	 * This can happen when we create an inode, but somebody else also
7762  	 * created the same inode and we need to destroy the one we already
7763  	 * created.
7764  	 */
7765  	if (!root)
7766  		return;
7767  
7768  	/*
7769  	 * If this is a free space inode do not take the ordered extents lockdep
7770  	 * map.
7771  	 */
7772  	freespace_inode = btrfs_is_free_space_inode(inode);
7773  
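	/*
	 * Ordered extents should never be left over at this point; if any are
	 * found, report them and clean them up so they are not leaked.
	 */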
7774  	while (1) {
7775  		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
7776  		if (!ordered)
7777  			break;
7778  		else {
7779  			btrfs_err(root->fs_info,
7780  				  "found ordered extent %llu %llu on inode cleanup",
7781  				  ordered->file_offset, ordered->num_bytes);
7782  
7783  			if (!freespace_inode)
7784  				btrfs_lockdep_acquire(root->fs_info, btrfs_ordered_extent);
7785  
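			/*
			 * Drop two references: the one taken by the lookup
			 * above and the base reference that normal I/O
			 * completion would have dropped but never did for this
			 * leftover ordered extent.
			 */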
7786  			btrfs_remove_ordered_extent(inode, ordered);
7787  			btrfs_put_ordered_extent(ordered);
7788  			btrfs_put_ordered_extent(ordered);
7789  		}
7790  	}
7791  	btrfs_qgroup_check_reserved_leak(inode);
7792  	btrfs_del_inode_from_root(inode);
7793  	btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);
7794  	btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
7795  	btrfs_put_root(inode->root);
7796  }
7797  
7798  int btrfs_drop_inode(struct inode *inode)
7799  {
7800  	struct btrfs_root *root = BTRFS_I(inode)->root;
7801  
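	/* A non-zero return tells the VFS to evict the inode rather than cache it. */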
7802  	if (root == NULL)
7803  		return 1;
7804  
7805  	/* The snapshot/subvolume tree is being deleted. */
7806  	if (btrfs_root_refs(&root->root_item) == 0)
7807  		return 1;
7808  	else
7809  		return generic_drop_inode(inode);
7810  }
7811  
7812  static void init_once(void *foo)
7813  {
7814  	struct btrfs_inode *ei = foo;
7815  
7816  	inode_init_once(&ei->vfs_inode);
7817  }
7818  
7819  void __cold btrfs_destroy_cachep(void)
7820  {
7821  	/*
7822  	 * Make sure all delayed rcu free inodes are flushed before we
7823  	 * destroy cache.
7824  	 */
7825  	rcu_barrier();
7826  	kmem_cache_destroy(btrfs_inode_cachep);
7827  }
7828  
7829  int __init btrfs_init_cachep(void)
7830  {
7831  	btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
7832  			sizeof(struct btrfs_inode), 0,
7833  			SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
7834  			init_once);
7835  	if (!btrfs_inode_cachep)
7836  		return -ENOMEM;
7837  
7838  	return 0;
7839  }
7840  
7841  static int btrfs_getattr(struct mnt_idmap *idmap,
7842  			 const struct path *path, struct kstat *stat,
7843  			 u32 request_mask, unsigned int flags)
7844  {
7845  	u64 delalloc_bytes;
7846  	u64 inode_bytes;
7847  	struct inode *inode = d_inode(path->dentry);
7848  	u32 blocksize = btrfs_sb(inode->i_sb)->sectorsize;
7849  	u32 bi_flags = BTRFS_I(inode)->flags;
7850  	u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;
7851  
7852  	stat->result_mask |= STATX_BTIME;
7853  	stat->btime.tv_sec = BTRFS_I(inode)->i_otime_sec;
7854  	stat->btime.tv_nsec = BTRFS_I(inode)->i_otime_nsec;
7855  	if (bi_flags & BTRFS_INODE_APPEND)
7856  		stat->attributes |= STATX_ATTR_APPEND;
7857  	if (bi_flags & BTRFS_INODE_COMPRESS)
7858  		stat->attributes |= STATX_ATTR_COMPRESSED;
7859  	if (bi_flags & BTRFS_INODE_IMMUTABLE)
7860  		stat->attributes |= STATX_ATTR_IMMUTABLE;
7861  	if (bi_flags & BTRFS_INODE_NODUMP)
7862  		stat->attributes |= STATX_ATTR_NODUMP;
7863  	if (bi_ro_flags & BTRFS_INODE_RO_VERITY)
7864  		stat->attributes |= STATX_ATTR_VERITY;
7865  
7866  	stat->attributes_mask |= (STATX_ATTR_APPEND |
7867  				  STATX_ATTR_COMPRESSED |
7868  				  STATX_ATTR_IMMUTABLE |
7869  				  STATX_ATTR_NODUMP);
7870  
7871  	generic_fillattr(idmap, request_mask, inode, stat);
7872  	stat->dev = BTRFS_I(inode)->root->anon_dev;
7873  
7874  	stat->subvol = BTRFS_I(inode)->root->root_key.objectid;
7875  	stat->result_mask |= STATX_SUBVOL;
7876  
7877  	spin_lock(&BTRFS_I(inode)->lock);
7878  	delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
7879  	inode_bytes = inode_get_bytes(inode);
7880  	spin_unlock(&BTRFS_I(inode)->lock);
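	/*
	 * Include not yet flushed delalloc in the block count so freshly
	 * written, still dirty data is reflected in st_blocks.
	 */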
7881  	stat->blocks = (ALIGN(inode_bytes, blocksize) +
7882  			ALIGN(delalloc_bytes, blocksize)) >> SECTOR_SHIFT;
7883  	return 0;
7884  }
7885  
7886  static int btrfs_rename_exchange(struct inode *old_dir,
7887  			      struct dentry *old_dentry,
7888  			      struct inode *new_dir,
7889  			      struct dentry *new_dentry)
7890  {
7891  	struct btrfs_fs_info *fs_info = inode_to_fs_info(old_dir);
7892  	struct btrfs_trans_handle *trans;
7893  	unsigned int trans_num_items;
7894  	struct btrfs_root *root = BTRFS_I(old_dir)->root;
7895  	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
7896  	struct inode *new_inode = new_dentry->d_inode;
7897  	struct inode *old_inode = old_dentry->d_inode;
7898  	struct btrfs_rename_ctx old_rename_ctx;
7899  	struct btrfs_rename_ctx new_rename_ctx;
7900  	u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
7901  	u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
7902  	u64 old_idx = 0;
7903  	u64 new_idx = 0;
7904  	int ret;
7905  	int ret2;
7906  	bool need_abort = false;
7907  	struct fscrypt_name old_fname, new_fname;
7908  	struct fscrypt_str *old_name, *new_name;
7909  
7910  	/*
7911  	 * For non-subvolumes allow exchange only within one subvolume, in the
7912  	 * same inode namespace. Two subvolumes (represented as directories) can
7913  	 * be exchanged as they're a logical link and have a fixed inode number.
7914  	 */
7915  	if (root != dest &&
7916  	    (old_ino != BTRFS_FIRST_FREE_OBJECTID ||
7917  	     new_ino != BTRFS_FIRST_FREE_OBJECTID))
7918  		return -EXDEV;
7919  
7920  	ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname);
7921  	if (ret)
7922  		return ret;
7923  
7924  	ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname);
7925  	if (ret) {
7926  		fscrypt_free_filename(&old_fname);
7927  		return ret;
7928  	}
7929  
7930  	old_name = &old_fname.disk_name;
7931  	new_name = &new_fname.disk_name;
7932  
7933  	/* close the race window with snapshot create/destroy ioctl */
7934  	if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
7935  	    new_ino == BTRFS_FIRST_FREE_OBJECTID)
7936  		down_read(&fs_info->subvol_sem);
7937  
7938  	/*
7939  	 * For each inode:
7940  	 * 1 to remove old dir item
7941  	 * 1 to remove old dir index
7942  	 * 1 to add new dir item
7943  	 * 1 to add new dir index
7944  	 * 1 to update parent inode
7945  	 *
7946  	 * If the parents are the same, we only need to account for one
7947  	 */
7948  	trans_num_items = (old_dir == new_dir ? 9 : 10);
7949  	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
7950  		/*
7951  		 * 1 to remove old root ref
7952  		 * 1 to remove old root backref
7953  		 * 1 to add new root ref
7954  		 * 1 to add new root backref
7955  		 */
7956  		trans_num_items += 4;
7957  	} else {
7958  		/*
7959  		 * 1 to update inode item
7960  		 * 1 to remove old inode ref
7961  		 * 1 to add new inode ref
7962  		 */
7963  		trans_num_items += 3;
7964  	}
7965  	if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
7966  		trans_num_items += 4;
7967  	else
7968  		trans_num_items += 3;
7969  	trans = btrfs_start_transaction(root, trans_num_items);
7970  	if (IS_ERR(trans)) {
7971  		ret = PTR_ERR(trans);
7972  		goto out_notrans;
7973  	}
7974  
7975  	if (dest != root) {
7976  		ret = btrfs_record_root_in_trans(trans, dest);
7977  		if (ret)
7978  			goto out_fail;
7979  	}
7980  
7981  	/*
7982  	 * We need to find a free sequence number both in the source and
7983  	 * in the destination directory for the exchange.
7984  	 */
7985  	ret = btrfs_set_inode_index(BTRFS_I(new_dir), &old_idx);
7986  	if (ret)
7987  		goto out_fail;
7988  	ret = btrfs_set_inode_index(BTRFS_I(old_dir), &new_idx);
7989  	if (ret)
7990  		goto out_fail;
7991  
7992  	BTRFS_I(old_inode)->dir_index = 0ULL;
7993  	BTRFS_I(new_inode)->dir_index = 0ULL;
7994  
7995  	/* Reference for the source. */
7996  	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
7997  		/* force full log commit if subvolume involved. */
7998  		btrfs_set_log_full_commit(trans);
7999  	} else {
8000  		ret = btrfs_insert_inode_ref(trans, dest, new_name, old_ino,
8001  					     btrfs_ino(BTRFS_I(new_dir)),
8002  					     old_idx);
8003  		if (ret)
8004  			goto out_fail;
8005  		need_abort = true;
8006  	}
8007  
8008  	/* And now for the dest. */
8009  	if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
8010  		/* force full log commit if subvolume involved. */
8011  		btrfs_set_log_full_commit(trans);
8012  	} else {
8013  		ret = btrfs_insert_inode_ref(trans, root, old_name, new_ino,
8014  					     btrfs_ino(BTRFS_I(old_dir)),
8015  					     new_idx);
8016  		if (ret) {
8017  			if (need_abort)
8018  				btrfs_abort_transaction(trans, ret);
8019  			goto out_fail;
8020  		}
8021  	}
8022  
8023  	/* Update inode version and ctime/mtime. */
8024  	inode_inc_iversion(old_dir);
8025  	inode_inc_iversion(new_dir);
8026  	inode_inc_iversion(old_inode);
8027  	inode_inc_iversion(new_inode);
8028  	simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
8029  
8030  	if (old_dentry->d_parent != new_dentry->d_parent) {
8031  		btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
8032  					BTRFS_I(old_inode), true);
8033  		btrfs_record_unlink_dir(trans, BTRFS_I(new_dir),
8034  					BTRFS_I(new_inode), true);
8035  	}
8036  
8037  	/* src is a subvolume */
8038  	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
8039  		ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
8040  	} else { /* src is an inode */
8041  		ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
8042  					   BTRFS_I(old_dentry->d_inode),
8043  					   old_name, &old_rename_ctx);
8044  		if (!ret)
8045  			ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
8046  	}
8047  	if (ret) {
8048  		btrfs_abort_transaction(trans, ret);
8049  		goto out_fail;
8050  	}
8051  
8052  	/* dest is a subvolume */
8053  	if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
8054  		ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
8055  	} else { /* dest is an inode */
8056  		ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),
8057  					   BTRFS_I(new_dentry->d_inode),
8058  					   new_name, &new_rename_ctx);
8059  		if (!ret)
8060  			ret = btrfs_update_inode(trans, BTRFS_I(new_inode));
8061  	}
8062  	if (ret) {
8063  		btrfs_abort_transaction(trans, ret);
8064  		goto out_fail;
8065  	}
8066  
8067  	ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
8068  			     new_name, 0, old_idx);
8069  	if (ret) {
8070  		btrfs_abort_transaction(trans, ret);
8071  		goto out_fail;
8072  	}
8073  
8074  	ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode),
8075  			     old_name, 0, new_idx);
8076  	if (ret) {
8077  		btrfs_abort_transaction(trans, ret);
8078  		goto out_fail;
8079  	}
8080  
8081  	if (old_inode->i_nlink == 1)
8082  		BTRFS_I(old_inode)->dir_index = old_idx;
8083  	if (new_inode->i_nlink == 1)
8084  		BTRFS_I(new_inode)->dir_index = new_idx;
8085  
8086  	/*
8087  	 * Now pin the logs of the roots. We do it to ensure that no other task
8088  	 * can sync the logs while we are in progress with the rename, because
8089  	 * that could result in an inconsistency in case any of the inodes that
8090  	 * are part of this rename operation were logged before.
8091  	 */
8092  	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
8093  		btrfs_pin_log_trans(root);
8094  	if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
8095  		btrfs_pin_log_trans(dest);
8096  
8097  	/* Do the log updates for all inodes. */
8098  	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
8099  		btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
8100  				   old_rename_ctx.index, new_dentry->d_parent);
8101  	if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
8102  		btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir),
8103  				   new_rename_ctx.index, old_dentry->d_parent);
8104  
8105  	/* Now unpin the logs. */
8106  	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
8107  		btrfs_end_log_trans(root);
8108  	if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
8109  		btrfs_end_log_trans(dest);
8110  out_fail:
8111  	ret2 = btrfs_end_transaction(trans);
8112  	ret = ret ? ret : ret2;
8113  out_notrans:
8114  	if (new_ino == BTRFS_FIRST_FREE_OBJECTID ||
8115  	    old_ino == BTRFS_FIRST_FREE_OBJECTID)
8116  		up_read(&fs_info->subvol_sem);
8117  
8118  	fscrypt_free_filename(&new_fname);
8119  	fscrypt_free_filename(&old_fname);
8120  	return ret;
8121  }
8122  
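/*
 * For RENAME_WHITEOUT the old name is replaced with a whiteout: a character
 * device node with device number WHITEOUT_DEV, which overlay/union filesystems
 * interpret as a deleted entry.
 */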
8123  static struct inode *new_whiteout_inode(struct mnt_idmap *idmap,
8124  					struct inode *dir)
8125  {
8126  	struct inode *inode;
8127  
8128  	inode = new_inode(dir->i_sb);
8129  	if (inode) {
8130  		inode_init_owner(idmap, inode, dir,
8131  				 S_IFCHR | WHITEOUT_MODE);
8132  		inode->i_op = &btrfs_special_inode_operations;
8133  		init_special_inode(inode, inode->i_mode, WHITEOUT_DEV);
8134  	}
8135  	return inode;
8136  }
8137  
8138  static int btrfs_rename(struct mnt_idmap *idmap,
8139  			struct inode *old_dir, struct dentry *old_dentry,
8140  			struct inode *new_dir, struct dentry *new_dentry,
8141  			unsigned int flags)
8142  {
8143  	struct btrfs_fs_info *fs_info = inode_to_fs_info(old_dir);
8144  	struct btrfs_new_inode_args whiteout_args = {
8145  		.dir = old_dir,
8146  		.dentry = old_dentry,
8147  	};
8148  	struct btrfs_trans_handle *trans;
8149  	unsigned int trans_num_items;
8150  	struct btrfs_root *root = BTRFS_I(old_dir)->root;
8151  	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
8152  	struct inode *new_inode = d_inode(new_dentry);
8153  	struct inode *old_inode = d_inode(old_dentry);
8154  	struct btrfs_rename_ctx rename_ctx;
8155  	u64 index = 0;
8156  	int ret;
8157  	int ret2;
8158  	u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
8159  	struct fscrypt_name old_fname, new_fname;
8160  
8161  	if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
8162  		return -EPERM;
8163  
8164  	/* we only allow rename subvolume link between subvolumes */
8165  	if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
8166  		return -EXDEV;
8167  
8168  	if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
8169  	    (new_inode && btrfs_ino(BTRFS_I(new_inode)) == BTRFS_FIRST_FREE_OBJECTID))
8170  		return -ENOTEMPTY;
8171  
8172  	if (S_ISDIR(old_inode->i_mode) && new_inode &&
8173  	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
8174  		return -ENOTEMPTY;
8175  
8176  	ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname);
8177  	if (ret)
8178  		return ret;
8179  
8180  	ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname);
8181  	if (ret) {
8182  		fscrypt_free_filename(&old_fname);
8183  		return ret;
8184  	}
8185  
8186  	/* Check for collisions, even if the name isn't there. */
8187  	ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, &new_fname.disk_name);
8188  	if (ret) {
8189  		if (ret == -EEXIST) {
8190  			/* We shouldn't get -EEXIST without a new_inode. */
8192  			if (WARN_ON(!new_inode)) {
8193  				goto out_fscrypt_names;
8194  			}
8195  		} else {
8196  			/* maybe -EOVERFLOW */
8197  			goto out_fscrypt_names;
8198  		}
8199  	}
8200  	ret = 0;
8201  
8202  	/*
8203  	 * We're using rename to replace one file with another. Start IO on it
8204  	 * now so we don't add too much work to the end of the transaction.
8205  	 */
8206  	if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
8207  		filemap_flush(old_inode->i_mapping);
8208  
8209  	if (flags & RENAME_WHITEOUT) {
8210  		whiteout_args.inode = new_whiteout_inode(idmap, old_dir);
8211  		if (!whiteout_args.inode) {
8212  			ret = -ENOMEM;
8213  			goto out_fscrypt_names;
8214  		}
8215  		ret = btrfs_new_inode_prepare(&whiteout_args, &trans_num_items);
8216  		if (ret)
8217  			goto out_whiteout_inode;
8218  	} else {
8219  		/* 1 to update the old parent inode. */
8220  		trans_num_items = 1;
8221  	}
8222  
8223  	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
8224  		/* Close the race window with snapshot create/destroy ioctl */
8225  		down_read(&fs_info->subvol_sem);
8226  		/*
8227  		 * 1 to remove old root ref
8228  		 * 1 to remove old root backref
8229  		 * 1 to add new root ref
8230  		 * 1 to add new root backref
8231  		 */
8232  		trans_num_items += 4;
8233  	} else {
8234  		/*
8235  		 * 1 to update inode
8236  		 * 1 to remove old inode ref
8237  		 * 1 to add new inode ref
8238  		 */
8239  		trans_num_items += 3;
8240  	}
8241  	/*
8242  	 * 1 to remove old dir item
8243  	 * 1 to remove old dir index
8244  	 * 1 to add new dir item
8245  	 * 1 to add new dir index
8246  	 */
8247  	trans_num_items += 4;
8248  	/* 1 to update new parent inode if it's not the same as the old parent */
8249  	if (new_dir != old_dir)
8250  		trans_num_items++;
8251  	if (new_inode) {
8252  		/*
8253  		 * 1 to update inode
8254  		 * 1 to remove inode ref
8255  		 * 1 to remove dir item
8256  		 * 1 to remove dir index
8257  		 * 1 to possibly add orphan item
8258  		 */
8259  		trans_num_items += 5;
8260  	}
8261  	trans = btrfs_start_transaction(root, trans_num_items);
8262  	if (IS_ERR(trans)) {
8263  		ret = PTR_ERR(trans);
8264  		goto out_notrans;
8265  	}
8266  
8267  	if (dest != root) {
8268  		ret = btrfs_record_root_in_trans(trans, dest);
8269  		if (ret)
8270  			goto out_fail;
8271  	}
8272  
8273  	ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index);
8274  	if (ret)
8275  		goto out_fail;
8276  
8277  	BTRFS_I(old_inode)->dir_index = 0ULL;
8278  	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
8279  		/* force full log commit if subvolume involved. */
8280  		btrfs_set_log_full_commit(trans);
8281  	} else {
8282  		ret = btrfs_insert_inode_ref(trans, dest, &new_fname.disk_name,
8283  					     old_ino, btrfs_ino(BTRFS_I(new_dir)),
8284  					     index);
8285  		if (ret)
8286  			goto out_fail;
8287  	}
8288  
8289  	inode_inc_iversion(old_dir);
8290  	inode_inc_iversion(new_dir);
8291  	inode_inc_iversion(old_inode);
8292  	simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
8293  
8294  	if (old_dentry->d_parent != new_dentry->d_parent)
8295  		btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
8296  					BTRFS_I(old_inode), true);
8297  
8298  	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
8299  		ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
8300  	} else {
8301  		ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
8302  					   BTRFS_I(d_inode(old_dentry)),
8303  					   &old_fname.disk_name, &rename_ctx);
8304  		if (!ret)
8305  			ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
8306  	}
8307  	if (ret) {
8308  		btrfs_abort_transaction(trans, ret);
8309  		goto out_fail;
8310  	}
8311  
8312  	if (new_inode) {
8313  		inode_inc_iversion(new_inode);
8314  		if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
8315  			     BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
8316  			ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
8317  			BUG_ON(new_inode->i_nlink == 0);
8318  		} else {
8319  			ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir),
8320  						 BTRFS_I(d_inode(new_dentry)),
8321  						 &new_fname.disk_name);
8322  		}
8323  		if (!ret && new_inode->i_nlink == 0)
8324  			ret = btrfs_orphan_add(trans,
8325  					BTRFS_I(d_inode(new_dentry)));
8326  		if (ret) {
8327  			btrfs_abort_transaction(trans, ret);
8328  			goto out_fail;
8329  		}
8330  	}
8331  
8332  	ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
8333  			     &new_fname.disk_name, 0, index);
8334  	if (ret) {
8335  		btrfs_abort_transaction(trans, ret);
8336  		goto out_fail;
8337  	}
8338  
8339  	if (old_inode->i_nlink == 1)
8340  		BTRFS_I(old_inode)->dir_index = index;
8341  
8342  	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
8343  		btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
8344  				   rename_ctx.index, new_dentry->d_parent);
8345  
8346  	if (flags & RENAME_WHITEOUT) {
8347  		ret = btrfs_create_new_inode(trans, &whiteout_args);
8348  		if (ret) {
8349  			btrfs_abort_transaction(trans, ret);
8350  			goto out_fail;
8351  		} else {
8352  			unlock_new_inode(whiteout_args.inode);
8353  			iput(whiteout_args.inode);
8354  			whiteout_args.inode = NULL;
8355  		}
8356  	}
8357  out_fail:
8358  	ret2 = btrfs_end_transaction(trans);
8359  	ret = ret ? ret : ret2;
8360  out_notrans:
8361  	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
8362  		up_read(&fs_info->subvol_sem);
8363  	if (flags & RENAME_WHITEOUT)
8364  		btrfs_new_inode_args_destroy(&whiteout_args);
8365  out_whiteout_inode:
8366  	if (flags & RENAME_WHITEOUT)
8367  		iput(whiteout_args.inode);
8368  out_fscrypt_names:
8369  	fscrypt_free_filename(&old_fname);
8370  	fscrypt_free_filename(&new_fname);
8371  	return ret;
8372  }
8373  
8374  static int btrfs_rename2(struct mnt_idmap *idmap, struct inode *old_dir,
8375  			 struct dentry *old_dentry, struct inode *new_dir,
8376  			 struct dentry *new_dentry, unsigned int flags)
8377  {
8378  	int ret;
8379  
8380  	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
8381  		return -EINVAL;
8382  
8383  	if (flags & RENAME_EXCHANGE)
8384  		ret = btrfs_rename_exchange(old_dir, old_dentry, new_dir,
8385  					    new_dentry);
8386  	else
8387  		ret = btrfs_rename(idmap, old_dir, old_dentry, new_dir,
8388  				   new_dentry, flags);
8389  
8390  	btrfs_btree_balance_dirty(BTRFS_I(new_dir)->root->fs_info);
8391  
8392  	return ret;
8393  }
8394  
8395  struct btrfs_delalloc_work {
8396  	struct inode *inode;
8397  	struct completion completion;
8398  	struct list_head list;
8399  	struct btrfs_work work;
8400  };
8401  
8402  static void btrfs_run_delalloc_work(struct btrfs_work *work)
8403  {
8404  	struct btrfs_delalloc_work *delalloc_work;
8405  	struct inode *inode;
8406  
8407  	delalloc_work = container_of(work, struct btrfs_delalloc_work,
8408  				     work);
8409  	inode = delalloc_work->inode;
8410  	filemap_flush(inode->i_mapping);
8411  	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
8412  				&BTRFS_I(inode)->runtime_flags))
8413  		filemap_flush(inode->i_mapping);
8414  
8415  	iput(inode);
8416  	complete(&delalloc_work->completion);
8417  }
8418  
8419  static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode)
8420  {
8421  	struct btrfs_delalloc_work *work;
8422  
8423  	work = kmalloc(sizeof(*work), GFP_NOFS);
8424  	if (!work)
8425  		return NULL;
8426  
8427  	init_completion(&work->completion);
8428  	INIT_LIST_HEAD(&work->list);
8429  	work->inode = inode;
8430  	btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL);
8431  
8432  	return work;
8433  }
8434  
8435  /*
8436   * some fairly slow code that needs optimization. This walks the list
8437   * of all the inodes with pending delalloc and forces them to disk.
8438   */
8439  static int start_delalloc_inodes(struct btrfs_root *root,
8440  				 struct writeback_control *wbc, bool snapshot,
8441  				 bool in_reclaim_context)
8442  {
8443  	struct btrfs_inode *binode;
8444  	struct inode *inode;
8445  	struct btrfs_delalloc_work *work, *next;
8446  	LIST_HEAD(works);
8447  	LIST_HEAD(splice);
8448  	int ret = 0;
8449  	bool full_flush = wbc->nr_to_write == LONG_MAX;
8450  
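	/*
	 * A full flush (no nr_to_write limit) queues one async work item per
	 * inode on the flush workers; a bounded flush writes back inline and
	 * stops once nr_to_write is exhausted.
	 */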
8451  	mutex_lock(&root->delalloc_mutex);
8452  	spin_lock(&root->delalloc_lock);
8453  	list_splice_init(&root->delalloc_inodes, &splice);
8454  	while (!list_empty(&splice)) {
8455  		binode = list_entry(splice.next, struct btrfs_inode,
8456  				    delalloc_inodes);
8457  
8458  		list_move_tail(&binode->delalloc_inodes,
8459  			       &root->delalloc_inodes);
8460  
8461  		if (in_reclaim_context &&
8462  		    test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags))
8463  			continue;
8464  
8465  		inode = igrab(&binode->vfs_inode);
8466  		if (!inode) {
8467  			cond_resched_lock(&root->delalloc_lock);
8468  			continue;
8469  		}
8470  		spin_unlock(&root->delalloc_lock);
8471  
8472  		if (snapshot)
8473  			set_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
8474  				&binode->runtime_flags);
8475  		if (full_flush) {
8476  			work = btrfs_alloc_delalloc_work(inode);
8477  			if (!work) {
8478  				iput(inode);
8479  				ret = -ENOMEM;
8480  				goto out;
8481  			}
8482  			list_add_tail(&work->list, &works);
8483  			btrfs_queue_work(root->fs_info->flush_workers,
8484  					 &work->work);
8485  		} else {
8486  			ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc);
8487  			btrfs_add_delayed_iput(BTRFS_I(inode));
8488  			if (ret || wbc->nr_to_write <= 0)
8489  				goto out;
8490  		}
8491  		cond_resched();
8492  		spin_lock(&root->delalloc_lock);
8493  	}
8494  	spin_unlock(&root->delalloc_lock);
8495  
8496  out:
8497  	list_for_each_entry_safe(work, next, &works, list) {
8498  		list_del_init(&work->list);
8499  		wait_for_completion(&work->completion);
8500  		kfree(work);
8501  	}
8502  
8503  	if (!list_empty(&splice)) {
8504  		spin_lock(&root->delalloc_lock);
8505  		list_splice_tail(&splice, &root->delalloc_inodes);
8506  		spin_unlock(&root->delalloc_lock);
8507  	}
8508  	mutex_unlock(&root->delalloc_mutex);
8509  	return ret;
8510  }
8511  
8512  int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context)
8513  {
8514  	struct writeback_control wbc = {
8515  		.nr_to_write = LONG_MAX,
8516  		.sync_mode = WB_SYNC_NONE,
8517  		.range_start = 0,
8518  		.range_end = LLONG_MAX,
8519  	};
8520  	struct btrfs_fs_info *fs_info = root->fs_info;
8521  
8522  	if (BTRFS_FS_ERROR(fs_info))
8523  		return -EROFS;
8524  
8525  	return start_delalloc_inodes(root, &wbc, true, in_reclaim_context);
8526  }
8527  
8528  int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
8529  			       bool in_reclaim_context)
8530  {
8531  	struct writeback_control wbc = {
8532  		.nr_to_write = nr,
8533  		.sync_mode = WB_SYNC_NONE,
8534  		.range_start = 0,
8535  		.range_end = LLONG_MAX,
8536  	};
8537  	struct btrfs_root *root;
8538  	LIST_HEAD(splice);
8539  	int ret;
8540  
8541  	if (BTRFS_FS_ERROR(fs_info))
8542  		return -EROFS;
8543  
8544  	mutex_lock(&fs_info->delalloc_root_mutex);
8545  	spin_lock(&fs_info->delalloc_root_lock);
8546  	list_splice_init(&fs_info->delalloc_roots, &splice);
8547  	while (!list_empty(&splice)) {
8548  		/*
8549  		 * Reset nr_to_write here so we know that we're doing a full
8550  		 * flush.
8551  		 */
8552  		if (nr == LONG_MAX)
8553  			wbc.nr_to_write = LONG_MAX;
8554  
8555  		root = list_first_entry(&splice, struct btrfs_root,
8556  					delalloc_root);
8557  		root = btrfs_grab_root(root);
8558  		BUG_ON(!root);
8559  		list_move_tail(&root->delalloc_root,
8560  			       &fs_info->delalloc_roots);
8561  		spin_unlock(&fs_info->delalloc_root_lock);
8562  
8563  		ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context);
8564  		btrfs_put_root(root);
8565  		if (ret < 0 || wbc.nr_to_write <= 0)
8566  			goto out;
8567  		spin_lock(&fs_info->delalloc_root_lock);
8568  	}
8569  	spin_unlock(&fs_info->delalloc_root_lock);
8570  
8571  	ret = 0;
8572  out:
8573  	if (!list_empty(&splice)) {
8574  		spin_lock(&fs_info->delalloc_root_lock);
8575  		list_splice_tail(&splice, &fs_info->delalloc_roots);
8576  		spin_unlock(&fs_info->delalloc_root_lock);
8577  	}
8578  	mutex_unlock(&fs_info->delalloc_root_mutex);
8579  	return ret;
8580  }
8581  
8582  static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
8583  			 struct dentry *dentry, const char *symname)
8584  {
8585  	struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
8586  	struct btrfs_trans_handle *trans;
8587  	struct btrfs_root *root = BTRFS_I(dir)->root;
8588  	struct btrfs_path *path;
8589  	struct btrfs_key key;
8590  	struct inode *inode;
8591  	struct btrfs_new_inode_args new_inode_args = {
8592  		.dir = dir,
8593  		.dentry = dentry,
8594  	};
8595  	unsigned int trans_num_items;
8596  	int err;
8597  	int name_len;
8598  	int datasize;
8599  	unsigned long ptr;
8600  	struct btrfs_file_extent_item *ei;
8601  	struct extent_buffer *leaf;
8602  
8603  	name_len = strlen(symname);
8604  	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
8605  		return -ENAMETOOLONG;
8606  
8607  	inode = new_inode(dir->i_sb);
8608  	if (!inode)
8609  		return -ENOMEM;
8610  	inode_init_owner(idmap, inode, dir, S_IFLNK | S_IRWXUGO);
8611  	inode->i_op = &btrfs_symlink_inode_operations;
8612  	inode_nohighmem(inode);
8613  	inode->i_mapping->a_ops = &btrfs_aops;
8614  	btrfs_i_size_write(BTRFS_I(inode), name_len);
8615  	inode_set_bytes(inode, name_len);
8616  
8617  	new_inode_args.inode = inode;
8618  	err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
8619  	if (err)
8620  		goto out_inode;
8621  	/* 1 additional item for the inline extent */
8622  	trans_num_items++;
8623  
8624  	trans = btrfs_start_transaction(root, trans_num_items);
8625  	if (IS_ERR(trans)) {
8626  		err = PTR_ERR(trans);
8627  		goto out_new_inode_args;
8628  	}
8629  
8630  	err = btrfs_create_new_inode(trans, &new_inode_args);
8631  	if (err)
8632  		goto out;
8633  
8634  	path = btrfs_alloc_path();
8635  	if (!path) {
8636  		err = -ENOMEM;
8637  		btrfs_abort_transaction(trans, err);
8638  		discard_new_inode(inode);
8639  		inode = NULL;
8640  		goto out;
8641  	}
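	/*
	 * The symlink target is stored as an uncompressed inline file extent,
	 * which is why its length is limited to BTRFS_MAX_INLINE_DATA_SIZE
	 * (checked above).
	 */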
8642  	key.objectid = btrfs_ino(BTRFS_I(inode));
8643  	key.offset = 0;
8644  	key.type = BTRFS_EXTENT_DATA_KEY;
8645  	datasize = btrfs_file_extent_calc_inline_size(name_len);
8646  	err = btrfs_insert_empty_item(trans, root, path, &key,
8647  				      datasize);
8648  	if (err) {
8649  		btrfs_abort_transaction(trans, err);
8650  		btrfs_free_path(path);
8651  		discard_new_inode(inode);
8652  		inode = NULL;
8653  		goto out;
8654  	}
8655  	leaf = path->nodes[0];
8656  	ei = btrfs_item_ptr(leaf, path->slots[0],
8657  			    struct btrfs_file_extent_item);
8658  	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
8659  	btrfs_set_file_extent_type(leaf, ei,
8660  				   BTRFS_FILE_EXTENT_INLINE);
8661  	btrfs_set_file_extent_encryption(leaf, ei, 0);
8662  	btrfs_set_file_extent_compression(leaf, ei, 0);
8663  	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
8664  	btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
8665  
8666  	ptr = btrfs_file_extent_inline_start(ei);
8667  	write_extent_buffer(leaf, symname, ptr, name_len);
8668  	btrfs_mark_buffer_dirty(trans, leaf);
8669  	btrfs_free_path(path);
8670  
8671  	d_instantiate_new(dentry, inode);
8672  	err = 0;
8673  out:
8674  	btrfs_end_transaction(trans);
8675  	btrfs_btree_balance_dirty(fs_info);
8676  out_new_inode_args:
8677  	btrfs_new_inode_args_destroy(&new_inode_args);
8678  out_inode:
8679  	if (err)
8680  		iput(inode);
8681  	return err;
8682  }
8683  
8684  static struct btrfs_trans_handle *insert_prealloc_file_extent(
8685  				       struct btrfs_trans_handle *trans_in,
8686  				       struct btrfs_inode *inode,
8687  				       struct btrfs_key *ins,
8688  				       u64 file_offset)
8689  {
8690  	struct btrfs_file_extent_item stack_fi;
8691  	struct btrfs_replace_extent_info extent_info;
8692  	struct btrfs_trans_handle *trans = trans_in;
8693  	struct btrfs_path *path;
8694  	u64 start = ins->objectid;
8695  	u64 len = ins->offset;
8696  	u64 qgroup_released = 0;
8697  	int ret;
8698  
8699  	memset(&stack_fi, 0, sizeof(stack_fi));
8700  
8701  	btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_PREALLOC);
8702  	btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, start);
8703  	btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, len);
8704  	btrfs_set_stack_file_extent_num_bytes(&stack_fi, len);
8705  	btrfs_set_stack_file_extent_ram_bytes(&stack_fi, len);
8706  	btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE);
8707  	/* Encryption and other encoding is reserved and all 0 */
8708  
8709  	ret = btrfs_qgroup_release_data(inode, file_offset, len, &qgroup_released);
8710  	if (ret < 0)
8711  		return ERR_PTR(ret);
8712  
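	/*
	 * With a transaction handle from the caller we can insert the file
	 * extent item directly; without one, btrfs_replace_file_extents()
	 * below starts its own transaction and also drops any existing
	 * extents in the target range.
	 */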
8713  	if (trans) {
8714  		ret = insert_reserved_file_extent(trans, inode,
8715  						  file_offset, &stack_fi,
8716  						  true, qgroup_released);
8717  		if (ret)
8718  			goto free_qgroup;
8719  		return trans;
8720  	}
8721  
8722  	extent_info.disk_offset = start;
8723  	extent_info.disk_len = len;
8724  	extent_info.data_offset = 0;
8725  	extent_info.data_len = len;
8726  	extent_info.file_offset = file_offset;
8727  	extent_info.extent_buf = (char *)&stack_fi;
8728  	extent_info.is_new_extent = true;
8729  	extent_info.update_times = true;
8730  	extent_info.qgroup_reserved = qgroup_released;
8731  	extent_info.insertions = 0;
8732  
8733  	path = btrfs_alloc_path();
8734  	if (!path) {
8735  		ret = -ENOMEM;
8736  		goto free_qgroup;
8737  	}
8738  
8739  	ret = btrfs_replace_file_extents(inode, path, file_offset,
8740  				     file_offset + len - 1, &extent_info,
8741  				     &trans);
8742  	btrfs_free_path(path);
8743  	if (ret)
8744  		goto free_qgroup;
8745  	return trans;
8746  
8747  free_qgroup:
8748  	/*
8749  	 * We have released qgroup data range at the beginning of the function,
8750  	 * and normally qgroup_released bytes will be freed when committing
8751  	 * transaction.
8752  	 * But if we error out early, we have to free what we have released
8753  	 * or we leak qgroup data reservation.
8754  	 */
8755  	btrfs_qgroup_free_refroot(inode->root->fs_info,
8756  			btrfs_root_id(inode->root), qgroup_released,
8757  			BTRFS_QGROUP_RSV_DATA);
8758  	return ERR_PTR(ret);
8759  }
8760  
8761  static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
8762  				       u64 start, u64 num_bytes, u64 min_size,
8763  				       loff_t actual_len, u64 *alloc_hint,
8764  				       struct btrfs_trans_handle *trans)
8765  {
8766  	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
8767  	struct extent_map *em;
8768  	struct btrfs_root *root = BTRFS_I(inode)->root;
8769  	struct btrfs_key ins;
8770  	u64 cur_offset = start;
8771  	u64 clear_offset = start;
8772  	u64 i_size;
8773  	u64 cur_bytes;
8774  	u64 last_alloc = (u64)-1;
8775  	int ret = 0;
8776  	bool own_trans = true;
8777  	u64 end = start + num_bytes - 1;
8778  
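	/*
	 * If the caller supplied a transaction handle we must not end it here;
	 * otherwise each loop iteration runs in its own transaction, started by
	 * insert_prealloc_file_extent() and ended at the bottom of the loop.
	 */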
8779  	if (trans)
8780  		own_trans = false;
8781  	while (num_bytes > 0) {
8782  		cur_bytes = min_t(u64, num_bytes, SZ_256M);
8783  		cur_bytes = max(cur_bytes, min_size);
8784  		/*
8785  		 * If we are severely fragmented we could end up with really
8786  		 * small allocations, so if the allocator is returning small
8787  		 * chunks lets make its job easier by only searching for those
8788  		 * sized chunks.
8789  		 */
8790  		cur_bytes = min(cur_bytes, last_alloc);
8791  		ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
8792  				min_size, 0, *alloc_hint, &ins, 1, 0);
8793  		if (ret)
8794  			break;
8795  
8796  		/*
8797  		 * We've reserved this space, and thus converted it from
8798  		 * ->bytes_may_use to ->bytes_reserved.  Any error that happens
8799  		 * from here on out we will only need to clear our reservation
8800  		 * for the remaining unreserved area, so advance our
8801  		 * clear_offset by our extent size.
8802  		 */
8803  		clear_offset += ins.offset;
8804  
8805  		last_alloc = ins.offset;
8806  		trans = insert_prealloc_file_extent(trans, BTRFS_I(inode),
8807  						    &ins, cur_offset);
8808  		/*
8809  		 * Now that we inserted the prealloc extent we can finally
8810  		 * decrement the number of reservations in the block group.
8811  		 * If we did it before, we could race with relocation and have
8812  		 * relocation miss the reserved extent, making it fail later.
8813  		 */
8814  		btrfs_dec_block_group_reservations(fs_info, ins.objectid);
8815  		if (IS_ERR(trans)) {
8816  			ret = PTR_ERR(trans);
8817  			btrfs_free_reserved_extent(fs_info, ins.objectid,
8818  						   ins.offset, 0);
8819  			break;
8820  		}
8821  
8822  		em = alloc_extent_map();
8823  		if (!em) {
8824  			btrfs_drop_extent_map_range(BTRFS_I(inode), cur_offset,
8825  					    cur_offset + ins.offset - 1, false);
8826  			btrfs_set_inode_full_sync(BTRFS_I(inode));
8827  			goto next;
8828  		}
8829  
8830  		em->start = cur_offset;
8831  		em->len = ins.offset;
8832  		em->disk_bytenr = ins.objectid;
8833  		em->offset = 0;
8834  		em->disk_num_bytes = ins.offset;
8835  		em->ram_bytes = ins.offset;
8836  		em->flags |= EXTENT_FLAG_PREALLOC;
8837  		em->generation = trans->transid;
8838  
8839  		ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true);
8840  		free_extent_map(em);
8841  next:
8842  		num_bytes -= ins.offset;
8843  		cur_offset += ins.offset;
8844  		*alloc_hint = ins.objectid + ins.offset;
8845  
8846  		inode_inc_iversion(inode);
8847  		inode_set_ctime_current(inode);
8848  		BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
8849  		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
8850  		    (actual_len > inode->i_size) &&
8851  		    (cur_offset > inode->i_size)) {
8852  			if (cur_offset > actual_len)
8853  				i_size = actual_len;
8854  			else
8855  				i_size = cur_offset;
8856  			i_size_write(inode, i_size);
8857  			btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
8858  		}
8859  
8860  		ret = btrfs_update_inode(trans, BTRFS_I(inode));
8861  
8862  		if (ret) {
8863  			btrfs_abort_transaction(trans, ret);
8864  			if (own_trans)
8865  				btrfs_end_transaction(trans);
8866  			break;
8867  		}
8868  
8869  		if (own_trans) {
8870  			btrfs_end_transaction(trans);
8871  			trans = NULL;
8872  		}
8873  	}
8874  	if (clear_offset < end)
8875  		btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset,
8876  			end - clear_offset + 1);
8877  	return ret;
8878  }
8879  
8880  int btrfs_prealloc_file_range(struct inode *inode, int mode,
8881  			      u64 start, u64 num_bytes, u64 min_size,
8882  			      loff_t actual_len, u64 *alloc_hint)
8883  {
8884  	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
8885  					   min_size, actual_len, alloc_hint,
8886  					   NULL);
8887  }
8888  
8889  int btrfs_prealloc_file_range_trans(struct inode *inode,
8890  				    struct btrfs_trans_handle *trans, int mode,
8891  				    u64 start, u64 num_bytes, u64 min_size,
8892  				    loff_t actual_len, u64 *alloc_hint)
8893  {
8894  	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
8895  					   min_size, actual_len, alloc_hint, trans);
8896  }
8897  
8898  static int btrfs_permission(struct mnt_idmap *idmap,
8899  			    struct inode *inode, int mask)
8900  {
8901  	struct btrfs_root *root = BTRFS_I(inode)->root;
8902  	umode_t mode = inode->i_mode;
8903  
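	/* Deny write access on read-only subvolumes and read-only inodes. */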
8904  	if (mask & MAY_WRITE &&
8905  	    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
8906  		if (btrfs_root_readonly(root))
8907  			return -EROFS;
8908  		if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
8909  			return -EACCES;
8910  	}
8911  	return generic_permission(idmap, inode, mask);
8912  }
8913  
8914  static int btrfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
8915  			 struct file *file, umode_t mode)
8916  {
8917  	struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
8918  	struct btrfs_trans_handle *trans;
8919  	struct btrfs_root *root = BTRFS_I(dir)->root;
8920  	struct inode *inode;
8921  	struct btrfs_new_inode_args new_inode_args = {
8922  		.dir = dir,
8923  		.dentry = file->f_path.dentry,
8924  		.orphan = true,
8925  	};
8926  	unsigned int trans_num_items;
8927  	int ret;
8928  
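	/*
	 * O_TMPFILE inodes start out unlinked, so the new inode is created
	 * with an orphan item (.orphan above) and gets cleaned up
	 * automatically if it is never linked into the namespace.
	 */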
8929  	inode = new_inode(dir->i_sb);
8930  	if (!inode)
8931  		return -ENOMEM;
8932  	inode_init_owner(idmap, inode, dir, mode);
8933  	inode->i_fop = &btrfs_file_operations;
8934  	inode->i_op = &btrfs_file_inode_operations;
8935  	inode->i_mapping->a_ops = &btrfs_aops;
8936  
8937  	new_inode_args.inode = inode;
8938  	ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
8939  	if (ret)
8940  		goto out_inode;
8941  
8942  	trans = btrfs_start_transaction(root, trans_num_items);
8943  	if (IS_ERR(trans)) {
8944  		ret = PTR_ERR(trans);
8945  		goto out_new_inode_args;
8946  	}
8947  
8948  	ret = btrfs_create_new_inode(trans, &new_inode_args);
8949  
8950  	/*
8951  	 * We set number of links to 0 in btrfs_create_new_inode(), and here we
8952  	 * set it to 1 because d_tmpfile() will issue a warning if the count is
8953  	 * 0, through:
8954  	 *
8955  	 *    d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
8956  	 */
8957  	set_nlink(inode, 1);
8958  
8959  	if (!ret) {
8960  		d_tmpfile(file, inode);
8961  		unlock_new_inode(inode);
8962  		mark_inode_dirty(inode);
8963  	}
8964  
8965  	btrfs_end_transaction(trans);
8966  	btrfs_btree_balance_dirty(fs_info);
8967  out_new_inode_args:
8968  	btrfs_new_inode_args_destroy(&new_inode_args);
8969  out_inode:
8970  	if (ret)
8971  		iput(inode);
8972  	return finish_open_simple(file, ret);
8973  }
8974  
8975  void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
8976  {
8977  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
8978  	unsigned long index = start >> PAGE_SHIFT;
8979  	unsigned long end_index = end >> PAGE_SHIFT;
8980  	struct folio *folio;
8981  	u32 len;
8982  
8983  	ASSERT(end + 1 - start <= U32_MAX);
8984  	len = end + 1 - start;
8985  	while (index <= end_index) {
8986  		folio = __filemap_get_folio(inode->vfs_inode.i_mapping, index, 0, 0);
8987  		ASSERT(!IS_ERR(folio)); /* folios should be in the extent_io_tree */
8988  
8989  		/* This is for data, which doesn't yet support larger folios. */
8990  		ASSERT(folio_order(folio) == 0);
8991  		btrfs_folio_set_writeback(fs_info, folio, start, len);
8992  		folio_put(folio);
8993  		index++;
8994  	}
8995  }
8996  
8997  int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
8998  					     int compress_type)
8999  {
9000  	switch (compress_type) {
9001  	case BTRFS_COMPRESS_NONE:
9002  		return BTRFS_ENCODED_IO_COMPRESSION_NONE;
9003  	case BTRFS_COMPRESS_ZLIB:
9004  		return BTRFS_ENCODED_IO_COMPRESSION_ZLIB;
9005  	case BTRFS_COMPRESS_LZO:
9006  		/*
9007  		 * The LZO format depends on the sector size. 64K is the maximum
9008  		 * sector size that we support.
9009  		 */
9010  		if (fs_info->sectorsize < SZ_4K || fs_info->sectorsize > SZ_64K)
9011  			return -EINVAL;
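		/*
		 * Encode the sector size in the return value: a 4K sector size
		 * maps to BTRFS_ENCODED_IO_COMPRESSION_LZO_4K, 8K to _LZO_8K,
		 * and so on up to 64K (sectorsize_bits 12 through 16).
		 */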
9012  		return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K +
9013  		       (fs_info->sectorsize_bits - 12);
9014  	case BTRFS_COMPRESS_ZSTD:
9015  		return BTRFS_ENCODED_IO_COMPRESSION_ZSTD;
9016  	default:
9017  		return -EUCLEAN;
9018  	}
9019  }
9020  
9021  static ssize_t btrfs_encoded_read_inline(
9022  				struct kiocb *iocb,
9023  				struct iov_iter *iter, u64 start,
9024  				u64 lockend,
9025  				struct extent_state **cached_state,
9026  				u64 extent_start, size_t count,
9027  				struct btrfs_ioctl_encoded_io_args *encoded,
9028  				bool *unlocked)
9029  {
9030  	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
9031  	struct btrfs_root *root = inode->root;
9032  	struct btrfs_fs_info *fs_info = root->fs_info;
9033  	struct extent_io_tree *io_tree = &inode->io_tree;
9034  	struct btrfs_path *path;
9035  	struct extent_buffer *leaf;
9036  	struct btrfs_file_extent_item *item;
9037  	u64 ram_bytes;
9038  	unsigned long ptr;
9039  	void *tmp;
9040  	ssize_t ret;
9041  
9042  	path = btrfs_alloc_path();
9043  	if (!path) {
9044  		ret = -ENOMEM;
9045  		goto out;
9046  	}
9047  	ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
9048  				       extent_start, 0);
9049  	if (ret) {
9050  		if (ret > 0) {
9051  			/* The extent item disappeared? */
9052  			ret = -EIO;
9053  		}
9054  		goto out;
9055  	}
9056  	leaf = path->nodes[0];
9057  	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
9058  
9059  	ram_bytes = btrfs_file_extent_ram_bytes(leaf, item);
9060  	ptr = btrfs_file_extent_inline_start(item);
9061  
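	/*
	 * Cap the returned length at the end of the inline extent or at EOF,
	 * whichever comes first.
	 */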
9062  	encoded->len = min_t(u64, extent_start + ram_bytes,
9063  			     inode->vfs_inode.i_size) - iocb->ki_pos;
9064  	ret = btrfs_encoded_io_compression_from_extent(fs_info,
9065  				 btrfs_file_extent_compression(leaf, item));
9066  	if (ret < 0)
9067  		goto out;
9068  	encoded->compression = ret;
9069  	if (encoded->compression) {
9070  		size_t inline_size;
9071  
9072  		inline_size = btrfs_file_extent_inline_item_len(leaf,
9073  								path->slots[0]);
9074  		if (inline_size > count) {
9075  			ret = -ENOBUFS;
9076  			goto out;
9077  		}
9078  		count = inline_size;
9079  		encoded->unencoded_len = ram_bytes;
9080  		encoded->unencoded_offset = iocb->ki_pos - extent_start;
9081  	} else {
9082  		count = min_t(u64, count, encoded->len);
9083  		encoded->len = count;
9084  		encoded->unencoded_len = count;
9085  		ptr += iocb->ki_pos - extent_start;
9086  	}
9087  
9088  	tmp = kmalloc(count, GFP_NOFS);
9089  	if (!tmp) {
9090  		ret = -ENOMEM;
9091  		goto out;
9092  	}
9093  	read_extent_buffer(leaf, tmp, ptr, count);
9094  	btrfs_release_path(path);
9095  	unlock_extent(io_tree, start, lockend, cached_state);
9096  	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
9097  	*unlocked = true;
9098  
9099  	ret = copy_to_iter(tmp, count, iter);
9100  	if (ret != count)
9101  		ret = -EFAULT;
9102  	kfree(tmp);
9103  out:
9104  	btrfs_free_path(path);
9105  	return ret;
9106  }
9107  
9108  struct btrfs_encoded_read_private {
9109  	wait_queue_head_t wait;
9110  	atomic_t pending;
9111  	blk_status_t status;
9112  };
9113  
9114  static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
9115  {
9116  	struct btrfs_encoded_read_private *priv = bbio->private;
9117  
9118  	if (bbio->bio.bi_status) {
9119  		/*
9120  		 * The memory barrier implied by the atomic_dec_return() here
9121  		 * pairs with the memory barrier implied by the
9122  		 * atomic_dec_return() or io_wait_event() in
9123  		 * btrfs_encoded_read_regular_fill_pages() to ensure that this
9124  		 * write is observed before the load of status in
9125  		 * btrfs_encoded_read_regular_fill_pages().
9126  		 */
9127  		WRITE_ONCE(priv->status, bbio->bio.bi_status);
9128  	}
9129  	if (!atomic_dec_return(&priv->pending))
9130  		wake_up(&priv->wait);
9131  	bio_put(&bbio->bio);
9132  }
9133  
9134  int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
9135  					  u64 file_offset, u64 disk_bytenr,
9136  					  u64 disk_io_size, struct page **pages)
9137  {
9138  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
9139  	struct btrfs_encoded_read_private priv = {
9140  		.pending = ATOMIC_INIT(1),
9141  	};
9142  	unsigned long i = 0;
9143  	struct btrfs_bio *bbio;
9144  
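	/*
	 * priv.pending starts at 1 as a bias so that bio completions cannot
	 * drop it to zero and wake us before the last bio has been submitted;
	 * the final atomic_dec_return() below drops the bias.
	 */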
9145  	init_waitqueue_head(&priv.wait);
9146  
9147  	bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
9148  			       btrfs_encoded_read_endio, &priv);
9149  	bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
9150  	bbio->inode = inode;
9151  
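	/*
	 * Add pages until the bio is full; when bio_add_page() cannot accept
	 * the next chunk, submit the current bio and start a new one at the
	 * current disk offset.
	 */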
9152  	do {
9153  		size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE);
9154  
9155  		if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) {
9156  			atomic_inc(&priv.pending);
9157  			btrfs_submit_bbio(bbio, 0);
9158  
9159  			bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
9160  					       btrfs_encoded_read_endio, &priv);
9161  			bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
9162  			bbio->inode = inode;
9163  			continue;
9164  		}
9165  
9166  		i++;
9167  		disk_bytenr += bytes;
9168  		disk_io_size -= bytes;
9169  	} while (disk_io_size);
9170  
9171  	atomic_inc(&priv.pending);
9172  	btrfs_submit_bbio(bbio, 0);
9173  
9174  	if (atomic_dec_return(&priv.pending))
9175  		io_wait_event(priv.wait, !atomic_read(&priv.pending));
9176  	/* See btrfs_encoded_read_endio() for ordering. */
9177  	return blk_status_to_errno(READ_ONCE(priv.status));
9178  }
9179  
9180  static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
9181  					  struct iov_iter *iter,
9182  					  u64 start, u64 lockend,
9183  					  struct extent_state **cached_state,
9184  					  u64 disk_bytenr, u64 disk_io_size,
9185  					  size_t count, bool compressed,
9186  					  bool *unlocked)
9187  {
9188  	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
9189  	struct extent_io_tree *io_tree = &inode->io_tree;
9190  	struct page **pages;
9191  	unsigned long nr_pages, i;
9192  	u64 cur;
9193  	size_t page_offset;
9194  	ssize_t ret;
9195  
9196  	nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
9197  	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
9198  	if (!pages)
9199  		return -ENOMEM;
9200  	ret = btrfs_alloc_page_array(nr_pages, pages, false);
9201  	if (ret) {
9202  		ret = -ENOMEM;
9203  		goto out;
9204  	}
9205  
9206  	ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
9207  						    disk_io_size, pages);
9208  	if (ret)
9209  		goto out;
9210  
9211  	unlock_extent(io_tree, start, lockend, cached_state);
9212  	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
9213  	*unlocked = true;
9214  
9215  	if (compressed) {
9216  		i = 0;
9217  		page_offset = 0;
9218  	} else {
9219  		i = (iocb->ki_pos - start) >> PAGE_SHIFT;
9220  		page_offset = (iocb->ki_pos - start) & (PAGE_SIZE - 1);
9221  	}
9222  	cur = 0;
9223  	while (cur < count) {
9224  		size_t bytes = min_t(size_t, count - cur,
9225  				     PAGE_SIZE - page_offset);
9226  
9227  		if (copy_page_to_iter(pages[i], page_offset, bytes,
9228  				      iter) != bytes) {
9229  			ret = -EFAULT;
9230  			goto out;
9231  		}
9232  		i++;
9233  		cur += bytes;
9234  		page_offset = 0;
9235  	}
9236  	ret = count;
9237  out:
9238  	for (i = 0; i < nr_pages; i++) {
9239  		if (pages[i])
9240  			__free_page(pages[i]);
9241  	}
9242  	kfree(pages);
9243  	return ret;
9244  }
9245  
9246  ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
9247  			   struct btrfs_ioctl_encoded_io_args *encoded)
9248  {
9249  	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
9250  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
9251  	struct extent_io_tree *io_tree = &inode->io_tree;
9252  	ssize_t ret;
9253  	size_t count = iov_iter_count(iter);
9254  	u64 start, lockend, disk_bytenr, disk_io_size;
9255  	struct extent_state *cached_state = NULL;
9256  	struct extent_map *em;
9257  	bool unlocked = false;
9258  
9259  	file_accessed(iocb->ki_filp);
9260  
9261  	btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
9262  
9263  	if (iocb->ki_pos >= inode->vfs_inode.i_size) {
9264  		btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
9265  		return 0;
9266  	}
9267  	start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize);
9268  	/*
9269  	 * We don't know how long the extent containing iocb->ki_pos is, but if
9270  	 * it's compressed we know that it won't be longer than this.
9271  	 */
9272  	lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
9273  
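	/*
	 * Flush and wait for ordered extents in the range, retrying until we
	 * can hold the extent range locked with no ordered extent present, so
	 * that the extent mapping we read is stable.
	 */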
9274  	for (;;) {
9275  		struct btrfs_ordered_extent *ordered;
9276  
9277  		ret = btrfs_wait_ordered_range(inode, start,
9278  					       lockend - start + 1);
9279  		if (ret)
9280  			goto out_unlock_inode;
9281  		lock_extent(io_tree, start, lockend, &cached_state);
9282  		ordered = btrfs_lookup_ordered_range(inode, start,
9283  						     lockend - start + 1);
9284  		if (!ordered)
9285  			break;
9286  		btrfs_put_ordered_extent(ordered);
9287  		unlock_extent(io_tree, start, lockend, &cached_state);
9288  		cond_resched();
9289  	}
9290  
9291  	em = btrfs_get_extent(inode, NULL, start, lockend - start + 1);
9292  	if (IS_ERR(em)) {
9293  		ret = PTR_ERR(em);
9294  		goto out_unlock_extent;
9295  	}
9296  
9297  	if (em->disk_bytenr == EXTENT_MAP_INLINE) {
9298  		u64 extent_start = em->start;
9299  
9300  		/*
9301  		 * For inline extents we get everything we need out of the
9302  		 * extent item.
9303  		 */
9304  		free_extent_map(em);
9305  		em = NULL;
9306  		ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
9307  						&cached_state, extent_start,
9308  						count, encoded, &unlocked);
9309  		goto out;
9310  	}
9311  
9312  	/*
9313  	 * We only want to return up to EOF even if the extent extends beyond
9314  	 * that.
9315  	 */
9316  	encoded->len = min_t(u64, extent_map_end(em),
9317  			     inode->vfs_inode.i_size) - iocb->ki_pos;
9318  	if (em->disk_bytenr == EXTENT_MAP_HOLE ||
9319  	    (em->flags & EXTENT_FLAG_PREALLOC)) {
9320  		disk_bytenr = EXTENT_MAP_HOLE;
9321  		count = min_t(u64, count, encoded->len);
9322  		encoded->len = count;
9323  		encoded->unencoded_len = count;
9324  	} else if (extent_map_is_compressed(em)) {
9325  		disk_bytenr = em->disk_bytenr;
9326  		/*
9327  		 * Bail if the buffer isn't large enough to return the whole
9328  		 * compressed extent.
9329  		 */
9330  		if (em->disk_num_bytes > count) {
9331  			ret = -ENOBUFS;
9332  			goto out_em;
9333  		}
9334  		disk_io_size = em->disk_num_bytes;
9335  		count = em->disk_num_bytes;
9336  		encoded->unencoded_len = em->ram_bytes;
9337  		encoded->unencoded_offset = iocb->ki_pos - (em->start - em->offset);
9338  		ret = btrfs_encoded_io_compression_from_extent(fs_info,
9339  							       extent_map_compression(em));
9340  		if (ret < 0)
9341  			goto out_em;
9342  		encoded->compression = ret;
9343  	} else {
9344  		disk_bytenr = extent_map_block_start(em) + (start - em->start);
9345  		if (encoded->len > count)
9346  			encoded->len = count;
9347  		/*
9348  		 * Don't read beyond what we locked. This also limits the page
9349  		 * allocations that we'll do.
9350  		 */
9351  		disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start;
9352  		count = start + disk_io_size - iocb->ki_pos;
9353  		encoded->len = count;
9354  		encoded->unencoded_len = count;
9355  		disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize);
9356  	}
9357  	free_extent_map(em);
9358  	em = NULL;
9359  
9360  	if (disk_bytenr == EXTENT_MAP_HOLE) {
9361  		unlock_extent(io_tree, start, lockend, &cached_state);
9362  		btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
9363  		unlocked = true;
9364  		ret = iov_iter_zero(count, iter);
9365  		if (ret != count)
9366  			ret = -EFAULT;
9367  	} else {
9368  		ret = btrfs_encoded_read_regular(iocb, iter, start, lockend,
9369  						 &cached_state, disk_bytenr,
9370  						 disk_io_size, count,
9371  						 encoded->compression,
9372  						 &unlocked);
9373  	}
9374  
9375  out:
9376  	if (ret >= 0)
9377  		iocb->ki_pos += encoded->len;
9378  out_em:
9379  	free_extent_map(em);
9380  out_unlock_extent:
9381  	if (!unlocked)
9382  		unlock_extent(io_tree, start, lockend, &cached_state);
9383  out_unlock_inode:
9384  	if (!unlocked)
9385  		btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
9386  	return ret;
9387  }
9388  
9389  ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
9390  			       const struct btrfs_ioctl_encoded_io_args *encoded)
9391  {
9392  	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
9393  	struct btrfs_root *root = inode->root;
9394  	struct btrfs_fs_info *fs_info = root->fs_info;
9395  	struct extent_io_tree *io_tree = &inode->io_tree;
9396  	struct extent_changeset *data_reserved = NULL;
9397  	struct extent_state *cached_state = NULL;
9398  	struct btrfs_ordered_extent *ordered;
9399  	struct btrfs_file_extent file_extent;
9400  	int compression;
9401  	size_t orig_count;
9402  	u64 start, end;
9403  	u64 num_bytes, ram_bytes, disk_num_bytes;
9404  	unsigned long nr_folios, i;
9405  	struct folio **folios;
9406  	struct btrfs_key ins;
9407  	bool extent_reserved = false;
9408  	struct extent_map *em;
9409  	ssize_t ret;
9410  
9411  	switch (encoded->compression) {
9412  	case BTRFS_ENCODED_IO_COMPRESSION_ZLIB:
9413  		compression = BTRFS_COMPRESS_ZLIB;
9414  		break;
9415  	case BTRFS_ENCODED_IO_COMPRESSION_ZSTD:
9416  		compression = BTRFS_COMPRESS_ZSTD;
9417  		break;
9418  	case BTRFS_ENCODED_IO_COMPRESSION_LZO_4K:
9419  	case BTRFS_ENCODED_IO_COMPRESSION_LZO_8K:
9420  	case BTRFS_ENCODED_IO_COMPRESSION_LZO_16K:
9421  	case BTRFS_ENCODED_IO_COMPRESSION_LZO_32K:
9422  	case BTRFS_ENCODED_IO_COMPRESSION_LZO_64K:
9423  		/* The sector size must match for LZO. */
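		/*
		 * LZO_4K..LZO_64K stand for LZO page sizes of 4K (2^12) up to
		 * 64K (2^16), so the distance from LZO_4K plus 12 is the log2
		 * of the LZO page size and must equal sectorsize_bits.
		 */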
9424  		if (encoded->compression -
9425  		    BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 12 !=
9426  		    fs_info->sectorsize_bits)
9427  			return -EINVAL;
9428  		compression = BTRFS_COMPRESS_LZO;
9429  		break;
9430  	default:
9431  		return -EINVAL;
9432  	}
9433  	if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE)
9434  		return -EINVAL;
9435  
9436  	/*
9437  	 * Compressed extents should always have checksums, so error out if we
9438  	 * have a NOCOW file or the inode was created while mounted with NODATASUM.
9439  	 */
9440  	if (inode->flags & BTRFS_INODE_NODATASUM)
9441  		return -EINVAL;
9442  
9443  	orig_count = iov_iter_count(from);
9444  
9445  	/* The extent size must be sane. */
9446  	if (encoded->unencoded_len > BTRFS_MAX_UNCOMPRESSED ||
9447  	    orig_count > BTRFS_MAX_COMPRESSED || orig_count == 0)
9448  		return -EINVAL;
9449  
9450  	/*
9451  	 * The compressed data must be smaller than the decompressed data.
9452  	 *
9453  	 * It's of course possible for data to compress to larger or the same
9454  	 * size, but the buffered I/O path falls back to no compression for such
9455  	 * data, and we don't want to break any assumptions by creating these
9456  	 * extents.
9457  	 *
9458  	 * Note that this is less strict than the current check we have that the
9459  	 * compressed data must be at least one sector smaller than the
9460  	 * decompressed data. We only want to enforce the weaker requirement
9461  	 * from old kernels that it is at least one byte smaller.
9462  	 */
9463  	if (orig_count >= encoded->unencoded_len)
9464  		return -EINVAL;
9465  
9466  	/* The extent must start on a sector boundary. */
9467  	start = iocb->ki_pos;
9468  	if (!IS_ALIGNED(start, fs_info->sectorsize))
9469  		return -EINVAL;
9470  
9471  	/*
9472  	 * The extent must end on a sector boundary. However, we allow a write
9473  	 * which ends at or extends i_size to have an unaligned length; we round
9474  	 * up the extent size and set i_size to the unaligned end.
9475  	 */
9476  	if (start + encoded->len < inode->vfs_inode.i_size &&
9477  	    !IS_ALIGNED(start + encoded->len, fs_info->sectorsize))
9478  		return -EINVAL;
9479  
9480  	/* Finally, the offset in the unencoded data must be sector-aligned. */
9481  	if (!IS_ALIGNED(encoded->unencoded_offset, fs_info->sectorsize))
9482  		return -EINVAL;
9483  
9484  	num_bytes = ALIGN(encoded->len, fs_info->sectorsize);
9485  	ram_bytes = ALIGN(encoded->unencoded_len, fs_info->sectorsize);
9486  	end = start + num_bytes - 1;
9487  
9488  	/*
9489  	 * If the extent cannot be inline, the compressed data on disk must be
9490  	 * sector-aligned. For convenience, we extend it with zeroes if it
9491  	 * isn't.
9492  	 */
9493  	disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize);
9494  	nr_folios = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE);
9495  	folios = kvcalloc(nr_folios, sizeof(struct folio *), GFP_KERNEL_ACCOUNT);
9496  	if (!folios)
9497  		return -ENOMEM;
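	/*
	 * Copy the encoded data from the iovec into freshly allocated folios,
	 * zero-filling whatever is left of the last page so the trailing
	 * sector is fully initialized.
	 */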
9498  	for (i = 0; i < nr_folios; i++) {
9499  		size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from));
9500  		char *kaddr;
9501  
9502  		folios[i] = folio_alloc(GFP_KERNEL_ACCOUNT, 0);
9503  		if (!folios[i]) {
9504  			ret = -ENOMEM;
9505  			goto out_folios;
9506  		}
9507  		kaddr = kmap_local_folio(folios[i], 0);
9508  		if (copy_from_iter(kaddr, bytes, from) != bytes) {
9509  			kunmap_local(kaddr);
9510  			ret = -EFAULT;
9511  			goto out_folios;
9512  		}
9513  		if (bytes < PAGE_SIZE)
9514  			memset(kaddr + bytes, 0, PAGE_SIZE - bytes);
9515  		kunmap_local(kaddr);
9516  	}
9517  
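	/*
	 * Make sure there is nothing in flight for the range we are about to
	 * write: wait for ordered extents, drop the page cache, and retry with
	 * the extent range locked until neither ordered extents nor cached
	 * pages remain.
	 */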
9518  	for (;;) {
9519  		struct btrfs_ordered_extent *ordered;
9520  
9521  		ret = btrfs_wait_ordered_range(inode, start, num_bytes);
9522  		if (ret)
9523  			goto out_folios;
9524  		ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping,
9525  						    start >> PAGE_SHIFT,
9526  						    end >> PAGE_SHIFT);
9527  		if (ret)
9528  			goto out_folios;
9529  		lock_extent(io_tree, start, end, &cached_state);
9530  		ordered = btrfs_lookup_ordered_range(inode, start, num_bytes);
9531  		if (!ordered &&
9532  		    !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end))
9533  			break;
9534  		if (ordered)
9535  			btrfs_put_ordered_extent(ordered);
9536  		unlock_extent(io_tree, start, end, &cached_state);
9537  		cond_resched();
9538  	}
9539  
9540  	/*
9541  	 * We don't use the higher-level delalloc space functions because our
9542  	 * num_bytes and disk_num_bytes are different.
9543  	 */
9544  	ret = btrfs_alloc_data_chunk_ondemand(inode, disk_num_bytes);
9545  	if (ret)
9546  		goto out_unlock;
9547  	ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes);
9548  	if (ret)
9549  		goto out_free_data_space;
9550  	ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes,
9551  					      false);
9552  	if (ret)
9553  		goto out_qgroup_free_data;
9554  
9555  	/* Try an inline extent first. */
9556  	if (encoded->unencoded_len == encoded->len &&
9557  	    encoded->unencoded_offset == 0 &&
9558  	    can_cow_file_range_inline(inode, start, encoded->len, orig_count)) {
9559  		ret = __cow_file_range_inline(inode, start, encoded->len,
9560  					      orig_count, compression, folios[0],
9561  					      true);
9562  		if (ret <= 0) {
9563  			if (ret == 0)
9564  				ret = orig_count;
9565  			goto out_delalloc_release;
9566  		}
9567  	}
9568  
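	/*
	 * A positive return from __cow_file_range_inline() means the data
	 * could not be stored inline; fall through and allocate a regular
	 * compressed extent instead.
	 */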
9569  	ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes,
9570  				   disk_num_bytes, 0, 0, &ins, 1, 1);
9571  	if (ret)
9572  		goto out_delalloc_release;
9573  	extent_reserved = true;
9574  
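	/*
	 * Describe the new compressed extent: the on-disk location and size
	 * come from the reservation, num_bytes/ram_bytes are the logical and
	 * decompressed lengths, and offset is where this write starts within
	 * the decompressed data.
	 */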
9575  	file_extent.disk_bytenr = ins.objectid;
9576  	file_extent.disk_num_bytes = ins.offset;
9577  	file_extent.num_bytes = num_bytes;
9578  	file_extent.ram_bytes = ram_bytes;
9579  	file_extent.offset = encoded->unencoded_offset;
9580  	file_extent.compression = compression;
9581  	em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED);
9582  	if (IS_ERR(em)) {
9583  		ret = PTR_ERR(em);
9584  		goto out_free_reserved;
9585  	}
9586  	free_extent_map(em);
9587  
9588  	ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
9589  				       (1 << BTRFS_ORDERED_ENCODED) |
9590  				       (1 << BTRFS_ORDERED_COMPRESSED));
9591  	if (IS_ERR(ordered)) {
9592  		btrfs_drop_extent_map_range(inode, start, end, false);
9593  		ret = PTR_ERR(ordered);
9594  		goto out_free_reserved;
9595  	}
9596  	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
9597  
9598  	if (start + encoded->len > inode->vfs_inode.i_size)
9599  		i_size_write(&inode->vfs_inode, start + encoded->len);
9600  
9601  	unlock_extent(io_tree, start, end, &cached_state);
9602  
9603  	btrfs_delalloc_release_extents(inode, num_bytes);
9604  
9605  	btrfs_submit_compressed_write(ordered, folios, nr_folios, 0, false);
9606  	ret = orig_count;
9607  	goto out;
9608  
9609  out_free_reserved:
9610  	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
9611  	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
9612  out_delalloc_release:
9613  	btrfs_delalloc_release_extents(inode, num_bytes);
9614  	btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0);
9615  out_qgroup_free_data:
9616  	if (ret < 0)
9617  		btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes, NULL);
9618  out_free_data_space:
9619  	/*
9620  	 * If btrfs_reserve_extent() succeeded, then we already decremented
9621  	 * bytes_may_use.
9622  	 */
9623  	if (!extent_reserved)
9624  		btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes);
9625  out_unlock:
9626  	unlock_extent(io_tree, start, end, &cached_state);
9627  out_folios:
9628  	for (i = 0; i < nr_folios; i++) {
9629  		if (folios[i])
9630  			folio_put(folios[i]);
9631  	}
9632  	kvfree(folios);
9633  out:
9634  	if (ret >= 0)
9635  		iocb->ki_pos += encoded->len;
9636  	return ret;
9637  }
9638  
9639  #ifdef CONFIG_SWAP
9640  /*
9641   * Add an entry indicating a block group or device which is pinned by a
9642   * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a
9643   * negative errno on failure.
9644   */
9645  static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr,
9646  				  bool is_block_group)
9647  {
9648  	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
9649  	struct btrfs_swapfile_pin *sp, *entry;
9650  	struct rb_node **p;
9651  	struct rb_node *parent = NULL;
9652  
9653  	sp = kmalloc(sizeof(*sp), GFP_NOFS);
9654  	if (!sp)
9655  		return -ENOMEM;
9656  	sp->ptr = ptr;
9657  	sp->inode = inode;
9658  	sp->is_block_group = is_block_group;
9659  	sp->bg_extent_count = 1;
9660  
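	/*
	 * The rbtree is ordered by (ptr, inode). If an entry for this pair
	 * already exists, just bump the extent count for block groups and let
	 * the caller know by returning 1.
	 */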
9661  	spin_lock(&fs_info->swapfile_pins_lock);
9662  	p = &fs_info->swapfile_pins.rb_node;
9663  	while (*p) {
9664  		parent = *p;
9665  		entry = rb_entry(parent, struct btrfs_swapfile_pin, node);
9666  		if (sp->ptr < entry->ptr ||
9667  		    (sp->ptr == entry->ptr && sp->inode < entry->inode)) {
9668  			p = &(*p)->rb_left;
9669  		} else if (sp->ptr > entry->ptr ||
9670  			   (sp->ptr == entry->ptr && sp->inode > entry->inode)) {
9671  			p = &(*p)->rb_right;
9672  		} else {
9673  			if (is_block_group)
9674  				entry->bg_extent_count++;
9675  			spin_unlock(&fs_info->swapfile_pins_lock);
9676  			kfree(sp);
9677  			return 1;
9678  		}
9679  	}
9680  	rb_link_node(&sp->node, parent, p);
9681  	rb_insert_color(&sp->node, &fs_info->swapfile_pins);
9682  	spin_unlock(&fs_info->swapfile_pins_lock);
9683  	return 0;
9684  }
9685  
9686  /* Free all of the entries pinned by this swapfile. */
9687  static void btrfs_free_swapfile_pins(struct inode *inode)
9688  {
9689  	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
9690  	struct btrfs_swapfile_pin *sp;
9691  	struct rb_node *node, *next;
9692  
9693  	spin_lock(&fs_info->swapfile_pins_lock);
9694  	node = rb_first(&fs_info->swapfile_pins);
9695  	while (node) {
9696  		next = rb_next(node);
9697  		sp = rb_entry(node, struct btrfs_swapfile_pin, node);
9698  		if (sp->inode == inode) {
9699  			rb_erase(&sp->node, &fs_info->swapfile_pins);
9700  			if (sp->is_block_group) {
9701  				btrfs_dec_block_group_swap_extents(sp->ptr,
9702  							   sp->bg_extent_count);
9703  				btrfs_put_block_group(sp->ptr);
9704  			}
9705  			kfree(sp);
9706  		}
9707  		node = next;
9708  	}
9709  	spin_unlock(&fs_info->swapfile_pins_lock);
9710  }
9711  
9712  struct btrfs_swap_info {
9713  	u64 start;
9714  	u64 block_start;
9715  	u64 block_len;
9716  	u64 lowest_ppage;
9717  	u64 highest_ppage;
9718  	unsigned long nr_pages;
9719  	int nr_extents;
9720  };
9721  
9722  static int btrfs_add_swap_extent(struct swap_info_struct *sis,
9723  				 struct btrfs_swap_info *bsi)
9724  {
9725  	unsigned long nr_pages;
9726  	unsigned long max_pages;
9727  	u64 first_ppage, first_ppage_reported, next_ppage;
9728  	int ret;
9729  
9730  	/*
9731  	 * Our swapfile may have had its size extended after the swap header was
9732  	 * written. In that case activating the swapfile should not go beyond
9733  	 * the max size set in the swap header.
9734  	 */
9735  	if (bsi->nr_pages >= sis->max)
9736  		return 0;
9737  
9738  	max_pages = sis->max - bsi->nr_pages;
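	/*
	 * Round the physical start up and the physical end down to page
	 * boundaries so that only pages fully covered by this extent are
	 * handed to the swap code.
	 */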
9739  	first_ppage = PAGE_ALIGN(bsi->block_start) >> PAGE_SHIFT;
9740  	next_ppage = PAGE_ALIGN_DOWN(bsi->block_start + bsi->block_len) >> PAGE_SHIFT;
9741  
9742  	if (first_ppage >= next_ppage)
9743  		return 0;
9744  	nr_pages = next_ppage - first_ppage;
9745  	nr_pages = min(nr_pages, max_pages);
9746  
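	/*
	 * The first page of the swapfile holds the swap header and is never
	 * used for swapping, so leave it out of the reported page span.
	 */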
9747  	first_ppage_reported = first_ppage;
9748  	if (bsi->start == 0)
9749  		first_ppage_reported++;
9750  	if (bsi->lowest_ppage > first_ppage_reported)
9751  		bsi->lowest_ppage = first_ppage_reported;
9752  	if (bsi->highest_ppage < (next_ppage - 1))
9753  		bsi->highest_ppage = next_ppage - 1;
9754  
9755  	ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage);
9756  	if (ret < 0)
9757  		return ret;
9758  	bsi->nr_extents += ret;
9759  	bsi->nr_pages += nr_pages;
9760  	return 0;
9761  }
9762  
9763  static void btrfs_swap_deactivate(struct file *file)
9764  {
9765  	struct inode *inode = file_inode(file);
9766  
9767  	btrfs_free_swapfile_pins(inode);
9768  	atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles);
9769  }
9770  
9771  static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
9772  			       sector_t *span)
9773  {
9774  	struct inode *inode = file_inode(file);
9775  	struct btrfs_root *root = BTRFS_I(inode)->root;
9776  	struct btrfs_fs_info *fs_info = root->fs_info;
9777  	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
9778  	struct extent_state *cached_state = NULL;
9779  	struct extent_map *em = NULL;
9780  	struct btrfs_chunk_map *map = NULL;
9781  	struct btrfs_device *device = NULL;
9782  	struct btrfs_swap_info bsi = {
9783  		.lowest_ppage = (sector_t)-1ULL,
9784  	};
9785  	int ret = 0;
9786  	u64 isize;
9787  	u64 start;
9788  
9789  	/*
9790  	 * If the swap file was just created, make sure delalloc is done. If the
9791  	 * file changes again after this, the user is doing something stupid and
9792  	 * we don't really care.
9793  	 */
9794  	ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
9795  	if (ret)
9796  		return ret;
9797  
9798  	/*
9799  	 * The inode is locked, so these flags won't change after we check them.
9800  	 */
9801  	if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) {
9802  		btrfs_warn(fs_info, "swapfile must not be compressed");
9803  		return -EINVAL;
9804  	}
9805  	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) {
9806  		btrfs_warn(fs_info, "swapfile must not be copy-on-write");
9807  		return -EINVAL;
9808  	}
9809  	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
9810  		btrfs_warn(fs_info, "swapfile must not be checksummed");
9811  		return -EINVAL;
9812  	}
9813  
9814  	/*
9815  	 * Balance or device remove/replace/resize can move stuff around from
9816  	 * under us. The exclop protection makes sure they aren't running/won't
9817  	 * run concurrently while we are mapping the swap extents, and
9818  	 * fs_info->swapfile_pins prevents them from running while the swap
9819  	 * file is active and moving the extents. Note that this also prevents
9820  	 * a concurrent device add which isn't actually necessary, but it's not
9821  	 * really worth the trouble to allow it.
9822  	 */
9823  	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) {
9824  		btrfs_warn(fs_info,
9825  	   "cannot activate swapfile while exclusive operation is running");
9826  		return -EBUSY;
9827  	}
9828  
9829  	/*
9830  	 * Prevent snapshot creation while we are activating the swap file.
9831  	 * We do not want to race with snapshot creation. If snapshot creation
9832  	 * already started before we bumped nr_swapfiles from 0 to 1 and
9833  	 * completes before the first write into the swap file after it is
9834  	 * activated, then that write would fall back to COW.
9835  	 */
9836  	if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) {
9837  		btrfs_exclop_finish(fs_info);
9838  		btrfs_warn(fs_info,
9839  	   "cannot activate swapfile because snapshot creation is in progress");
9840  		return -EINVAL;
9841  	}
9842  	/*
9843  	 * Snapshots can create extents which require COW even if NODATACOW is
9844  	 * set. We use this counter to prevent snapshots. We must increment it
9845  	 * before walking the extents because we don't want a concurrent
9846  	 * snapshot to run after we've already checked the extents.
9847  	 *
9848  	 * It is possible that the subvolume is marked for deletion but not yet
9849  	 * removed. To prevent this race, we check the root status before
9850  	 * activating the swapfile.
9851  	 */
9852  	spin_lock(&root->root_item_lock);
9853  	if (btrfs_root_dead(root)) {
9854  		spin_unlock(&root->root_item_lock);
9855  
9856  		btrfs_exclop_finish(fs_info);
9857  		btrfs_warn(fs_info,
9858  		"cannot activate swapfile because subvolume %llu is being deleted",
9859  			btrfs_root_id(root));
9860  		return -EPERM;
9861  	}
9862  	atomic_inc(&root->nr_swapfiles);
9863  	spin_unlock(&root->root_item_lock);
9864  
9865  	isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
9866  
9867  	lock_extent(io_tree, 0, isize - 1, &cached_state);
9868  	start = 0;
9869  	while (start < isize) {
9870  		u64 logical_block_start, physical_block_start;
9871  		struct btrfs_block_group *bg;
9872  		u64 len = isize - start;
9873  
9874  		em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);
9875  		if (IS_ERR(em)) {
9876  			ret = PTR_ERR(em);
9877  			goto out;
9878  		}
9879  
9880  		if (em->disk_bytenr == EXTENT_MAP_HOLE) {
9881  			btrfs_warn(fs_info, "swapfile must not have holes");
9882  			ret = -EINVAL;
9883  			goto out;
9884  		}
9885  		if (em->disk_bytenr == EXTENT_MAP_INLINE) {
9886  			/*
9887  			 * It's unlikely we'll ever actually find ourselves
9888  			 * here, as a file small enough to fit inline won't be
9889  			 * big enough to store more than the swap header, but in
9890  			 * case something changes in the future, let's catch it
9891  			 * here rather than later.
9892  			 */
9893  			btrfs_warn(fs_info, "swapfile must not be inline");
9894  			ret = -EINVAL;
9895  			goto out;
9896  		}
9897  		if (extent_map_is_compressed(em)) {
9898  			btrfs_warn(fs_info, "swapfile must not be compressed");
9899  			ret = -EINVAL;
9900  			goto out;
9901  		}
9902  
9903  		logical_block_start = extent_map_block_start(em) + (start - em->start);
9904  		len = min(len, em->len - (start - em->start));
9905  		free_extent_map(em);
9906  		em = NULL;
9907  
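		/*
		 * can_nocow_extent() returns > 0 if the range can be written
		 * without COW, 0 if a COW would be needed, and a negative
		 * errno on error. Anything other than "NOCOW is possible"
		 * disqualifies the file as a swapfile.
		 */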
9908  		ret = can_nocow_extent(inode, start, &len, NULL, false, true);
9909  		if (ret < 0) {
9910  			goto out;
9911  		} else if (ret) {
9912  			ret = 0;
9913  		} else {
9914  			btrfs_warn(fs_info,
9915  				   "swapfile must not be copy-on-write");
9916  			ret = -EINVAL;
9917  			goto out;
9918  		}
9919  
9920  		map = btrfs_get_chunk_map(fs_info, logical_block_start, len);
9921  		if (IS_ERR(map)) {
9922  			ret = PTR_ERR(map);
9923  			goto out;
9924  		}
9925  
9926  		if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
9927  			btrfs_warn(fs_info,
9928  				   "swapfile must have single data profile");
9929  			ret = -EINVAL;
9930  			goto out;
9931  		}
9932  
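		/*
		 * Swap I/O is submitted to a single block device (sis->bdev is
		 * set at the end), so every extent must live on the same
		 * device, and that device is pinned so it cannot be removed or
		 * replaced while the swapfile is active.
		 */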
9933  		if (device == NULL) {
9934  			device = map->stripes[0].dev;
9935  			ret = btrfs_add_swapfile_pin(inode, device, false);
9936  			if (ret == 1)
9937  				ret = 0;
9938  			else if (ret)
9939  				goto out;
9940  		} else if (device != map->stripes[0].dev) {
9941  			btrfs_warn(fs_info, "swapfile must be on one device");
9942  			ret = -EINVAL;
9943  			goto out;
9944  		}
9945  
9946  		physical_block_start = (map->stripes[0].physical +
9947  					(logical_block_start - map->start));
9948  		len = min(len, map->chunk_len - (logical_block_start - map->start));
9949  		btrfs_free_chunk_map(map);
9950  		map = NULL;
9951  
9952  		bg = btrfs_lookup_block_group(fs_info, logical_block_start);
9953  		if (!bg) {
9954  			btrfs_warn(fs_info,
9955  			   "could not find block group containing swapfile");
9956  			ret = -EINVAL;
9957  			goto out;
9958  		}
9959  
9960  		if (!btrfs_inc_block_group_swap_extents(bg)) {
9961  			btrfs_warn(fs_info,
9962  			   "block group for swapfile at %llu is read-only%s",
9963  			   bg->start,
9964  			   atomic_read(&fs_info->scrubs_running) ?
9965  				       " (scrub running)" : "");
9966  			btrfs_put_block_group(bg);
9967  			ret = -EINVAL;
9968  			goto out;
9969  		}
9970  
9971  		ret = btrfs_add_swapfile_pin(inode, bg, true);
9972  		if (ret) {
9973  			btrfs_put_block_group(bg);
9974  			if (ret == 1)
9975  				ret = 0;
9976  			else
9977  				goto out;
9978  		}
9979  
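		/*
		 * Merge physically contiguous ranges into one swap extent;
		 * whenever contiguity breaks, flush the accumulated run with
		 * btrfs_add_swap_extent() and start a new one.
		 */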
9980  		if (bsi.block_len &&
9981  		    bsi.block_start + bsi.block_len == physical_block_start) {
9982  			bsi.block_len += len;
9983  		} else {
9984  			if (bsi.block_len) {
9985  				ret = btrfs_add_swap_extent(sis, &bsi);
9986  				if (ret)
9987  					goto out;
9988  			}
9989  			bsi.start = start;
9990  			bsi.block_start = physical_block_start;
9991  			bsi.block_len = len;
9992  		}
9993  
9994  		start += len;
9995  	}
9996  
9997  	if (bsi.block_len)
9998  		ret = btrfs_add_swap_extent(sis, &bsi);
9999  
10000  out:
10001  	if (!IS_ERR_OR_NULL(em))
10002  		free_extent_map(em);
10003  	if (!IS_ERR_OR_NULL(map))
10004  		btrfs_free_chunk_map(map);
10005  
10006  	unlock_extent(io_tree, 0, isize - 1, &cached_state);
10007  
10008  	if (ret)
10009  		btrfs_swap_deactivate(file);
10010  
10011  	btrfs_drew_write_unlock(&root->snapshot_lock);
10012  
10013  	btrfs_exclop_finish(fs_info);
10014  
10015  	if (ret)
10016  		return ret;
10017  
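	/*
	 * Tell the swap code how many page slots are usable: one slot is taken
	 * by the swap header, hence the "- 1" for pages and highest_bit. The
	 * span is the distance between the lowest and highest physical pages
	 * used by the file.
	 */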
10018  	if (device)
10019  		sis->bdev = device->bdev;
10020  	*span = bsi.highest_ppage - bsi.lowest_ppage + 1;
10021  	sis->max = bsi.nr_pages;
10022  	sis->pages = bsi.nr_pages - 1;
10023  	sis->highest_bit = bsi.nr_pages - 1;
10024  	return bsi.nr_extents;
10025  }
10026  #else
10027  static void btrfs_swap_deactivate(struct file *file)
10028  {
10029  }
10030  
10031  static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
10032  			       sector_t *span)
10033  {
10034  	return -EOPNOTSUPP;
10035  }
10036  #endif
10037  
10038  /*
10039   * Update the number of bytes used in the VFS' inode. When we replace extents in
10040   * a range (clone, dedupe, fallocate's zero range), we must update the number of
10041   * bytes used by the inode in an atomic manner, so that concurrent stat(2) calls
10042   * always get a correct value.
10043   */
10044  void btrfs_update_inode_bytes(struct btrfs_inode *inode,
10045  			      const u64 add_bytes,
10046  			      const u64 del_bytes)
10047  {
10048  	if (add_bytes == del_bytes)
10049  		return;
10050  
10051  	spin_lock(&inode->lock);
10052  	if (del_bytes > 0)
10053  		inode_sub_bytes(&inode->vfs_inode, del_bytes);
10054  	if (add_bytes > 0)
10055  		inode_add_bytes(&inode->vfs_inode, add_bytes);
10056  	spin_unlock(&inode->lock);
10057  }
10058  
10059  /*
10060   * Verify that there are no ordered extents for a given file range.
10061   *
10062   * @inode:   The target inode.
10063   * @start:   Start offset of the file range, should be sector size aligned.
10064   * @end:     End offset (inclusive) of the file range, its value +1 should be
10065   *           sector size aligned.
10066   *
10067   * This should typically be used for cases where we locked an inode's VFS lock in
10068   * exclusive mode, we have also locked the inode's i_mmap_lock in exclusive mode,
10069   * we have flushed all delalloc in the range, we have waited for all ordered
10070   * extents in the range to complete and finally we have locked the file range in
10071   * the inode's io_tree.
10072   */
10073  void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end)
10074  {
10075  	struct btrfs_root *root = inode->root;
10076  	struct btrfs_ordered_extent *ordered;
10077  
10078  	if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
10079  		return;
10080  
10081  	ordered = btrfs_lookup_first_ordered_range(inode, start, end + 1 - start);
10082  	if (ordered) {
10083  		btrfs_err(root->fs_info,
10084  "found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])",
10085  			  start, end, btrfs_ino(inode), btrfs_root_id(root),
10086  			  ordered->file_offset,
10087  			  ordered->file_offset + ordered->num_bytes - 1);
10088  		btrfs_put_ordered_extent(ordered);
10089  	}
10090  
10091  	ASSERT(ordered == NULL);
10092  }
10093  
10094  /*
10095   * Find the first inode with a minimum number.
10096   *
10097   * @root:	The root to search for.
10098   * @min_ino:	The minimum inode number.
10099   *
10100   * Find the first inode in @root with a number >= @min_ino and return it.
10101   * Returns NULL if no such inode is found.
10102   */
10103  struct btrfs_inode *btrfs_find_first_inode(struct btrfs_root *root, u64 min_ino)
10104  {
10105  	struct btrfs_inode *inode;
10106  	unsigned long from = min_ino;
10107  
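	/*
	 * Walk the root's inode xarray starting at min_ino. igrab() fails for
	 * inodes that are being evicted; skip those and keep searching from
	 * the next inode number.
	 */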
10108  	xa_lock(&root->inodes);
10109  	while (true) {
10110  		inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT);
10111  		if (!inode)
10112  			break;
10113  		if (igrab(&inode->vfs_inode))
10114  			break;
10115  
10116  		from = btrfs_ino(inode) + 1;
10117  		cond_resched_lock(&root->inodes.xa_lock);
10118  	}
10119  	xa_unlock(&root->inodes);
10120  
10121  	return inode;
10122  }
10123  
10124  static const struct inode_operations btrfs_dir_inode_operations = {
10125  	.getattr	= btrfs_getattr,
10126  	.lookup		= btrfs_lookup,
10127  	.create		= btrfs_create,
10128  	.unlink		= btrfs_unlink,
10129  	.link		= btrfs_link,
10130  	.mkdir		= btrfs_mkdir,
10131  	.rmdir		= btrfs_rmdir,
10132  	.rename		= btrfs_rename2,
10133  	.symlink	= btrfs_symlink,
10134  	.setattr	= btrfs_setattr,
10135  	.mknod		= btrfs_mknod,
10136  	.listxattr	= btrfs_listxattr,
10137  	.permission	= btrfs_permission,
10138  	.get_inode_acl	= btrfs_get_acl,
10139  	.set_acl	= btrfs_set_acl,
10140  	.update_time	= btrfs_update_time,
10141  	.tmpfile        = btrfs_tmpfile,
10142  	.fileattr_get	= btrfs_fileattr_get,
10143  	.fileattr_set	= btrfs_fileattr_set,
10144  };
10145  
10146  static const struct file_operations btrfs_dir_file_operations = {
10147  	.llseek		= btrfs_dir_llseek,
10148  	.read		= generic_read_dir,
10149  	.iterate_shared	= btrfs_real_readdir,
10150  	.open		= btrfs_opendir,
10151  	.unlocked_ioctl	= btrfs_ioctl,
10152  #ifdef CONFIG_COMPAT
10153  	.compat_ioctl	= btrfs_compat_ioctl,
10154  #endif
10155  	.release        = btrfs_release_file,
10156  	.fsync		= btrfs_sync_file,
10157  };
10158  
10159  /*
10160   * btrfs doesn't support the bmap operation because swapfiles
10161   * use bmap to make a mapping of extents in the file.  They assume
10162   * these extents won't change over the life of the file and they
10163   * use the bmap result to do IO directly to the drive.
10164   *
10165   * The btrfs bmap call would return logical addresses that aren't
10166   * suitable for IO and they also will change frequently as COW
10167   * operations happen.  So, swapfile + btrfs == corruption.
10168   *
10169   * For now we're avoiding this by dropping bmap.
10170   */
10171  static const struct address_space_operations btrfs_aops = {
10172  	.read_folio	= btrfs_read_folio,
10173  	.writepages	= btrfs_writepages,
10174  	.readahead	= btrfs_readahead,
10175  	.invalidate_folio = btrfs_invalidate_folio,
10176  	.launder_folio	= btrfs_launder_folio,
10177  	.release_folio	= btrfs_release_folio,
10178  	.migrate_folio	= btrfs_migrate_folio,
10179  	.dirty_folio	= filemap_dirty_folio,
10180  	.error_remove_folio = generic_error_remove_folio,
10181  	.swap_activate	= btrfs_swap_activate,
10182  	.swap_deactivate = btrfs_swap_deactivate,
10183  };
10184  
10185  static const struct inode_operations btrfs_file_inode_operations = {
10186  	.getattr	= btrfs_getattr,
10187  	.setattr	= btrfs_setattr,
10188  	.listxattr      = btrfs_listxattr,
10189  	.permission	= btrfs_permission,
10190  	.fiemap		= btrfs_fiemap,
10191  	.get_inode_acl	= btrfs_get_acl,
10192  	.set_acl	= btrfs_set_acl,
10193  	.update_time	= btrfs_update_time,
10194  	.fileattr_get	= btrfs_fileattr_get,
10195  	.fileattr_set	= btrfs_fileattr_set,
10196  };
10197  static const struct inode_operations btrfs_special_inode_operations = {
10198  	.getattr	= btrfs_getattr,
10199  	.setattr	= btrfs_setattr,
10200  	.permission	= btrfs_permission,
10201  	.listxattr	= btrfs_listxattr,
10202  	.get_inode_acl	= btrfs_get_acl,
10203  	.set_acl	= btrfs_set_acl,
10204  	.update_time	= btrfs_update_time,
10205  };
10206  static const struct inode_operations btrfs_symlink_inode_operations = {
10207  	.get_link	= page_get_link,
10208  	.getattr	= btrfs_getattr,
10209  	.setattr	= btrfs_setattr,
10210  	.permission	= btrfs_permission,
10211  	.listxattr	= btrfs_listxattr,
10212  	.update_time	= btrfs_update_time,
10213  };
10214  
10215  const struct dentry_operations btrfs_dentry_operations = {
10216  	.d_delete	= btrfs_dentry_delete,
10217  };
10218