1  // SPDX-License-Identifier: GPL-2.0
2  #ifndef NO_BCACHEFS_FS
3  
4  #include "bcachefs.h"
5  #include "acl.h"
6  #include "bkey_buf.h"
7  #include "btree_update.h"
8  #include "buckets.h"
9  #include "chardev.h"
10  #include "dirent.h"
11  #include "errcode.h"
12  #include "extents.h"
13  #include "fs.h"
14  #include "fs-common.h"
15  #include "fs-io.h"
16  #include "fs-ioctl.h"
17  #include "fs-io-buffered.h"
18  #include "fs-io-direct.h"
19  #include "fs-io-pagecache.h"
20  #include "fsck.h"
21  #include "inode.h"
22  #include "io_read.h"
23  #include "journal.h"
24  #include "keylist.h"
25  #include "quota.h"
26  #include "snapshot.h"
27  #include "super.h"
28  #include "xattr.h"
29  #include "trace.h"
30  
31  #include <linux/aio.h>
32  #include <linux/backing-dev.h>
33  #include <linux/exportfs.h>
34  #include <linux/fiemap.h>
35  #include <linux/fs_context.h>
36  #include <linux/module.h>
37  #include <linux/pagemap.h>
38  #include <linux/posix_acl.h>
39  #include <linux/random.h>
40  #include <linux/seq_file.h>
41  #include <linux/statfs.h>
42  #include <linux/string.h>
43  #include <linux/xattr.h>
44  
45  static struct kmem_cache *bch2_inode_cache;
46  
47  static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum,
48  				struct bch_inode_info *,
49  				struct bch_inode_unpacked *,
50  				struct bch_subvolume *);
51  
bch2_inode_update_after_write(struct btree_trans * trans,struct bch_inode_info * inode,struct bch_inode_unpacked * bi,unsigned fields)52  void bch2_inode_update_after_write(struct btree_trans *trans,
53  				   struct bch_inode_info *inode,
54  				   struct bch_inode_unpacked *bi,
55  				   unsigned fields)
56  {
57  	struct bch_fs *c = trans->c;
58  
59  	BUG_ON(bi->bi_inum != inode->v.i_ino);
60  
61  	bch2_assert_pos_locked(trans, BTREE_ID_inodes, POS(0, bi->bi_inum));
62  
63  	set_nlink(&inode->v, bch2_inode_nlink_get(bi));
64  	i_uid_write(&inode->v, bi->bi_uid);
65  	i_gid_write(&inode->v, bi->bi_gid);
66  	inode->v.i_mode	= bi->bi_mode;
67  
68  	if (fields & ATTR_ATIME)
69  		inode_set_atime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_atime));
70  	if (fields & ATTR_MTIME)
71  		inode_set_mtime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_mtime));
72  	if (fields & ATTR_CTIME)
73  		inode_set_ctime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_ctime));
74  
75  	inode->ei_inode		= *bi;
76  
77  	bch2_inode_flags_to_vfs(inode);
78  }
79  
bch2_write_inode(struct bch_fs * c,struct bch_inode_info * inode,inode_set_fn set,void * p,unsigned fields)80  int __must_check bch2_write_inode(struct bch_fs *c,
81  				  struct bch_inode_info *inode,
82  				  inode_set_fn set,
83  				  void *p, unsigned fields)
84  {
85  	struct btree_trans *trans = bch2_trans_get(c);
86  	struct btree_iter iter = { NULL };
87  	struct bch_inode_unpacked inode_u;
88  	int ret;
89  retry:
90  	bch2_trans_begin(trans);
91  
92  	ret   = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode),
93  				BTREE_ITER_intent) ?:
94  		(set ? set(trans, inode, &inode_u, p) : 0) ?:
95  		bch2_inode_write(trans, &iter, &inode_u) ?:
96  		bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
97  
98  	/*
99  	 * the btree node lock protects inode->ei_inode, not ei_update_lock;
100  	 * this is important for inode updates via bchfs_write_index_update
101  	 */
102  	if (!ret)
103  		bch2_inode_update_after_write(trans, inode, &inode_u, fields);
104  
105  	bch2_trans_iter_exit(trans, &iter);
106  
107  	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
108  		goto retry;
109  
110  	bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c,
111  			     "%s: inode %llu:%llu not found when updating",
112  			     bch2_err_str(ret),
113  			     inode_inum(inode).subvol,
114  			     inode_inum(inode).inum);
115  
116  	bch2_trans_put(trans);
117  	return ret < 0 ? ret : 0;
118  }
119  
bch2_fs_quota_transfer(struct bch_fs * c,struct bch_inode_info * inode,struct bch_qid new_qid,unsigned qtypes,enum quota_acct_mode mode)120  int bch2_fs_quota_transfer(struct bch_fs *c,
121  			   struct bch_inode_info *inode,
122  			   struct bch_qid new_qid,
123  			   unsigned qtypes,
124  			   enum quota_acct_mode mode)
125  {
126  	unsigned i;
127  	int ret;
128  
129  	qtypes &= enabled_qtypes(c);
130  
131  	for (i = 0; i < QTYP_NR; i++)
132  		if (new_qid.q[i] == inode->ei_qid.q[i])
133  			qtypes &= ~(1U << i);
134  
135  	if (!qtypes)
136  		return 0;
137  
138  	mutex_lock(&inode->ei_quota_lock);
139  
140  	ret = bch2_quota_transfer(c, qtypes, new_qid,
141  				  inode->ei_qid,
142  				  inode->v.i_blocks +
143  				  inode->ei_quota_reserved,
144  				  mode);
145  	if (!ret)
146  		for (i = 0; i < QTYP_NR; i++)
147  			if (qtypes & (1 << i))
148  				inode->ei_qid.q[i] = new_qid.q[i];
149  
150  	mutex_unlock(&inode->ei_quota_lock);
151  
152  	return ret;
153  }
154  
subvol_inum_eq(subvol_inum a,subvol_inum b)155  static bool subvol_inum_eq(subvol_inum a, subvol_inum b)
156  {
157  	return a.subvol == b.subvol && a.inum == b.inum;
158  }
159  
/*
 * rhashtable key hash: @data points to a subvol_inum.
 *
 * Only the inode number is hashed - the subvolume is deliberately left out,
 * and @len is ignored - so every inode with the same inode number hashes to
 * the same value regardless of subvolume.
 */
static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed)
{
	const subvol_inum *key = data;

	return jhash(&key->inum, sizeof(key->inum), seed);
}
166  
/*
 * rhashtable object hash: @data points to a struct bch_inode_info; hash its
 * ei_inum key with the same function used for lookups.
 */
static u32 bch2_vfs_inode_obj_hash_fn(const void *data, u32 len, u32 seed)
{
	const struct bch_inode_info *inode = data;
	const subvol_inum *key = &inode->ei_inum;

	return bch2_vfs_inode_hash_fn(key, sizeof(*key), seed);
}
173  
bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg * arg,const void * obj)174  static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg,
175  				 const void *obj)
176  {
177  	const struct bch_inode_info *inode = obj;
178  	const subvol_inum *v = arg->key;
179  
180  	return !subvol_inum_eq(inode->ei_inum, *v);
181  }
182  
183  static const struct rhashtable_params bch2_vfs_inodes_params = {
184  	.head_offset		= offsetof(struct bch_inode_info, hash),
185  	.key_offset		= offsetof(struct bch_inode_info, ei_inum),
186  	.key_len		= sizeof(subvol_inum),
187  	.hashfn			= bch2_vfs_inode_hash_fn,
188  	.obj_hashfn		= bch2_vfs_inode_obj_hash_fn,
189  	.obj_cmpfn		= bch2_vfs_inode_cmp_fn,
190  	.automatic_shrinking	= true,
191  };
192  
bch2_inode_or_descendents_is_open(struct btree_trans * trans,struct bpos p)193  int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p)
194  {
195  	struct bch_fs *c = trans->c;
196  	struct rhashtable *ht = &c->vfs_inodes_table;
197  	subvol_inum inum = (subvol_inum) { .inum = p.offset };
198  	DARRAY(u32) subvols;
199  	int ret = 0;
200  
201  	if (!test_bit(BCH_FS_started, &c->flags))
202  		return false;
203  
204  	darray_init(&subvols);
205  restart_from_top:
206  
207  	/*
208  	 * Tweaked version of __rhashtable_lookup(); we need to get a list of
209  	 * subvolumes in which the given inode number is open.
210  	 *
211  	 * For this to work, we don't include the subvolume ID in the key that
212  	 * we hash - all inodes with the same inode number regardless of
213  	 * subvolume will hash to the same slot.
214  	 *
215  	 * This will be less than ideal if the same file is ever open
216  	 * simultaneously in many different snapshots:
217  	 */
218  	rcu_read_lock();
219  	struct rhash_lock_head __rcu *const *bkt;
220  	struct rhash_head *he;
221  	unsigned int hash;
222  	struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);
223  restart:
224  	hash = rht_key_hashfn(ht, tbl, &inum, bch2_vfs_inodes_params);
225  	bkt = rht_bucket(tbl, hash);
226  	do {
227  		struct bch_inode_info *inode;
228  
229  		rht_for_each_entry_rcu_from(inode, he, rht_ptr_rcu(bkt), tbl, hash, hash) {
230  			if (inode->ei_inum.inum == inum.inum) {
231  				ret = darray_push_gfp(&subvols, inode->ei_inum.subvol,
232  						      GFP_NOWAIT|__GFP_NOWARN);
233  				if (ret) {
234  					rcu_read_unlock();
235  					ret = darray_make_room(&subvols, 1);
236  					if (ret)
237  						goto err;
238  					subvols.nr = 0;
239  					goto restart_from_top;
240  				}
241  			}
242  		}
243  		/* An object might have been moved to a different hash chain,
244  		 * while we walk along it - better check and retry.
245  		 */
246  	} while (he != RHT_NULLS_MARKER(bkt));
247  
248  	/* Ensure we see any new tables. */
249  	smp_rmb();
250  
251  	tbl = rht_dereference_rcu(tbl->future_tbl, ht);
252  	if (unlikely(tbl))
253  		goto restart;
254  	rcu_read_unlock();
255  
256  	darray_for_each(subvols, i) {
257  		u32 snap;
258  		ret = bch2_subvolume_get_snapshot(trans, *i, &snap);
259  		if (ret)
260  			goto err;
261  
262  		ret = bch2_snapshot_is_ancestor(c, snap, p.snapshot);
263  		if (ret)
264  			break;
265  	}
266  err:
267  	darray_exit(&subvols);
268  	return ret;
269  }
270  
/*
 * Raw lookup of @inum in the VFS inode hash table. Takes no reference and
 * does not check the inode's state; returns NULL when nothing is found.
 */
static struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
{
	struct rhashtable *table = &c->vfs_inodes_table;

	return rhashtable_lookup_fast(table, &inum, bch2_vfs_inodes_params);
}
275  
/*
 * Wait for an inode that is being freed to leave the hash table.
 *
 * Must be called with inode->v.i_lock held; the lock is dropped here and is
 * NOT retaken. We queue ourselves on the inode's __I_NEW bit waitqueue (woken
 * by bch2_inode_hash_remove()) before dropping the lock, and only sleep - for
 * at most 10 seconds - if @inode is still what a lookup of @inum returns.
 */
static void __wait_on_freeing_inode(struct bch_fs *c,
				    struct bch_inode_info *inode,
				    subvol_inum inum)
{
	struct wait_bit_queue_entry wait;
	wait_queue_head_t *wq = inode_bit_waitqueue(&wait, &inode->v, __I_NEW);

	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&inode->v.i_lock);

	/* already removed from the table? then there is nothing to wait for */
	if (__bch2_inode_hash_find(c, inum) == inode)
		schedule_timeout(HZ * 10);

	finish_wait(wq, &wait.wq_entry);
}
291  
/*
 * Look up @inum in the VFS inode hash table and take a reference on it.
 *
 * @trans may be NULL. If it is not, btree locks are dropped around any wait
 * for an inode that is being freed, and retaken afterwards.
 *
 * Returns:
 *  - the inode, with a reference taken via __iget(), on success
 *  - NULL if there is no such inode, or it has already been unhashed
 *  - ERR_PTR() if relocking @trans failed after waiting
 */
static struct bch_inode_info *bch2_inode_hash_find(struct bch_fs *c, struct btree_trans *trans,
						   subvol_inum inum)
{
	struct bch_inode_info *inode;

	for (;;) {
		inode = __bch2_inode_hash_find(c, inum);
		if (!inode)
			return NULL;

		spin_lock(&inode->v.i_lock);
		if (!test_bit(EI_INODE_HASHED, &inode->ei_flags)) {
			spin_unlock(&inode->v.i_lock);
			return NULL;
		}

		if (!(inode->v.i_state & (I_FREEING|I_WILL_FREE)))
			break;

		/*
		 * The inode is on its way out: wait for it to be removed and
		 * look again. __wait_on_freeing_inode() drops i_lock.
		 */
		if (trans) {
			bch2_trans_unlock(trans);
			__wait_on_freeing_inode(c, inode, inum);
			int ret = bch2_trans_relock(trans);
			if (ret)
				return ERR_PTR(ret);
		} else {
			__wait_on_freeing_inode(c, inode, inum);
		}
	}

	__iget(&inode->v);
	spin_unlock(&inode->v.i_lock);

	return inode;
}
322  
bch2_inode_hash_remove(struct bch_fs * c,struct bch_inode_info * inode)323  static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inode)
324  {
325  	spin_lock(&inode->v.i_lock);
326  	bool remove = test_and_clear_bit(EI_INODE_HASHED, &inode->ei_flags);
327  	spin_unlock(&inode->v.i_lock);
328  
329  	if (remove) {
330  		int ret = rhashtable_remove_fast(&c->vfs_inodes_table,
331  					&inode->hash, bch2_vfs_inodes_params);
332  		BUG_ON(ret);
333  		inode->v.i_hash.pprev = NULL;
334  		/*
335  		 * This pairs with the bch2_inode_hash_find() ->
336  		 * __wait_on_freeing_inode() path
337  		 */
338  		inode_wake_up_bit(&inode->v, __I_NEW);
339  	}
340  }
341  
bch2_inode_hash_insert(struct bch_fs * c,struct btree_trans * trans,struct bch_inode_info * inode)342  static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c,
343  						     struct btree_trans *trans,
344  						     struct bch_inode_info *inode)
345  {
346  	struct bch_inode_info *old = inode;
347  
348  	set_bit(EI_INODE_HASHED, &inode->ei_flags);
349  retry:
350  	if (unlikely(rhashtable_lookup_insert_key(&c->vfs_inodes_table,
351  					&inode->ei_inum,
352  					&inode->hash,
353  					bch2_vfs_inodes_params))) {
354  		old = bch2_inode_hash_find(c, trans, inode->ei_inum);
355  		if (!old)
356  			goto retry;
357  
358  		clear_bit(EI_INODE_HASHED, &inode->ei_flags);
359  
360  		/*
361  		 * bcachefs doesn't use I_NEW; we have no use for it since we
362  		 * only insert fully created inodes in the inode hash table. But
363  		 * discard_new_inode() expects it to be set...
364  		 */
365  		inode->v.i_state |= I_NEW;
366  		/*
367  		 * We don't want bch2_evict_inode() to delete the inode on disk,
368  		 * we just raced and had another inode in cache. Normally new
369  		 * inodes don't have nlink == 0 - except tmpfiles do...
370  		 */
371  		set_nlink(&inode->v, 1);
372  		discard_new_inode(&inode->v);
373  		return old;
374  	} else {
375  		inode_fake_hash(&inode->v);
376  
377  		inode_sb_list_add(&inode->v);
378  
379  		mutex_lock(&c->vfs_inodes_lock);
380  		list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
381  		mutex_unlock(&c->vfs_inodes_lock);
382  		return inode;
383  	}
384  }
385  
/*
 * Evaluate @_do with the memalloc flags @_flags applied, restore the previous
 * flags, and yield @_do's value.
 *
 * NOTE(review): the flags saved by memalloc_flags_save() are restored with
 * memalloc_noreclaim_restore() - presumably equivalent to the generic restore
 * helper; confirm against linux/sched/mm.h.
 */
#define memalloc_flags_do(_flags, _do)						\
({										\
	unsigned _saved_flags = memalloc_flags_save(_flags);			\
	typeof(_do) _ret = (_do);						\
	memalloc_noreclaim_restore(_saved_flags);				\
	_ret;									\
})
393  
/*
 * Must never be called: hitting this is a bug. In this file, inodes are
 * allocated by __bch2_new_inode() instead.
 */
static struct inode *bch2_alloc_inode(struct super_block *sb)
{
	BUG();
}
398  
/*
 * Allocate a struct bch_inode_info from bch2_inode_cache using @gfp and
 * initialize both the bcachefs-private fields and the embedded VFS inode.
 *
 * Returns the new inode, or NULL if the allocation or inode_init_always_gfp()
 * failed.
 */
static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c, gfp_t gfp)
{
	struct bch_inode_info *inode =
		alloc_inode_sb(c->vfs_sb, bch2_inode_cache, gfp);
	if (!inode)
		return NULL;

	inode_init_once(&inode->v);

	/* bcachefs-private state */
	mutex_init(&inode->ei_update_lock);
	two_state_lock_init(&inode->ei_pagecache_lock);
	INIT_LIST_HEAD(&inode->ei_vfs_inode_list);
	inode->ei_flags = 0;
	mutex_init(&inode->ei_quota_lock);
	memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));

	if (likely(!inode_init_always_gfp(c->vfs_sb, &inode->v, gfp)))
		return inode;

	kmem_cache_free(bch2_inode_cache, inode);
	return NULL;
}
421  
422  /*
423   * Allocate a new inode, dropping/retaking btree locks if necessary:
424   */
bch2_new_inode(struct btree_trans * trans)425  static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
426  {
427  	struct bch_inode_info *inode = __bch2_new_inode(trans->c, GFP_NOWAIT);
428  
429  	if (unlikely(!inode)) {
430  		int ret = drop_locks_do(trans, (inode = __bch2_new_inode(trans->c, GFP_NOFS)) ? 0 : -ENOMEM);
431  		if (ret && inode) {
432  			__destroy_inode(&inode->v);
433  			kmem_cache_free(bch2_inode_cache, inode);
434  		}
435  		if (ret)
436  			return ERR_PTR(ret);
437  	}
438  
439  	return inode;
440  }
441  
/*
 * Allocate a VFS inode, initialize it from @bi / @subvol, and insert it into
 * the inode hash table.
 *
 * Returns whatever bch2_inode_hash_insert() returns (the new inode, or an
 * already-hashed one that won the race, or an ERR_PTR()), or the ERR_PTR()
 * from bch2_new_inode() if allocation failed.
 */
static struct bch_inode_info *bch2_inode_hash_init_insert(struct btree_trans *trans,
							  subvol_inum inum,
							  struct bch_inode_unpacked *bi,
							  struct bch_subvolume *subvol)
{
	struct bch_inode_info *new = bch2_new_inode(trans);

	if (!IS_ERR(new)) {
		bch2_vfs_inode_init(trans, inum, new, bi, subvol);
		new = bch2_inode_hash_insert(trans->c, trans, new);
	}

	return new;
}
456  
/*
 * Get the VFS inode for @inum, reading it from the btree if it is not already
 * in the inode hash table.
 *
 * Returns the inode (with a reference held) or an ERR_PTR().
 */
struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
{
	/* fast path: already cached */
	struct bch_inode_info *inode = bch2_inode_hash_find(c, NULL, inum);
	if (inode)
		return &inode->v;

	struct btree_trans *trans = bch2_trans_get(c);
	struct bch_inode_unpacked inode_u;
	struct bch_subvolume subvol;

	/* only the btree lookups are retried on transaction restart */
	int ret = lockrestart_do(trans,
			bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
			bch2_inode_find_by_inum_trans(trans, inum, &inode_u));
	if (!ret) {
		inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol);
		ret = PTR_ERR_OR_ZERO(inode);
	}

	bch2_trans_put(trans);

	return ret ? ERR_PTR(ret) : &inode->v;
}
475  
476  struct bch_inode_info *
__bch2_create(struct mnt_idmap * idmap,struct bch_inode_info * dir,struct dentry * dentry,umode_t mode,dev_t rdev,subvol_inum snapshot_src,unsigned flags)477  __bch2_create(struct mnt_idmap *idmap,
478  	      struct bch_inode_info *dir, struct dentry *dentry,
479  	      umode_t mode, dev_t rdev, subvol_inum snapshot_src,
480  	      unsigned flags)
481  {
482  	struct bch_fs *c = dir->v.i_sb->s_fs_info;
483  	struct btree_trans *trans;
484  	struct bch_inode_unpacked dir_u;
485  	struct bch_inode_info *inode;
486  	struct bch_inode_unpacked inode_u;
487  	struct posix_acl *default_acl = NULL, *acl = NULL;
488  	subvol_inum inum;
489  	struct bch_subvolume subvol;
490  	u64 journal_seq = 0;
491  	kuid_t kuid;
492  	kgid_t kgid;
493  	int ret;
494  
495  	/*
496  	 * preallocate acls + vfs inode before btree transaction, so that
497  	 * nothing can fail after the transaction succeeds:
498  	 */
499  #ifdef CONFIG_BCACHEFS_POSIX_ACL
500  	ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl);
501  	if (ret)
502  		return ERR_PTR(ret);
503  #endif
504  	inode = __bch2_new_inode(c, GFP_NOFS);
505  	if (unlikely(!inode)) {
506  		inode = ERR_PTR(-ENOMEM);
507  		goto err;
508  	}
509  
510  	bch2_inode_init_early(c, &inode_u);
511  
512  	if (!(flags & BCH_CREATE_TMPFILE))
513  		mutex_lock(&dir->ei_update_lock);
514  
515  	trans = bch2_trans_get(c);
516  retry:
517  	bch2_trans_begin(trans);
518  
519  	kuid = mapped_fsuid(idmap, i_user_ns(&dir->v));
520  	kgid = mapped_fsgid(idmap, i_user_ns(&dir->v));
521  	ret   = bch2_subvol_is_ro_trans(trans, dir->ei_inum.subvol) ?:
522  		bch2_create_trans(trans,
523  				  inode_inum(dir), &dir_u, &inode_u,
524  				  !(flags & BCH_CREATE_TMPFILE)
525  				  ? &dentry->d_name : NULL,
526  				  from_kuid(i_user_ns(&dir->v), kuid),
527  				  from_kgid(i_user_ns(&dir->v), kgid),
528  				  mode, rdev,
529  				  default_acl, acl, snapshot_src, flags) ?:
530  		bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
531  				KEY_TYPE_QUOTA_PREALLOC);
532  	if (unlikely(ret))
533  		goto err_before_quota;
534  
535  	inum.subvol = inode_u.bi_subvol ?: dir->ei_inum.subvol;
536  	inum.inum = inode_u.bi_inum;
537  
538  	ret   = bch2_subvolume_get(trans, inum.subvol, true,
539  				   BTREE_ITER_with_updates, &subvol) ?:
540  		bch2_trans_commit(trans, NULL, &journal_seq, 0);
541  	if (unlikely(ret)) {
542  		bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
543  				KEY_TYPE_QUOTA_WARN);
544  err_before_quota:
545  		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
546  			goto retry;
547  		goto err_trans;
548  	}
549  
550  	if (!(flags & BCH_CREATE_TMPFILE)) {
551  		bch2_inode_update_after_write(trans, dir, &dir_u,
552  					      ATTR_MTIME|ATTR_CTIME);
553  		mutex_unlock(&dir->ei_update_lock);
554  	}
555  
556  	bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
557  
558  	set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
559  	set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);
560  
561  	/*
562  	 * we must insert the new inode into the inode cache before calling
563  	 * bch2_trans_exit() and dropping locks, else we could race with another
564  	 * thread pulling the inode in and modifying it:
565  	 *
566  	 * also, calling bch2_inode_hash_insert() without passing in the
567  	 * transaction object is sketchy - if we could ever end up in
568  	 * __wait_on_freeing_inode(), we'd risk deadlock.
569  	 *
570  	 * But that shouldn't be possible, since we still have the inode locked
571  	 * that we just created, and we _really_ can't take a transaction
572  	 * restart here.
573  	 */
574  	inode = bch2_inode_hash_insert(c, NULL, inode);
575  	bch2_trans_put(trans);
576  err:
577  	posix_acl_release(default_acl);
578  	posix_acl_release(acl);
579  	return inode;
580  err_trans:
581  	if (!(flags & BCH_CREATE_TMPFILE))
582  		mutex_unlock(&dir->ei_update_lock);
583  
584  	bch2_trans_put(trans);
585  	make_bad_inode(&inode->v);
586  	iput(&inode->v);
587  	inode = ERR_PTR(ret);
588  	goto err;
589  }
590  
591  /* methods */
592  
/*
 * Look up @name in directory @dir and return the VFS inode it refers to.
 *
 * The dirent is resolved to a (subvolume, inode number) target, which is then
 * taken from the inode hash table or, failing that, read from the btree and
 * inserted. A dirent pointing at a missing inode, or at an inode whose
 * back-pointer doesn't match the dirent, is flagged as a filesystem
 * inconsistency.
 *
 * Returns the inode or an ERR_PTR(); never NULL.
 *
 * NOTE(review): on the "does not point back" path the inode obtained just
 * before is replaced by ERR_PTR(-ENOENT) without an iput() - looks like a
 * leaked reference; confirm whether that is intended.
 */
static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
			subvol_inum dir, struct bch_hash_info *dir_hash_info,
			const struct qstr *name)
{
	struct bch_fs *c = trans->c;
	struct btree_iter dirent_iter = {};
	subvol_inum inum = {};
	struct printbuf buf = PRINTBUF;
	struct bch_inode_info *inode;
	struct bch_subvolume subvol;
	struct bch_inode_unpacked inode_u;

	struct bkey_s_c k = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc,
					     dir_hash_info, dir, name, 0);
	int ret = bkey_err(k);
	if (ret)
		return ERR_PTR(ret);

	/* a positive return from bch2_dirent_read_target() is treated as "not found" */
	ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum);
	if (ret > 0)
		ret = -ENOENT;
	if (ret)
		goto err;

	/* cached already (or an ERR_PTR from the lookup): hand it straight back */
	inode = bch2_inode_hash_find(c, trans, inum);
	if (inode)
		goto out;

	ret = bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol);
	if (!ret)
		ret = bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u);
	if (!ret) {
		inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol);
		ret = PTR_ERR_OR_ZERO(inode);
	}

	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
				c, "dirent to missing inode:\n  %s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
	if (ret)
		goto err;

	/* regular files may have hardlinks: */
	if (bch2_fs_inconsistent_on(bch2_inode_should_have_bp(&inode_u) &&
				    !bkey_eq(k.k->p, POS(inode_u.bi_dir, inode_u.bi_dir_offset)),
				    c,
				    "dirent points to inode that does not point back:\n  %s",
				    (bch2_bkey_val_to_text(&buf, c, k),
				     prt_printf(&buf, "\n  "),
				     bch2_inode_unpacked_to_text(&buf, &inode_u),
				     buf.buf))) {
		ret = -ENOENT;
		goto err;
	}

	goto out;
err:
	inode = ERR_PTR(ret);
out:
	bch2_trans_iter_exit(trans, &dirent_iter);
	printbuf_exit(&buf);
	return inode;
}
650  
bch2_lookup(struct inode * vdir,struct dentry * dentry,unsigned int flags)651  static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
652  				  unsigned int flags)
653  {
654  	struct bch_fs *c = vdir->i_sb->s_fs_info;
655  	struct bch_inode_info *dir = to_bch_ei(vdir);
656  	struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
657  
658  	struct bch_inode_info *inode;
659  	bch2_trans_do(c,
660  		PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir),
661  							  &hash, &dentry->d_name)));
662  	if (IS_ERR(inode))
663  		inode = NULL;
664  
665  	return d_splice_alias(&inode->v, dentry);
666  }
667  
/*
 * Create a new inode of type @mode (with device number @rdev) named by
 * @dentry in @vdir, and instantiate the dentry with it.
 *
 * Returns 0 or a negative errno, translated by bch2_err_class().
 */
static int bch2_mknod(struct mnt_idmap *idmap,
		      struct inode *vdir, struct dentry *dentry,
		      umode_t mode, dev_t rdev)
{
	struct bch_inode_info *dir = to_bch_ei(vdir);
	struct bch_inode_info *inode;

	inode = __bch2_create(idmap, dir, dentry, mode, rdev,
			      (subvol_inum) { 0 }, 0);
	if (IS_ERR(inode))
		return bch2_err_class(PTR_ERR(inode));

	d_instantiate(dentry, &inode->v);
	return 0;
}
682  
/* Create a regular file: bch2_mknod() with S_IFREG and no device. @excl is unused. */
static int bch2_create(struct mnt_idmap *idmap,
		       struct inode *vdir, struct dentry *dentry,
		       umode_t mode, bool excl)
{
	const umode_t file_mode = mode|S_IFREG;

	return bch2_mknod(idmap, vdir, dentry, file_mode, 0);
}
689  
__bch2_link(struct bch_fs * c,struct bch_inode_info * inode,struct bch_inode_info * dir,struct dentry * dentry)690  static int __bch2_link(struct bch_fs *c,
691  		       struct bch_inode_info *inode,
692  		       struct bch_inode_info *dir,
693  		       struct dentry *dentry)
694  {
695  	struct bch_inode_unpacked dir_u, inode_u;
696  	int ret;
697  
698  	mutex_lock(&inode->ei_update_lock);
699  	struct btree_trans *trans = bch2_trans_get(c);
700  
701  	ret = commit_do(trans, NULL, NULL, 0,
702  			bch2_link_trans(trans,
703  					inode_inum(dir),   &dir_u,
704  					inode_inum(inode), &inode_u,
705  					&dentry->d_name));
706  
707  	if (likely(!ret)) {
708  		bch2_inode_update_after_write(trans, dir, &dir_u,
709  					      ATTR_MTIME|ATTR_CTIME);
710  		bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME);
711  	}
712  
713  	bch2_trans_put(trans);
714  	mutex_unlock(&inode->ei_update_lock);
715  	return ret;
716  }
717  
bch2_link(struct dentry * old_dentry,struct inode * vdir,struct dentry * dentry)718  static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
719  		     struct dentry *dentry)
720  {
721  	struct bch_fs *c = vdir->i_sb->s_fs_info;
722  	struct bch_inode_info *dir = to_bch_ei(vdir);
723  	struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode);
724  	int ret;
725  
726  	lockdep_assert_held(&inode->v.i_rwsem);
727  
728  	ret   = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?:
729  		bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
730  		__bch2_link(c, inode, dir, dentry);
731  	if (unlikely(ret))
732  		return bch2_err_class(ret);
733  
734  	ihold(&inode->v);
735  	d_instantiate(dentry, &inode->v);
736  	return 0;
737  }
738  
/*
 * Remove the directory entry @dentry from @vdir in a single committed
 * transaction, with both inodes' update locks held.
 *
 * @deleting_snapshot is passed through to bch2_unlink_trans().
 *
 * On success the VFS copies of the directory and the unlinked inode are
 * refreshed. If the unlinked inode is a subvolume root, its nlink is forced
 * to 0. Returns 0 or the (untranslated) bcachefs error.
 */
int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
		  bool deleting_snapshot)
{
	struct bch_fs *c = vdir->i_sb->s_fs_info;
	struct bch_inode_info *dir = to_bch_ei(vdir);
	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
	struct bch_inode_unpacked dir_u, inode_u;

	bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);

	struct btree_trans *trans = bch2_trans_get(c);

	int ret = commit_do(trans, NULL, NULL,
			    BCH_TRANS_COMMIT_no_enospc,
			bch2_unlink_trans(trans,
					  inode_inum(dir), &dir_u,
					  &inode_u, &dentry->d_name,
					  deleting_snapshot));
	if (likely(!ret)) {
		bch2_inode_update_after_write(trans, dir, &dir_u,
					      ATTR_MTIME|ATTR_CTIME);
		bch2_inode_update_after_write(trans, inode, &inode_u,
					      ATTR_MTIME);

		/*
		 * Subvolume deletion is asynchronous, but we still want to tell
		 * the VFS that it's been deleted here:
		 */
		if (inode_u.bi_subvol)
			set_nlink(&inode->v, 0);
	}

	bch2_trans_put(trans);
	bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);

	return ret;
}
779  
bch2_unlink(struct inode * vdir,struct dentry * dentry)780  static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
781  {
782  	struct bch_inode_info *dir= to_bch_ei(vdir);
783  	struct bch_fs *c = dir->v.i_sb->s_fs_info;
784  
785  	int ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?:
786  		__bch2_unlink(vdir, dentry, false);
787  	return bch2_err_class(ret);
788  }
789  
bch2_symlink(struct mnt_idmap * idmap,struct inode * vdir,struct dentry * dentry,const char * symname)790  static int bch2_symlink(struct mnt_idmap *idmap,
791  			struct inode *vdir, struct dentry *dentry,
792  			const char *symname)
793  {
794  	struct bch_fs *c = vdir->i_sb->s_fs_info;
795  	struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
796  	int ret;
797  
798  	inode = __bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0,
799  			      (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
800  	if (IS_ERR(inode))
801  		return bch2_err_class(PTR_ERR(inode));
802  
803  	inode_lock(&inode->v);
804  	ret = page_symlink(&inode->v, symname, strlen(symname) + 1);
805  	inode_unlock(&inode->v);
806  
807  	if (unlikely(ret))
808  		goto err;
809  
810  	ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX);
811  	if (unlikely(ret))
812  		goto err;
813  
814  	ret = __bch2_link(c, inode, dir, dentry);
815  	if (unlikely(ret))
816  		goto err;
817  
818  	d_instantiate(dentry, &inode->v);
819  	return 0;
820  err:
821  	iput(&inode->v);
822  	return bch2_err_class(ret);
823  }
824  
/* Create a directory: bch2_mknod() with S_IFDIR and no device. */
static int bch2_mkdir(struct mnt_idmap *idmap,
		      struct inode *vdir, struct dentry *dentry, umode_t mode)
{
	const umode_t dir_mode = mode|S_IFDIR;

	return bch2_mknod(idmap, vdir, dentry, dir_mode, 0);
}
830  
bch2_rename2(struct mnt_idmap * idmap,struct inode * src_vdir,struct dentry * src_dentry,struct inode * dst_vdir,struct dentry * dst_dentry,unsigned flags)831  static int bch2_rename2(struct mnt_idmap *idmap,
832  			struct inode *src_vdir, struct dentry *src_dentry,
833  			struct inode *dst_vdir, struct dentry *dst_dentry,
834  			unsigned flags)
835  {
836  	struct bch_fs *c = src_vdir->i_sb->s_fs_info;
837  	struct bch_inode_info *src_dir = to_bch_ei(src_vdir);
838  	struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir);
839  	struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode);
840  	struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
841  	struct bch_inode_unpacked dst_dir_u, src_dir_u;
842  	struct bch_inode_unpacked src_inode_u, dst_inode_u, *whiteout_inode_u;
843  	struct btree_trans *trans;
844  	enum bch_rename_mode mode = flags & RENAME_EXCHANGE
845  		? BCH_RENAME_EXCHANGE
846  		: dst_dentry->d_inode
847  		? BCH_RENAME_OVERWRITE : BCH_RENAME;
848  	bool whiteout = !!(flags & RENAME_WHITEOUT);
849  	int ret;
850  
851  	if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE|RENAME_WHITEOUT))
852  		return -EINVAL;
853  
854  	if (mode == BCH_RENAME_OVERWRITE) {
855  		ret = filemap_write_and_wait_range(src_inode->v.i_mapping,
856  						   0, LLONG_MAX);
857  		if (ret)
858  			return ret;
859  	}
860  
861  	bch2_lock_inodes(INODE_UPDATE_LOCK,
862  			 src_dir,
863  			 dst_dir,
864  			 src_inode,
865  			 dst_inode);
866  
867  	trans = bch2_trans_get(c);
868  
869  	ret   = bch2_subvol_is_ro_trans(trans, src_dir->ei_inum.subvol) ?:
870  		bch2_subvol_is_ro_trans(trans, dst_dir->ei_inum.subvol);
871  	if (ret)
872  		goto err_tx_restart;
873  
874  	if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) {
875  		ret = bch2_fs_quota_transfer(c, src_inode,
876  					     dst_dir->ei_qid,
877  					     1 << QTYP_PRJ,
878  					     KEY_TYPE_QUOTA_PREALLOC);
879  		if (ret)
880  			goto err;
881  	}
882  
883  	if (mode == BCH_RENAME_EXCHANGE &&
884  	    inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) {
885  		ret = bch2_fs_quota_transfer(c, dst_inode,
886  					     src_dir->ei_qid,
887  					     1 << QTYP_PRJ,
888  					     KEY_TYPE_QUOTA_PREALLOC);
889  		if (ret)
890  			goto err;
891  	}
892  retry:
893  	bch2_trans_begin(trans);
894  
895  	ret = bch2_rename_trans(trans,
896  				inode_inum(src_dir), &src_dir_u,
897  				inode_inum(dst_dir), &dst_dir_u,
898  				&src_inode_u,
899  				&dst_inode_u,
900  				&src_dentry->d_name,
901  				&dst_dentry->d_name,
902  				mode);
903  	if (unlikely(ret))
904  		goto err_tx_restart;
905  
906  	if (whiteout) {
907  		whiteout_inode_u = bch2_trans_kmalloc_nomemzero(trans, sizeof(*whiteout_inode_u));
908  		ret = PTR_ERR_OR_ZERO(whiteout_inode_u);
909  		if (unlikely(ret))
910  			goto err_tx_restart;
911  		bch2_inode_init_early(c, whiteout_inode_u);
912  
913  		ret = bch2_create_trans(trans,
914  					inode_inum(src_dir), &src_dir_u,
915  					whiteout_inode_u,
916  					&src_dentry->d_name,
917  					from_kuid(i_user_ns(&src_dir->v), current_fsuid()),
918  					from_kgid(i_user_ns(&src_dir->v), current_fsgid()),
919  					S_IFCHR|WHITEOUT_MODE, 0,
920  					NULL, NULL, (subvol_inum) { 0 }, 0) ?:
921  		      bch2_quota_acct(c, bch_qid(whiteout_inode_u), Q_INO, 1,
922  				      KEY_TYPE_QUOTA_PREALLOC);
923  		if (unlikely(ret))
924  			goto err_tx_restart;
925  	}
926  
927  	ret = bch2_trans_commit(trans, NULL, NULL, 0);
928  	if (unlikely(ret)) {
929  err_tx_restart:
930  		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
931  			goto retry;
932  		goto err;
933  	}
934  
935  	BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum);
936  	BUG_ON(dst_inode &&
937  	       dst_inode->v.i_ino != dst_inode_u.bi_inum);
938  
939  	bch2_inode_update_after_write(trans, src_dir, &src_dir_u,
940  				      ATTR_MTIME|ATTR_CTIME);
941  
942  	if (src_dir != dst_dir)
943  		bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u,
944  					      ATTR_MTIME|ATTR_CTIME);
945  
946  	bch2_inode_update_after_write(trans, src_inode, &src_inode_u,
947  				      ATTR_CTIME);
948  
949  	if (dst_inode)
950  		bch2_inode_update_after_write(trans, dst_inode, &dst_inode_u,
951  					      ATTR_CTIME);
952  err:
953  	bch2_trans_put(trans);
954  
955  	bch2_fs_quota_transfer(c, src_inode,
956  			       bch_qid(&src_inode->ei_inode),
957  			       1 << QTYP_PRJ,
958  			       KEY_TYPE_QUOTA_NOCHECK);
959  	if (dst_inode)
960  		bch2_fs_quota_transfer(c, dst_inode,
961  				       bch_qid(&dst_inode->ei_inode),
962  				       1 << QTYP_PRJ,
963  				       KEY_TYPE_QUOTA_NOCHECK);
964  
965  	bch2_unlock_inodes(INODE_UPDATE_LOCK,
966  			   src_dir,
967  			   dst_dir,
968  			   src_inode,
969  			   dst_inode);
970  
971  	return bch2_err_class(ret);
972  }
973  
bch2_setattr_copy(struct mnt_idmap * idmap,struct bch_inode_info * inode,struct bch_inode_unpacked * bi,struct iattr * attr)974  static void bch2_setattr_copy(struct mnt_idmap *idmap,
975  			      struct bch_inode_info *inode,
976  			      struct bch_inode_unpacked *bi,
977  			      struct iattr *attr)
978  {
979  	struct bch_fs *c = inode->v.i_sb->s_fs_info;
980  	unsigned int ia_valid = attr->ia_valid;
981  	kuid_t kuid;
982  	kgid_t kgid;
983  
984  	if (ia_valid & ATTR_UID) {
985  		kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid);
986  		bi->bi_uid = from_kuid(i_user_ns(&inode->v), kuid);
987  	}
988  	if (ia_valid & ATTR_GID) {
989  		kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid);
990  		bi->bi_gid = from_kgid(i_user_ns(&inode->v), kgid);
991  	}
992  
993  	if (ia_valid & ATTR_SIZE)
994  		bi->bi_size = attr->ia_size;
995  
996  	if (ia_valid & ATTR_ATIME)
997  		bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime);
998  	if (ia_valid & ATTR_MTIME)
999  		bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime);
1000  	if (ia_valid & ATTR_CTIME)
1001  		bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime);
1002  
1003  	if (ia_valid & ATTR_MODE) {
1004  		umode_t mode = attr->ia_mode;
1005  		kgid_t gid = ia_valid & ATTR_GID
1006  			? kgid
1007  			: inode->v.i_gid;
1008  
1009  		if (!in_group_or_capable(idmap, &inode->v,
1010  			make_vfsgid(idmap, i_user_ns(&inode->v), gid)))
1011  			mode &= ~S_ISGID;
1012  		bi->bi_mode = mode;
1013  	}
1014  }
1015  
/*
 * Apply the attribute changes in @attr to @inode.
 *
 * Under ei_update_lock: transfers quota to the qid that results from any
 * uid/gid change, then, in a btree transaction that is retried on transaction
 * restart, reads the inode, applies @attr with bch2_setattr_copy(), updates
 * the ACL when the mode changes, and writes and commits the inode.  On
 * success the VFS inode is refreshed from the committed copy and the new ACL,
 * if any, is installed as the cached access ACL.
 *
 * Returns 0 on success or an error converted with bch2_err_class().
 *
 * NOTE(review): @acl is only kfree()d at the top of a retry; it is not freed
 * on the err_trans path nor after set_cached_acl() - confirm the intended
 * ownership of the ACL returned by bch2_acl_chmod().
 */
int bch2_setattr_nonsize(struct mnt_idmap *idmap,
			 struct bch_inode_info *inode,
			 struct iattr *attr)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct bch_qid qid;
	struct btree_trans *trans;
	struct btree_iter inode_iter = { NULL };
	struct bch_inode_unpacked inode_u;
	struct posix_acl *acl = NULL;
	kuid_t kuid;
	kgid_t kgid;
	int ret;

	mutex_lock(&inode->ei_update_lock);

	/* Start from the current quota ids and override those being changed */
	qid = inode->ei_qid;

	if (attr->ia_valid & ATTR_UID) {
		kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid);
		qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), kuid);
	}

	if (attr->ia_valid & ATTR_GID) {
		kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid);
		qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), kgid);
	}

	ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
				     KEY_TYPE_QUOTA_PREALLOC);
	if (ret)
		goto err;

	trans = bch2_trans_get(c);
retry:
	bch2_trans_begin(trans);
	/* Discard any ACL produced by a previous, restarted attempt */
	kfree(acl);
	acl = NULL;

	ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
			      BTREE_ITER_intent);
	if (ret)
		goto btree_err;

	bch2_setattr_copy(idmap, inode, &inode_u, attr);

	if (attr->ia_valid & ATTR_MODE) {
		ret = bch2_acl_chmod(trans, inode_inum(inode), &inode_u,
				     inode_u.bi_mode, &acl);
		if (ret)
			goto btree_err;
	}

	ret =   bch2_inode_write(trans, &inode_iter, &inode_u) ?:
		bch2_trans_commit(trans, NULL, NULL,
				  BCH_TRANS_COMMIT_no_enospc);
btree_err:
	/* The iterator is released on every path, including before a retry */
	bch2_trans_iter_exit(trans, &inode_iter);

	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;
	if (unlikely(ret))
		goto err_trans;

	bch2_inode_update_after_write(trans, inode, &inode_u, attr->ia_valid);

	if (acl)
		set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
err_trans:
	bch2_trans_put(trans);
err:
	mutex_unlock(&inode->ei_update_lock);

	return bch2_err_class(ret);
}
1091  
/*
 * ->getattr: fill @stat from the in-memory inode behind @path.
 *
 * Basic fields come straight from the VFS inode, with uid/gid mapped through
 * @idmap.  The subvolume id is always reported (STATX_SUBVOL); direct I/O
 * alignment and birth time are only filled in when asked for via
 * @request_mask.  The immutable/append/nodump inode flags are exported as
 * statx attributes.  Always returns 0.
 */
static int bch2_getattr(struct mnt_idmap *idmap,
			const struct path *path, struct kstat *stat,
			u32 request_mask, unsigned query_flags)
{
	struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
	struct inode *vi = &inode->v;
	struct bch_fs *c = vi->i_sb->s_fs_info;

	stat->dev	= vi->i_sb->s_dev;
	stat->ino	= vi->i_ino;
	stat->mode	= vi->i_mode;
	stat->nlink	= vi->i_nlink;
	stat->uid	= vfsuid_into_kuid(i_uid_into_vfsuid(idmap, vi));
	stat->gid	= vfsgid_into_kgid(i_gid_into_vfsgid(idmap, vi));
	stat->rdev	= vi->i_rdev;
	stat->size	= i_size_read(vi);
	stat->atime	= inode_get_atime(vi);
	stat->mtime	= inode_get_mtime(vi);
	stat->ctime	= inode_get_ctime(vi);
	stat->blksize	= block_bytes(c);
	stat->blocks	= vi->i_blocks;

	stat->subvol	= inode->ei_inum.subvol;
	stat->result_mask |= STATX_SUBVOL;

	if (S_ISREG(vi->i_mode) && (request_mask & STATX_DIOALIGN)) {
		/*
		 * this is incorrect; we should be tracking this in superblock,
		 * and checking the alignment of open devices
		 */
		stat->dio_mem_align	= SECTOR_SIZE;
		stat->dio_offset_align	= block_bytes(c);
		stat->result_mask	|= STATX_DIOALIGN;
	}

	if (request_mask & STATX_BTIME) {
		stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
		stat->result_mask |= STATX_BTIME;
	}

	/* All three attributes are supported, whether or not they are set */
	stat->attributes_mask |= STATX_ATTR_IMMUTABLE;
	stat->attributes_mask |= STATX_ATTR_APPEND;
	stat->attributes_mask |= STATX_ATTR_NODUMP;

	if (inode->ei_inode.bi_flags & BCH_INODE_immutable)
		stat->attributes |= STATX_ATTR_IMMUTABLE;
	if (inode->ei_inode.bi_flags & BCH_INODE_append)
		stat->attributes |= STATX_ATTR_APPEND;
	if (inode->ei_inode.bi_flags & BCH_INODE_nodump)
		stat->attributes |= STATX_ATTR_NODUMP;

	return 0;
}
1147  
bch2_setattr(struct mnt_idmap * idmap,struct dentry * dentry,struct iattr * iattr)1148  static int bch2_setattr(struct mnt_idmap *idmap,
1149  			struct dentry *dentry, struct iattr *iattr)
1150  {
1151  	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
1152  	struct bch_fs *c = inode->v.i_sb->s_fs_info;
1153  	int ret;
1154  
1155  	lockdep_assert_held(&inode->v.i_rwsem);
1156  
1157  	ret   = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
1158  		setattr_prepare(idmap, dentry, iattr);
1159  	if (ret)
1160  		return ret;
1161  
1162  	return iattr->ia_valid & ATTR_SIZE
1163  		? bchfs_truncate(idmap, inode, iattr)
1164  		: bch2_setattr_nonsize(idmap, inode, iattr);
1165  }
1166  
/*
 * ->tmpfile: create an inode with BCH_CREATE_TMPFILE in directory @vdir,
 * attach it to the dentry of @file and finish opening the file.
 *
 * Returns the result of finish_open_simple(), or the creation error converted
 * with bch2_err_class().
 */
static int bch2_tmpfile(struct mnt_idmap *idmap,
			struct inode *vdir, struct file *file, umode_t mode)
{
	struct dentry *dentry = file->f_path.dentry;
	struct bch_inode_info *inode;

	inode = __bch2_create(idmap, to_bch_ei(vdir), dentry, mode, 0,
			      (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
	if (IS_ERR(inode))
		return bch2_err_class(PTR_ERR(inode));

	d_mark_tmpfile(file, &inode->v);
	d_instantiate(dentry, &inode->v);

	return finish_open_simple(file, 0);
}
1182  
bch2_fill_extent(struct bch_fs * c,struct fiemap_extent_info * info,struct bkey_s_c k,unsigned flags)1183  static int bch2_fill_extent(struct bch_fs *c,
1184  			    struct fiemap_extent_info *info,
1185  			    struct bkey_s_c k, unsigned flags)
1186  {
1187  	if (bkey_extent_is_direct_data(k.k)) {
1188  		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1189  		const union bch_extent_entry *entry;
1190  		struct extent_ptr_decoded p;
1191  		int ret;
1192  
1193  		if (k.k->type == KEY_TYPE_reflink_v)
1194  			flags |= FIEMAP_EXTENT_SHARED;
1195  
1196  		bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
1197  			int flags2 = 0;
1198  			u64 offset = p.ptr.offset;
1199  
1200  			if (p.ptr.unwritten)
1201  				flags2 |= FIEMAP_EXTENT_UNWRITTEN;
1202  
1203  			if (p.crc.compression_type)
1204  				flags2 |= FIEMAP_EXTENT_ENCODED;
1205  			else
1206  				offset += p.crc.offset;
1207  
1208  			if ((offset & (block_sectors(c) - 1)) ||
1209  			    (k.k->size & (block_sectors(c) - 1)))
1210  				flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;
1211  
1212  			ret = fiemap_fill_next_extent(info,
1213  						bkey_start_offset(k.k) << 9,
1214  						offset << 9,
1215  						k.k->size << 9, flags|flags2);
1216  			if (ret)
1217  				return ret;
1218  		}
1219  
1220  		return 0;
1221  	} else if (bkey_extent_is_inline_data(k.k)) {
1222  		return fiemap_fill_next_extent(info,
1223  					       bkey_start_offset(k.k) << 9,
1224  					       0, k.k->size << 9,
1225  					       flags|
1226  					       FIEMAP_EXTENT_DATA_INLINE);
1227  	} else if (k.k->type == KEY_TYPE_reservation) {
1228  		return fiemap_fill_next_extent(info,
1229  					       bkey_start_offset(k.k) << 9,
1230  					       0, k.k->size << 9,
1231  					       flags|
1232  					       FIEMAP_EXTENT_DELALLOC|
1233  					       FIEMAP_EXTENT_UNWRITTEN);
1234  	} else {
1235  		BUG();
1236  	}
1237  }
1238  
/*
 * ->fiemap: report the extent mapping of @vinode for the byte range
 * [@start, @start + @len).
 *
 * Walks the extents btree for this inode in the snapshot currently backing
 * its subvolume, following indirect extents with bch2_read_indirect_extent().
 * Each extent is reported one iteration late (it is buffered in @prev) so
 * that the final one can be flagged FIEMAP_EXTENT_LAST.
 *
 * A positive return from bch2_fill_extent() is not an error: it ends the walk
 * and 0 is returned.
 *
 * Returns 0 on success or a negative errno.  bcachefs-internal error codes are
 * converted with bch2_err_class(), like the other VFS entry points in this
 * file, so that private codes are not handed back to the VFS.
 */
static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
		       u64 start, u64 len)
{
	struct bch_fs *c = vinode->i_sb->s_fs_info;
	struct bch_inode_info *ei = to_bch_ei(vinode);
	struct btree_trans *trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey_buf cur, prev;
	unsigned offset_into_extent, sectors;
	bool have_extent = false;
	int ret = 0;

	ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
	if (ret)
		return ret;

	/* Reject ranges that wrap around before using start + len */
	if (start + len < start)
		return -EINVAL;

	/* Byte offsets are converted to 512-byte sectors from here on */
	struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);

	start >>= 9;

	bch2_bkey_buf_init(&cur);
	bch2_bkey_buf_init(&prev);
	trans = bch2_trans_get(c);

	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
			     POS(ei->v.i_ino, start), 0);

	/* Keep going on success and on transaction restarts */
	while (!ret || bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
		enum btree_id data_btree = BTREE_ID_extents;

		bch2_trans_begin(trans);

		u32 snapshot;
		ret = bch2_subvolume_get_snapshot(trans, ei->ei_inum.subvol, &snapshot);
		if (ret)
			continue;

		bch2_btree_iter_set_snapshot(&iter, snapshot);

		k = bch2_btree_iter_peek_upto(&iter, end);
		ret = bkey_err(k);
		if (ret)
			continue;

		if (!k.k)
			break;

		/* Skip keys that are neither data nor reservations */
		if (!bkey_extent_is_data(k.k) &&
		    k.k->type != KEY_TYPE_reservation) {
			bch2_btree_iter_advance(&iter);
			continue;
		}

		offset_into_extent	= iter.pos.offset -
			bkey_start_offset(k.k);
		sectors			= k.k->size - offset_into_extent;

		bch2_bkey_buf_reassemble(&cur, c, k);

		ret = bch2_read_indirect_extent(trans, &data_btree,
					&offset_into_extent, &cur);
		if (ret)
			continue;

		k = bkey_i_to_s_c(cur.k);
		bch2_bkey_buf_realloc(&prev, c, k.k->u64s);

		sectors = min(sectors, k.k->size - offset_into_extent);

		/* Trim @cur to the part at and after the iterator position */
		bch2_cut_front(POS(k.k->p.inode,
				   bkey_start_offset(k.k) +
				   offset_into_extent),
			       cur.k);
		bch2_key_resize(&cur.k->k, sectors);
		cur.k->k.p = iter.pos;
		cur.k->k.p.offset += cur.k->k.size;

		if (have_extent) {
			/* Btree locks are dropped before reporting to fiemap */
			bch2_trans_unlock(trans);
			ret = bch2_fill_extent(c, info,
					bkey_i_to_s_c(prev.k), 0);
			if (ret)
				break;
		}

		bkey_copy(prev.k, cur.k);
		have_extent = true;

		bch2_btree_iter_set_pos(&iter,
			POS(iter.pos.inode, iter.pos.offset + sectors));
	}
	bch2_trans_iter_exit(trans, &iter);

	/* The buffered extent is the final one in the range */
	if (!ret && have_extent) {
		bch2_trans_unlock(trans);
		ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
				       FIEMAP_EXTENT_LAST);
	}

	bch2_trans_put(trans);
	bch2_bkey_buf_exit(&cur, c);
	bch2_bkey_buf_exit(&prev, c);

	return bch2_err_class(ret < 0 ? ret : 0);
}
1346  
/* VM operations installed on file mappings by bch2_mmap() */
static const struct vm_operations_struct bch_vm_ops = {
	.fault		= bch2_page_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite   = bch2_page_mkwrite,
};
1352  
bch2_mmap(struct file * file,struct vm_area_struct * vma)1353  static int bch2_mmap(struct file *file, struct vm_area_struct *vma)
1354  {
1355  	file_accessed(file);
1356  
1357  	vma->vm_ops = &bch_vm_ops;
1358  	return 0;
1359  }
1360  
1361  /* Directories: */
1362  
/*
 * ->llseek for directories: generic seek with both the maximum offset and the
 * end-of-file position set to S64_MAX.
 */
static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence)
{
	const loff_t limit = S64_MAX;

	return generic_file_llseek_size(file, offset, whence, limit, limit);
}
1368  
bch2_vfs_readdir(struct file * file,struct dir_context * ctx)1369  static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
1370  {
1371  	struct bch_inode_info *inode = file_bch_inode(file);
1372  	struct bch_fs *c = inode->v.i_sb->s_fs_info;
1373  
1374  	if (!dir_emit_dots(file, ctx))
1375  		return 0;
1376  
1377  	int ret = bch2_readdir(c, inode_inum(inode), ctx);
1378  
1379  	bch_err_fn(c, ret);
1380  	return bch2_err_class(ret);
1381  }
1382  
bch2_open(struct inode * vinode,struct file * file)1383  static int bch2_open(struct inode *vinode, struct file *file)
1384  {
1385  	if (file->f_flags & (O_WRONLY|O_RDWR)) {
1386  		struct bch_inode_info *inode = to_bch_ei(vinode);
1387  		struct bch_fs *c = inode->v.i_sb->s_fs_info;
1388  
1389  		int ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol);
1390  		if (ret)
1391  			return ret;
1392  	}
1393  
1394  	file->f_mode |= FMODE_CAN_ODIRECT;
1395  
1396  	return generic_file_open(vinode, file);
1397  }
1398  
/* File operations for regular files (S_IFREG, see bch2_vfs_inode_init()) */
static const struct file_operations bch_file_operations = {
	.open		= bch2_open,
	.llseek		= bch2_llseek,
	.read_iter	= bch2_read_iter,
	.write_iter	= bch2_write_iter,
	.mmap		= bch2_mmap,
	.get_unmapped_area = thp_get_unmapped_area,
	.fsync		= bch2_fsync,
	.splice_read	= filemap_splice_read,
	.splice_write	= iter_file_splice_write,
	.fallocate	= bch2_fallocate_dispatch,
	.unlocked_ioctl = bch2_fs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= bch2_compat_fs_ioctl,
#endif
	.remap_file_range = bch2_remap_file_range,
};
1416  
/* Inode operations for regular files (S_IFREG, see bch2_vfs_inode_init()) */
static const struct inode_operations bch_file_inode_operations = {
	.getattr	= bch2_getattr,
	.setattr	= bch2_setattr,
	.fiemap		= bch2_fiemap,
	.listxattr	= bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
	.get_inode_acl	= bch2_get_acl,
	.set_acl	= bch2_set_acl,
#endif
};
1427  
/*
 * Inode operations for directories (S_IFDIR, see bch2_vfs_inode_init()).
 * Note that ->rmdir shares bch2_unlink() with ->unlink.
 */
static const struct inode_operations bch_dir_inode_operations = {
	.lookup		= bch2_lookup,
	.create		= bch2_create,
	.link		= bch2_link,
	.unlink		= bch2_unlink,
	.symlink	= bch2_symlink,
	.mkdir		= bch2_mkdir,
	.rmdir		= bch2_unlink,
	.mknod		= bch2_mknod,
	.rename		= bch2_rename2,
	.getattr	= bch2_getattr,
	.setattr	= bch2_setattr,
	.tmpfile	= bch2_tmpfile,
	.listxattr	= bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
	.get_inode_acl	= bch2_get_acl,
	.set_acl	= bch2_set_acl,
#endif
};
1447  
/* File operations for open directories (S_IFDIR, see bch2_vfs_inode_init()) */
static const struct file_operations bch_dir_file_operations = {
	.llseek		= bch2_dir_llseek,
	.read		= generic_read_dir,
	.iterate_shared	= bch2_vfs_readdir,
	.fsync		= bch2_fsync,
	.unlocked_ioctl = bch2_fs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= bch2_compat_fs_ioctl,
#endif
};
1458  
/* Inode operations for symlinks (S_IFLNK, see bch2_vfs_inode_init()) */
static const struct inode_operations bch_symlink_inode_operations = {
	.get_link	= page_get_link,
	.getattr	= bch2_getattr,
	.setattr	= bch2_setattr,
	.listxattr	= bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
	.get_inode_acl	= bch2_get_acl,
	.set_acl	= bch2_set_acl,
#endif
};
1469  
/*
 * Inode operations for every other file type (the default case in
 * bch2_vfs_inode_init(), which also calls init_special_inode()).
 */
static const struct inode_operations bch_special_inode_operations = {
	.getattr	= bch2_getattr,
	.setattr	= bch2_setattr,
	.listxattr	= bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
	.get_inode_acl	= bch2_get_acl,
	.set_acl	= bch2_set_acl,
#endif
};
1479  
/*
 * Address space operations, installed on the mapping of every inode by
 * bch2_vfs_inode_init().
 */
static const struct address_space_operations bch_address_space_operations = {
	.read_folio	= bch2_read_folio,
	.writepages	= bch2_writepages,
	.readahead	= bch2_readahead,
	.dirty_folio	= filemap_dirty_folio,
	.write_begin	= bch2_write_begin,
	.write_end	= bch2_write_end,
	.invalidate_folio = bch2_invalidate_folio,
	.release_folio	= bch2_release_folio,
#ifdef CONFIG_MIGRATION
	.migrate_folio	= filemap_migrate_folio,
#endif
	.error_remove_folio = generic_error_remove_folio,
};
1494  
/*
 * File handle payload identifying one inode: inode number, subvolume and
 * inode generation (see bch2_inode_to_fid()).  Packed; its size in u32 words
 * is what bcachefs_fid_valid() checks file handle lengths against.
 */
struct bcachefs_fid {
	u64		inum;
	u32		subvol;
	u32		gen;
} __packed;
1500  
/*
 * File handle payload for FILEID_BCACHEFS_WITH_PARENT: the fid of the inode
 * itself followed by the fid of its parent directory.
 */
struct bcachefs_fid_with_parent {
	struct bcachefs_fid	fid;
	struct bcachefs_fid	dir;
} __packed;
1505  
bcachefs_fid_valid(int fh_len,int fh_type)1506  static int bcachefs_fid_valid(int fh_len, int fh_type)
1507  {
1508  	switch (fh_type) {
1509  	case FILEID_BCACHEFS_WITHOUT_PARENT:
1510  		return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32);
1511  	case FILEID_BCACHEFS_WITH_PARENT:
1512  		return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32);
1513  	default:
1514  		return false;
1515  	}
1516  }
1517  
bch2_inode_to_fid(struct bch_inode_info * inode)1518  static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode)
1519  {
1520  	return (struct bcachefs_fid) {
1521  		.inum	= inode->ei_inum.inum,
1522  		.subvol	= inode->ei_inum.subvol,
1523  		.gen	= inode->ei_inode.bi_generation,
1524  	};
1525  }
1526  
/*
 * ->encode_fh: encode a file handle for @vinode into @fh.
 *
 * For a non-directory with a parent supplied, the handle carries both the
 * inode's and the parent's fid (FILEID_BCACHEFS_WITH_PARENT); otherwise only
 * the inode's fid (FILEID_BCACHEFS_WITHOUT_PARENT).
 *
 * On entry *@len is the space available in @fh, in u32 words; on return it is
 * the number of words the handle needs.  If the buffer is too small nothing
 * is written and FILEID_INVALID is returned.
 */
static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len,
			  struct inode *vdir)
{
	struct bch_inode_info *inode	= to_bch_ei(vinode);
	struct bch_inode_info *dir	= to_bch_ei(vdir);
	const bool with_parent		= !S_ISDIR(inode->v.i_mode) && dir;
	const int needed = with_parent
		? sizeof(struct bcachefs_fid_with_parent) / sizeof(u32)
		: sizeof(struct bcachefs_fid) / sizeof(u32);
	const int available = *len;

	/* The required length is reported on success and on failure alike */
	*len = needed;

	if (available < needed)
		return FILEID_INVALID;

	if (with_parent) {
		struct bcachefs_fid_with_parent *out = (void *) fh;

		out->fid = bch2_inode_to_fid(inode);
		out->dir = bch2_inode_to_fid(dir);
		return FILEID_BCACHEFS_WITH_PARENT;
	}

	*(struct bcachefs_fid *) fh = bch2_inode_to_fid(inode);
	return FILEID_BCACHEFS_WITHOUT_PARENT;
}
1562  
bch2_nfs_get_inode(struct super_block * sb,struct bcachefs_fid fid)1563  static struct inode *bch2_nfs_get_inode(struct super_block *sb,
1564  					struct bcachefs_fid fid)
1565  {
1566  	struct bch_fs *c = sb->s_fs_info;
1567  	struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) {
1568  				    .subvol = fid.subvol,
1569  				    .inum = fid.inum,
1570  	});
1571  	if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) {
1572  		iput(vinode);
1573  		vinode = ERR_PTR(-ESTALE);
1574  	}
1575  	return vinode;
1576  }
1577  
bch2_fh_to_dentry(struct super_block * sb,struct fid * _fid,int fh_len,int fh_type)1578  static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid,
1579  		int fh_len, int fh_type)
1580  {
1581  	struct bcachefs_fid *fid = (void *) _fid;
1582  
1583  	if (!bcachefs_fid_valid(fh_len, fh_type))
1584  		return NULL;
1585  
1586  	return d_obtain_alias(bch2_nfs_get_inode(sb, *fid));
1587  }
1588  
bch2_fh_to_parent(struct super_block * sb,struct fid * _fid,int fh_len,int fh_type)1589  static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid,
1590  		int fh_len, int fh_type)
1591  {
1592  	struct bcachefs_fid_with_parent *fid = (void *) _fid;
1593  
1594  	if (!bcachefs_fid_valid(fh_len, fh_type) ||
1595  	    fh_type != FILEID_BCACHEFS_WITH_PARENT)
1596  		return NULL;
1597  
1598  	return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir));
1599  }
1600  
bch2_get_parent(struct dentry * child)1601  static struct dentry *bch2_get_parent(struct dentry *child)
1602  {
1603  	struct bch_inode_info *inode = to_bch_ei(child->d_inode);
1604  	struct bch_fs *c = inode->v.i_sb->s_fs_info;
1605  	subvol_inum parent_inum = {
1606  		.subvol = inode->ei_inode.bi_parent_subvol ?:
1607  			inode->ei_inum.subvol,
1608  		.inum = inode->ei_inode.bi_dir,
1609  	};
1610  
1611  	return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum));
1612  }
1613  
bch2_get_name(struct dentry * parent,char * name,struct dentry * child)1614  static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child)
1615  {
1616  	struct bch_inode_info *inode	= to_bch_ei(child->d_inode);
1617  	struct bch_inode_info *dir	= to_bch_ei(parent->d_inode);
1618  	struct bch_fs *c = inode->v.i_sb->s_fs_info;
1619  	struct btree_trans *trans;
1620  	struct btree_iter iter1;
1621  	struct btree_iter iter2;
1622  	struct bkey_s_c k;
1623  	struct bkey_s_c_dirent d;
1624  	struct bch_inode_unpacked inode_u;
1625  	subvol_inum target;
1626  	u32 snapshot;
1627  	struct qstr dirent_name;
1628  	unsigned name_len = 0;
1629  	int ret;
1630  
1631  	if (!S_ISDIR(dir->v.i_mode))
1632  		return -EINVAL;
1633  
1634  	trans = bch2_trans_get(c);
1635  
1636  	bch2_trans_iter_init(trans, &iter1, BTREE_ID_dirents,
1637  			     POS(dir->ei_inode.bi_inum, 0), 0);
1638  	bch2_trans_iter_init(trans, &iter2, BTREE_ID_dirents,
1639  			     POS(dir->ei_inode.bi_inum, 0), 0);
1640  retry:
1641  	bch2_trans_begin(trans);
1642  
1643  	ret = bch2_subvolume_get_snapshot(trans, dir->ei_inum.subvol, &snapshot);
1644  	if (ret)
1645  		goto err;
1646  
1647  	bch2_btree_iter_set_snapshot(&iter1, snapshot);
1648  	bch2_btree_iter_set_snapshot(&iter2, snapshot);
1649  
1650  	ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u);
1651  	if (ret)
1652  		goto err;
1653  
1654  	if (inode_u.bi_dir == dir->ei_inode.bi_inum) {
1655  		bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset));
1656  
1657  		k = bch2_btree_iter_peek_slot(&iter1);
1658  		ret = bkey_err(k);
1659  		if (ret)
1660  			goto err;
1661  
1662  		if (k.k->type != KEY_TYPE_dirent) {
1663  			ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
1664  			goto err;
1665  		}
1666  
1667  		d = bkey_s_c_to_dirent(k);
1668  		ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
1669  		if (ret > 0)
1670  			ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
1671  		if (ret)
1672  			goto err;
1673  
1674  		if (subvol_inum_eq(target, inode->ei_inum))
1675  			goto found;
1676  	} else {
1677  		/*
1678  		 * File with multiple hardlinks and our backref is to the wrong
1679  		 * directory - linear search:
1680  		 */
1681  		for_each_btree_key_continue_norestart(iter2, 0, k, ret) {
1682  			if (k.k->p.inode > dir->ei_inode.bi_inum)
1683  				break;
1684  
1685  			if (k.k->type != KEY_TYPE_dirent)
1686  				continue;
1687  
1688  			d = bkey_s_c_to_dirent(k);
1689  			ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
1690  			if (ret < 0)
1691  				break;
1692  			if (ret)
1693  				continue;
1694  
1695  			if (subvol_inum_eq(target, inode->ei_inum))
1696  				goto found;
1697  		}
1698  	}
1699  
1700  	ret = -ENOENT;
1701  	goto err;
1702  found:
1703  	dirent_name = bch2_dirent_get_name(d);
1704  
1705  	name_len = min_t(unsigned, dirent_name.len, NAME_MAX);
1706  	memcpy(name, dirent_name.name, name_len);
1707  	name[name_len] = '\0';
1708  err:
1709  	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1710  		goto retry;
1711  
1712  	bch2_trans_iter_exit(trans, &iter1);
1713  	bch2_trans_iter_exit(trans, &iter2);
1714  	bch2_trans_put(trans);
1715  
1716  	return ret;
1717  }
1718  
/*
 * Export operations: file handle encoding/decoding plus parent and name
 * lookup for inodes reached through a file handle.
 */
static const struct export_operations bch_export_ops = {
	.encode_fh	= bch2_encode_fh,
	.fh_to_dentry	= bch2_fh_to_dentry,
	.fh_to_parent	= bch2_fh_to_parent,
	.get_parent	= bch2_get_parent,
	.get_name	= bch2_get_name,
};
1726  
/*
 * Initialise the in-memory inode @inode from its unpacked on-disk form @bi.
 *
 * Sets the inode's identity (@inum), copies the attributes across, resets the
 * per-inode flags and quota reservation, marks the inode when @subvol is a
 * snapshot, and installs the operation tables matching the file type.
 */
static void bch2_vfs_inode_init(struct btree_trans *trans,
				subvol_inum inum,
				struct bch_inode_info *inode,
				struct bch_inode_unpacked *bi,
				struct bch_subvolume *subvol)
{
	struct inode *vi = &inode->v;

	/*
	 * i_ino has to be set first: bch2_inode_update_after_write() asserts
	 * that it matches bi->bi_inum.
	 */
	vi->i_ino		= inum.inum;
	inode->ei_inum		= inum;
	inode->ei_inode.bi_inum	= inum.inum;
	bch2_inode_update_after_write(trans, inode, bi, ~0);

	vi->i_blocks		= bi->bi_sectors;
	vi->i_ino		= bi->bi_inum;
	vi->i_rdev		= bi->bi_dev;
	vi->i_generation	= bi->bi_generation;
	vi->i_size		= bi->bi_size;

	inode->ei_flags		= 0;
	inode->ei_quota_reserved = 0;
	inode->ei_qid		= bch_qid(bi);

	if (BCH_SUBVOLUME_SNAP(subvol))
		set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);

	vi->i_mapping->a_ops = &bch_address_space_operations;

	/* Pick the operation tables from the file type bits of i_mode */
	const umode_t fmt = vi->i_mode & S_IFMT;

	if (fmt == S_IFREG) {
		vi->i_op	= &bch_file_inode_operations;
		vi->i_fop	= &bch_file_operations;
	} else if (fmt == S_IFDIR) {
		vi->i_op	= &bch_dir_inode_operations;
		vi->i_fop	= &bch_dir_file_operations;
	} else if (fmt == S_IFLNK) {
		inode_nohighmem(vi);
		vi->i_op	= &bch_symlink_inode_operations;
	} else {
		init_special_inode(vi, vi->i_mode, vi->i_rdev);
		vi->i_op	= &bch_special_inode_operations;
	}

	mapping_set_large_folios(vi->i_mapping);
}
1774  
bch2_free_inode(struct inode * vinode)1775  static void bch2_free_inode(struct inode *vinode)
1776  {
1777  	kmem_cache_free(bch2_inode_cache, to_bch_ei(vinode));
1778  }
1779  
inode_update_times_fn(struct btree_trans * trans,struct bch_inode_info * inode,struct bch_inode_unpacked * bi,void * p)1780  static int inode_update_times_fn(struct btree_trans *trans,
1781  				 struct bch_inode_info *inode,
1782  				 struct bch_inode_unpacked *bi,
1783  				 void *p)
1784  {
1785  	struct bch_fs *c = inode->v.i_sb->s_fs_info;
1786  
1787  	bi->bi_atime	= timespec_to_bch2_time(c, inode_get_atime(&inode->v));
1788  	bi->bi_mtime	= timespec_to_bch2_time(c, inode_get_mtime(&inode->v));
1789  	bi->bi_ctime	= timespec_to_bch2_time(c, inode_get_ctime(&inode->v));
1790  
1791  	return 0;
1792  }
1793  
bch2_vfs_write_inode(struct inode * vinode,struct writeback_control * wbc)1794  static int bch2_vfs_write_inode(struct inode *vinode,
1795  				struct writeback_control *wbc)
1796  {
1797  	struct bch_fs *c = vinode->i_sb->s_fs_info;
1798  	struct bch_inode_info *inode = to_bch_ei(vinode);
1799  	int ret;
1800  
1801  	mutex_lock(&inode->ei_update_lock);
1802  	ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
1803  			       ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
1804  	mutex_unlock(&inode->ei_update_lock);
1805  
1806  	return bch2_err_class(ret);
1807  }
1808  
/*
 * Tear down an in-memory inode.
 *
 * Drops the page cache and clears the VFS inode.  If the inode has no links
 * left (and is not a bad inode) it is also deleted: its space and inode quota
 * are released and bch2_inode_rm() is called.  In every case the inode is
 * removed from the inode hash table and from c->vfs_inodes_list; only the
 * point at which the hash removal happens differs (see the comments below).
 */
static void bch2_evict_inode(struct inode *vinode)
{
	struct bch_fs *c = vinode->i_sb->s_fs_info;
	struct bch_inode_info *inode = to_bch_ei(vinode);
	bool delete = !inode->v.i_nlink && !is_bad_inode(&inode->v);

	/*
	 * evict() has waited for outstanding writeback, we'll do no more IO
	 * through this inode: it's safe to remove from VFS inode hashtable here
	 *
	 * Do that now so that other threads aren't blocked from pulling it back
	 * in, there's no reason for them to be:
	 */
	if (!delete)
		bch2_inode_hash_remove(c, inode);

	truncate_inode_pages_final(&inode->v.i_data);

	clear_inode(&inode->v);

	/* No quota reservation may be outstanding on a good inode here */
	BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);

	if (delete) {
		/* Give back the space and the inode charged to this qid */
		bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
				KEY_TYPE_QUOTA_WARN);
		bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
				KEY_TYPE_QUOTA_WARN);
		bch2_inode_rm(c, inode_inum(inode));

		/*
		 * If we are deleting, we need it present in the vfs hash table
		 * so that fsck can check if unlinked inodes are still open:
		 */
		bch2_inode_hash_remove(c, inode);
	}

	mutex_lock(&c->vfs_inodes_lock);
	list_del_init(&inode->ei_vfs_inode_list);
	mutex_unlock(&c->vfs_inodes_lock);
}
1849  
/*
 * Push every in-memory inode whose subvolume id is found in @s out of the
 * inode cache.
 *
 * Works in passes over c->vfs_inodes_list.  Matching inodes that are not yet
 * marked I_DONTCACHE (and are not being freed) are grabbed under the lock,
 * then - with the lock dropped - marked with d_mark_dontcache(), have their
 * dentry aliases pruned and are released again.  After a pass that found
 * nothing left to grab, any matching inode still on the list is waited for;
 * the function returns once two consecutive passes were clean.
 */
void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s)
{
	struct bch_inode_info *inode;
	DARRAY(struct bch_inode_info *) grabbed;
	bool clean_pass = false, this_pass_clean;

	/*
	 * Initially, we scan for inodes without I_DONTCACHE, then mark them to
	 * be pruned with d_mark_dontcache().
	 *
	 * Once we've had a clean pass where we didn't find any inodes without
	 * I_DONTCACHE, we wait for them to be freed:
	 */

	darray_init(&grabbed);
	/* Preallocate room up front; pushes under the mutex use GFP_ATOMIC */
	darray_make_room(&grabbed, 1024);
again:
	cond_resched();
	this_pass_clean = true;

	mutex_lock(&c->vfs_inodes_lock);
	list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) {
		if (!snapshot_list_has_id(s, inode->ei_inum.subvol))
			continue;

		if (!(inode->v.i_state & I_DONTCACHE) &&
		    !(inode->v.i_state & I_FREEING) &&
		    igrab(&inode->v)) {
			this_pass_clean = false;

			/*
			 * Out of room: drop this reference and process what has
			 * been collected; the pass is not clean, so the scan
			 * runs again afterwards.
			 */
			if (darray_push_gfp(&grabbed, inode, GFP_ATOMIC|__GFP_NOWARN)) {
				iput(&inode->v);
				break;
			}
		} else if (clean_pass && this_pass_clean) {
			struct wait_bit_queue_entry wqe;
			struct wait_queue_head *wq_head;

			/*
			 * Sleep on this inode's __I_NEW bit waitqueue, then
			 * rescan from the start.
			 * NOTE(review): presumably woken when the inode is
			 * freed - confirm against the VFS eviction path.
			 */
			wq_head = inode_bit_waitqueue(&wqe, &inode->v, __I_NEW);
			prepare_to_wait_event(wq_head, &wqe.wq_entry,
					      TASK_UNINTERRUPTIBLE);
			mutex_unlock(&c->vfs_inodes_lock);

			schedule();
			finish_wait(wq_head, &wqe.wq_entry);
			goto again;
		}
	}
	mutex_unlock(&c->vfs_inodes_lock);

	/* With the list lock dropped: mark, prune and release each inode */
	darray_for_each(grabbed, i) {
		inode = *i;
		d_mark_dontcache(&inode->v);
		d_prune_aliases(&inode->v);
		iput(&inode->v);
	}
	grabbed.nr = 0;

	/* Finish only after two consecutive passes that grabbed nothing */
	if (!clean_pass || !this_pass_clean) {
		clean_pass = this_pass_clean;
		goto again;
	}

	darray_exit(&grabbed);
}
1915  
bch2_statfs(struct dentry * dentry,struct kstatfs * buf)1916  static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
1917  {
1918  	struct super_block *sb = dentry->d_sb;
1919  	struct bch_fs *c = sb->s_fs_info;
1920  	struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c);
1921  	unsigned shift = sb->s_blocksize_bits - 9;
1922  	/*
1923  	 * this assumes inodes take up 64 bytes, which is a decent average
1924  	 * number:
1925  	 */
1926  	u64 avail_inodes = ((usage.capacity - usage.used) << 3);
1927  
1928  	buf->f_type	= BCACHEFS_STATFS_MAGIC;
1929  	buf->f_bsize	= sb->s_blocksize;
1930  	buf->f_blocks	= usage.capacity >> shift;
1931  	buf->f_bfree	= usage.free >> shift;
1932  	buf->f_bavail	= avail_factor(usage.free) >> shift;
1933  
1934  	buf->f_files	= usage.nr_inodes + avail_inodes;
1935  	buf->f_ffree	= avail_inodes;
1936  
1937  	buf->f_fsid	= uuid_to_fsid(c->sb.user_uuid.b);
1938  	buf->f_namelen	= BCH_NAME_MAX;
1939  
1940  	return 0;
1941  }
1942  
bch2_sync_fs(struct super_block * sb,int wait)1943  static int bch2_sync_fs(struct super_block *sb, int wait)
1944  {
1945  	struct bch_fs *c = sb->s_fs_info;
1946  	int ret;
1947  
1948  	trace_bch2_sync_fs(sb, wait);
1949  
1950  	if (c->opts.journal_flush_disabled)
1951  		return 0;
1952  
1953  	if (!wait) {
1954  		bch2_journal_flush_async(&c->journal, NULL);
1955  		return 0;
1956  	}
1957  
1958  	ret = bch2_journal_flush(&c->journal);
1959  	return bch2_err_class(ret);
1960  }
1961  
bch2_path_to_fs(const char * path)1962  static struct bch_fs *bch2_path_to_fs(const char *path)
1963  {
1964  	struct bch_fs *c;
1965  	dev_t dev;
1966  	int ret;
1967  
1968  	ret = lookup_bdev(path, &dev);
1969  	if (ret)
1970  		return ERR_PTR(ret);
1971  
1972  	c = bch2_dev_to_fs(dev);
1973  	if (c)
1974  		closure_put(&c->cl);
1975  	return c ?: ERR_PTR(-ENOENT);
1976  }
1977  
bch2_remount(struct super_block * sb,int * flags,struct bch_opts opts)1978  static int bch2_remount(struct super_block *sb, int *flags,
1979  			struct bch_opts opts)
1980  {
1981  	struct bch_fs *c = sb->s_fs_info;
1982  	int ret = 0;
1983  
1984  	opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);
1985  
1986  	if (opts.read_only != c->opts.read_only) {
1987  		down_write(&c->state_lock);
1988  
1989  		if (opts.read_only) {
1990  			bch2_fs_read_only(c);
1991  
1992  			sb->s_flags |= SB_RDONLY;
1993  		} else {
1994  			ret = bch2_fs_read_write(c);
1995  			if (ret) {
1996  				bch_err(c, "error going rw: %i", ret);
1997  				up_write(&c->state_lock);
1998  				ret = -EINVAL;
1999  				goto err;
2000  			}
2001  
2002  			sb->s_flags &= ~SB_RDONLY;
2003  		}
2004  
2005  		c->opts.read_only = opts.read_only;
2006  
2007  		up_write(&c->state_lock);
2008  	}
2009  
2010  	if (opt_defined(opts, errors))
2011  		c->opts.errors = opts.errors;
2012  err:
2013  	return bch2_err_class(ret);
2014  }
2015  
bch2_show_devname(struct seq_file * seq,struct dentry * root)2016  static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
2017  {
2018  	struct bch_fs *c = root->d_sb->s_fs_info;
2019  	bool first = true;
2020  
2021  	for_each_online_member(c, ca) {
2022  		if (!first)
2023  			seq_putc(seq, ':');
2024  		first = false;
2025  		seq_puts(seq, ca->disk_sb.sb_name);
2026  	}
2027  
2028  	return 0;
2029  }
2030  
bch2_show_options(struct seq_file * seq,struct dentry * root)2031  static int bch2_show_options(struct seq_file *seq, struct dentry *root)
2032  {
2033  	struct bch_fs *c = root->d_sb->s_fs_info;
2034  	struct printbuf buf = PRINTBUF;
2035  
2036  	bch2_opts_to_text(&buf, c->opts, c, c->disk_sb.sb,
2037  			  OPT_MOUNT, OPT_HIDDEN, OPT_SHOW_MOUNT_STYLE);
2038  	printbuf_nul_terminate(&buf);
2039  	seq_printf(seq, ",%s", buf.buf);
2040  
2041  	int ret = buf.allocation_failure ? -ENOMEM : 0;
2042  	printbuf_exit(&buf);
2043  	return ret;
2044  }
2045  
bch2_put_super(struct super_block * sb)2046  static void bch2_put_super(struct super_block *sb)
2047  {
2048  	struct bch_fs *c = sb->s_fs_info;
2049  
2050  	__bch2_fs_stop(c);
2051  }
2052  
2053  /*
2054   * bcachefs doesn't currently integrate intwrite freeze protection but the
2055   * internal write references serve the same purpose. Therefore reuse the
2056   * read-only transition code to perform the quiesce. The caveat is that we don't
2057   * currently have the ability to block tasks that want a write reference while
2058   * the superblock is frozen. This is fine for now, but we should either add
2059   * blocking support or find a way to integrate sb_start_intwrite() and friends.
2060   */
bch2_freeze(struct super_block * sb)2061  static int bch2_freeze(struct super_block *sb)
2062  {
2063  	struct bch_fs *c = sb->s_fs_info;
2064  
2065  	down_write(&c->state_lock);
2066  	bch2_fs_read_only(c);
2067  	up_write(&c->state_lock);
2068  	return 0;
2069  }
2070  
bch2_unfreeze(struct super_block * sb)2071  static int bch2_unfreeze(struct super_block *sb)
2072  {
2073  	struct bch_fs *c = sb->s_fs_info;
2074  	int ret;
2075  
2076  	if (test_bit(BCH_FS_emergency_ro, &c->flags))
2077  		return 0;
2078  
2079  	down_write(&c->state_lock);
2080  	ret = bch2_fs_read_write(c);
2081  	up_write(&c->state_lock);
2082  	return ret;
2083  }
2084  
2085  static const struct super_operations bch_super_operations = {
2086  	.alloc_inode	= bch2_alloc_inode,
2087  	.free_inode	= bch2_free_inode,
2088  	.write_inode	= bch2_vfs_write_inode,
2089  	.evict_inode	= bch2_evict_inode,
2090  	.sync_fs	= bch2_sync_fs,
2091  	.statfs		= bch2_statfs,
2092  	.show_devname	= bch2_show_devname,
2093  	.show_options	= bch2_show_options,
2094  	.put_super	= bch2_put_super,
2095  	.freeze_fs	= bch2_freeze,
2096  	.unfreeze_fs	= bch2_unfreeze,
2097  };
2098  
bch2_set_super(struct super_block * s,void * data)2099  static int bch2_set_super(struct super_block *s, void *data)
2100  {
2101  	s->s_fs_info = data;
2102  	return 0;
2103  }
2104  
/*
 * sget() "set" callback that refuses unconditionally with -EBUSY.
 *
 * Passed to the first sget() call in bch2_fs_get_tree(), where only an
 * already-existing superblock is wanted; presumably this keeps sget() from
 * setting up a new one when nothing matches.
 */
static int bch2_noset_super(struct super_block *s, void *data)
{
	const int refused = -EBUSY;

	return refused;
}
2109  
/* Dynamic array of filesystem pointers; entries may be ERR_PTR values, see bch2_fs_get_tree(). */
typedef DARRAY(struct bch_fs *) darray_fs;
2111  
bch2_test_super(struct super_block * s,void * data)2112  static int bch2_test_super(struct super_block *s, void *data)
2113  {
2114  	struct bch_fs *c = s->s_fs_info;
2115  	darray_fs *d = data;
2116  
2117  	if (!c)
2118  		return false;
2119  
2120  	darray_for_each(*d, i)
2121  		if (c != *i)
2122  			return false;
2123  	return true;
2124  }
2125  
bch2_fs_get_tree(struct fs_context * fc)2126  static int bch2_fs_get_tree(struct fs_context *fc)
2127  {
2128  	struct bch_fs *c;
2129  	struct super_block *sb;
2130  	struct inode *vinode;
2131  	struct bch2_opts_parse *opts_parse = fc->fs_private;
2132  	struct bch_opts opts = opts_parse->opts;
2133  	darray_str devs;
2134  	darray_fs devs_to_fs = {};
2135  	int ret;
2136  
2137  	opt_set(opts, read_only, (fc->sb_flags & SB_RDONLY) != 0);
2138  	opt_set(opts, nostart, true);
2139  
2140  	if (!fc->source || strlen(fc->source) == 0)
2141  		return -EINVAL;
2142  
2143  	ret = bch2_split_devs(fc->source, &devs);
2144  	if (ret)
2145  		return ret;
2146  
2147  	darray_for_each(devs, i) {
2148  		ret = darray_push(&devs_to_fs, bch2_path_to_fs(*i));
2149  		if (ret)
2150  			goto err;
2151  	}
2152  
2153  	sb = sget(fc->fs_type, bch2_test_super, bch2_noset_super, fc->sb_flags|SB_NOSEC, &devs_to_fs);
2154  	if (!IS_ERR(sb))
2155  		goto got_sb;
2156  
2157  	c = bch2_fs_open(devs.data, devs.nr, opts);
2158  	ret = PTR_ERR_OR_ZERO(c);
2159  	if (ret)
2160  		goto err;
2161  
2162  	/* Some options can't be parsed until after the fs is started: */
2163  	opts = bch2_opts_empty();
2164  	ret = bch2_parse_mount_opts(c, &opts, NULL, opts_parse->parse_later.buf);
2165  	if (ret)
2166  		goto err_stop_fs;
2167  
2168  	bch2_opts_apply(&c->opts, opts);
2169  
2170  	ret = bch2_fs_start(c);
2171  	if (ret)
2172  		goto err_stop_fs;
2173  
2174  	sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, c);
2175  	ret = PTR_ERR_OR_ZERO(sb);
2176  	if (ret)
2177  		goto err_stop_fs;
2178  got_sb:
2179  	c = sb->s_fs_info;
2180  
2181  	if (sb->s_root) {
2182  		if ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY) {
2183  			ret = -EBUSY;
2184  			goto err_put_super;
2185  		}
2186  		goto out;
2187  	}
2188  
2189  	sb->s_blocksize		= block_bytes(c);
2190  	sb->s_blocksize_bits	= ilog2(block_bytes(c));
2191  	sb->s_maxbytes		= MAX_LFS_FILESIZE;
2192  	sb->s_op		= &bch_super_operations;
2193  	sb->s_export_op		= &bch_export_ops;
2194  #ifdef CONFIG_BCACHEFS_QUOTA
2195  	sb->s_qcop		= &bch2_quotactl_operations;
2196  	sb->s_quota_types	= QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ;
2197  #endif
2198  	sb->s_xattr		= bch2_xattr_handlers;
2199  	sb->s_magic		= BCACHEFS_STATFS_MAGIC;
2200  	sb->s_time_gran		= c->sb.nsec_per_time_unit;
2201  	sb->s_time_min		= div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
2202  	sb->s_time_max		= div_s64(S64_MAX, c->sb.time_units_per_sec);
2203  	sb->s_uuid		= c->sb.user_uuid;
2204  	sb->s_shrink->seeks	= 0;
2205  	c->vfs_sb		= sb;
2206  	strscpy(sb->s_id, c->name, sizeof(sb->s_id));
2207  
2208  	ret = super_setup_bdi(sb);
2209  	if (ret)
2210  		goto err_put_super;
2211  
2212  	sb->s_bdi->ra_pages		= VM_READAHEAD_PAGES;
2213  
2214  	for_each_online_member(c, ca) {
2215  		struct block_device *bdev = ca->disk_sb.bdev;
2216  
2217  		/* XXX: create an anonymous device for multi device filesystems */
2218  		sb->s_bdev	= bdev;
2219  		sb->s_dev	= bdev->bd_dev;
2220  		percpu_ref_put(&ca->io_ref);
2221  		break;
2222  	}
2223  
2224  	c->dev = sb->s_dev;
2225  
2226  #ifdef CONFIG_BCACHEFS_POSIX_ACL
2227  	if (c->opts.acl)
2228  		sb->s_flags	|= SB_POSIXACL;
2229  #endif
2230  
2231  	sb->s_shrink->seeks = 0;
2232  
2233  	vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
2234  	ret = PTR_ERR_OR_ZERO(vinode);
2235  	bch_err_msg(c, ret, "mounting: error getting root inode");
2236  	if (ret)
2237  		goto err_put_super;
2238  
2239  	sb->s_root = d_make_root(vinode);
2240  	if (!sb->s_root) {
2241  		bch_err(c, "error mounting: error allocating root dentry");
2242  		ret = -ENOMEM;
2243  		goto err_put_super;
2244  	}
2245  
2246  	sb->s_flags |= SB_ACTIVE;
2247  out:
2248  	fc->root = dget(sb->s_root);
2249  err:
2250  	darray_exit(&devs_to_fs);
2251  	bch2_darray_str_exit(&devs);
2252  	if (ret)
2253  		pr_err("error: %s", bch2_err_str(ret));
2254  	/*
2255  	 * On an inconsistency error in recovery we might see an -EROFS derived
2256  	 * errorcode (from the journal), but we don't want to return that to
2257  	 * userspace as that causes util-linux to retry the mount RO - which is
2258  	 * confusing:
2259  	 */
2260  	if (bch2_err_matches(ret, EROFS) && ret != -EROFS)
2261  		ret = -EIO;
2262  	return bch2_err_class(ret);
2263  
2264  err_stop_fs:
2265  	bch2_fs_stop(c);
2266  	goto err;
2267  
2268  err_put_super:
2269  	__bch2_fs_stop(c);
2270  	deactivate_locked_super(sb);
2271  	goto err;
2272  }
2273  
bch2_kill_sb(struct super_block * sb)2274  static void bch2_kill_sb(struct super_block *sb)
2275  {
2276  	struct bch_fs *c = sb->s_fs_info;
2277  
2278  	generic_shutdown_super(sb);
2279  	bch2_fs_free(c);
2280  }
2281  
bch2_fs_context_free(struct fs_context * fc)2282  static void bch2_fs_context_free(struct fs_context *fc)
2283  {
2284  	struct bch2_opts_parse *opts = fc->fs_private;
2285  
2286  	if (opts) {
2287  		printbuf_exit(&opts->parse_later);
2288  		kfree(opts);
2289  	}
2290  }
2291  
bch2_fs_parse_param(struct fs_context * fc,struct fs_parameter * param)2292  static int bch2_fs_parse_param(struct fs_context *fc,
2293  			       struct fs_parameter *param)
2294  {
2295  	/*
2296  	 * the "source" param, i.e., the name of the device(s) to mount,
2297  	 * is handled by the VFS layer.
2298  	 */
2299  	if (!strcmp(param->key, "source"))
2300  		return -ENOPARAM;
2301  
2302  	struct bch2_opts_parse *opts = fc->fs_private;
2303  	struct bch_fs *c = NULL;
2304  
2305  	/* for reconfigure, we already have a struct bch_fs */
2306  	if (fc->root)
2307  		c = fc->root->d_sb->s_fs_info;
2308  
2309  	int ret = bch2_parse_one_mount_opt(c, &opts->opts,
2310  					   &opts->parse_later, param->key,
2311  					   param->string);
2312  
2313  	return bch2_err_class(ret);
2314  }
2315  
bch2_fs_reconfigure(struct fs_context * fc)2316  static int bch2_fs_reconfigure(struct fs_context *fc)
2317  {
2318  	struct super_block *sb = fc->root->d_sb;
2319  	struct bch2_opts_parse *opts = fc->fs_private;
2320  
2321  	return bch2_remount(sb, &fc->sb_flags, opts->opts);
2322  }
2323  
2324  static const struct fs_context_operations bch2_context_ops = {
2325  	.free        = bch2_fs_context_free,
2326  	.parse_param = bch2_fs_parse_param,
2327  	.get_tree    = bch2_fs_get_tree,
2328  	.reconfigure = bch2_fs_reconfigure,
2329  };
2330  
bch2_init_fs_context(struct fs_context * fc)2331  static int bch2_init_fs_context(struct fs_context *fc)
2332  {
2333  	struct bch2_opts_parse *opts = kzalloc(sizeof(*opts), GFP_KERNEL);
2334  
2335  	if (!opts)
2336  		return -ENOMEM;
2337  
2338  	opts->parse_later = PRINTBUF;
2339  
2340  	fc->ops = &bch2_context_ops;
2341  	fc->fs_private = opts;
2342  
2343  	return 0;
2344  }
2345  
bch2_fs_vfs_exit(struct bch_fs * c)2346  void bch2_fs_vfs_exit(struct bch_fs *c)
2347  {
2348  	if (c->vfs_inodes_table.tbl)
2349  		rhashtable_destroy(&c->vfs_inodes_table);
2350  }
2351  
bch2_fs_vfs_init(struct bch_fs * c)2352  int bch2_fs_vfs_init(struct bch_fs *c)
2353  {
2354  	return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params);
2355  }
2356  
2357  static struct file_system_type bcache_fs_type = {
2358  	.owner			= THIS_MODULE,
2359  	.name			= "bcachefs",
2360  	.init_fs_context	= bch2_init_fs_context,
2361  	.kill_sb		= bch2_kill_sb,
2362  	.fs_flags		= FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
2363  };
2364  
2365  MODULE_ALIAS_FS("bcachefs");
2366  
bch2_vfs_exit(void)2367  void bch2_vfs_exit(void)
2368  {
2369  	unregister_filesystem(&bcache_fs_type);
2370  	kmem_cache_destroy(bch2_inode_cache);
2371  }
2372  
bch2_vfs_init(void)2373  int __init bch2_vfs_init(void)
2374  {
2375  	int ret = -ENOMEM;
2376  
2377  	bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT |
2378  				      SLAB_ACCOUNT);
2379  	if (!bch2_inode_cache)
2380  		goto err;
2381  
2382  	ret = register_filesystem(&bcache_fs_type);
2383  	if (ret)
2384  		goto err;
2385  
2386  	return 0;
2387  err:
2388  	bch2_vfs_exit();
2389  	return ret;
2390  }
2391  
2392  #endif /* NO_BCACHEFS_FS */
2393