1  // SPDX-License-Identifier: GPL-2.0
2  /*
3   * Copyright (c) 2000-2005 Silicon Graphics, Inc.
4   * All Rights Reserved.
5   */
6  #include "xfs.h"
7  #include "xfs_fs.h"
8  #include "xfs_shared.h"
9  #include "xfs_format.h"
10  #include "xfs_log_format.h"
11  #include "xfs_trans_resv.h"
12  #include "xfs_mount.h"
13  #include "xfs_inode.h"
14  #include "xfs_acl.h"
15  #include "xfs_quota.h"
16  #include "xfs_da_format.h"
17  #include "xfs_da_btree.h"
18  #include "xfs_attr.h"
19  #include "xfs_trans.h"
20  #include "xfs_trans_space.h"
21  #include "xfs_bmap_btree.h"
22  #include "xfs_trace.h"
23  #include "xfs_icache.h"
24  #include "xfs_symlink.h"
25  #include "xfs_dir2.h"
26  #include "xfs_iomap.h"
27  #include "xfs_error.h"
28  #include "xfs_ioctl.h"
29  #include "xfs_xattr.h"
30  #include "xfs_file.h"
31  #include "xfs_bmap.h"
32  
33  #include <linux/posix_acl.h>
34  #include <linux/security.h>
35  #include <linux/iversion.h>
36  #include <linux/fiemap.h>
37  
38  /*
39   * Directories have different lock order w.r.t. mmap_lock compared to regular
40   * files. This is due to readdir potentially triggering page faults on a user
41   * buffer inside filldir(), and this happens with the ilock on the directory
42   * held. For regular files, the lock order is the other way around - the
43   * mmap_lock is taken during the page fault, and then we lock the ilock to do
44   * block mapping. Hence we need a different class for the directory ilock so
45   * that lockdep can tell them apart.
46   */
/*
 * Lockdep class key for the ilock of non-directory inodes.
 * NOTE(review): the code that assigns these keys is outside this view.
 */
static struct lock_class_key xfs_nondir_ilock_class;
/* Separate lockdep class key for the ilock of directory inodes. */
static struct lock_class_key xfs_dir_ilock_class;
49  
50  static int
xfs_initxattrs(struct inode * inode,const struct xattr * xattr_array,void * fs_info)51  xfs_initxattrs(
52  	struct inode		*inode,
53  	const struct xattr	*xattr_array,
54  	void			*fs_info)
55  {
56  	const struct xattr	*xattr;
57  	struct xfs_inode	*ip = XFS_I(inode);
58  	int			error = 0;
59  
60  	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
61  		struct xfs_da_args	args = {
62  			.dp		= ip,
63  			.attr_filter	= XFS_ATTR_SECURE,
64  			.name		= xattr->name,
65  			.namelen	= strlen(xattr->name),
66  			.value		= xattr->value,
67  			.valuelen	= xattr->value_len,
68  		};
69  		error = xfs_attr_change(&args, XFS_ATTRUPDATE_UPSERT);
70  		if (error < 0)
71  			break;
72  	}
73  	return error;
74  }
75  
76  /*
77   * Hook in SELinux.  This is not quite correct yet, what we really need
78   * here (as we do for default ACLs) is a mechanism by which creation of
79   * these attrs can be journalled at inode creation time (along with the
80   * inode, of course, such that log replay can't cause these to be lost).
81   */
82  int
xfs_inode_init_security(struct inode * inode,struct inode * dir,const struct qstr * qstr)83  xfs_inode_init_security(
84  	struct inode	*inode,
85  	struct inode	*dir,
86  	const struct qstr *qstr)
87  {
88  	return security_inode_init_security(inode, dir, qstr,
89  					     &xfs_initxattrs, NULL);
90  }
91  
92  static void
xfs_dentry_to_name(struct xfs_name * namep,struct dentry * dentry)93  xfs_dentry_to_name(
94  	struct xfs_name	*namep,
95  	struct dentry	*dentry)
96  {
97  	namep->name = dentry->d_name.name;
98  	namep->len = dentry->d_name.len;
99  	namep->type = XFS_DIR3_FT_UNKNOWN;
100  }
101  
102  static int
xfs_dentry_mode_to_name(struct xfs_name * namep,struct dentry * dentry,int mode)103  xfs_dentry_mode_to_name(
104  	struct xfs_name	*namep,
105  	struct dentry	*dentry,
106  	int		mode)
107  {
108  	namep->name = dentry->d_name.name;
109  	namep->len = dentry->d_name.len;
110  	namep->type = xfs_mode_to_ftype(mode);
111  
112  	if (unlikely(namep->type == XFS_DIR3_FT_UNKNOWN))
113  		return -EFSCORRUPTED;
114  
115  	return 0;
116  }
117  
118  STATIC void
xfs_cleanup_inode(struct inode * dir,struct inode * inode,struct dentry * dentry)119  xfs_cleanup_inode(
120  	struct inode	*dir,
121  	struct inode	*inode,
122  	struct dentry	*dentry)
123  {
124  	struct xfs_name	teardown;
125  
126  	/* Oh, the horror.
127  	 * If we can't add the ACL or we fail in
128  	 * xfs_inode_init_security we must back out.
129  	 * ENOSPC can hit here, among other things.
130  	 */
131  	xfs_dentry_to_name(&teardown, dentry);
132  
133  	xfs_remove(XFS_I(dir), &teardown, XFS_I(inode));
134  }
135  
136  /*
137   * Check to see if we are likely to need an extended attribute to be added to
138   * the inode we are about to allocate. This allows the attribute fork to be
139   * created during the inode allocation, reducing the number of transactions we
140   * need to do in this fast path.
141   *
142   * The security checks are optimistic, but not guaranteed. The two LSMs that
143   * require xattrs to be added here (selinux and smack) are also the only two
144   * LSMs that add a sb->s_security structure to the superblock. Hence if security
145   * is enabled and sb->s_security is set, we have a pretty good idea that we are
146   * going to be asked to add a security xattr immediately after allocating the
147   * xfs inode and instantiating the VFS inode.
148   */
149  static inline bool
xfs_create_need_xattr(struct inode * dir,struct posix_acl * default_acl,struct posix_acl * acl)150  xfs_create_need_xattr(
151  	struct inode	*dir,
152  	struct posix_acl *default_acl,
153  	struct posix_acl *acl)
154  {
155  	if (acl)
156  		return true;
157  	if (default_acl)
158  		return true;
159  #if IS_ENABLED(CONFIG_SECURITY)
160  	if (dir->i_sb->s_security)
161  		return true;
162  #endif
163  	return false;
164  }
165  
166  
/*
 * Common creation path shared by xfs_vn_mknod(), xfs_vn_create() and
 * xfs_vn_mkdir() in this file, all of which pass @tmpfile == NULL.  With a
 * non-NULL @tmpfile the inode is created through xfs_create_tmpfile() instead
 * of xfs_create() and is attached to the file with d_tmpfile().
 *
 * Once the inode exists, the security xattrs and any default/access ACLs
 * produced by posix_acl_create() are applied to it.  If one of those steps
 * fails, the inode setup is finished, the new entry is backed out through
 * xfs_cleanup_inode() (named-file case only) and the inode reference is
 * dropped with xfs_irele().
 *
 * Returns 0 on success, -EINVAL for an unacceptable device number, or the
 * error of whichever step failed.
 */
STATIC int
xfs_generic_create(
	struct mnt_idmap	*idmap,
	struct inode		*dir,
	struct dentry		*dentry,
	umode_t			mode,
	dev_t			rdev,
	struct file		*tmpfile)	/* unnamed file */
{
	struct xfs_icreate_args	args = {
		.idmap		= idmap,
		.pip		= XFS_I(dir),
		.rdev		= rdev,
		.mode		= mode,
	};
	struct inode		*inode;
	struct xfs_inode	*ip = NULL;
	struct posix_acl	*default_acl, *acl;
	struct xfs_name		name;
	int			error;

	/*
	 * Irix uses Missed'em'V split, but doesn't want to see
	 * the upper 5 bits of (14bit) major.
	 */
	if (S_ISCHR(args.mode) || S_ISBLK(args.mode)) {
		if (unlikely(!sysv_valid_dev(args.rdev) ||
			     MAJOR(args.rdev) & ~0x1ff))
			return -EINVAL;
	} else {
		/* The device number is only kept for char/block nodes. */
		args.rdev = 0;
	}

	/* May adjust args.mode; the ACLs are released at out_free_acl. */
	error = posix_acl_create(dir, &args.mode, &default_acl, &acl);
	if (error)
		return error;

	/* Verify mode is valid also for tmpfile case */
	error = xfs_dentry_mode_to_name(&name, dentry, args.mode);
	if (unlikely(error))
		goto out_free_acl;

	if (!tmpfile) {
		if (xfs_create_need_xattr(dir, default_acl, acl))
			args.flags |= XFS_ICREATE_INIT_XATTRS;

		error = xfs_create(&args, &name, &ip);
	} else {
		args.flags |= XFS_ICREATE_TMPFILE;

		/*
		 * If this temporary file will not be linkable, don't bother
		 * creating an attr fork to receive a parent pointer.
		 */
		if (tmpfile->f_flags & O_EXCL)
			args.flags |= XFS_ICREATE_UNLINKABLE;

		error = xfs_create_tmpfile(&args, &ip);
	}
	/* No inode exists yet on failure, so only the ACLs need releasing. */
	if (unlikely(error))
		goto out_free_acl;

	inode = VFS_I(ip);

	error = xfs_inode_init_security(inode, dir, &dentry->d_name);
	if (unlikely(error))
		goto out_cleanup_inode;

	if (default_acl) {
		error = __xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
		if (error)
			goto out_cleanup_inode;
	}
	if (acl) {
		error = __xfs_set_acl(inode, acl, ACL_TYPE_ACCESS);
		if (error)
			goto out_cleanup_inode;
	}

	xfs_setup_iops(ip);

	if (tmpfile) {
		/*
		 * The VFS requires that any inode fed to d_tmpfile must have
		 * nlink == 1 so that it can decrement the nlink in d_tmpfile.
		 * However, we created the temp file with nlink == 0 because
		 * we're not allowed to put an inode with nlink > 0 on the
		 * unlinked list.  Therefore we have to set nlink to 1 so that
		 * d_tmpfile can immediately set it back to zero.
		 */
		set_nlink(inode, 1);
		d_tmpfile(tmpfile, inode);
	} else
		d_instantiate(dentry, inode);

	xfs_finish_inode_setup(ip);

	/* Shared exit: reached on success (error == 0) and on every failure. */
 out_free_acl:
	posix_acl_release(default_acl);
	posix_acl_release(acl);
	return error;

	/* Failure after the inode was created: undo, then release the ACLs. */
 out_cleanup_inode:
	xfs_finish_inode_setup(ip);
	if (!tmpfile)
		xfs_cleanup_inode(dir, inode, dentry);
	xfs_irele(ip);
	goto out_free_acl;
}
276  
/*
 * Create a named inode of type @mode with device number @rdev; forwards
 * directly to xfs_generic_create() with no tmpfile.
 */
STATIC int
xfs_vn_mknod(
	struct mnt_idmap	*idmap,
	struct inode		*dir,
	struct dentry		*dentry,
	umode_t			mode,
	dev_t			rdev)
{
	return xfs_generic_create(idmap, dir, dentry, mode, rdev, NULL);
}
287  
/*
 * Create a named inode via xfs_generic_create() with a zero device number
 * and no tmpfile.  @flags is not consulted here.
 */
STATIC int
xfs_vn_create(
	struct mnt_idmap	*idmap,
	struct inode		*dir,
	struct dentry		*dentry,
	umode_t			mode,
	bool			flags)
{
	return xfs_generic_create(idmap, dir, dentry, mode, 0, NULL);
}
298  
/*
 * Create a directory: same as the other creation wrappers, but S_IFDIR is
 * OR-ed into @mode before calling xfs_generic_create().
 */
STATIC int
xfs_vn_mkdir(
	struct mnt_idmap	*idmap,
	struct inode		*dir,
	struct dentry		*dentry,
	umode_t			mode)
{
	return xfs_generic_create(idmap, dir, dentry, mode | S_IFDIR, 0, NULL);
}
308  
309  STATIC struct dentry *
xfs_vn_lookup(struct inode * dir,struct dentry * dentry,unsigned int flags)310  xfs_vn_lookup(
311  	struct inode	*dir,
312  	struct dentry	*dentry,
313  	unsigned int flags)
314  {
315  	struct inode *inode;
316  	struct xfs_inode *cip;
317  	struct xfs_name	name;
318  	int		error;
319  
320  	if (dentry->d_name.len >= MAXNAMELEN)
321  		return ERR_PTR(-ENAMETOOLONG);
322  
323  	xfs_dentry_to_name(&name, dentry);
324  	error = xfs_lookup(XFS_I(dir), &name, &cip, NULL);
325  	if (likely(!error))
326  		inode = VFS_I(cip);
327  	else if (likely(error == -ENOENT))
328  		inode = NULL;
329  	else
330  		inode = ERR_PTR(error);
331  	return d_splice_alias(inode, dentry);
332  }
333  
334  STATIC struct dentry *
xfs_vn_ci_lookup(struct inode * dir,struct dentry * dentry,unsigned int flags)335  xfs_vn_ci_lookup(
336  	struct inode	*dir,
337  	struct dentry	*dentry,
338  	unsigned int flags)
339  {
340  	struct xfs_inode *ip;
341  	struct xfs_name	xname;
342  	struct xfs_name ci_name;
343  	struct qstr	dname;
344  	int		error;
345  
346  	if (dentry->d_name.len >= MAXNAMELEN)
347  		return ERR_PTR(-ENAMETOOLONG);
348  
349  	xfs_dentry_to_name(&xname, dentry);
350  	error = xfs_lookup(XFS_I(dir), &xname, &ip, &ci_name);
351  	if (unlikely(error)) {
352  		if (unlikely(error != -ENOENT))
353  			return ERR_PTR(error);
354  		/*
355  		 * call d_add(dentry, NULL) here when d_drop_negative_children
356  		 * is called in xfs_vn_mknod (ie. allow negative dentries
357  		 * with CI filesystems).
358  		 */
359  		return NULL;
360  	}
361  
362  	/* if exact match, just splice and exit */
363  	if (!ci_name.name)
364  		return d_splice_alias(VFS_I(ip), dentry);
365  
366  	/* else case-insensitive match... */
367  	dname.name = ci_name.name;
368  	dname.len = ci_name.len;
369  	dentry = d_add_ci(dentry, VFS_I(ip), &dname);
370  	kfree(ci_name.name);
371  	return dentry;
372  }
373  
374  STATIC int
xfs_vn_link(struct dentry * old_dentry,struct inode * dir,struct dentry * dentry)375  xfs_vn_link(
376  	struct dentry	*old_dentry,
377  	struct inode	*dir,
378  	struct dentry	*dentry)
379  {
380  	struct inode	*inode = d_inode(old_dentry);
381  	struct xfs_name	name;
382  	int		error;
383  
384  	error = xfs_dentry_mode_to_name(&name, dentry, inode->i_mode);
385  	if (unlikely(error))
386  		return error;
387  
388  	if (IS_PRIVATE(inode))
389  		return -EPERM;
390  
391  	error = xfs_link(XFS_I(dir), XFS_I(inode), &name);
392  	if (unlikely(error))
393  		return error;
394  
395  	ihold(inode);
396  	d_instantiate(dentry, inode);
397  	return 0;
398  }
399  
400  STATIC int
xfs_vn_unlink(struct inode * dir,struct dentry * dentry)401  xfs_vn_unlink(
402  	struct inode	*dir,
403  	struct dentry	*dentry)
404  {
405  	struct xfs_name	name;
406  	int		error;
407  
408  	xfs_dentry_to_name(&name, dentry);
409  
410  	error = xfs_remove(XFS_I(dir), &name, XFS_I(d_inode(dentry)));
411  	if (error)
412  		return error;
413  
414  	/*
415  	 * With unlink, the VFS makes the dentry "negative": no inode,
416  	 * but still hashed. This is incompatible with case-insensitive
417  	 * mode, so invalidate (unhash) the dentry in CI-mode.
418  	 */
419  	if (xfs_has_asciici(XFS_M(dir->i_sb)))
420  		d_invalidate(dentry);
421  	return 0;
422  }
423  
424  STATIC int
xfs_vn_symlink(struct mnt_idmap * idmap,struct inode * dir,struct dentry * dentry,const char * symname)425  xfs_vn_symlink(
426  	struct mnt_idmap	*idmap,
427  	struct inode		*dir,
428  	struct dentry		*dentry,
429  	const char		*symname)
430  {
431  	struct inode	*inode;
432  	struct xfs_inode *cip = NULL;
433  	struct xfs_name	name;
434  	int		error;
435  	umode_t		mode;
436  
437  	mode = S_IFLNK |
438  		(irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO);
439  	error = xfs_dentry_mode_to_name(&name, dentry, mode);
440  	if (unlikely(error))
441  		goto out;
442  
443  	error = xfs_symlink(idmap, XFS_I(dir), &name, symname, mode, &cip);
444  	if (unlikely(error))
445  		goto out;
446  
447  	inode = VFS_I(cip);
448  
449  	error = xfs_inode_init_security(inode, dir, &dentry->d_name);
450  	if (unlikely(error))
451  		goto out_cleanup_inode;
452  
453  	xfs_setup_iops(cip);
454  
455  	d_instantiate(dentry, inode);
456  	xfs_finish_inode_setup(cip);
457  	return 0;
458  
459   out_cleanup_inode:
460  	xfs_finish_inode_setup(cip);
461  	xfs_cleanup_inode(dir, inode, dentry);
462  	xfs_irele(cip);
463   out:
464  	return error;
465  }
466  
467  STATIC int
xfs_vn_rename(struct mnt_idmap * idmap,struct inode * odir,struct dentry * odentry,struct inode * ndir,struct dentry * ndentry,unsigned int flags)468  xfs_vn_rename(
469  	struct mnt_idmap	*idmap,
470  	struct inode		*odir,
471  	struct dentry		*odentry,
472  	struct inode		*ndir,
473  	struct dentry		*ndentry,
474  	unsigned int		flags)
475  {
476  	struct inode	*new_inode = d_inode(ndentry);
477  	int		omode = 0;
478  	int		error;
479  	struct xfs_name	oname;
480  	struct xfs_name	nname;
481  
482  	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
483  		return -EINVAL;
484  
485  	/* if we are exchanging files, we need to set i_mode of both files */
486  	if (flags & RENAME_EXCHANGE)
487  		omode = d_inode(ndentry)->i_mode;
488  
489  	error = xfs_dentry_mode_to_name(&oname, odentry, omode);
490  	if (omode && unlikely(error))
491  		return error;
492  
493  	error = xfs_dentry_mode_to_name(&nname, ndentry,
494  					d_inode(odentry)->i_mode);
495  	if (unlikely(error))
496  		return error;
497  
498  	return xfs_rename(idmap, XFS_I(odir), &oname,
499  			  XFS_I(d_inode(odentry)), XFS_I(ndir), &nname,
500  			  new_inode ? XFS_I(new_inode) : NULL, flags);
501  }
502  
503  /*
504   * careful here - this function can get called recursively, so
505   * we need to be very careful about how much stack we use.
506   * uio is kmalloced for this reason...
507   */
508  STATIC const char *
xfs_vn_get_link(struct dentry * dentry,struct inode * inode,struct delayed_call * done)509  xfs_vn_get_link(
510  	struct dentry		*dentry,
511  	struct inode		*inode,
512  	struct delayed_call	*done)
513  {
514  	char			*link;
515  	int			error = -ENOMEM;
516  
517  	if (!dentry)
518  		return ERR_PTR(-ECHILD);
519  
520  	link = kmalloc(XFS_SYMLINK_MAXLEN+1, GFP_KERNEL);
521  	if (!link)
522  		goto out_err;
523  
524  	error = xfs_readlink(XFS_I(d_inode(dentry)), link);
525  	if (unlikely(error))
526  		goto out_kfree;
527  
528  	set_delayed_call(done, kfree_link, link);
529  	return link;
530  
531   out_kfree:
532  	kfree(link);
533   out_err:
534  	return ERR_PTR(error);
535  }
536  
537  static uint32_t
xfs_stat_blksize(struct xfs_inode * ip)538  xfs_stat_blksize(
539  	struct xfs_inode	*ip)
540  {
541  	struct xfs_mount	*mp = ip->i_mount;
542  
543  	/*
544  	 * If the file blocks are being allocated from a realtime volume, then
545  	 * always return the realtime extent size.
546  	 */
547  	if (XFS_IS_REALTIME_INODE(ip))
548  		return XFS_FSB_TO_B(mp, xfs_get_extsz_hint(ip) ? : 1);
549  
550  	/*
551  	 * Allow large block sizes to be reported to userspace programs if the
552  	 * "largeio" mount option is used.
553  	 *
554  	 * If compatibility mode is specified, simply return the basic unit of
555  	 * caching so that we don't get inefficient read/modify/write I/O from
556  	 * user apps. Otherwise....
557  	 *
558  	 * If the underlying volume is a stripe, then return the stripe width in
559  	 * bytes as the recommended I/O size. It is not a stripe and we've set a
560  	 * default buffered I/O size, return that, otherwise return the compat
561  	 * default.
562  	 */
563  	if (xfs_has_large_iosize(mp)) {
564  		if (mp->m_swidth)
565  			return XFS_FSB_TO_B(mp, mp->m_swidth);
566  		if (xfs_has_allocsize(mp))
567  			return 1U << mp->m_allocsize_log;
568  	}
569  
570  	return max_t(uint32_t, PAGE_SIZE, mp->m_sb.sb_blocksize);
571  }
572  
573  STATIC int
xfs_vn_getattr(struct mnt_idmap * idmap,const struct path * path,struct kstat * stat,u32 request_mask,unsigned int query_flags)574  xfs_vn_getattr(
575  	struct mnt_idmap	*idmap,
576  	const struct path	*path,
577  	struct kstat		*stat,
578  	u32			request_mask,
579  	unsigned int		query_flags)
580  {
581  	struct inode		*inode = d_inode(path->dentry);
582  	struct xfs_inode	*ip = XFS_I(inode);
583  	struct xfs_mount	*mp = ip->i_mount;
584  	vfsuid_t		vfsuid = i_uid_into_vfsuid(idmap, inode);
585  	vfsgid_t		vfsgid = i_gid_into_vfsgid(idmap, inode);
586  
587  	trace_xfs_getattr(ip);
588  
589  	if (xfs_is_shutdown(mp))
590  		return -EIO;
591  
592  	stat->size = XFS_ISIZE(ip);
593  	stat->dev = inode->i_sb->s_dev;
594  	stat->mode = inode->i_mode;
595  	stat->nlink = inode->i_nlink;
596  	stat->uid = vfsuid_into_kuid(vfsuid);
597  	stat->gid = vfsgid_into_kgid(vfsgid);
598  	stat->ino = ip->i_ino;
599  	stat->atime = inode_get_atime(inode);
600  	stat->mtime = inode_get_mtime(inode);
601  	stat->ctime = inode_get_ctime(inode);
602  	stat->blocks = XFS_FSB_TO_BB(mp, ip->i_nblocks + ip->i_delayed_blks);
603  
604  	if (xfs_has_v3inodes(mp)) {
605  		if (request_mask & STATX_BTIME) {
606  			stat->result_mask |= STATX_BTIME;
607  			stat->btime = ip->i_crtime;
608  		}
609  	}
610  
611  	if ((request_mask & STATX_CHANGE_COOKIE) && IS_I_VERSION(inode)) {
612  		stat->change_cookie = inode_query_iversion(inode);
613  		stat->result_mask |= STATX_CHANGE_COOKIE;
614  	}
615  
616  	/*
617  	 * Note: If you add another clause to set an attribute flag, please
618  	 * update attributes_mask below.
619  	 */
620  	if (ip->i_diflags & XFS_DIFLAG_IMMUTABLE)
621  		stat->attributes |= STATX_ATTR_IMMUTABLE;
622  	if (ip->i_diflags & XFS_DIFLAG_APPEND)
623  		stat->attributes |= STATX_ATTR_APPEND;
624  	if (ip->i_diflags & XFS_DIFLAG_NODUMP)
625  		stat->attributes |= STATX_ATTR_NODUMP;
626  
627  	stat->attributes_mask |= (STATX_ATTR_IMMUTABLE |
628  				  STATX_ATTR_APPEND |
629  				  STATX_ATTR_NODUMP);
630  
631  	switch (inode->i_mode & S_IFMT) {
632  	case S_IFBLK:
633  	case S_IFCHR:
634  		stat->blksize = BLKDEV_IOSIZE;
635  		stat->rdev = inode->i_rdev;
636  		break;
637  	case S_IFREG:
638  		if (request_mask & STATX_DIOALIGN) {
639  			struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
640  			struct block_device	*bdev = target->bt_bdev;
641  
642  			stat->result_mask |= STATX_DIOALIGN;
643  			stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
644  			stat->dio_offset_align = bdev_logical_block_size(bdev);
645  		}
646  		fallthrough;
647  	default:
648  		stat->blksize = xfs_stat_blksize(ip);
649  		stat->rdev = 0;
650  		break;
651  	}
652  
653  	return 0;
654  }
655  
656  static int
xfs_vn_change_ok(struct mnt_idmap * idmap,struct dentry * dentry,struct iattr * iattr)657  xfs_vn_change_ok(
658  	struct mnt_idmap	*idmap,
659  	struct dentry		*dentry,
660  	struct iattr		*iattr)
661  {
662  	struct xfs_mount	*mp = XFS_I(d_inode(dentry))->i_mount;
663  
664  	if (xfs_is_readonly(mp))
665  		return -EROFS;
666  
667  	if (xfs_is_shutdown(mp))
668  		return -EIO;
669  
670  	return setattr_prepare(idmap, dentry, iattr);
671  }
672  
/*
 * Set non-size attributes of an inode.
 *
 * @iattr must not have ATTR_SIZE set (asserted below).  The change is made
 * in a single transaction; when ATTR_MODE is set, posix_acl_chmod() is
 * called after that transaction has committed.
 *
 * Returns 0 on success or the error of the failing step.
 *
 * Caution: The caller of this function is responsible for calling
 * setattr_prepare() or otherwise verifying the change is fine.
 */
static int
xfs_setattr_nonsize(
	struct mnt_idmap	*idmap,
	struct dentry		*dentry,
	struct xfs_inode	*ip,
	struct iattr		*iattr)
{
	xfs_mount_t		*mp = ip->i_mount;
	struct inode		*inode = VFS_I(ip);
	int			mask = iattr->ia_valid;
	xfs_trans_t		*tp;
	int			error;
	kuid_t			uid = GLOBAL_ROOT_UID;
	kgid_t			gid = GLOBAL_ROOT_GID;
	struct xfs_dquot	*udqp = NULL, *gdqp = NULL;
	struct xfs_dquot	*old_udqp = NULL, *old_gdqp = NULL;

	ASSERT((mask & ATTR_SIZE) == 0);

	/*
	 * If disk quotas are on, we make sure that the dquots do exist on
	 * disk, before we start any other transactions. Trying to do this
	 * later is messy. We don't care to take a readlock to look at the ids
	 * in inode here, because we can't hold it across the trans_reserve.
	 * If the IDs do change before we take the ilock, we're covered
	 * because the i_*dquot fields will get updated anyway.
	 */
	if (XFS_IS_QUOTA_ON(mp) && (mask & (ATTR_UID|ATTR_GID))) {
		uint	qflags = 0;

		/* Use the new id where it is changing, else the current one. */
		if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp)) {
			uid = from_vfsuid(idmap, i_user_ns(inode),
					  iattr->ia_vfsuid);
			qflags |= XFS_QMOPT_UQUOTA;
		} else {
			uid = inode->i_uid;
		}
		if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) {
			gid = from_vfsgid(idmap, i_user_ns(inode),
					  iattr->ia_vfsgid);
			qflags |= XFS_QMOPT_GQUOTA;
		}  else {
			gid = inode->i_gid;
		}

		/*
		 * We take a reference when we initialize udqp and gdqp,
		 * so it is important that we never blindly double trip on
		 * the same variable. See xfs_create() for an example.
		 */
		ASSERT(udqp == NULL);
		ASSERT(gdqp == NULL);
		error = xfs_qm_vop_dqalloc(ip, uid, gid, ip->i_projid,
					   qflags, &udqp, &gdqp, NULL);
		if (error)
			return error;
	}

	error = xfs_trans_alloc_ichange(ip, udqp, gdqp, NULL,
			has_capability_noaudit(current, CAP_FOWNER), &tp);
	if (error)
		goto out_dqrele;

	/*
	 * Register quota modifications in the transaction.  Must be the owner
	 * or privileged.  These IDs could have changed since we last looked at
	 * them.  But, we're assured that if the ownership did change while we
	 * didn't have the inode locked, inode's dquot(s) would have changed
	 * also.
	 */
	if (XFS_IS_UQUOTA_ON(mp) &&
	    i_uid_needs_update(idmap, iattr, inode)) {
		ASSERT(udqp);
		old_udqp = xfs_qm_vop_chown(tp, ip, &ip->i_udquot, udqp);
	}
	if (XFS_IS_GQUOTA_ON(mp) &&
	    i_gid_needs_update(idmap, iattr, inode)) {
		ASSERT(xfs_has_pquotino(mp) || !XFS_IS_PQUOTA_ON(mp));
		ASSERT(gdqp);
		old_gdqp = xfs_qm_vop_chown(tp, ip, &ip->i_gdquot, gdqp);
	}

	/* Apply the new attributes to the VFS inode and log the inode core. */
	setattr_copy(idmap, inode, iattr);
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	XFS_STATS_INC(mp, xs_ig_attrchg);

	if (xfs_has_wsync(mp))
		xfs_trans_set_sync(tp);
	error = xfs_trans_commit(tp);

	/*
	 * Release any dquot(s) the inode had kept before chown, together with
	 * the references obtained above.  This happens whether or not the
	 * commit succeeded.
	 */
	xfs_qm_dqrele(old_udqp);
	xfs_qm_dqrele(old_gdqp);
	xfs_qm_dqrele(udqp);
	xfs_qm_dqrele(gdqp);

	if (error)
		return error;

	/*
	 * XXX(hch): Updating the ACL entries is not atomic vs the i_mode
	 * 	     update.  We could avoid this with linked transactions
	 * 	     and passing down the transaction pointer all the way
	 *	     to attr_set.  No previous user of the generic
	 * 	     Posix ACL code seems to care about this issue either.
	 */
	if (mask & ATTR_MODE) {
		error = posix_acl_chmod(idmap, dentry, inode->i_mode);
		if (error)
			return error;
	}

	return 0;

	/* Transaction allocation failed: drop the dquot references taken above. */
out_dqrele:
	xfs_qm_dqrele(udqp);
	xfs_qm_dqrele(gdqp);
	return error;
}
801  
/*
 * Truncate file.  Must have write permission and not be a directory.
 *
 * The caller must hold XFS_IOLOCK_EXCL and XFS_MMAPLOCK_EXCL on @ip, the
 * inode must be a regular file, and @iattr must not request ownership or
 * explicit timestamp changes (all asserted below).  The ILOCK is taken and
 * released inside this function.
 *
 * Returns 0 on success or the error of the failing step.
 *
 * Caution: The caller of this function is responsible for calling
 * setattr_prepare() or otherwise verifying the change is fine.
 */
STATIC int
xfs_setattr_size(
	struct mnt_idmap	*idmap,
	struct dentry		*dentry,
	struct xfs_inode	*ip,
	struct iattr		*iattr)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct inode		*inode = VFS_I(ip);
	xfs_off_t		oldsize, newsize;
	struct xfs_trans	*tp;
	int			error;
	uint			lock_flags = 0;
	uint			resblks = 0;
	bool			did_zeroing = false;

	xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
	ASSERT(S_ISREG(inode->i_mode));
	ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
		ATTR_MTIME_SET|ATTR_TIMES_SET)) == 0);

	oldsize = inode->i_size;
	newsize = iattr->ia_size;

	/*
	 * Short circuit the truncate case for zero length files.
	 */
	if (newsize == 0 && oldsize == 0 && ip->i_df.if_nextents == 0) {
		if (!(iattr->ia_valid & (ATTR_CTIME|ATTR_MTIME)))
			return 0;

		/*
		 * Use the regular setattr path to update the timestamps.
		 */
		iattr->ia_valid &= ~ATTR_SIZE;
		return xfs_setattr_nonsize(idmap, dentry, ip, iattr);
	}

	/*
	 * Make sure that the dquots are attached to the inode.
	 */
	error = xfs_qm_dqattach(ip);
	if (error)
		return error;

	/*
	 * Wait for all direct I/O to complete.
	 */
	inode_dio_wait(inode);

	/*
	 * File data changes must be complete before we start the transaction to
	 * modify the inode.  This needs to be done before joining the inode to
	 * the transaction because the inode cannot be unlocked once it is a
	 * part of the transaction.
	 *
	 * Start with zeroing any data beyond EOF that we may expose on file
	 * extension, or zeroing out the rest of the block on a downward
	 * truncate.
	 */
	if (newsize > oldsize) {
		trace_xfs_zero_eof(ip, oldsize, newsize - oldsize);
		error = xfs_zero_range(ip, oldsize, newsize - oldsize,
				&did_zeroing);
	} else {
		error = xfs_truncate_page(ip, newsize, &did_zeroing);
	}

	if (error)
		return error;

	/*
	 * We've already locked out new page faults, so now we can safely remove
	 * pages from the page cache knowing they won't get refaulted until we
	 * drop the XFS_MMAPLOCK_EXCL lock after the extent manipulations are
	 * complete. The truncate_setsize() call also cleans partial EOF page
	 * PTEs on extending truncates and hence ensures sub-page block size
	 * filesystems are correctly handled, too.
	 *
	 * We have to do all the page cache truncate work outside the
	 * transaction context as the "lock" order is page lock->log space
	 * reservation as defined by extent allocation in the writeback path.
	 * Hence a truncate can fail with ENOMEM from xfs_trans_alloc(), but
	 * having already truncated the in-memory version of the file (i.e. made
	 * user visible changes). There's not much we can do about this, except
	 * to hope that the caller sees ENOMEM and retries the truncate
	 * operation.
	 *
	 * And we update in-core i_size and truncate page cache beyond newsize
	 * before writeback the [i_disk_size, newsize] range, so we're
	 * guaranteed not to write stale data past the new EOF on truncate down.
	 */
	truncate_setsize(inode, newsize);

	/*
	 * We are going to log the inode size change in this transaction so
	 * any previous writes that are beyond the on disk EOF and the new
	 * EOF that have not been written out need to be written here.  If we
	 * do not write the data out, we expose ourselves to the null files
	 * problem. Note that this includes any block zeroing we did above;
	 * otherwise those blocks may not be zeroed after a crash.
	 */
	if (did_zeroing ||
	    (newsize > ip->i_disk_size && oldsize != ip->i_disk_size)) {
		error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
						ip->i_disk_size, newsize - 1);
		if (error)
			return error;
	}

	/*
	 * For realtime inode with more than one block rtextsize, we need the
	 * block reservation for bmap btree block allocations/splits that can
	 * happen since it could split the tail written extent and convert the
	 * right beyond EOF one to unwritten.
	 */
	if (xfs_inode_has_bigrtalloc(ip))
		resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks,
				0, 0, &tp);
	if (error)
		return error;

	/*
	 * From here on the ILOCK is held; every exit below goes through
	 * out_unlock, which drops whatever is recorded in lock_flags.
	 */
	lock_flags |= XFS_ILOCK_EXCL;
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, 0);

	/*
	 * Only change the c/mtime if we are changing the size or we are
	 * explicitly asked to change it.  This handles the semantic difference
	 * between truncate() and ftruncate() as implemented in the VFS.
	 *
	 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
	 * special case where we need to update the times despite not having
	 * these flags set.  For all other operations the VFS set these flags
	 * explicitly if it wants a timestamp update.
	 */
	if (newsize != oldsize &&
	    !(iattr->ia_valid & (ATTR_CTIME | ATTR_MTIME))) {
		iattr->ia_ctime = iattr->ia_mtime =
			current_time(inode);
		iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME;
	}

	/*
	 * The first thing we do is set the size to new_size permanently on
	 * disk.  This way we don't have to worry about anyone ever being able
	 * to look at the data being freed even in the face of a crash.
	 * What we're getting around here is the case where we free a block, it
	 * is allocated to another file, it is written to, and then we crash.
	 * If the new data gets written to the file but the log buffers
	 * containing the free and reallocation don't, then we'd end up with
	 * garbage in the blocks being freed.  As long as we make the new size
	 * permanent before actually freeing any blocks it doesn't matter if
	 * they get written to.
	 */
	ip->i_disk_size = newsize;
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	if (newsize <= oldsize) {
		error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize);
		if (error)
			goto out_trans_cancel;

		/*
		 * Truncated "down", so we're removing references to old data
		 * here - if we delay flushing for a long time, we expose
		 * ourselves unduly to the notorious NULL files problem.  So,
		 * we mark this inode and flush it when the file is closed,
		 * and do not wait the usual (long) time for writeout.
		 */
		xfs_iflags_set(ip, XFS_ITRUNCATED);

		/* A truncate down always removes post-EOF blocks. */
		xfs_inode_clear_eofblocks_tag(ip);
	}

	ASSERT(!(iattr->ia_valid & (ATTR_UID | ATTR_GID)));
	setattr_copy(idmap, inode, iattr);
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	XFS_STATS_INC(mp, xs_ig_attrchg);

	if (xfs_has_wsync(mp))
		xfs_trans_set_sync(tp);

	error = xfs_trans_commit(tp);
out_unlock:
	if (lock_flags)
		xfs_iunlock(ip, lock_flags);
	return error;

	/* Extent truncation failed: abandon the transaction, then unlock. */
out_trans_cancel:
	xfs_trans_cancel(tp);
	goto out_unlock;
}
1005  
/*
 * Size-changing setattr entry point.
 *
 * Runs the xfs_vn_change_ok() check on the request and, only if that returns
 * zero, hands the request to xfs_setattr_size().  A non-zero result from the
 * check is returned unchanged; otherwise the result of xfs_setattr_size() is
 * returned.
 */
int
xfs_vn_setattr_size(
	struct mnt_idmap	*idmap,
	struct dentry		*dentry,
	struct iattr		*iattr)
{
	struct inode		*inode = d_inode(dentry);
	struct xfs_inode	*ip = XFS_I(inode);
	int			ret;

	trace_xfs_setattr(ip);

	/* Do not touch the inode unless the change check passed. */
	ret = xfs_vn_change_ok(idmap, dentry, iattr);
	if (!ret)
		ret = xfs_setattr_size(idmap, dentry, ip, iattr);

	return ret;
}
1022  
1023  STATIC int
xfs_vn_setattr(struct mnt_idmap * idmap,struct dentry * dentry,struct iattr * iattr)1024  xfs_vn_setattr(
1025  	struct mnt_idmap	*idmap,
1026  	struct dentry		*dentry,
1027  	struct iattr		*iattr)
1028  {
1029  	struct inode		*inode = d_inode(dentry);
1030  	struct xfs_inode	*ip = XFS_I(inode);
1031  	int			error;
1032  
1033  	if (iattr->ia_valid & ATTR_SIZE) {
1034  		uint			iolock;
1035  
1036  		xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
1037  		iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
1038  
1039  		error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
1040  		if (error) {
1041  			xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
1042  			return error;
1043  		}
1044  
1045  		error = xfs_vn_setattr_size(idmap, dentry, iattr);
1046  		xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
1047  	} else {
1048  		trace_xfs_setattr(ip);
1049  
1050  		error = xfs_vn_change_ok(idmap, dentry, iattr);
1051  		if (!error)
1052  			error = xfs_setattr_nonsize(idmap, dentry, ip, iattr);
1053  	}
1054  
1055  	return error;
1056  }
1057  
1058  STATIC int
xfs_vn_update_time(struct inode * inode,int flags)1059  xfs_vn_update_time(
1060  	struct inode		*inode,
1061  	int			flags)
1062  {
1063  	struct xfs_inode	*ip = XFS_I(inode);
1064  	struct xfs_mount	*mp = ip->i_mount;
1065  	int			log_flags = XFS_ILOG_TIMESTAMP;
1066  	struct xfs_trans	*tp;
1067  	int			error;
1068  	struct timespec64	now;
1069  
1070  	trace_xfs_update_time(ip);
1071  
1072  	if (inode->i_sb->s_flags & SB_LAZYTIME) {
1073  		if (!((flags & S_VERSION) &&
1074  		      inode_maybe_inc_iversion(inode, false))) {
1075  			generic_update_time(inode, flags);
1076  			return 0;
1077  		}
1078  
1079  		/* Capture the iversion update that just occurred */
1080  		log_flags |= XFS_ILOG_CORE;
1081  	}
1082  
1083  	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
1084  	if (error)
1085  		return error;
1086  
1087  	xfs_ilock(ip, XFS_ILOCK_EXCL);
1088  	if (flags & (S_CTIME|S_MTIME))
1089  		now = inode_set_ctime_current(inode);
1090  	else
1091  		now = current_time(inode);
1092  
1093  	if (flags & S_MTIME)
1094  		inode_set_mtime_to_ts(inode, now);
1095  	if (flags & S_ATIME)
1096  		inode_set_atime_to_ts(inode, now);
1097  
1098  	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1099  	xfs_trans_log_inode(tp, ip, log_flags);
1100  	return xfs_trans_commit(tp);
1101  }
1102  
1103  STATIC int
xfs_vn_fiemap(struct inode * inode,struct fiemap_extent_info * fieinfo,u64 start,u64 length)1104  xfs_vn_fiemap(
1105  	struct inode		*inode,
1106  	struct fiemap_extent_info *fieinfo,
1107  	u64			start,
1108  	u64			length)
1109  {
1110  	int			error;
1111  
1112  	xfs_ilock(XFS_I(inode), XFS_IOLOCK_SHARED);
1113  	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
1114  		fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR;
1115  		error = iomap_fiemap(inode, fieinfo, start, length,
1116  				&xfs_xattr_iomap_ops);
1117  	} else {
1118  		error = iomap_fiemap(inode, fieinfo, start, length,
1119  				&xfs_read_iomap_ops);
1120  	}
1121  	xfs_iunlock(XFS_I(inode), XFS_IOLOCK_SHARED);
1122  
1123  	return error;
1124  }
1125  
1126  STATIC int
xfs_vn_tmpfile(struct mnt_idmap * idmap,struct inode * dir,struct file * file,umode_t mode)1127  xfs_vn_tmpfile(
1128  	struct mnt_idmap	*idmap,
1129  	struct inode		*dir,
1130  	struct file		*file,
1131  	umode_t			mode)
1132  {
1133  	int err = xfs_generic_create(idmap, dir, file->f_path.dentry, mode, 0, file);
1134  
1135  	return finish_open_simple(file, err);
1136  }
1137  
1138  static const struct inode_operations xfs_inode_operations = {
1139  	.get_inode_acl		= xfs_get_acl,
1140  	.set_acl		= xfs_set_acl,
1141  	.getattr		= xfs_vn_getattr,
1142  	.setattr		= xfs_vn_setattr,
1143  	.listxattr		= xfs_vn_listxattr,
1144  	.fiemap			= xfs_vn_fiemap,
1145  	.update_time		= xfs_vn_update_time,
1146  	.fileattr_get		= xfs_fileattr_get,
1147  	.fileattr_set		= xfs_fileattr_set,
1148  };
1149  
1150  static const struct inode_operations xfs_dir_inode_operations = {
1151  	.create			= xfs_vn_create,
1152  	.lookup			= xfs_vn_lookup,
1153  	.link			= xfs_vn_link,
1154  	.unlink			= xfs_vn_unlink,
1155  	.symlink		= xfs_vn_symlink,
1156  	.mkdir			= xfs_vn_mkdir,
1157  	/*
1158  	 * Yes, XFS uses the same method for rmdir and unlink.
1159  	 *
1160  	 * There are some subtile differences deeper in the code,
1161  	 * but we use S_ISDIR to check for those.
1162  	 */
1163  	.rmdir			= xfs_vn_unlink,
1164  	.mknod			= xfs_vn_mknod,
1165  	.rename			= xfs_vn_rename,
1166  	.get_inode_acl		= xfs_get_acl,
1167  	.set_acl		= xfs_set_acl,
1168  	.getattr		= xfs_vn_getattr,
1169  	.setattr		= xfs_vn_setattr,
1170  	.listxattr		= xfs_vn_listxattr,
1171  	.update_time		= xfs_vn_update_time,
1172  	.tmpfile		= xfs_vn_tmpfile,
1173  	.fileattr_get		= xfs_fileattr_get,
1174  	.fileattr_set		= xfs_fileattr_set,
1175  };
1176  
1177  static const struct inode_operations xfs_dir_ci_inode_operations = {
1178  	.create			= xfs_vn_create,
1179  	.lookup			= xfs_vn_ci_lookup,
1180  	.link			= xfs_vn_link,
1181  	.unlink			= xfs_vn_unlink,
1182  	.symlink		= xfs_vn_symlink,
1183  	.mkdir			= xfs_vn_mkdir,
1184  	/*
1185  	 * Yes, XFS uses the same method for rmdir and unlink.
1186  	 *
1187  	 * There are some subtile differences deeper in the code,
1188  	 * but we use S_ISDIR to check for those.
1189  	 */
1190  	.rmdir			= xfs_vn_unlink,
1191  	.mknod			= xfs_vn_mknod,
1192  	.rename			= xfs_vn_rename,
1193  	.get_inode_acl		= xfs_get_acl,
1194  	.set_acl		= xfs_set_acl,
1195  	.getattr		= xfs_vn_getattr,
1196  	.setattr		= xfs_vn_setattr,
1197  	.listxattr		= xfs_vn_listxattr,
1198  	.update_time		= xfs_vn_update_time,
1199  	.tmpfile		= xfs_vn_tmpfile,
1200  	.fileattr_get		= xfs_fileattr_get,
1201  	.fileattr_set		= xfs_fileattr_set,
1202  };
1203  
1204  static const struct inode_operations xfs_symlink_inode_operations = {
1205  	.get_link		= xfs_vn_get_link,
1206  	.getattr		= xfs_vn_getattr,
1207  	.setattr		= xfs_vn_setattr,
1208  	.listxattr		= xfs_vn_listxattr,
1209  	.update_time		= xfs_vn_update_time,
1210  };
1211  
1212  /* Figure out if this file actually supports DAX. */
1213  static bool
xfs_inode_supports_dax(struct xfs_inode * ip)1214  xfs_inode_supports_dax(
1215  	struct xfs_inode	*ip)
1216  {
1217  	struct xfs_mount	*mp = ip->i_mount;
1218  
1219  	/* Only supported on regular files. */
1220  	if (!S_ISREG(VFS_I(ip)->i_mode))
1221  		return false;
1222  
1223  	/* Block size must match page size */
1224  	if (mp->m_sb.sb_blocksize != PAGE_SIZE)
1225  		return false;
1226  
1227  	/* Device has to support DAX too. */
1228  	return xfs_inode_buftarg(ip)->bt_daxdev != NULL;
1229  }
1230  
1231  static bool
xfs_inode_should_enable_dax(struct xfs_inode * ip)1232  xfs_inode_should_enable_dax(
1233  	struct xfs_inode *ip)
1234  {
1235  	if (!IS_ENABLED(CONFIG_FS_DAX))
1236  		return false;
1237  	if (xfs_has_dax_never(ip->i_mount))
1238  		return false;
1239  	if (!xfs_inode_supports_dax(ip))
1240  		return false;
1241  	if (xfs_has_dax_always(ip->i_mount))
1242  		return true;
1243  	if (ip->i_diflags2 & XFS_DIFLAG2_DAX)
1244  		return true;
1245  	return false;
1246  }
1247  
1248  void
xfs_diflags_to_iflags(struct xfs_inode * ip,bool init)1249  xfs_diflags_to_iflags(
1250  	struct xfs_inode	*ip,
1251  	bool init)
1252  {
1253  	struct inode            *inode = VFS_I(ip);
1254  	unsigned int            xflags = xfs_ip2xflags(ip);
1255  	unsigned int            flags = 0;
1256  
1257  	ASSERT(!(IS_DAX(inode) && init));
1258  
1259  	if (xflags & FS_XFLAG_IMMUTABLE)
1260  		flags |= S_IMMUTABLE;
1261  	if (xflags & FS_XFLAG_APPEND)
1262  		flags |= S_APPEND;
1263  	if (xflags & FS_XFLAG_SYNC)
1264  		flags |= S_SYNC;
1265  	if (xflags & FS_XFLAG_NOATIME)
1266  		flags |= S_NOATIME;
1267  	if (init && xfs_inode_should_enable_dax(ip))
1268  		flags |= S_DAX;
1269  
1270  	/*
1271  	 * S_DAX can only be set during inode initialization and is never set by
1272  	 * the VFS, so we cannot mask off S_DAX in i_flags.
1273  	 */
1274  	inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | S_NOATIME);
1275  	inode->i_flags |= flags;
1276  }
1277  
1278  /*
1279   * Initialize the Linux inode.
1280   *
1281   * When reading existing inodes from disk this is called directly from xfs_iget,
1282   * when creating a new inode it is called from xfs_init_new_inode after setting
1283   * up the inode. These callers have different criteria for clearing XFS_INEW, so
1284   * leave it up to the caller to deal with unlocking the inode appropriately.
1285   */
1286  void
xfs_setup_inode(struct xfs_inode * ip)1287  xfs_setup_inode(
1288  	struct xfs_inode	*ip)
1289  {
1290  	struct inode		*inode = &ip->i_vnode;
1291  	gfp_t			gfp_mask;
1292  
1293  	inode->i_ino = ip->i_ino;
1294  	inode->i_state |= I_NEW;
1295  
1296  	inode_sb_list_add(inode);
1297  	/* make the inode look hashed for the writeback code */
1298  	inode_fake_hash(inode);
1299  
1300  	i_size_write(inode, ip->i_disk_size);
1301  	xfs_diflags_to_iflags(ip, true);
1302  
1303  	if (S_ISDIR(inode->i_mode)) {
1304  		/*
1305  		 * We set the i_rwsem class here to avoid potential races with
1306  		 * lockdep_annotate_inode_mutex_key() reinitialising the lock
1307  		 * after a filehandle lookup has already found the inode in
1308  		 * cache before it has been unlocked via unlock_new_inode().
1309  		 */
1310  		lockdep_set_class(&inode->i_rwsem,
1311  				  &inode->i_sb->s_type->i_mutex_dir_key);
1312  		lockdep_set_class(&ip->i_lock, &xfs_dir_ilock_class);
1313  	} else {
1314  		lockdep_set_class(&ip->i_lock, &xfs_nondir_ilock_class);
1315  	}
1316  
1317  	/*
1318  	 * Ensure all page cache allocations are done from GFP_NOFS context to
1319  	 * prevent direct reclaim recursion back into the filesystem and blowing
1320  	 * stacks or deadlocking.
1321  	 */
1322  	gfp_mask = mapping_gfp_mask(inode->i_mapping);
1323  	mapping_set_gfp_mask(inode->i_mapping, (gfp_mask & ~(__GFP_FS)));
1324  
1325  	/*
1326  	 * For real-time inodes update the stable write flags to that of the RT
1327  	 * device instead of the data device.
1328  	 */
1329  	if (S_ISREG(inode->i_mode) && XFS_IS_REALTIME_INODE(ip))
1330  		xfs_update_stable_writes(ip);
1331  
1332  	/*
1333  	 * If there is no attribute fork no ACL can exist on this inode,
1334  	 * and it can't have any file capabilities attached to it either.
1335  	 */
1336  	if (!xfs_inode_has_attr_fork(ip)) {
1337  		inode_has_no_xattr(inode);
1338  		cache_no_acl(inode);
1339  	}
1340  }
1341  
1342  void
xfs_setup_iops(struct xfs_inode * ip)1343  xfs_setup_iops(
1344  	struct xfs_inode	*ip)
1345  {
1346  	struct inode		*inode = &ip->i_vnode;
1347  
1348  	switch (inode->i_mode & S_IFMT) {
1349  	case S_IFREG:
1350  		inode->i_op = &xfs_inode_operations;
1351  		inode->i_fop = &xfs_file_operations;
1352  		if (IS_DAX(inode))
1353  			inode->i_mapping->a_ops = &xfs_dax_aops;
1354  		else
1355  			inode->i_mapping->a_ops = &xfs_address_space_operations;
1356  		break;
1357  	case S_IFDIR:
1358  		if (xfs_has_asciici(XFS_M(inode->i_sb)))
1359  			inode->i_op = &xfs_dir_ci_inode_operations;
1360  		else
1361  			inode->i_op = &xfs_dir_inode_operations;
1362  		inode->i_fop = &xfs_dir_file_operations;
1363  		break;
1364  	case S_IFLNK:
1365  		inode->i_op = &xfs_symlink_inode_operations;
1366  		break;
1367  	default:
1368  		inode->i_op = &xfs_inode_operations;
1369  		init_special_inode(inode, inode->i_mode, inode->i_rdev);
1370  		break;
1371  	}
1372  }
1373