// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * file.c
 *
 * File open, close, extend, truncate
 *
 * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
 */

#include <linux/capability.h>
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/uio.h>
#include <linux/sched.h>
#include <linux/splice.h>
#include <linux/mount.h>
#include <linux/writeback.h>
#include <linux/falloc.h>
#include <linux/quotaops.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>

#include <cluster/masklog.h>

#include "ocfs2.h"

#include "alloc.h"
#include "aops.h"
#include "dir.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "file.h"
#include "sysfile.h"
#include "inode.h"
#include "ioctl.h"
#include "journal.h"
#include "locks.h"
#include "mmap.h"
#include "suballoc.h"
#include "super.h"
#include "xattr.h"
#include "acl.h"
#include "quota.h"
#include "refcounttree.h"
#include "ocfs2_trace.h"

#include "buffer_head_io.h"

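/*
 * Allocate and initialize the per-file private data: a backpointer to
 * the struct file, a mutex, and the DLM lock resource used by ocfs2's
 * cluster-aware file locking (see locks.c).
 */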
static int ocfs2_init_file_private(struct inode *inode, struct file *file)
{
	struct ocfs2_file_private *fp;

	fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL);
	if (!fp)
		return -ENOMEM;

	fp->fp_file = file;
	mutex_init(&fp->fp_mutex);
	ocfs2_file_lock_res_init(&fp->fp_flock, fp);
	file->private_data = fp;

	return 0;
}

static void ocfs2_free_file_private(struct inode *inode, struct file *file)
{
	struct ocfs2_file_private *fp = file->private_data;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	if (fp) {
		ocfs2_simple_drop_lockres(osb, &fp->fp_flock);
		ocfs2_lock_res_free(&fp->fp_flock);
		kfree(fp);
		file->private_data = NULL;
	}
}

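/*
 * Open a file. Under oi->ip_lock we verify the inode has not been
 * wiped by another node (OCFS2_INODE_DELETED) before bumping
 * ip_open_count; holding the same spinlock makes the flag check and
 * the increment atomic with respect to remote deletes.
 */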
static int ocfs2_file_open(struct inode *inode, struct file *file)
{
	int status;
	int mode = file->f_flags;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);

	trace_ocfs2_file_open(inode, file, file->f_path.dentry,
			      (unsigned long long)oi->ip_blkno,
			      file->f_path.dentry->d_name.len,
			      file->f_path.dentry->d_name.name, mode);

	if (file->f_mode & FMODE_WRITE) {
		status = dquot_initialize(inode);
		if (status)
			goto leave;
	}

	spin_lock(&oi->ip_lock);

	/* Check that the inode hasn't been wiped from disk by another
	 * node. If it hasn't then we're safe as long as we hold the
	 * spin lock until our increment of open count. */
	if (oi->ip_flags & OCFS2_INODE_DELETED) {
		spin_unlock(&oi->ip_lock);

		status = -ENOENT;
		goto leave;
	}

	if (mode & O_DIRECT)
		oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT;

	oi->ip_open_count++;
	spin_unlock(&oi->ip_lock);

	status = ocfs2_init_file_private(inode, file);
	if (status) {
		/*
		 * We want to set open count back if we're failing the
		 * open.
		 */
		spin_lock(&oi->ip_lock);
		oi->ip_open_count--;
		spin_unlock(&oi->ip_lock);
	}

	file->f_mode |= FMODE_NOWAIT;

leave:
	return status;
}

static int ocfs2_file_release(struct inode *inode, struct file *file)
{
	struct ocfs2_inode_info *oi = OCFS2_I(inode);

	spin_lock(&oi->ip_lock);
	if (!--oi->ip_open_count)
		oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;

	trace_ocfs2_file_release(inode, file, file->f_path.dentry,
				 oi->ip_blkno,
				 file->f_path.dentry->d_name.len,
				 file->f_path.dentry->d_name.name,
				 oi->ip_open_count);
	spin_unlock(&oi->ip_lock);

	ocfs2_free_file_private(inode, file);

	return 0;
}

static int ocfs2_dir_open(struct inode *inode, struct file *file)
{
	return ocfs2_init_file_private(inode, file);
}

static int ocfs2_dir_release(struct inode *inode, struct file *file)
{
	ocfs2_free_file_private(inode, file);
	return 0;
}

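/*
 * fsync/fdatasync. After flushing dirty pages, wait for the relevant
 * journal transaction (i_datasync_tid for fdatasync, i_sync_tid
 * otherwise) to commit, and issue a block-device flush ourselves when
 * the committing transaction will not send one on our behalf.
 */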
static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
			   int datasync)
{
	int err = 0;
	struct inode *inode = file->f_mapping->host;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	journal_t *journal = osb->journal->j_journal;
	int ret;
	tid_t commit_tid;
	bool needs_barrier = false;

	trace_ocfs2_sync_file(inode, file, file->f_path.dentry,
			      oi->ip_blkno,
			      file->f_path.dentry->d_name.len,
			      file->f_path.dentry->d_name.name,
			      (unsigned long long)datasync);

	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
		return -EROFS;

	err = file_write_and_wait_range(file, start, end);
	if (err)
		return err;

	commit_tid = datasync ? oi->i_datasync_tid : oi->i_sync_tid;
	if (journal->j_flags & JBD2_BARRIER &&
	    !jbd2_trans_will_send_data_barrier(journal, commit_tid))
		needs_barrier = true;
	err = jbd2_complete_transaction(journal, commit_tid);
	if (needs_barrier) {
		ret = blkdev_issue_flush(inode->i_sb->s_bdev);
		if (!err)
			err = ret;
	}

	if (err)
		mlog_errno(err);

	return (err < 0) ? -EIO : 0;
}

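/*
 * Decide whether an atime update is worth a disk transaction. This
 * mirrors the VFS noatime/nodiratime/relatime checks, then applies the
 * ocfs2-specific s_atime_quantum mount option, which suppresses
 * updates more frequent than the configured quantum.
 */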
int ocfs2_should_update_atime(struct inode *inode,
			      struct vfsmount *vfsmnt)
{
	struct timespec64 now;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
		return 0;

	if ((inode->i_flags & S_NOATIME) ||
	    ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode)))
		return 0;

	/*
	 * We can be called with no vfsmnt structure - NFSD will
	 * sometimes do this.
	 *
	 * Note that our action here is different than touch_atime() -
	 * if we can't tell whether this is a noatime mount, then we
	 * don't know whether to trust the value of s_atime_quantum.
	 */
	if (vfsmnt == NULL)
		return 0;

	if ((vfsmnt->mnt_flags & MNT_NOATIME) ||
	    ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
		return 0;

	if (vfsmnt->mnt_flags & MNT_RELATIME) {
		struct timespec64 ctime = inode_get_ctime(inode);
		struct timespec64 atime = inode_get_atime(inode);
		struct timespec64 mtime = inode_get_mtime(inode);

		if ((timespec64_compare(&atime, &mtime) <= 0) ||
		    (timespec64_compare(&atime, &ctime) <= 0))
			return 1;

		return 0;
	}

	now = current_time(inode);
	if ((now.tv_sec - inode_get_atime_sec(inode) <= osb->s_atime_quantum))
		return 0;
	else
		return 1;
}

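/*
 * Write the in-memory atime back to the dinode in its own small
 * transaction. ocfs2_mark_inode_dirty() is deliberately avoided here;
 * see the comment below about i_rwsem.
 */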
int ocfs2_update_inode_atime(struct inode *inode,
			     struct buffer_head *bh)
{
	int ret;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	handle_t *handle;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data;

	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	/*
	 * Don't use ocfs2_mark_inode_dirty() here as we don't always
	 * have i_rwsem to guard against concurrent changes to other
	 * inode fields.
	 */
	inode_set_atime_to_ts(inode, current_time(inode));
	di->i_atime = cpu_to_le64(inode_get_atime_sec(inode));
	di->i_atime_nsec = cpu_to_le32(inode_get_atime_nsec(inode));
	ocfs2_update_inode_fsync_trans(handle, inode, 0);
	ocfs2_journal_dirty(handle, bh);

out_commit:
	ocfs2_commit_trans(osb, handle);
out:
	return ret;
}

int ocfs2_set_inode_size(handle_t *handle,
				struct inode *inode,
				struct buffer_head *fe_bh,
				u64 new_i_size)
{
	int status;

	i_size_write(inode, new_i_size);
	inode->i_blocks = ocfs2_inode_sector_count(inode);
	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));

	status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

bail:
	return status;
}

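/*
 * Update i_size (and the times that go with it) in a one-off
 * transaction. Suitable when no allocation change is needed.
 */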
int ocfs2_simple_size_update(struct inode *inode,
			     struct buffer_head *di_bh,
			     u64 new_i_size)
{
	int ret;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	handle_t *handle = NULL;

	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_set_inode_size(handle, inode, di_bh,
				   new_i_size);
	if (ret < 0)
		mlog_errno(ret);

	ocfs2_update_inode_fsync_trans(handle, inode, 0);
	ocfs2_commit_trans(osb, handle);
out:
	return ret;
}

static int ocfs2_cow_file_pos(struct inode *inode,
			      struct buffer_head *fe_bh,
			      u64 offset)
{
	int status;
	u32 phys, cpos = offset >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
	unsigned int num_clusters = 0;
	unsigned int ext_flags = 0;

	/*
	 * If the new offset is cluster-aligned, there is no space for
	 * ocfs2_zero_range_for_truncate to fill, so there is no need
	 * to CoW either.
	 */
	if ((offset & (OCFS2_SB(inode->i_sb)->s_clustersize - 1)) == 0)
		return 0;

	status = ocfs2_get_clusters(inode, cpos, &phys,
				    &num_clusters, &ext_flags);
	if (status) {
		mlog_errno(status);
		goto out;
	}

	if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
		goto out;

	return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1);

out:
	return status;
}

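/*
 * First half of a truncate: zero the tail of the cluster containing
 * the new EOF, then set the new i_size on disk, all in one
 * transaction. Extent teardown happens later in
 * ocfs2_commit_truncate().
 */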
static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
				     struct inode *inode,
				     struct buffer_head *fe_bh,
				     u64 new_i_size)
{
	int status;
	handle_t *handle;
	struct ocfs2_dinode *di;
	u64 cluster_bytes;

	/*
	 * We need to CoW the cluster that contains the offset if it is
	 * reflinked, since we will call ocfs2_zero_range_for_truncate
	 * later, which will write zeros from the offset to the end of
	 * the cluster.
	 */
	status = ocfs2_cow_file_pos(inode, fe_bh, new_i_size);
	if (status) {
		mlog_errno(status);
		return status;
	}

	/* TODO: This needs to actually orphan the inode in this
	 * transaction. */

	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		mlog_errno(status);
		goto out;
	}

	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
					 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto out_commit;
	}

	/*
	 * Do this before setting i_size.
	 */
	cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
	status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size,
					       cluster_bytes);
	if (status) {
		mlog_errno(status);
		goto out_commit;
	}

	i_size_write(inode, new_i_size);
	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));

	di = (struct ocfs2_dinode *) fe_bh->b_data;
	di->i_size = cpu_to_le64(new_i_size);
	di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime_sec(inode));
	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
	ocfs2_update_inode_fsync_trans(handle, inode, 0);

	ocfs2_journal_dirty(handle, fe_bh);

out_commit:
	ocfs2_commit_trans(osb, handle);
out:
	return status;
}

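/*
 * Truncate the file down to new_i_size. The caller must hold the
 * cluster inode lock (di_bh comes from it). Inline-data inodes are
 * handled in place; otherwise we shrink i_size first, drop the
 * affected page cache, then free the extents past the new EOF.
 */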
int ocfs2_truncate_file(struct inode *inode,
			       struct buffer_head *di_bh,
			       u64 new_i_size)
{
	int status = 0;
	struct ocfs2_dinode *fe = NULL;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	/* We trust di_bh because it comes from ocfs2_inode_lock(), which
	 * already validated it */
	fe = (struct ocfs2_dinode *) di_bh->b_data;

	trace_ocfs2_truncate_file((unsigned long long)OCFS2_I(inode)->ip_blkno,
				  (unsigned long long)le64_to_cpu(fe->i_size),
				  (unsigned long long)new_i_size);

	mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
			"Inode %llu, inode i_size = %lld != di "
			"i_size = %llu, i_flags = 0x%x\n",
			(unsigned long long)OCFS2_I(inode)->ip_blkno,
			i_size_read(inode),
			(unsigned long long)le64_to_cpu(fe->i_size),
			le32_to_cpu(fe->i_flags));

	if (new_i_size > le64_to_cpu(fe->i_size)) {
		trace_ocfs2_truncate_file_error(
			(unsigned long long)le64_to_cpu(fe->i_size),
			(unsigned long long)new_i_size);
		status = -EINVAL;
		mlog_errno(status);
		goto bail;
	}

	down_write(&OCFS2_I(inode)->ip_alloc_sem);

	ocfs2_resv_discard(&osb->osb_la_resmap,
			   &OCFS2_I(inode)->ip_la_data_resv);

	/*
	 * The inode lock forced other nodes to sync and drop their
	 * pages, which (correctly) happens even if we have a truncate
	 * without allocation change - ocfs2 cluster sizes can be much
	 * greater than page size, so we have to truncate them
	 * anyway.
	 */

	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
		unmap_mapping_range(inode->i_mapping,
				    new_i_size + PAGE_SIZE - 1, 0, 1);
		truncate_inode_pages(inode->i_mapping, new_i_size);
		status = ocfs2_truncate_inline(inode, di_bh, new_i_size,
					       i_size_read(inode), 1);
		if (status)
			mlog_errno(status);

		goto bail_unlock_sem;
	}

	/* alright, we're going to need to do a full blown alloc size
	 * change. Orphan the inode so that recovery can complete the
	 * truncate if necessary. This does the task of marking
	 * i_size. */
	status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
	if (status < 0) {
		mlog_errno(status);
		goto bail_unlock_sem;
	}

	unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
	truncate_inode_pages(inode->i_mapping, new_i_size);

	status = ocfs2_commit_truncate(osb, inode, di_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail_unlock_sem;
	}

	/* TODO: orphan dir cleanup here. */
bail_unlock_sem:
	up_write(&OCFS2_I(inode)->ip_alloc_sem);

bail:
	if (!status && OCFS2_I(inode)->ip_clusters == 0)
		status = ocfs2_try_remove_refcount_tree(inode, di_bh);

	return status;
}

/*
 * Extend file allocation only here.
 * We'll update all the disk stuff, and oip->alloc_size.
 *
 * Expects things to be locked, a transaction started, and enough data /
 * metadata reservations in the contexts.
 *
 * Will return -EAGAIN, and a reason if a restart is needed.
 * If passed in, *reason will always be set, even on error.
 */
int ocfs2_add_inode_data(struct ocfs2_super *osb,
			 struct inode *inode,
			 u32 *logical_offset,
			 u32 clusters_to_add,
			 int mark_unwritten,
			 struct buffer_head *fe_bh,
			 handle_t *handle,
			 struct ocfs2_alloc_context *data_ac,
			 struct ocfs2_alloc_context *meta_ac,
			 enum ocfs2_alloc_restarted *reason_ret)
{
	struct ocfs2_extent_tree et;

	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh);
	return ocfs2_add_clusters_in_btree(handle, &et, logical_offset,
					   clusters_to_add, mark_unwritten,
					   data_ac, meta_ac, reason_ret);
}

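/*
 * Grow the inode's allocation by clusters_to_add clusters. Reserves
 * data and metadata allocators, runs the extend inside a transaction,
 * and restarts (re-reserving, or extending the transaction) when the
 * btree code asks for it via the ocfs2_alloc_restarted reason.
 */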
static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
				   u32 clusters_to_add, int mark_unwritten)
{
	int status = 0;
	int restart_func = 0;
	int credits;
	u32 prev_clusters;
	struct buffer_head *bh = NULL;
	struct ocfs2_dinode *fe = NULL;
	handle_t *handle = NULL;
	struct ocfs2_alloc_context *data_ac = NULL;
	struct ocfs2_alloc_context *meta_ac = NULL;
	enum ocfs2_alloc_restarted why = RESTART_NONE;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_extent_tree et;
	int did_quota = 0;

	/*
	 * Unwritten extents only exist for file systems which
	 * support holes.
	 */
	BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));

	status = ocfs2_read_inode_block(inode, &bh);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}
	fe = (struct ocfs2_dinode *) bh->b_data;

restart_all:
	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);

	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh);
	status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
				       &data_ac, &meta_ac);
	if (status) {
		mlog_errno(status);
		goto leave;
	}

	credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list);
	handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		handle = NULL;
		mlog_errno(status);
		goto leave;
	}

restarted_transaction:
	trace_ocfs2_extend_allocation(
		(unsigned long long)OCFS2_I(inode)->ip_blkno,
		(unsigned long long)i_size_read(inode),
		le32_to_cpu(fe->i_clusters), clusters_to_add,
		why, restart_func);

	status = dquot_alloc_space_nodirty(inode,
			ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
	if (status)
		goto leave;
	did_quota = 1;

	/* reserve a write to the file entry early on - that way if we
	 * run out of credits in the allocation path, we can still
	 * update i_size. */
	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
					 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	prev_clusters = OCFS2_I(inode)->ip_clusters;

	status = ocfs2_add_inode_data(osb,
				      inode,
				      &logical_start,
				      clusters_to_add,
				      mark_unwritten,
				      bh,
				      handle,
				      data_ac,
				      meta_ac,
				      &why);
	if ((status < 0) && (status != -EAGAIN)) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto leave;
	}
	ocfs2_update_inode_fsync_trans(handle, inode, 1);
	ocfs2_journal_dirty(handle, bh);

	spin_lock(&OCFS2_I(inode)->ip_lock);
	clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
	spin_unlock(&OCFS2_I(inode)->ip_lock);
	/* Release unused quota reservation */
	dquot_free_space(inode,
			ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
	did_quota = 0;

	if (why != RESTART_NONE && clusters_to_add) {
		if (why == RESTART_META) {
			restart_func = 1;
			status = 0;
		} else {
			BUG_ON(why != RESTART_TRANS);

			status = ocfs2_allocate_extend_trans(handle, 1);
			if (status < 0) {
				/* handle still has to be committed at
				 * this point. */
				status = -ENOMEM;
				mlog_errno(status);
				goto leave;
			}
			goto restarted_transaction;
		}
	}

	trace_ocfs2_extend_allocation_end(OCFS2_I(inode)->ip_blkno,
	     le32_to_cpu(fe->i_clusters),
	     (unsigned long long)le64_to_cpu(fe->i_size),
	     OCFS2_I(inode)->ip_clusters,
	     (unsigned long long)i_size_read(inode));

leave:
	if (status < 0 && did_quota)
		dquot_free_space(inode,
			ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
	if (handle) {
		ocfs2_commit_trans(osb, handle);
		handle = NULL;
	}
	if (data_ac) {
		ocfs2_free_alloc_context(data_ac);
		data_ac = NULL;
	}
	if (meta_ac) {
		ocfs2_free_alloc_context(meta_ac);
		meta_ac = NULL;
	}
	if ((!status) && restart_func) {
		restart_func = 0;
		goto restart_all;
	}
	brelse(bh);
	bh = NULL;

	return status;
}

/*
 * While a write will already be ordering the data, a truncate will not.
 * Thus, we need to explicitly order the zeroed pages.
 */
static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode,
						      struct buffer_head *di_bh,
						      loff_t start_byte,
						      loff_t length)
{
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	handle_t *handle = NULL;
	int ret = 0;

	if (!ocfs2_should_order_data(inode))
		goto out;

	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_jbd2_inode_add_write(handle, inode, start_byte, length);
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret)
		mlog_errno(ret);
	ocfs2_update_inode_fsync_trans(handle, inode, 1);

out:
	if (ret) {
		if (!IS_ERR(handle))
			ocfs2_commit_trans(osb, handle);
		handle = ERR_PTR(ret);
	}
	return handle;
}


/* Some parts of this taken from generic_cont_expand, which turned out
 * to be too fragile to do exactly what we need without us having to
 * worry about recursive locking in ->write_begin() and ->write_end(). */
static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
				 u64 abs_to, struct buffer_head *di_bh)
{
	struct address_space *mapping = inode->i_mapping;
	struct folio *folio;
	unsigned long index = abs_from >> PAGE_SHIFT;
	handle_t *handle;
	int ret = 0;
	unsigned zero_from, zero_to, block_start, block_end;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;

	BUG_ON(abs_from >= abs_to);
	BUG_ON(abs_to > (((u64)index + 1) << PAGE_SHIFT));
	BUG_ON(abs_from & (inode->i_blkbits - 1));

	handle = ocfs2_zero_start_ordered_transaction(inode, di_bh,
						      abs_from,
						      abs_to - abs_from);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		goto out;
	}

	folio = __filemap_get_folio(mapping, index,
			FGP_LOCK | FGP_ACCESSED | FGP_CREAT, GFP_NOFS);
	if (IS_ERR(folio)) {
		ret = PTR_ERR(folio);
		mlog_errno(ret);
		goto out_commit_trans;
	}

	/* Get the offsets within the page that we want to zero */
	zero_from = abs_from & (PAGE_SIZE - 1);
	zero_to = abs_to & (PAGE_SIZE - 1);
	if (!zero_to)
		zero_to = PAGE_SIZE;

	trace_ocfs2_write_zero_page(
			(unsigned long long)OCFS2_I(inode)->ip_blkno,
			(unsigned long long)abs_from,
			(unsigned long long)abs_to,
			index, zero_from, zero_to);

	/* We know that zero_from is block aligned */
	for (block_start = zero_from; block_start < zero_to;
	     block_start = block_end) {
		block_end = block_start + i_blocksize(inode);

		/*
		 * block_start is block-aligned.  Bump it by one to force
		 * __block_write_begin and block_commit_write to zero the
		 * whole block.
		 */
		ret = __block_write_begin(folio, block_start + 1, 0,
					  ocfs2_get_block);
		if (ret < 0) {
			mlog_errno(ret);
			goto out_unlock;
		}

		/* must not update i_size! */
		block_commit_write(&folio->page, block_start + 1, block_start + 1);
	}

	/*
	 * fs-writeback will release dirty pages whose offsets are
	 * beyond the inode size without holding the page lock; the
	 * release happens in block_write_full_folio().
	 */
	i_size_write(inode, abs_to);
	inode->i_blocks = ocfs2_inode_sector_count(inode);
	di->i_size = cpu_to_le64((u64)i_size_read(inode));
	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
	di->i_mtime = di->i_ctime = cpu_to_le64(inode_get_mtime_sec(inode));
	di->i_ctime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode));
	di->i_mtime_nsec = di->i_ctime_nsec;
	if (handle) {
		ocfs2_journal_dirty(handle, di_bh);
		ocfs2_update_inode_fsync_trans(handle, inode, 1);
	}

out_unlock:
	folio_unlock(folio);
	folio_put(folio);
out_commit_trans:
	if (handle)
		ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
out:
	return ret;
}


/*
 * Find the next range to zero.  We do this in terms of bytes because
 * that's what ocfs2_zero_extend() wants, and it is dealing with the
 * pagecache.  We may return multiple extents.
 *
 * zero_start and zero_end are ocfs2_zero_extend()'s current idea of what
 * needs to be zeroed.  range_start and range_end return the next zeroing
 * range.  A subsequent call should pass the previous range_end as its
 * zero_start.  If range_end is 0, there's nothing to do.
 *
 * Unwritten extents are skipped over.  Refcounted extents are CoW'd.
 */
static int ocfs2_zero_extend_get_range(struct inode *inode,
				       struct buffer_head *di_bh,
				       u64 zero_start, u64 zero_end,
				       u64 *range_start, u64 *range_end)
{
	int rc = 0, needs_cow = 0;
	u32 p_cpos, zero_clusters = 0;
	u32 zero_cpos =
		zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
	u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end);
	unsigned int num_clusters = 0;
	unsigned int ext_flags = 0;

	while (zero_cpos < last_cpos) {
		rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos,
					&num_clusters, &ext_flags);
		if (rc) {
			mlog_errno(rc);
			goto out;
		}

		if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
			zero_clusters = num_clusters;
			if (ext_flags & OCFS2_EXT_REFCOUNTED)
				needs_cow = 1;
			break;
		}

		zero_cpos += num_clusters;
	}
	if (!zero_clusters) {
		*range_end = 0;
		goto out;
	}

	while ((zero_cpos + zero_clusters) < last_cpos) {
		rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters,
					&p_cpos, &num_clusters,
					&ext_flags);
		if (rc) {
			mlog_errno(rc);
			goto out;
		}

		if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN))
			break;
		if (ext_flags & OCFS2_EXT_REFCOUNTED)
			needs_cow = 1;
		zero_clusters += num_clusters;
	}
	if ((zero_cpos + zero_clusters) > last_cpos)
		zero_clusters = last_cpos - zero_cpos;

	if (needs_cow) {
		rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos,
					zero_clusters, UINT_MAX);
		if (rc) {
			mlog_errno(rc);
			goto out;
		}
	}

	*range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos);
	*range_end = ocfs2_clusters_to_bytes(inode->i_sb,
					     zero_cpos + zero_clusters);

out:
	return rc;
}

/*
 * Zero one range returned from ocfs2_zero_extend_get_range().  The caller
 * has made sure that the entire range needs zeroing.
 */
static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
				   u64 range_end, struct buffer_head *di_bh)
{
	int rc = 0;
	u64 next_pos;
	u64 zero_pos = range_start;

	trace_ocfs2_zero_extend_range(
			(unsigned long long)OCFS2_I(inode)->ip_blkno,
			(unsigned long long)range_start,
			(unsigned long long)range_end);
	BUG_ON(range_start >= range_end);

	while (zero_pos < range_end) {
		next_pos = (zero_pos & PAGE_MASK) + PAGE_SIZE;
		if (next_pos > range_end)
			next_pos = range_end;
		rc = ocfs2_write_zero_page(inode, zero_pos, next_pos, di_bh);
		if (rc < 0) {
			mlog_errno(rc);
			break;
		}
		zero_pos = next_pos;

		/*
		 * Very large extends have the potential to lock up
		 * the cpu for extended periods of time.
		 */
		cond_resched();
	}

	return rc;
}

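/*
 * Zero everything from the current block-aligned i_size up to
 * zero_to_size, one get_range/zero_range pass at a time. Holes and
 * unwritten extents are skipped by ocfs2_zero_extend_get_range().
 */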
int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
		      loff_t zero_to_size)
{
	int ret = 0;
	u64 zero_start, range_start = 0, range_end = 0;
	struct super_block *sb = inode->i_sb;

	zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
	trace_ocfs2_zero_extend((unsigned long long)OCFS2_I(inode)->ip_blkno,
				(unsigned long long)zero_start,
				(unsigned long long)i_size_read(inode));
	while (zero_start < zero_to_size) {
		ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start,
						  zero_to_size,
						  &range_start,
						  &range_end);
		if (ret) {
			mlog_errno(ret);
			break;
		}
		if (!range_end)
			break;
		/* Trim the ends */
		if (range_start < zero_start)
			range_start = zero_start;
		if (range_end > zero_to_size)
			range_end = zero_to_size;

		ret = ocfs2_zero_extend_range(inode, range_start,
					      range_end, di_bh);
		if (ret) {
			mlog_errno(ret);
			break;
		}
		zero_start = range_end;
	}

	return ret;
}

int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
			  u64 new_i_size, u64 zero_to)
{
	int ret;
	u32 clusters_to_add;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);

	/*
	 * Only quota files call this without a bh, and they can't be
	 * refcounted.
	 */
	BUG_ON(!di_bh && ocfs2_is_refcount_inode(inode));
	BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE));

	clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
	if (clusters_to_add < oi->ip_clusters)
		clusters_to_add = 0;
	else
		clusters_to_add -= oi->ip_clusters;

	if (clusters_to_add) {
		ret = ocfs2_extend_allocation(inode, oi->ip_clusters,
					      clusters_to_add, 0);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	/*
	 * Call this even if we don't add any clusters to the tree. We
	 * still need to zero the area between the old i_size and the
	 * new i_size.
	 */
	ret = ocfs2_zero_extend(inode, di_bh, zero_to);
	if (ret < 0)
		mlog_errno(ret);

out:
	return ret;
}

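/*
 * Grow a file to new_i_size. Inline data is kept inline when it still
 * fits, otherwise it is converted to extents first. On sparse
 * filesystems only tail zeroing is needed here; non-sparse filesystems
 * must also allocate (written) clusters up to the new size.
 */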
static int ocfs2_extend_file(struct inode *inode,
			     struct buffer_head *di_bh,
			     u64 new_i_size)
{
	int ret = 0;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);

	BUG_ON(!di_bh);

	/* setattr sometimes calls us like this. */
	if (new_i_size == 0)
		goto out;

	if (i_size_read(inode) == new_i_size)
		goto out;
	BUG_ON(new_i_size < i_size_read(inode));

	/*
	 * The alloc sem blocks people in read/write from reading our
	 * allocation until we're done changing it. We depend on
	 * i_rwsem to block other extend/truncate calls while we're
	 * here.  We even have to hold it for sparse files because there
	 * might be some tail zeroing.
	 */
	down_write(&oi->ip_alloc_sem);

	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
		/*
		 * We can optimize small extends by keeping the inode's
		 * inline data.
		 */
		if (ocfs2_size_fits_inline_data(di_bh, new_i_size)) {
			up_write(&oi->ip_alloc_sem);
			goto out_update_size;
		}

		ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
		if (ret) {
			up_write(&oi->ip_alloc_sem);
			mlog_errno(ret);
			goto out;
		}
	}

	if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
		ret = ocfs2_zero_extend(inode, di_bh, new_i_size);
	else
		ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size,
					    new_i_size);

	up_write(&oi->ip_alloc_sem);

	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

out_update_size:
	ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
	if (ret < 0)
		mlog_errno(ret);

out:
	return ret;
}

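/*
 * setattr entry point. Size changes take the rw lock and the cluster
 * inode lock (through the lock tracker, since posix_acl_chmod() may
 * re-enter us), and uid/gid changes transfer quota under a transaction
 * sized for both the inode update and the quota credits.
 */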
int ocfs2_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
		  struct iattr *attr)
{
	int status = 0, size_change;
	int inode_locked = 0;
	struct inode *inode = d_inode(dentry);
	struct super_block *sb = inode->i_sb;
	struct ocfs2_super *osb = OCFS2_SB(sb);
	struct buffer_head *bh = NULL;
	handle_t *handle = NULL;
	struct dquot *transfer_to[MAXQUOTAS] = { };
	int qtype;
	int had_lock;
	struct ocfs2_lock_holder oh;

	trace_ocfs2_setattr(inode, dentry,
			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
			    dentry->d_name.len, dentry->d_name.name,
			    attr->ia_valid,
				attr->ia_valid & ATTR_MODE ? attr->ia_mode : 0,
				attr->ia_valid & ATTR_UID ?
					from_kuid(&init_user_ns, attr->ia_uid) : 0,
				attr->ia_valid & ATTR_GID ?
					from_kgid(&init_user_ns, attr->ia_gid) : 0);

	/* ensuring we don't even attempt to truncate a symlink */
	if (S_ISLNK(inode->i_mode))
		attr->ia_valid &= ~ATTR_SIZE;

#define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
			   | ATTR_GID | ATTR_UID | ATTR_MODE)
	if (!(attr->ia_valid & OCFS2_VALID_ATTRS))
		return 0;

	status = setattr_prepare(&nop_mnt_idmap, dentry, attr);
	if (status)
		return status;

	if (is_quota_modification(&nop_mnt_idmap, inode, attr)) {
		status = dquot_initialize(inode);
		if (status)
			return status;
	}
	size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
	if (size_change) {
		/*
		 * Here we should wait for dio to finish before taking
		 * the inode lock, to avoid a deadlock between
		 * ocfs2_setattr() and ocfs2_dio_end_io_write()
		 */
		inode_dio_wait(inode);

		status = ocfs2_rw_lock(inode, 1);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

	had_lock = ocfs2_inode_lock_tracker(inode, &bh, 1, &oh);
	if (had_lock < 0) {
		status = had_lock;
		goto bail_unlock_rw;
	} else if (had_lock) {
		/*
		 * As far as we know, ocfs2_setattr() could only be the first
		 * VFS entry point in the call chain of a recursive cluster
		 * locking issue.
		 *
		 * For instance:
		 * chmod_common()
		 *  notify_change()
		 *   ocfs2_setattr()
		 *    posix_acl_chmod()
		 *     ocfs2_iop_get_acl()
		 *
		 * But, we're not 100% sure if it's always true, because the
		 * ordering of the VFS entry points in the call chain is out
		 * of our control. So, we'd better dump the stack here to
		 * catch the other cases of recursive locking.
		 */
		mlog(ML_ERROR, "Another case of recursive locking:\n");
		dump_stack();
	}
	inode_locked = 1;

	if (size_change) {
		status = inode_newsize_ok(inode, attr->ia_size);
		if (status)
			goto bail_unlock;

		if (i_size_read(inode) >= attr->ia_size) {
			if (ocfs2_should_order_data(inode)) {
				status = ocfs2_begin_ordered_truncate(inode,
								      attr->ia_size);
				if (status)
					goto bail_unlock;
			}
			status = ocfs2_truncate_file(inode, bh, attr->ia_size);
		} else
			status = ocfs2_extend_file(inode, bh, attr->ia_size);
		if (status < 0) {
			if (status != -ENOSPC)
				mlog_errno(status);
			status = -ENOSPC;
			goto bail_unlock;
		}
	}

	if ((attr->ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
	    (attr->ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
		/*
		 * Gather pointers to quota structures so that allocation /
		 * freeing of quota structures happens here and not inside
		 * dquot_transfer() where we have problems with lock ordering
		 */
		if (attr->ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)
		    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
		    OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
			transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(attr->ia_uid));
			if (IS_ERR(transfer_to[USRQUOTA])) {
				status = PTR_ERR(transfer_to[USRQUOTA]);
				transfer_to[USRQUOTA] = NULL;
				goto bail_unlock;
			}
		}
		if (attr->ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid)
		    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
		    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
			transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(attr->ia_gid));
			if (IS_ERR(transfer_to[GRPQUOTA])) {
				status = PTR_ERR(transfer_to[GRPQUOTA]);
				transfer_to[GRPQUOTA] = NULL;
				goto bail_unlock;
			}
		}
		down_write(&OCFS2_I(inode)->ip_alloc_sem);
		handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS +
					   2 * ocfs2_quota_trans_credits(sb));
		if (IS_ERR(handle)) {
			status = PTR_ERR(handle);
			mlog_errno(status);
			goto bail_unlock_alloc;
		}
		status = __dquot_transfer(inode, transfer_to);
		if (status < 0)
			goto bail_commit;
	} else {
		down_write(&OCFS2_I(inode)->ip_alloc_sem);
		handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
		if (IS_ERR(handle)) {
			status = PTR_ERR(handle);
			mlog_errno(status);
			goto bail_unlock_alloc;
		}
	}

	setattr_copy(&nop_mnt_idmap, inode, attr);
	mark_inode_dirty(inode);

	status = ocfs2_mark_inode_dirty(handle, inode, bh);
	if (status < 0)
		mlog_errno(status);

bail_commit:
	ocfs2_commit_trans(osb, handle);
bail_unlock_alloc:
	up_write(&OCFS2_I(inode)->ip_alloc_sem);
bail_unlock:
	if (status && inode_locked) {
		ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock);
		inode_locked = 0;
	}
bail_unlock_rw:
	if (size_change)
		ocfs2_rw_unlock(inode, 1);
bail:

	/* Release quota pointers in case we acquired them */
	for (qtype = 0; qtype < OCFS2_MAXQUOTAS; qtype++)
		dqput(transfer_to[qtype]);

	if (!status && attr->ia_valid & ATTR_MODE) {
		status = ocfs2_acl_chmod(inode, bh);
		if (status < 0)
			mlog_errno(status);
	}
	if (inode_locked)
		ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock);

	brelse(bh);
	return status;
}


int ocfs2_getattr(struct mnt_idmap *idmap, const struct path *path,
		  struct kstat *stat, u32 request_mask, unsigned int flags)
{
	struct inode *inode = d_inode(path->dentry);
	struct super_block *sb = path->dentry->d_sb;
	struct ocfs2_super *osb = sb->s_fs_info;
	int err;

	err = ocfs2_inode_revalidate(path->dentry);
	if (err) {
		if (err != -ENOENT)
			mlog_errno(err);
		goto bail;
	}

	generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
	/*
	 * If there is inline data in the inode, the inode will normally not
	 * have data blocks allocated (it may have an external xattr block).
	 * Report at least one sector for such files, so tools like tar, rsync,
	 * others don't incorrectly think the file is completely sparse.
	 */
	if (unlikely(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
		stat->blocks += (stat->size + 511)>>9;

	/* We set the blksize from the cluster size for performance */
	stat->blksize = osb->s_clustersize;

bail:
	return err;
}

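/*
 * Permission check. We take the cluster inode lock so that the generic
 * check sees current inode fields; MAY_NOT_BLOCK (RCU walk) is punted
 * back to the VFS with -ECHILD, since taking the cluster lock may
 * block.
 */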
int ocfs2_permission(struct mnt_idmap *idmap, struct inode *inode,
		     int mask)
{
	int ret, had_lock;
	struct ocfs2_lock_holder oh;

	if (mask & MAY_NOT_BLOCK)
		return -ECHILD;

	had_lock = ocfs2_inode_lock_tracker(inode, NULL, 0, &oh);
	if (had_lock < 0) {
		ret = had_lock;
		goto out;
	} else if (had_lock) {
		/* See comments in ocfs2_setattr() for details.
		 * The call chain of this case could be:
		 * do_sys_open()
		 *  may_open()
		 *   inode_permission()
		 *    ocfs2_permission()
		 *     ocfs2_iop_get_acl()
		 */
		mlog(ML_ERROR, "Another case of recursive locking:\n");
		dump_stack();
	}

	ret = generic_permission(&nop_mnt_idmap, inode, mask);

	ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock);
out:
	return ret;
}

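/*
 * Clear S_ISUID (and S_ISGID, when the group-execute bit is set) on
 * both the in-memory inode and the dinode, in its own small
 * transaction; per its name, this backs suid stripping on write.
 */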
static int __ocfs2_write_remove_suid(struct inode *inode,
				     struct buffer_head *bh)
{
	int ret;
	handle_t *handle;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_dinode *di;

	trace_ocfs2_write_remove_suid(
			(unsigned long long)OCFS2_I(inode)->ip_blkno,
			inode->i_mode);

	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret < 0) {
		mlog_errno(ret);
		goto out_trans;
	}

	inode->i_mode &= ~S_ISUID;
	if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP))
		inode->i_mode &= ~S_ISGID;

	di = (struct ocfs2_dinode *) bh->b_data;
	di->i_mode = cpu_to_le16(inode->i_mode);
	ocfs2_update_inode_fsync_trans(handle, inode, 0);

	ocfs2_journal_dirty(handle, bh);

out_trans:
	ocfs2_commit_trans(osb, handle);
out:
	return ret;
}

static int ocfs2_write_remove_suid(struct inode *inode)
{
	int ret;
	struct buffer_head *bh = NULL;

	ret = ocfs2_read_inode_block(inode, &bh);
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	ret = __ocfs2_write_remove_suid(inode, bh);
out:
	brelse(bh);
	return ret;
}

/*
 * Allocate enough extents to cover the region starting at byte offset
 * start for len bytes. Existing extents are skipped, any extents
 * added are marked as "unwritten".
 */
static int ocfs2_allocate_unwritten_extents(struct inode *inode,
					    u64 start, u64 len)
{
	int ret;
	u32 cpos, phys_cpos, clusters, alloc_size;
	u64 end = start + len;
	struct buffer_head *di_bh = NULL;

	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
		ret = ocfs2_read_inode_block(inode, &di_bh);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * Nothing to do if the requested reservation range
		 * fits within the inode.
		 */
		if (ocfs2_size_fits_inline_data(di_bh, end))
			goto out;

		ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	/*
	 * We consider both start and len to be inclusive.
	 */
	cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
	clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len);
	clusters -= cpos;

	while (clusters) {
		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
					 &alloc_size, NULL);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * Hole or existing extent len can be arbitrary, so
		 * cap it to our own allocation request.
		 */
		if (alloc_size > clusters)
			alloc_size = clusters;

		if (phys_cpos) {
			/*
			 * We already have an allocation at this
			 * region so we can safely skip it.
			 */
			goto next;
		}

		ret = ocfs2_extend_allocation(inode, cpos, alloc_size, 1);
		if (ret) {
			if (ret != -ENOSPC)
				mlog_errno(ret);
			goto out;
		}

next:
		cpos += alloc_size;
		clusters -= alloc_size;
	}

	ret = 0;
out:

	brelse(di_bh);
	return ret;
}

/*
 * Truncate a byte range, avoiding pages within partial clusters. This
 * preserves those pages for the zeroing code to write to.
 */
static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start,
					 u64 byte_len)
{
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	loff_t start, end;
	struct address_space *mapping = inode->i_mapping;

	start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start);
	end = byte_start + byte_len;
	end = end & ~(osb->s_clustersize - 1);

	if (start < end) {
		unmap_mapping_range(mapping, start, end - start, 0);
		truncate_inode_pages_range(mapping, start, end - 1);
	}
}

/*
 * Zero out partial blocks of one cluster.
 *
 * start: file offset where the zeroing starts; will be rounded up to
 *        the next block boundary.
 * len: trimmed so that "start + len" does not extend past the end of
 *      the current cluster.
 */
static int ocfs2_zeroout_partial_cluster(struct inode *inode,
					u64 start, u64 len)
{
	int ret;
	u64 start_block, end_block, nr_blocks;
	u64 p_block, offset;
	u32 cluster, p_cluster, nr_clusters;
	struct super_block *sb = inode->i_sb;
	u64 end = ocfs2_align_bytes_to_clusters(sb, start);

	if (start + len < end)
		end = start + len;

	start_block = ocfs2_blocks_for_bytes(sb, start);
	end_block = ocfs2_blocks_for_bytes(sb, end);
	nr_blocks = end_block - start_block;
	if (!nr_blocks)
		return 0;

	cluster = ocfs2_bytes_to_clusters(sb, start);
	ret = ocfs2_get_clusters(inode, cluster, &p_cluster,
				&nr_clusters, NULL);
	if (ret)
		return ret;
	if (!p_cluster)
		return 0;

	offset = start_block - ocfs2_clusters_to_blocks(sb, cluster);
	p_block = ocfs2_clusters_to_blocks(sb, p_cluster) + offset;
	return sb_issue_zeroout(sb, p_block, nr_blocks, GFP_NOFS);
}

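/*
 * Zero the parts of the first and last clusters of a hole-punch
 * request that fall outside cluster boundaries. Beyond EOF we issue a
 * direct zeroout to disk instead, since those blocks have no page
 * cache.
 */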
static int ocfs2_zero_partial_clusters(struct inode *inode,
				       u64 start, u64 len)
{
	int ret = 0;
	u64 tmpend = 0;
	u64 end = start + len;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	unsigned int csize = osb->s_clustersize;
	handle_t *handle;
	loff_t isize = i_size_read(inode);

	/*
	 * The "start" and "end" values are NOT necessarily part of
	 * the range whose allocation is being deleted. Rather, this
	 * is what the user passed in with the request. We must zero
	 * partial clusters here. There's no need to worry about
	 * physical allocation - the zeroing code knows to skip holes.
	 */
	trace_ocfs2_zero_partial_clusters(
		(unsigned long long)OCFS2_I(inode)->ip_blkno,
		(unsigned long long)start, (unsigned long long)end);

	/*
	 * If both edges are on a cluster boundary then there's no
	 * zeroing required as the region is part of the allocation to
	 * be truncated.
	 */
	if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0)
		goto out;

	/* No page cache for EOF blocks, issue zero out to disk. */
	if (end > isize) {
		/*
		 * Zero out the EOF blocks in the last cluster starting
		 * from "isize", even when "start" > "isize", because
		 * zeroing exactly at "start" is complicated: "start"
		 * may not be block-aligned, which would require a
		 * buffer write, and buffered writes beyond EOF are not
		 * supported.
		 */
		ret = ocfs2_zeroout_partial_cluster(inode, isize,
					end - isize);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
		if (start >= isize)
			goto out;
		end = isize;
	}
	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out;
	}

	/*
	 * If start is on a cluster boundary and end is somewhere in another
	 * cluster, we have not COWed the cluster starting at start, unless
	 * end is also within the same cluster. So, in this case, we skip
	 * this first call to ocfs2_zero_range_for_truncate() and move on
	 * to the next one.
	 */
	if ((start & (csize - 1)) != 0) {
		/*
		 * We want to get the byte offset of the end of the 1st
		 * cluster.
		 */
		tmpend = (u64)osb->s_clustersize +
			(start & ~(osb->s_clustersize - 1));
		if (tmpend > end)
			tmpend = end;

		trace_ocfs2_zero_partial_clusters_range1(
			(unsigned long long)start,
			(unsigned long long)tmpend);

		ret = ocfs2_zero_range_for_truncate(inode, handle, start,
						    tmpend);
		if (ret)
			mlog_errno(ret);
	}

	if (tmpend < end) {
		/*
		 * This may make start and end equal, but the zeroing
		 * code will skip any work in that case so there's no
		 * need to catch it up here.
		 */
		start = end & ~(osb->s_clustersize - 1);

		trace_ocfs2_zero_partial_clusters_range2(
			(unsigned long long)start, (unsigned long long)end);

		ret = ocfs2_zero_range_for_truncate(inode, handle, start, end);
		if (ret)
			mlog_errno(ret);
	}
	ocfs2_update_inode_fsync_trans(handle, inode, 1);

	ocfs2_commit_trans(osb, handle);
out:
	return ret;
}


static int ocfs2_find_rec(struct ocfs2_extent_list *el, u32 pos)
{
	int i;
	struct ocfs2_extent_rec *rec = NULL;

	for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {

		rec = &el->l_recs[i];

		if (le32_to_cpu(rec->e_cpos) < pos)
			break;
	}

	return i;
}

/*
 * Helper to calculate the punching position and length in one run.
 * We handle the following three cases in order:
 *
 * - remove the entire record
 * - remove a partial record
 * - no record needs to be removed (hole-punching completed)
 */
static void ocfs2_calc_trunc_pos(struct inode *inode,
				 struct ocfs2_extent_list *el,
				 struct ocfs2_extent_rec *rec,
				 u32 trunc_start, u32 *trunc_cpos,
				 u32 *trunc_len, u32 *trunc_end,
				 u64 *blkno, int *done)
{
	int ret = 0;
	u32 coff, range;

	range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);

	if (le32_to_cpu(rec->e_cpos) >= trunc_start) {
		/*
		 * remove an entire extent record.
		 */
		*trunc_cpos = le32_to_cpu(rec->e_cpos);
		/*
		 * Skip holes if any.
		 */
		if (range < *trunc_end)
			*trunc_end = range;
		*trunc_len = *trunc_end - le32_to_cpu(rec->e_cpos);
		*blkno = le64_to_cpu(rec->e_blkno);
		*trunc_end = le32_to_cpu(rec->e_cpos);
	} else if (range > trunc_start) {
		/*
		 * remove a partial extent record, which means we're
		 * removing the last extent record.
		 */
		*trunc_cpos = trunc_start;
		/*
		 * skip hole if any.
		 */
		if (range < *trunc_end)
			*trunc_end = range;
		*trunc_len = *trunc_end - trunc_start;
		coff = trunc_start - le32_to_cpu(rec->e_cpos);
		*blkno = le64_to_cpu(rec->e_blkno) +
				ocfs2_clusters_to_blocks(inode->i_sb, coff);
		*trunc_end = trunc_start;
	} else {
1747  		/*
1748  		 * There are two possibilities:
1749  		 *
1750  		 * - the last record has been removed
1751  		 * - trunc_start was within a hole
1752  		 *
1753  		 * Either case means hole punching is complete.
1754  		 */
1755  		ret = 1;
1756  	}
1757  
1758  	*done = ret;
1759  }
1760  
1761  int ocfs2_remove_inode_range(struct inode *inode,
1762  			     struct buffer_head *di_bh, u64 byte_start,
1763  			     u64 byte_len)
1764  {
1765  	int ret = 0, flags = 0, done = 0, i;
1766  	u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos;
1767  	u32 cluster_in_el;
1768  	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1769  	struct ocfs2_cached_dealloc_ctxt dealloc;
1770  	struct address_space *mapping = inode->i_mapping;
1771  	struct ocfs2_extent_tree et;
1772  	struct ocfs2_path *path = NULL;
1773  	struct ocfs2_extent_list *el = NULL;
1774  	struct ocfs2_extent_rec *rec = NULL;
1775  	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1776  	u64 blkno, refcount_loc = le64_to_cpu(di->i_refcount_loc);
1777  
1778  	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
1779  	ocfs2_init_dealloc_ctxt(&dealloc);
1780  
1781  	trace_ocfs2_remove_inode_range(
1782  			(unsigned long long)OCFS2_I(inode)->ip_blkno,
1783  			(unsigned long long)byte_start,
1784  			(unsigned long long)byte_len);
1785  
1786  	if (byte_len == 0)
1787  		return 0;
1788  
1789  	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1790  		int id_count = ocfs2_max_inline_data_with_xattr(inode->i_sb, di);
1791  
1792  		if (byte_start > id_count || byte_start + byte_len > id_count) {
1793  			ret = -EINVAL;
1794  			mlog_errno(ret);
1795  			goto out;
1796  		}
1797  
1798  		ret = ocfs2_truncate_inline(inode, di_bh, byte_start,
1799  					    byte_start + byte_len, 0);
1800  		if (ret) {
1801  			mlog_errno(ret);
1802  			goto out;
1803  		}
1804  		/*
1805  		 * There's no need to get fancy with the page cache
1806  		 * truncate of an inline-data inode. We're talking
1807  		 * about less than a page here, which will be cached
1808  		 * in the dinode buffer anyway.
1809  		 */
1810  		unmap_mapping_range(mapping, 0, 0, 0);
1811  		truncate_inode_pages(mapping, 0);
1812  		goto out;
1813  	}
1814  
1815  	/*
1816  	 * For refcounted (reflinked) files, we may need to CoW the two
1817  	 * clusters that will be partially zeroed later, in case the hole's
1818  	 * start and end offsets are not exactly cluster-aligned.
1819  	 */
1820  
1821  	if (ocfs2_is_refcount_inode(inode)) {
1822  		ret = ocfs2_cow_file_pos(inode, di_bh, byte_start);
1823  		if (ret) {
1824  			mlog_errno(ret);
1825  			goto out;
1826  		}
1827  
1828  		ret = ocfs2_cow_file_pos(inode, di_bh, byte_start + byte_len);
1829  		if (ret) {
1830  			mlog_errno(ret);
1831  			goto out;
1832  		}
1833  	}
1834  
1835  	trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
1836  	trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits;
1837  	cluster_in_el = trunc_end;
1838  
1839  	ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
1840  	if (ret) {
1841  		mlog_errno(ret);
1842  		goto out;
1843  	}
1844  
1845  	path = ocfs2_new_path_from_et(&et);
1846  	if (!path) {
1847  		ret = -ENOMEM;
1848  		mlog_errno(ret);
1849  		goto out;
1850  	}
1851  
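	/*
	 * Walk the extent records right-to-left, from trunc_end down toward
	 * trunc_start, punching each covered (possibly partial) record out
	 * of the btree until the whole range has been removed.
	 */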
1852  	while (trunc_end > trunc_start) {
1853  
1854  		ret = ocfs2_find_path(INODE_CACHE(inode), path,
1855  				      cluster_in_el);
1856  		if (ret) {
1857  			mlog_errno(ret);
1858  			goto out;
1859  		}
1860  
1861  		el = path_leaf_el(path);
1862  
1863  		i = ocfs2_find_rec(el, trunc_end);
1864  		/*
1865  		 * Need to go to the previous extent block.
1866  		 */
1867  		if (i < 0) {
1868  			if (path->p_tree_depth == 0)
1869  				break;
1870  
1871  			ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
1872  							    path,
1873  							    &cluster_in_el);
1874  			if (ret) {
1875  				mlog_errno(ret);
1876  				goto out;
1877  			}
1878  
1879  			/*
1880  			 * We've reached the leftmost extent block;
1881  			 * it's safe to leave.
1882  			 */
1883  			if (cluster_in_el == 0)
1884  				break;
1885  
1886  			/*
1887  			 * The 'pos' we searched with for the previous extent
1888  			 * block is always one cluster less than the actual trunc_end.
1889  			 */
1890  			trunc_end = cluster_in_el + 1;
1891  
1892  			ocfs2_reinit_path(path, 1);
1893  
1894  			continue;
1895  
1896  		} else
1897  			rec = &el->l_recs[i];
1898  
1899  		ocfs2_calc_trunc_pos(inode, el, rec, trunc_start, &trunc_cpos,
1900  				     &trunc_len, &trunc_end, &blkno, &done);
1901  		if (done)
1902  			break;
1903  
1904  		flags = rec->e_flags;
1905  		phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
1906  
1907  		ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
1908  					       phys_cpos, trunc_len, flags,
1909  					       &dealloc, refcount_loc, false);
1910  		if (ret < 0) {
1911  			mlog_errno(ret);
1912  			goto out;
1913  		}
1914  
1915  		cluster_in_el = trunc_end;
1916  
1917  		ocfs2_reinit_path(path, 1);
1918  	}
1919  
1920  	ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);
1921  
1922  out:
1923  	ocfs2_free_path(path);
1924  	ocfs2_schedule_truncate_log_flush(osb, 1);
1925  	ocfs2_run_deallocs(osb, &dealloc);
1926  
1927  	return ret;
1928  }
1929  
1930  /*
1931   * Parts of this function taken from xfs_change_file_space()
1932   */
1933  static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1934  				     loff_t f_pos, unsigned int cmd,
1935  				     struct ocfs2_space_resv *sr,
1936  				     int change_size)
1937  {
1938  	int ret;
1939  	s64 llen;
1940  	loff_t size, orig_isize;
1941  	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1942  	struct buffer_head *di_bh = NULL;
1943  	handle_t *handle;
1944  	unsigned long long max_off = inode->i_sb->s_maxbytes;
1945  
1946  	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
1947  		return -EROFS;
1948  
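	/*
	 * Lock ordering below is i_rwsem -> rw cluster lock -> inode
	 * cluster lock -> ip_alloc_sem; the exit paths unwind in reverse.
	 */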
1949  	inode_lock(inode);
1950  
1951  	/* Wait for all existing dio workers; newcomers will block on i_rwsem */
1952  	inode_dio_wait(inode);
1953  	/*
1954  	 * This prevents concurrent writes on other nodes
1955  	 */
1956  	ret = ocfs2_rw_lock(inode, 1);
1957  	if (ret) {
1958  		mlog_errno(ret);
1959  		goto out;
1960  	}
1961  
1962  	ret = ocfs2_inode_lock(inode, &di_bh, 1);
1963  	if (ret) {
1964  		mlog_errno(ret);
1965  		goto out_rw_unlock;
1966  	}
1967  
1968  	if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
1969  		ret = -EPERM;
1970  		goto out_inode_unlock;
1971  	}
1972  
1973  	switch (sr->l_whence) {
1974  	case 0: /*SEEK_SET*/
1975  		break;
1976  	case 1: /*SEEK_CUR*/
1977  		sr->l_start += f_pos;
1978  		break;
1979  	case 2: /*SEEK_END*/
1980  		sr->l_start += i_size_read(inode);
1981  		break;
1982  	default:
1983  		ret = -EINVAL;
1984  		goto out_inode_unlock;
1985  	}
1986  	sr->l_whence = 0;
1987  
1988  	llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len;
1989  
1990  	if (sr->l_start < 0
1991  	    || sr->l_start > max_off
1992  	    || (sr->l_start + llen) < 0
1993  	    || (sr->l_start + llen) > max_off) {
1994  		ret = -EINVAL;
1995  		goto out_inode_unlock;
1996  	}
1997  	size = sr->l_start + sr->l_len;
1998  
1999  	if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64 ||
2000  	    cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) {
2001  		if (sr->l_len <= 0) {
2002  			ret = -EINVAL;
2003  			goto out_inode_unlock;
2004  		}
2005  	}
2006  
2007  	if (file && setattr_should_drop_suidgid(&nop_mnt_idmap, file_inode(file))) {
2008  		ret = __ocfs2_write_remove_suid(inode, di_bh);
2009  		if (ret) {
2010  			mlog_errno(ret);
2011  			goto out_inode_unlock;
2012  		}
2013  	}
2014  
2015  	down_write(&OCFS2_I(inode)->ip_alloc_sem);
2016  	switch (cmd) {
2017  	case OCFS2_IOC_RESVSP:
2018  	case OCFS2_IOC_RESVSP64:
2019  		/*
2020  		 * This takes unsigned offsets, but the signed ones we
2021  		 * pass have been checked against overflow above.
2022  		 */
2023  		ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start,
2024  						       sr->l_len);
2025  		break;
2026  	case OCFS2_IOC_UNRESVSP:
2027  	case OCFS2_IOC_UNRESVSP64:
2028  		ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start,
2029  					       sr->l_len);
2030  		break;
2031  	default:
2032  		ret = -EINVAL;
2033  	}
2034  
2035  	orig_isize = i_size_read(inode);
2036  	/* Zero out the blocks between the old EOF and the new size in the cluster. */
2037  	if (!ret && change_size && orig_isize < size) {
2038  		ret = ocfs2_zeroout_partial_cluster(inode, orig_isize,
2039  					size - orig_isize);
2040  		if (!ret)
2041  			i_size_write(inode, size);
2042  	}
2043  	up_write(&OCFS2_I(inode)->ip_alloc_sem);
2044  	if (ret) {
2045  		mlog_errno(ret);
2046  		goto out_inode_unlock;
2047  	}
2048  
2049  	/*
2050  	 * We update c/mtime for these changes
2051  	 */
2052  	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
2053  	if (IS_ERR(handle)) {
2054  		ret = PTR_ERR(handle);
2055  		mlog_errno(ret);
2056  		goto out_inode_unlock;
2057  	}
2058  
2059  	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
2060  	ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
2061  	if (ret < 0)
2062  		mlog_errno(ret);
2063  
2064  	if (file && (file->f_flags & O_SYNC))
2065  		handle->h_sync = 1;
2066  
2067  	ocfs2_commit_trans(osb, handle);
2068  
2069  out_inode_unlock:
2070  	brelse(di_bh);
2071  	ocfs2_inode_unlock(inode, 1);
2072  out_rw_unlock:
2073  	ocfs2_rw_unlock(inode, 1);
2074  
2075  out:
2076  	inode_unlock(inode);
2077  	return ret;
2078  }
2079  
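/*
 * ioctl entry point for OCFS2_IOC_RESVSP(64)/OCFS2_IOC_UNRESVSP(64).
 * Reject commands the filesystem cannot honour (no unwritten-extent or
 * sparse-file support), then hand off to __ocfs2_change_file_space().
 */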
2080  int ocfs2_change_file_space(struct file *file, unsigned int cmd,
2081  			    struct ocfs2_space_resv *sr)
2082  {
2083  	struct inode *inode = file_inode(file);
2084  	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2085  	int ret;
2086  
2087  	if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
2088  	    !ocfs2_writes_unwritten_extents(osb))
2089  		return -ENOTTY;
2090  	else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) &&
2091  		 !ocfs2_sparse_alloc(osb))
2092  		return -ENOTTY;
2093  
2094  	if (!S_ISREG(inode->i_mode))
2095  		return -EINVAL;
2096  
2097  	if (!(file->f_mode & FMODE_WRITE))
2098  		return -EBADF;
2099  
2100  	ret = mnt_want_write_file(file);
2101  	if (ret)
2102  		return ret;
2103  	ret = __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0);
2104  	mnt_drop_write_file(file);
2105  	return ret;
2106  }
2107  
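/*
 * fallocate(2) is implemented on top of the space-reservation machinery
 * above: plain preallocation maps to OCFS2_IOC_RESVSP64 and
 * FALLOC_FL_PUNCH_HOLE to OCFS2_IOC_UNRESVSP64, with FALLOC_FL_KEEP_SIZE
 * suppressing the i_size update.
 */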
2108  static long ocfs2_fallocate(struct file *file, int mode, loff_t offset,
2109  			    loff_t len)
2110  {
2111  	struct inode *inode = file_inode(file);
2112  	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2113  	struct ocfs2_space_resv sr;
2114  	int change_size = 1;
2115  	int cmd = OCFS2_IOC_RESVSP64;
2116  	int ret = 0;
2117  
2118  	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
2119  		return -EOPNOTSUPP;
2120  	if (!ocfs2_writes_unwritten_extents(osb))
2121  		return -EOPNOTSUPP;
2122  
2123  	if (mode & FALLOC_FL_KEEP_SIZE) {
2124  		change_size = 0;
2125  	} else {
2126  		ret = inode_newsize_ok(inode, offset + len);
2127  		if (ret)
2128  			return ret;
2129  	}
2130  
2131  	if (mode & FALLOC_FL_PUNCH_HOLE)
2132  		cmd = OCFS2_IOC_UNRESVSP64;
2133  
2134  	sr.l_whence = 0;
2135  	sr.l_start = (s64)offset;
2136  	sr.l_len = (s64)len;
2137  
2138  	return __ocfs2_change_file_space(NULL, inode, offset, cmd, &sr,
2139  					 change_size);
2140  }
2141  
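/*
 * Return 1 if any allocated cluster in [pos, pos + count) carries the
 * OCFS2_EXT_REFCOUNTED flag, 0 if none does, or a negative errno if the
 * extent lookup fails.
 */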
2142  int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
2143  				   size_t count)
2144  {
2145  	int ret = 0;
2146  	unsigned int extent_flags;
2147  	u32 cpos, clusters, extent_len, phys_cpos;
2148  	struct super_block *sb = inode->i_sb;
2149  
2150  	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) ||
2151  	    !ocfs2_is_refcount_inode(inode) ||
2152  	    OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
2153  		return 0;
2154  
2155  	cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
2156  	clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
2157  
2158  	while (clusters) {
2159  		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
2160  					 &extent_flags);
2161  		if (ret < 0) {
2162  			mlog_errno(ret);
2163  			goto out;
2164  		}
2165  
2166  		if (phys_cpos && (extent_flags & OCFS2_EXT_REFCOUNTED)) {
2167  			ret = 1;
2168  			break;
2169  		}
2170  
2171  		if (extent_len > clusters)
2172  			extent_len = clusters;
2173  
2174  		clusters -= extent_len;
2175  		cpos += extent_len;
2176  	}
2177  out:
2178  	return ret;
2179  }
2180  
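/* Return 1 if either end of the I/O range is not block-size aligned. */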
2181  static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
2182  {
2183  	int blockmask = inode->i_sb->s_blocksize - 1;
2184  	loff_t final_size = pos + count;
2185  
2186  	if ((pos & blockmask) || (final_size & blockmask))
2187  		return 1;
2188  	return 0;
2189  }
2190  
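/*
 * Take the cluster inode lock together with ip_alloc_sem, honouring the
 * caller's choice of read vs. write semaphore and of blocking vs.
 * trylock behaviour; trylock failures surface as -EAGAIN so that
 * IOCB_NOWAIT callers can bail out cleanly.
 */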
2191  static int ocfs2_inode_lock_for_extent_tree(struct inode *inode,
2192  					    struct buffer_head **di_bh,
2193  					    int meta_level,
2194  					    int write_sem,
2195  					    int wait)
2196  {
2197  	int ret = 0;
2198  
2199  	if (wait)
2200  		ret = ocfs2_inode_lock(inode, di_bh, meta_level);
2201  	else
2202  		ret = ocfs2_try_inode_lock(inode, di_bh, meta_level);
2203  	if (ret < 0)
2204  		goto out;
2205  
2206  	if (wait) {
2207  		if (write_sem)
2208  			down_write(&OCFS2_I(inode)->ip_alloc_sem);
2209  		else
2210  			down_read(&OCFS2_I(inode)->ip_alloc_sem);
2211  	} else {
2212  		if (write_sem)
2213  			ret = down_write_trylock(&OCFS2_I(inode)->ip_alloc_sem);
2214  		else
2215  			ret = down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem);
2216  
2217  		if (!ret) {
2218  			ret = -EAGAIN;
2219  			goto out_unlock;
2220  		}
2221  	}
2222  
2223  	return ret;
2224  
2225  out_unlock:
2226  	brelse(*di_bh);
2227  	*di_bh = NULL;
2228  	ocfs2_inode_unlock(inode, meta_level);
2229  out:
2230  	return ret;
2231  }
2232  
2233  static void ocfs2_inode_unlock_for_extent_tree(struct inode *inode,
2234  					       struct buffer_head **di_bh,
2235  					       int meta_level,
2236  					       int write_sem)
2237  {
2238  	if (write_sem)
2239  		up_write(&OCFS2_I(inode)->ip_alloc_sem);
2240  	else
2241  		up_read(&OCFS2_I(inode)->ip_alloc_sem);
2242  
2243  	brelse(*di_bh);
2244  	*di_bh = NULL;
2245  
2246  	if (meta_level >= 0)
2247  		ocfs2_inode_unlock(inode, meta_level);
2248  }
2249  
2250  static int ocfs2_prepare_inode_for_write(struct file *file,
2251  					 loff_t pos, size_t count, int wait)
2252  {
2253  	int ret = 0, meta_level = 0, overwrite_io = 0;
2254  	int write_sem = 0;
2255  	struct dentry *dentry = file->f_path.dentry;
2256  	struct inode *inode = d_inode(dentry);
2257  	struct buffer_head *di_bh = NULL;
2258  	u32 cpos;
2259  	u32 clusters;
2260  
2261  	/*
2262  	 * We start with a read level meta lock and only jump to an EX
2263  	 * lock if we need to make modifications here.
2264  	 */
2265  	for(;;) {
2266  		ret = ocfs2_inode_lock_for_extent_tree(inode,
2267  						       &di_bh,
2268  						       meta_level,
2269  						       write_sem,
2270  						       wait);
2271  		if (ret < 0) {
2272  			if (ret != -EAGAIN)
2273  				mlog_errno(ret);
2274  			goto out;
2275  		}
2276  
2277  		/*
2278  		 * Check if the IO will overwrite allocated blocks in case
2279  		 * the IOCB_NOWAIT flag is set.
2280  		 */
2281  		if (!wait && !overwrite_io) {
2282  			overwrite_io = 1;
2283  
2284  			ret = ocfs2_overwrite_io(inode, di_bh, pos, count);
2285  			if (ret < 0) {
2286  				if (ret != -EAGAIN)
2287  					mlog_errno(ret);
2288  				goto out_unlock;
2289  			}
2290  		}
2291  
2292  		/* Clear suid / sgid if necessary. We do this here
2293  		 * instead of later in the write path because
2294  		 * remove_suid() calls ->setattr without any hint that
2295  		 * we may have already done our cluster locking. Since
2296  		 * ocfs2_setattr() *must* take cluster locks to
2297  		 * proceed, this will lead us to recursively lock the
2298  		 * inode. There's also the dinode i_size state which
2299  		 * can be lost via setattr during extending writes (we
2300  		 * set inode->i_size at the end of a write). */
2301  		if (setattr_should_drop_suidgid(&nop_mnt_idmap, inode)) {
2302  			if (meta_level == 0) {
2303  				ocfs2_inode_unlock_for_extent_tree(inode,
2304  								   &di_bh,
2305  								   meta_level,
2306  								   write_sem);
2307  				meta_level = 1;
2308  				continue;
2309  			}
2310  
2311  			ret = ocfs2_write_remove_suid(inode);
2312  			if (ret < 0) {
2313  				mlog_errno(ret);
2314  				goto out_unlock;
2315  			}
2316  		}
2317  
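		/*
		 * A refcounted (shared) range must be CoWed before it can be
		 * written to, which requires the EX meta lock and
		 * ip_alloc_sem held for write.  If the check below reports a
		 * shared range, retake the locks at the stronger level and
		 * CoW the affected clusters.
		 */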
2318  		ret = ocfs2_check_range_for_refcount(inode, pos, count);
2319  		if (ret == 1) {
2320  			ocfs2_inode_unlock_for_extent_tree(inode,
2321  							   &di_bh,
2322  							   meta_level,
2323  							   write_sem);
2324  			meta_level = 1;
2325  			write_sem = 1;
2326  			ret = ocfs2_inode_lock_for_extent_tree(inode,
2327  							       &di_bh,
2328  							       meta_level,
2329  							       write_sem,
2330  							       wait);
2331  			if (ret < 0) {
2332  				if (ret != -EAGAIN)
2333  					mlog_errno(ret);
2334  				goto out;
2335  			}
2336  
2337  			cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
2338  			clusters =
2339  				ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
2340  			ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
2341  		}
2342  
2343  		if (ret < 0) {
2344  			if (ret != -EAGAIN)
2345  				mlog_errno(ret);
2346  			goto out_unlock;
2347  		}
2348  
2349  		break;
2350  	}
2351  
2352  out_unlock:
2353  	trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
2354  					    pos, count, wait);
2355  
2356  	ocfs2_inode_unlock_for_extent_tree(inode,
2357  					   &di_bh,
2358  					   meta_level,
2359  					   write_sem);
2360  
2361  out:
2362  	return ret;
2363  }
2364  
2365  static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
2366  				    struct iov_iter *from)
2367  {
2368  	int rw_level;
2369  	ssize_t written = 0;
2370  	ssize_t ret;
2371  	size_t count = iov_iter_count(from);
2372  	struct file *file = iocb->ki_filp;
2373  	struct inode *inode = file_inode(file);
2374  	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2375  	int full_coherency = !(osb->s_mount_opt &
2376  			       OCFS2_MOUNT_COHERENCY_BUFFERED);
2377  	void *saved_ki_complete = NULL;
2378  	int append_write = ((iocb->ki_pos + count) >=
2379  			i_size_read(inode) ? 1 : 0);
2380  	int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
2381  	int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0;
2382  
2383  	trace_ocfs2_file_write_iter(inode, file, file->f_path.dentry,
2384  		(unsigned long long)OCFS2_I(inode)->ip_blkno,
2385  		file->f_path.dentry->d_name.len,
2386  		file->f_path.dentry->d_name.name,
2387  		(unsigned int)from->nr_segs);	/* GRRRRR */
2388  
2389  	if (!direct_io && nowait)
2390  		return -EOPNOTSUPP;
2391  
2392  	if (count == 0)
2393  		return 0;
2394  
2395  	if (nowait) {
2396  		if (!inode_trylock(inode))
2397  			return -EAGAIN;
2398  	} else
2399  		inode_lock(inode);
2400  
2401  	/*
2402  	 * Concurrent O_DIRECT writes are allowed with the mount option
2403  	 * "coherency=buffered".
2404  	 * For append writes, we must take the rw lock EX.
2405  	 */
2406  	rw_level = (!direct_io || full_coherency || append_write);
2407  
2408  	if (nowait)
2409  		ret = ocfs2_try_rw_lock(inode, rw_level);
2410  	else
2411  		ret = ocfs2_rw_lock(inode, rw_level);
2412  	if (ret < 0) {
2413  		if (ret != -EAGAIN)
2414  			mlog_errno(ret);
2415  		goto out_mutex;
2416  	}
2417  
2418  	/*
2419  	 * O_DIRECT writes with "coherency=full" need to take EX cluster
2420  	 * inode_lock to guarantee coherency.
2421  	 */
2422  	if (direct_io && full_coherency) {
2423  		/*
2424  		 * We need to take and drop the inode lock to force
2425  		 * other nodes to drop their caches.  Buffered I/O
2426  		 * already does this in write_begin().
2427  		 */
2428  		if (nowait)
2429  			ret = ocfs2_try_inode_lock(inode, NULL, 1);
2430  		else
2431  			ret = ocfs2_inode_lock(inode, NULL, 1);
2432  		if (ret < 0) {
2433  			if (ret != -EAGAIN)
2434  				mlog_errno(ret);
2435  			goto out;
2436  		}
2437  
2438  		ocfs2_inode_unlock(inode, 1);
2439  	}
2440  
2441  	ret = generic_write_checks(iocb, from);
2442  	if (ret <= 0) {
2443  		if (ret)
2444  			mlog_errno(ret);
2445  		goto out;
2446  	}
2447  	count = ret;
2448  
2449  	ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count, !nowait);
2450  	if (ret < 0) {
2451  		if (ret != -EAGAIN)
2452  			mlog_errno(ret);
2453  		goto out;
2454  	}
2455  
2456  	if (direct_io && !is_sync_kiocb(iocb) &&
2457  	    ocfs2_is_io_unaligned(inode, count, iocb->ki_pos)) {
2458  		/*
2459  		 * Make it a sync io if it's an unaligned aio.
2460  		 */
2461  		saved_ki_complete = xchg(&iocb->ki_complete, NULL);
2462  	}
2463  
2464  	/* communicate with ocfs2_dio_end_io */
2465  	ocfs2_iocb_set_rw_locked(iocb, rw_level);
2466  
2467  	written = __generic_file_write_iter(iocb, from);
2468  	/* buffered aio wouldn't have proper lock coverage today */
2469  	BUG_ON(written == -EIOCBQUEUED && !direct_io);
2470  
2471  	/*
2472  	 * Deep in g_f_a_w_n()->ocfs2_direct_IO we pass in an ocfs2_dio_end_io
2473  	 * function pointer which is called when O_DIRECT io completes so that
2474  	 * it can unlock our rw lock.
2475  	 * Unfortunately there are error cases which call end_io and others
2476  	 * that don't, so we don't have to unlock the rw_lock if either an
2477  	 * async dio is going to do it in the future or an end_io after an
2478  	 * error has already done it.
2479  	 */
2480  	if ((written == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
2481  		rw_level = -1;
2482  	}
2483  
2484  	if (unlikely(written <= 0))
2485  		goto out;
2486  
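	/*
	 * Buffered O_DSYNC writes (and writes to IS_SYNC inodes) are made
	 * durable by hand: write out the dirty range, force a jbd2 commit
	 * so the journalled metadata is on disk, then wait for the data
	 * writeback to finish.
	 */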
2487  	if (((file->f_flags & O_DSYNC) && !direct_io) ||
2488  	    IS_SYNC(inode)) {
2489  		ret = filemap_fdatawrite_range(file->f_mapping,
2490  					       iocb->ki_pos - written,
2491  					       iocb->ki_pos - 1);
2492  		if (ret < 0)
2493  			written = ret;
2494  
2495  		if (!ret) {
2496  			ret = jbd2_journal_force_commit(osb->journal->j_journal);
2497  			if (ret < 0)
2498  				written = ret;
2499  		}
2500  
2501  		if (!ret)
2502  			ret = filemap_fdatawait_range(file->f_mapping,
2503  						      iocb->ki_pos - written,
2504  						      iocb->ki_pos - 1);
2505  	}
2506  
2507  out:
2508  	if (saved_ki_complete)
2509  		xchg(&iocb->ki_complete, saved_ki_complete);
2510  
2511  	if (rw_level != -1)
2512  		ocfs2_rw_unlock(inode, rw_level);
2513  
2514  out_mutex:
2515  	inode_unlock(inode);
2516  
2517  	if (written)
2518  		ret = written;
2519  	return ret;
2520  }
2521  
2522  static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
2523  				   struct iov_iter *to)
2524  {
2525  	int ret = 0, rw_level = -1, lock_level = 0;
2526  	struct file *filp = iocb->ki_filp;
2527  	struct inode *inode = file_inode(filp);
2528  	int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
2529  	int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0;
2530  
2531  	trace_ocfs2_file_read_iter(inode, filp, filp->f_path.dentry,
2532  			(unsigned long long)OCFS2_I(inode)->ip_blkno,
2533  			filp->f_path.dentry->d_name.len,
2534  			filp->f_path.dentry->d_name.name,
2535  			to->nr_segs);	/* GRRRRR */
2536  
2538  	if (!inode) {
2539  		ret = -EINVAL;
2540  		mlog_errno(ret);
2541  		goto bail;
2542  	}
2543  
2544  	if (!direct_io && nowait)
2545  		return -EOPNOTSUPP;
2546  
2547  	/*
2548  	 * buffered reads protect themselves in ->read_folio().  O_DIRECT reads
2549  	 * need locks to protect pending reads from racing with truncate.
2550  	 */
2551  	if (direct_io) {
2552  		if (nowait)
2553  			ret = ocfs2_try_rw_lock(inode, 0);
2554  		else
2555  			ret = ocfs2_rw_lock(inode, 0);
2556  
2557  		if (ret < 0) {
2558  			if (ret != -EAGAIN)
2559  				mlog_errno(ret);
2560  			goto bail;
2561  		}
2562  		rw_level = 0;
2563  		/* communicate with ocfs2_dio_end_io */
2564  		ocfs2_iocb_set_rw_locked(iocb, rw_level);
2565  	}
2566  
2567  	/*
2568  	 * We're fine letting folks race truncates and extending
2569  	 * writes with read across the cluster, just like they can
2570  	 * locally. Hence no rw_lock during read.
2571  	 *
2572  	 * Take and drop the meta data lock to update inode fields
2573  	 * like i_size. This gives the checks down in
2574  	 * generic_file_read_iter() a chance of actually working.
2575  	 */
2576  	ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level,
2577  				     !nowait);
2578  	if (ret < 0) {
2579  		if (ret != -EAGAIN)
2580  			mlog_errno(ret);
2581  		goto bail;
2582  	}
2583  	ocfs2_inode_unlock(inode, lock_level);
2584  
2585  	ret = generic_file_read_iter(iocb, to);
2586  	trace_generic_file_read_iter_ret(ret);
2587  
2588  	/* buffered aio wouldn't have proper lock coverage today */
2589  	BUG_ON(ret == -EIOCBQUEUED && !direct_io);
2590  
2591  	/* see ocfs2_file_write_iter */
2592  	if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
2593  		rw_level = -1;
2594  	}
2595  
2596  bail:
2597  	if (rw_level != -1)
2598  		ocfs2_rw_unlock(inode, rw_level);
2599  
2600  	return ret;
2601  }
2602  
2603  static ssize_t ocfs2_file_splice_read(struct file *in, loff_t *ppos,
2604  				      struct pipe_inode_info *pipe,
2605  				      size_t len, unsigned int flags)
2606  {
2607  	struct inode *inode = file_inode(in);
2608  	ssize_t ret = 0;
2609  	int lock_level = 0;
2610  
2611  	trace_ocfs2_file_splice_read(inode, in, in->f_path.dentry,
2612  				     (unsigned long long)OCFS2_I(inode)->ip_blkno,
2613  				     in->f_path.dentry->d_name.len,
2614  				     in->f_path.dentry->d_name.name,
2615  				     flags);
2616  
2617  	/*
2618  	 * We're fine letting folks race truncates and extending writes with
2619  	 * read across the cluster, just like they can locally.  Hence no
2620  	 * rw_lock during read.
2621  	 *
2622  	 * Take and drop the meta data lock to update inode fields like i_size.
2623  	 * This gives the checks down in filemap_splice_read() a chance of
2624  	 * actually working.
2625  	 */
2626  	ret = ocfs2_inode_lock_atime(inode, in->f_path.mnt, &lock_level, 1);
2627  	if (ret < 0) {
2628  		if (ret != -EAGAIN)
2629  			mlog_errno(ret);
2630  		goto bail;
2631  	}
2632  	ocfs2_inode_unlock(inode, lock_level);
2633  
2634  	ret = filemap_splice_read(in, ppos, pipe, len, flags);
2635  	trace_filemap_splice_read_ret(ret);
2636  bail:
2637  	return ret;
2638  }
2639  
2640  /* See generic_file_llseek_unlocked() */
2641  static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
2642  {
2643  	struct inode *inode = file->f_mapping->host;
2644  	int ret = 0;
2645  
2646  	inode_lock(inode);
2647  
2648  	switch (whence) {
2649  	case SEEK_SET:
2650  		break;
2651  	case SEEK_END:
2652  		/* SEEK_END requires the OCFS2 inode lock for the file
2653  		 * because it references the file's size.
2654  		 */
2655  		ret = ocfs2_inode_lock(inode, NULL, 0);
2656  		if (ret < 0) {
2657  			mlog_errno(ret);
2658  			goto out;
2659  		}
2660  		offset += i_size_read(inode);
2661  		ocfs2_inode_unlock(inode, 0);
2662  		break;
2663  	case SEEK_CUR:
2664  		if (offset == 0) {
2665  			offset = file->f_pos;
2666  			goto out;
2667  		}
2668  		offset += file->f_pos;
2669  		break;
2670  	case SEEK_DATA:
2671  	case SEEK_HOLE:
2672  		ret = ocfs2_seek_data_hole_offset(file, &offset, whence);
2673  		if (ret)
2674  			goto out;
2675  		break;
2676  	default:
2677  		ret = -EINVAL;
2678  		goto out;
2679  	}
2680  
2681  	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
2682  
2683  out:
2684  	inode_unlock(inode);
2685  	if (ret)
2686  		return ret;
2687  	return offset;
2688  }
2689  
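/*
 * reflink/dedupe entry point: both inodes stay cluster-locked for the
 * duration, the destination's page cache is invalidated over the target
 * range, and the extents are shared via ocfs2_reflink_remap_blocks().
 */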
2690  static loff_t ocfs2_remap_file_range(struct file *file_in, loff_t pos_in,
2691  				     struct file *file_out, loff_t pos_out,
2692  				     loff_t len, unsigned int remap_flags)
2693  {
2694  	struct inode *inode_in = file_inode(file_in);
2695  	struct inode *inode_out = file_inode(file_out);
2696  	struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb);
2697  	struct buffer_head *in_bh = NULL, *out_bh = NULL;
2698  	bool same_inode = (inode_in == inode_out);
2699  	loff_t remapped = 0;
2700  	ssize_t ret;
2701  
2702  	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
2703  		return -EINVAL;
2704  	if (!ocfs2_refcount_tree(osb))
2705  		return -EOPNOTSUPP;
2706  	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
2707  		return -EROFS;
2708  
2709  	/* Lock both files against IO */
2710  	ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh);
2711  	if (ret)
2712  		return ret;
2713  
2714  	/* Check file eligibility and prepare for block sharing. */
2715  	ret = -EINVAL;
2716  	if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) ||
2717  	    (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE))
2718  		goto out_unlock;
2719  
2720  	ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
2721  			&len, remap_flags);
2722  	if (ret < 0 || len == 0)
2723  		goto out_unlock;
2724  
2725  	/* Lock out changes to the allocation maps and remap. */
2726  	down_write(&OCFS2_I(inode_in)->ip_alloc_sem);
2727  	if (!same_inode)
2728  		down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem,
2729  				  SINGLE_DEPTH_NESTING);
2730  
2731  	/* Zap any page cache for the destination file's range. */
2732  	truncate_inode_pages_range(&inode_out->i_data,
2733  				   round_down(pos_out, PAGE_SIZE),
2734  				   round_up(pos_out + len, PAGE_SIZE) - 1);
2735  
2736  	remapped = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in,
2737  			inode_out, out_bh, pos_out, len);
2738  	up_write(&OCFS2_I(inode_in)->ip_alloc_sem);
2739  	if (!same_inode)
2740  		up_write(&OCFS2_I(inode_out)->ip_alloc_sem);
2741  	if (remapped < 0) {
2742  		ret = remapped;
2743  		mlog_errno(ret);
2744  		goto out_unlock;
2745  	}
2746  
2747  	/*
2748  	 * Empty the extent map so that we may get the right extent
2749  	 * record from the disk.
2750  	 */
2751  	ocfs2_extent_map_trunc(inode_in, 0);
2752  	ocfs2_extent_map_trunc(inode_out, 0);
2753  
2754  	ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len);
2755  	if (ret) {
2756  		mlog_errno(ret);
2757  		goto out_unlock;
2758  	}
2759  
2760  out_unlock:
2761  	ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
2762  	return remapped > 0 ? remapped : ret;
2763  }
2764  
2765  static loff_t ocfs2_dir_llseek(struct file *file, loff_t offset, int whence)
2766  {
2767  	struct ocfs2_file_private *fp = file->private_data;
2768  
2769  	return generic_llseek_cookie(file, offset, whence, &fp->cookie);
2770  }
2771  
2772  const struct inode_operations ocfs2_file_iops = {
2773  	.setattr	= ocfs2_setattr,
2774  	.getattr	= ocfs2_getattr,
2775  	.permission	= ocfs2_permission,
2776  	.listxattr	= ocfs2_listxattr,
2777  	.fiemap		= ocfs2_fiemap,
2778  	.get_inode_acl	= ocfs2_iop_get_acl,
2779  	.set_acl	= ocfs2_iop_set_acl,
2780  	.fileattr_get	= ocfs2_fileattr_get,
2781  	.fileattr_set	= ocfs2_fileattr_set,
2782  };
2783  
2784  const struct inode_operations ocfs2_special_file_iops = {
2785  	.setattr	= ocfs2_setattr,
2786  	.getattr	= ocfs2_getattr,
2787  	.listxattr	= ocfs2_listxattr,
2788  	.permission	= ocfs2_permission,
2789  	.get_inode_acl	= ocfs2_iop_get_acl,
2790  	.set_acl	= ocfs2_iop_set_acl,
2791  };
2792  
2793  /*
2794   * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with
2795   * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
2796   */
2797  const struct file_operations ocfs2_fops = {
2798  	.llseek		= ocfs2_file_llseek,
2799  	.mmap		= ocfs2_mmap,
2800  	.fsync		= ocfs2_sync_file,
2801  	.release	= ocfs2_file_release,
2802  	.open		= ocfs2_file_open,
2803  	.read_iter	= ocfs2_file_read_iter,
2804  	.write_iter	= ocfs2_file_write_iter,
2805  	.unlocked_ioctl	= ocfs2_ioctl,
2806  #ifdef CONFIG_COMPAT
2807  	.compat_ioctl   = ocfs2_compat_ioctl,
2808  #endif
2809  	.lock		= ocfs2_lock,
2810  	.flock		= ocfs2_flock,
2811  	.splice_read	= ocfs2_file_splice_read,
2812  	.splice_write	= iter_file_splice_write,
2813  	.fallocate	= ocfs2_fallocate,
2814  	.remap_file_range = ocfs2_remap_file_range,
2815  };
2816  
2817  WRAP_DIR_ITER(ocfs2_readdir) // FIXME!
2818  const struct file_operations ocfs2_dops = {
2819  	.llseek		= ocfs2_dir_llseek,
2820  	.read		= generic_read_dir,
2821  	.iterate_shared	= shared_ocfs2_readdir,
2822  	.fsync		= ocfs2_sync_file,
2823  	.release	= ocfs2_dir_release,
2824  	.open		= ocfs2_dir_open,
2825  	.unlocked_ioctl	= ocfs2_ioctl,
2826  #ifdef CONFIG_COMPAT
2827  	.compat_ioctl   = ocfs2_compat_ioctl,
2828  #endif
2829  	.lock		= ocfs2_lock,
2830  	.flock		= ocfs2_flock,
2831  };
2832  
2833  /*
2834   * POSIX-lockless variants of our file_operations.
2835   *
2836   * These will be used if the underlying cluster stack does not support
2837   * posix file locking, if the user passes the "localflocks" mount
2838   * option, or if we have a local-only fs.
2839   *
2840   * ocfs2_flock is in here because all stacks handle UNIX file locks,
2841   * so we still want it in the case of no stack support for
2842   * plocks. Internally, it will do the right thing when asked to ignore
2843   * the cluster.
2844   */
2845  const struct file_operations ocfs2_fops_no_plocks = {
2846  	.llseek		= ocfs2_file_llseek,
2847  	.mmap		= ocfs2_mmap,
2848  	.fsync		= ocfs2_sync_file,
2849  	.release	= ocfs2_file_release,
2850  	.open		= ocfs2_file_open,
2851  	.read_iter	= ocfs2_file_read_iter,
2852  	.write_iter	= ocfs2_file_write_iter,
2853  	.unlocked_ioctl	= ocfs2_ioctl,
2854  #ifdef CONFIG_COMPAT
2855  	.compat_ioctl   = ocfs2_compat_ioctl,
2856  #endif
2857  	.flock		= ocfs2_flock,
2858  	.splice_read	= filemap_splice_read,
2859  	.splice_write	= iter_file_splice_write,
2860  	.fallocate	= ocfs2_fallocate,
2861  	.remap_file_range = ocfs2_remap_file_range,
2862  };
2863  
2864  const struct file_operations ocfs2_dops_no_plocks = {
2865  	.llseek		= ocfs2_dir_llseek,
2866  	.read		= generic_read_dir,
2867  	.iterate_shared	= shared_ocfs2_readdir,
2868  	.fsync		= ocfs2_sync_file,
2869  	.release	= ocfs2_dir_release,
2870  	.open		= ocfs2_dir_open,
2871  	.unlocked_ioctl	= ocfs2_ioctl,
2872  #ifdef CONFIG_COMPAT
2873  	.compat_ioctl   = ocfs2_compat_ioctl,
2874  #endif
2875  	.flock		= ocfs2_flock,
2876  };
2877