1  // SPDX-License-Identifier: GPL-2.0-only
2  /*
3   * linux/fs/nfs/direct.c
4   *
5   * Copyright (C) 2003 by Chuck Lever <cel@netapp.com>
6   *
7   * High-performance uncached I/O for the Linux NFS client
8   *
9   * There are important applications whose performance or correctness
10   * depends on uncached access to file data.  Database clusters
11   * (multiple copies of the same instance running on separate hosts)
12   * implement their own cache coherency protocol that subsumes file
13   * system cache protocols.  Applications that process datasets
14   * considerably larger than the client's memory do not always benefit
15   * from a local cache.  A streaming video server, for instance, has no
16   * need to cache the contents of a file.
17   *
18   * When an application requests uncached I/O, all read and write requests
19   * are made directly to the server; data stored or fetched via these
20   * requests is not cached in the Linux page cache.  The client does not
21   * correct unaligned requests from applications.  All requested bytes are
22   * held on permanent storage before a direct write system call returns to
23   * an application.
24   *
25   * Solaris implements an uncached I/O facility called directio() that
26   * is used for backups and sequential I/O to very large files.  Solaris
27   * also supports uncaching whole NFS partitions with "-o forcedirectio,"
28   * an undocumented mount option.
29   *
30   * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with
31   * help from Andrew Morton.
32   *
33   * 18 Dec 2001	Initial implementation for 2.4  --cel
34   * 08 Jul 2002	Version for 2.4.19, with bug fixes --trondmy
35   * 08 Jun 2003	Port to 2.5 APIs  --cel
36   * 31 Mar 2004	Handle direct I/O without VFS support  --cel
37   * 15 Sep 2004	Parallel async reads  --cel
38   * 04 May 2005	support O_DIRECT with aio  --cel
39   *
40   */
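/*
 * Illustrative userspace sketch (not part of this file): an application
 * typically requests uncached I/O by opening the file with O_DIRECT and
 * transferring data through a suitably aligned buffer.  The path, alignment
 * and length below are only examples; as noted above, this client does not
 * correct unaligned requests.
 *
 *	int fd = open("/mnt/nfs/data", O_RDWR | O_DIRECT);
 *	void *buf;
 *	posix_memalign(&buf, 4096, 1 << 20);
 *	pread(fd, buf, 1 << 20, 0);	- bypasses the client page cache
 *	pwrite(fd, buf, 1 << 20, 0);	- on stable storage before returning
 */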
41  
42  #include <linux/errno.h>
43  #include <linux/sched.h>
44  #include <linux/kernel.h>
45  #include <linux/file.h>
46  #include <linux/pagemap.h>
47  #include <linux/kref.h>
48  #include <linux/slab.h>
49  #include <linux/task_io_accounting_ops.h>
50  #include <linux/module.h>
51  
52  #include <linux/nfs_fs.h>
53  #include <linux/nfs_page.h>
54  #include <linux/sunrpc/clnt.h>
55  
56  #include <linux/uaccess.h>
57  #include <linux/atomic.h>
58  
59  #include "internal.h"
60  #include "iostat.h"
61  #include "pnfs.h"
62  #include "fscache.h"
63  #include "nfstrace.h"
64  
65  #define NFSDBG_FACILITY		NFSDBG_VFS
66  
67  static struct kmem_cache *nfs_direct_cachep;
68  
69  static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops;
70  static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops;
71  static void nfs_direct_write_complete(struct nfs_direct_req *dreq);
72  static void nfs_direct_write_schedule_work(struct work_struct *work);
73  
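/*
 * dreq->io_count counts the I/O still in flight for a request, plus one
 * reference held by the function that is scheduling the I/O.  get_dreq()
 * takes a reference as each pgio header is set up; put_dreq() drops one
 * and returns true for the final put, at which point the caller is
 * responsible for completing the whole direct request.
 */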
74  static inline void get_dreq(struct nfs_direct_req *dreq)
75  {
76  	atomic_inc(&dreq->io_count);
77  }
78  
79  static inline int put_dreq(struct nfs_direct_req *dreq)
80  {
81  	return atomic_dec_and_test(&dreq->io_count);
82  }
83  
84  static void
85  nfs_direct_handle_truncated(struct nfs_direct_req *dreq,
86  			    const struct nfs_pgio_header *hdr,
87  			    ssize_t dreq_len)
88  {
89  	if (!(test_bit(NFS_IOHDR_ERROR, &hdr->flags) ||
90  	      test_bit(NFS_IOHDR_EOF, &hdr->flags)))
91  		return;
92  	if (dreq->max_count >= dreq_len) {
93  		dreq->max_count = dreq_len;
94  		if (dreq->count > dreq_len)
95  			dreq->count = dreq_len;
96  	}
97  
98  	if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && !dreq->error)
99  		dreq->error = hdr->error;
100  }
101  
102  static void
103  nfs_direct_count_bytes(struct nfs_direct_req *dreq,
104  		       const struct nfs_pgio_header *hdr)
105  {
106  	loff_t hdr_end = hdr->io_start + hdr->good_bytes;
107  	ssize_t dreq_len = 0;
108  
109  	if (hdr_end > dreq->io_start)
110  		dreq_len = hdr_end - dreq->io_start;
111  
112  	nfs_direct_handle_truncated(dreq, hdr, dreq_len);
113  
114  	if (dreq_len > dreq->max_count)
115  		dreq_len = dreq->max_count;
116  
117  	if (dreq->count < dreq_len)
118  		dreq->count = dreq_len;
119  }
120  
121  static void nfs_direct_truncate_request(struct nfs_direct_req *dreq,
122  					struct nfs_page *req)
123  {
124  	loff_t offs = req_offset(req);
125  	size_t req_start = (size_t)(offs - dreq->io_start);
126  
127  	if (req_start < dreq->max_count)
128  		dreq->max_count = req_start;
129  	if (req_start < dreq->count)
130  		dreq->count = req_start;
131  }
132  
133  /**
134   * nfs_swap_rw - NFS address space operation for swap I/O
135   * @iocb: target I/O control block
136   * @iter: I/O buffer
137   *
138   * Perform IO to the swap-file.  This is much like direct IO.
139   */
140  int nfs_swap_rw(struct kiocb *iocb, struct iov_iter *iter)
141  {
142  	ssize_t ret;
143  
144  	if (iov_iter_rw(iter) == READ)
145  		ret = nfs_file_direct_read(iocb, iter, true);
146  	else
147  		ret = nfs_file_direct_write(iocb, iter, true);
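	/*
	 * ->swap_rw callers treat any nonzero return as failure, so a
	 * positive byte count is folded down to zero below.
	 */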
148  	if (ret < 0)
149  		return ret;
150  	return 0;
151  }
152  
153  static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
154  {
155  	unsigned int i;
156  	for (i = 0; i < npages; i++)
157  		put_page(pages[i]);
158  }
159  
160  void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
161  			      struct nfs_direct_req *dreq)
162  {
163  	cinfo->inode = dreq->inode;
164  	cinfo->mds = &dreq->mds_cinfo;
165  	cinfo->ds = &dreq->ds_cinfo;
166  	cinfo->dreq = dreq;
167  	cinfo->completion_ops = &nfs_direct_commit_completion_ops;
168  }
169  
170  static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
171  {
172  	struct nfs_direct_req *dreq;
173  
174  	dreq = kmem_cache_zalloc(nfs_direct_cachep, GFP_KERNEL);
175  	if (!dreq)
176  		return NULL;
177  
178  	kref_init(&dreq->kref);
179  	kref_get(&dreq->kref);
180  	init_completion(&dreq->completion);
181  	INIT_LIST_HEAD(&dreq->mds_cinfo.list);
182  	pnfs_init_ds_commit_info(&dreq->ds_cinfo);
183  	INIT_WORK(&dreq->work, nfs_direct_write_schedule_work);
184  	spin_lock_init(&dreq->lock);
185  
186  	return dreq;
187  }
188  
189  static void nfs_direct_req_free(struct kref *kref)
190  {
191  	struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);
192  
193  	pnfs_release_ds_info(&dreq->ds_cinfo, dreq->inode);
194  	if (dreq->l_ctx != NULL)
195  		nfs_put_lock_context(dreq->l_ctx);
196  	if (dreq->ctx != NULL)
197  		put_nfs_open_context(dreq->ctx);
198  	kmem_cache_free(nfs_direct_cachep, dreq);
199  }
200  
201  static void nfs_direct_req_release(struct nfs_direct_req *dreq)
202  {
203  	kref_put(&dreq->kref, nfs_direct_req_free);
204  }
205  
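/*
 * Number of bytes of the original request that lie at or beyond @offset.
 * Worked example: with io_start = 0 and max_count = 1 MB, an offset of
 * 512 KB leaves 512 KB still to transfer.
 */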
206  ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq, loff_t offset)
207  {
208  	loff_t start = offset - dreq->io_start;
209  	return dreq->max_count - start;
210  }
211  EXPORT_SYMBOL_GPL(nfs_dreq_bytes_left);
212  
213  /*
214   * Collects and returns the final error value/byte-count.
215   */
216  static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq)
217  {
218  	ssize_t result = -EIOCBQUEUED;
219  
220  	/* Async requests don't wait here */
221  	if (dreq->iocb)
222  		goto out;
223  
224  	result = wait_for_completion_killable(&dreq->completion);
225  
226  	if (!result) {
227  		result = dreq->count;
228  		WARN_ON_ONCE(dreq->count < 0);
229  	}
230  	if (!result)
231  		result = dreq->error;
232  
233  out:
234  	return (ssize_t) result;
235  }
236  
237  /*
238   * Synchronous I/O uses a stack-allocated iocb.  Thus we can't trust
239   * the iocb is still valid here if this is a synchronous request.
240   */
241  static void nfs_direct_complete(struct nfs_direct_req *dreq)
242  {
243  	struct inode *inode = dreq->inode;
244  
245  	inode_dio_end(inode);
246  
247  	if (dreq->iocb) {
248  		long res = (long) dreq->error;
249  		if (dreq->count != 0) {
250  			res = (long) dreq->count;
251  			WARN_ON_ONCE(dreq->count < 0);
252  		}
253  		dreq->iocb->ki_complete(dreq->iocb, res);
254  	}
255  
256  	complete(&dreq->completion);
257  
258  	nfs_direct_req_release(dreq);
259  }
260  
261  static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
262  {
263  	unsigned long bytes = 0;
264  	struct nfs_direct_req *dreq = hdr->dreq;
265  
266  	spin_lock(&dreq->lock);
267  	if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) {
268  		spin_unlock(&dreq->lock);
269  		goto out_put;
270  	}
271  
272  	nfs_direct_count_bytes(dreq, hdr);
273  	spin_unlock(&dreq->lock);
274  
275  	while (!list_empty(&hdr->pages)) {
276  		struct nfs_page *req = nfs_list_entry(hdr->pages.next);
277  		struct page *page = req->wb_page;
278  
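		/*
		 * Reads into a user-backed buffer (NFS_ODIRECT_SHOULD_DIRTY)
		 * must redirty the pages that received data, so the new
		 * contents are not lost if those pages are written back or
		 * reclaimed later.
		 */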
279  		if (!PageCompound(page) && bytes < hdr->good_bytes &&
280  		    (dreq->flags == NFS_ODIRECT_SHOULD_DIRTY))
281  			set_page_dirty(page);
282  		bytes += req->wb_bytes;
283  		nfs_list_remove_request(req);
284  		nfs_release_request(req);
285  	}
286  out_put:
287  	if (put_dreq(dreq))
288  		nfs_direct_complete(dreq);
289  	hdr->release(hdr);
290  }
291  
292  static void nfs_read_sync_pgio_error(struct list_head *head, int error)
293  {
294  	struct nfs_page *req;
295  
296  	while (!list_empty(head)) {
297  		req = nfs_list_entry(head->next);
298  		nfs_list_remove_request(req);
299  		nfs_release_request(req);
300  	}
301  }
302  
303  static void nfs_direct_pgio_init(struct nfs_pgio_header *hdr)
304  {
305  	get_dreq(hdr->dreq);
306  }
307  
308  static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = {
309  	.error_cleanup = nfs_read_sync_pgio_error,
310  	.init_hdr = nfs_direct_pgio_init,
311  	.completion = nfs_direct_read_completion,
312  };
313  
314  /*
315   * For each rsize'd chunk of the user's buffer, dispatch an NFS READ
316   * operation.  If iov_iter_get_pages_alloc2() or nfs_page_create_from_page()
317   * fails, bail and stop sending more reads.  Read length accounting is
318   * handled automatically by nfs_direct_read_completion().  Otherwise, if
319   * no requests have been sent, just return an error.
320   */
321  
322  static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
323  					      struct iov_iter *iter,
324  					      loff_t pos)
325  {
326  	struct nfs_pageio_descriptor desc;
327  	struct inode *inode = dreq->inode;
328  	ssize_t result = -EINVAL;
329  	size_t requested_bytes = 0;
330  	size_t rsize = max_t(size_t, NFS_SERVER(inode)->rsize, PAGE_SIZE);
331  
332  	nfs_pageio_init_read(&desc, dreq->inode, false,
333  			     &nfs_direct_read_completion_ops);
334  	get_dreq(dreq);
335  	desc.pg_dreq = dreq;
336  	inode_dio_begin(inode);
337  
338  	while (iov_iter_count(iter)) {
339  		struct page **pagevec;
340  		size_t bytes;
341  		size_t pgbase;
342  		unsigned npages, i;
343  
344  		result = iov_iter_get_pages_alloc2(iter, &pagevec,
345  						  rsize, &pgbase);
346  		if (result < 0)
347  			break;
348  
349  		bytes = result;
350  		npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
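		/*
		 * Round up so a transfer that starts and/or ends mid-page
		 * still counts every page it touches: with PAGE_SIZE 4096,
		 * result = 5000 and pgbase = 512 span pages 0 and 1, so
		 * npages = 2.
		 */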
351  		for (i = 0; i < npages; i++) {
352  			struct nfs_page *req;
353  			unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
354  			/* XXX do we need to do the eof zeroing found in async_filler? */
355  			req = nfs_page_create_from_page(dreq->ctx, pagevec[i],
356  							pgbase, pos, req_len);
357  			if (IS_ERR(req)) {
358  				result = PTR_ERR(req);
359  				break;
360  			}
361  			if (!nfs_pageio_add_request(&desc, req)) {
362  				result = desc.pg_error;
363  				nfs_release_request(req);
364  				break;
365  			}
366  			pgbase = 0;
367  			bytes -= req_len;
368  			requested_bytes += req_len;
369  			pos += req_len;
370  		}
371  		nfs_direct_release_pages(pagevec, npages);
372  		kvfree(pagevec);
373  		if (result < 0)
374  			break;
375  	}
376  
377  	nfs_pageio_complete(&desc);
378  
379  	/*
380  	 * If no bytes were started, return the error, and let the
381  	 * generic layer handle the completion.
382  	 */
383  	if (requested_bytes == 0) {
384  		inode_dio_end(inode);
385  		nfs_direct_req_release(dreq);
386  		return result < 0 ? result : -EIO;
387  	}
388  
389  	if (put_dreq(dreq))
390  		nfs_direct_complete(dreq);
391  	return requested_bytes;
392  }
393  
394  /**
395   * nfs_file_direct_read - file direct read operation for NFS files
396   * @iocb: target I/O control block
397   * @iter: vector of user buffers into which to read data
398   * @swap: flag indicating this is swap IO, not O_DIRECT IO
399   *
400   * We use this function for direct reads instead of calling
401   * generic_file_aio_read() in order to avoid gfar's check to see if
402   * the request starts before the end of the file.  For that check
403   * to work, we must generate a GETATTR before each direct read, and
404   * even then there is a window between the GETATTR and the subsequent
405   * READ where the file size could change.  Our preference is simply
406   * to do all reads the application wants, and the server will take
407   * care of managing the end of file boundary.
408   *
409   * This function also eliminates unnecessarily updating the file's
410   * atime locally, as the NFS server sets the file's atime, and this
411   * client must read the updated atime from the server back into its
412   * cache.
413   */
414  ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
415  			     bool swap)
416  {
417  	struct file *file = iocb->ki_filp;
418  	struct address_space *mapping = file->f_mapping;
419  	struct inode *inode = mapping->host;
420  	struct nfs_direct_req *dreq;
421  	struct nfs_lock_context *l_ctx;
422  	ssize_t result, requested;
423  	size_t count = iov_iter_count(iter);
424  	nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
425  
426  	dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n",
427  		file, count, (long long) iocb->ki_pos);
428  
429  	result = 0;
430  	if (!count)
431  		goto out;
432  
433  	task_io_account_read(count);
434  
435  	result = -ENOMEM;
436  	dreq = nfs_direct_req_alloc();
437  	if (dreq == NULL)
438  		goto out;
439  
440  	dreq->inode = inode;
441  	dreq->max_count = count;
442  	dreq->io_start = iocb->ki_pos;
443  	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
444  	l_ctx = nfs_get_lock_context(dreq->ctx);
445  	if (IS_ERR(l_ctx)) {
446  		result = PTR_ERR(l_ctx);
447  		nfs_direct_req_release(dreq);
448  		goto out_release;
449  	}
450  	dreq->l_ctx = l_ctx;
451  	if (!is_sync_kiocb(iocb))
452  		dreq->iocb = iocb;
453  
454  	if (user_backed_iter(iter))
455  		dreq->flags = NFS_ODIRECT_SHOULD_DIRTY;
456  
457  	if (!swap)
458  		nfs_start_io_direct(inode);
459  
460  	NFS_I(inode)->read_io += count;
461  	requested = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos);
462  
463  	if (!swap)
464  		nfs_end_io_direct(inode);
465  
466  	if (requested > 0) {
467  		result = nfs_direct_wait(dreq);
468  		if (result > 0) {
469  			requested -= result;
470  			iocb->ki_pos += result;
471  		}
472  		iov_iter_revert(iter, requested);
473  	} else {
474  		result = requested;
475  	}
476  
477  out_release:
478  	nfs_direct_req_release(dreq);
479  out:
480  	return result;
481  }
482  
483  static void nfs_direct_add_page_head(struct list_head *list,
484  				     struct nfs_page *req)
485  {
486  	struct nfs_page *head = req->wb_head;
487  
488  	if (!list_empty(&head->wb_list) || !nfs_lock_request(head))
489  		return;
490  	if (!list_empty(&head->wb_list)) {
491  		nfs_unlock_request(head);
492  		return;
493  	}
494  	list_add(&head->wb_list, list);
495  	kref_get(&head->wb_kref);
496  	kref_get(&head->wb_kref);
497  }
498  
499  static void nfs_direct_join_group(struct list_head *list,
500  				  struct nfs_commit_info *cinfo,
501  				  struct inode *inode)
502  {
503  	struct nfs_page *req, *subreq;
504  
505  	list_for_each_entry(req, list, wb_list) {
506  		if (req->wb_head != req) {
507  			nfs_direct_add_page_head(&req->wb_list, req);
508  			continue;
509  		}
510  		subreq = req->wb_this_page;
511  		if (subreq == req)
512  			continue;
513  		do {
514  			/*
515  			 * Remove subrequests from this list before freeing
516  			 * them in the call to nfs_join_page_group().
517  			 */
518  			if (!list_empty(&subreq->wb_list)) {
519  				nfs_list_remove_request(subreq);
520  				nfs_release_request(subreq);
521  			}
522  		} while ((subreq = subreq->wb_this_page) != req);
523  		nfs_join_page_group(req, cinfo, inode);
524  	}
525  }
526  
527  static void
528  nfs_direct_write_scan_commit_list(struct inode *inode,
529  				  struct list_head *list,
530  				  struct nfs_commit_info *cinfo)
531  {
532  	mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
533  	pnfs_recover_commit_reqs(list, cinfo);
534  	nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0);
535  	mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
536  }
537  
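/*
 * Resend writes that are still sitting on the commit lists, typically
 * because the server's write verifier changed (e.g. after a reboot)
 * before the data could be committed.
 */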
538  static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
539  {
540  	struct nfs_pageio_descriptor desc;
541  	struct nfs_page *req;
542  	LIST_HEAD(reqs);
543  	struct nfs_commit_info cinfo;
544  
545  	nfs_init_cinfo_from_dreq(&cinfo, dreq);
546  	nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);
547  
548  	nfs_direct_join_group(&reqs, &cinfo, dreq->inode);
549  
550  	nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo);
551  	get_dreq(dreq);
552  
553  	nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false,
554  			      &nfs_direct_write_completion_ops);
555  	desc.pg_dreq = dreq;
556  
557  	while (!list_empty(&reqs)) {
558  		req = nfs_list_entry(reqs.next);
559  		/* Bump the transmission count */
560  		req->wb_nio++;
561  		if (!nfs_pageio_add_request(&desc, req)) {
562  			spin_lock(&dreq->lock);
563  			if (dreq->error < 0) {
564  				desc.pg_error = dreq->error;
565  			} else if (desc.pg_error != -EAGAIN) {
566  				dreq->flags = 0;
567  				if (!desc.pg_error)
568  					desc.pg_error = -EIO;
569  				dreq->error = desc.pg_error;
570  			} else
571  				dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
572  			spin_unlock(&dreq->lock);
573  			break;
574  		}
575  		nfs_release_request(req);
576  	}
577  	nfs_pageio_complete(&desc);
578  
579  	while (!list_empty(&reqs)) {
580  		req = nfs_list_entry(reqs.next);
581  		nfs_list_remove_request(req);
582  		nfs_unlock_and_release_request(req);
583  		if (desc.pg_error == -EAGAIN) {
584  			nfs_mark_request_commit(req, NULL, &cinfo, 0);
585  		} else {
586  			spin_lock(&dreq->lock);
587  			nfs_direct_truncate_request(dreq, req);
588  			spin_unlock(&dreq->lock);
589  			nfs_release_request(req);
590  		}
591  	}
592  
593  	if (put_dreq(dreq))
594  		nfs_direct_write_complete(dreq);
595  }
596  
597  static void nfs_direct_commit_complete(struct nfs_commit_data *data)
598  {
599  	const struct nfs_writeverf *verf = data->res.verf;
600  	struct nfs_direct_req *dreq = data->dreq;
601  	struct nfs_commit_info cinfo;
602  	struct nfs_page *req;
603  	int status = data->task.tk_status;
604  
605  	trace_nfs_direct_commit_complete(dreq);
606  
607  	spin_lock(&dreq->lock);
608  	if (status < 0) {
609  		/* Errors in commit are fatal */
610  		dreq->error = status;
611  		dreq->flags = NFS_ODIRECT_DONE;
612  	} else {
613  		status = dreq->error;
614  	}
615  	spin_unlock(&dreq->lock);
616  
617  	nfs_init_cinfo_from_dreq(&cinfo, dreq);
618  
619  	while (!list_empty(&data->pages)) {
620  		req = nfs_list_entry(data->pages.next);
621  		nfs_list_remove_request(req);
622  		if (status < 0) {
623  			spin_lock(&dreq->lock);
624  			nfs_direct_truncate_request(dreq, req);
625  			spin_unlock(&dreq->lock);
626  			nfs_release_request(req);
627  		} else if (!nfs_write_match_verf(verf, req)) {
628  			spin_lock(&dreq->lock);
629  			if (dreq->flags == 0)
630  				dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
631  			spin_unlock(&dreq->lock);
632  			/*
633  			 * Despite the reboot, the write was successful,
634  			 * so reset wb_nio.
635  			 */
636  			req->wb_nio = 0;
637  			nfs_mark_request_commit(req, NULL, &cinfo, 0);
638  		} else
639  			nfs_release_request(req);
640  		nfs_unlock_and_release_request(req);
641  	}
642  
643  	if (nfs_commit_end(cinfo.mds))
644  		nfs_direct_write_complete(dreq);
645  }
646  
647  static void nfs_direct_resched_write(struct nfs_commit_info *cinfo,
648  		struct nfs_page *req)
649  {
650  	struct nfs_direct_req *dreq = cinfo->dreq;
651  
652  	trace_nfs_direct_resched_write(dreq);
653  
654  	spin_lock(&dreq->lock);
655  	if (dreq->flags != NFS_ODIRECT_DONE)
656  		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
657  	spin_unlock(&dreq->lock);
658  	nfs_mark_request_commit(req, NULL, cinfo, 0);
659  }
660  
661  static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops = {
662  	.completion = nfs_direct_commit_complete,
663  	.resched_write = nfs_direct_resched_write,
664  };
665  
666  static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
667  {
668  	int res;
669  	struct nfs_commit_info cinfo;
670  	LIST_HEAD(mds_list);
671  
672  	nfs_init_cinfo_from_dreq(&cinfo, dreq);
673  	nfs_commit_begin(cinfo.mds);
674  	nfs_scan_commit(dreq->inode, &mds_list, &cinfo);
675  	res = nfs_generic_commit_list(dreq->inode, &mds_list, 0, &cinfo);
676  	if (res < 0) { /* res == -ENOMEM */
677  		spin_lock(&dreq->lock);
678  		if (dreq->flags == 0)
679  			dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
680  		spin_unlock(&dreq->lock);
681  	}
682  	if (nfs_commit_end(cinfo.mds))
683  		nfs_direct_write_complete(dreq);
684  }
685  
686  static void nfs_direct_write_clear_reqs(struct nfs_direct_req *dreq)
687  {
688  	struct nfs_commit_info cinfo;
689  	struct nfs_page *req;
690  	LIST_HEAD(reqs);
691  
692  	nfs_init_cinfo_from_dreq(&cinfo, dreq);
693  	nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);
694  
695  	while (!list_empty(&reqs)) {
696  		req = nfs_list_entry(reqs.next);
697  		nfs_list_remove_request(req);
698  		nfs_direct_truncate_request(dreq, req);
699  		nfs_release_request(req);
700  		nfs_unlock_and_release_request(req);
701  	}
702  }
703  
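/*
 * Deferred completion state machine for O_DIRECT writes.  The value of
 * dreq->flags when the last reference is dropped selects the next step:
 *
 *	NFS_ODIRECT_DO_COMMIT      - the server replied with unstable
 *	                             writes, so send a COMMIT first.
 *	NFS_ODIRECT_RESCHED_WRITES - a verifier mismatch or soft error was
 *	                             seen, so resend the affected requests.
 *	anything else              - tear down any remaining requests,
 *	                             invalidate the mapping and complete.
 */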
704  static void nfs_direct_write_schedule_work(struct work_struct *work)
705  {
706  	struct nfs_direct_req *dreq = container_of(work, struct nfs_direct_req, work);
707  	int flags = dreq->flags;
708  
709  	dreq->flags = 0;
710  	switch (flags) {
711  		case NFS_ODIRECT_DO_COMMIT:
712  			nfs_direct_commit_schedule(dreq);
713  			break;
714  		case NFS_ODIRECT_RESCHED_WRITES:
715  			nfs_direct_write_reschedule(dreq);
716  			break;
717  		default:
718  			nfs_direct_write_clear_reqs(dreq);
719  			nfs_zap_mapping(dreq->inode, dreq->inode->i_mapping);
720  			nfs_direct_complete(dreq);
721  	}
722  }
723  
724  static void nfs_direct_write_complete(struct nfs_direct_req *dreq)
725  {
726  	trace_nfs_direct_write_complete(dreq);
727  	queue_work(nfsiod_workqueue, &dreq->work); /* Calls nfs_direct_write_schedule_work */
728  }
729  
730  static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
731  {
732  	struct nfs_direct_req *dreq = hdr->dreq;
733  	struct nfs_commit_info cinfo;
734  	struct nfs_page *req = nfs_list_entry(hdr->pages.next);
735  	int flags = NFS_ODIRECT_DONE;
736  
737  	trace_nfs_direct_write_completion(dreq);
738  
739  	nfs_init_cinfo_from_dreq(&cinfo, dreq);
740  
741  	spin_lock(&dreq->lock);
742  	if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) {
743  		spin_unlock(&dreq->lock);
744  		goto out_put;
745  	}
746  
747  	nfs_direct_count_bytes(dreq, hdr);
748  	if (test_bit(NFS_IOHDR_UNSTABLE_WRITES, &hdr->flags) &&
749  	    !test_bit(NFS_IOHDR_ERROR, &hdr->flags)) {
750  		if (!dreq->flags)
751  			dreq->flags = NFS_ODIRECT_DO_COMMIT;
752  		flags = dreq->flags;
753  	}
754  	spin_unlock(&dreq->lock);
755  
756  	while (!list_empty(&hdr->pages)) {
757  
758  		req = nfs_list_entry(hdr->pages.next);
759  		nfs_list_remove_request(req);
760  		if (flags == NFS_ODIRECT_DO_COMMIT) {
761  			kref_get(&req->wb_kref);
762  			memcpy(&req->wb_verf, &hdr->verf.verifier,
763  			       sizeof(req->wb_verf));
764  			nfs_mark_request_commit(req, hdr->lseg, &cinfo,
765  				hdr->ds_commit_idx);
766  		} else if (flags == NFS_ODIRECT_RESCHED_WRITES) {
767  			kref_get(&req->wb_kref);
768  			nfs_mark_request_commit(req, NULL, &cinfo, 0);
769  		}
770  		nfs_unlock_and_release_request(req);
771  	}
772  
773  out_put:
774  	if (put_dreq(dreq))
775  		nfs_direct_write_complete(dreq);
776  	hdr->release(hdr);
777  }
778  
779  static void nfs_write_sync_pgio_error(struct list_head *head, int error)
780  {
781  	struct nfs_page *req;
782  
783  	while (!list_empty(head)) {
784  		req = nfs_list_entry(head->next);
785  		nfs_list_remove_request(req);
786  		nfs_unlock_and_release_request(req);
787  	}
788  }
789  
790  static void nfs_direct_write_reschedule_io(struct nfs_pgio_header *hdr)
791  {
792  	struct nfs_direct_req *dreq = hdr->dreq;
793  	struct nfs_page *req;
794  	struct nfs_commit_info cinfo;
795  
796  	trace_nfs_direct_write_reschedule_io(dreq);
797  
798  	nfs_init_cinfo_from_dreq(&cinfo, dreq);
799  	spin_lock(&dreq->lock);
800  	if (dreq->error == 0)
801  		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
802  	set_bit(NFS_IOHDR_REDO, &hdr->flags);
803  	spin_unlock(&dreq->lock);
804  	while (!list_empty(&hdr->pages)) {
805  		req = nfs_list_entry(hdr->pages.next);
806  		nfs_list_remove_request(req);
807  		nfs_unlock_request(req);
808  		nfs_mark_request_commit(req, NULL, &cinfo, 0);
809  	}
810  }
811  
812  static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = {
813  	.error_cleanup = nfs_write_sync_pgio_error,
814  	.init_hdr = nfs_direct_pgio_init,
815  	.completion = nfs_direct_write_completion,
816  	.reschedule_io = nfs_direct_write_reschedule_io,
817  };
818  
819  
820  /*
821   * NB: Return the value of the first error return code.  Subsequent
822   *     errors after the first one are ignored.
823   */
824  /*
825   * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE
826   * operation.  If iov_iter_get_pages_alloc2() or nfs_page_create_from_page()
827   * fails, bail and stop sending more writes.  Write length accounting is
828   * handled automatically by nfs_direct_write_completion().  Otherwise, if
829   * no requests have been sent, just return an error.
830   */
831  static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
832  					       struct iov_iter *iter,
833  					       loff_t pos, int ioflags)
834  {
835  	struct nfs_pageio_descriptor desc;
836  	struct inode *inode = dreq->inode;
837  	struct nfs_commit_info cinfo;
838  	ssize_t result = 0;
839  	size_t requested_bytes = 0;
840  	size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE);
841  	bool defer = false;
842  
843  	trace_nfs_direct_write_schedule_iovec(dreq);
844  
845  	nfs_pageio_init_write(&desc, inode, ioflags, false,
846  			      &nfs_direct_write_completion_ops);
847  	desc.pg_dreq = dreq;
848  	get_dreq(dreq);
849  	inode_dio_begin(inode);
850  
851  	NFS_I(inode)->write_io += iov_iter_count(iter);
852  	while (iov_iter_count(iter)) {
853  		struct page **pagevec;
854  		size_t bytes;
855  		size_t pgbase;
856  		unsigned npages, i;
857  
858  		result = iov_iter_get_pages_alloc2(iter, &pagevec,
859  						  wsize, &pgbase);
860  		if (result < 0)
861  			break;
862  
863  		bytes = result;
864  		npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
865  		for (i = 0; i < npages; i++) {
866  			struct nfs_page *req;
867  			unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
868  
869  			req = nfs_page_create_from_page(dreq->ctx, pagevec[i],
870  							pgbase, pos, req_len);
871  			if (IS_ERR(req)) {
872  				result = PTR_ERR(req);
873  				break;
874  			}
875  
876  			if (desc.pg_error < 0) {
877  				nfs_free_request(req);
878  				result = desc.pg_error;
879  				break;
880  			}
881  
882  			pgbase = 0;
883  			bytes -= req_len;
884  			requested_bytes += req_len;
885  			pos += req_len;
886  
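			/*
			 * After a soft error every remaining request is
			 * parked on the commit list instead of being sent;
			 * the RESCHED_WRITES path resends them later.
			 */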
887  			if (defer) {
888  				nfs_mark_request_commit(req, NULL, &cinfo, 0);
889  				continue;
890  			}
891  
892  			nfs_lock_request(req);
893  			if (nfs_pageio_add_request(&desc, req))
894  				continue;
895  
896  			/* Exit on hard errors */
897  			if (desc.pg_error < 0 && desc.pg_error != -EAGAIN) {
898  				result = desc.pg_error;
899  				nfs_unlock_and_release_request(req);
900  				break;
901  			}
902  
903  			/* If the error is soft, defer remaining requests */
904  			nfs_init_cinfo_from_dreq(&cinfo, dreq);
905  			spin_lock(&dreq->lock);
906  			dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
907  			spin_unlock(&dreq->lock);
908  			nfs_unlock_request(req);
909  			nfs_mark_request_commit(req, NULL, &cinfo, 0);
910  			desc.pg_error = 0;
911  			defer = true;
912  		}
913  		nfs_direct_release_pages(pagevec, npages);
914  		kvfree(pagevec);
915  		if (result < 0)
916  			break;
917  	}
918  	nfs_pageio_complete(&desc);
919  
920  	/*
921  	 * If no bytes were started, return the error, and let the
922  	 * generic layer handle the completion.
923  	 */
924  	if (requested_bytes == 0) {
925  		inode_dio_end(inode);
926  		nfs_direct_req_release(dreq);
927  		return result < 0 ? result : -EIO;
928  	}
929  
930  	if (put_dreq(dreq))
931  		nfs_direct_write_complete(dreq);
932  	return requested_bytes;
933  }
934  
935  /**
936   * nfs_file_direct_write - file direct write operation for NFS files
937   * @iocb: target I/O control block
938   * @iter: vector of user buffers from which to write data
939   * @swap: flag indicating this is swap IO, not O_DIRECT IO
940   *
941   * We use this function for direct writes instead of calling
942   * generic_file_aio_write() in order to avoid taking the inode
943   * semaphore and updating the i_size.  The NFS server will set
944   * the new i_size and this client must read the updated size
945   * back into its cache.  We let the server do generic write
946   * parameter checking and report problems.
947   *
948   * We eliminate local atime updates, see direct read above.
949   *
950   * We avoid unnecessary page cache invalidations for normal cached
951   * readers of this file.
952   *
953   * Note that O_APPEND is not supported for NFS direct writes, as there
954   * is no atomic O_APPEND write facility in the NFS protocol.
955   */
956  ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
957  			      bool swap)
958  {
959  	ssize_t result, requested;
960  	size_t count;
961  	struct file *file = iocb->ki_filp;
962  	struct address_space *mapping = file->f_mapping;
963  	struct inode *inode = mapping->host;
964  	struct nfs_direct_req *dreq;
965  	struct nfs_lock_context *l_ctx;
966  	loff_t pos, end;
967  
968  	dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n",
969  		file, iov_iter_count(iter), (long long) iocb->ki_pos);
970  
971  	if (swap)
972  		/* bypass generic checks */
973  		result =  iov_iter_count(iter);
974  	else
975  		result = generic_write_checks(iocb, iter);
976  	if (result <= 0)
977  		return result;
978  	count = result;
979  	nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);
980  
981  	pos = iocb->ki_pos;
982  	end = (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT;
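	/*
	 * Note that 'end' is a page index, not a byte offset; it marks the
	 * last page touched by this write for invalidate_inode_pages2_range()
	 * below.
	 */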
983  
984  	task_io_account_write(count);
985  
986  	result = -ENOMEM;
987  	dreq = nfs_direct_req_alloc();
988  	if (!dreq)
989  		goto out;
990  
991  	dreq->inode = inode;
992  	dreq->max_count = count;
993  	dreq->io_start = pos;
994  	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
995  	l_ctx = nfs_get_lock_context(dreq->ctx);
996  	if (IS_ERR(l_ctx)) {
997  		result = PTR_ERR(l_ctx);
998  		nfs_direct_req_release(dreq);
999  		goto out_release;
1000  	}
1001  	dreq->l_ctx = l_ctx;
1002  	if (!is_sync_kiocb(iocb))
1003  		dreq->iocb = iocb;
1004  	pnfs_init_ds_commit_info_ops(&dreq->ds_cinfo, inode);
1005  
1006  	if (swap) {
1007  		requested = nfs_direct_write_schedule_iovec(dreq, iter, pos,
1008  							    FLUSH_STABLE);
1009  	} else {
1010  		nfs_start_io_direct(inode);
1011  
1012  		requested = nfs_direct_write_schedule_iovec(dreq, iter, pos,
1013  							    FLUSH_COND_STABLE);
1014  
1015  		if (mapping->nrpages) {
1016  			invalidate_inode_pages2_range(mapping,
1017  						      pos >> PAGE_SHIFT, end);
1018  		}
1019  
1020  		nfs_end_io_direct(inode);
1021  	}
1022  
1023  	if (requested > 0) {
1024  		result = nfs_direct_wait(dreq);
1025  		if (result > 0) {
1026  			requested -= result;
1027  			iocb->ki_pos = pos + result;
1028  			/* XXX: should check the generic_write_sync retval */
1029  			generic_write_sync(iocb, result);
1030  		}
1031  		iov_iter_revert(iter, requested);
1032  	} else {
1033  		result = requested;
1034  	}
1035  	nfs_fscache_invalidate(inode, FSCACHE_INVAL_DIO_WRITE);
1036  out_release:
1037  	nfs_direct_req_release(dreq);
1038  out:
1039  	return result;
1040  }
1041  
1042  /**
1043   * nfs_init_directcache - create a slab cache for nfs_direct_req structures
1044   *
1045   */
1046  int __init nfs_init_directcache(void)
1047  {
1048  	nfs_direct_cachep = kmem_cache_create("nfs_direct_cache",
1049  						sizeof(struct nfs_direct_req),
1050  						0, SLAB_RECLAIM_ACCOUNT,
1051  						NULL);
1052  	if (nfs_direct_cachep == NULL)
1053  		return -ENOMEM;
1054  
1055  	return 0;
1056  }
1057  
1058  /**
1059   * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures
1060   *
1061   */
1062  void nfs_destroy_directcache(void)
1063  {
1064  	kmem_cache_destroy(nfs_direct_cachep);
1065  }
1066