1  // SPDX-License-Identifier: GPL-2.0-only
2  /*
3   * fs/direct-io.c
4   *
5   * Copyright (C) 2002, Linus Torvalds.
6   *
7   * O_DIRECT
8   *
9   * 04Jul2002	Andrew Morton
10   *		Initial version
11   * 11Sep2002	janetinc@us.ibm.com
12   * 		added readv/writev support.
13   * 29Oct2002	Andrew Morton
14   *		rewrote bio_add_page() support.
15   * 30Oct2002	pbadari@us.ibm.com
16   *		added support for non-aligned IO.
17   * 06Nov2002	pbadari@us.ibm.com
18   *		added asynchronous IO support.
19   * 21Jul2003	nathans@sgi.com
20   *		added IO completion notifier.
21   */
22  
23  #include <linux/kernel.h>
24  #include <linux/module.h>
25  #include <linux/types.h>
26  #include <linux/fs.h>
27  #include <linux/mm.h>
28  #include <linux/slab.h>
29  #include <linux/highmem.h>
30  #include <linux/pagemap.h>
31  #include <linux/task_io_accounting_ops.h>
32  #include <linux/bio.h>
33  #include <linux/wait.h>
34  #include <linux/err.h>
35  #include <linux/blkdev.h>
36  #include <linux/buffer_head.h>
37  #include <linux/rwsem.h>
38  #include <linux/uio.h>
39  #include <linux/atomic.h>
40  
41  #include "internal.h"
42  
43  /*
44   * How many user pages to map in one call to iov_iter_extract_pages().  This
45   * determines the size of a structure in the slab cache
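 * (specifically the pages[] array embedded in struct dio below: with 64
 * entries that is 512 bytes of page pointers on a 64-bit build).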
46   */
47  #define DIO_PAGES	64
48  
49  /*
50   * Flags for dio_complete()
51   */
52  #define DIO_COMPLETE_ASYNC		0x01	/* This is async IO */
53  #define DIO_COMPLETE_INVALIDATE		0x02	/* Can invalidate pages */
54  
55  /*
56   * This code generally works in units of "dio_blocks".  A dio_block is
57   * somewhere between the hard sector size and the filesystem block size.  It
58   * is determined on a per-invocation basis.   When talking to the filesystem
59   * we need to convert dio_blocks to fs_blocks by scaling the dio_block quantity
60   * down by dio->blkfactor.  Similarly, fs-blocksize quantities are converted
61   * to dio_block quantities by shifting left by blkfactor.
62   *
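 * For example, with 512-byte dio_blocks (blkbits = 9) on a filesystem with
 * 4096-byte blocks (i_blkbits = 12), blkfactor is 3: fs_block = dio_block >> 3
 * and dio_block = fs_block << 3.
 *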
63   * If blkfactor is zero then the user's request was aligned to the filesystem's
64   * blocksize.
65   */
66  
67  /* dio_state only used in the submission path */
68  
69  struct dio_submit {
70  	struct bio *bio;		/* bio under assembly */
71  	unsigned blkbits;		/* doesn't change */
72  	unsigned blkfactor;		/* When we're using an alignment which
73  					   is finer than the filesystem's soft
74  					   blocksize, this specifies how much
75  					   finer.  blkfactor=2 means 1/4-block
76  					   alignment.  Does not change */
77  	unsigned start_zero_done;	/* flag: sub-blocksize zeroing has
78  					   been performed at the start of a
79  					   write */
80  	int pages_in_io;		/* approximate total IO pages */
81  	sector_t block_in_file;		/* Current offset into the underlying
82  					   file in dio_block units. */
83  	unsigned blocks_available;	/* At block_in_file.  changes */
84  	int reap_counter;		/* rate limit reaping */
85  	sector_t final_block_in_request;/* doesn't change */
86  	int boundary;			/* prev block is at a boundary */
87  	get_block_t *get_block;		/* block mapping function */
88  
89  	loff_t logical_offset_in_bio;	/* current first logical block in bio */
90  	sector_t final_block_in_bio;	/* current final block in bio + 1 */
91  	sector_t next_block_for_io;	/* next block to be put under IO,
92  					   in dio_blocks units */
93  
94  	/*
95  	 * Deferred addition of a page to the dio.  These variables are
96  	 * private to dio_send_cur_page(), submit_page_section() and
97  	 * dio_bio_add_page().
98  	 */
99  	struct page *cur_page;		/* The page */
100  	unsigned cur_page_offset;	/* Offset into it, in bytes */
101  	unsigned cur_page_len;		/* Nr of bytes at cur_page_offset */
102  	sector_t cur_page_block;	/* Where it starts */
103  	loff_t cur_page_fs_offset;	/* Offset in file */
104  
105  	struct iov_iter *iter;
106  	/*
107  	 * Page queue.  These variables belong to dio_refill_pages() and
108  	 * dio_get_page().
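	 * head/tail index into pages[]; from/to record the byte range
	 * consumed in the first and last queued page respectively.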
109  	 */
110  	unsigned head;			/* next page to process */
111  	unsigned tail;			/* last valid page + 1 */
112  	size_t from, to;
113  };
114  
115  /* dio_state communicated between submission path and end_io */
116  struct dio {
117  	int flags;			/* doesn't change */
118  	blk_opf_t opf;			/* request operation type and flags */
119  	struct gendisk *bio_disk;
120  	struct inode *inode;
121  	loff_t i_size;			/* i_size when submitted */
122  	dio_iodone_t *end_io;		/* IO completion function */
123  	bool is_pinned;			/* T if we have pins on the pages */
124  
125  	void *private;			/* copy from map_bh.b_private */
126  
127  	/* BIO completion state */
128  	spinlock_t bio_lock;		/* protects BIO fields below */
129  	int page_errors;		/* err from iov_iter_extract_pages() */
130  	int is_async;			/* is IO async ? */
131  	bool defer_completion;		/* defer AIO completion to workqueue? */
132  	bool should_dirty;		/* if pages should be dirtied */
133  	int io_error;			/* IO error in completion path */
134  	unsigned long refcount;		/* direct_io_worker() and bios */
135  	struct bio *bio_list;		/* singly linked via bi_private */
136  	struct task_struct *waiter;	/* waiting task (NULL if none) */
137  
138  	/* AIO related stuff */
139  	struct kiocb *iocb;		/* kiocb */
140  	ssize_t result;                 /* IO result */
141  
142  	/*
143  	 * pages[] (and any fields placed after it) are not zeroed out at
144  	 * allocation time.  Don't add new fields after pages[] unless you
145  	 * wish that they not be zeroed.
146  	 */
147  	union {
148  		struct page *pages[DIO_PAGES];	/* page buffer */
149  		struct work_struct complete_work;/* deferred AIO completion */
150  	};
151  } ____cacheline_aligned_in_smp;
152  
153  static struct kmem_cache *dio_cache __ro_after_init;
154  
155  /*
156   * How many pages are in the queue?
157   */
158  static inline unsigned dio_pages_present(struct dio_submit *sdio)
159  {
160  	return sdio->tail - sdio->head;
161  }
162  
163  /*
164   * Go grab and pin some userspace pages.   Typically we'll get 64 at a time.
165   */
166  static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
167  {
168  	struct page **pages = dio->pages;
169  	const enum req_op dio_op = dio->opf & REQ_OP_MASK;
170  	ssize_t ret;
171  
172  	ret = iov_iter_extract_pages(sdio->iter, &pages, LONG_MAX,
173  				     DIO_PAGES, 0, &sdio->from);
174  
175  	if (ret < 0 && sdio->blocks_available && dio_op == REQ_OP_WRITE) {
176  		/*
177  		 * A memory fault, but the filesystem has some outstanding
178  		 * mapped blocks.  We need to use those blocks up to avoid
179  		 * leaking stale data in the file.
180  		 */
181  		if (dio->page_errors == 0)
182  			dio->page_errors = ret;
183  		dio->pages[0] = ZERO_PAGE(0);
184  		sdio->head = 0;
185  		sdio->tail = 1;
186  		sdio->from = 0;
187  		sdio->to = PAGE_SIZE;
188  		return 0;
189  	}
190  
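	/*
	 * Convert the extracted byte count into a page-queue description.
	 * Illustrative example (assuming 4 KiB pages): extracting 10000
	 * bytes with the first page used from offset 512 gives
	 * ret = 10000 + 512 = 10512, so tail = 3 pages and
	 * to = ((10512 - 1) & 4095) + 1 = 2320, i.e. only the first 2320
	 * bytes of the last page are valid.
	 */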
191  	if (ret >= 0) {
192  		ret += sdio->from;
193  		sdio->head = 0;
194  		sdio->tail = (ret + PAGE_SIZE - 1) / PAGE_SIZE;
195  		sdio->to = ((ret - 1) & (PAGE_SIZE - 1)) + 1;
196  		return 0;
197  	}
198  	return ret;
199  }
200  
201  /*
202   * Get another userspace page.  Returns an ERR_PTR on error.  Pages are
203   * buffered inside the dio so that we can call iov_iter_extract_pages()
204   * against a decent number of pages, less frequently.  This provides nicer use
205   * of the L1 cache.
206   */
207  static inline struct page *dio_get_page(struct dio *dio,
208  					struct dio_submit *sdio)
209  {
210  	if (dio_pages_present(sdio) == 0) {
211  		int ret;
212  
213  		ret = dio_refill_pages(dio, sdio);
214  		if (ret)
215  			return ERR_PTR(ret);
216  		BUG_ON(dio_pages_present(sdio) == 0);
217  	}
218  	return dio->pages[sdio->head];
219  }
220  
221  static void dio_pin_page(struct dio *dio, struct page *page)
222  {
223  	if (dio->is_pinned)
224  		folio_add_pin(page_folio(page));
225  }
226  
227  static void dio_unpin_page(struct dio *dio, struct page *page)
228  {
229  	if (dio->is_pinned)
230  		unpin_user_page(page);
231  }
232  
233  /*
234   * dio_complete() - called when all DIO BIO I/O has been completed
235   *
236   * This drops i_dio_count, lets interested parties know that a DIO operation
237   * has completed, and calculates the resulting return code for the operation.
238   *
239   * It lets the filesystem know if it registered an interest earlier via
240   * get_block.  Pass the private field of the map buffer_head so that
241   * filesystems can use it to hold additional state between get_block calls and
242   * dio_complete.
243   */
244  static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags)
245  {
246  	const enum req_op dio_op = dio->opf & REQ_OP_MASK;
247  	loff_t offset = dio->iocb->ki_pos;
248  	ssize_t transferred = 0;
249  	int err;
250  
251  	/*
252  	 * AIO submission can race with bio completion to get here while
253  	 * expecting to have the last io completed by bio completion.
254  	 * In that case -EIOCBQUEUED is in fact not an error we want
255  	 * to preserve through this call.
256  	 */
257  	if (ret == -EIOCBQUEUED)
258  		ret = 0;
259  
260  	if (dio->result) {
261  		transferred = dio->result;
262  
263  		/* Check for short read case */
264  		if (dio_op == REQ_OP_READ &&
265  		    ((offset + transferred) > dio->i_size))
266  			transferred = dio->i_size - offset;
267  		/* ignore EFAULT if some IO has been done */
268  		if (unlikely(ret == -EFAULT) && transferred)
269  			ret = 0;
270  	}
271  
272  	if (ret == 0)
273  		ret = dio->page_errors;
274  	if (ret == 0)
275  		ret = dio->io_error;
276  	if (ret == 0)
277  		ret = transferred;
278  
279  	if (dio->end_io) {
280  		// XXX: ki_pos??
281  		err = dio->end_io(dio->iocb, offset, ret, dio->private);
282  		if (err)
283  			ret = err;
284  	}
285  
286  	/*
287  	 * Try again to invalidate clean pages which might have been cached by
288  	 * non-direct readahead, or faulted in by get_user_pages() if the source
289  	 * of the write was an mmap'ed region of the file we're writing.  Either
290  	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
291  	 * this invalidation fails, tough, the write still worked...
292  	 *
293  	 * And this page cache invalidation has to be after dio->end_io(), as
294  	 * some filesystems convert unwritten extents to real allocations in
295  	 * end_io() when necessary, otherwise a racing buffer read would cache
296  	 * zeros from unwritten extents.
297  	 */
298  	if (flags & DIO_COMPLETE_INVALIDATE &&
299  	    ret > 0 && dio_op == REQ_OP_WRITE)
300  		kiocb_invalidate_post_direct_write(dio->iocb, ret);
301  
302  	inode_dio_end(dio->inode);
303  
304  	if (flags & DIO_COMPLETE_ASYNC) {
305  		/*
306  		 * generic_write_sync expects ki_pos to have been updated
307  		 * already, but the submission path only does this for
308  		 * synchronous I/O.
309  		 */
310  		dio->iocb->ki_pos += transferred;
311  
312  		if (ret > 0 && dio_op == REQ_OP_WRITE)
313  			ret = generic_write_sync(dio->iocb, ret);
314  		dio->iocb->ki_complete(dio->iocb, ret);
315  	}
316  
317  	kmem_cache_free(dio_cache, dio);
318  	return ret;
319  }
320  
321  static void dio_aio_complete_work(struct work_struct *work)
322  {
323  	struct dio *dio = container_of(work, struct dio, complete_work);
324  
325  	dio_complete(dio, 0, DIO_COMPLETE_ASYNC | DIO_COMPLETE_INVALIDATE);
326  }
327  
328  static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio);
329  
330  /*
331   * Asynchronous IO callback.
332   */
333  static void dio_bio_end_aio(struct bio *bio)
334  {
335  	struct dio *dio = bio->bi_private;
336  	const enum req_op dio_op = dio->opf & REQ_OP_MASK;
337  	unsigned long remaining;
338  	unsigned long flags;
339  	bool defer_completion = false;
340  
341  	/* cleanup the bio */
342  	dio_bio_complete(dio, bio);
343  
344  	spin_lock_irqsave(&dio->bio_lock, flags);
345  	remaining = --dio->refcount;
346  	if (remaining == 1 && dio->waiter)
347  		wake_up_process(dio->waiter);
348  	spin_unlock_irqrestore(&dio->bio_lock, flags);
349  
350  	if (remaining == 0) {
351  		/*
352  		 * Defer completion when defer_completion is set or
353  		 * when the inode has pages mapped and this is an AIO write.
354  		 * We need to invalidate those pages because there is a
355  		 * chance they contain stale data if buffered IO to the same
356  		 * region went in between AIO submission and completion.
358  		 */
359  		if (dio->result)
360  			defer_completion = dio->defer_completion ||
361  					   (dio_op == REQ_OP_WRITE &&
362  					    dio->inode->i_mapping->nrpages);
363  		if (defer_completion) {
364  			INIT_WORK(&dio->complete_work, dio_aio_complete_work);
365  			queue_work(dio->inode->i_sb->s_dio_done_wq,
366  				   &dio->complete_work);
367  		} else {
368  			dio_complete(dio, 0, DIO_COMPLETE_ASYNC);
369  		}
370  	}
371  }
372  
373  /*
374   * The BIO completion handler simply queues the BIO up for the process-context
375   * handler.
376   *
377   * During I/O bi_private points at the dio.  After I/O, bi_private is used to
378   * implement a singly-linked list of completed BIOs, at dio->bio_list.
379   */
380  static void dio_bio_end_io(struct bio *bio)
381  {
382  	struct dio *dio = bio->bi_private;
383  	unsigned long flags;
384  
385  	spin_lock_irqsave(&dio->bio_lock, flags);
386  	bio->bi_private = dio->bio_list;
387  	dio->bio_list = bio;
388  	if (--dio->refcount == 1 && dio->waiter)
389  		wake_up_process(dio->waiter);
390  	spin_unlock_irqrestore(&dio->bio_lock, flags);
391  }
392  
393  static inline void
394  dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
395  	      struct block_device *bdev,
396  	      sector_t first_sector, int nr_vecs)
397  {
398  	struct bio *bio;
399  
400  	/*
401  	 * bio_alloc() is guaranteed to return a bio when allowed to sleep and
402  	 * we request a valid number of vectors.
403  	 */
404  	bio = bio_alloc(bdev, nr_vecs, dio->opf, GFP_KERNEL);
405  	bio->bi_iter.bi_sector = first_sector;
406  	if (dio->is_async)
407  		bio->bi_end_io = dio_bio_end_aio;
408  	else
409  		bio->bi_end_io = dio_bio_end_io;
410  	if (dio->is_pinned)
411  		bio_set_flag(bio, BIO_PAGE_PINNED);
412  	bio->bi_write_hint = file_inode(dio->iocb->ki_filp)->i_write_hint;
413  
414  	sdio->bio = bio;
415  	sdio->logical_offset_in_bio = sdio->cur_page_fs_offset;
416  }
417  
418  /*
419   * In the AIO read case we speculatively dirty the pages before starting IO.
420   * During IO completion, any of these pages which happen to have been written
421   * back will be redirtied by bio_check_pages_dirty().
422   *
423   * bios hold a dio reference between submit_bio and ->end_io.
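 *
 * Refcounting sketch (as implemented in this file): __blockdev_direct_IO()
 * starts with dio->refcount == 1 for the submission path, dio_bio_submit()
 * takes one reference per bio, and each bio completion handler drops one.
 * The final reference is dropped by drop_refcount() for sync I/O or by the
 * last completing bio for AIO, and whoever hits zero calls dio_complete().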
424   */
425  static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
426  {
427  	const enum req_op dio_op = dio->opf & REQ_OP_MASK;
428  	struct bio *bio = sdio->bio;
429  	unsigned long flags;
430  
431  	bio->bi_private = dio;
432  
433  	spin_lock_irqsave(&dio->bio_lock, flags);
434  	dio->refcount++;
435  	spin_unlock_irqrestore(&dio->bio_lock, flags);
436  
437  	if (dio->is_async && dio_op == REQ_OP_READ && dio->should_dirty)
438  		bio_set_pages_dirty(bio);
439  
440  	dio->bio_disk = bio->bi_bdev->bd_disk;
441  
442  	submit_bio(bio);
443  
444  	sdio->bio = NULL;
445  	sdio->boundary = 0;
446  	sdio->logical_offset_in_bio = 0;
447  }
448  
449  /*
450   * Release any resources in case of a failure
451   */
452  static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio)
453  {
454  	if (dio->is_pinned)
455  		unpin_user_pages(dio->pages + sdio->head,
456  				 sdio->tail - sdio->head);
457  	sdio->head = sdio->tail;
458  }
459  
460  /*
461   * Wait for the next BIO to complete.  Remove it and return it.  NULL is
462   * returned once all BIOs have been completed.  This must only be called once
463   * all bios have been issued so that dio->refcount can only decrease.  This
464   * requires that the caller hold a reference on the dio.
465   */
466  static struct bio *dio_await_one(struct dio *dio)
467  {
468  	unsigned long flags;
469  	struct bio *bio = NULL;
470  
471  	spin_lock_irqsave(&dio->bio_lock, flags);
472  
473  	/*
474  	 * Wait as long as the list is empty and there are bios in flight.  bio
475  	 * completion drops the count, maybe adds to the list, and wakes while
476  	 * holding the bio_lock so we don't need set_current_state()'s barrier
477  	 * and can call it after testing our condition.
478  	 */
479  	while (dio->refcount > 1 && dio->bio_list == NULL) {
480  		__set_current_state(TASK_UNINTERRUPTIBLE);
481  		dio->waiter = current;
482  		spin_unlock_irqrestore(&dio->bio_lock, flags);
483  		blk_io_schedule();
484  		/* wake up sets us TASK_RUNNING */
485  		spin_lock_irqsave(&dio->bio_lock, flags);
486  		dio->waiter = NULL;
487  	}
488  	if (dio->bio_list) {
489  		bio = dio->bio_list;
490  		dio->bio_list = bio->bi_private;
491  	}
492  	spin_unlock_irqrestore(&dio->bio_lock, flags);
493  	return bio;
494  }
495  
496  /*
497   * Process one completed BIO.  No locks are held.
498   */
499  static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio)
500  {
501  	blk_status_t err = bio->bi_status;
502  	const enum req_op dio_op = dio->opf & REQ_OP_MASK;
503  	bool should_dirty = dio_op == REQ_OP_READ && dio->should_dirty;
504  
505  	if (err) {
506  		if (err == BLK_STS_AGAIN && (bio->bi_opf & REQ_NOWAIT))
507  			dio->io_error = -EAGAIN;
508  		else
509  			dio->io_error = -EIO;
510  	}
511  
512  	if (dio->is_async && should_dirty) {
513  		bio_check_pages_dirty(bio);	/* transfers ownership */
514  	} else {
515  		bio_release_pages(bio, should_dirty);
516  		bio_put(bio);
517  	}
518  	return err;
519  }
520  
521  /*
522   * Wait on and process all in-flight BIOs.  This must only be called once
523   * all bios have been issued so that the refcount can only decrease.
524   * This just waits for all bios to make it through dio_bio_complete.  IO
525   * errors are propagated through dio->io_error and should be propagated via
526   * dio_complete().
527   */
528  static void dio_await_completion(struct dio *dio)
529  {
530  	struct bio *bio;
531  	do {
532  		bio = dio_await_one(dio);
533  		if (bio)
534  			dio_bio_complete(dio, bio);
535  	} while (bio);
536  }
537  
538  /*
539   * A really large O_DIRECT read or write can generate a lot of BIOs.  So
540   * to keep the memory consumption sane we periodically reap any completed BIOs
541   * during the BIO generation phase.
542   *
543   * This also helps to limit the peak amount of pinned userspace memory.
544   */
545  static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio)
546  {
547  	int ret = 0;
548  
549  	if (sdio->reap_counter++ >= 64) {
550  		while (dio->bio_list) {
551  			unsigned long flags;
552  			struct bio *bio;
553  			int ret2;
554  
555  			spin_lock_irqsave(&dio->bio_lock, flags);
556  			bio = dio->bio_list;
557  			dio->bio_list = bio->bi_private;
558  			spin_unlock_irqrestore(&dio->bio_lock, flags);
559  			ret2 = blk_status_to_errno(dio_bio_complete(dio, bio));
560  			if (ret == 0)
561  				ret = ret2;
562  		}
563  		sdio->reap_counter = 0;
564  	}
565  	return ret;
566  }
567  
568  static int dio_set_defer_completion(struct dio *dio)
569  {
570  	struct super_block *sb = dio->inode->i_sb;
571  
572  	if (dio->defer_completion)
573  		return 0;
574  	dio->defer_completion = true;
575  	if (!sb->s_dio_done_wq)
576  		return sb_init_dio_done_wq(sb);
577  	return 0;
578  }
579  
580  /*
581   * Call into the fs to map some more disk blocks.  We record the current number
582   * of available blocks at sdio->blocks_available.  These are in units of the
583   * fs blocksize, i_blocksize(inode).
584   *
585   * The fs is allowed to map lots of blocks at once.  If it wants to do that,
586   * it uses the passed inode-relative block number as the file offset, as usual.
587   *
588   * get_block() is passed the number of i_blkbits-sized blocks which direct_io
589   * has remaining to do.  The fs should not map more than this number of blocks.
590   *
591   * If the fs has mapped a lot of blocks, it should populate bh->b_size to
592   * indicate how much contiguous disk space has been made available at
593   * bh->b_blocknr.
594   *
595   * If *any* of the mapped blocks are new, then the fs must set buffer_new().
596   * This isn't very efficient...
597   *
598   * In the case of filesystem holes: the fs may return an arbitrarily-large
599   * hole by returning an appropriate value in b_size and by clearing
600   * buffer_mapped().  However the direct-io code will only process holes one
601   * block at a time - it will repeatedly call get_block() as it walks the hole.
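 *
 * A minimal get_block() looks roughly like the one used for block devices
 * (illustrative sketch only; example_get_block is not a real function):
 *
 *	static int example_get_block(struct inode *inode, sector_t iblock,
 *				     struct buffer_head *bh, int create)
 *	{
 *		bh->b_bdev = I_BDEV(inode);
 *		bh->b_blocknr = iblock;
 *		set_buffer_mapped(bh);
 *		return 0;
 *	}
 *
 * A real filesystem consults its block map and may set b_size to cover many
 * contiguous blocks, set buffer_new() for fresh allocations, or leave the
 * buffer unmapped to report a hole.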
602   */
603  static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
604  			   struct buffer_head *map_bh)
605  {
606  	const enum req_op dio_op = dio->opf & REQ_OP_MASK;
607  	int ret;
608  	sector_t fs_startblk;	/* Into file, in filesystem-sized blocks */
609  	sector_t fs_endblk;	/* Into file, in filesystem-sized blocks */
610  	unsigned long fs_count;	/* Number of filesystem-sized blocks */
611  	int create;
612  	unsigned int i_blkbits = sdio->blkbits + sdio->blkfactor;
613  	loff_t i_size;
614  
615  	/*
616  	 * If there was a memory error and we've overwritten all the
617  	 * mapped blocks then we can now return that memory error
618  	 */
619  	ret = dio->page_errors;
620  	if (ret == 0) {
621  		BUG_ON(sdio->block_in_file >= sdio->final_block_in_request);
622  		fs_startblk = sdio->block_in_file >> sdio->blkfactor;
623  		fs_endblk = (sdio->final_block_in_request - 1) >>
624  					sdio->blkfactor;
625  		fs_count = fs_endblk - fs_startblk + 1;
626  
627  		map_bh->b_state = 0;
628  		map_bh->b_size = fs_count << i_blkbits;
629  
630  		/*
631  		 * For writes that could fill holes inside i_size on a
632  		 * DIO_SKIP_HOLES filesystem we forbid block creations: only
633  		 * overwrites are permitted. We will return early to the caller
634  		 * once we see an unmapped buffer head returned, and the caller
635  		 * will fall back to buffered I/O.
636  		 *
637  		 * Otherwise the decision is left to the get_blocks method,
638  		 * which may decide to handle it or also return an unmapped
639  		 * buffer head.
640  		 */
641  		create = dio_op == REQ_OP_WRITE;
642  		if (dio->flags & DIO_SKIP_HOLES) {
643  			i_size = i_size_read(dio->inode);
644  			if (i_size && fs_startblk <= (i_size - 1) >> i_blkbits)
645  				create = 0;
646  		}
647  
648  		ret = (*sdio->get_block)(dio->inode, fs_startblk,
649  						map_bh, create);
650  
651  		/* Store for completion */
652  		dio->private = map_bh->b_private;
653  
654  		if (ret == 0 && buffer_defer_completion(map_bh))
655  			ret = dio_set_defer_completion(dio);
656  	}
657  	return ret;
658  }
659  
660  /*
661   * There is no bio.  Make one now.
662   */
663  static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio,
664  		sector_t start_sector, struct buffer_head *map_bh)
665  {
666  	sector_t sector;
667  	int ret, nr_pages;
668  
669  	ret = dio_bio_reap(dio, sdio);
670  	if (ret)
671  		goto out;
672  	sector = start_sector << (sdio->blkbits - 9);
673  	nr_pages = bio_max_segs(sdio->pages_in_io);
674  	BUG_ON(nr_pages <= 0);
675  	dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages);
676  	sdio->boundary = 0;
677  out:
678  	return ret;
679  }
680  
681  /*
682   * Attempt to put the current chunk of 'cur_page' into the current BIO.  If
683   * that was successful then update final_block_in_bio and take a ref against
684   * the just-added page.
685   *
686   * Return zero on success.  Non-zero means the caller needs to start a new BIO.
687   */
688  static inline int dio_bio_add_page(struct dio *dio, struct dio_submit *sdio)
689  {
690  	int ret;
691  
692  	ret = bio_add_page(sdio->bio, sdio->cur_page,
693  			sdio->cur_page_len, sdio->cur_page_offset);
694  	if (ret == sdio->cur_page_len) {
695  		/*
696  		 * Decrement count only if we are done with this page.
697  		 */
698  		if ((sdio->cur_page_len + sdio->cur_page_offset) == PAGE_SIZE)
699  			sdio->pages_in_io--;
700  		dio_pin_page(dio, sdio->cur_page);
701  		sdio->final_block_in_bio = sdio->cur_page_block +
702  			(sdio->cur_page_len >> sdio->blkbits);
703  		ret = 0;
704  	} else {
705  		ret = 1;
706  	}
707  	return ret;
708  }
709  
710  /*
711   * Put cur_page under IO.  The section of cur_page which is described by
712   * cur_page_offset,cur_page_len is put into a BIO.  The section of cur_page
713   * starts on-disk at cur_page_block.
714   *
715   * We take a ref against the page here (on behalf of its presence in the bio).
716   *
717   * The caller of this function is responsible for removing cur_page from the
718   * dio, and for dropping the refcount which came from that presence.
719   */
720  static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio,
721  		struct buffer_head *map_bh)
722  {
723  	int ret = 0;
724  
725  	if (sdio->bio) {
726  		loff_t cur_offset = sdio->cur_page_fs_offset;
727  		loff_t bio_next_offset = sdio->logical_offset_in_bio +
728  			sdio->bio->bi_iter.bi_size;
729  
730  		/*
731  		 * See whether this new request is contiguous with the old.
732  		 *
733  		 * Btrfs cannot handle having logically non-contiguous requests
734  		 * submitted.  For example if you have
735  		 *
736  		 * Logical:  [0-4095][HOLE][8192-12287]
737  		 * Physical: [0-4095]      [4096-8191]
738  		 *
739  		 * We cannot submit those pages together as one BIO.  So if our
740  		 * current logical offset in the file does not equal what would
741  		 * be the next logical offset in the bio, submit the bio we
742  		 * have.
743  		 */
744  		if (sdio->final_block_in_bio != sdio->cur_page_block ||
745  		    cur_offset != bio_next_offset)
746  			dio_bio_submit(dio, sdio);
747  	}
748  
749  	if (sdio->bio == NULL) {
750  		ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh);
751  		if (ret)
752  			goto out;
753  	}
754  
755  	if (dio_bio_add_page(dio, sdio) != 0) {
756  		dio_bio_submit(dio, sdio);
757  		ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh);
758  		if (ret == 0) {
759  			ret = dio_bio_add_page(dio, sdio);
760  			BUG_ON(ret != 0);
761  		}
762  	}
763  out:
764  	return ret;
765  }
766  
767  /*
768   * An autonomous function to put a chunk of a page under deferred IO.
769   *
770   * The caller doesn't actually know (or care) whether this piece of page is in
771   * a BIO, or is under IO or whatever.  We just take care of all possible
772   * situations here.  The separation between the logic of do_direct_IO() and
773   * that of submit_page_section() is important for clarity.  Please don't break.
774   *
775   * The chunk of page starts on-disk at blocknr.
776   *
777   * We perform deferred IO, by recording the last-submitted page inside our
778   * private part of the dio structure.  If possible, we just expand the IO
779   * across that page here.
780   *
781   * If that doesn't work out then we put the old page into the bio and add this
782   * page to the dio instead.
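 *
 * For example, with 512-byte dio_blocks, two successive calls covering bytes
 * 0-511 and 512-1023 of the same page with adjacent blocknr values simply
 * grow the deferred chunk; a call whose blocknr does not continue the
 * previous chunk flushes the old chunk into a bio first.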
783   */
784  static inline int
785  submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
786  		    unsigned offset, unsigned len, sector_t blocknr,
787  		    struct buffer_head *map_bh)
788  {
789  	const enum req_op dio_op = dio->opf & REQ_OP_MASK;
790  	int ret = 0;
791  	int boundary = sdio->boundary;	/* dio_send_cur_page may clear it */
792  
793  	if (dio_op == REQ_OP_WRITE) {
794  		/*
795  		 * Read accounting is performed in submit_bio()
796  		 */
797  		task_io_account_write(len);
798  	}
799  
800  	/*
801  	 * Can we just grow the current page's presence in the dio?
802  	 */
803  	if (sdio->cur_page == page &&
804  	    sdio->cur_page_offset + sdio->cur_page_len == offset &&
805  	    sdio->cur_page_block +
806  	    (sdio->cur_page_len >> sdio->blkbits) == blocknr) {
807  		sdio->cur_page_len += len;
808  		goto out;
809  	}
810  
811  	/*
812  	 * If there's a deferred page already there then send it.
813  	 */
814  	if (sdio->cur_page) {
815  		ret = dio_send_cur_page(dio, sdio, map_bh);
816  		dio_unpin_page(dio, sdio->cur_page);
817  		sdio->cur_page = NULL;
818  		if (ret)
819  			return ret;
820  	}
821  
822  	dio_pin_page(dio, page);		/* It is in dio */
823  	sdio->cur_page = page;
824  	sdio->cur_page_offset = offset;
825  	sdio->cur_page_len = len;
826  	sdio->cur_page_block = blocknr;
827  	sdio->cur_page_fs_offset = sdio->block_in_file << sdio->blkbits;
828  out:
829  	/*
830  	 * If boundary then we want to schedule the IO now to
831  	 * avoid metadata seeks.
832  	 */
833  	if (boundary) {
834  		ret = dio_send_cur_page(dio, sdio, map_bh);
835  		if (sdio->bio)
836  			dio_bio_submit(dio, sdio);
837  		dio_unpin_page(dio, sdio->cur_page);
838  		sdio->cur_page = NULL;
839  	}
840  	return ret;
841  }
842  
843  /*
844   * If we are not writing the entire block and get_block() allocated
845   * the block for us, we need to fill-in the unused portion of the
846   * block with zeros. This happens only if user-buffer, fileoffset or
847   * io length is not a filesystem block-size multiple.
848   *
849   * `end' is zero if we're doing the start of the IO, 1 at the end of the
850   * IO.
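 *
 * Example (with blkfactor = 3, i.e. 512-byte dio_blocks inside a 4096-byte
 * fs block): a write whose first dio_block lands at offset 5 within a newly
 * allocated fs block gets dio_blocks 0-4 (2560 bytes) zeroed at the start,
 * while a write whose tail stops at offset 5 gets dio_blocks 5-7 (1536
 * bytes) zeroed at the end.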
851   */
852  static inline void dio_zero_block(struct dio *dio, struct dio_submit *sdio,
853  		int end, struct buffer_head *map_bh)
854  {
855  	unsigned dio_blocks_per_fs_block;
856  	unsigned this_chunk_blocks;	/* In dio_blocks */
857  	unsigned this_chunk_bytes;
858  	struct page *page;
859  
860  	sdio->start_zero_done = 1;
861  	if (!sdio->blkfactor || !buffer_new(map_bh))
862  		return;
863  
864  	dio_blocks_per_fs_block = 1 << sdio->blkfactor;
865  	this_chunk_blocks = sdio->block_in_file & (dio_blocks_per_fs_block - 1);
866  
867  	if (!this_chunk_blocks)
868  		return;
869  
870  	/*
871  	 * We need to zero out part of an fs block.  It is either at the
872  	 * beginning or the end of the fs block.
873  	 */
874  	if (end)
875  		this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks;
876  
877  	this_chunk_bytes = this_chunk_blocks << sdio->blkbits;
878  
879  	page = ZERO_PAGE(0);
880  	if (submit_page_section(dio, sdio, page, 0, this_chunk_bytes,
881  				sdio->next_block_for_io, map_bh))
882  		return;
883  
884  	sdio->next_block_for_io += this_chunk_blocks;
885  }
886  
887  /*
888   * Walk the user pages, and the file, mapping blocks to disk and generating
889   * a sequence of (page,offset,len,block) mappings.  These mappings are injected
890   * into submit_page_section(), which takes care of the next stage of submission.
891   *
892   * Direct IO against a blockdev is different from a file, because we can
893   * happily perform page-sized but 512-byte aligned IOs.  It is important that
894   * blockdev IO be able to have fine alignment and large sizes.
895   *
896   * So what we do is to permit the ->get_block function to populate bh.b_size
897   * with the size of IO which is permitted at this offset and this i_blkbits.
898   *
899   * For best results, the blockdev should be set up with 512-byte i_blkbits and
900   * it should set b_size to PAGE_SIZE or more inside get_block().  This gives
901   * fine alignment but still allows this function to work in PAGE_SIZE units.
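 *
 * (With 4 KiB pages that means blkbits = 9, and each get_block() call makes
 * at least b_size >> 9 = 8 dio blocks available, so whole pages can still be
 * added to bios in one go.)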
902   */
903  static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
904  			struct buffer_head *map_bh)
905  {
906  	const enum req_op dio_op = dio->opf & REQ_OP_MASK;
907  	const unsigned blkbits = sdio->blkbits;
908  	const unsigned i_blkbits = blkbits + sdio->blkfactor;
909  	int ret = 0;
910  
911  	while (sdio->block_in_file < sdio->final_block_in_request) {
912  		struct page *page;
913  		size_t from, to;
914  
915  		page = dio_get_page(dio, sdio);
916  		if (IS_ERR(page)) {
917  			ret = PTR_ERR(page);
918  			goto out;
919  		}
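		/*
		 * Only the first extracted page may start at a non-zero
		 * offset (sdio->from) and only the last one may end short
		 * (sdio->to); every page in between is used in full.
		 */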
920  		from = sdio->head ? 0 : sdio->from;
921  		to = (sdio->head == sdio->tail - 1) ? sdio->to : PAGE_SIZE;
922  		sdio->head++;
923  
924  		while (from < to) {
925  			unsigned this_chunk_bytes;	/* # of bytes mapped */
926  			unsigned this_chunk_blocks;	/* # of blocks */
927  			unsigned u;
928  
929  			if (sdio->blocks_available == 0) {
930  				/*
931  				 * Need to go and map some more disk
932  				 */
933  				unsigned long blkmask;
934  				unsigned long dio_remainder;
935  
936  				ret = get_more_blocks(dio, sdio, map_bh);
937  				if (ret) {
938  					dio_unpin_page(dio, page);
939  					goto out;
940  				}
941  				if (!buffer_mapped(map_bh))
942  					goto do_holes;
943  
944  				sdio->blocks_available =
945  						map_bh->b_size >> blkbits;
946  				sdio->next_block_for_io =
947  					map_bh->b_blocknr << sdio->blkfactor;
948  				if (buffer_new(map_bh)) {
949  					clean_bdev_aliases(
950  						map_bh->b_bdev,
951  						map_bh->b_blocknr,
952  						map_bh->b_size >> i_blkbits);
953  				}
954  
955  				if (!sdio->blkfactor)
956  					goto do_holes;
957  
958  				blkmask = (1 << sdio->blkfactor) - 1;
959  				dio_remainder = (sdio->block_in_file & blkmask);
960  
961  				/*
962  				 * If we are at the start of IO and that IO
963  				 * starts partway into a fs-block,
964  				 * dio_remainder will be non-zero.  If the IO
965  				 * is a read then we can simply advance the IO
966  				 * cursor to the first block which is to be
967  				 * read.  But if the IO is a write and the
968  				 * block was newly allocated we cannot do that;
969  				 * the start of the fs block must be zeroed out
970  				 * on-disk.
971  				 */
972  				if (!buffer_new(map_bh))
973  					sdio->next_block_for_io += dio_remainder;
974  				sdio->blocks_available -= dio_remainder;
975  			}
976  do_holes:
977  			/* Handle holes */
978  			if (!buffer_mapped(map_bh)) {
979  				loff_t i_size_aligned;
980  
981  				/* AKPM: eargh, -ENOTBLK is a hack */
982  				if (dio_op == REQ_OP_WRITE) {
983  					dio_unpin_page(dio, page);
984  					return -ENOTBLK;
985  				}
986  
987  				/*
988  				 * Be sure to account for a partial block as the
989  				 * last block in the file
990  				 */
991  				i_size_aligned = ALIGN(i_size_read(dio->inode),
992  							1 << blkbits);
993  				if (sdio->block_in_file >=
994  						i_size_aligned >> blkbits) {
995  					/* We hit eof */
996  					dio_unpin_page(dio, page);
997  					goto out;
998  				}
999  				zero_user(page, from, 1 << blkbits);
1000  				sdio->block_in_file++;
1001  				from += 1 << blkbits;
1002  				dio->result += 1 << blkbits;
1003  				goto next_block;
1004  			}
1005  
1006  			/*
1007  			 * If we're performing IO which has an alignment which
1008  			 * is finer than the underlying fs, go check to see if
1009  			 * we must zero out the start of this block.
1010  			 */
1011  			if (unlikely(sdio->blkfactor && !sdio->start_zero_done))
1012  				dio_zero_block(dio, sdio, 0, map_bh);
1013  
1014  			/*
1015  			 * Work out, in this_chunk_blocks, how much disk we
1016  			 * can add to this page
1017  			 */
1018  			this_chunk_blocks = sdio->blocks_available;
1019  			u = (to - from) >> blkbits;
1020  			if (this_chunk_blocks > u)
1021  				this_chunk_blocks = u;
1022  			u = sdio->final_block_in_request - sdio->block_in_file;
1023  			if (this_chunk_blocks > u)
1024  				this_chunk_blocks = u;
1025  			this_chunk_bytes = this_chunk_blocks << blkbits;
1026  			BUG_ON(this_chunk_bytes == 0);
1027  
1028  			if (this_chunk_blocks == sdio->blocks_available)
1029  				sdio->boundary = buffer_boundary(map_bh);
1030  			ret = submit_page_section(dio, sdio, page,
1031  						  from,
1032  						  this_chunk_bytes,
1033  						  sdio->next_block_for_io,
1034  						  map_bh);
1035  			if (ret) {
1036  				dio_unpin_page(dio, page);
1037  				goto out;
1038  			}
1039  			sdio->next_block_for_io += this_chunk_blocks;
1040  
1041  			sdio->block_in_file += this_chunk_blocks;
1042  			from += this_chunk_bytes;
1043  			dio->result += this_chunk_bytes;
1044  			sdio->blocks_available -= this_chunk_blocks;
1045  next_block:
1046  			BUG_ON(sdio->block_in_file > sdio->final_block_in_request);
1047  			if (sdio->block_in_file == sdio->final_block_in_request)
1048  				break;
1049  		}
1050  
1051  		/* Drop the pin which was taken in iov_iter_extract_pages() */
1052  		dio_unpin_page(dio, page);
1053  	}
1054  out:
1055  	return ret;
1056  }
1057  
1058  static inline int drop_refcount(struct dio *dio)
1059  {
1060  	int ret2;
1061  	unsigned long flags;
1062  
1063  	/*
1064  	 * Sync will always be dropping the final ref and completing the
1065  	 * operation.  AIO can do so too if the operation was broken as
1066  	 * described above, or indeed if all the bios race to complete before
1067  	 * we get here.  In that case dio_complete() translates -EIOCBQUEUED
1068  	 * into the proper return code that the caller will hand to ->complete().
1069  	 *
1070  	 * This is managed by the bio_lock instead of being an atomic_t so that
1071  	 * completion paths can drop their ref and use the remaining count to
1072  	 * decide to wake the submission path atomically.
1073  	 */
1074  	spin_lock_irqsave(&dio->bio_lock, flags);
1075  	ret2 = --dio->refcount;
1076  	spin_unlock_irqrestore(&dio->bio_lock, flags);
1077  	return ret2;
1078  }
1079  
1080  /*
1081   * This is a library function for use by filesystem drivers.
1082   *
1083   * The locking rules are governed by the flags parameter:
1084   *  - if the flags value contains DIO_LOCKING we use a fancy locking
1085   *    scheme for dumb filesystems.
1086   *    For writes this function is called under i_mutex and returns with
1087   *    i_mutex held, for reads, i_mutex is not held on entry, but it is
1088   *    taken and dropped again before returning.
1089   *  - if the flags value does NOT contain DIO_LOCKING we don't use any
1090   *    internal locking but rather rely on the filesystem to synchronize
1091   *    direct I/O reads/writes versus each other and truncate.
1092   *
1093   * To help with locking against truncate we incremented the i_dio_count
1094   * counter before starting direct I/O, and decrement it once we are done.
1095   * Truncate can wait for it to reach zero to provide exclusion.  It is
1096   * expected that filesystem provide exclusion between new direct I/O
1097   * and truncates.  For DIO_LOCKING filesystems this is done by i_mutex,
1098   * but other filesystems need to take care of this on their own.
1099   *
1100   * NOTE: if you pass "sdio" to anything by pointer make sure that function
1101   * is always inlined. Otherwise gcc is unable to split the structure into
1102   * individual fields and will generate much worse code. This is important
1103   * for the whole file.
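 *
 * A typical filesystem reaches this via the blockdev_direct_IO() wrapper
 * from its ->direct_IO() method, roughly like this (sketch only; the names
 * foo_direct_IO/foo_get_block stand in for the filesystem's own):
 *
 *	static ssize_t foo_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 *	{
 *		struct inode *inode = iocb->ki_filp->f_mapping->host;
 *
 *		return blockdev_direct_IO(iocb, inode, iter, foo_get_block);
 *	}
 *
 * where blockdev_direct_IO() supplies DIO_LOCKING | DIO_SKIP_HOLES as the
 * flags argument.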
1104   */
1105  ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
1106  		struct block_device *bdev, struct iov_iter *iter,
1107  		get_block_t get_block, dio_iodone_t end_io,
1108  		int flags)
1109  {
1110  	unsigned i_blkbits = READ_ONCE(inode->i_blkbits);
1111  	unsigned blkbits = i_blkbits;
1112  	unsigned blocksize_mask = (1 << blkbits) - 1;
1113  	ssize_t retval = -EINVAL;
1114  	const size_t count = iov_iter_count(iter);
1115  	loff_t offset = iocb->ki_pos;
1116  	const loff_t end = offset + count;
1117  	struct dio *dio;
1118  	struct dio_submit sdio = { NULL, };
1119  	struct buffer_head map_bh = { 0, };
1120  	struct blk_plug plug;
1121  	unsigned long align = offset | iov_iter_alignment(iter);
1122  
1123  	/* watch out for a 0 len io from a tricksy fs */
1124  	if (iov_iter_rw(iter) == READ && !count)
1125  		return 0;
1126  
1127  	dio = kmem_cache_alloc(dio_cache, GFP_KERNEL);
1128  	if (!dio)
1129  		return -ENOMEM;
1130  	/*
1131  	 * Believe it or not, zeroing out the page array caused a 0.5%
1132  	 * performance regression in a database benchmark.  So, we take
1133  	 * care to only zero out what's needed.
1134  	 */
1135  	memset(dio, 0, offsetof(struct dio, pages));
1136  
1137  	dio->flags = flags;
1138  	if (dio->flags & DIO_LOCKING && iov_iter_rw(iter) == READ) {
1139  		/* will be released before this function returns */
1140  		inode_lock(inode);
1141  	}
1142  	dio->is_pinned = iov_iter_extract_will_pin(iter);
1143  
1144  	/* Once we've sampled i_size, check for reads beyond EOF */
1145  	dio->i_size = i_size_read(inode);
1146  	if (iov_iter_rw(iter) == READ && offset >= dio->i_size) {
1147  		retval = 0;
1148  		goto fail_dio;
1149  	}
1150  
1151  	if (align & blocksize_mask) {
1152  		if (bdev)
1153  			blkbits = blksize_bits(bdev_logical_block_size(bdev));
1154  		blocksize_mask = (1 << blkbits) - 1;
1155  		if (align & blocksize_mask)
1156  			goto fail_dio;
1157  	}
1158  
1159  	if (dio->flags & DIO_LOCKING && iov_iter_rw(iter) == READ) {
1160  		struct address_space *mapping = iocb->ki_filp->f_mapping;
1161  
1162  		retval = filemap_write_and_wait_range(mapping, offset, end - 1);
1163  		if (retval)
1164  			goto fail_dio;
1165  	}
1166  
1167  	/*
1168  	 * For file-extending writes, updating i_size before data writeouts
1169  	 * complete can expose uninitialized blocks in dumb filesystems.
1170  	 * In that case we need to wait for I/O completion even if asked
1171  	 * for an asynchronous write.
1172  	 */
1173  	if (is_sync_kiocb(iocb))
1174  		dio->is_async = false;
1175  	else if (iov_iter_rw(iter) == WRITE && end > i_size_read(inode))
1176  		dio->is_async = false;
1177  	else
1178  		dio->is_async = true;
1179  
1180  	dio->inode = inode;
1181  	if (iov_iter_rw(iter) == WRITE) {
1182  		dio->opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
1183  		if (iocb->ki_flags & IOCB_NOWAIT)
1184  			dio->opf |= REQ_NOWAIT;
1185  	} else {
1186  		dio->opf = REQ_OP_READ;
1187  	}
1188  
1189  	/*
1190  	 * For AIO O_(D)SYNC writes we need to defer completions to a workqueue
1191  	 * so that we can call ->fsync.
1192  	 */
1193  	if (dio->is_async && iov_iter_rw(iter) == WRITE) {
1194  		retval = 0;
1195  		if (iocb_is_dsync(iocb))
1196  			retval = dio_set_defer_completion(dio);
1197  		else if (!dio->inode->i_sb->s_dio_done_wq) {
1198  			/*
1199  			 * In case of AIO write racing with buffered read we
1200  			 * need to defer completion. We can't decide this now,
1201  			 * however the workqueue needs to be initialized here.
1202  			 */
1203  			retval = sb_init_dio_done_wq(dio->inode->i_sb);
1204  		}
1205  		if (retval)
1206  			goto fail_dio;
1207  	}
1208  
1209  	/*
1210  	 * Will be decremented at I/O completion time.
1211  	 */
1212  	inode_dio_begin(inode);
1213  
1214  	sdio.blkbits = blkbits;
1215  	sdio.blkfactor = i_blkbits - blkbits;
1216  	sdio.block_in_file = offset >> blkbits;
1217  
1218  	sdio.get_block = get_block;
1219  	dio->end_io = end_io;
1220  	sdio.final_block_in_bio = -1;
1221  	sdio.next_block_for_io = -1;
1222  
1223  	dio->iocb = iocb;
1224  
1225  	spin_lock_init(&dio->bio_lock);
1226  	dio->refcount = 1;
1227  
1228  	dio->should_dirty = user_backed_iter(iter) && iov_iter_rw(iter) == READ;
1229  	sdio.iter = iter;
1230  	sdio.final_block_in_request = end >> blkbits;
1231  
1232  	/*
1233  	 * In case of non-aligned buffers, we may need 2 more
1234  	 * pages since we need to zero out the first and last block.
1235  	 */
1236  	if (unlikely(sdio.blkfactor))
1237  		sdio.pages_in_io = 2;
1238  
1239  	sdio.pages_in_io += iov_iter_npages(iter, INT_MAX);
1240  
1241  	blk_start_plug(&plug);
1242  
1243  	retval = do_direct_IO(dio, &sdio, &map_bh);
1244  	if (retval)
1245  		dio_cleanup(dio, &sdio);
1246  
1247  	if (retval == -ENOTBLK) {
1248  		/*
1249  		 * The remaining part of the request will be
1250  		 * handled by buffered I/O when we return
1251  		 */
1252  		retval = 0;
1253  	}
1254  	/*
1255  	 * There may be some unwritten disk at the end of a part-written
1256  	 * fs-block-sized block.  Go zero that now.
1257  	 */
1258  	dio_zero_block(dio, &sdio, 1, &map_bh);
1259  
1260  	if (sdio.cur_page) {
1261  		ssize_t ret2;
1262  
1263  		ret2 = dio_send_cur_page(dio, &sdio, &map_bh);
1264  		if (retval == 0)
1265  			retval = ret2;
1266  		dio_unpin_page(dio, sdio.cur_page);
1267  		sdio.cur_page = NULL;
1268  	}
1269  	if (sdio.bio)
1270  		dio_bio_submit(dio, &sdio);
1271  
1272  	blk_finish_plug(&plug);
1273  
1274  	/*
1275  	 * It is possible that we return a short IO due to end of file.
1276  	 * In that case, we need to release all the pages we got hold of.
1277  	 */
1278  	dio_cleanup(dio, &sdio);
1279  
1280  	/*
1281  	 * All block lookups have been performed. For READ requests
1282  	 * we can let i_mutex go now that it has achieved its purpose
1283  	 * of protecting us from looking up uninitialized blocks.
1284  	 */
1285  	if (iov_iter_rw(iter) == READ && (dio->flags & DIO_LOCKING))
1286  		inode_unlock(dio->inode);
1287  
1288  	/*
1289  	 * The only time we want to leave bios in flight is when a successful
1290  	 * partial aio read or full aio write has been set up.  In that case
1291  	 * bio completion will call aio_complete.  The only time it's safe to
1292  	 * call aio_complete is when we return -EIOCBQUEUED, so we key on that.
1293  	 * This had *better* be the only place that raises -EIOCBQUEUED.
1294  	 */
1295  	BUG_ON(retval == -EIOCBQUEUED);
1296  	if (dio->is_async && retval == 0 && dio->result &&
1297  	    (iov_iter_rw(iter) == READ || dio->result == count))
1298  		retval = -EIOCBQUEUED;
1299  	else
1300  		dio_await_completion(dio);
1301  
1302  	if (drop_refcount(dio) == 0) {
1303  		retval = dio_complete(dio, retval, DIO_COMPLETE_INVALIDATE);
1304  	} else
1305  		BUG_ON(retval != -EIOCBQUEUED);
1306  
1307  	return retval;
1308  
1309  fail_dio:
1310  	if (dio->flags & DIO_LOCKING && iov_iter_rw(iter) == READ)
1311  		inode_unlock(inode);
1312  
1313  	kmem_cache_free(dio_cache, dio);
1314  	return retval;
1315  }
1316  EXPORT_SYMBOL(__blockdev_direct_IO);
1317  
1318  static __init int dio_init(void)
1319  {
1320  	dio_cache = KMEM_CACHE(dio, SLAB_PANIC);
1321  	return 0;
1322  }
1323  module_init(dio_init)
1324