// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 1991, 1992 Linus Torvalds
 * Copyright (C) 1994,      Karl Keyte: Added support for disk statistics
 * Elevator latency, (C) 2000  Andrea Arcangeli <andrea@suse.de> SuSE
 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au>
 *	-  July 2000
 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - May 2001
 */

/*
 * This handles all read/write requests to block devices
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-pm.h>
#include <linux/blk-integrity.h>
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/kernel_stat.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/completion.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/fault-inject.h>
#include <linux/list_sort.h>
#include <linux/delay.h>
#include <linux/ratelimit.h>
#include <linux/pm_runtime.h>
#include <linux/t10-pi.h>
#include <linux/debugfs.h>
#include <linux/bpf.h>
#include <linux/part_stat.h>
#include <linux/sched/sysctl.h>
#include <linux/blk-crypto.h>

#define CREATE_TRACE_POINTS
#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-pm.h"
#include "blk-cgroup.h"
#include "blk-throttle.h"
#include "blk-ioprio.h"

struct dentry *blk_debugfs_root;

EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_split);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_insert);

static DEFINE_IDA(blk_queue_ida);

/*
 * For queue allocation
 */
static struct kmem_cache *blk_requestq_cachep;

/*
 * Controlling structure for kblockd
 */
static struct workqueue_struct *kblockd_workqueue;

/**
 * blk_queue_flag_set - atomically set a queue flag
 * @flag: flag to be set
 * @q: request queue
 */
void blk_queue_flag_set(unsigned int flag, struct request_queue *q)
{
	set_bit(flag, &q->queue_flags);
}
EXPORT_SYMBOL(blk_queue_flag_set);

/**
 * blk_queue_flag_clear - atomically clear a queue flag
 * @flag: flag to be cleared
 * @q: request queue
 */
void blk_queue_flag_clear(unsigned int flag, struct request_queue *q)
{
	clear_bit(flag, &q->queue_flags);
}
EXPORT_SYMBOL(blk_queue_flag_clear);

#define REQ_OP_NAME(name) [REQ_OP_##name] = #name
static const char *const blk_op_name[] = {
	REQ_OP_NAME(READ),
	REQ_OP_NAME(WRITE),
	REQ_OP_NAME(FLUSH),
	REQ_OP_NAME(DISCARD),
	REQ_OP_NAME(SECURE_ERASE),
	REQ_OP_NAME(ZONE_RESET),
	REQ_OP_NAME(ZONE_RESET_ALL),
	REQ_OP_NAME(ZONE_OPEN),
	REQ_OP_NAME(ZONE_CLOSE),
	REQ_OP_NAME(ZONE_FINISH),
	REQ_OP_NAME(ZONE_APPEND),
	REQ_OP_NAME(WRITE_ZEROES),
	REQ_OP_NAME(DRV_IN),
	REQ_OP_NAME(DRV_OUT),
};
#undef REQ_OP_NAME

/**
 * blk_op_str - Return the string XXX for a REQ_OP_XXX operation
 * @op: REQ_OP_XXX.
 *
 * Description: Centralized block layer function to convert REQ_OP_XXX into
 * string format. Useful for debugging and tracing bios or requests. For an
 * invalid REQ_OP_XXX it returns the string "UNKNOWN".
 */
inline const char *blk_op_str(enum req_op op)
{
	const char *op_str = "UNKNOWN";

	if (op < ARRAY_SIZE(blk_op_name) && blk_op_name[op])
		op_str = blk_op_name[op];

	return op_str;
}
EXPORT_SYMBOL_GPL(blk_op_str);

static const struct {
	int		errno;
	const char	*name;
} blk_errors[] = {
	[BLK_STS_OK]		= { 0,		"" },
	[BLK_STS_NOTSUPP]	= { -EOPNOTSUPP, "operation not supported" },
	[BLK_STS_TIMEOUT]	= { -ETIMEDOUT,	"timeout" },
	[BLK_STS_NOSPC]		= { -ENOSPC,	"critical space allocation" },
	[BLK_STS_TRANSPORT]	= { -ENOLINK,	"recoverable transport" },
	[BLK_STS_TARGET]	= { -EREMOTEIO,	"critical target" },
	[BLK_STS_RESV_CONFLICT]	= { -EBADE,	"reservation conflict" },
	[BLK_STS_MEDIUM]	= { -ENODATA,	"critical medium" },
	[BLK_STS_PROTECTION]	= { -EILSEQ,	"protection" },
	[BLK_STS_RESOURCE]	= { -ENOMEM,	"kernel resource" },
	[BLK_STS_DEV_RESOURCE]	= { -EBUSY,	"device resource" },
	[BLK_STS_AGAIN]		= { -EAGAIN,	"nonblocking retry" },
	[BLK_STS_OFFLINE]	= { -ENODEV,	"device offline" },

	/* device mapper special case, should not leak out: */
	[BLK_STS_DM_REQUEUE]	= { -EREMCHG, "dm internal retry" },

	/* zone device specific errors */
	[BLK_STS_ZONE_OPEN_RESOURCE]	= { -ETOOMANYREFS, "open zones exceeded" },
	[BLK_STS_ZONE_ACTIVE_RESOURCE]	= { -EOVERFLOW, "active zones exceeded" },

	/* Command duration limit device-side timeout */
	[BLK_STS_DURATION_LIMIT]	= { -ETIME, "duration limit exceeded" },

	[BLK_STS_INVAL]		= { -EINVAL,	"invalid" },

	/* everything else not covered above: */
	[BLK_STS_IOERR]		= { -EIO,	"I/O" },
};

blk_status_t errno_to_blk_status(int errno)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(blk_errors); i++) {
		if (blk_errors[i].errno == errno)
			return (__force blk_status_t)i;
	}

	return BLK_STS_IOERR;
}
EXPORT_SYMBOL_GPL(errno_to_blk_status);

int blk_status_to_errno(blk_status_t status)
{
	int idx = (__force int)status;

	if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
		return -EIO;
	return blk_errors[idx].errno;
}
EXPORT_SYMBOL_GPL(blk_status_to_errno);

const char *blk_status_to_str(blk_status_t status)
{
	int idx = (__force int)status;

	if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
		return "<null>";
	return blk_errors[idx].name;
}
EXPORT_SYMBOL_GPL(blk_status_to_str);
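
/*
 * Illustrative usage sketch (not part of the original file): a bio based
 * driver that gets an errno back from a lower layer can translate it with
 * errno_to_blk_status() before completing the bio.  my_driver_do_io() is
 * hypothetical.
 *
 *	static void my_driver_end_bio(struct bio *bio)
 *	{
 *		int ret = my_driver_do_io(bio);	// hypothetical lower-level call
 *
 *		bio->bi_status = errno_to_blk_status(ret);
 *		bio_endio(bio);
 *	}
 */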

/**
 * blk_sync_queue - cancel any pending callbacks on a queue
 * @q: the queue
 *
 * Description:
 *     The block layer may perform asynchronous callback activity
 *     on a queue, such as calling the unplug function after a timeout.
 *     A block device may call blk_sync_queue to ensure that any
 *     such activity is cancelled, thus allowing it to release resources
 *     that the callbacks might use. The caller must already have made sure
 *     that its ->submit_bio will not re-add plugging prior to calling
 *     this function.
 *
 *     This function does not cancel any asynchronous activity arising
 *     out of elevator or throttling code. That would require elevator_exit()
 *     and blkcg_exit_queue() to be called with queue lock initialized.
 *
 */
void blk_sync_queue(struct request_queue *q)
{
	del_timer_sync(&q->timeout);
	cancel_work_sync(&q->timeout_work);
}
EXPORT_SYMBOL(blk_sync_queue);

/**
 * blk_set_pm_only - increment pm_only counter
 * @q: request queue pointer
 */
void blk_set_pm_only(struct request_queue *q)
{
	atomic_inc(&q->pm_only);
}
EXPORT_SYMBOL_GPL(blk_set_pm_only);

void blk_clear_pm_only(struct request_queue *q)
{
	int pm_only;

	pm_only = atomic_dec_return(&q->pm_only);
	WARN_ON_ONCE(pm_only < 0);
	if (pm_only == 0)
		wake_up_all(&q->mq_freeze_wq);
}
EXPORT_SYMBOL_GPL(blk_clear_pm_only);

static void blk_free_queue_rcu(struct rcu_head *rcu_head)
{
	struct request_queue *q = container_of(rcu_head,
			struct request_queue, rcu_head);

	percpu_ref_exit(&q->q_usage_counter);
	kmem_cache_free(blk_requestq_cachep, q);
}

static void blk_free_queue(struct request_queue *q)
{
	blk_free_queue_stats(q->stats);
	if (queue_is_mq(q))
		blk_mq_release(q);

	ida_free(&blk_queue_ida, q->id);
	call_rcu(&q->rcu_head, blk_free_queue_rcu);
}

/**
 * blk_put_queue - decrement the request_queue refcount
 * @q: the request_queue structure to decrement the refcount for
 *
 * Decrements the refcount of the request_queue and frees it when the refcount
 * reaches 0.
 */
void blk_put_queue(struct request_queue *q)
{
	if (refcount_dec_and_test(&q->refs))
		blk_free_queue(q);
}
EXPORT_SYMBOL(blk_put_queue);

void blk_queue_start_drain(struct request_queue *q)
{
	/*
	 * When the queue DYING flag is set, we need to block new requests
	 * from entering the queue, so we call blk_freeze_queue_start() to
	 * prevent I/O from crossing blk_queue_enter().
	 */
	blk_freeze_queue_start(q);
	if (queue_is_mq(q))
		blk_mq_wake_waiters(q);
	/* Make blk_queue_enter() reexamine the DYING flag. */
	wake_up_all(&q->mq_freeze_wq);
}

/**
 * blk_queue_enter() - try to increase q->q_usage_counter
 * @q: request queue pointer
 * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PM
 */
int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
{
	const bool pm = flags & BLK_MQ_REQ_PM;

	while (!blk_try_enter_queue(q, pm)) {
		if (flags & BLK_MQ_REQ_NOWAIT)
			return -EAGAIN;

		/*
		 * This is the read barrier pairing with the one in
		 * blk_freeze_queue_start().  We need to order reading the
		 * __PERCPU_REF_DEAD flag of .q_usage_counter against reading
		 * .mq_freeze_depth or the queue dying flag, otherwise the
		 * following wait may never return if the two reads are
		 * reordered.
		 */
		smp_rmb();
		wait_event(q->mq_freeze_wq,
			   (!q->mq_freeze_depth &&
			    blk_pm_resume_queue(pm, q)) ||
			   blk_queue_dying(q));
		if (blk_queue_dying(q))
			return -ENODEV;
	}

	return 0;
}

int __bio_queue_enter(struct request_queue *q, struct bio *bio)
{
	while (!blk_try_enter_queue(q, false)) {
		struct gendisk *disk = bio->bi_bdev->bd_disk;

		if (bio->bi_opf & REQ_NOWAIT) {
			if (test_bit(GD_DEAD, &disk->state))
				goto dead;
			bio_wouldblock_error(bio);
			return -EAGAIN;
		}

		/*
		 * This is the read barrier pairing with the one in
		 * blk_freeze_queue_start().  We need to order reading the
		 * __PERCPU_REF_DEAD flag of .q_usage_counter against reading
		 * .mq_freeze_depth or the queue dying flag, otherwise the
		 * following wait may never return if the two reads are
		 * reordered.
		 */
		smp_rmb();
		wait_event(q->mq_freeze_wq,
			   (!q->mq_freeze_depth &&
			    blk_pm_resume_queue(false, q)) ||
			   test_bit(GD_DEAD, &disk->state));
		if (test_bit(GD_DEAD, &disk->state))
			goto dead;
	}

	return 0;
dead:
	bio_io_error(bio);
	return -ENODEV;
}

void blk_queue_exit(struct request_queue *q)
{
	percpu_ref_put(&q->q_usage_counter);
}
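
/*
 * Illustrative usage sketch (not part of the original file): code that needs
 * the queue to stay usable across a non-bio operation brackets it with
 * blk_queue_enter()/blk_queue_exit().  my_do_queue_work() is hypothetical.
 *
 *	int ret = blk_queue_enter(q, BLK_MQ_REQ_NOWAIT);
 *
 *	if (ret)
 *		return ret;		// -EAGAIN or -ENODEV
 *	my_do_queue_work(q);		// queue cannot be frozen/torn down here
 *	blk_queue_exit(q);
 */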

static void blk_queue_usage_counter_release(struct percpu_ref *ref)
{
	struct request_queue *q =
		container_of(ref, struct request_queue, q_usage_counter);

	wake_up_all(&q->mq_freeze_wq);
}

static void blk_rq_timed_out_timer(struct timer_list *t)
{
	struct request_queue *q = from_timer(q, t, timeout);

	kblockd_schedule_work(&q->timeout_work);
}

static void blk_timeout_work(struct work_struct *work)
{
}

struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id)
{
	struct request_queue *q;
	int error;

	q = kmem_cache_alloc_node(blk_requestq_cachep, GFP_KERNEL | __GFP_ZERO,
				  node_id);
	if (!q)
		return ERR_PTR(-ENOMEM);

	q->last_merge = NULL;

	q->id = ida_alloc(&blk_queue_ida, GFP_KERNEL);
	if (q->id < 0) {
		error = q->id;
		goto fail_q;
	}

	q->stats = blk_alloc_queue_stats();
	if (!q->stats) {
		error = -ENOMEM;
		goto fail_id;
	}

	error = blk_set_default_limits(lim);
	if (error)
		goto fail_stats;
	q->limits = *lim;

	q->node = node_id;

	atomic_set(&q->nr_active_requests_shared_tags, 0);

	timer_setup(&q->timeout, blk_rq_timed_out_timer, 0);
	INIT_WORK(&q->timeout_work, blk_timeout_work);
	INIT_LIST_HEAD(&q->icq_list);

	refcount_set(&q->refs, 1);
	mutex_init(&q->debugfs_mutex);
	mutex_init(&q->sysfs_lock);
	mutex_init(&q->sysfs_dir_lock);
	mutex_init(&q->limits_lock);
	mutex_init(&q->rq_qos_mutex);
	spin_lock_init(&q->queue_lock);

	init_waitqueue_head(&q->mq_freeze_wq);
	mutex_init(&q->mq_freeze_lock);

	blkg_init_queue(q);

	/*
	 * Init percpu_ref in atomic mode so that it's faster to shutdown.
	 * See blk_register_queue() for details.
	 */
	error = percpu_ref_init(&q->q_usage_counter,
				blk_queue_usage_counter_release,
				PERCPU_REF_INIT_ATOMIC, GFP_KERNEL);
	if (error)
		goto fail_stats;

	q->nr_requests = BLKDEV_DEFAULT_RQ;

	return q;

fail_stats:
	blk_free_queue_stats(q->stats);
fail_id:
	ida_free(&blk_queue_ida, q->id);
fail_q:
	kmem_cache_free(blk_requestq_cachep, q);
	return ERR_PTR(error);
}

/**
 * blk_get_queue - increment the request_queue refcount
 * @q: the request_queue structure to increment the refcount for
 *
 * Increment the refcount of the request_queue.
 *
 * Context: Any context.
 */
bool blk_get_queue(struct request_queue *q)
{
	if (unlikely(blk_queue_dying(q)))
		return false;
	refcount_inc(&q->refs);
	return true;
}
EXPORT_SYMBOL(blk_get_queue);
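
/*
 * Illustrative usage sketch (not part of the original file): a caller that
 * stashes a request_queue pointer takes a reference with blk_get_queue() and
 * drops it with blk_put_queue() when done.
 *
 *	if (!blk_get_queue(q))
 *		return -ENODEV;		// queue is already dying
 *	...				// q may be used safely here
 *	blk_put_queue(q);
 */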

#ifdef CONFIG_FAIL_MAKE_REQUEST

static DECLARE_FAULT_ATTR(fail_make_request);

static int __init setup_fail_make_request(char *str)
{
	return setup_fault_attr(&fail_make_request, str);
}
__setup("fail_make_request=", setup_fail_make_request);

bool should_fail_request(struct block_device *part, unsigned int bytes)
{
	return bdev_test_flag(part, BD_MAKE_IT_FAIL) &&
	       should_fail(&fail_make_request, bytes);
}

static int __init fail_make_request_debugfs(void)
{
	struct dentry *dir = fault_create_debugfs_attr("fail_make_request",
						NULL, &fail_make_request);

	return PTR_ERR_OR_ZERO(dir);
}

late_initcall(fail_make_request_debugfs);
#endif /* CONFIG_FAIL_MAKE_REQUEST */

static inline void bio_check_ro(struct bio *bio)
{
	if (op_is_write(bio_op(bio)) && bdev_read_only(bio->bi_bdev)) {
		if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
			return;

		if (bdev_test_flag(bio->bi_bdev, BD_RO_WARNED))
			return;

		bdev_set_flag(bio->bi_bdev, BD_RO_WARNED);

		/*
		 * Using an ioctl to set the underlying disk of a raid/dm
		 * device to read-only will trigger this.
		 */
		pr_warn("Trying to write to read-only block-device %pg\n",
			bio->bi_bdev);
	}
}

static noinline int should_fail_bio(struct bio *bio)
{
	if (should_fail_request(bdev_whole(bio->bi_bdev), bio->bi_iter.bi_size))
		return -EIO;
	return 0;
}
ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO);

/*
 * Check whether this bio extends beyond the end of the device or partition.
 * This may well happen - the kernel calls bread() without checking the size of
 * the device, e.g., when mounting a file system.
 */
static inline int bio_check_eod(struct bio *bio)
{
	sector_t maxsector = bdev_nr_sectors(bio->bi_bdev);
	unsigned int nr_sectors = bio_sectors(bio);

	if (nr_sectors &&
	    (nr_sectors > maxsector ||
	     bio->bi_iter.bi_sector > maxsector - nr_sectors)) {
		pr_info_ratelimited("%s: attempt to access beyond end of device\n"
				    "%pg: rw=%d, sector=%llu, nr_sectors = %u limit=%llu\n",
				    current->comm, bio->bi_bdev, bio->bi_opf,
				    bio->bi_iter.bi_sector, nr_sectors, maxsector);
		return -EIO;
	}
	return 0;
}

/*
 * Remap block n of partition p to block n+start(p) of the disk.
 */
static int blk_partition_remap(struct bio *bio)
{
	struct block_device *p = bio->bi_bdev;

	if (unlikely(should_fail_request(p, bio->bi_iter.bi_size)))
		return -EIO;
	if (bio_sectors(bio)) {
		bio->bi_iter.bi_sector += p->bd_start_sect;
		trace_block_bio_remap(bio, p->bd_dev,
				      bio->bi_iter.bi_sector -
				      p->bd_start_sect);
	}
	bio_set_flag(bio, BIO_REMAPPED);
	return 0;
}

/*
 * Check a zone append write to a zoned block device.
 */
static inline blk_status_t blk_check_zone_append(struct request_queue *q,
						 struct bio *bio)
{
	int nr_sectors = bio_sectors(bio);

	/* Only applicable to zoned block devices */
	if (!bdev_is_zoned(bio->bi_bdev))
		return BLK_STS_NOTSUPP;

	/* The bio sector must point to the start of a sequential zone */
	if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector))
		return BLK_STS_IOERR;

	/*
	 * Not allowed to cross zone boundaries. Otherwise, the BIO will be
	 * split and could result in non-contiguous sectors being written in
	 * different zones.
	 */
	if (nr_sectors > q->limits.chunk_sectors)
		return BLK_STS_IOERR;

	/* Make sure the BIO is small enough and will not get split */
	if (nr_sectors > queue_max_zone_append_sectors(q))
		return BLK_STS_IOERR;

	bio->bi_opf |= REQ_NOMERGE;

	return BLK_STS_OK;
}

static void __submit_bio(struct bio *bio)
{
	/* If no plug is in use yet, add one here to cache the nsecs time. */
	struct blk_plug plug;

	if (unlikely(!blk_crypto_bio_prep(&bio)))
		return;

	blk_start_plug(&plug);

	if (!bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO)) {
		blk_mq_submit_bio(bio);
	} else if (likely(bio_queue_enter(bio) == 0)) {
		struct gendisk *disk = bio->bi_bdev->bd_disk;

		disk->fops->submit_bio(bio);
		blk_queue_exit(disk->queue);
	}

	blk_finish_plug(&plug);
}

/*
 * The loop in this function may be a bit non-obvious, and so deserves some
 * explanation:
 *
 *  - Before entering the loop, bio->bi_next is NULL (as all callers ensure
 *    that), so we have a list with a single bio.
 *  - We pretend that we have just taken it off a longer list, so we assign
 *    bio_list to a pointer to the bio_list_on_stack, thus initialising the
 *    bio_list of new bios to be added.  ->submit_bio() may indeed add some more
 *    bios through a recursive call to submit_bio_noacct.  If it did, we find a
 *    non-NULL value in bio_list and re-enter the loop from the top.
 *  - In this case we really did just take the bio off the top of the list (no
 *    pretending) and so remove it from bio_list, and call into ->submit_bio()
 *    again.
 *
 * bio_list_on_stack[0] contains bios submitted by the current ->submit_bio.
 * bio_list_on_stack[1] contains bios that were submitted before the current
 *	->submit_bio, but that haven't been processed yet.
 */
static void __submit_bio_noacct(struct bio *bio)
{
	struct bio_list bio_list_on_stack[2];

	BUG_ON(bio->bi_next);

	bio_list_init(&bio_list_on_stack[0]);
	current->bio_list = bio_list_on_stack;

	do {
		struct request_queue *q = bdev_get_queue(bio->bi_bdev);
		struct bio_list lower, same;

		/*
		 * Create a fresh bio_list for all subordinate requests.
		 */
		bio_list_on_stack[1] = bio_list_on_stack[0];
		bio_list_init(&bio_list_on_stack[0]);

		__submit_bio(bio);

		/*
		 * Sort new bios into those for a lower level and those for the
		 * same level.
		 */
		bio_list_init(&lower);
		bio_list_init(&same);
		while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL)
			if (q == bdev_get_queue(bio->bi_bdev))
				bio_list_add(&same, bio);
			else
				bio_list_add(&lower, bio);

		/*
		 * Now assemble so we handle the lowest level first.
		 */
		bio_list_merge(&bio_list_on_stack[0], &lower);
		bio_list_merge(&bio_list_on_stack[0], &same);
		bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
	} while ((bio = bio_list_pop(&bio_list_on_stack[0])));

	current->bio_list = NULL;
}

static void __submit_bio_noacct_mq(struct bio *bio)
{
	struct bio_list bio_list[2] = { };

	current->bio_list = bio_list;

	do {
		__submit_bio(bio);
	} while ((bio = bio_list_pop(&bio_list[0])));

	current->bio_list = NULL;
}

void submit_bio_noacct_nocheck(struct bio *bio)
{
	blk_cgroup_bio_start(bio);
	blkcg_bio_issue_init(bio);

	if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) {
		trace_block_bio_queue(bio);
		/*
		 * Now that enqueuing has been traced, we need to trace
		 * completion as well.
		 */
		bio_set_flag(bio, BIO_TRACE_COMPLETION);
	}

	/*
	 * We only want one ->submit_bio to be active at a time, else stack
	 * usage with stacked devices could be a problem.  Use current->bio_list
	 * to collect a list of requests submitted by a ->submit_bio method while
	 * it is active, and then process them after it has returned.
	 */
	if (current->bio_list)
		bio_list_add(&current->bio_list[0], bio);
	else if (!bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO))
		__submit_bio_noacct_mq(bio);
	else
		__submit_bio_noacct(bio);
}

static blk_status_t blk_validate_atomic_write_op_size(struct request_queue *q,
						 struct bio *bio)
{
	if (bio->bi_iter.bi_size > queue_atomic_write_unit_max_bytes(q))
		return BLK_STS_INVAL;

	if (bio->bi_iter.bi_size % queue_atomic_write_unit_min_bytes(q))
		return BLK_STS_INVAL;

	return BLK_STS_OK;
}

/**
 * submit_bio_noacct - re-submit a bio to the block device layer for I/O
 * @bio:  The bio describing the location in memory and on the device.
 *
 * This is a version of submit_bio() that shall only be used for I/O that is
 * resubmitted to lower level drivers by stacking block drivers.  All file
 * systems and other upper level users of the block layer should use
 * submit_bio() instead.
 */
void submit_bio_noacct(struct bio *bio)
{
	struct block_device *bdev = bio->bi_bdev;
	struct request_queue *q = bdev_get_queue(bdev);
	blk_status_t status = BLK_STS_IOERR;

	might_sleep();

	/*
	 * For a REQ_NOWAIT based request, return -EOPNOTSUPP
	 * if the queue does not support NOWAIT.
	 */
	if ((bio->bi_opf & REQ_NOWAIT) && !bdev_nowait(bdev))
		goto not_supported;

	if (should_fail_bio(bio))
		goto end_io;
	bio_check_ro(bio);
	if (!bio_flagged(bio, BIO_REMAPPED)) {
		if (unlikely(bio_check_eod(bio)))
			goto end_io;
		if (bdev_is_partition(bdev) &&
		    unlikely(blk_partition_remap(bio)))
			goto end_io;
	}

	/*
	 * Filter flush bio's early so that bio based drivers without flush
	 * support don't have to worry about them.
	 */
	if (op_is_flush(bio->bi_opf)) {
		if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE &&
				 bio_op(bio) != REQ_OP_ZONE_APPEND))
			goto end_io;
		if (!bdev_write_cache(bdev)) {
			bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA);
			if (!bio_sectors(bio)) {
				status = BLK_STS_OK;
				goto end_io;
			}
		}
	}

	if (!(q->limits.features & BLK_FEAT_POLL) &&
			(bio->bi_opf & REQ_POLLED)) {
		bio_clear_polled(bio);
		goto not_supported;
	}

	switch (bio_op(bio)) {
	case REQ_OP_READ:
		break;
	case REQ_OP_WRITE:
		if (bio->bi_opf & REQ_ATOMIC) {
			status = blk_validate_atomic_write_op_size(q, bio);
			if (status != BLK_STS_OK)
				goto end_io;
		}
		break;
	case REQ_OP_FLUSH:
		/*
		 * REQ_OP_FLUSH can't be submitted through bios, it is only
		 * synthesized in struct request by the flush state machine.
		 */
		goto not_supported;
	case REQ_OP_DISCARD:
		if (!bdev_max_discard_sectors(bdev))
			goto not_supported;
		break;
	case REQ_OP_SECURE_ERASE:
		if (!bdev_max_secure_erase_sectors(bdev))
			goto not_supported;
		break;
	case REQ_OP_ZONE_APPEND:
		status = blk_check_zone_append(q, bio);
		if (status != BLK_STS_OK)
			goto end_io;
		break;
	case REQ_OP_WRITE_ZEROES:
		if (!q->limits.max_write_zeroes_sectors)
			goto not_supported;
		break;
	case REQ_OP_ZONE_RESET:
	case REQ_OP_ZONE_OPEN:
	case REQ_OP_ZONE_CLOSE:
	case REQ_OP_ZONE_FINISH:
	case REQ_OP_ZONE_RESET_ALL:
		if (!bdev_is_zoned(bio->bi_bdev))
			goto not_supported;
		break;
	case REQ_OP_DRV_IN:
	case REQ_OP_DRV_OUT:
		/*
		 * Driver private operations are only used with passthrough
		 * requests.
		 */
		fallthrough;
	default:
		goto not_supported;
	}

	if (blk_throtl_bio(bio))
		return;
	submit_bio_noacct_nocheck(bio);
	return;

not_supported:
	status = BLK_STS_NOTSUPP;
end_io:
	bio->bi_status = status;
	bio_endio(bio);
}
EXPORT_SYMBOL(submit_bio_noacct);
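
/*
 * Illustrative usage sketch (not part of the original file): a stacking
 * driver (dm/md style) that has remapped a cloned bio onto a lower device
 * resubmits it with submit_bio_noacct() rather than submit_bio(), so the I/O
 * is not accounted a second time.  "lower_bdev" and "remapped_sector" are
 * hypothetical.
 *
 *	bio_set_dev(clone, lower_bdev);
 *	clone->bi_iter.bi_sector = remapped_sector;
 *	submit_bio_noacct(clone);
 */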

static void bio_set_ioprio(struct bio *bio)
{
	/* Nobody set ioprio so far? Initialize it based on task's nice value */
	if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) == IOPRIO_CLASS_NONE)
		bio->bi_ioprio = get_current_ioprio();
	blkcg_set_ioprio(bio);
}

/**
 * submit_bio - submit a bio to the block device layer for I/O
 * @bio: The &struct bio which describes the I/O
 *
 * submit_bio() is used to submit I/O requests to block devices.  It is passed a
 * fully set up &struct bio that describes the I/O that needs to be done.  The
 * bio will be sent to the device described by the bi_bdev field.
 *
 * The success/failure status of the request, along with notification of
 * completion, is delivered asynchronously through the ->bi_end_io() callback
 * in @bio.  The bio must NOT be touched by the caller until ->bi_end_io() has
 * been called.
 */
void submit_bio(struct bio *bio)
{
	if (bio_op(bio) == REQ_OP_READ) {
		task_io_account_read(bio->bi_iter.bi_size);
		count_vm_events(PGPGIN, bio_sectors(bio));
	} else if (bio_op(bio) == REQ_OP_WRITE) {
		count_vm_events(PGPGOUT, bio_sectors(bio));
	}

	bio_set_ioprio(bio);
	submit_bio_noacct(bio);
}
EXPORT_SYMBOL(submit_bio);
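
/*
 * Illustrative usage sketch (not part of the original file): a caller builds
 * a fully set up bio and hands it off; completion arrives asynchronously via
 * ->bi_end_io.  my_end_io() and the bdev/sector/page variables are
 * hypothetical.
 *
 *	struct bio *bio = bio_alloc(bdev, 1, REQ_OP_READ, GFP_KERNEL);
 *
 *	bio->bi_iter.bi_sector = sector;
 *	bio->bi_end_io = my_end_io;
 *	__bio_add_page(bio, page, PAGE_SIZE, 0);
 *	submit_bio(bio);	// do not touch bio until my_end_io() has run
 */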

/**
 * bio_poll - poll for BIO completions
 * @bio: bio to poll for
 * @iob: batches of IO
 * @flags: BLK_POLL_* flags that control the behavior
 *
 * Poll for completions on the queue associated with the bio. Returns number of
 * completed entries found.
 *
 * Note: the caller must either be the context that submitted @bio, or
 * be in an RCU critical section to prevent freeing of @bio.
 */
int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags)
{
	blk_qc_t cookie = READ_ONCE(bio->bi_cookie);
	struct block_device *bdev;
	struct request_queue *q;
	int ret = 0;

	bdev = READ_ONCE(bio->bi_bdev);
	if (!bdev)
		return 0;

	q = bdev_get_queue(bdev);
	if (cookie == BLK_QC_T_NONE || !(q->limits.features & BLK_FEAT_POLL))
		return 0;

	blk_flush_plug(current->plug, false);

	/*
	 * We need to be able to enter a frozen queue, similar to how
	 * timeouts also need to do that. If that is blocked, then we can
	 * have pending IO when a queue freeze is started, and then the
	 * wait for the freeze to finish will wait for polled requests to
	 * time out as the poller is prevented from entering the queue and
	 * completing them. As long as we prevent new IO from being queued,
	 * that should be all that matters.
	 */
	if (!percpu_ref_tryget(&q->q_usage_counter))
		return 0;
	if (queue_is_mq(q)) {
		ret = blk_mq_poll(q, cookie, iob, flags);
	} else {
		struct gendisk *disk = q->disk;

		if (disk && disk->fops->poll_bio)
			ret = disk->fops->poll_bio(bio, iob, flags);
	}
	blk_queue_exit(q);
	return ret;
}
EXPORT_SYMBOL_GPL(bio_poll);
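
/*
 * Illustrative usage sketch (not part of the original file): a synchronous
 * caller that submitted a REQ_POLLED bio spins on bio_poll() until its own
 * completion handler signals that the bio finished.  The "done" flag is
 * hypothetical and would be set from ->bi_end_io.
 *
 *	while (!READ_ONCE(done)) {
 *		if (!bio_poll(bio, NULL, 0))
 *			blk_io_schedule();
 *	}
 */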

/*
 * Helper to implement file_operations.iopoll.  Requires the bio to be stored
 * in iocb->private, and cleared before freeing the bio.
 */
int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob,
		    unsigned int flags)
{
	struct bio *bio;
	int ret = 0;

	/*
	 * Note: the bio cache only uses SLAB_TYPESAFE_BY_RCU, so bio can
	 * point to a freshly allocated bio at this point.  If that happens
	 * we have a few cases to consider:
	 *
	 *  1) the bio is being initialized and bi_bdev is NULL.  We can
	 *     simply do nothing in this case
	 *  2) the bio points to a not poll enabled device.  bio_poll will catch
	 *     this and return 0
	 *  3) the bio points to a poll capable device, including but not
	 *     limited to the one that the original bio pointed to.  In this
	 *     case we will call into the actual poll method and poll for I/O,
	 *     even if we don't need to, but it won't cause harm either.
	 *
	 * For cases 2) and 3) above the RCU grace period ensures that bi_bdev
	 * is still allocated. Because partitions hold a reference to the whole
	 * device bdev and thus disk, the disk is also still valid.  Grabbing
	 * a reference to the queue in bio_poll() ensures the hctxs and requests
	 * are still valid as well.
	 */
	rcu_read_lock();
	bio = READ_ONCE(kiocb->private);
	if (bio)
		ret = bio_poll(bio, iob, flags);
	rcu_read_unlock();

	return ret;
}
EXPORT_SYMBOL_GPL(iocb_bio_iopoll);

void update_io_ticks(struct block_device *part, unsigned long now, bool end)
{
	unsigned long stamp;
again:
	stamp = READ_ONCE(part->bd_stamp);
	if (unlikely(time_after(now, stamp)) &&
	    likely(try_cmpxchg(&part->bd_stamp, &stamp, now)) &&
	    (end || part_in_flight(part)))
		__part_stat_add(part, io_ticks, now - stamp);

	if (bdev_is_partition(part)) {
		part = bdev_whole(part);
		goto again;
	}
}

unsigned long bdev_start_io_acct(struct block_device *bdev, enum req_op op,
				 unsigned long start_time)
{
	part_stat_lock();
	update_io_ticks(bdev, start_time, false);
	part_stat_local_inc(bdev, in_flight[op_is_write(op)]);
	part_stat_unlock();

	return start_time;
}
EXPORT_SYMBOL(bdev_start_io_acct);

/**
 * bio_start_io_acct - start I/O accounting for bio based drivers
 * @bio:	bio to start account for
 *
 * Returns the start time that should be passed back to bio_end_io_acct().
 */
unsigned long bio_start_io_acct(struct bio *bio)
{
	return bdev_start_io_acct(bio->bi_bdev, bio_op(bio), jiffies);
}
EXPORT_SYMBOL_GPL(bio_start_io_acct);
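
/*
 * Illustrative usage sketch (not part of the original file): a bio based
 * driver accounts an I/O by recording the start time before processing the
 * bio and reporting it back when the bio completes (bio_end_io_acct() is the
 * inline wrapper around bdev_end_io_acct() from blkdev.h).
 *
 *	unsigned long start = bio_start_io_acct(bio);
 *
 *	...				// process the bio
 *	bio_end_io_acct(bio, start);	// typically from the completion path
 */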

void bdev_end_io_acct(struct block_device *bdev, enum req_op op,
		      unsigned int sectors, unsigned long start_time)
{
	const int sgrp = op_stat_group(op);
	unsigned long now = READ_ONCE(jiffies);
	unsigned long duration = now - start_time;

	part_stat_lock();
	update_io_ticks(bdev, now, true);
	part_stat_inc(bdev, ios[sgrp]);
	part_stat_add(bdev, sectors[sgrp], sectors);
	part_stat_add(bdev, nsecs[sgrp], jiffies_to_nsecs(duration));
	part_stat_local_dec(bdev, in_flight[op_is_write(op)]);
	part_stat_unlock();
}
EXPORT_SYMBOL(bdev_end_io_acct);

void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time,
			      struct block_device *orig_bdev)
{
	bdev_end_io_acct(orig_bdev, bio_op(bio), bio_sectors(bio), start_time);
}
EXPORT_SYMBOL_GPL(bio_end_io_acct_remapped);

/**
 * blk_lld_busy - Check if underlying low-level drivers of a device are busy
 * @q : the queue of the device being checked
 *
 * Description:
 *    Check if underlying low-level drivers of a device are busy.
 *    If the drivers want to export their busy state, they must set their own
 *    exporting function using blk_queue_lld_busy() first.
 *
 *    Basically, this function is used only by request stacking drivers
 *    to stop dispatching requests to underlying devices when underlying
 *    devices are busy.  This behavior helps more I/O merging on the queue
 *    of the request stacking driver and prevents I/O throughput regression
 *    on burst I/O load.
 *
 * Return:
 *    0 - Not busy (The request stacking driver should dispatch request)
 *    1 - Busy (The request stacking driver should stop dispatching request)
 */
int blk_lld_busy(struct request_queue *q)
{
	if (queue_is_mq(q) && q->mq_ops->busy)
		return q->mq_ops->busy(q);

	return 0;
}
EXPORT_SYMBOL_GPL(blk_lld_busy);

int kblockd_schedule_work(struct work_struct *work)
{
	return queue_work(kblockd_workqueue, work);
}
EXPORT_SYMBOL(kblockd_schedule_work);

int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork,
				unsigned long delay)
{
	return mod_delayed_work_on(cpu, kblockd_workqueue, dwork, delay);
}
EXPORT_SYMBOL(kblockd_mod_delayed_work_on);

void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios)
{
	struct task_struct *tsk = current;

	/*
	 * If this is a nested plug, don't actually assign it.
	 */
	if (tsk->plug)
		return;

	plug->cur_ktime = 0;
	plug->mq_list = NULL;
	plug->cached_rq = NULL;
	plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT);
	plug->rq_count = 0;
	plug->multiple_queues = false;
	plug->has_elevator = false;
	INIT_LIST_HEAD(&plug->cb_list);

	/*
	 * Store ordering should not be needed here, since a potential
	 * preempt will imply a full memory barrier
	 */
	tsk->plug = plug;
}

/**
 * blk_start_plug - initialize blk_plug and track it inside the task_struct
 * @plug:	The &struct blk_plug that needs to be initialized
 *
 * Description:
 *   blk_start_plug() indicates to the block layer an intent by the caller
 *   to submit multiple I/O requests in a batch.  The block layer may use
 *   this hint to defer submitting I/Os from the caller until blk_finish_plug()
 *   is called.  However, the block layer may choose to submit requests
 *   before a call to blk_finish_plug() if the number of queued I/Os
 *   exceeds %BLK_MAX_REQUEST_COUNT, or if the size of the I/O is larger than
 *   %BLK_PLUG_FLUSH_SIZE.  The queued I/Os may also be submitted early if
 *   the task schedules (see below).
 *
 *   Tracking blk_plug inside the task_struct will help with auto-flushing the
 *   pending I/O should the task end up blocking between blk_start_plug() and
 *   blk_finish_plug(). This is important from a performance perspective, but
 *   also ensures that we don't deadlock. For instance, if the task is blocking
 *   for a memory allocation, memory reclaim could end up wanting to free a
 *   page belonging to that request that is currently residing in our private
 *   plug. By flushing the pending I/O when the process goes to sleep, we avoid
 *   this kind of deadlock.
 */
void blk_start_plug(struct blk_plug *plug)
{
	blk_start_plug_nr_ios(plug, 1);
}
EXPORT_SYMBOL(blk_start_plug);

static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
{
	LIST_HEAD(callbacks);

	while (!list_empty(&plug->cb_list)) {
		list_splice_init(&plug->cb_list, &callbacks);

		while (!list_empty(&callbacks)) {
			struct blk_plug_cb *cb = list_first_entry(&callbacks,
							  struct blk_plug_cb,
							  list);
			list_del(&cb->list);
			cb->callback(cb, from_schedule);
		}
	}
}

struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data,
				      int size)
{
	struct blk_plug *plug = current->plug;
	struct blk_plug_cb *cb;

	if (!plug)
		return NULL;

	list_for_each_entry(cb, &plug->cb_list, list)
		if (cb->callback == unplug && cb->data == data)
			return cb;

	/* Not currently on the callback list */
	BUG_ON(size < sizeof(*cb));
	cb = kzalloc(size, GFP_ATOMIC);
	if (cb) {
		cb->data = data;
		cb->callback = unplug;
		list_add(&cb->list, &plug->cb_list);
	}
	return cb;
}
EXPORT_SYMBOL(blk_check_plugged);
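
/*
 * Illustrative usage sketch (not part of the original file): a stacking
 * driver registers a per-plug callback that is invoked when the plug is
 * flushed, the way md/raid batches work.  my_unplug(), my_issue_now() and
 * struct my_plug_cb are hypothetical; the callback recovers the containing
 * structure with container_of().
 *
 *	struct my_plug_cb {
 *		struct blk_plug_cb cb;
 *		struct my_dev *dev;
 *	};
 *
 *	struct blk_plug_cb *cb = blk_check_plugged(my_unplug, dev,
 *						   sizeof(struct my_plug_cb));
 *	if (!cb)
 *		my_issue_now(dev);	// no plug active, issue immediately
 */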

void __blk_flush_plug(struct blk_plug *plug, bool from_schedule)
{
	if (!list_empty(&plug->cb_list))
		flush_plug_callbacks(plug, from_schedule);
	blk_mq_flush_plug_list(plug, from_schedule);
	/*
	 * Unconditionally flush out cached requests, even if the unplug
	 * event came from schedule. Since we hold references to the
	 * queue for cached requests, we don't want a blocked task holding
	 * up a queue freeze/quiesce event.
	 */
	if (unlikely(!rq_list_empty(plug->cached_rq)))
		blk_mq_free_plug_rqs(plug);

	plug->cur_ktime = 0;
	current->flags &= ~PF_BLOCK_TS;
}

/**
 * blk_finish_plug - mark the end of a batch of submitted I/O
 * @plug:	The &struct blk_plug passed to blk_start_plug()
 *
 * Description:
 * Indicate that a batch of I/O submissions is complete.  This function
 * must be paired with an initial call to blk_start_plug().  The intent
 * is to allow the block layer to optimize I/O submission.  See the
 * documentation for blk_start_plug() for more information.
 */
void blk_finish_plug(struct blk_plug *plug)
{
	if (plug == current->plug) {
		__blk_flush_plug(plug, false);
		current->plug = NULL;
	}
}
EXPORT_SYMBOL(blk_finish_plug);
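
/*
 * Illustrative usage sketch (not part of the original file): a submitter
 * batching several bios under one on-stack plug so the block layer can merge
 * and dispatch them together.  my_submit_bios() is hypothetical and would
 * call submit_bio() repeatedly.
 *
 *	struct blk_plug plug;
 *
 *	blk_start_plug(&plug);
 *	my_submit_bios(dev);		// submissions are held in the plug
 *	blk_finish_plug(&plug);		// flush everything that was queued
 */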

void blk_io_schedule(void)
{
	/* Prevent hang_check timer from firing at us during very long I/O */
	unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2;

	if (timeout)
		io_schedule_timeout(timeout);
	else
		io_schedule();
}
EXPORT_SYMBOL_GPL(blk_io_schedule);

int __init blk_dev_init(void)
{
	BUILD_BUG_ON((__force u32)REQ_OP_LAST >= (1 << REQ_OP_BITS));
	BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
			sizeof_field(struct request, cmd_flags));
	BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
			sizeof_field(struct bio, bi_opf));

	/* used for unplugging and affects IO latency/throughput - HIGHPRI */
	kblockd_workqueue = alloc_workqueue("kblockd",
					    WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
	if (!kblockd_workqueue)
		panic("Failed to create kblockd\n");

	blk_requestq_cachep = KMEM_CACHE(request_queue, SLAB_PANIC);

	blk_debugfs_root = debugfs_create_dir("block", NULL);

	return 0;
}