1  // SPDX-License-Identifier: GPL-2.0-only
2  /*
3   * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
4   * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
5   *
6   * This file is released under the GPL.
7   */
8  
9  #include "dm-core.h"
10  #include "dm-rq.h"
11  #include "dm-uevent.h"
12  #include "dm-ima.h"
13  
14  #include <linux/bio-integrity.h>
15  #include <linux/init.h>
16  #include <linux/module.h>
17  #include <linux/mutex.h>
18  #include <linux/sched/mm.h>
19  #include <linux/sched/signal.h>
20  #include <linux/blkpg.h>
21  #include <linux/bio.h>
22  #include <linux/mempool.h>
23  #include <linux/dax.h>
24  #include <linux/slab.h>
25  #include <linux/idr.h>
26  #include <linux/uio.h>
27  #include <linux/hdreg.h>
28  #include <linux/delay.h>
29  #include <linux/wait.h>
30  #include <linux/pr.h>
31  #include <linux/refcount.h>
32  #include <linux/part_stat.h>
33  #include <linux/blk-crypto.h>
34  #include <linux/blk-crypto-profile.h>
35  
36  #define DM_MSG_PREFIX "core"
37  
38  /*
39   * Cookies are numeric values sent with CHANGE and REMOVE
40   * uevents while resuming, removing or renaming the device.
41   */
42  #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
43  #define DM_COOKIE_LENGTH 24
44  
45  /*
46   * For a REQ_POLLED fs bio, this flag is set when the mapped underlying
47   * dm_io structures are linked into one list, reusing bio->bi_private as the
48   * list head. The original ->bi_private is restored before this fs bio is ended.
49   */
50  #define REQ_DM_POLL_LIST	REQ_DRV
51  
52  static const char *_name = DM_NAME;
53  
54  static unsigned int major;
55  static unsigned int _major;
56  
57  static DEFINE_IDR(_minor_idr);
58  
59  static DEFINE_SPINLOCK(_minor_lock);
60  
61  static void do_deferred_remove(struct work_struct *w);
62  
63  static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
64  
65  static struct workqueue_struct *deferred_remove_workqueue;
66  
67  atomic_t dm_global_event_nr = ATOMIC_INIT(0);
68  DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq);
69  
70  void dm_issue_global_event(void)
71  {
72  	atomic_inc(&dm_global_event_nr);
73  	wake_up(&dm_global_eventq);
74  }
75  
76  DEFINE_STATIC_KEY_FALSE(stats_enabled);
77  DEFINE_STATIC_KEY_FALSE(swap_bios_enabled);
78  DEFINE_STATIC_KEY_FALSE(zoned_enabled);
79  
80  /*
81   * One of these is allocated (on-stack) per original bio.
82   */
83  struct clone_info {
84  	struct dm_table *map;
85  	struct bio *bio;
86  	struct dm_io *io;
87  	sector_t sector;
88  	unsigned int sector_count;
89  	bool is_abnormal_io:1;
90  	bool submit_as_polled:1;
91  };
92  
93  static inline struct dm_target_io *clone_to_tio(struct bio *clone)
94  {
95  	return container_of(clone, struct dm_target_io, clone);
96  }
97  
98  void *dm_per_bio_data(struct bio *bio, size_t data_size)
99  {
100  	if (!dm_tio_flagged(clone_to_tio(bio), DM_TIO_INSIDE_DM_IO))
101  		return (char *)bio - DM_TARGET_IO_BIO_OFFSET - data_size;
102  	return (char *)bio - DM_IO_BIO_OFFSET - data_size;
103  }
104  EXPORT_SYMBOL_GPL(dm_per_bio_data);
105  
106  struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size)
107  {
108  	struct dm_io *io = (struct dm_io *)((char *)data + data_size);
109  
110  	if (io->magic == DM_IO_MAGIC)
111  		return (struct bio *)((char *)io + DM_IO_BIO_OFFSET);
112  	BUG_ON(io->magic != DM_TIO_MAGIC);
113  	return (struct bio *)((char *)io + DM_TARGET_IO_BIO_OFFSET);
114  }
115  EXPORT_SYMBOL_GPL(dm_bio_from_per_bio_data);
116  
117  unsigned int dm_bio_get_target_bio_nr(const struct bio *bio)
118  {
119  	return container_of(bio, struct dm_target_io, clone)->target_bio_nr;
120  }
121  EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr);
122  
123  #define MINOR_ALLOCED ((void *)-1)
124  
125  #define DM_NUMA_NODE NUMA_NO_NODE
126  static int dm_numa_node = DM_NUMA_NODE;
127  
128  #define DEFAULT_SWAP_BIOS	(8 * 1048576 / PAGE_SIZE)
129  static int swap_bios = DEFAULT_SWAP_BIOS;
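/*
 * Read the swap_bios module parameter, falling back to the default when it
 * has been set to a nonsensical (<= 0) value.
 */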
130  static int get_swap_bios(void)
131  {
132  	int latch = READ_ONCE(swap_bios);
133  
134  	if (unlikely(latch <= 0))
135  		latch = DEFAULT_SWAP_BIOS;
136  	return latch;
137  }
138  
139  struct table_device {
140  	struct list_head list;
141  	refcount_t count;
142  	struct dm_dev dm_dev;
143  };
144  
145  /*
146   * Bio-based DM's mempools' reserved IOs set by the user.
147   */
148  #define RESERVED_BIO_BASED_IOS		16
149  static unsigned int reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
150  
151  static int __dm_get_module_param_int(int *module_param, int min, int max)
152  {
153  	int param = READ_ONCE(*module_param);
154  	int modified_param = 0;
155  	bool modified = true;
156  
157  	if (param < min)
158  		modified_param = min;
159  	else if (param > max)
160  		modified_param = max;
161  	else
162  		modified = false;
163  
164  	if (modified) {
165  		(void)cmpxchg(module_param, param, modified_param);
166  		param = modified_param;
167  	}
168  
169  	return param;
170  }
171  
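/*
 * Unsigned variant of the above: substitute @def when the parameter is zero,
 * cap it at @max, and write any correction back with cmpxchg().
 */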
172  unsigned int __dm_get_module_param(unsigned int *module_param, unsigned int def, unsigned int max)
173  {
174  	unsigned int param = READ_ONCE(*module_param);
175  	unsigned int modified_param = 0;
176  
177  	if (!param)
178  		modified_param = def;
179  	else if (param > max)
180  		modified_param = max;
181  
182  	if (modified_param) {
183  		(void)cmpxchg(module_param, param, modified_param);
184  		param = modified_param;
185  	}
186  
187  	return param;
188  }
189  
190  unsigned int dm_get_reserved_bio_based_ios(void)
191  {
192  	return __dm_get_module_param(&reserved_bio_based_ios,
193  				     RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS);
194  }
195  EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
196  
197  static unsigned int dm_get_numa_node(void)
198  {
199  	return __dm_get_module_param_int(&dm_numa_node,
200  					 DM_NUMA_NODE, num_online_nodes() - 1);
201  }
202  
203  static int __init local_init(void)
204  {
205  	int r;
206  
207  	r = dm_uevent_init();
208  	if (r)
209  		return r;
210  
211  	deferred_remove_workqueue = alloc_ordered_workqueue("kdmremove", 0);
212  	if (!deferred_remove_workqueue) {
213  		r = -ENOMEM;
214  		goto out_uevent_exit;
215  	}
216  
217  	_major = major;
218  	r = register_blkdev(_major, _name);
219  	if (r < 0)
220  		goto out_free_workqueue;
221  
222  	if (!_major)
223  		_major = r;
224  
225  	return 0;
226  
227  out_free_workqueue:
228  	destroy_workqueue(deferred_remove_workqueue);
229  out_uevent_exit:
230  	dm_uevent_exit();
231  
232  	return r;
233  }
234  
235  static void local_exit(void)
236  {
237  	destroy_workqueue(deferred_remove_workqueue);
238  
239  	unregister_blkdev(_major, _name);
240  	dm_uevent_exit();
241  
242  	_major = 0;
243  
244  	DMINFO("cleaned up");
245  }
246  
247  static int (*_inits[])(void) __initdata = {
248  	local_init,
249  	dm_target_init,
250  	dm_linear_init,
251  	dm_stripe_init,
252  	dm_io_init,
253  	dm_kcopyd_init,
254  	dm_interface_init,
255  	dm_statistics_init,
256  };
257  
258  static void (*_exits[])(void) = {
259  	local_exit,
260  	dm_target_exit,
261  	dm_linear_exit,
262  	dm_stripe_exit,
263  	dm_io_exit,
264  	dm_kcopyd_exit,
265  	dm_interface_exit,
266  	dm_statistics_exit,
267  };
268  
269  static int __init dm_init(void)
270  {
271  	const int count = ARRAY_SIZE(_inits);
272  	int r, i;
273  
274  #if (IS_ENABLED(CONFIG_IMA) && !IS_ENABLED(CONFIG_IMA_DISABLE_HTABLE))
275  	DMWARN("CONFIG_IMA_DISABLE_HTABLE is disabled."
276  	       " Duplicate IMA measurements will not be recorded in the IMA log.");
277  #endif
278  
279  	for (i = 0; i < count; i++) {
280  		r = _inits[i]();
281  		if (r)
282  			goto bad;
283  	}
284  
285  	return 0;
286  bad:
287  	while (i--)
288  		_exits[i]();
289  
290  	return r;
291  }
292  
293  static void __exit dm_exit(void)
294  {
295  	int i = ARRAY_SIZE(_exits);
296  
297  	while (i--)
298  		_exits[i]();
299  
300  	/*
301  	 * Should be empty by this point.
302  	 */
303  	idr_destroy(&_minor_idr);
304  }
305  
306  /*
307   * Block device functions
308   */
309  int dm_deleting_md(struct mapped_device *md)
310  {
311  	return test_bit(DMF_DELETING, &md->flags);
312  }
313  
314  static int dm_blk_open(struct gendisk *disk, blk_mode_t mode)
315  {
316  	struct mapped_device *md;
317  
318  	spin_lock(&_minor_lock);
319  
320  	md = disk->private_data;
321  	if (!md)
322  		goto out;
323  
324  	if (test_bit(DMF_FREEING, &md->flags) ||
325  	    dm_deleting_md(md)) {
326  		md = NULL;
327  		goto out;
328  	}
329  
330  	dm_get(md);
331  	atomic_inc(&md->open_count);
332  out:
333  	spin_unlock(&_minor_lock);
334  
335  	return md ? 0 : -ENXIO;
336  }
337  
338  static void dm_blk_close(struct gendisk *disk)
339  {
340  	struct mapped_device *md;
341  
342  	spin_lock(&_minor_lock);
343  
344  	md = disk->private_data;
345  	if (WARN_ON(!md))
346  		goto out;
347  
348  	if (atomic_dec_and_test(&md->open_count) &&
349  	    (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
350  		queue_work(deferred_remove_workqueue, &deferred_remove_work);
351  
352  	dm_put(md);
353  out:
354  	spin_unlock(&_minor_lock);
355  }
356  
357  int dm_open_count(struct mapped_device *md)
358  {
359  	return atomic_read(&md->open_count);
360  }
361  
362  /*
363   * Guarantees nothing is using the device before it's deleted.
364   */
365  int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
366  {
367  	int r = 0;
368  
369  	spin_lock(&_minor_lock);
370  
371  	if (dm_open_count(md)) {
372  		r = -EBUSY;
373  		if (mark_deferred)
374  			set_bit(DMF_DEFERRED_REMOVE, &md->flags);
375  	} else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
376  		r = -EEXIST;
377  	else
378  		set_bit(DMF_DELETING, &md->flags);
379  
380  	spin_unlock(&_minor_lock);
381  
382  	return r;
383  }
384  
385  int dm_cancel_deferred_remove(struct mapped_device *md)
386  {
387  	int r = 0;
388  
389  	spin_lock(&_minor_lock);
390  
391  	if (test_bit(DMF_DELETING, &md->flags))
392  		r = -EBUSY;
393  	else
394  		clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
395  
396  	spin_unlock(&_minor_lock);
397  
398  	return r;
399  }
400  
401  static void do_deferred_remove(struct work_struct *w)
402  {
403  	dm_deferred_remove();
404  }
405  
406  static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
407  {
408  	struct mapped_device *md = bdev->bd_disk->private_data;
409  
410  	return dm_get_geometry(md, geo);
411  }
412  
413  static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx,
414  			    struct block_device **bdev)
415  {
416  	struct dm_target *ti;
417  	struct dm_table *map;
418  	int r;
419  
420  retry:
421  	r = -ENOTTY;
422  	map = dm_get_live_table(md, srcu_idx);
423  	if (!map || !dm_table_get_size(map))
424  		return r;
425  
426  	/* We only support devices that have a single target */
427  	if (map->num_targets != 1)
428  		return r;
429  
430  	ti = dm_table_get_target(map, 0);
431  	if (!ti->type->prepare_ioctl)
432  		return r;
433  
434  	if (dm_suspended_md(md))
435  		return -EAGAIN;
436  
437  	r = ti->type->prepare_ioctl(ti, bdev);
438  	if (r == -ENOTCONN && !fatal_signal_pending(current)) {
439  		dm_put_live_table(md, *srcu_idx);
440  		fsleep(10000);
441  		goto retry;
442  	}
443  
444  	return r;
445  }
446  
447  static void dm_unprepare_ioctl(struct mapped_device *md, int srcu_idx)
448  {
449  	dm_put_live_table(md, srcu_idx);
450  }
451  
452  static int dm_blk_ioctl(struct block_device *bdev, blk_mode_t mode,
453  			unsigned int cmd, unsigned long arg)
454  {
455  	struct mapped_device *md = bdev->bd_disk->private_data;
456  	int r, srcu_idx;
457  
458  	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
459  	if (r < 0)
460  		goto out;
461  
462  	if (r > 0) {
463  		/*
464  		 * Target determined this ioctl is being issued against a
465  		 * subset of the parent bdev; require extra privileges.
466  		 */
467  		if (!capable(CAP_SYS_RAWIO)) {
468  			DMDEBUG_LIMIT(
469  	"%s: sending ioctl %x to DM device without required privilege.",
470  				current->comm, cmd);
471  			r = -ENOIOCTLCMD;
472  			goto out;
473  		}
474  	}
475  
476  	if (!bdev->bd_disk->fops->ioctl)
477  		r = -ENOTTY;
478  	else
479  		r = bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg);
480  out:
481  	dm_unprepare_ioctl(md, srcu_idx);
482  	return r;
483  }
484  
485  u64 dm_start_time_ns_from_clone(struct bio *bio)
486  {
487  	return jiffies_to_nsecs(clone_to_tio(bio)->io->start_time);
488  }
489  EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone);
490  
491  static inline bool bio_is_flush_with_data(struct bio *bio)
492  {
493  	return ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size);
494  }
495  
496  static inline unsigned int dm_io_sectors(struct dm_io *io, struct bio *bio)
497  {
498  	/*
499  	 * If REQ_PREFLUSH set, don't account payload, it will be
500  	 * submitted (and accounted) after this flush completes.
501  	 */
502  	if (bio_is_flush_with_data(bio))
503  		return 0;
504  	if (unlikely(dm_io_flagged(io, DM_IO_WAS_SPLIT)))
505  		return io->sectors;
506  	return bio_sectors(bio);
507  }
508  
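/*
 * Start (end == false) or finish (end == true) block-layer and dm-stats
 * accounting for this dm_io's original bio.
 */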
509  static void dm_io_acct(struct dm_io *io, bool end)
510  {
511  	struct bio *bio = io->orig_bio;
512  
513  	if (dm_io_flagged(io, DM_IO_BLK_STAT)) {
514  		if (!end)
515  			bdev_start_io_acct(bio->bi_bdev, bio_op(bio),
516  					   io->start_time);
517  		else
518  			bdev_end_io_acct(bio->bi_bdev, bio_op(bio),
519  					 dm_io_sectors(io, bio),
520  					 io->start_time);
521  	}
522  
523  	if (static_branch_unlikely(&stats_enabled) &&
524  	    unlikely(dm_stats_used(&io->md->stats))) {
525  		sector_t sector;
526  
527  		if (unlikely(dm_io_flagged(io, DM_IO_WAS_SPLIT)))
528  			sector = bio_end_sector(bio) - io->sector_offset;
529  		else
530  			sector = bio->bi_iter.bi_sector;
531  
532  		dm_stats_account_io(&io->md->stats, bio_data_dir(bio),
533  				    sector, dm_io_sectors(io, bio),
534  				    end, io->start_time, &io->stats_aux);
535  	}
536  }
537  
538  static void __dm_start_io_acct(struct dm_io *io)
539  {
540  	dm_io_acct(io, false);
541  }
542  
543  static void dm_start_io_acct(struct dm_io *io, struct bio *clone)
544  {
545  	/*
546  	 * Ensure IO accounting is only ever started once.
547  	 */
548  	if (dm_io_flagged(io, DM_IO_ACCOUNTED))
549  		return;
550  
551  	/* Expect no possibility for race unless DM_TIO_IS_DUPLICATE_BIO. */
552  	if (!clone || likely(dm_tio_is_normal(clone_to_tio(clone)))) {
553  		dm_io_set_flag(io, DM_IO_ACCOUNTED);
554  	} else {
555  		unsigned long flags;
556  		/* Can afford locking given DM_TIO_IS_DUPLICATE_BIO */
557  		spin_lock_irqsave(&io->lock, flags);
558  		if (dm_io_flagged(io, DM_IO_ACCOUNTED)) {
559  			spin_unlock_irqrestore(&io->lock, flags);
560  			return;
561  		}
562  		dm_io_set_flag(io, DM_IO_ACCOUNTED);
563  		spin_unlock_irqrestore(&io->lock, flags);
564  	}
565  
566  	__dm_start_io_acct(io);
567  }
568  
569  static void dm_end_io_acct(struct dm_io *io)
570  {
571  	dm_io_acct(io, true);
572  }
573  
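/*
 * Allocate a dm_io (with its embedded dm_target_io and bio clone) from the
 * md's io_bs bioset and initialize its reference count, start time and
 * accounting flags.  Returns NULL if the clone allocation fails.
 */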
574  static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio, gfp_t gfp_mask)
575  {
576  	struct dm_io *io;
577  	struct dm_target_io *tio;
578  	struct bio *clone;
579  
580  	clone = bio_alloc_clone(NULL, bio, gfp_mask, &md->mempools->io_bs);
581  	if (unlikely(!clone))
582  		return NULL;
583  	tio = clone_to_tio(clone);
584  	tio->flags = 0;
585  	dm_tio_set_flag(tio, DM_TIO_INSIDE_DM_IO);
586  	tio->io = NULL;
587  
588  	io = container_of(tio, struct dm_io, tio);
589  	io->magic = DM_IO_MAGIC;
590  	io->status = BLK_STS_OK;
591  
592  	/* one ref is for submission, the other is for completion */
593  	atomic_set(&io->io_count, 2);
594  	this_cpu_inc(*md->pending_io);
595  	io->orig_bio = bio;
596  	io->md = md;
597  	spin_lock_init(&io->lock);
598  	io->start_time = jiffies;
599  	io->flags = 0;
600  	if (blk_queue_io_stat(md->queue))
601  		dm_io_set_flag(io, DM_IO_BLK_STAT);
602  
603  	if (static_branch_unlikely(&stats_enabled) &&
604  	    unlikely(dm_stats_used(&md->stats)))
605  		dm_stats_record_start(&md->stats, &io->stats_aux);
606  
607  	return io;
608  }
609  
610  static void free_io(struct dm_io *io)
611  {
612  	bio_put(&io->tio.clone);
613  }
614  
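/*
 * Allocate a clone for the given target.  The first clone reuses the
 * dm_target_io embedded in ci->io; any further clones are allocated from
 * the md's bs bioset.
 */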
615  static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti,
616  			     unsigned int target_bio_nr, unsigned int *len, gfp_t gfp_mask)
617  {
618  	struct mapped_device *md = ci->io->md;
619  	struct dm_target_io *tio;
620  	struct bio *clone;
621  
622  	if (!ci->io->tio.io) {
623  		/* the dm_target_io embedded in ci->io is available */
624  		tio = &ci->io->tio;
625  		/* alloc_io() already initialized embedded clone */
626  		clone = &tio->clone;
627  	} else {
628  		clone = bio_alloc_clone(NULL, ci->bio, gfp_mask,
629  					&md->mempools->bs);
630  		if (!clone)
631  			return NULL;
632  
633  		/* REQ_DM_POLL_LIST shouldn't be inherited */
634  		clone->bi_opf &= ~REQ_DM_POLL_LIST;
635  
636  		tio = clone_to_tio(clone);
637  		tio->flags = 0; /* also clears DM_TIO_INSIDE_DM_IO */
638  	}
639  
640  	tio->magic = DM_TIO_MAGIC;
641  	tio->io = ci->io;
642  	tio->ti = ti;
643  	tio->target_bio_nr = target_bio_nr;
644  	tio->len_ptr = len;
645  	tio->old_sector = 0;
646  
647  	/* Set default bdev, but target must bio_set_dev() before issuing IO */
648  	clone->bi_bdev = md->disk->part0;
649  	if (likely(ti != NULL) && unlikely(ti->needs_bio_set_dev))
650  		bio_set_dev(clone, md->disk->part0);
651  
652  	if (len) {
653  		clone->bi_iter.bi_size = to_bytes(*len);
654  		if (bio_integrity(clone))
655  			bio_integrity_trim(clone);
656  	}
657  
658  	return clone;
659  }
660  
661  static void free_tio(struct bio *clone)
662  {
663  	if (dm_tio_flagged(clone_to_tio(clone), DM_TIO_INSIDE_DM_IO))
664  		return;
665  	bio_put(clone);
666  }
667  
668  /*
669   * Add the bio to the list of deferred io.
670   */
671  static void queue_io(struct mapped_device *md, struct bio *bio)
672  {
673  	unsigned long flags;
674  
675  	spin_lock_irqsave(&md->deferred_lock, flags);
676  	bio_list_add(&md->deferred, bio);
677  	spin_unlock_irqrestore(&md->deferred_lock, flags);
678  	queue_work(md->wq, &md->work);
679  }
680  
681  /*
682   * Everyone (including functions in this file) should use this
683   * function to access the md->map field, and make sure they call
684   * dm_put_live_table() when finished.
685   */
686  struct dm_table *dm_get_live_table(struct mapped_device *md,
687  				   int *srcu_idx) __acquires(md->io_barrier)
688  {
689  	*srcu_idx = srcu_read_lock(&md->io_barrier);
690  
691  	return srcu_dereference(md->map, &md->io_barrier);
692  }
693  
694  void dm_put_live_table(struct mapped_device *md,
695  		       int srcu_idx) __releases(md->io_barrier)
696  {
697  	srcu_read_unlock(&md->io_barrier, srcu_idx);
698  }
699  
700  void dm_sync_table(struct mapped_device *md)
701  {
702  	synchronize_srcu(&md->io_barrier);
703  	synchronize_rcu_expedited();
704  }
705  
706  /*
707   * A fast alternative to dm_get_live_table/dm_put_live_table.
708   * The caller must not block between these two functions.
709   */
710  static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
711  {
712  	rcu_read_lock();
713  	return rcu_dereference(md->map);
714  }
715  
716  static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
717  {
718  	rcu_read_unlock();
719  }
720  
721  static char *_dm_claim_ptr = "I belong to device-mapper";
722  
723  /*
724   * Open a table device so we can use it as a map destination.
725   */
726  static struct table_device *open_table_device(struct mapped_device *md,
727  		dev_t dev, blk_mode_t mode)
728  {
729  	struct table_device *td;
730  	struct file *bdev_file;
731  	struct block_device *bdev;
732  	u64 part_off;
733  	int r;
734  
735  	td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
736  	if (!td)
737  		return ERR_PTR(-ENOMEM);
738  	refcount_set(&td->count, 1);
739  
740  	bdev_file = bdev_file_open_by_dev(dev, mode, _dm_claim_ptr, NULL);
741  	if (IS_ERR(bdev_file)) {
742  		r = PTR_ERR(bdev_file);
743  		goto out_free_td;
744  	}
745  
746  	bdev = file_bdev(bdev_file);
747  
748  	/*
749  	 * We can be called before the dm disk is added.  In that case we can't
750  	 * register the holder relation here.  It will be done once add_disk()
751  	 * has been called.
752  	 */
753  	if (md->disk->slave_dir) {
754  		r = bd_link_disk_holder(bdev, md->disk);
755  		if (r)
756  			goto out_blkdev_put;
757  	}
758  
759  	td->dm_dev.mode = mode;
760  	td->dm_dev.bdev = bdev;
761  	td->dm_dev.bdev_file = bdev_file;
762  	td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off,
763  						NULL, NULL);
764  	format_dev_t(td->dm_dev.name, dev);
765  	list_add(&td->list, &md->table_devices);
766  	return td;
767  
768  out_blkdev_put:
769  	__fput_sync(bdev_file);
770  out_free_td:
771  	kfree(td);
772  	return ERR_PTR(r);
773  }
774  
775  /*
776   * Close a table device that we've been using.
777   */
778  static void close_table_device(struct table_device *td, struct mapped_device *md)
779  {
780  	if (md->disk->slave_dir)
781  		bd_unlink_disk_holder(td->dm_dev.bdev, md->disk);
782  
783  	/* Leverage async fput() if DMF_DEFERRED_REMOVE set */
784  	if (unlikely(test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
785  		fput(td->dm_dev.bdev_file);
786  	else
787  		__fput_sync(td->dm_dev.bdev_file);
788  
789  	put_dax(td->dm_dev.dax_dev);
790  	list_del(&td->list);
791  	kfree(td);
792  }
793  
794  static struct table_device *find_table_device(struct list_head *l, dev_t dev,
795  					      blk_mode_t mode)
796  {
797  	struct table_device *td;
798  
799  	list_for_each_entry(td, l, list)
800  		if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
801  			return td;
802  
803  	return NULL;
804  }
805  
806  int dm_get_table_device(struct mapped_device *md, dev_t dev, blk_mode_t mode,
807  			struct dm_dev **result)
808  {
809  	struct table_device *td;
810  
811  	mutex_lock(&md->table_devices_lock);
812  	td = find_table_device(&md->table_devices, dev, mode);
813  	if (!td) {
814  		td = open_table_device(md, dev, mode);
815  		if (IS_ERR(td)) {
816  			mutex_unlock(&md->table_devices_lock);
817  			return PTR_ERR(td);
818  		}
819  	} else {
820  		refcount_inc(&td->count);
821  	}
822  	mutex_unlock(&md->table_devices_lock);
823  
824  	*result = &td->dm_dev;
825  	return 0;
826  }
827  
828  void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
829  {
830  	struct table_device *td = container_of(d, struct table_device, dm_dev);
831  
832  	mutex_lock(&md->table_devices_lock);
833  	if (refcount_dec_and_test(&td->count))
834  		close_table_device(td, md);
835  	mutex_unlock(&md->table_devices_lock);
836  }
837  
838  /*
839   * Get the geometry associated with a dm device
840   */
841  int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
842  {
843  	*geo = md->geometry;
844  
845  	return 0;
846  }
847  
848  /*
849   * Set the geometry of a device.
850   */
851  int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
852  {
853  	sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
854  
855  	if (geo->start > sz) {
856  		DMERR("Start sector is beyond the geometry limits.");
857  		return -EINVAL;
858  	}
859  
860  	md->geometry = *geo;
861  
862  	return 0;
863  }
864  
865  static int __noflush_suspending(struct mapped_device *md)
866  {
867  	return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
868  }
869  
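/*
 * First stage: chain the dm_io onto md->requeue_list.  Second stage: push
 * the original bio back onto md->deferred.  Called with md->deferred_lock
 * held.
 */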
870  static void dm_requeue_add_io(struct dm_io *io, bool first_stage)
871  {
872  	struct mapped_device *md = io->md;
873  
874  	if (first_stage) {
875  		struct dm_io *next = md->requeue_list;
876  
877  		md->requeue_list = io;
878  		io->next = next;
879  	} else {
880  		bio_list_add_head(&md->deferred, io->orig_bio);
881  	}
882  }
883  
884  static void dm_kick_requeue(struct mapped_device *md, bool first_stage)
885  {
886  	if (first_stage)
887  		queue_work(md->wq, &md->requeue_work);
888  	else
889  		queue_work(md->wq, &md->work);
890  }
891  
892  /*
893   * Return true if the dm_io's original bio is requeued.
894   * io->status is updated with error if requeue disallowed.
895   */
896  static bool dm_handle_requeue(struct dm_io *io, bool first_stage)
897  {
898  	struct bio *bio = io->orig_bio;
899  	bool handle_requeue = (io->status == BLK_STS_DM_REQUEUE);
900  	bool handle_polled_eagain = ((io->status == BLK_STS_AGAIN) &&
901  				     (bio->bi_opf & REQ_POLLED));
902  	struct mapped_device *md = io->md;
903  	bool requeued = false;
904  
905  	if (handle_requeue || handle_polled_eagain) {
906  		unsigned long flags;
907  
908  		if (bio->bi_opf & REQ_POLLED) {
909  			/*
910  			 * Upper layer won't help us poll split bio
911  			 * (io->orig_bio may only reflect a subset of the
912  			 * pre-split original) so clear REQ_POLLED.
913  			 */
914  			bio_clear_polled(bio);
915  		}
916  
917  		/*
918  		 * Target requested pushing back the I/O or
919  		 * polled IO hit BLK_STS_AGAIN.
920  		 */
921  		spin_lock_irqsave(&md->deferred_lock, flags);
922  		if ((__noflush_suspending(md) &&
923  		     !WARN_ON_ONCE(dm_is_zone_write(md, bio))) ||
924  		    handle_polled_eagain || first_stage) {
925  			dm_requeue_add_io(io, first_stage);
926  			requeued = true;
927  		} else {
928  			/*
929  			 * noflush suspend was interrupted or this is
930  			 * a write to a zoned target.
931  			 */
932  			io->status = BLK_STS_IOERR;
933  		}
934  		spin_unlock_irqrestore(&md->deferred_lock, flags);
935  	}
936  
937  	if (requeued)
938  		dm_kick_requeue(md, first_stage);
939  
940  	return requeued;
941  }
942  
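/*
 * Final completion of a dm_io: handle any requeue, finish accounting, free
 * the io, and then either resubmit the data portion of a flush or end the
 * original bio.
 */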
943  static void __dm_io_complete(struct dm_io *io, bool first_stage)
944  {
945  	struct bio *bio = io->orig_bio;
946  	struct mapped_device *md = io->md;
947  	blk_status_t io_error;
948  	bool requeued;
949  
950  	requeued = dm_handle_requeue(io, first_stage);
951  	if (requeued && first_stage)
952  		return;
953  
954  	io_error = io->status;
955  	if (dm_io_flagged(io, DM_IO_ACCOUNTED))
956  		dm_end_io_acct(io);
957  	else if (!io_error) {
958  		/*
959  		 * Must handle a target that returned DM_MAPIO_SUBMITTED but
960  		 * then called bio_endio() rather than dm_submit_bio_remap().
961  		 */
962  		__dm_start_io_acct(io);
963  		dm_end_io_acct(io);
964  	}
965  	free_io(io);
966  	smp_wmb();
967  	this_cpu_dec(*md->pending_io);
968  
969  	/* nudge anyone waiting on suspend queue */
970  	if (unlikely(wq_has_sleeper(&md->wait)))
971  		wake_up(&md->wait);
972  
973  	/* Return early if the original bio was requeued */
974  	if (requeued)
975  		return;
976  
977  	if (bio_is_flush_with_data(bio)) {
978  		/*
979  		 * Preflush done for flush with data, reissue
980  		 * without REQ_PREFLUSH.
981  		 */
982  		bio->bi_opf &= ~REQ_PREFLUSH;
983  		queue_io(md, bio);
984  	} else {
985  		/* done with normal IO or empty flush */
986  		if (io_error)
987  			bio->bi_status = io_error;
988  		bio_endio(bio);
989  	}
990  }
991  
992  static void dm_wq_requeue_work(struct work_struct *work)
993  {
994  	struct mapped_device *md = container_of(work, struct mapped_device,
995  						requeue_work);
996  	unsigned long flags;
997  	struct dm_io *io;
998  
999  	/* reuse deferred lock to simplify dm_handle_requeue */
1000  	spin_lock_irqsave(&md->deferred_lock, flags);
1001  	io = md->requeue_list;
1002  	md->requeue_list = NULL;
1003  	spin_unlock_irqrestore(&md->deferred_lock, flags);
1004  
1005  	while (io) {
1006  		struct dm_io *next = io->next;
1007  
1008  		dm_io_rewind(io, &md->disk->bio_split);
1009  
1010  		io->next = NULL;
1011  		__dm_io_complete(io, false);
1012  		io = next;
1013  		cond_resched();
1014  	}
1015  }
1016  
1017  /*
1018   * Two staged requeue:
1019   *
1020   * 1) io->orig_bio points to the real original bio, and the part mapped to
1021   *    this io must be requeued, instead of other parts of the original bio.
1022   *
1023   * 2) io->orig_bio points to a new cloned bio which matches the requeued dm_io.
1024   */
1025  static void dm_io_complete(struct dm_io *io)
1026  {
1027  	bool first_requeue;
1028  
1029  	/*
1030  	 * Only a dm_io that has been split needs the two-stage requeue;
1031  	 * otherwise we may run into a long bio clone chain during suspend
1032  	 * and trigger an OOM.
1033  	 *
1034  	 * A flush-with-data dm_io also won't be marked as DM_IO_WAS_SPLIT, so
1035  	 * it isn't handled via the first stage requeue either.
1036  	 */
1037  	if (dm_io_flagged(io, DM_IO_WAS_SPLIT))
1038  		first_requeue = true;
1039  	else
1040  		first_requeue = false;
1041  
1042  	__dm_io_complete(io, first_requeue);
1043  }
1044  
1045  /*
1046   * Decrements the number of outstanding ios that a bio has been
1047   * cloned into, completing the original io if necessary.
1048   */
1049  static inline void __dm_io_dec_pending(struct dm_io *io)
1050  {
1051  	if (atomic_dec_and_test(&io->io_count))
1052  		dm_io_complete(io);
1053  }
1054  
1055  static void dm_io_set_error(struct dm_io *io, blk_status_t error)
1056  {
1057  	unsigned long flags;
1058  
1059  	/* Push-back supersedes any I/O errors */
1060  	spin_lock_irqsave(&io->lock, flags);
1061  	if (!(io->status == BLK_STS_DM_REQUEUE &&
1062  	      __noflush_suspending(io->md))) {
1063  		io->status = error;
1064  	}
1065  	spin_unlock_irqrestore(&io->lock, flags);
1066  }
1067  
1068  static void dm_io_dec_pending(struct dm_io *io, blk_status_t error)
1069  {
1070  	if (unlikely(error))
1071  		dm_io_set_error(io, error);
1072  
1073  	__dm_io_dec_pending(io);
1074  }
1075  
1076  /*
1077   * The queue_limits are only valid as long as you have a reference
1078   * count on 'md'. But verification is _not_ imposed, to avoid an atomic_read().
1079   */
1080  static inline struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
1081  {
1082  	return &md->queue->limits;
1083  }
1084  
1085  void disable_discard(struct mapped_device *md)
1086  {
1087  	struct queue_limits *limits = dm_get_queue_limits(md);
1088  
1089  	/* device doesn't really support DISCARD, disable it */
1090  	limits->max_hw_discard_sectors = 0;
1091  }
1092  
1093  void disable_write_zeroes(struct mapped_device *md)
1094  {
1095  	struct queue_limits *limits = dm_get_queue_limits(md);
1096  
1097  	/* device doesn't really support WRITE ZEROES, disable it */
1098  	limits->max_write_zeroes_sectors = 0;
1099  }
1100  
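/* Return true if this is swap IO and the target throttles swap bios. */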
1101  static bool swap_bios_limit(struct dm_target *ti, struct bio *bio)
1102  {
1103  	return unlikely((bio->bi_opf & REQ_SWAP) != 0) && unlikely(ti->limit_swap_bios);
1104  }
1105  
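/*
 * Completion handler for bio clones issued by DM core: disable features the
 * device turned out not to support, do zone end_io processing, call the
 * target's ->end_io hook (if any), release the swap-bios throttle and drop
 * this clone's reference on the dm_io.
 */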
1106  static void clone_endio(struct bio *bio)
1107  {
1108  	blk_status_t error = bio->bi_status;
1109  	struct dm_target_io *tio = clone_to_tio(bio);
1110  	struct dm_target *ti = tio->ti;
1111  	dm_endio_fn endio = likely(ti != NULL) ? ti->type->end_io : NULL;
1112  	struct dm_io *io = tio->io;
1113  	struct mapped_device *md = io->md;
1114  
1115  	if (unlikely(error == BLK_STS_TARGET)) {
1116  		if (bio_op(bio) == REQ_OP_DISCARD &&
1117  		    !bdev_max_discard_sectors(bio->bi_bdev))
1118  			disable_discard(md);
1119  		else if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
1120  			 !bdev_write_zeroes_sectors(bio->bi_bdev))
1121  			disable_write_zeroes(md);
1122  	}
1123  
1124  	if (static_branch_unlikely(&zoned_enabled) &&
1125  	    unlikely(bdev_is_zoned(bio->bi_bdev)))
1126  		dm_zone_endio(io, bio);
1127  
1128  	if (endio) {
1129  		int r = endio(ti, bio, &error);
1130  
1131  		switch (r) {
1132  		case DM_ENDIO_REQUEUE:
1133  			if (static_branch_unlikely(&zoned_enabled)) {
1134  				/*
1135  				 * Requeuing writes to a sequential zone of a zoned
1136  				 * target will break the sequential write pattern:
1137  				 * fail such IO.
1138  				 */
1139  				if (WARN_ON_ONCE(dm_is_zone_write(md, bio)))
1140  					error = BLK_STS_IOERR;
1141  				else
1142  					error = BLK_STS_DM_REQUEUE;
1143  			} else
1144  				error = BLK_STS_DM_REQUEUE;
1145  			fallthrough;
1146  		case DM_ENDIO_DONE:
1147  			break;
1148  		case DM_ENDIO_INCOMPLETE:
1149  			/* The target will handle the io */
1150  			return;
1151  		default:
1152  			DMCRIT("unimplemented target endio return value: %d", r);
1153  			BUG();
1154  		}
1155  	}
1156  
1157  	if (static_branch_unlikely(&swap_bios_enabled) &&
1158  	    likely(ti != NULL) && unlikely(swap_bios_limit(ti, bio)))
1159  		up(&md->swap_bios_semaphore);
1160  
1161  	free_tio(bio);
1162  	dm_io_dec_pending(io, error);
1163  }
1164  
1165  /*
1166   * Return maximum size of I/O possible at the supplied sector up to the current
1167   * target boundary.
1168   */
1169  static inline sector_t max_io_len_target_boundary(struct dm_target *ti,
1170  						  sector_t target_offset)
1171  {
1172  	return ti->len - target_offset;
1173  }
1174  
1175  static sector_t __max_io_len(struct dm_target *ti, sector_t sector,
1176  			     unsigned int max_granularity,
1177  			     unsigned int max_sectors)
1178  {
1179  	sector_t target_offset = dm_target_offset(ti, sector);
1180  	sector_t len = max_io_len_target_boundary(ti, target_offset);
1181  
1182  	/*
1183  	 * Does the target need to split IO even further?
1184  	 * - varied (per target) IO splitting is a tenet of DM; this
1185  	 *   explains why stacked chunk_sectors based splitting via
1186  	 *   bio_split_to_limits() isn't possible here.
1187  	 */
1188  	if (!max_granularity)
1189  		return len;
1190  	return min_t(sector_t, len,
1191  		min(max_sectors ? : queue_max_sectors(ti->table->md->queue),
1192  		    blk_boundary_sectors_left(target_offset, max_granularity)));
1193  }
1194  
1195  static inline sector_t max_io_len(struct dm_target *ti, sector_t sector)
1196  {
1197  	return __max_io_len(ti, sector, ti->max_io_len, 0);
1198  }
1199  
1200  int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
1201  {
1202  	if (len > UINT_MAX) {
1203  		DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
1204  		      (unsigned long long)len, UINT_MAX);
1205  		ti->error = "Maximum size of target IO is too large";
1206  		return -EINVAL;
1207  	}
1208  
1209  	ti->max_io_len = (uint32_t) len;
1210  
1211  	return 0;
1212  }
1213  EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
1214  
1215  static struct dm_target *dm_dax_get_live_target(struct mapped_device *md,
1216  						sector_t sector, int *srcu_idx)
1217  	__acquires(md->io_barrier)
1218  {
1219  	struct dm_table *map;
1220  	struct dm_target *ti;
1221  
1222  	map = dm_get_live_table(md, srcu_idx);
1223  	if (!map)
1224  		return NULL;
1225  
1226  	ti = dm_table_find_target(map, sector);
1227  	if (!ti)
1228  		return NULL;
1229  
1230  	return ti;
1231  }
1232  
1233  static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
1234  		long nr_pages, enum dax_access_mode mode, void **kaddr,
1235  		pfn_t *pfn)
1236  {
1237  	struct mapped_device *md = dax_get_private(dax_dev);
1238  	sector_t sector = pgoff * PAGE_SECTORS;
1239  	struct dm_target *ti;
1240  	long len, ret = -EIO;
1241  	int srcu_idx;
1242  
1243  	ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1244  
1245  	if (!ti)
1246  		goto out;
1247  	if (!ti->type->direct_access)
1248  		goto out;
1249  	len = max_io_len(ti, sector) / PAGE_SECTORS;
1250  	if (len < 1)
1251  		goto out;
1252  	nr_pages = min(len, nr_pages);
1253  	ret = ti->type->direct_access(ti, pgoff, nr_pages, mode, kaddr, pfn);
1254  
1255   out:
1256  	dm_put_live_table(md, srcu_idx);
1257  
1258  	return ret;
1259  }
1260  
1261  static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
1262  				  size_t nr_pages)
1263  {
1264  	struct mapped_device *md = dax_get_private(dax_dev);
1265  	sector_t sector = pgoff * PAGE_SECTORS;
1266  	struct dm_target *ti;
1267  	int ret = -EIO;
1268  	int srcu_idx;
1269  
1270  	ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1271  
1272  	if (!ti)
1273  		goto out;
1274  	if (WARN_ON(!ti->type->dax_zero_page_range)) {
1275  		/*
1276  		 * ->zero_page_range() is a mandatory dax operation. If we are
1277  		 *  here, something is wrong.
1278  		 */
1279  		goto out;
1280  	}
1281  	ret = ti->type->dax_zero_page_range(ti, pgoff, nr_pages);
1282   out:
1283  	dm_put_live_table(md, srcu_idx);
1284  
1285  	return ret;
1286  }
1287  
1288  static size_t dm_dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff,
1289  		void *addr, size_t bytes, struct iov_iter *i)
1290  {
1291  	struct mapped_device *md = dax_get_private(dax_dev);
1292  	sector_t sector = pgoff * PAGE_SECTORS;
1293  	struct dm_target *ti;
1294  	int srcu_idx;
1295  	long ret = 0;
1296  
1297  	ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1298  	if (!ti || !ti->type->dax_recovery_write)
1299  		goto out;
1300  
1301  	ret = ti->type->dax_recovery_write(ti, pgoff, addr, bytes, i);
1302  out:
1303  	dm_put_live_table(md, srcu_idx);
1304  	return ret;
1305  }
1306  
1307  /*
1308   * A target may call dm_accept_partial_bio only from the map routine.  It is
1309   * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_* zone management
1310   * operations, REQ_OP_ZONE_APPEND (zone append writes) and any bio serviced by
1311   * __send_duplicate_bios().
1312   *
1313   * dm_accept_partial_bio informs the dm that the target only wants to process
1314   * additional n_sectors sectors of the bio and the rest of the data should be
1315   * sent in a next bio.
1316   *
1317   * A diagram that explains the arithmetic:
1318   * +--------------------+---------------+-------+
1319   * |         1          |       2       |   3   |
1320   * +--------------------+---------------+-------+
1321   *
1322   * <-------------- *tio->len_ptr --------------->
1323   *                      <----- bio_sectors ----->
1324   *                      <-- n_sectors -->
1325   *
1326   * Region 1 was already iterated over with bio_advance or similar function.
1327   *	(it may be empty if the target doesn't use bio_advance)
1328   * Region 2 is the remaining bio size that the target wants to process.
1329   *	(it may be empty if region 1 is non-empty, although there is no reason
1330   *	 to make it empty)
1331   * The target requires that region 3 is to be sent in the next bio.
1332   *
1333   * If the target wants to receive multiple copies of the bio (via num_*bios, etc),
1334   * the partially processed part (the sum of regions 1+2) must be the same for all
1335   * copies of the bio.
1336   */
1337  void dm_accept_partial_bio(struct bio *bio, unsigned int n_sectors)
1338  {
1339  	struct dm_target_io *tio = clone_to_tio(bio);
1340  	struct dm_io *io = tio->io;
1341  	unsigned int bio_sectors = bio_sectors(bio);
1342  
1343  	BUG_ON(dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO));
1344  	BUG_ON(op_is_zone_mgmt(bio_op(bio)));
1345  	BUG_ON(bio_op(bio) == REQ_OP_ZONE_APPEND);
1346  	BUG_ON(bio_sectors > *tio->len_ptr);
1347  	BUG_ON(n_sectors > bio_sectors);
1348  
1349  	*tio->len_ptr -= bio_sectors - n_sectors;
1350  	bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
1351  
1352  	/*
1353  	 * __split_and_process_bio() may have already saved mapped part
1354  	 * for accounting but it is being reduced so update accordingly.
1355  	 */
1356  	dm_io_set_flag(io, DM_IO_WAS_SPLIT);
1357  	io->sectors = n_sectors;
1358  	io->sector_offset = bio_sectors(io->orig_bio);
1359  }
1360  EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
1361  
1362  /*
1363   * @clone: clone bio that DM core passed to target's .map function
1364   * @tgt_clone: clone of @clone bio that target needs submitted
1365   *
1366   * Targets should use this interface to submit bios they take
1367   * ownership of when returning DM_MAPIO_SUBMITTED.
1368   *
1369   * Target should also enable ti->accounts_remapped_io
1370   */
1371  void dm_submit_bio_remap(struct bio *clone, struct bio *tgt_clone)
1372  {
1373  	struct dm_target_io *tio = clone_to_tio(clone);
1374  	struct dm_io *io = tio->io;
1375  
1376  	/* establish bio that will get submitted */
1377  	if (!tgt_clone)
1378  		tgt_clone = clone;
1379  
1380  	/*
1381  	 * Account io->orig_bio to DM dev on behalf of target
1382  	 * that took ownership of IO with DM_MAPIO_SUBMITTED.
1383  	 */
1384  	dm_start_io_acct(io, clone);
1385  
1386  	trace_block_bio_remap(tgt_clone, disk_devt(io->md->disk),
1387  			      tio->old_sector);
1388  	submit_bio_noacct(tgt_clone);
1389  }
1390  EXPORT_SYMBOL_GPL(dm_submit_bio_remap);
1391  
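/*
 * Resize md->swap_bios (and the matching semaphore count) to the latched
 * value of the swap_bios module parameter.
 */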
1392  static noinline void __set_swap_bios_limit(struct mapped_device *md, int latch)
1393  {
1394  	mutex_lock(&md->swap_bios_lock);
1395  	while (latch < md->swap_bios) {
1396  		cond_resched();
1397  		down(&md->swap_bios_semaphore);
1398  		md->swap_bios--;
1399  	}
1400  	while (latch > md->swap_bios) {
1401  		cond_resched();
1402  		up(&md->swap_bios_semaphore);
1403  		md->swap_bios++;
1404  	}
1405  	mutex_unlock(&md->swap_bios_lock);
1406  }
1407  
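/*
 * Hand the clone to the target's ->map hook (with direct calls for the
 * common linear and stripe targets) and act on the returned DM_MAPIO_* code.
 */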
1408  static void __map_bio(struct bio *clone)
1409  {
1410  	struct dm_target_io *tio = clone_to_tio(clone);
1411  	struct dm_target *ti = tio->ti;
1412  	struct dm_io *io = tio->io;
1413  	struct mapped_device *md = io->md;
1414  	int r;
1415  
1416  	clone->bi_end_io = clone_endio;
1417  
1418  	/*
1419  	 * Map the clone.
1420  	 */
1421  	tio->old_sector = clone->bi_iter.bi_sector;
1422  
1423  	if (static_branch_unlikely(&swap_bios_enabled) &&
1424  	    unlikely(swap_bios_limit(ti, clone))) {
1425  		int latch = get_swap_bios();
1426  
1427  		if (unlikely(latch != md->swap_bios))
1428  			__set_swap_bios_limit(md, latch);
1429  		down(&md->swap_bios_semaphore);
1430  	}
1431  
1432  	if (likely(ti->type->map == linear_map))
1433  		r = linear_map(ti, clone);
1434  	else if (ti->type->map == stripe_map)
1435  		r = stripe_map(ti, clone);
1436  	else
1437  		r = ti->type->map(ti, clone);
1438  
1439  	switch (r) {
1440  	case DM_MAPIO_SUBMITTED:
1441  		/* target has assumed ownership of this io */
1442  		if (!ti->accounts_remapped_io)
1443  			dm_start_io_acct(io, clone);
1444  		break;
1445  	case DM_MAPIO_REMAPPED:
1446  		dm_submit_bio_remap(clone, NULL);
1447  		break;
1448  	case DM_MAPIO_KILL:
1449  	case DM_MAPIO_REQUEUE:
1450  		if (static_branch_unlikely(&swap_bios_enabled) &&
1451  		    unlikely(swap_bios_limit(ti, clone)))
1452  			up(&md->swap_bios_semaphore);
1453  		free_tio(clone);
1454  		if (r == DM_MAPIO_KILL)
1455  			dm_io_dec_pending(io, BLK_STS_IOERR);
1456  		else
1457  			dm_io_dec_pending(io, BLK_STS_DM_REQUEUE);
1458  		break;
1459  	default:
1460  		DMCRIT("unimplemented target map return value: %d", r);
1461  		BUG();
1462  	}
1463  }
1464  
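/*
 * If this mapping only covers part of the original bio, record the mapped
 * length and offset in the dm_io so accounting and requeue operate on the
 * correct range.
 */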
1465  static void setup_split_accounting(struct clone_info *ci, unsigned int len)
1466  {
1467  	struct dm_io *io = ci->io;
1468  
1469  	if (ci->sector_count > len) {
1470  		/*
1471  		 * Split needed, save the mapped part for accounting.
1472  		 * NOTE: dm_accept_partial_bio() will update accordingly.
1473  		 */
1474  		dm_io_set_flag(io, DM_IO_WAS_SPLIT);
1475  		io->sectors = len;
1476  		io->sector_offset = bio_sectors(ci->bio);
1477  	}
1478  }
1479  
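/*
 * Allocate num_bios clones for the given target.  The first pass may use
 * GFP_NOWAIT; if any allocation fails, the partial list is freed and the
 * whole set is retried with GFP_NOIO.
 */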
1480  static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci,
1481  				struct dm_target *ti, unsigned int num_bios,
1482  				unsigned *len, gfp_t gfp_flag)
1483  {
1484  	struct bio *bio;
1485  	int try = (gfp_flag & GFP_NOWAIT) ? 0 : 1;
1486  
1487  	for (; try < 2; try++) {
1488  		int bio_nr;
1489  
1490  		if (try && num_bios > 1)
1491  			mutex_lock(&ci->io->md->table_devices_lock);
1492  		for (bio_nr = 0; bio_nr < num_bios; bio_nr++) {
1493  			bio = alloc_tio(ci, ti, bio_nr, len,
1494  					try ? GFP_NOIO : GFP_NOWAIT);
1495  			if (!bio)
1496  				break;
1497  
1498  			bio_list_add(blist, bio);
1499  		}
1500  		if (try && num_bios > 1)
1501  			mutex_unlock(&ci->io->md->table_devices_lock);
1502  		if (bio_nr == num_bios)
1503  			return;
1504  
1505  		while ((bio = bio_list_pop(blist)))
1506  			free_tio(bio);
1507  	}
1508  }
1509  
1510  static unsigned int __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
1511  					  unsigned int num_bios, unsigned int *len,
1512  					  gfp_t gfp_flag)
1513  {
1514  	struct bio_list blist = BIO_EMPTY_LIST;
1515  	struct bio *clone;
1516  	unsigned int ret = 0;
1517  
1518  	if (WARN_ON_ONCE(num_bios == 0)) /* num_bios = 0 is a bug in caller */
1519  		return 0;
1520  
1521  	/* dm_accept_partial_bio() is not supported with shared tio->len_ptr */
1522  	if (len)
1523  		setup_split_accounting(ci, *len);
1524  
1525  	/*
1526  	 * Using alloc_multiple_bios(), even if num_bios is 1, to consistently
1527  	 * support allocating using GFP_NOWAIT with GFP_NOIO fallback.
1528  	 */
1529  	alloc_multiple_bios(&blist, ci, ti, num_bios, len, gfp_flag);
1530  	while ((clone = bio_list_pop(&blist))) {
1531  		if (num_bios > 1)
1532  			dm_tio_set_flag(clone_to_tio(clone), DM_TIO_IS_DUPLICATE_BIO);
1533  		__map_bio(clone);
1534  		ret += 1;
1535  	}
1536  
1537  	return ret;
1538  }
1539  
1540  static void __send_empty_flush(struct clone_info *ci)
1541  {
1542  	struct dm_table *t = ci->map;
1543  	struct bio flush_bio;
1544  
1545  	/*
1546  	 * Use an on-stack bio for this, it's safe since we don't
1547  	 * need to reference it after submit. It's just used as
1548  	 * the basis for the clone(s).
1549  	 */
1550  	bio_init(&flush_bio, ci->io->md->disk->part0, NULL, 0,
1551  		 REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC);
1552  
1553  	ci->bio = &flush_bio;
1554  	ci->sector_count = 0;
1555  	ci->io->tio.clone.bi_iter.bi_size = 0;
1556  
1557  	if (!t->flush_bypasses_map) {
1558  		for (unsigned int i = 0; i < t->num_targets; i++) {
1559  			unsigned int bios;
1560  			struct dm_target *ti = dm_table_get_target(t, i);
1561  
1562  			if (unlikely(ti->num_flush_bios == 0))
1563  				continue;
1564  
1565  			atomic_add(ti->num_flush_bios, &ci->io->io_count);
1566  			bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios,
1567  						     NULL, GFP_NOWAIT);
1568  			atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count);
1569  		}
1570  	} else {
1571  		/*
1572  		 * Note that there's no need to grab t->devices_lock here
1573  		 * because the targets that support flush optimization don't
1574  		 * modify the list of devices.
1575  		 */
1576  		struct list_head *devices = dm_table_get_devices(t);
1577  		unsigned int len = 0;
1578  		struct dm_dev_internal *dd;
1579  		list_for_each_entry(dd, devices, list) {
1580  			struct bio *clone;
1581  			/*
1582  			 * Note that the structure dm_target_io is not
1583  			 * associated with any target (because the device may be
1584  			 * used by multiple targets), so we set tio->ti = NULL.
1585  			 * We must check for NULL in the I/O processing path, to
1586  			 * avoid NULL pointer dereference.
1587  			 */
1588  			clone = alloc_tio(ci, NULL, 0, &len, GFP_NOIO);
1589  			atomic_add(1, &ci->io->io_count);
1590  			bio_set_dev(clone, dd->dm_dev->bdev);
1591  			clone->bi_end_io = clone_endio;
1592  			dm_submit_bio_remap(clone, NULL);
1593  		}
1594  	}
1595  
1596  	/*
1597  	 * alloc_io() takes one extra reference for submission, so the
1598  	 * reference won't reach 0 without the following subtraction
1599  	 */
1600  	atomic_sub(1, &ci->io->io_count);
1601  
1602  	bio_uninit(ci->bio);
1603  }
1604  
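/*
 * Issue num_bios duplicate clones covering up to __max_io_len() sectors,
 * fix up the dm_io reference count for any clones that could not be
 * allocated, and advance the clone_info past the processed range.
 */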
1605  static void __send_abnormal_io(struct clone_info *ci, struct dm_target *ti,
1606  			       unsigned int num_bios, unsigned int max_granularity,
1607  			       unsigned int max_sectors)
1608  {
1609  	unsigned int len, bios;
1610  
1611  	len = min_t(sector_t, ci->sector_count,
1612  		    __max_io_len(ti, ci->sector, max_granularity, max_sectors));
1613  
1614  	atomic_add(num_bios, &ci->io->io_count);
1615  	bios = __send_duplicate_bios(ci, ti, num_bios, &len, GFP_NOIO);
1616  	/*
1617  	 * alloc_io() takes one extra reference for submission, so the
1618  	 * reference won't reach 0 without the following (+1) subtraction
1619  	 */
1620  	atomic_sub(num_bios - bios + 1, &ci->io->io_count);
1621  
1622  	ci->sector += len;
1623  	ci->sector_count -= len;
1624  }
1625  
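/*
 * "Abnormal" bios (discard, secure erase, write zeroes, zone reset all) are
 * handled by __process_abnormal_io() rather than the normal splitting and
 * mapping path.
 */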
1626  static bool is_abnormal_io(struct bio *bio)
1627  {
1628  	switch (bio_op(bio)) {
1629  	case REQ_OP_READ:
1630  	case REQ_OP_WRITE:
1631  	case REQ_OP_FLUSH:
1632  		return false;
1633  	case REQ_OP_DISCARD:
1634  	case REQ_OP_SECURE_ERASE:
1635  	case REQ_OP_WRITE_ZEROES:
1636  	case REQ_OP_ZONE_RESET_ALL:
1637  		return true;
1638  	default:
1639  		return false;
1640  	}
1641  }
1642  
1643  static blk_status_t __process_abnormal_io(struct clone_info *ci,
1644  					  struct dm_target *ti)
1645  {
1646  	unsigned int num_bios = 0;
1647  	unsigned int max_granularity = 0;
1648  	unsigned int max_sectors = 0;
1649  	struct queue_limits *limits = dm_get_queue_limits(ti->table->md);
1650  
1651  	switch (bio_op(ci->bio)) {
1652  	case REQ_OP_DISCARD:
1653  		num_bios = ti->num_discard_bios;
1654  		max_sectors = limits->max_discard_sectors;
1655  		if (ti->max_discard_granularity)
1656  			max_granularity = max_sectors;
1657  		break;
1658  	case REQ_OP_SECURE_ERASE:
1659  		num_bios = ti->num_secure_erase_bios;
1660  		max_sectors = limits->max_secure_erase_sectors;
1661  		break;
1662  	case REQ_OP_WRITE_ZEROES:
1663  		num_bios = ti->num_write_zeroes_bios;
1664  		max_sectors = limits->max_write_zeroes_sectors;
1665  		break;
1666  	default:
1667  		break;
1668  	}
1669  
1670  	/*
1671  	 * Even though the device advertised support for this type of
1672  	 * request, that does not mean every target supports it, and
1673  	 * reconfiguration might also have changed that since the
1674  	 * check was performed.
1675  	 */
1676  	if (unlikely(!num_bios))
1677  		return BLK_STS_NOTSUPP;
1678  
1679  	__send_abnormal_io(ci, ti, num_bios, max_granularity, max_sectors);
1680  
1681  	return BLK_STS_OK;
1682  }
1683  
1684  /*
1685   * Reuse ->bi_private as dm_io list head for storing all dm_io instances
1686   * associated with this bio, and this bio's bi_private needs to be
1687   * stored in dm_io->data before the reuse.
1688   *
1689   * bio->bi_private is owned by fs or upper layer, so block layer won't
1690   * touch it after splitting. Meantime it won't be changed by anyone after
1691   * bio is submitted. So this reuse is safe.
1692   */
1693  static inline struct dm_io **dm_poll_list_head(struct bio *bio)
1694  {
1695  	return (struct dm_io **)&bio->bi_private;
1696  }
1697  
1698  static void dm_queue_poll_io(struct bio *bio, struct dm_io *io)
1699  {
1700  	struct dm_io **head = dm_poll_list_head(bio);
1701  
1702  	if (!(bio->bi_opf & REQ_DM_POLL_LIST)) {
1703  		bio->bi_opf |= REQ_DM_POLL_LIST;
1704  		/*
1705  		 * Save .bi_private into dm_io, so that we can reuse
1706  		 * .bi_private as dm_io list head for storing dm_io list
1707  		 */
1708  		io->data = bio->bi_private;
1709  
1710  		/* tell block layer to poll for completion */
1711  		bio->bi_cookie = ~BLK_QC_T_NONE;
1712  
1713  		io->next = NULL;
1714  	} else {
1715  		/*
1716  		 * bio recursed due to split, reuse original poll list,
1717  		 * and save bio->bi_private too.
1718  		 */
1719  		io->data = (*head)->data;
1720  		io->next = *head;
1721  	}
1722  
1723  	*head = io;
1724  }
1725  
1726  /*
1727   * Select the correct strategy for processing a non-flush bio.
1728   */
1729  static blk_status_t __split_and_process_bio(struct clone_info *ci)
1730  {
1731  	struct bio *clone;
1732  	struct dm_target *ti;
1733  	unsigned int len;
1734  
1735  	ti = dm_table_find_target(ci->map, ci->sector);
1736  	if (unlikely(!ti))
1737  		return BLK_STS_IOERR;
1738  
1739  	if (unlikely(ci->is_abnormal_io))
1740  		return __process_abnormal_io(ci, ti);
1741  
1742  	/*
1743  	 * Only support bio polling for normal IO, and the target io is
1744  	 * exactly inside the dm_io instance (verified in dm_poll_dm_io)
1745  	 */
1746  	ci->submit_as_polled = !!(ci->bio->bi_opf & REQ_POLLED);
1747  
1748  	len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count);
1749  	setup_split_accounting(ci, len);
1750  
1751  	if (unlikely(ci->bio->bi_opf & REQ_NOWAIT)) {
1752  		if (unlikely(!dm_target_supports_nowait(ti->type)))
1753  			return BLK_STS_NOTSUPP;
1754  
1755  		clone = alloc_tio(ci, ti, 0, &len, GFP_NOWAIT);
1756  		if (unlikely(!clone))
1757  			return BLK_STS_AGAIN;
1758  	} else {
1759  		clone = alloc_tio(ci, ti, 0, &len, GFP_NOIO);
1760  	}
1761  	__map_bio(clone);
1762  
1763  	ci->sector += len;
1764  	ci->sector_count -= len;
1765  
1766  	return BLK_STS_OK;
1767  }
1768  
1769  static void init_clone_info(struct clone_info *ci, struct dm_io *io,
1770  			    struct dm_table *map, struct bio *bio, bool is_abnormal)
1771  {
1772  	ci->map = map;
1773  	ci->io = io;
1774  	ci->bio = bio;
1775  	ci->is_abnormal_io = is_abnormal;
1776  	ci->submit_as_polled = false;
1777  	ci->sector = bio->bi_iter.bi_sector;
1778  	ci->sector_count = bio_sectors(bio);
1779  
1780  	/* Zone mgmt bios carry no data; warn and clear sector_count if set. */
1781  	if (static_branch_unlikely(&zoned_enabled) &&
1782  	    WARN_ON_ONCE(op_is_zone_mgmt(bio_op(bio)) && ci->sector_count))
1783  		ci->sector_count = 0;
1784  }
1785  
1786  #ifdef CONFIG_BLK_DEV_ZONED
1787  static inline bool dm_zone_bio_needs_split(struct mapped_device *md,
1788  					   struct bio *bio)
1789  {
1790  	/*
1791  	 * For mapped devices that need zone append emulation, we must
1792  	 * split any large BIO that straddles zone boundaries.
1793  	 */
1794  	return dm_emulate_zone_append(md) && bio_straddles_zones(bio) &&
1795  		!bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING);
1796  }
1797  static inline bool dm_zone_plug_bio(struct mapped_device *md, struct bio *bio)
1798  {
1799  	return dm_emulate_zone_append(md) && blk_zone_plug_bio(bio, 0);
1800  }
1801  
1802  static blk_status_t __send_zone_reset_all_emulated(struct clone_info *ci,
1803  						   struct dm_target *ti)
1804  {
1805  	struct bio_list blist = BIO_EMPTY_LIST;
1806  	struct mapped_device *md = ci->io->md;
1807  	unsigned int zone_sectors = md->disk->queue->limits.chunk_sectors;
1808  	unsigned long *need_reset;
1809  	unsigned int i, nr_zones, nr_reset;
1810  	unsigned int num_bios = 0;
1811  	blk_status_t sts = BLK_STS_OK;
1812  	sector_t sector = ti->begin;
1813  	struct bio *clone;
1814  	int ret;
1815  
1816  	nr_zones = ti->len >> ilog2(zone_sectors);
1817  	need_reset = bitmap_zalloc(nr_zones, GFP_NOIO);
1818  	if (!need_reset)
1819  		return BLK_STS_RESOURCE;
1820  
1821  	ret = dm_zone_get_reset_bitmap(md, ci->map, ti->begin,
1822  				       nr_zones, need_reset);
1823  	if (ret) {
1824  		sts = BLK_STS_IOERR;
1825  		goto free_bitmap;
1826  	}
1827  
1828  	/* If we have no zone to reset, we are done. */
1829  	nr_reset = bitmap_weight(need_reset, nr_zones);
1830  	if (!nr_reset)
1831  		goto free_bitmap;
1832  
1833  	atomic_add(nr_zones, &ci->io->io_count);
1834  
1835  	for (i = 0; i < nr_zones; i++) {
1836  
1837  		if (!test_bit(i, need_reset)) {
1838  			sector += zone_sectors;
1839  			continue;
1840  		}
1841  
1842  		if (bio_list_empty(&blist)) {
1843  			/* This may take a while, so be nice to others */
1844  			if (num_bios)
1845  				cond_resched();
1846  
1847  			/*
1848  			 * We may need to reset thousands of zones, so let's
1849  			 * not go crazy with the clone allocation.
1850  			 */
1851  			alloc_multiple_bios(&blist, ci, ti, min(nr_reset, 32),
1852  					    NULL, GFP_NOIO);
1853  		}
1854  
1855  		/* Get a clone and change it to a regular reset operation. */
1856  		clone = bio_list_pop(&blist);
1857  		clone->bi_opf &= ~REQ_OP_MASK;
1858  		clone->bi_opf |= REQ_OP_ZONE_RESET | REQ_SYNC;
1859  		clone->bi_iter.bi_sector = sector;
1860  		clone->bi_iter.bi_size = 0;
1861  		__map_bio(clone);
1862  
1863  		sector += zone_sectors;
1864  		num_bios++;
1865  		nr_reset--;
1866  	}
1867  
1868  	WARN_ON_ONCE(!bio_list_empty(&blist));
1869  	atomic_sub(nr_zones - num_bios, &ci->io->io_count);
1870  	ci->sector_count = 0;
1871  
1872  free_bitmap:
1873  	bitmap_free(need_reset);
1874  
1875  	return sts;
1876  }
1877  
1878  static void __send_zone_reset_all_native(struct clone_info *ci,
1879  					 struct dm_target *ti)
1880  {
1881  	unsigned int bios;
1882  
1883  	atomic_add(1, &ci->io->io_count);
1884  	bios = __send_duplicate_bios(ci, ti, 1, NULL, GFP_NOIO);
1885  	atomic_sub(1 - bios, &ci->io->io_count);
1886  
1887  	ci->sector_count = 0;
1888  }
1889  
1890  static blk_status_t __send_zone_reset_all(struct clone_info *ci)
1891  {
1892  	struct dm_table *t = ci->map;
1893  	blk_status_t sts = BLK_STS_OK;
1894  
1895  	for (unsigned int i = 0; i < t->num_targets; i++) {
1896  		struct dm_target *ti = dm_table_get_target(t, i);
1897  
1898  		if (ti->zone_reset_all_supported) {
1899  			__send_zone_reset_all_native(ci, ti);
1900  			continue;
1901  		}
1902  
1903  		sts = __send_zone_reset_all_emulated(ci, ti);
1904  		if (sts != BLK_STS_OK)
1905  			break;
1906  	}
1907  
1908  	/* Release the reference that alloc_io() took for submission. */
1909  	atomic_sub(1, &ci->io->io_count);
1910  
1911  	return sts;
1912  }
1913  
1914  #else
1915  static inline bool dm_zone_bio_needs_split(struct mapped_device *md,
1916  					   struct bio *bio)
1917  {
1918  	return false;
1919  }
1920  static inline bool dm_zone_plug_bio(struct mapped_device *md, struct bio *bio)
1921  {
1922  	return false;
1923  }
1924  static blk_status_t __send_zone_reset_all(struct clone_info *ci)
1925  {
1926  	return BLK_STS_NOTSUPP;
1927  }
1928  #endif
1929  
1930  /*
1931   * Entry point to split a bio into clones and submit them to the targets.
1932   */
1933  static void dm_split_and_process_bio(struct mapped_device *md,
1934  				     struct dm_table *map, struct bio *bio)
1935  {
1936  	struct clone_info ci;
1937  	struct dm_io *io;
1938  	blk_status_t error = BLK_STS_OK;
1939  	bool is_abnormal, need_split;
1940  
1941  	is_abnormal = is_abnormal_io(bio);
1942  	if (static_branch_unlikely(&zoned_enabled)) {
1943  		/* Special case REQ_OP_ZONE_RESET_ALL as it cannot be split. */
1944  		need_split = (bio_op(bio) != REQ_OP_ZONE_RESET_ALL) &&
1945  			(is_abnormal || dm_zone_bio_needs_split(md, bio));
1946  	} else {
1947  		need_split = is_abnormal;
1948  	}
1949  
1950  	if (unlikely(need_split)) {
1951  		/*
1952  		 * Use bio_split_to_limits() for abnormal IO (e.g. discard);
1953  		 * otherwise the associated queue_limits won't be imposed.
1954  		 * Also split the BIO for mapped devices needing zone append
1955  		 * emulation to ensure that the BIO does not cross zone
1956  		 * boundaries.
1957  		 */
1958  		bio = bio_split_to_limits(bio);
1959  		if (!bio)
1960  			return;
1961  	}
1962  
1963  	/*
1964  	 * Use the block layer zone write plugging for mapped devices that
1965  	 * need zone append emulation (e.g. dm-crypt).
1966  	 */
1967  	if (static_branch_unlikely(&zoned_enabled) && dm_zone_plug_bio(md, bio))
1968  		return;
1969  
1970  	/* Only support nowait for normal IO */
1971  	if (unlikely(bio->bi_opf & REQ_NOWAIT) && !is_abnormal) {
1972  		io = alloc_io(md, bio, GFP_NOWAIT);
1973  		if (unlikely(!io)) {
1974  			/* Unable to do anything without dm_io. */
1975  			bio_wouldblock_error(bio);
1976  			return;
1977  		}
1978  	} else {
1979  		io = alloc_io(md, bio, GFP_NOIO);
1980  	}
1981  	init_clone_info(&ci, io, map, bio, is_abnormal);
1982  
1983  	if (bio->bi_opf & REQ_PREFLUSH) {
1984  		__send_empty_flush(&ci);
1985  		/* dm_io_complete submits any data associated with flush */
1986  		goto out;
1987  	}
1988  
1989  	if (static_branch_unlikely(&zoned_enabled) &&
1990  	    (bio_op(bio) == REQ_OP_ZONE_RESET_ALL)) {
1991  		error = __send_zone_reset_all(&ci);
1992  		goto out;
1993  	}
1994  
1995  	error = __split_and_process_bio(&ci);
1996  	if (error || !ci.sector_count)
1997  		goto out;
1998  	/*
1999  	 * Remainder must be passed to submit_bio_noacct() so it gets handled
2000  	 * *after* bios already submitted have been completely processed.
2001  	 */
2002  	bio_trim(bio, io->sectors, ci.sector_count);
2003  	trace_block_split(bio, bio->bi_iter.bi_sector);
2004  	bio_inc_remaining(bio);
2005  	submit_bio_noacct(bio);
2006  out:
2007  	/*
2008  	 * Drop the extra reference count for a non-POLLED bio; for a POLLED
2009  	 * bio hold one reference, which will be released in dm_poll_bio.
2010  	 *
2011  	 * Add every dm_io instance to the dm_io list head stored in
2012  	 * bio->bi_private so that dm_poll_bio can poll them all.
2013  	 */
2014  	if (error || !ci.submit_as_polled) {
2015  		/*
2016  		 * In case of submission failure, the extra reference for
2017  		 * submitting io isn't consumed yet
2018  		 */
2019  		if (error)
2020  			atomic_dec(&io->io_count);
2021  		dm_io_dec_pending(io, error);
2022  	} else
2023  		dm_queue_poll_io(bio, io);
2024  }
2025  
2026  static void dm_submit_bio(struct bio *bio)
2027  {
2028  	struct mapped_device *md = bio->bi_bdev->bd_disk->private_data;
2029  	int srcu_idx;
2030  	struct dm_table *map;
2031  
2032  	map = dm_get_live_table(md, &srcu_idx);
2033  	if (unlikely(!map)) {
2034  		DMERR_LIMIT("%s: mapping table unavailable, erroring io",
2035  			    dm_device_name(md));
2036  		bio_io_error(bio);
2037  		goto out;
2038  	}
2039  
2040  	/* If suspended, queue this IO for later */
2041  	if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
2042  		if (bio->bi_opf & REQ_NOWAIT)
2043  			bio_wouldblock_error(bio);
2044  		else if (bio->bi_opf & REQ_RAHEAD)
2045  			bio_io_error(bio);
2046  		else
2047  			queue_io(md, bio);
2048  		goto out;
2049  	}
2050  
2051  	dm_split_and_process_bio(md, map, bio);
2052  out:
2053  	dm_put_live_table(md, srcu_idx);
2054  }
2055  
2056  static bool dm_poll_dm_io(struct dm_io *io, struct io_comp_batch *iob,
2057  			  unsigned int flags)
2058  {
2059  	WARN_ON_ONCE(!dm_tio_is_normal(&io->tio));
2060  
2061  	/* don't poll if the mapped io is done */
2062  	if (atomic_read(&io->io_count) > 1)
2063  		bio_poll(&io->tio.clone, iob, flags);
2064  
2065  	/* bio_poll holds the last reference */
2066  	return atomic_read(&io->io_count) == 1;
2067  }
2068  
2069  static int dm_poll_bio(struct bio *bio, struct io_comp_batch *iob,
2070  		       unsigned int flags)
2071  {
2072  	struct dm_io **head = dm_poll_list_head(bio);
2073  	struct dm_io *list = *head;
2074  	struct dm_io *tmp = NULL;
2075  	struct dm_io *curr, *next;
2076  
2077  	/* Only poll normal bio which was marked as REQ_DM_POLL_LIST */
2078  	if (!(bio->bi_opf & REQ_DM_POLL_LIST))
2079  		return 0;
2080  
2081  	WARN_ON_ONCE(!list);
2082  
2083  	/*
2084  	 * Restore .bi_private before possibly completing dm_io.
2085  	 *
2086  	 * bio_poll() is only possible once @bio has been completely
2087  	 * submitted via submit_bio_noacct()'s depth-first submission.
2088  	 * So there is no dm_queue_poll_io() race associated with
2089  	 * clearing REQ_DM_POLL_LIST here.
2090  	 */
2091  	bio->bi_opf &= ~REQ_DM_POLL_LIST;
2092  	bio->bi_private = list->data;
2093  
2094  	for (curr = list, next = curr->next; curr; curr = next, next =
2095  			curr ? curr->next : NULL) {
2096  		if (dm_poll_dm_io(curr, iob, flags)) {
2097  			/*
2098  			 * clone_endio() has already occurred, so no
2099  			 * error handling is needed here.
2100  			 */
2101  			__dm_io_dec_pending(curr);
2102  		} else {
2103  			curr->next = tmp;
2104  			tmp = curr;
2105  		}
2106  	}
2107  
2108  	/* Not done? */
2109  	if (tmp) {
2110  		bio->bi_opf |= REQ_DM_POLL_LIST;
2111  		/* Reset bio->bi_private to dm_io list head */
2112  		*head = tmp;
2113  		return 0;
2114  	}
2115  	return 1;
2116  }
2117  
2118  /*
2119   *---------------------------------------------------------------
2120   * An IDR is used to keep track of allocated minor numbers.
2121   *---------------------------------------------------------------
2122   */
2123  static void free_minor(int minor)
2124  {
2125  	spin_lock(&_minor_lock);
2126  	idr_remove(&_minor_idr, minor);
2127  	spin_unlock(&_minor_lock);
2128  }
2129  
2130  /*
2131   * See if the device with a specific minor # is free.
2132   */
2133  static int specific_minor(int minor)
2134  {
2135  	int r;
2136  
2137  	if (minor >= (1 << MINORBITS))
2138  		return -EINVAL;
2139  
2140  	idr_preload(GFP_KERNEL);
2141  	spin_lock(&_minor_lock);
2142  
2143  	r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);
2144  
2145  	spin_unlock(&_minor_lock);
2146  	idr_preload_end();
2147  	if (r < 0)
2148  		return r == -ENOSPC ? -EBUSY : r;
2149  	return 0;
2150  }
2151  
2152  static int next_free_minor(int *minor)
2153  {
2154  	int r;
2155  
2156  	idr_preload(GFP_KERNEL);
2157  	spin_lock(&_minor_lock);
2158  
2159  	r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);
2160  
2161  	spin_unlock(&_minor_lock);
2162  	idr_preload_end();
2163  	if (r < 0)
2164  		return r;
2165  	*minor = r;
2166  	return 0;
2167  }
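
/*
 * Illustrative sketch only (hypothetical helper, kept under #if 0 so it is
 * never built): the minor allocators above follow the usual IDR recipe of
 * preloading with GFP_KERNEL outside the spinlock and then allocating with
 * GFP_NOWAIT under it.
 */
#if 0
static int example_alloc_id(struct idr *idr, spinlock_t *lock, void *ptr)
{
	int id;

	/* May sleep, so preload before taking the spinlock. */
	idr_preload(GFP_KERNEL);
	spin_lock(lock);
	id = idr_alloc(idr, ptr, 0, 0, GFP_NOWAIT);
	spin_unlock(lock);
	idr_preload_end();

	return id; /* >= 0 on success, -ENOSPC or -ENOMEM on failure */
}
#endif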
2168  
2169  static const struct block_device_operations dm_blk_dops;
2170  static const struct block_device_operations dm_rq_blk_dops;
2171  static const struct dax_operations dm_dax_ops;
2172  
2173  static void dm_wq_work(struct work_struct *work);
2174  
2175  #ifdef CONFIG_BLK_INLINE_ENCRYPTION
2176  static void dm_queue_destroy_crypto_profile(struct request_queue *q)
2177  {
2178  	dm_destroy_crypto_profile(q->crypto_profile);
2179  }
2180  
2181  #else /* CONFIG_BLK_INLINE_ENCRYPTION */
2182  
2183  static inline void dm_queue_destroy_crypto_profile(struct request_queue *q)
2184  {
2185  }
2186  #endif /* !CONFIG_BLK_INLINE_ENCRYPTION */
2187  
2188  static void cleanup_mapped_device(struct mapped_device *md)
2189  {
2190  	if (md->wq)
2191  		destroy_workqueue(md->wq);
2192  	dm_free_md_mempools(md->mempools);
2193  
2194  	if (md->dax_dev) {
2195  		dax_remove_host(md->disk);
2196  		kill_dax(md->dax_dev);
2197  		put_dax(md->dax_dev);
2198  		md->dax_dev = NULL;
2199  	}
2200  
2201  	if (md->disk) {
2202  		spin_lock(&_minor_lock);
2203  		md->disk->private_data = NULL;
2204  		spin_unlock(&_minor_lock);
2205  		if (dm_get_md_type(md) != DM_TYPE_NONE) {
2206  			struct table_device *td;
2207  
2208  			dm_sysfs_exit(md);
2209  			list_for_each_entry(td, &md->table_devices, list) {
2210  				bd_unlink_disk_holder(td->dm_dev.bdev,
2211  						      md->disk);
2212  			}
2213  
2214  			/*
2215  			 * Hold the lock to make sure del_gendisk() won't run
2216  			 * concurrently with open/close_table_device().
2217  			 */
2218  			mutex_lock(&md->table_devices_lock);
2219  			del_gendisk(md->disk);
2220  			mutex_unlock(&md->table_devices_lock);
2221  		}
2222  		dm_queue_destroy_crypto_profile(md->queue);
2223  		put_disk(md->disk);
2224  	}
2225  
2226  	if (md->pending_io) {
2227  		free_percpu(md->pending_io);
2228  		md->pending_io = NULL;
2229  	}
2230  
2231  	cleanup_srcu_struct(&md->io_barrier);
2232  
2233  	mutex_destroy(&md->suspend_lock);
2234  	mutex_destroy(&md->type_lock);
2235  	mutex_destroy(&md->table_devices_lock);
2236  	mutex_destroy(&md->swap_bios_lock);
2237  
2238  	dm_mq_cleanup_mapped_device(md);
2239  }
2240  
2241  /*
2242   * Allocate and initialise a blank device with a given minor.
2243   */
2244  static struct mapped_device *alloc_dev(int minor)
2245  {
2246  	int r, numa_node_id = dm_get_numa_node();
2247  	struct dax_device *dax_dev;
2248  	struct mapped_device *md;
2249  	void *old_md;
2250  
2251  	md = kvzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id);
2252  	if (!md) {
2253  		DMERR("unable to allocate device, out of memory.");
2254  		return NULL;
2255  	}
2256  
2257  	if (!try_module_get(THIS_MODULE))
2258  		goto bad_module_get;
2259  
2260  	/* get a minor number for the dev */
2261  	if (minor == DM_ANY_MINOR)
2262  		r = next_free_minor(&minor);
2263  	else
2264  		r = specific_minor(minor);
2265  	if (r < 0)
2266  		goto bad_minor;
2267  
2268  	r = init_srcu_struct(&md->io_barrier);
2269  	if (r < 0)
2270  		goto bad_io_barrier;
2271  
2272  	md->numa_node_id = numa_node_id;
2273  	md->init_tio_pdu = false;
2274  	md->type = DM_TYPE_NONE;
2275  	mutex_init(&md->suspend_lock);
2276  	mutex_init(&md->type_lock);
2277  	mutex_init(&md->table_devices_lock);
2278  	spin_lock_init(&md->deferred_lock);
2279  	atomic_set(&md->holders, 1);
2280  	atomic_set(&md->open_count, 0);
2281  	atomic_set(&md->event_nr, 0);
2282  	atomic_set(&md->uevent_seq, 0);
2283  	INIT_LIST_HEAD(&md->uevent_list);
2284  	INIT_LIST_HEAD(&md->table_devices);
2285  	spin_lock_init(&md->uevent_lock);
2286  
2287  	/*
2288  	 * Default to bio-based until a DM table is loaded and md->type is
2289  	 * established. If a request-based table is loaded, blk-mq will
2290  	 * override this accordingly.
2291  	 */
2292  	md->disk = blk_alloc_disk(NULL, md->numa_node_id);
2293  	if (IS_ERR(md->disk)) {
2294  		md->disk = NULL;
2295  		goto bad;
2296  	}
2297  	md->queue = md->disk->queue;
2298  
2299  	init_waitqueue_head(&md->wait);
2300  	INIT_WORK(&md->work, dm_wq_work);
2301  	INIT_WORK(&md->requeue_work, dm_wq_requeue_work);
2302  	init_waitqueue_head(&md->eventq);
2303  	init_completion(&md->kobj_holder.completion);
2304  
2305  	md->requeue_list = NULL;
2306  	md->swap_bios = get_swap_bios();
2307  	sema_init(&md->swap_bios_semaphore, md->swap_bios);
2308  	mutex_init(&md->swap_bios_lock);
2309  
2310  	md->disk->major = _major;
2311  	md->disk->first_minor = minor;
2312  	md->disk->minors = 1;
2313  	md->disk->flags |= GENHD_FL_NO_PART;
2314  	md->disk->fops = &dm_blk_dops;
2315  	md->disk->private_data = md;
2316  	sprintf(md->disk->disk_name, "dm-%d", minor);
2317  
2318  	dax_dev = alloc_dax(md, &dm_dax_ops);
2319  	if (IS_ERR(dax_dev)) {
2320  		if (PTR_ERR(dax_dev) != -EOPNOTSUPP)
2321  			goto bad;
2322  	} else {
2323  		set_dax_nocache(dax_dev);
2324  		set_dax_nomc(dax_dev);
2325  		md->dax_dev = dax_dev;
2326  		if (dax_add_host(dax_dev, md->disk))
2327  			goto bad;
2328  	}
2329  
2330  	format_dev_t(md->name, MKDEV(_major, minor));
2331  
2332  	md->wq = alloc_workqueue("kdmflush/%s", WQ_MEM_RECLAIM, 0, md->name);
2333  	if (!md->wq)
2334  		goto bad;
2335  
2336  	md->pending_io = alloc_percpu(unsigned long);
2337  	if (!md->pending_io)
2338  		goto bad;
2339  
2340  	r = dm_stats_init(&md->stats);
2341  	if (r < 0)
2342  		goto bad;
2343  
2344  	/* Populate the mapping, nobody knows we exist yet */
2345  	spin_lock(&_minor_lock);
2346  	old_md = idr_replace(&_minor_idr, md, minor);
2347  	spin_unlock(&_minor_lock);
2348  
2349  	BUG_ON(old_md != MINOR_ALLOCED);
2350  
2351  	return md;
2352  
2353  bad:
2354  	cleanup_mapped_device(md);
2355  bad_io_barrier:
2356  	free_minor(minor);
2357  bad_minor:
2358  	module_put(THIS_MODULE);
2359  bad_module_get:
2360  	kvfree(md);
2361  	return NULL;
2362  }
2363  
2364  static void unlock_fs(struct mapped_device *md);
2365  
2366  static void free_dev(struct mapped_device *md)
2367  {
2368  	int minor = MINOR(disk_devt(md->disk));
2369  
2370  	unlock_fs(md);
2371  
2372  	cleanup_mapped_device(md);
2373  
2374  	WARN_ON_ONCE(!list_empty(&md->table_devices));
2375  	dm_stats_cleanup(&md->stats);
2376  	free_minor(minor);
2377  
2378  	module_put(THIS_MODULE);
2379  	kvfree(md);
2380  }
2381  
2382  /*
2383   * Bind a table to the device.
2384   */
2385  static void event_callback(void *context)
2386  {
2387  	unsigned long flags;
2388  	LIST_HEAD(uevents);
2389  	struct mapped_device *md = context;
2390  
2391  	spin_lock_irqsave(&md->uevent_lock, flags);
2392  	list_splice_init(&md->uevent_list, &uevents);
2393  	spin_unlock_irqrestore(&md->uevent_lock, flags);
2394  
2395  	dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
2396  
2397  	atomic_inc(&md->event_nr);
2398  	wake_up(&md->eventq);
2399  	dm_issue_global_event();
2400  }
2401  
2402  /*
2403   * Returns old map, which caller must destroy.
2404   */
2405  static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2406  			       struct queue_limits *limits)
2407  {
2408  	struct dm_table *old_map;
2409  	sector_t size;
2410  	int ret;
2411  
2412  	lockdep_assert_held(&md->suspend_lock);
2413  
2414  	size = dm_table_get_size(t);
2415  
2416  	/*
2417  	 * Wipe any geometry if the size of the table changed.
2418  	 */
2419  	if (size != dm_get_size(md))
2420  		memset(&md->geometry, 0, sizeof(md->geometry));
2421  
2422  	set_capacity(md->disk, size);
2423  
2424  	dm_table_event_callback(t, event_callback, md);
2425  
2426  	if (dm_table_request_based(t)) {
2427  		/*
2428  		 * Leverage the fact that request-based DM targets are
2429  		 * immutable singletons; this is used to optimize dm_mq_queue_rq.
2430  		 */
2431  		md->immutable_target = dm_table_get_immutable_target(t);
2432  
2433  		/*
2434  		 * There is no need to reload with request-based dm because the
2435  		 * size of front_pad doesn't change.
2436  		 *
2437  		 * Note for the future: if the bioset is ever reloaded, prepped
2438  		 * requests in the queue may still refer to bios from the old
2439  		 * bioset, so the queue must be walked to unprep them.
2440  		 */
2441  		if (!md->mempools) {
2442  			md->mempools = t->mempools;
2443  			t->mempools = NULL;
2444  		}
2445  	} else {
2446  		/*
2447  		 * The md may already have mempools that need changing.
2448  		 * If so, reload the bioset, since loading a different table
2449  		 * may have changed front_pad.
2450  		 */
2451  		dm_free_md_mempools(md->mempools);
2452  		md->mempools = t->mempools;
2453  		t->mempools = NULL;
2454  	}
2455  
2456  	ret = dm_table_set_restrictions(t, md->queue, limits);
2457  	if (ret) {
2458  		old_map = ERR_PTR(ret);
2459  		goto out;
2460  	}
2461  
2462  	old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2463  	rcu_assign_pointer(md->map, (void *)t);
2464  	md->immutable_target_type = dm_table_get_immutable_target_type(t);
2465  
2466  	if (old_map)
2467  		dm_sync_table(md);
2468  out:
2469  	return old_map;
2470  }
2471  
2472  /*
2473   * Returns unbound table for the caller to free.
2474   */
2475  static struct dm_table *__unbind(struct mapped_device *md)
2476  {
2477  	struct dm_table *map = rcu_dereference_protected(md->map, 1);
2478  
2479  	if (!map)
2480  		return NULL;
2481  
2482  	dm_table_event_callback(map, NULL, NULL);
2483  	RCU_INIT_POINTER(md->map, NULL);
2484  	dm_sync_table(md);
2485  
2486  	return map;
2487  }
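
/*
 * Hedged sketch of the reader side that pairs with __bind()/__unbind() (a
 * hypothetical helper under #if 0, not used by the driver): the live map is
 * published with rcu_assign_pointer() and only retired after dm_sync_table(),
 * so readers access it under the md->io_barrier SRCU read lock.
 */
#if 0
static void example_use_live_table(struct mapped_device *md)
{
	int srcu_idx;
	struct dm_table *map = dm_get_live_table(md, &srcu_idx);

	if (map) {
		/* Safe to use 'map' here; __unbind() waits in dm_sync_table(). */
	}
	dm_put_live_table(md, srcu_idx);
}
#endif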
2488  
2489  /*
2490   * Constructor for a new device.
2491   */
2492  int dm_create(int minor, struct mapped_device **result)
2493  {
2494  	struct mapped_device *md;
2495  
2496  	md = alloc_dev(minor);
2497  	if (!md)
2498  		return -ENXIO;
2499  
2500  	dm_ima_reset_data(md);
2501  
2502  	*result = md;
2503  	return 0;
2504  }
2505  
2506  /*
2507   * Functions to manage md->type.
2508   * All are required to hold md->type_lock.
2509   */
2510  void dm_lock_md_type(struct mapped_device *md)
2511  {
2512  	mutex_lock(&md->type_lock);
2513  }
2514  
2515  void dm_unlock_md_type(struct mapped_device *md)
2516  {
2517  	mutex_unlock(&md->type_lock);
2518  }
2519  
2520  void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type)
2521  {
2522  	BUG_ON(!mutex_is_locked(&md->type_lock));
2523  	md->type = type;
2524  }
2525  
2526  enum dm_queue_mode dm_get_md_type(struct mapped_device *md)
2527  {
2528  	return md->type;
2529  }
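
/*
 * Hedged usage sketch (hypothetical helper, under #if 0): callers such as the
 * table load path are expected to decide and set the type exactly once under
 * md->type_lock before dm_setup_md_queue() is called.
 */
#if 0
static void example_set_type_once(struct mapped_device *md, struct dm_table *t)
{
	dm_lock_md_type(md);
	if (dm_get_md_type(md) == DM_TYPE_NONE)
		dm_set_md_type(md, dm_table_get_type(t));
	dm_unlock_md_type(md);
}
#endif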
2530  
2531  struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
2532  {
2533  	return md->immutable_target_type;
2534  }
2535  
2536  /*
2537   * Set up the DM device's queue based on md's type
2538   */
2539  int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
2540  {
2541  	enum dm_queue_mode type = dm_table_get_type(t);
2542  	struct queue_limits limits;
2543  	struct table_device *td;
2544  	int r;
2545  
2546  	WARN_ON_ONCE(type == DM_TYPE_NONE);
2547  
2548  	if (type == DM_TYPE_REQUEST_BASED) {
2549  		md->disk->fops = &dm_rq_blk_dops;
2550  		r = dm_mq_init_request_queue(md, t);
2551  		if (r) {
2552  			DMERR("Cannot initialize queue for request-based dm mapped device");
2553  			return r;
2554  		}
2555  	}
2556  
2557  	r = dm_calculate_queue_limits(t, &limits);
2558  	if (r) {
2559  		DMERR("Cannot calculate initial queue limits");
2560  		return r;
2561  	}
2562  	r = dm_table_set_restrictions(t, md->queue, &limits);
2563  	if (r)
2564  		return r;
2565  
2566  	/*
2567  	 * Hold the lock to make sure add_disk() and del_gendisk() won't run
2568  	 * concurrently with open_table_device() and close_table_device().
2569  	 */
2570  	mutex_lock(&md->table_devices_lock);
2571  	r = add_disk(md->disk);
2572  	mutex_unlock(&md->table_devices_lock);
2573  	if (r)
2574  		return r;
2575  
2576  	/*
2577  	 * Register the holder relationship for devices added before the disk
2578  	 * was live.
2579  	 */
2580  	list_for_each_entry(td, &md->table_devices, list) {
2581  		r = bd_link_disk_holder(td->dm_dev.bdev, md->disk);
2582  		if (r)
2583  			goto out_undo_holders;
2584  	}
2585  
2586  	r = dm_sysfs_init(md);
2587  	if (r)
2588  		goto out_undo_holders;
2589  
2590  	md->type = type;
2591  	return 0;
2592  
2593  out_undo_holders:
2594  	list_for_each_entry_continue_reverse(td, &md->table_devices, list)
2595  		bd_unlink_disk_holder(td->dm_dev.bdev, md->disk);
2596  	mutex_lock(&md->table_devices_lock);
2597  	del_gendisk(md->disk);
2598  	mutex_unlock(&md->table_devices_lock);
2599  	return r;
2600  }
2601  
2602  struct mapped_device *dm_get_md(dev_t dev)
2603  {
2604  	struct mapped_device *md;
2605  	unsigned int minor = MINOR(dev);
2606  
2607  	if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
2608  		return NULL;
2609  
2610  	spin_lock(&_minor_lock);
2611  
2612  	md = idr_find(&_minor_idr, minor);
2613  	if (!md || md == MINOR_ALLOCED || (MINOR(disk_devt(dm_disk(md))) != minor) ||
2614  	    test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
2615  		md = NULL;
2616  		goto out;
2617  	}
2618  	dm_get(md);
2619  out:
2620  	spin_unlock(&_minor_lock);
2621  
2622  	return md;
2623  }
2624  EXPORT_SYMBOL_GPL(dm_get_md);
2625  
2626  void *dm_get_mdptr(struct mapped_device *md)
2627  {
2628  	return md->interface_ptr;
2629  }
2630  
2631  void dm_set_mdptr(struct mapped_device *md, void *ptr)
2632  {
2633  	md->interface_ptr = ptr;
2634  }
2635  
2636  void dm_get(struct mapped_device *md)
2637  {
2638  	atomic_inc(&md->holders);
2639  	BUG_ON(test_bit(DMF_FREEING, &md->flags));
2640  }
2641  
2642  int dm_hold(struct mapped_device *md)
2643  {
2644  	spin_lock(&_minor_lock);
2645  	if (test_bit(DMF_FREEING, &md->flags)) {
2646  		spin_unlock(&_minor_lock);
2647  		return -EBUSY;
2648  	}
2649  	dm_get(md);
2650  	spin_unlock(&_minor_lock);
2651  	return 0;
2652  }
2653  EXPORT_SYMBOL_GPL(dm_hold);
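
/*
 * Illustrative reference-count sketch (hypothetical helper, under #if 0):
 * dm_get_md() and dm_hold() return with a holder reference taken, and every
 * such reference must eventually be dropped with dm_put().
 */
#if 0
static void example_lookup_and_release(dev_t dev)
{
	struct mapped_device *md = dm_get_md(dev); /* takes a reference */

	if (!md)
		return;
	/* ... use md while the reference is held ... */
	dm_put(md);
}
#endif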
2654  
2655  const char *dm_device_name(struct mapped_device *md)
2656  {
2657  	return md->name;
2658  }
2659  EXPORT_SYMBOL_GPL(dm_device_name);
2660  
2661  static void __dm_destroy(struct mapped_device *md, bool wait)
2662  {
2663  	struct dm_table *map;
2664  	int srcu_idx;
2665  
2666  	might_sleep();
2667  
2668  	spin_lock(&_minor_lock);
2669  	idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
2670  	set_bit(DMF_FREEING, &md->flags);
2671  	spin_unlock(&_minor_lock);
2672  
2673  	blk_mark_disk_dead(md->disk);
2674  
2675  	/*
2676  	 * Take suspend_lock so that presuspend and postsuspend methods
2677  	 * do not race with internal suspend.
2678  	 */
2679  	mutex_lock(&md->suspend_lock);
2680  	map = dm_get_live_table(md, &srcu_idx);
2681  	if (!dm_suspended_md(md)) {
2682  		dm_table_presuspend_targets(map);
2683  		set_bit(DMF_SUSPENDED, &md->flags);
2684  		set_bit(DMF_POST_SUSPENDING, &md->flags);
2685  		dm_table_postsuspend_targets(map);
2686  	}
2687  	/* dm_put_live_table must be before fsleep, otherwise deadlock is possible */
2688  	dm_put_live_table(md, srcu_idx);
2689  	mutex_unlock(&md->suspend_lock);
2690  
2691  	/*
2692  	 * Rarely, there may still be I/O requests in flight that have yet
2693  	 * to complete.  Wait for all references to disappear; no one may
2694  	 * increment the reference count of the mapped_device once its state
2695  	 * becomes DMF_FREEING.
2696  	 */
2697  	if (wait)
2698  		while (atomic_read(&md->holders))
2699  			fsleep(1000);
2700  	else if (atomic_read(&md->holders))
2701  		DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
2702  		       dm_device_name(md), atomic_read(&md->holders));
2703  
2704  	dm_table_destroy(__unbind(md));
2705  	free_dev(md);
2706  }
2707  
2708  void dm_destroy(struct mapped_device *md)
2709  {
2710  	__dm_destroy(md, true);
2711  }
2712  
2713  void dm_destroy_immediate(struct mapped_device *md)
2714  {
2715  	__dm_destroy(md, false);
2716  }
2717  
2718  void dm_put(struct mapped_device *md)
2719  {
2720  	atomic_dec(&md->holders);
2721  }
2722  EXPORT_SYMBOL_GPL(dm_put);
2723  
2724  static bool dm_in_flight_bios(struct mapped_device *md)
2725  {
2726  	int cpu;
2727  	unsigned long sum = 0;
2728  
2729  	for_each_possible_cpu(cpu)
2730  		sum += *per_cpu_ptr(md->pending_io, cpu);
2731  
2732  	return sum != 0;
2733  }
2734  
2735  static int dm_wait_for_bios_completion(struct mapped_device *md, unsigned int task_state)
2736  {
2737  	int r = 0;
2738  	DEFINE_WAIT(wait);
2739  
2740  	while (true) {
2741  		prepare_to_wait(&md->wait, &wait, task_state);
2742  
2743  		if (!dm_in_flight_bios(md))
2744  			break;
2745  
2746  		if (signal_pending_state(task_state, current)) {
2747  			r = -ERESTARTSYS;
2748  			break;
2749  		}
2750  
2751  		io_schedule();
2752  	}
2753  	finish_wait(&md->wait, &wait);
2754  
2755  	smp_rmb();
2756  
2757  	return r;
2758  }
2759  
2760  static int dm_wait_for_completion(struct mapped_device *md, unsigned int task_state)
2761  {
2762  	int r = 0;
2763  
2764  	if (!queue_is_mq(md->queue))
2765  		return dm_wait_for_bios_completion(md, task_state);
2766  
2767  	while (true) {
2768  		if (!blk_mq_queue_inflight(md->queue))
2769  			break;
2770  
2771  		if (signal_pending_state(task_state, current)) {
2772  			r = -ERESTARTSYS;
2773  			break;
2774  		}
2775  
2776  		fsleep(5000);
2777  	}
2778  
2779  	return r;
2780  }
2781  
2782  /*
2783   * Process the deferred bios
2784   */
2785  static void dm_wq_work(struct work_struct *work)
2786  {
2787  	struct mapped_device *md = container_of(work, struct mapped_device, work);
2788  	struct bio *bio;
2789  
2790  	while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2791  		spin_lock_irq(&md->deferred_lock);
2792  		bio = bio_list_pop(&md->deferred);
2793  		spin_unlock_irq(&md->deferred_lock);
2794  
2795  		if (!bio)
2796  			break;
2797  
2798  		submit_bio_noacct(bio);
2799  		cond_resched();
2800  	}
2801  }
2802  
2803  static void dm_queue_flush(struct mapped_device *md)
2804  {
2805  	clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2806  	smp_mb__after_atomic();
2807  	queue_work(md->wq, &md->work);
2808  }
2809  
2810  /*
2811   * Swap in a new table, returning the old one for the caller to destroy.
2812   */
2813  struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
2814  {
2815  	struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
2816  	struct queue_limits limits;
2817  	int r;
2818  
2819  	mutex_lock(&md->suspend_lock);
2820  
2821  	/* device must be suspended */
2822  	if (!dm_suspended_md(md))
2823  		goto out;
2824  
2825  	/*
2826  	 * If the new table has no data devices, retain the existing limits.
2827  	 * This helps multipath with queue_if_no_path: if all paths disappear,
2828  	 * new I/O is queued based on these limits until some paths
2829  	 * reappear.
2830  	 */
2831  	if (dm_table_has_no_data_devices(table)) {
2832  		live_map = dm_get_live_table_fast(md);
2833  		if (live_map)
2834  			limits = md->queue->limits;
2835  		dm_put_live_table_fast(md);
2836  	}
2837  
2838  	if (!live_map) {
2839  		r = dm_calculate_queue_limits(table, &limits);
2840  		if (r) {
2841  			map = ERR_PTR(r);
2842  			goto out;
2843  		}
2844  	}
2845  
2846  	map = __bind(md, table, &limits);
2847  	dm_issue_global_event();
2848  
2849  out:
2850  	mutex_unlock(&md->suspend_lock);
2851  	return map;
2852  }
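
/*
 * Hedged sketch of a table-swap caller (hypothetical helper, under #if 0; the
 * real sequencing lives in the ioctl path): suspend the device, swap in the
 * new table, destroy the old one, then resume.
 */
#if 0
static int example_replace_table(struct mapped_device *md, struct dm_table *t)
{
	struct dm_table *old_map;
	int r;

	r = dm_suspend(md, DM_SUSPEND_LOCKFS_FLAG);
	if (r)
		return r;

	old_map = dm_swap_table(md, t);
	if (IS_ERR(old_map)) {
		dm_resume(md);
		return PTR_ERR(old_map);
	}
	if (old_map)
		dm_table_destroy(old_map);

	return dm_resume(md);
}
#endif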
2853  
2854  /*
2855   * Functions to lock and unlock any filesystem running on the
2856   * device.
2857   */
2858  static int lock_fs(struct mapped_device *md)
2859  {
2860  	int r;
2861  
2862  	WARN_ON(test_bit(DMF_FROZEN, &md->flags));
2863  
2864  	r = bdev_freeze(md->disk->part0);
2865  	if (!r)
2866  		set_bit(DMF_FROZEN, &md->flags);
2867  	return r;
2868  }
2869  
2870  static void unlock_fs(struct mapped_device *md)
2871  {
2872  	if (!test_bit(DMF_FROZEN, &md->flags))
2873  		return;
2874  	bdev_thaw(md->disk->part0);
2875  	clear_bit(DMF_FROZEN, &md->flags);
2876  }
2877  
2878  /*
2879   * @suspend_flags: DM_SUSPEND_LOCKFS_FLAG and/or DM_SUSPEND_NOFLUSH_FLAG
2880   * @task_state: e.g. TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE
2881   * @dmf_suspended_flag: DMF_SUSPENDED or DMF_SUSPENDED_INTERNALLY
2882   *
2883   * If __dm_suspend returns 0, the device is now completely quiescent:
2884   * no request-processing activity remains, and all new requests are
2885   * added to the md->deferred list.
2886   */
2887  static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
2888  			unsigned int suspend_flags, unsigned int task_state,
2889  			int dmf_suspended_flag)
2890  {
2891  	bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
2892  	bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
2893  	int r;
2894  
2895  	lockdep_assert_held(&md->suspend_lock);
2896  
2897  	/*
2898  	 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
2899  	 * This flag is cleared before dm_suspend returns.
2900  	 */
2901  	if (noflush)
2902  		set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2903  	else
2904  		DMDEBUG("%s: suspending with flush", dm_device_name(md));
2905  
2906  	/*
2907  	 * This gets reverted if there's an error later and the targets
2908  	 * provide the .presuspend_undo hook.
2909  	 */
2910  	dm_table_presuspend_targets(map);
2911  
2912  	/*
2913  	 * Flush I/O to the device.
2914  	 * Any I/O submitted after lock_fs() may not be flushed.
2915  	 * noflush takes precedence over do_lockfs.
2916  	 * (lock_fs() flushes I/Os and waits for them to complete.)
2917  	 */
2918  	if (!noflush && do_lockfs) {
2919  		r = lock_fs(md);
2920  		if (r) {
2921  			dm_table_presuspend_undo_targets(map);
2922  			return r;
2923  		}
2924  	}
2925  
2926  	/*
2927  	 * Here we must make sure that no processes are submitting requests
2928  	 * to target drivers i.e. no one may be executing
2929  	 * dm_split_and_process_bio from dm_submit_bio.
2930  	 *
2931  	 * To get all processes out of dm_split_and_process_bio in dm_submit_bio,
2932  	 * we take the write lock. To prevent any process from reentering
2933  	 * dm_split_and_process_bio from dm_submit_bio and quiesce the thread
2934  	 * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
2935  	 * flush_workqueue(md->wq).
2936  	 */
2937  	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2938  	if (map)
2939  		synchronize_srcu(&md->io_barrier);
2940  
2941  	/*
2942  	 * Stop md->queue before flushing md->wq in case request-based
2943  	 * dm defers requests to md->wq from md->queue.
2944  	 */
2945  	if (dm_request_based(md))
2946  		dm_stop_queue(md->queue);
2947  
2948  	flush_workqueue(md->wq);
2949  
2950  	/*
2951  	 * At this point no more requests are entering target request routines.
2952  	 * We call dm_wait_for_completion to wait for all existing requests
2953  	 * to finish.
2954  	 */
2955  	r = dm_wait_for_completion(md, task_state);
2956  	if (!r)
2957  		set_bit(dmf_suspended_flag, &md->flags);
2958  
2959  	if (noflush)
2960  		clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2961  	if (map)
2962  		synchronize_srcu(&md->io_barrier);
2963  
2964  	/* were we interrupted ? */
2965  	if (r < 0) {
2966  		dm_queue_flush(md);
2967  
2968  		if (dm_request_based(md))
2969  			dm_start_queue(md->queue);
2970  
2971  		unlock_fs(md);
2972  		dm_table_presuspend_undo_targets(map);
2973  		/* pushback list is already flushed, so skip flush */
2974  	}
2975  
2976  	return r;
2977  }
2978  
2979  /*
2980   * We need to be able to change a mapping table under a mounted
2981   * filesystem.  For example, we might want to move some data in
2982   * the background.  Before the table can be swapped with
2983   * dm_bind_table, dm_suspend must be called to flush any in-flight
2984   * bios and ensure that any further I/O gets deferred.
2985   */
2986  /*
2987   * Suspend mechanism in request-based dm.
2988   *
2989   * 1. Flush all I/Os by lock_fs() if needed.
2990   * 2. Stop dispatching any I/O by stopping the request_queue.
2991   * 3. Wait for all in-flight I/Os to be completed or requeued.
2992   *
2993   * To abort suspend, start the request_queue.
2994   */
2995  int dm_suspend(struct mapped_device *md, unsigned int suspend_flags)
2996  {
2997  	struct dm_table *map = NULL;
2998  	int r = 0;
2999  
3000  retry:
3001  	mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
3002  
3003  	if (dm_suspended_md(md)) {
3004  		r = -EINVAL;
3005  		goto out_unlock;
3006  	}
3007  
3008  	if (dm_suspended_internally_md(md)) {
3009  		/* already internally suspended, wait for internal resume */
3010  		mutex_unlock(&md->suspend_lock);
3011  		r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
3012  		if (r)
3013  			return r;
3014  		goto retry;
3015  	}
3016  
3017  	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
3018  	if (!map) {
3019  		/* avoid deadlock with fs/namespace.c:do_mount() */
3020  		suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG;
3021  	}
3022  
3023  	r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED);
3024  	if (r)
3025  		goto out_unlock;
3026  
3027  	set_bit(DMF_POST_SUSPENDING, &md->flags);
3028  	dm_table_postsuspend_targets(map);
3029  	clear_bit(DMF_POST_SUSPENDING, &md->flags);
3030  
3031  out_unlock:
3032  	mutex_unlock(&md->suspend_lock);
3033  	return r;
3034  }
3035  
3036  static int __dm_resume(struct mapped_device *md, struct dm_table *map)
3037  {
3038  	if (map) {
3039  		int r = dm_table_resume_targets(map);
3040  
3041  		if (r)
3042  			return r;
3043  	}
3044  
3045  	dm_queue_flush(md);
3046  
3047  	/*
3048  	 * Flushing deferred I/Os must be done after targets are resumed
3049  	 * so that mapping of targets can work correctly.
3050  	 * Request-based dm is queueing the deferred I/Os in its request_queue.
3051  	 */
3052  	if (dm_request_based(md))
3053  		dm_start_queue(md->queue);
3054  
3055  	unlock_fs(md);
3056  
3057  	return 0;
3058  }
3059  
3060  int dm_resume(struct mapped_device *md)
3061  {
3062  	int r;
3063  	struct dm_table *map = NULL;
3064  
3065  retry:
3066  	r = -EINVAL;
3067  	mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
3068  
3069  	if (!dm_suspended_md(md))
3070  		goto out;
3071  
3072  	if (dm_suspended_internally_md(md)) {
3073  		/* already internally suspended, wait for internal resume */
3074  		mutex_unlock(&md->suspend_lock);
3075  		r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
3076  		if (r)
3077  			return r;
3078  		goto retry;
3079  	}
3080  
3081  	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
3082  	if (!map || !dm_table_get_size(map))
3083  		goto out;
3084  
3085  	r = __dm_resume(md, map);
3086  	if (r)
3087  		goto out;
3088  
3089  	clear_bit(DMF_SUSPENDED, &md->flags);
3090  out:
3091  	mutex_unlock(&md->suspend_lock);
3092  
3093  	return r;
3094  }
3095  
3096  /*
3097   * Internal suspend/resume works like userspace-driven suspend. It waits
3098   * until all bios finish and prevents issuing new bios to the target drivers.
3099   * It may be used only from the kernel.
3100   */
3101  
3102  static void __dm_internal_suspend(struct mapped_device *md, unsigned int suspend_flags)
3103  {
3104  	struct dm_table *map = NULL;
3105  
3106  	lockdep_assert_held(&md->suspend_lock);
3107  
3108  	if (md->internal_suspend_count++)
3109  		return; /* nested internal suspend */
3110  
3111  	if (dm_suspended_md(md)) {
3112  		set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
3113  		return; /* nest suspend */
3114  	}
3115  
3116  	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
3117  
3118  	/*
3119  	 * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is
3120  	 * supported.  Properly supporting a TASK_INTERRUPTIBLE internal suspend
3121  	 * would require changing .presuspend to return an error -- avoid this
3122  	 * until there is a need for more elaborate variants of internal suspend.
3123  	 */
3124  	(void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE,
3125  			    DMF_SUSPENDED_INTERNALLY);
3126  
3127  	set_bit(DMF_POST_SUSPENDING, &md->flags);
3128  	dm_table_postsuspend_targets(map);
3129  	clear_bit(DMF_POST_SUSPENDING, &md->flags);
3130  }
3131  
3132  static void __dm_internal_resume(struct mapped_device *md)
3133  {
3134  	int r;
3135  	struct dm_table *map;
3136  
3137  	BUG_ON(!md->internal_suspend_count);
3138  
3139  	if (--md->internal_suspend_count)
3140  		return; /* resume from nested internal suspend */
3141  
3142  	if (dm_suspended_md(md))
3143  		goto done; /* resume from nested suspend */
3144  
3145  	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
3146  	r = __dm_resume(md, map);
3147  	if (r) {
3148  		/*
3149  		 * If a preresume method of some target failed, we are in a
3150  		 * tricky situation. We can't return an error to the caller. We
3151  		 * can't fake success because then the "resume" and
3152  		 * "postsuspend" methods would not be paired correctly, and it
3153  		 * would break various targets, for example it would cause list
3154  		 * corruption in the "origin" target.
3155  		 *
3156  		 * So, we fake normal suspend here, to make sure that the
3157  		 * "resume" and "postsuspend" methods will be paired correctly.
3158  		 */
3159  		DMERR("Preresume method failed: %d", r);
3160  		set_bit(DMF_SUSPENDED, &md->flags);
3161  	}
3162  done:
3163  	clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
3164  	smp_mb__after_atomic();
3165  	wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
3166  }
3167  
3168  void dm_internal_suspend_noflush(struct mapped_device *md)
3169  {
3170  	mutex_lock(&md->suspend_lock);
3171  	__dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
3172  	mutex_unlock(&md->suspend_lock);
3173  }
3174  EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);
3175  
3176  void dm_internal_resume(struct mapped_device *md)
3177  {
3178  	mutex_lock(&md->suspend_lock);
3179  	__dm_internal_resume(md);
3180  	mutex_unlock(&md->suspend_lock);
3181  }
3182  EXPORT_SYMBOL_GPL(dm_internal_resume);
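
/*
 * Hedged pairing sketch (hypothetical helper, under #if 0): internal suspends
 * nest, so every dm_internal_suspend_noflush() must be balanced by a matching
 * dm_internal_resume() on the same device.
 */
#if 0
static void example_quiesced_work(struct mapped_device *md)
{
	dm_internal_suspend_noflush(md); /* waits for in-flight bios */
	/* ... operate while no new bios reach the targets ... */
	dm_internal_resume(md);          /* balances the suspend above */
}
#endif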
3183  
3184  /*
3185   * Fast variants of internal suspend/resume hold md->suspend_lock,
3186   * which prevents interaction with userspace-driven suspend.
3187   */
3188  
3189  void dm_internal_suspend_fast(struct mapped_device *md)
3190  {
3191  	mutex_lock(&md->suspend_lock);
3192  	if (dm_suspended_md(md) || dm_suspended_internally_md(md))
3193  		return;
3194  
3195  	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
3196  	synchronize_srcu(&md->io_barrier);
3197  	flush_workqueue(md->wq);
3198  	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
3199  }
3200  EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);
3201  
3202  void dm_internal_resume_fast(struct mapped_device *md)
3203  {
3204  	if (dm_suspended_md(md) || dm_suspended_internally_md(md))
3205  		goto done;
3206  
3207  	dm_queue_flush(md);
3208  
3209  done:
3210  	mutex_unlock(&md->suspend_lock);
3211  }
3212  EXPORT_SYMBOL_GPL(dm_internal_resume_fast);
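
/*
 * Note the asymmetric locking of the fast variants: dm_internal_suspend_fast()
 * returns with md->suspend_lock held and dm_internal_resume_fast() drops it,
 * so the pair must always be used as a bracket from one context. Hedged
 * sketch (hypothetical helper, under #if 0):
 */
#if 0
static void example_fast_bracket(struct mapped_device *md)
{
	dm_internal_suspend_fast(md);  /* returns with md->suspend_lock held */
	/* ... work that must not race with bio submission ... */
	dm_internal_resume_fast(md);   /* drops md->suspend_lock */
}
#endif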
3213  
3214  /*
3215   *---------------------------------------------------------------
3216   * Event notification.
3217   *---------------------------------------------------------------
3218   */
3219  int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
3220  		      unsigned int cookie, bool need_resize_uevent)
3221  {
3222  	int r;
3223  	unsigned int noio_flag;
3224  	char udev_cookie[DM_COOKIE_LENGTH];
3225  	char *envp[3] = { NULL, NULL, NULL };
3226  	char **envpp = envp;
3227  	if (cookie) {
3228  		snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
3229  			 DM_COOKIE_ENV_VAR_NAME, cookie);
3230  		*envpp++ = udev_cookie;
3231  	}
3232  	if (need_resize_uevent)
3233  		*envpp++ = "RESIZE=1";
3235  
3236  	noio_flag = memalloc_noio_save();
3237  
3238  	r = kobject_uevent_env(&disk_to_dev(md->disk)->kobj, action, envp);
3239  
3240  	memalloc_noio_restore(noio_flag);
3241  
3242  	return r;
3243  }
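
/*
 * Hedged usage sketch (hypothetical helper, under #if 0): resume and rename
 * emit a CHANGE uevent and pass the userspace-supplied cookie through so udev
 * can correlate the event with the ioctl that triggered it.
 */
#if 0
static void example_notify_change(struct mapped_device *md, unsigned int cookie)
{
	if (dm_kobject_uevent(md, KOBJ_CHANGE, cookie, false))
		DMERR("%s: uevent notification failed", dm_device_name(md));
}
#endif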
3244  
3245  uint32_t dm_next_uevent_seq(struct mapped_device *md)
3246  {
3247  	return atomic_add_return(1, &md->uevent_seq);
3248  }
3249  
3250  uint32_t dm_get_event_nr(struct mapped_device *md)
3251  {
3252  	return atomic_read(&md->event_nr);
3253  }
3254  
3255  int dm_wait_event(struct mapped_device *md, int event_nr)
3256  {
3257  	return wait_event_interruptible(md->eventq,
3258  			(event_nr != atomic_read(&md->event_nr)));
3259  }
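
/*
 * Hedged sketch of the event-wait idiom these helpers support (hypothetical
 * helper, under #if 0): sample the event number first, then wait for it to
 * move on.
 */
#if 0
static int example_wait_for_next_event(struct mapped_device *md)
{
	uint32_t ev = dm_get_event_nr(md);

	/* ... kick off or observe something expected to raise an event ... */

	return dm_wait_event(md, ev); /* 0 on wakeup, -ERESTARTSYS on signal */
}
#endif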
3260  
3261  void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
3262  {
3263  	unsigned long flags;
3264  
3265  	spin_lock_irqsave(&md->uevent_lock, flags);
3266  	list_add(elist, &md->uevent_list);
3267  	spin_unlock_irqrestore(&md->uevent_lock, flags);
3268  }
3269  
3270  /*
3271   * The gendisk is only valid as long as you have a reference
3272   * count on 'md'.
3273   */
3274  struct gendisk *dm_disk(struct mapped_device *md)
3275  {
3276  	return md->disk;
3277  }
3278  EXPORT_SYMBOL_GPL(dm_disk);
3279  
3280  struct kobject *dm_kobject(struct mapped_device *md)
3281  {
3282  	return &md->kobj_holder.kobj;
3283  }
3284  
3285  struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
3286  {
3287  	struct mapped_device *md;
3288  
3289  	md = container_of(kobj, struct mapped_device, kobj_holder.kobj);
3290  
3291  	spin_lock(&_minor_lock);
3292  	if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
3293  		md = NULL;
3294  		goto out;
3295  	}
3296  	dm_get(md);
3297  out:
3298  	spin_unlock(&_minor_lock);
3299  
3300  	return md;
3301  }
3302  
3303  int dm_suspended_md(struct mapped_device *md)
3304  {
3305  	return test_bit(DMF_SUSPENDED, &md->flags);
3306  }
3307  
3308  static int dm_post_suspending_md(struct mapped_device *md)
3309  {
3310  	return test_bit(DMF_POST_SUSPENDING, &md->flags);
3311  }
3312  
3313  int dm_suspended_internally_md(struct mapped_device *md)
3314  {
3315  	return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
3316  }
3317  
3318  int dm_test_deferred_remove_flag(struct mapped_device *md)
3319  {
3320  	return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
3321  }
3322  
3323  int dm_suspended(struct dm_target *ti)
3324  {
3325  	return dm_suspended_md(ti->table->md);
3326  }
3327  EXPORT_SYMBOL_GPL(dm_suspended);
3328  
3329  int dm_post_suspending(struct dm_target *ti)
3330  {
3331  	return dm_post_suspending_md(ti->table->md);
3332  }
3333  EXPORT_SYMBOL_GPL(dm_post_suspending);
3334  
3335  int dm_noflush_suspending(struct dm_target *ti)
3336  {
3337  	return __noflush_suspending(ti->table->md);
3338  }
3339  EXPORT_SYMBOL_GPL(dm_noflush_suspending);
3340  
3341  void dm_free_md_mempools(struct dm_md_mempools *pools)
3342  {
3343  	if (!pools)
3344  		return;
3345  
3346  	bioset_exit(&pools->bs);
3347  	bioset_exit(&pools->io_bs);
3348  
3349  	kfree(pools);
3350  }
3351  
3352  struct dm_pr {
3353  	u64	old_key;
3354  	u64	new_key;
3355  	u32	flags;
3356  	bool	abort;
3357  	bool	fail_early;
3358  	int	ret;
3359  	enum pr_type type;
3360  	struct pr_keys *read_keys;
3361  	struct pr_held_reservation *rsv;
3362  };
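
/*
 * The per-path PR callbacks below share one convention, shown here as a
 * hedged skeleton (hypothetical helper, under #if 0): record the path's
 * status in pr->ret and return 0 to keep iterating, or -1 to stop
 * iterate_devices() early.
 */
#if 0
static int example_pr_callout(struct dm_target *ti, struct dm_dev *dev,
			      sector_t start, sector_t len, void *data)
{
	struct dm_pr *pr = data;

	pr->ret = 0;	/* per-path status consumed by the dm_call_pr() caller */
	return 0;	/* returning -1 would stop the iteration early */
}
#endif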
3363  
3364  static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
3365  		      struct dm_pr *pr)
3366  {
3367  	struct mapped_device *md = bdev->bd_disk->private_data;
3368  	struct dm_table *table;
3369  	struct dm_target *ti;
3370  	int ret = -ENOTTY, srcu_idx;
3371  
3372  	table = dm_get_live_table(md, &srcu_idx);
3373  	if (!table || !dm_table_get_size(table))
3374  		goto out;
3375  
3376  	/* We only support devices that have a single target */
3377  	if (table->num_targets != 1)
3378  		goto out;
3379  	ti = dm_table_get_target(table, 0);
3380  
3381  	if (dm_suspended_md(md)) {
3382  		ret = -EAGAIN;
3383  		goto out;
3384  	}
3385  
3386  	ret = -EINVAL;
3387  	if (!ti->type->iterate_devices)
3388  		goto out;
3389  
3390  	ti->type->iterate_devices(ti, fn, pr);
3391  	ret = 0;
3392  out:
3393  	dm_put_live_table(md, srcu_idx);
3394  	return ret;
3395  }
3396  
3397  /*
3398   * For register / unregister we need to manually call out to every path.
3399   */
3400  static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev,
3401  			    sector_t start, sector_t len, void *data)
3402  {
3403  	struct dm_pr *pr = data;
3404  	const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
3405  	int ret;
3406  
3407  	if (!ops || !ops->pr_register) {
3408  		pr->ret = -EOPNOTSUPP;
3409  		return -1;
3410  	}
3411  
3412  	ret = ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags);
3413  	if (!ret)
3414  		return 0;
3415  
3416  	if (!pr->ret)
3417  		pr->ret = ret;
3418  
3419  	if (pr->fail_early)
3420  		return -1;
3421  
3422  	return 0;
3423  }
3424  
3425  static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
3426  			  u32 flags)
3427  {
3428  	struct dm_pr pr = {
3429  		.old_key	= old_key,
3430  		.new_key	= new_key,
3431  		.flags		= flags,
3432  		.fail_early	= true,
3433  		.ret		= 0,
3434  	};
3435  	int ret;
3436  
3437  	ret = dm_call_pr(bdev, __dm_pr_register, &pr);
3438  	if (ret) {
3439  		/* Didn't even get to register a path */
3440  		return ret;
3441  	}
3442  
3443  	if (!pr.ret)
3444  		return 0;
3445  	ret = pr.ret;
3446  
3447  	if (!new_key)
3448  		return ret;
3449  
3450  	/* unregister all paths if we failed to register any path */
3451  	pr.old_key = new_key;
3452  	pr.new_key = 0;
3453  	pr.flags = 0;
3454  	pr.fail_early = false;
3455  	(void) dm_call_pr(bdev, __dm_pr_register, &pr);
3456  	return ret;
3457  }
3458  
3459  
3460  static int __dm_pr_reserve(struct dm_target *ti, struct dm_dev *dev,
3461  			   sector_t start, sector_t len, void *data)
3462  {
3463  	struct dm_pr *pr = data;
3464  	const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
3465  
3466  	if (!ops || !ops->pr_reserve) {
3467  		pr->ret = -EOPNOTSUPP;
3468  		return -1;
3469  	}
3470  
3471  	pr->ret = ops->pr_reserve(dev->bdev, pr->old_key, pr->type, pr->flags);
3472  	if (!pr->ret)
3473  		return -1;
3474  
3475  	return 0;
3476  }
3477  
3478  static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
3479  			 u32 flags)
3480  {
3481  	struct dm_pr pr = {
3482  		.old_key	= key,
3483  		.flags		= flags,
3484  		.type		= type,
3485  		.fail_early	= false,
3486  		.ret		= 0,
3487  	};
3488  	int ret;
3489  
3490  	ret = dm_call_pr(bdev, __dm_pr_reserve, &pr);
3491  	if (ret)
3492  		return ret;
3493  
3494  	return pr.ret;
3495  }
3496  
3497  /*
3498   * If there is a non-All-Registrants type of reservation, the release must
3499   * be sent down the path holding it. When there is no reservation, or when
3500   * a path is not the holder, the device still returns success, so we must
3501   * try each path to make sure we hit the correct one.
3502   */
3503  static int __dm_pr_release(struct dm_target *ti, struct dm_dev *dev,
3504  			   sector_t start, sector_t len, void *data)
3505  {
3506  	struct dm_pr *pr = data;
3507  	const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
3508  
3509  	if (!ops || !ops->pr_release) {
3510  		pr->ret = -EOPNOTSUPP;
3511  		return -1;
3512  	}
3513  
3514  	pr->ret = ops->pr_release(dev->bdev, pr->old_key, pr->type);
3515  	if (pr->ret)
3516  		return -1;
3517  
3518  	return 0;
3519  }
3520  
3521  static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
3522  {
3523  	struct dm_pr pr = {
3524  		.old_key	= key,
3525  		.type		= type,
3526  		.fail_early	= false,
3527  	};
3528  	int ret;
3529  
3530  	ret = dm_call_pr(bdev, __dm_pr_release, &pr);
3531  	if (ret)
3532  		return ret;
3533  
3534  	return pr.ret;
3535  }
3536  
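      /*
       * As with reserve: stop at the first path where the preempt succeeds,
       * otherwise try the next path and report the last error seen.
       */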
3537  static int __dm_pr_preempt(struct dm_target *ti, struct dm_dev *dev,
3538  			   sector_t start, sector_t len, void *data)
3539  {
3540  	struct dm_pr *pr = data;
3541  	const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
3542  
3543  	if (!ops || !ops->pr_preempt) {
3544  		pr->ret = -EOPNOTSUPP;
3545  		return -1;
3546  	}
3547  
3548  	pr->ret = ops->pr_preempt(dev->bdev, pr->old_key, pr->new_key, pr->type,
3549  				  pr->abort);
3550  	if (!pr->ret)
3551  		return -1;
3552  
3553  	return 0;
3554  }
3555  
3556  static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
3557  			 enum pr_type type, bool abort)
3558  {
3559  	struct dm_pr pr = {
3560  		.new_key	= new_key,
3561  		.old_key	= old_key,
3562  		.type		= type,
3563  		.fail_early	= false,
3564  	};
3565  	int ret;
3566  
3567  	ret = dm_call_pr(bdev, __dm_pr_preempt, &pr);
3568  	if (ret)
3569  		return ret;
3570  
3571  	return pr.ret;
3572  }
3573  
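      /*
       * Clear is not fanned out over every path; like the other dm ioctls it
       * is sent to the single device returned by dm_prepare_ioctl().
       */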
3574  static int dm_pr_clear(struct block_device *bdev, u64 key)
3575  {
3576  	struct mapped_device *md = bdev->bd_disk->private_data;
3577  	const struct pr_ops *ops;
3578  	int r, srcu_idx;
3579  
3580  	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
3581  	if (r < 0)
3582  		goto out;
3583  
3584  	ops = bdev->bd_disk->fops->pr_ops;
3585  	if (ops && ops->pr_clear)
3586  		r = ops->pr_clear(bdev, key);
3587  	else
3588  		r = -EOPNOTSUPP;
3589  out:
3590  	dm_unprepare_ioctl(md, srcu_idx);
3591  	return r;
3592  }
3593  
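      /*
       * Read the registered keys from the first path that services the
       * request successfully; failing paths are skipped and the last error
       * is returned.
       */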
3594  static int __dm_pr_read_keys(struct dm_target *ti, struct dm_dev *dev,
3595  			     sector_t start, sector_t len, void *data)
3596  {
3597  	struct dm_pr *pr = data;
3598  	const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
3599  
3600  	if (!ops || !ops->pr_read_keys) {
3601  		pr->ret = -EOPNOTSUPP;
3602  		return -1;
3603  	}
3604  
3605  	pr->ret = ops->pr_read_keys(dev->bdev, pr->read_keys);
3606  	if (!pr->ret)
3607  		return -1;
3608  
3609  	return 0;
3610  }
3611  
3612  static int dm_pr_read_keys(struct block_device *bdev, struct pr_keys *keys)
3613  {
3614  	struct dm_pr pr = {
3615  		.read_keys = keys,
3616  	};
3617  	int ret;
3618  
3619  	ret = dm_call_pr(bdev, __dm_pr_read_keys, &pr);
3620  	if (ret)
3621  		return ret;
3622  
3623  	return pr.ret;
3624  }
3625  
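      /*
       * Like pr_read_keys: report the reservation from the first path that
       * answers successfully.
       */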
3626  static int __dm_pr_read_reservation(struct dm_target *ti, struct dm_dev *dev,
3627  				    sector_t start, sector_t len, void *data)
3628  {
3629  	struct dm_pr *pr = data;
3630  	const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
3631  
3632  	if (!ops || !ops->pr_read_reservation) {
3633  		pr->ret = -EOPNOTSUPP;
3634  		return -1;
3635  	}
3636  
3637  	pr->ret = ops->pr_read_reservation(dev->bdev, pr->rsv);
3638  	if (!pr->ret)
3639  		return -1;
3640  
3641  	return 0;
3642  }
3643  
3644  static int dm_pr_read_reservation(struct block_device *bdev,
3645  				  struct pr_held_reservation *rsv)
3646  {
3647  	struct dm_pr pr = {
3648  		.rsv = rsv,
3649  	};
3650  	int ret;
3651  
3652  	ret = dm_call_pr(bdev, __dm_pr_read_reservation, &pr);
3653  	if (ret)
3654  		return ret;
3655  
3656  	return pr.ret;
3657  }
3658  
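      /*
       * Persistent reservation handlers, wired into both the bio-based and
       * request-based block_device_operations below via .pr_ops.
       */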
3659  static const struct pr_ops dm_pr_ops = {
3660  	.pr_register	= dm_pr_register,
3661  	.pr_reserve	= dm_pr_reserve,
3662  	.pr_release	= dm_pr_release,
3663  	.pr_preempt	= dm_pr_preempt,
3664  	.pr_clear	= dm_pr_clear,
3665  	.pr_read_keys	= dm_pr_read_keys,
3666  	.pr_read_reservation = dm_pr_read_reservation,
3667  };
3668  
3669  static const struct block_device_operations dm_blk_dops = {
3670  	.submit_bio = dm_submit_bio,
3671  	.poll_bio = dm_poll_bio,
3672  	.open = dm_blk_open,
3673  	.release = dm_blk_close,
3674  	.ioctl = dm_blk_ioctl,
3675  	.getgeo = dm_blk_getgeo,
3676  	.report_zones = dm_blk_report_zones,
3677  	.pr_ops = &dm_pr_ops,
3678  	.owner = THIS_MODULE
3679  };
3680  
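      /*
       * Request-based mapped devices have their I/O driven through blk-mq,
       * so they provide no ->submit_bio/->poll_bio here.
       */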
3681  static const struct block_device_operations dm_rq_blk_dops = {
3682  	.open = dm_blk_open,
3683  	.release = dm_blk_close,
3684  	.ioctl = dm_blk_ioctl,
3685  	.getgeo = dm_blk_getgeo,
3686  	.pr_ops = &dm_pr_ops,
3687  	.owner = THIS_MODULE
3688  };
3689  
3690  static const struct dax_operations dm_dax_ops = {
3691  	.direct_access = dm_dax_direct_access,
3692  	.zero_page_range = dm_dax_zero_page_range,
3693  	.recovery_write = dm_dax_recovery_write,
3694  };
3695  
3696  /*
3697   * module hooks
3698   */
3699  module_init(dm_init);
3700  module_exit(dm_exit);
3701  
3702  module_param(major, uint, 0);
3703  MODULE_PARM_DESC(major, "The major number of the device mapper");
3704  
3705  module_param(reserved_bio_based_ios, uint, 0644);
3706  MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
3707  
3708  module_param(dm_numa_node, int, 0644);
3709  MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");
3710  
3711  module_param(swap_bios, int, 0644);
3712  MODULE_PARM_DESC(swap_bios, "Maximum allowed inflight swap IOs");
3713  
3714  MODULE_DESCRIPTION(DM_NAME " driver");
3715  MODULE_AUTHOR("Joe Thornber <dm-devel@lists.linux.dev>");
3716  MODULE_LICENSE("GPL");
3717