1  // SPDX-License-Identifier: GPL-2.0-or-later
2  /*
3   * Userspace block device - block device whose IO is handled from userspace
4   *
5   * Make full use of io_uring passthrough commands for communicating with
6   * the ublk userspace daemon (ublksrvd) for handling basic IO requests.
7   *
8   * Copyright 2022 Ming Lei <ming.lei@redhat.com>
9   *
10   * (part of code stolen from loop.c)
11   */
12  #include <linux/module.h>
13  #include <linux/moduleparam.h>
14  #include <linux/sched.h>
15  #include <linux/fs.h>
16  #include <linux/pagemap.h>
17  #include <linux/file.h>
18  #include <linux/stat.h>
19  #include <linux/errno.h>
20  #include <linux/major.h>
21  #include <linux/wait.h>
22  #include <linux/blkdev.h>
23  #include <linux/init.h>
24  #include <linux/swap.h>
25  #include <linux/slab.h>
26  #include <linux/compat.h>
27  #include <linux/mutex.h>
28  #include <linux/writeback.h>
29  #include <linux/completion.h>
30  #include <linux/highmem.h>
31  #include <linux/sysfs.h>
32  #include <linux/miscdevice.h>
33  #include <linux/falloc.h>
34  #include <linux/uio.h>
35  #include <linux/ioprio.h>
36  #include <linux/sched/mm.h>
37  #include <linux/uaccess.h>
38  #include <linux/cdev.h>
39  #include <linux/io_uring/cmd.h>
40  #include <linux/blk-mq.h>
41  #include <linux/delay.h>
42  #include <linux/mm.h>
43  #include <asm/page.h>
44  #include <linux/task_work.h>
45  #include <linux/namei.h>
46  #include <linux/kref.h>
47  #include <uapi/linux/ublk_cmd.h>
48  
49  #define UBLK_MINORS		(1U << MINORBITS)
50  
51  /* private ioctl command mirror */
52  #define UBLK_CMD_DEL_DEV_ASYNC	_IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC)
53  
54  /* All UBLK_F_* have to be included into UBLK_F_ALL */
55  #define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \
56  		| UBLK_F_URING_CMD_COMP_IN_TASK \
57  		| UBLK_F_NEED_GET_DATA \
58  		| UBLK_F_USER_RECOVERY \
59  		| UBLK_F_USER_RECOVERY_REISSUE \
60  		| UBLK_F_UNPRIVILEGED_DEV \
61  		| UBLK_F_CMD_IOCTL_ENCODE \
62  		| UBLK_F_USER_COPY \
63  		| UBLK_F_ZONED)
64  
65  /* All UBLK_PARAM_TYPE_* should be included here */
66  #define UBLK_PARAM_TYPE_ALL                                \
67  	(UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
68  	 UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED)
69  
70  struct ublk_rq_data {
71  	struct llist_node node;
72  
73  	struct kref ref;
74  };
75  
76  struct ublk_uring_cmd_pdu {
77  	struct ublk_queue *ubq;
78  	u16 tag;
79  };
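
/*
 * The pdu lives inside the uring_cmd payload area (see
 * ublk_get_uring_cmd_pdu()), so the cancel fn and the task-work callback
 * can recover the target queue and tag without any extra allocation.
 */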
80  
81  /*
82   * io command is active: sqe cmd is received, and its cqe isn't done
83   *
84   * If the flag is set, the io command is owned by the ublk driver, and
85   * waits for an incoming blk-mq request from the ublk block device.
86   *
87   * If the flag is cleared, the io command will be completed, and owned by
88   * the ublk server.
89   */
90  #define UBLK_IO_FLAG_ACTIVE	0x01
91  
92  /*
93   * IO command is completed via cqe, and it is being handled by ublksrv and
94   * has not been committed yet
95   *
96   * Basically mutually exclusive with UBLK_IO_FLAG_ACTIVE, so it can be used
97   * for cross verification
98   */
99  #define UBLK_IO_FLAG_OWNED_BY_SRV 0x02
100  
101  /*
102   * IO command is aborted, so this flag is set in case of
103   * !UBLK_IO_FLAG_ACTIVE.
104   *
105   * After this flag is observed, any pending or new incoming request
106   * associated with this io command will be failed immediately
107   */
108  #define UBLK_IO_FLAG_ABORTED 0x04
109  
110  /*
111   * UBLK_IO_FLAG_NEED_GET_DATA is set when the IO command requires the
112   * data buffer address to be fetched from ublksrv.
113   *
114   * Then, bio data could be copied into this data buffer for a WRITE request
115   * after the IO command is issued again and UBLK_IO_FLAG_NEED_GET_DATA is unset.
116   */
117  #define UBLK_IO_FLAG_NEED_GET_DATA 0x08
118  
119  /* atomic RW with ubq->cancel_lock */
120  #define UBLK_IO_FLAG_CANCELED	0x80000000
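
/*
 * Rough life cycle of one io command slot, as implemented below:
 *
 *	FETCH_REQ from ublksrv	-> ACTIVE set, slot owned by the driver
 *	blk-mq request arrives	-> cqe posted, ACTIVE cleared and
 *				   OWNED_BY_SRV set (ubq_complete_io_cmd())
 *	COMMIT_AND_FETCH_REQ	-> OWNED_BY_SRV cleared, request ended,
 *				   ACTIVE set again for the next request
 *
 * ABORTED and CANCELED only show up on the abort/exit paths.
 */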
121  
122  struct ublk_io {
123  	/* userspace buffer address from io cmd */
124  	__u64	addr;
125  	unsigned int flags;
126  	int res;
127  
128  	struct io_uring_cmd *cmd;
129  };
130  
131  struct ublk_queue {
132  	int q_id;
133  	int q_depth;
134  
135  	unsigned long flags;
136  	struct task_struct	*ubq_daemon;
137  	char *io_cmd_buf;
138  
139  	struct llist_head	io_cmds;
140  
141  	unsigned long io_addr;	/* mapped vm address */
142  	unsigned int max_io_sz;
143  	bool force_abort;
144  	bool timeout;
145  	bool canceling;
146  	unsigned short nr_io_ready;	/* how many ios setup */
147  	spinlock_t		cancel_lock;
148  	struct ublk_device *dev;
149  	struct ublk_io ios[];
150  };
151  
152  struct ublk_device {
153  	struct gendisk		*ub_disk;
154  
155  	char	*__queues;
156  
157  	unsigned int	queue_size;
158  	struct ublksrv_ctrl_dev_info	dev_info;
159  
160  	struct blk_mq_tag_set	tag_set;
161  
162  	struct cdev		cdev;
163  	struct device		cdev_dev;
164  
165  #define UB_STATE_OPEN		0
166  #define UB_STATE_USED		1
167  #define UB_STATE_DELETED	2
168  	unsigned long		state;
169  	int			ub_number;
170  
171  	struct mutex		mutex;
172  
173  	spinlock_t		lock;
174  	struct mm_struct	*mm;
175  
176  	struct ublk_params	params;
177  
178  	struct completion	completion;
179  	unsigned int		nr_queues_ready;
180  	unsigned int		nr_privileged_daemon;
181  
182  	struct work_struct	quiesce_work;
183  	struct work_struct	stop_work;
184  };
185  
186  /* header of ublk_params */
187  struct ublk_params_header {
188  	__u32	len;
189  	__u32	types;
190  };
191  
192  static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq);
193  
194  static inline unsigned int ublk_req_build_flags(struct request *req);
195  static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
196  						   int tag);
197  static inline bool ublk_dev_is_user_copy(const struct ublk_device *ub)
198  {
199  	return ub->dev_info.flags & UBLK_F_USER_COPY;
200  }
201  
202  static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
203  {
204  	return ub->dev_info.flags & UBLK_F_ZONED;
205  }
206  
207  static inline bool ublk_queue_is_zoned(struct ublk_queue *ubq)
208  {
209  	return ubq->flags & UBLK_F_ZONED;
210  }
211  
212  #ifdef CONFIG_BLK_DEV_ZONED
213  
214  struct ublk_zoned_report_desc {
215  	__u64 sector;
216  	__u32 operation;
217  	__u32 nr_zones;
218  };
219  
220  static DEFINE_XARRAY(ublk_zoned_report_descs);
221  
222  static int ublk_zoned_insert_report_desc(const struct request *req,
223  		struct ublk_zoned_report_desc *desc)
224  {
225  	return xa_insert(&ublk_zoned_report_descs, (unsigned long)req,
226  			    desc, GFP_KERNEL);
227  }
228  
229  static struct ublk_zoned_report_desc *ublk_zoned_erase_report_desc(
230  		const struct request *req)
231  {
232  	return xa_erase(&ublk_zoned_report_descs, (unsigned long)req);
233  }
234  
235  static struct ublk_zoned_report_desc *ublk_zoned_get_report_desc(
236  		const struct request *req)
237  {
238  	return xa_load(&ublk_zoned_report_descs, (unsigned long)req);
239  }
240  
241  static int ublk_get_nr_zones(const struct ublk_device *ub)
242  {
243  	const struct ublk_param_basic *p = &ub->params.basic;
244  
245  	/* Zone size is a power of 2 */
246  	return p->dev_sectors >> ilog2(p->chunk_sectors);
247  }
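
/*
 * For example, a 1 GiB device (dev_sectors = 2097152) with 2 MiB zones
 * (chunk_sectors = 4096) ends up with 2097152 >> ilog2(4096) = 512 zones.
 */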
248  
249  static int ublk_revalidate_disk_zones(struct ublk_device *ub)
250  {
251  	return blk_revalidate_disk_zones(ub->ub_disk);
252  }
253  
254  static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
255  {
256  	const struct ublk_param_zoned *p = &ub->params.zoned;
257  	int nr_zones;
258  
259  	if (!ublk_dev_is_zoned(ub))
260  		return -EINVAL;
261  
262  	if (!p->max_zone_append_sectors)
263  		return -EINVAL;
264  
265  	nr_zones = ublk_get_nr_zones(ub);
266  
267  	if (p->max_active_zones > nr_zones)
268  		return -EINVAL;
269  
270  	if (p->max_open_zones > nr_zones)
271  		return -EINVAL;
272  
273  	return 0;
274  }
275  
276  static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
277  {
278  	ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);
279  }
280  
281  /* Based on virtblk_alloc_report_buffer */
282  static void *ublk_alloc_report_buffer(struct ublk_device *ublk,
283  				      unsigned int nr_zones, size_t *buflen)
284  {
285  	struct request_queue *q = ublk->ub_disk->queue;
286  	size_t bufsize;
287  	void *buf;
288  
289  	nr_zones = min_t(unsigned int, nr_zones,
290  			 ublk->ub_disk->nr_zones);
291  
292  	bufsize = nr_zones * sizeof(struct blk_zone);
293  	bufsize =
294  		min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT);
295  
296  	while (bufsize >= sizeof(struct blk_zone)) {
297  		buf = kvmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
298  		if (buf) {
299  			*buflen = bufsize;
300  			return buf;
301  		}
302  		bufsize >>= 1;
303  	}
304  
305  	*buflen = 0;
306  	return NULL;
307  }
308  
309  static int ublk_report_zones(struct gendisk *disk, sector_t sector,
310  		      unsigned int nr_zones, report_zones_cb cb, void *data)
311  {
312  	struct ublk_device *ub = disk->private_data;
313  	unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors;
314  	unsigned int first_zone = sector >> ilog2(zone_size_sectors);
315  	unsigned int done_zones = 0;
316  	unsigned int max_zones_per_request;
317  	int ret;
318  	struct blk_zone *buffer;
319  	size_t buffer_length;
320  
321  	nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone,
322  			 nr_zones);
323  
324  	buffer = ublk_alloc_report_buffer(ub, nr_zones, &buffer_length);
325  	if (!buffer)
326  		return -ENOMEM;
327  
328  	max_zones_per_request = buffer_length / sizeof(struct blk_zone);
329  
330  	while (done_zones < nr_zones) {
331  		unsigned int remaining_zones = nr_zones - done_zones;
332  		unsigned int zones_in_request =
333  			min_t(unsigned int, remaining_zones, max_zones_per_request);
334  		struct request *req;
335  		struct ublk_zoned_report_desc desc;
336  		blk_status_t status;
337  
338  		memset(buffer, 0, buffer_length);
339  
340  		req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
341  		if (IS_ERR(req)) {
342  			ret = PTR_ERR(req);
343  			goto out;
344  		}
345  
346  		desc.operation = UBLK_IO_OP_REPORT_ZONES;
347  		desc.sector = sector;
348  		desc.nr_zones = zones_in_request;
349  		ret = ublk_zoned_insert_report_desc(req, &desc);
350  		if (ret)
351  			goto free_req;
352  
353  		ret = blk_rq_map_kern(disk->queue, req, buffer, buffer_length,
354  					GFP_KERNEL);
355  		if (ret)
356  			goto erase_desc;
357  
358  		status = blk_execute_rq(req, 0);
359  		ret = blk_status_to_errno(status);
360  erase_desc:
361  		ublk_zoned_erase_report_desc(req);
362  free_req:
363  		blk_mq_free_request(req);
364  		if (ret)
365  			goto out;
366  
367  		for (unsigned int i = 0; i < zones_in_request; i++) {
368  			struct blk_zone *zone = buffer + i;
369  
370  			/* A zero length zone means no more zones in this response */
371  			if (!zone->len)
372  				break;
373  
374  			ret = cb(zone, i, data);
375  			if (ret)
376  				goto out;
377  
378  			done_zones++;
379  			sector += zone_size_sectors;
380  
381  		}
382  	}
383  
384  	ret = done_zones;
385  
386  out:
387  	kvfree(buffer);
388  	return ret;
389  }
390  
391  static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
392  					 struct request *req)
393  {
394  	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
395  	struct ublk_io *io = &ubq->ios[req->tag];
396  	struct ublk_zoned_report_desc *desc;
397  	u32 ublk_op;
398  
399  	switch (req_op(req)) {
400  	case REQ_OP_ZONE_OPEN:
401  		ublk_op = UBLK_IO_OP_ZONE_OPEN;
402  		break;
403  	case REQ_OP_ZONE_CLOSE:
404  		ublk_op = UBLK_IO_OP_ZONE_CLOSE;
405  		break;
406  	case REQ_OP_ZONE_FINISH:
407  		ublk_op = UBLK_IO_OP_ZONE_FINISH;
408  		break;
409  	case REQ_OP_ZONE_RESET:
410  		ublk_op = UBLK_IO_OP_ZONE_RESET;
411  		break;
412  	case REQ_OP_ZONE_APPEND:
413  		ublk_op = UBLK_IO_OP_ZONE_APPEND;
414  		break;
415  	case REQ_OP_ZONE_RESET_ALL:
416  		ublk_op = UBLK_IO_OP_ZONE_RESET_ALL;
417  		break;
418  	case REQ_OP_DRV_IN:
419  		desc = ublk_zoned_get_report_desc(req);
420  		if (!desc)
421  			return BLK_STS_IOERR;
422  		ublk_op = desc->operation;
423  		switch (ublk_op) {
424  		case UBLK_IO_OP_REPORT_ZONES:
425  			iod->op_flags = ublk_op | ublk_req_build_flags(req);
426  			iod->nr_zones = desc->nr_zones;
427  			iod->start_sector = desc->sector;
428  			return BLK_STS_OK;
429  		default:
430  			return BLK_STS_IOERR;
431  		}
432  	case REQ_OP_DRV_OUT:
433  		/* We do not support drv_out */
434  		return BLK_STS_NOTSUPP;
435  	default:
436  		return BLK_STS_IOERR;
437  	}
438  
439  	iod->op_flags = ublk_op | ublk_req_build_flags(req);
440  	iod->nr_sectors = blk_rq_sectors(req);
441  	iod->start_sector = blk_rq_pos(req);
442  	iod->addr = io->addr;
443  
444  	return BLK_STS_OK;
445  }
446  
447  #else
448  
449  #define ublk_report_zones (NULL)
450  
451  static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
452  {
453  	return -EOPNOTSUPP;
454  }
455  
456  static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
457  {
458  }
459  
460  static int ublk_revalidate_disk_zones(struct ublk_device *ub)
461  {
462  	return 0;
463  }
464  
465  static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
466  					 struct request *req)
467  {
468  	return BLK_STS_NOTSUPP;
469  }
470  
471  #endif
472  
473  static inline void __ublk_complete_rq(struct request *req);
474  static void ublk_complete_rq(struct kref *ref);
475  
476  static dev_t ublk_chr_devt;
477  static const struct class ublk_chr_class = {
478  	.name = "ublk-char",
479  };
480  
481  static DEFINE_IDR(ublk_index_idr);
482  static DEFINE_SPINLOCK(ublk_idr_lock);
483  static wait_queue_head_t ublk_idr_wq;	/* wait until one idr is freed */
484  
485  static DEFINE_MUTEX(ublk_ctl_mutex);
486  
487  /*
488   * Max number of ublk devices allowed to be added
489   *
490   * It can be extended to a per-user limit in the future, or even be
491   * controlled by cgroup.
492   */
493  #define UBLK_MAX_UBLKS UBLK_MINORS
494  static unsigned int ublks_max = 64;
495  static unsigned int ublks_added;	/* protected by ublk_ctl_mutex */
496  
497  static struct miscdevice ublk_misc;
498  
499  static inline unsigned ublk_pos_to_hwq(loff_t pos)
500  {
501  	return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_QID_OFF) &
502  		UBLK_QID_BITS_MASK;
503  }
504  
505  static inline unsigned ublk_pos_to_buf_off(loff_t pos)
506  {
507  	return (pos - UBLKSRV_IO_BUF_OFFSET) & UBLK_IO_BUF_BITS_MASK;
508  }
509  
510  static inline unsigned ublk_pos_to_tag(loff_t pos)
511  {
512  	return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_TAG_OFF) &
513  		UBLK_TAG_BITS_MASK;
514  }
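
/*
 * A sketch of the encoding these helpers undo: with UBLK_F_USER_COPY the
 * daemon builds the pread()/pwrite() offset on /dev/ublkcN for queue
 * 'q_id', tag 'tag' and byte offset 'off' within the request as
 *
 *	pos = UBLKSRV_IO_BUF_OFFSET +
 *		((__u64)q_id << UBLK_QID_OFF) +
 *		((__u64)tag << UBLK_TAG_OFF) + off;
 *
 * and the three helpers above decode q_id, off and tag back from 'pos'.
 */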
515  
516  static void ublk_dev_param_basic_apply(struct ublk_device *ub)
517  {
518  	const struct ublk_param_basic *p = &ub->params.basic;
519  
520  	if (p->attrs & UBLK_ATTR_READ_ONLY)
521  		set_disk_ro(ub->ub_disk, true);
522  
523  	set_capacity(ub->ub_disk, p->dev_sectors);
524  }
525  
526  static int ublk_validate_params(const struct ublk_device *ub)
527  {
528  	/* basic param is the only one which must be set */
529  	if (ub->params.types & UBLK_PARAM_TYPE_BASIC) {
530  		const struct ublk_param_basic *p = &ub->params.basic;
531  
532  		if (p->logical_bs_shift > PAGE_SHIFT || p->logical_bs_shift < 9)
533  			return -EINVAL;
534  
535  		if (p->logical_bs_shift > p->physical_bs_shift)
536  			return -EINVAL;
537  
538  		if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9))
539  			return -EINVAL;
540  
541  		if (ublk_dev_is_zoned(ub) && !p->chunk_sectors)
542  			return -EINVAL;
543  	} else
544  		return -EINVAL;
545  
546  	if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
547  		const struct ublk_param_discard *p = &ub->params.discard;
548  
549  		/* So far, only single-segment discard is supported */
550  		if (p->max_discard_sectors && p->max_discard_segments != 1)
551  			return -EINVAL;
552  
553  		if (!p->discard_granularity)
554  			return -EINVAL;
555  	}
556  
557  	/* dev_t is read-only */
558  	if (ub->params.types & UBLK_PARAM_TYPE_DEVT)
559  		return -EINVAL;
560  
561  	if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
562  		return ublk_dev_param_zoned_validate(ub);
563  	else if (ublk_dev_is_zoned(ub))
564  		return -EINVAL;
565  
566  	return 0;
567  }
568  
569  static void ublk_apply_params(struct ublk_device *ub)
570  {
571  	ublk_dev_param_basic_apply(ub);
572  
573  	if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
574  		ublk_dev_param_zoned_apply(ub);
575  }
576  
577  static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
578  {
579  	return ubq->flags & UBLK_F_USER_COPY;
580  }
581  
582  static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
583  {
584  	/*
585  	 * read()/write() is involved in user copy, so request reference
586  	 * has to be grabbed
587  	 */
588  	return ublk_support_user_copy(ubq);
589  }
590  
591  static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
592  		struct request *req)
593  {
594  	if (ublk_need_req_ref(ubq)) {
595  		struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
596  
597  		kref_init(&data->ref);
598  	}
599  }
600  
601  static inline bool ublk_get_req_ref(const struct ublk_queue *ubq,
602  		struct request *req)
603  {
604  	if (ublk_need_req_ref(ubq)) {
605  		struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
606  
607  		return kref_get_unless_zero(&data->ref);
608  	}
609  
610  	return true;
611  }
612  
613  static inline void ublk_put_req_ref(const struct ublk_queue *ubq,
614  		struct request *req)
615  {
616  	if (ublk_need_req_ref(ubq)) {
617  		struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
618  
619  		kref_put(&data->ref, ublk_complete_rq);
620  	} else {
621  		__ublk_complete_rq(req);
622  	}
623  }
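
/*
 * With user copy enabled both the daemon's read()/write() path and the
 * commit path hold a reference on the request, and the request is
 * completed by whichever side drops the last reference (ublk_complete_rq()).
 * Without user copy there is no such concurrent access, so
 * __ublk_complete_rq() is called directly.
 */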
624  
625  static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
626  {
627  	return ubq->flags & UBLK_F_NEED_GET_DATA;
628  }
629  
630  /* Called in slow path only, keep it noinline for trace purpose */
631  static noinline struct ublk_device *ublk_get_device(struct ublk_device *ub)
632  {
633  	if (kobject_get_unless_zero(&ub->cdev_dev.kobj))
634  		return ub;
635  	return NULL;
636  }
637  
638  /* Called in slow path only, keep it noinline for trace purpose */
639  static noinline void ublk_put_device(struct ublk_device *ub)
640  {
641  	put_device(&ub->cdev_dev);
642  }
643  
644  static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev,
645  		int qid)
646  {
647         return (struct ublk_queue *)&(dev->__queues[qid * dev->queue_size]);
648  }
649  
650  static inline bool ublk_rq_has_data(const struct request *rq)
651  {
652  	return bio_has_data(rq->bio);
653  }
654  
655  static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
656  		int tag)
657  {
658  	return (struct ublksrv_io_desc *)
659  		&(ubq->io_cmd_buf[tag * sizeof(struct ublksrv_io_desc)]);
660  }
661  
662  static inline char *ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
663  {
664  	return ublk_get_queue(ub, q_id)->io_cmd_buf;
665  }
666  
667  static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub, int q_id)
668  {
669  	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
670  
671  	return round_up(ubq->q_depth * sizeof(struct ublksrv_io_desc),
672  			PAGE_SIZE);
673  }
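
/*
 * The per-queue command buffer is an array of ublksrv_io_desc indexed by
 * tag, rounded up to a full page so that it can be mmapped read-only by
 * the daemon (see ublk_ch_mmap()). For instance, a queue depth of 128
 * fits in a single page on typical 4K-page systems.
 */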
674  
675  static inline bool ublk_queue_can_use_recovery_reissue(
676  		struct ublk_queue *ubq)
677  {
678  	return (ubq->flags & UBLK_F_USER_RECOVERY) &&
679  			(ubq->flags & UBLK_F_USER_RECOVERY_REISSUE);
680  }
681  
682  static inline bool ublk_queue_can_use_recovery(
683  		struct ublk_queue *ubq)
684  {
685  	return ubq->flags & UBLK_F_USER_RECOVERY;
686  }
687  
688  static inline bool ublk_can_use_recovery(struct ublk_device *ub)
689  {
690  	return ub->dev_info.flags & UBLK_F_USER_RECOVERY;
691  }
692  
693  static void ublk_free_disk(struct gendisk *disk)
694  {
695  	struct ublk_device *ub = disk->private_data;
696  
697  	clear_bit(UB_STATE_USED, &ub->state);
698  	ublk_put_device(ub);
699  }
700  
701  static void ublk_store_owner_uid_gid(unsigned int *owner_uid,
702  		unsigned int *owner_gid)
703  {
704  	kuid_t uid;
705  	kgid_t gid;
706  
707  	current_uid_gid(&uid, &gid);
708  
709  	*owner_uid = from_kuid(&init_user_ns, uid);
710  	*owner_gid = from_kgid(&init_user_ns, gid);
711  }
712  
713  static int ublk_open(struct gendisk *disk, blk_mode_t mode)
714  {
715  	struct ublk_device *ub = disk->private_data;
716  
717  	if (capable(CAP_SYS_ADMIN))
718  		return 0;
719  
720  	/*
721  	 * If it is an unprivileged device, only the owner can open
722  	 * the disk. Otherwise it could be a trap set up by an
723  	 * evil user who deliberately grants this disk's privileges
724  	 * to other users.
725  	 *
726  	 * This way is reasonable too, given that anyone can create an
727  	 * unprivileged device without needing anyone else's grant.
728  	 */
729  	if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV) {
730  		unsigned int curr_uid, curr_gid;
731  
732  		ublk_store_owner_uid_gid(&curr_uid, &curr_gid);
733  
734  		if (curr_uid != ub->dev_info.owner_uid || curr_gid !=
735  				ub->dev_info.owner_gid)
736  			return -EPERM;
737  	}
738  
739  	return 0;
740  }
741  
742  static const struct block_device_operations ub_fops = {
743  	.owner =	THIS_MODULE,
744  	.open =		ublk_open,
745  	.free_disk =	ublk_free_disk,
746  	.report_zones =	ublk_report_zones,
747  };
748  
749  #define UBLK_MAX_PIN_PAGES	32
750  
751  struct ublk_io_iter {
752  	struct page *pages[UBLK_MAX_PIN_PAGES];
753  	struct bio *bio;
754  	struct bvec_iter iter;
755  };
756  
757  /* copy 'total' bytes between the pinned pages and the bio vectors */
758  static void ublk_copy_io_pages(struct ublk_io_iter *data,
759  		size_t total, size_t pg_off, int dir)
760  {
761  	unsigned done = 0;
762  	unsigned pg_idx = 0;
763  
764  	while (done < total) {
765  		struct bio_vec bv = bio_iter_iovec(data->bio, data->iter);
766  		unsigned int bytes = min3(bv.bv_len, (unsigned)total - done,
767  				(unsigned)(PAGE_SIZE - pg_off));
768  		void *bv_buf = bvec_kmap_local(&bv);
769  		void *pg_buf = kmap_local_page(data->pages[pg_idx]);
770  
771  		if (dir == ITER_DEST)
772  			memcpy(pg_buf + pg_off, bv_buf, bytes);
773  		else
774  			memcpy(bv_buf, pg_buf + pg_off, bytes);
775  
776  		kunmap_local(pg_buf);
777  		kunmap_local(bv_buf);
778  
779  		/* advance page array */
780  		pg_off += bytes;
781  		if (pg_off == PAGE_SIZE) {
782  			pg_idx += 1;
783  			pg_off = 0;
784  		}
785  
786  		done += bytes;
787  
788  		/* advance bio */
789  		bio_advance_iter_single(data->bio, &data->iter, bytes);
790  		if (!data->iter.bi_size) {
791  			data->bio = data->bio->bi_next;
792  			if (data->bio == NULL)
793  				break;
794  			data->iter = data->bio->bi_iter;
795  		}
796  	}
797  }
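
/*
 * Each iteration above copies the largest chunk that fits in the current
 * bvec, the remaining byte budget and the rest of the current pinned page,
 * then advances both the page array and the bio iterator, moving on to the
 * next bio in the chain once the current one is drained.
 */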
798  
799  static bool ublk_advance_io_iter(const struct request *req,
800  		struct ublk_io_iter *iter, unsigned int offset)
801  {
802  	struct bio *bio = req->bio;
803  
804  	for_each_bio(bio) {
805  		if (bio->bi_iter.bi_size > offset) {
806  			iter->bio = bio;
807  			iter->iter = bio->bi_iter;
808  			bio_advance_iter(iter->bio, &iter->iter, offset);
809  			return true;
810  		}
811  		offset -= bio->bi_iter.bi_size;
812  	}
813  	return false;
814  }
815  
816  /*
817   * Copy data between the request pages and the iov_iter; 'offset' is the
818   * linear offset within the request at which the copy starts.
819   */
820  static size_t ublk_copy_user_pages(const struct request *req,
821  		unsigned offset, struct iov_iter *uiter, int dir)
822  {
823  	struct ublk_io_iter iter;
824  	size_t done = 0;
825  
826  	if (!ublk_advance_io_iter(req, &iter, offset))
827  		return 0;
828  
829  	while (iov_iter_count(uiter) && iter.bio) {
830  		unsigned nr_pages;
831  		ssize_t len;
832  		size_t off;
833  		int i;
834  
835  		len = iov_iter_get_pages2(uiter, iter.pages,
836  				iov_iter_count(uiter),
837  				UBLK_MAX_PIN_PAGES, &off);
838  		if (len <= 0)
839  			return done;
840  
841  		ublk_copy_io_pages(&iter, len, off, dir);
842  		nr_pages = DIV_ROUND_UP(len + off, PAGE_SIZE);
843  		for (i = 0; i < nr_pages; i++) {
844  			if (dir == ITER_DEST)
845  				set_page_dirty(iter.pages[i]);
846  			put_page(iter.pages[i]);
847  		}
848  		done += len;
849  	}
850  
851  	return done;
852  }
853  
854  static inline bool ublk_need_map_req(const struct request *req)
855  {
856  	return ublk_rq_has_data(req) && req_op(req) == REQ_OP_WRITE;
857  }
858  
859  static inline bool ublk_need_unmap_req(const struct request *req)
860  {
861  	return ublk_rq_has_data(req) &&
862  	       (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN);
863  }
864  
865  static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
866  		struct ublk_io *io)
867  {
868  	const unsigned int rq_bytes = blk_rq_bytes(req);
869  
870  	if (ublk_support_user_copy(ubq))
871  		return rq_bytes;
872  
873  	/*
874  	 * No zero copy: we delay copying WRITE request data into the ublksrv
875  	 * context, and the big benefit is that pinning pages in the current
876  	 * context is pretty fast, see ublk_copy_user_pages()
877  	 */
878  	if (ublk_need_map_req(req)) {
879  		struct iov_iter iter;
880  		const int dir = ITER_DEST;
881  
882  		import_ubuf(dir, u64_to_user_ptr(io->addr), rq_bytes, &iter);
883  		return ublk_copy_user_pages(req, 0, &iter, dir);
884  	}
885  	return rq_bytes;
886  }
887  
888  static int ublk_unmap_io(const struct ublk_queue *ubq,
889  		const struct request *req,
890  		struct ublk_io *io)
891  {
892  	const unsigned int rq_bytes = blk_rq_bytes(req);
893  
894  	if (ublk_support_user_copy(ubq))
895  		return rq_bytes;
896  
897  	if (ublk_need_unmap_req(req)) {
898  		struct iov_iter iter;
899  		const int dir = ITER_SOURCE;
900  
901  		WARN_ON_ONCE(io->res > rq_bytes);
902  
903  		import_ubuf(dir, u64_to_user_ptr(io->addr), io->res, &iter);
904  		return ublk_copy_user_pages(req, 0, &iter, dir);
905  	}
906  	return rq_bytes;
907  }
908  
909  static inline unsigned int ublk_req_build_flags(struct request *req)
910  {
911  	unsigned flags = 0;
912  
913  	if (req->cmd_flags & REQ_FAILFAST_DEV)
914  		flags |= UBLK_IO_F_FAILFAST_DEV;
915  
916  	if (req->cmd_flags & REQ_FAILFAST_TRANSPORT)
917  		flags |= UBLK_IO_F_FAILFAST_TRANSPORT;
918  
919  	if (req->cmd_flags & REQ_FAILFAST_DRIVER)
920  		flags |= UBLK_IO_F_FAILFAST_DRIVER;
921  
922  	if (req->cmd_flags & REQ_META)
923  		flags |= UBLK_IO_F_META;
924  
925  	if (req->cmd_flags & REQ_FUA)
926  		flags |= UBLK_IO_F_FUA;
927  
928  	if (req->cmd_flags & REQ_NOUNMAP)
929  		flags |= UBLK_IO_F_NOUNMAP;
930  
931  	if (req->cmd_flags & REQ_SWAP)
932  		flags |= UBLK_IO_F_SWAP;
933  
934  	return flags;
935  }
936  
937  static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
938  {
939  	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
940  	struct ublk_io *io = &ubq->ios[req->tag];
941  	enum req_op op = req_op(req);
942  	u32 ublk_op;
943  
944  	if (!ublk_queue_is_zoned(ubq) &&
945  	    (op_is_zone_mgmt(op) || op == REQ_OP_ZONE_APPEND))
946  		return BLK_STS_IOERR;
947  
948  	switch (req_op(req)) {
949  	case REQ_OP_READ:
950  		ublk_op = UBLK_IO_OP_READ;
951  		break;
952  	case REQ_OP_WRITE:
953  		ublk_op = UBLK_IO_OP_WRITE;
954  		break;
955  	case REQ_OP_FLUSH:
956  		ublk_op = UBLK_IO_OP_FLUSH;
957  		break;
958  	case REQ_OP_DISCARD:
959  		ublk_op = UBLK_IO_OP_DISCARD;
960  		break;
961  	case REQ_OP_WRITE_ZEROES:
962  		ublk_op = UBLK_IO_OP_WRITE_ZEROES;
963  		break;
964  	default:
965  		if (ublk_queue_is_zoned(ubq))
966  			return ublk_setup_iod_zoned(ubq, req);
967  		return BLK_STS_IOERR;
968  	}
969  
970  	/* translate to the ublk ABI since kernel-internal values may change */
971  	iod->op_flags = ublk_op | ublk_req_build_flags(req);
972  	iod->nr_sectors = blk_rq_sectors(req);
973  	iod->start_sector = blk_rq_pos(req);
974  	iod->addr = io->addr;
975  
976  	return BLK_STS_OK;
977  }
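
/*
 * A sketch of how the daemon consumes the descriptor filled in above,
 * where 'iods' is the queue's mmapped descriptor array and the accessors
 * come from the UAPI header:
 *
 *	const struct ublksrv_io_desc *iod = &iods[tag];
 *	__u8 op = ublksrv_get_op(iod);		// e.g. UBLK_IO_OP_WRITE
 *	__u32 flags = ublksrv_get_flags(iod);	// UBLK_IO_F_* bits
 *	// handle iod->nr_sectors sectors starting at iod->start_sector,
 *	// using iod->addr as the IO buffer unless user copy is enabled
 */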
978  
979  static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu(
980  		struct io_uring_cmd *ioucmd)
981  {
982  	return (struct ublk_uring_cmd_pdu *)&ioucmd->pdu;
983  }
984  
985  static inline bool ubq_daemon_is_dying(struct ublk_queue *ubq)
986  {
987  	return ubq->ubq_daemon->flags & PF_EXITING;
988  }
989  
990  /* todo: handle partial completion */
991  static inline void __ublk_complete_rq(struct request *req)
992  {
993  	struct ublk_queue *ubq = req->mq_hctx->driver_data;
994  	struct ublk_io *io = &ubq->ios[req->tag];
995  	unsigned int unmapped_bytes;
996  	blk_status_t res = BLK_STS_OK;
997  
998  	/* called from ublk_abort_queue() code path */
999  	if (io->flags & UBLK_IO_FLAG_ABORTED) {
1000  		res = BLK_STS_IOERR;
1001  		goto exit;
1002  	}
1003  
1004  	/* fail READ IO if nothing has been read */
1005  	if (!io->res && req_op(req) == REQ_OP_READ)
1006  		io->res = -EIO;
1007  
1008  	if (io->res < 0) {
1009  		res = errno_to_blk_status(io->res);
1010  		goto exit;
1011  	}
1012  
1013  	/*
1014  	 * FLUSH, DISCARD and WRITE_ZEROES usually won't return a byte count,
1015  	 * so end them directly.
1016  	 *
1017  	 * None of them needs unmapping.
1018  	 */
1019  	if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
1020  	    req_op(req) != REQ_OP_DRV_IN)
1021  		goto exit;
1022  
1023  	/* for a READ request, copy the data at iod->addr into the rq buffers */
1024  	unmapped_bytes = ublk_unmap_io(ubq, req, io);
1025  
1026  	/*
1027  	 * Extremely unlikely since the data has just been filled in.
1028  	 *
1029  	 * Simply truncate io->res for this unlikely case.
1030  	 */
1031  	if (unlikely(unmapped_bytes < io->res))
1032  		io->res = unmapped_bytes;
1033  
1034  	if (blk_update_request(req, BLK_STS_OK, io->res))
1035  		blk_mq_requeue_request(req, true);
1036  	else
1037  		__blk_mq_end_request(req, BLK_STS_OK);
1038  
1039  	return;
1040  exit:
1041  	blk_mq_end_request(req, res);
1042  }
1043  
1044  static void ublk_complete_rq(struct kref *ref)
1045  {
1046  	struct ublk_rq_data *data = container_of(ref, struct ublk_rq_data,
1047  			ref);
1048  	struct request *req = blk_mq_rq_from_pdu(data);
1049  
1050  	__ublk_complete_rq(req);
1051  }
1052  
1053  /*
1054   * Since __ublk_rq_task_work always fails requests immediately during
1055   * exiting, __ublk_fail_req() is only called from abort context during
1056   * exiting. So lock is unnecessary.
1057   *
1058   * Also, aborting may not have started yet; keep in mind that a failed
1059   * request may be issued again by the block layer.
1060   */
1061  static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io,
1062  		struct request *req)
1063  {
1064  	WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);
1065  
1066  	if (ublk_queue_can_use_recovery_reissue(ubq))
1067  		blk_mq_requeue_request(req, false);
1068  	else
1069  		ublk_put_req_ref(ubq, req);
1070  }
1071  
1072  static void ubq_complete_io_cmd(struct ublk_io *io, int res,
1073  				unsigned issue_flags)
1074  {
1075  	/* mark this cmd owned by ublksrv */
1076  	io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
1077  
1078  	/*
1079  	 * clear ACTIVE since we are done with this sqe/cmd slot
1080  	 * We can only accept io cmd in case of being not active.
1081  	 */
1082  	io->flags &= ~UBLK_IO_FLAG_ACTIVE;
1083  
1084  	/* tell ublksrv one io request is coming */
1085  	io_uring_cmd_done(io->cmd, res, 0, issue_flags);
1086  }
1087  
1088  #define UBLK_REQUEUE_DELAY_MS	3
1089  
1090  static inline void __ublk_abort_rq(struct ublk_queue *ubq,
1091  		struct request *rq)
1092  {
1093  	/* We cannot process this rq so just requeue it. */
1094  	if (ublk_queue_can_use_recovery(ubq))
1095  		blk_mq_requeue_request(rq, false);
1096  	else
1097  		blk_mq_end_request(rq, BLK_STS_IOERR);
1098  }
1099  
1100  static inline void __ublk_rq_task_work(struct request *req,
1101  				       unsigned issue_flags)
1102  {
1103  	struct ublk_queue *ubq = req->mq_hctx->driver_data;
1104  	int tag = req->tag;
1105  	struct ublk_io *io = &ubq->ios[tag];
1106  	unsigned int mapped_bytes;
1107  
1108  	pr_devel("%s: complete: op %d, qid %d tag %d io_flags %x addr %llx\n",
1109  			__func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags,
1110  			ublk_get_iod(ubq, req->tag)->addr);
1111  
1112  	/*
1113  	 * Task is exiting if either:
1114  	 *
1115  	 * (1) current != ubq_daemon.
1116  	 * io_uring_cmd_complete_in_task() tries to run task_work
1117  	 * in a workqueue if ubq_daemon(cmd's task) is PF_EXITING.
1118  	 *
1119  	 * (2) current->flags & PF_EXITING.
1120  	 */
1121  	if (unlikely(current != ubq->ubq_daemon || current->flags & PF_EXITING)) {
1122  		__ublk_abort_rq(ubq, req);
1123  		return;
1124  	}
1125  
1126  	if (ublk_need_get_data(ubq) && ublk_need_map_req(req)) {
1127  		/*
1128  		 * We have not handled UBLK_IO_NEED_GET_DATA command yet,
1129  		 * so immediately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
1130  		 * and notify it.
1131  		 */
1132  		if (!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA)) {
1133  			io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
1134  			pr_devel("%s: need get data. op %d, qid %d tag %d io_flags %x\n",
1135  					__func__, io->cmd->cmd_op, ubq->q_id,
1136  					req->tag, io->flags);
1137  			ubq_complete_io_cmd(io, UBLK_IO_RES_NEED_GET_DATA, issue_flags);
1138  			return;
1139  		}
1140  		/*
1141  		 * We have handled UBLK_IO_NEED_GET_DATA command,
1142  		 * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
1143  		 * do the copy work.
1144  		 */
1145  		io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
1146  		/* update iod->addr because ublksrv may have passed a new io buffer */
1147  		ublk_get_iod(ubq, req->tag)->addr = io->addr;
1148  		pr_devel("%s: update iod->addr: op %d, qid %d tag %d io_flags %x addr %llx\n",
1149  				__func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags,
1150  				ublk_get_iod(ubq, req->tag)->addr);
1151  	}
1152  
1153  	mapped_bytes = ublk_map_io(ubq, req, io);
1154  
1155  	/* partially mapped, update io descriptor */
1156  	if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
1157  		/*
1158  		 * Nothing mapped, retry until we succeed.
1159  		 *
1160  		 * We may never succeed in mapping any bytes here because
1161  		 * of OOM. TODO: reserve one buffer with single page pinned
1162  		 * for providing forward progress guarantee.
1163  		 */
1164  		if (unlikely(!mapped_bytes)) {
1165  			blk_mq_requeue_request(req, false);
1166  			blk_mq_delay_kick_requeue_list(req->q,
1167  					UBLK_REQUEUE_DELAY_MS);
1168  			return;
1169  		}
1170  
1171  		ublk_get_iod(ubq, req->tag)->nr_sectors =
1172  			mapped_bytes >> 9;
1173  	}
1174  
1175  	ublk_init_req_ref(ubq, req);
1176  	ubq_complete_io_cmd(io, UBLK_IO_RES_OK, issue_flags);
1177  }
1178  
1179  static inline void ublk_forward_io_cmds(struct ublk_queue *ubq,
1180  					unsigned issue_flags)
1181  {
1182  	struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds);
1183  	struct ublk_rq_data *data, *tmp;
1184  
1185  	io_cmds = llist_reverse_order(io_cmds);
1186  	llist_for_each_entry_safe(data, tmp, io_cmds, node)
1187  		__ublk_rq_task_work(blk_mq_rq_from_pdu(data), issue_flags);
1188  }
1189  
1190  static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd, unsigned issue_flags)
1191  {
1192  	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
1193  	struct ublk_queue *ubq = pdu->ubq;
1194  
1195  	ublk_forward_io_cmds(ubq, issue_flags);
1196  }
1197  
1198  static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
1199  {
1200  	struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);
1201  
1202  	if (llist_add(&data->node, &ubq->io_cmds)) {
1203  		struct ublk_io *io = &ubq->ios[rq->tag];
1204  
1205  		io_uring_cmd_complete_in_task(io->cmd, ublk_rq_task_work_cb);
1206  	}
1207  }
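
/*
 * llist_add() returns true only when the list was empty beforehand, so a
 * single task-work callback is scheduled per batch; the daemon then drains
 * the whole batch in ublk_forward_io_cmds(), which reverses the list to
 * restore submission order.
 */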
1208  
1209  static enum blk_eh_timer_return ublk_timeout(struct request *rq)
1210  {
1211  	struct ublk_queue *ubq = rq->mq_hctx->driver_data;
1212  	unsigned int nr_inflight = 0;
1213  	int i;
1214  
1215  	if (ubq->flags & UBLK_F_UNPRIVILEGED_DEV) {
1216  		if (!ubq->timeout) {
1217  			send_sig(SIGKILL, ubq->ubq_daemon, 0);
1218  			ubq->timeout = true;
1219  		}
1220  
1221  		return BLK_EH_DONE;
1222  	}
1223  
1224  	if (!ubq_daemon_is_dying(ubq))
1225  		return BLK_EH_RESET_TIMER;
1226  
1227  	for (i = 0; i < ubq->q_depth; i++) {
1228  		struct ublk_io *io = &ubq->ios[i];
1229  
1230  		if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
1231  			nr_inflight++;
1232  	}
1233  
1234  	/* cancelable uring_cmd can't help us if all commands are in-flight */
1235  	if (nr_inflight == ubq->q_depth) {
1236  		struct ublk_device *ub = ubq->dev;
1237  
1238  		if (ublk_abort_requests(ub, ubq)) {
1239  			if (ublk_can_use_recovery(ub))
1240  				schedule_work(&ub->quiesce_work);
1241  			else
1242  				schedule_work(&ub->stop_work);
1243  		}
1244  		return BLK_EH_DONE;
1245  	}
1246  
1247  	return BLK_EH_RESET_TIMER;
1248  }
1249  
1250  static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
1251  		const struct blk_mq_queue_data *bd)
1252  {
1253  	struct ublk_queue *ubq = hctx->driver_data;
1254  	struct request *rq = bd->rq;
1255  	blk_status_t res;
1256  
1257  	/* fill iod to slot in io cmd buffer */
1258  	res = ublk_setup_iod(ubq, rq);
1259  	if (unlikely(res != BLK_STS_OK))
1260  		return BLK_STS_IOERR;
1261  
1262  	/* With recovery feature enabled, force_abort is set in
1263  	 * ublk_stop_dev() before calling del_gendisk(). We have to
1264  	 * abort all requeued and new rqs here to let del_gendisk()
1265  	 * move on. Besides, we cannot call io_uring_cmd_complete_in_task()
1266  	 * to avoid UAF on io_uring ctx.
1267  	 *
1268  	 * Note: force_abort is guaranteed to be seen because it is set
1269  	 * before the request queue is unquiesced.
1270  	 */
1271  	if (ublk_queue_can_use_recovery(ubq) && unlikely(ubq->force_abort))
1272  		return BLK_STS_IOERR;
1273  
1274  	if (unlikely(ubq->canceling)) {
1275  		__ublk_abort_rq(ubq, rq);
1276  		return BLK_STS_OK;
1277  	}
1278  
1279  	blk_mq_start_request(bd->rq);
1280  	ublk_queue_cmd(ubq, rq);
1281  
1282  	return BLK_STS_OK;
1283  }
1284  
1285  static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
1286  		unsigned int hctx_idx)
1287  {
1288  	struct ublk_device *ub = driver_data;
1289  	struct ublk_queue *ubq = ublk_get_queue(ub, hctx->queue_num);
1290  
1291  	hctx->driver_data = ubq;
1292  	return 0;
1293  }
1294  
1295  static const struct blk_mq_ops ublk_mq_ops = {
1296  	.queue_rq       = ublk_queue_rq,
1297  	.init_hctx	= ublk_init_hctx,
1298  	.timeout	= ublk_timeout,
1299  };
1300  
1301  static int ublk_ch_open(struct inode *inode, struct file *filp)
1302  {
1303  	struct ublk_device *ub = container_of(inode->i_cdev,
1304  			struct ublk_device, cdev);
1305  
1306  	if (test_and_set_bit(UB_STATE_OPEN, &ub->state))
1307  		return -EBUSY;
1308  	filp->private_data = ub;
1309  	return 0;
1310  }
1311  
1312  static int ublk_ch_release(struct inode *inode, struct file *filp)
1313  {
1314  	struct ublk_device *ub = filp->private_data;
1315  
1316  	clear_bit(UB_STATE_OPEN, &ub->state);
1317  	return 0;
1318  }
1319  
1320  /* map pre-allocated per-queue cmd buffer to ublksrv daemon */
1321  static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
1322  {
1323  	struct ublk_device *ub = filp->private_data;
1324  	size_t sz = vma->vm_end - vma->vm_start;
1325  	unsigned max_sz = UBLK_MAX_QUEUE_DEPTH * sizeof(struct ublksrv_io_desc);
1326  	unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT;
1327  	int q_id, ret = 0;
1328  
1329  	spin_lock(&ub->lock);
1330  	if (!ub->mm)
1331  		ub->mm = current->mm;
1332  	if (current->mm != ub->mm)
1333  		ret = -EINVAL;
1334  	spin_unlock(&ub->lock);
1335  
1336  	if (ret)
1337  		return ret;
1338  
1339  	if (vma->vm_flags & VM_WRITE)
1340  		return -EPERM;
1341  
1342  	end = UBLKSRV_CMD_BUF_OFFSET + ub->dev_info.nr_hw_queues * max_sz;
1343  	if (phys_off < UBLKSRV_CMD_BUF_OFFSET || phys_off >= end)
1344  		return -EINVAL;
1345  
1346  	q_id = (phys_off - UBLKSRV_CMD_BUF_OFFSET) / max_sz;
1347  	pr_devel("%s: qid %d, pid %d, addr %lx pg_off %lx sz %lu\n",
1348  			__func__, q_id, current->pid, vma->vm_start,
1349  			phys_off, (unsigned long)sz);
1350  
1351  	if (sz != ublk_queue_cmd_buf_size(ub, q_id))
1352  		return -EINVAL;
1353  
1354  	pfn = virt_to_phys(ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT;
1355  	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
1356  }
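
/*
 * A minimal userspace sketch of the mapping expected here, assuming 'fd'
 * is the opened /dev/ublkcN char device, 'psz' the page size and 'depth'
 * the queue depth:
 *
 *	off_t off = UBLKSRV_CMD_BUF_OFFSET + (off_t)q_id *
 *		UBLK_MAX_QUEUE_DEPTH * sizeof(struct ublksrv_io_desc);
 *	size_t len = (depth * sizeof(struct ublksrv_io_desc) + psz - 1) &
 *		~(psz - 1);
 *	struct ublksrv_io_desc *iods = mmap(NULL, len, PROT_READ,
 *			MAP_SHARED, fd, off);
 *
 * The mapping has to be read-only and sized exactly like
 * ublk_queue_cmd_buf_size(), otherwise the checks above reject it.
 */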
1357  
1358  static void ublk_commit_completion(struct ublk_device *ub,
1359  		const struct ublksrv_io_cmd *ub_cmd)
1360  {
1361  	u32 qid = ub_cmd->q_id, tag = ub_cmd->tag;
1362  	struct ublk_queue *ubq = ublk_get_queue(ub, qid);
1363  	struct ublk_io *io = &ubq->ios[tag];
1364  	struct request *req;
1365  
1366  	/* now this cmd slot is owned by the ublk driver */
1367  	io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
1368  	io->res = ub_cmd->result;
1369  
1370  	/* find the io request and complete */
1371  	req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag);
1372  	if (WARN_ON_ONCE(unlikely(!req)))
1373  		return;
1374  
1375  	if (req_op(req) == REQ_OP_ZONE_APPEND)
1376  		req->__sector = ub_cmd->zone_append_lba;
1377  
1378  	if (likely(!blk_should_fake_timeout(req->q)))
1379  		ublk_put_req_ref(ubq, req);
1380  }
1381  
1382  /*
1383   * Called from the ubq_daemon context via the cancel fn, while the ublk
1384   * blk-mq queue is quiesced; so we run exclusively with respect to both
1385   * blk-mq and the ubq_daemon context, and everything is serialized.
1386   */
1387  static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
1388  {
1389  	int i;
1390  
1391  	for (i = 0; i < ubq->q_depth; i++) {
1392  		struct ublk_io *io = &ubq->ios[i];
1393  
1394  		if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) {
1395  			struct request *rq;
1396  
1397  			/*
1398  			 * Either we fail the request or __ublk_rq_task_work()
1399  			 * will do it
1400  			 */
1401  			rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i);
1402  			if (rq && blk_mq_request_started(rq)) {
1403  				io->flags |= UBLK_IO_FLAG_ABORTED;
1404  				__ublk_fail_req(ubq, io, rq);
1405  			}
1406  		}
1407  	}
1408  }
1409  
1410  static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq)
1411  {
1412  	struct gendisk *disk;
1413  
1414  	spin_lock(&ubq->cancel_lock);
1415  	if (ubq->canceling) {
1416  		spin_unlock(&ubq->cancel_lock);
1417  		return false;
1418  	}
1419  	ubq->canceling = true;
1420  	spin_unlock(&ubq->cancel_lock);
1421  
1422  	spin_lock(&ub->lock);
1423  	disk = ub->ub_disk;
1424  	if (disk)
1425  		get_device(disk_to_dev(disk));
1426  	spin_unlock(&ub->lock);
1427  
1428  	/* Our disk is already dead */
1429  	if (!disk)
1430  		return false;
1431  
1432  	/* Now we are serialized with ublk_queue_rq() */
1433  	blk_mq_quiesce_queue(disk->queue);
1434  	/* abort queue is for making forward progress */
1435  	ublk_abort_queue(ub, ubq);
1436  	blk_mq_unquiesce_queue(disk->queue);
1437  	put_device(disk_to_dev(disk));
1438  
1439  	return true;
1440  }
1441  
1442  static void ublk_cancel_cmd(struct ublk_queue *ubq, struct ublk_io *io,
1443  		unsigned int issue_flags)
1444  {
1445  	bool done;
1446  
1447  	if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
1448  		return;
1449  
1450  	spin_lock(&ubq->cancel_lock);
1451  	done = !!(io->flags & UBLK_IO_FLAG_CANCELED);
1452  	if (!done)
1453  		io->flags |= UBLK_IO_FLAG_CANCELED;
1454  	spin_unlock(&ubq->cancel_lock);
1455  
1456  	if (!done)
1457  		io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0, issue_flags);
1458  }
1459  
1460  /*
1461   * The ublk char device won't be closed when calling cancel fn, so both
1462   * ublk device and queue are guaranteed to be live
1463   */
1464  static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
1465  		unsigned int issue_flags)
1466  {
1467  	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
1468  	struct ublk_queue *ubq = pdu->ubq;
1469  	struct task_struct *task;
1470  	struct ublk_device *ub;
1471  	bool need_schedule;
1472  	struct ublk_io *io;
1473  
1474  	if (WARN_ON_ONCE(!ubq))
1475  		return;
1476  
1477  	if (WARN_ON_ONCE(pdu->tag >= ubq->q_depth))
1478  		return;
1479  
1480  	task = io_uring_cmd_get_task(cmd);
1481  	if (WARN_ON_ONCE(task && task != ubq->ubq_daemon))
1482  		return;
1483  
1484  	ub = ubq->dev;
1485  	need_schedule = ublk_abort_requests(ub, ubq);
1486  
1487  	io = &ubq->ios[pdu->tag];
1488  	WARN_ON_ONCE(io->cmd != cmd);
1489  	ublk_cancel_cmd(ubq, io, issue_flags);
1490  
1491  	if (need_schedule) {
1492  		if (ublk_can_use_recovery(ub))
1493  			schedule_work(&ub->quiesce_work);
1494  		else
1495  			schedule_work(&ub->stop_work);
1496  	}
1497  }
1498  
1499  static inline bool ublk_queue_ready(struct ublk_queue *ubq)
1500  {
1501  	return ubq->nr_io_ready == ubq->q_depth;
1502  }
1503  
1504  static void ublk_cancel_queue(struct ublk_queue *ubq)
1505  {
1506  	int i;
1507  
1508  	for (i = 0; i < ubq->q_depth; i++)
1509  		ublk_cancel_cmd(ubq, &ubq->ios[i], IO_URING_F_UNLOCKED);
1510  }
1511  
1512  /* Cancel all pending commands, must be called after del_gendisk() returns */
1513  static void ublk_cancel_dev(struct ublk_device *ub)
1514  {
1515  	int i;
1516  
1517  	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
1518  		ublk_cancel_queue(ublk_get_queue(ub, i));
1519  }
1520  
1521  static bool ublk_check_inflight_rq(struct request *rq, void *data)
1522  {
1523  	bool *idle = data;
1524  
1525  	if (blk_mq_request_started(rq)) {
1526  		*idle = false;
1527  		return false;
1528  	}
1529  	return true;
1530  }
1531  
1532  static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub)
1533  {
1534  	bool idle;
1535  
1536  	WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue));
1537  	while (true) {
1538  		idle = true;
1539  		blk_mq_tagset_busy_iter(&ub->tag_set,
1540  				ublk_check_inflight_rq, &idle);
1541  		if (idle)
1542  			break;
1543  		msleep(UBLK_REQUEUE_DELAY_MS);
1544  	}
1545  }
1546  
1547  static void __ublk_quiesce_dev(struct ublk_device *ub)
1548  {
1549  	pr_devel("%s: quiesce ub: dev_id %d state %s\n",
1550  			__func__, ub->dev_info.dev_id,
1551  			ub->dev_info.state == UBLK_S_DEV_LIVE ?
1552  			"LIVE" : "QUIESCED");
1553  	blk_mq_quiesce_queue(ub->ub_disk->queue);
1554  	ublk_wait_tagset_rqs_idle(ub);
1555  	ub->dev_info.state = UBLK_S_DEV_QUIESCED;
1556  }
1557  
1558  static void ublk_quiesce_work_fn(struct work_struct *work)
1559  {
1560  	struct ublk_device *ub =
1561  		container_of(work, struct ublk_device, quiesce_work);
1562  
1563  	mutex_lock(&ub->mutex);
1564  	if (ub->dev_info.state != UBLK_S_DEV_LIVE)
1565  		goto unlock;
1566  	__ublk_quiesce_dev(ub);
1567   unlock:
1568  	mutex_unlock(&ub->mutex);
1569  	ublk_cancel_dev(ub);
1570  }
1571  
1572  static void ublk_unquiesce_dev(struct ublk_device *ub)
1573  {
1574  	int i;
1575  
1576  	pr_devel("%s: unquiesce ub: dev_id %d state %s\n",
1577  			__func__, ub->dev_info.dev_id,
1578  			ub->dev_info.state == UBLK_S_DEV_LIVE ?
1579  			"LIVE" : "QUIESCED");
1580  	/* quiesce_work has run. We let requeued rqs be aborted
1581  	 * before running fallback_wq. "force_abort" must be seen
1582  	 * after the request queue is unquiesced. Then del_gendisk()
1583  	 * can move on.
1584  	 */
1585  	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
1586  		ublk_get_queue(ub, i)->force_abort = true;
1587  
1588  	blk_mq_unquiesce_queue(ub->ub_disk->queue);
1589  	/* We may have requeued some rqs in ublk_quiesce_queue() */
1590  	blk_mq_kick_requeue_list(ub->ub_disk->queue);
1591  }
1592  
1593  static void ublk_stop_dev(struct ublk_device *ub)
1594  {
1595  	struct gendisk *disk;
1596  
1597  	mutex_lock(&ub->mutex);
1598  	if (ub->dev_info.state == UBLK_S_DEV_DEAD)
1599  		goto unlock;
1600  	if (ublk_can_use_recovery(ub)) {
1601  		if (ub->dev_info.state == UBLK_S_DEV_LIVE)
1602  			__ublk_quiesce_dev(ub);
1603  		ublk_unquiesce_dev(ub);
1604  	}
1605  	del_gendisk(ub->ub_disk);
1606  
1607  	/* Sync with ublk_abort_queue() by holding the lock */
1608  	spin_lock(&ub->lock);
1609  	disk = ub->ub_disk;
1610  	ub->dev_info.state = UBLK_S_DEV_DEAD;
1611  	ub->dev_info.ublksrv_pid = -1;
1612  	ub->ub_disk = NULL;
1613  	spin_unlock(&ub->lock);
1614  	put_disk(disk);
1615   unlock:
1616  	mutex_unlock(&ub->mutex);
1617  	ublk_cancel_dev(ub);
1618  }
1619  
1620  /* device can only be started after all IOs are ready */
1621  static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
1622  {
1623  	mutex_lock(&ub->mutex);
1624  	ubq->nr_io_ready++;
1625  	if (ublk_queue_ready(ubq)) {
1626  		ubq->ubq_daemon = current;
1627  		get_task_struct(ubq->ubq_daemon);
1628  		ub->nr_queues_ready++;
1629  
1630  		if (capable(CAP_SYS_ADMIN))
1631  			ub->nr_privileged_daemon++;
1632  	}
1633  	if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues)
1634  		complete_all(&ub->completion);
1635  	mutex_unlock(&ub->mutex);
1636  }
1637  
1638  static void ublk_handle_need_get_data(struct ublk_device *ub, int q_id,
1639  		int tag)
1640  {
1641  	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
1642  	struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag);
1643  
1644  	ublk_queue_cmd(ubq, req);
1645  }
1646  
1647  static inline int ublk_check_cmd_op(u32 cmd_op)
1648  {
1649  	u32 ioc_type = _IOC_TYPE(cmd_op);
1650  
1651  	if (!IS_ENABLED(CONFIG_BLKDEV_UBLK_LEGACY_OPCODES) && ioc_type != 'u')
1652  		return -EOPNOTSUPP;
1653  
1654  	if (ioc_type != 'u' && ioc_type != 0)
1655  		return -EOPNOTSUPP;
1656  
1657  	return 0;
1658  }
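
/*
 * 'u' is the ioctl-style encoding used by the UBLK_U_IO_* opcodes
 * (UBLK_F_CMD_IOCTL_ENCODE); a zero type can only come from the legacy
 * plain-numbered opcodes, which are accepted only when
 * CONFIG_BLKDEV_UBLK_LEGACY_OPCODES is enabled.
 */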
1659  
1660  static inline void ublk_fill_io_cmd(struct ublk_io *io,
1661  		struct io_uring_cmd *cmd, unsigned long buf_addr)
1662  {
1663  	io->cmd = cmd;
1664  	io->flags |= UBLK_IO_FLAG_ACTIVE;
1665  	io->addr = buf_addr;
1666  }
1667  
1668  static inline void ublk_prep_cancel(struct io_uring_cmd *cmd,
1669  				    unsigned int issue_flags,
1670  				    struct ublk_queue *ubq, unsigned int tag)
1671  {
1672  	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
1673  
1674  	/*
1675  	 * Safe to refer to @ubq since the ublk_queue won't die until all of
1676  	 * its commands are completed
1677  	 */
1678  	pdu->ubq = ubq;
1679  	pdu->tag = tag;
1680  	io_uring_cmd_mark_cancelable(cmd, issue_flags);
1681  }
1682  
1683  static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
1684  			       unsigned int issue_flags,
1685  			       const struct ublksrv_io_cmd *ub_cmd)
1686  {
1687  	struct ublk_device *ub = cmd->file->private_data;
1688  	struct ublk_queue *ubq;
1689  	struct ublk_io *io;
1690  	u32 cmd_op = cmd->cmd_op;
1691  	unsigned tag = ub_cmd->tag;
1692  	int ret = -EINVAL;
1693  	struct request *req;
1694  
1695  	pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
1696  			__func__, cmd->cmd_op, ub_cmd->q_id, tag,
1697  			ub_cmd->result);
1698  
1699  	if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues)
1700  		goto out;
1701  
1702  	ubq = ublk_get_queue(ub, ub_cmd->q_id);
1703  	if (!ubq || ub_cmd->q_id != ubq->q_id)
1704  		goto out;
1705  
1706  	if (ubq->ubq_daemon && ubq->ubq_daemon != current)
1707  		goto out;
1708  
1709  	if (tag >= ubq->q_depth)
1710  		goto out;
1711  
1712  	io = &ubq->ios[tag];
1713  
1714  	/* there is pending io cmd, something must be wrong */
1715  	if (io->flags & UBLK_IO_FLAG_ACTIVE) {
1716  		ret = -EBUSY;
1717  		goto out;
1718  	}
1719  
1720  	/*
1721  	 * ensure that the user issues UBLK_IO_NEED_GET_DATA
1722  	 * iff the driver has set UBLK_IO_FLAG_NEED_GET_DATA.
1723  	 */
1724  	if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA))
1725  			^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA))
1726  		goto out;
1727  
1728  	ret = ublk_check_cmd_op(cmd_op);
1729  	if (ret)
1730  		goto out;
1731  
1732  	ret = -EINVAL;
1733  	switch (_IOC_NR(cmd_op)) {
1734  	case UBLK_IO_FETCH_REQ:
1735  		/* UBLK_IO_FETCH_REQ is only allowed before queue is setup */
1736  		if (ublk_queue_ready(ubq)) {
1737  			ret = -EBUSY;
1738  			goto out;
1739  		}
1740  		/*
1741  		 * The io is being handled by server, so COMMIT_RQ is expected
1742  		 * instead of FETCH_REQ
1743  		 */
1744  		if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
1745  			goto out;
1746  
1747  		if (!ublk_support_user_copy(ubq)) {
1748  			/*
1749  			 * FETCH_RQ has to provide IO buffer if NEED GET
1750  			 * DATA is not enabled
1751  			 */
1752  			if (!ub_cmd->addr && !ublk_need_get_data(ubq))
1753  				goto out;
1754  		} else if (ub_cmd->addr) {
1755  			/* User copy requires addr to be unset */
1756  			ret = -EINVAL;
1757  			goto out;
1758  		}
1759  
1760  		ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
1761  		ublk_mark_io_ready(ub, ubq);
1762  		break;
1763  	case UBLK_IO_COMMIT_AND_FETCH_REQ:
1764  		req = blk_mq_tag_to_rq(ub->tag_set.tags[ub_cmd->q_id], tag);
1765  
1766  		if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
1767  			goto out;
1768  
1769  		if (!ublk_support_user_copy(ubq)) {
1770  			/*
1771  			 * COMMIT_AND_FETCH_REQ has to provide IO buffer if
1772  			 * NEED GET DATA is not enabled or it is Read IO.
1773  			 */
1774  			if (!ub_cmd->addr && (!ublk_need_get_data(ubq) ||
1775  						req_op(req) == REQ_OP_READ))
1776  				goto out;
1777  		} else if (req_op(req) != REQ_OP_ZONE_APPEND && ub_cmd->addr) {
1778  			/*
1779  			 * User copy requires addr to be unset when command is
1780  			 * not zone append
1781  			 */
1782  			ret = -EINVAL;
1783  			goto out;
1784  		}
1785  
1786  		ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
1787  		ublk_commit_completion(ub, ub_cmd);
1788  		break;
1789  	case UBLK_IO_NEED_GET_DATA:
1790  		if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
1791  			goto out;
1792  		ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
1793  		ublk_handle_need_get_data(ub, ub_cmd->q_id, ub_cmd->tag);
1794  		break;
1795  	default:
1796  		goto out;
1797  	}
1798  	ublk_prep_cancel(cmd, issue_flags, ubq, tag);
1799  	return -EIOCBQUEUED;
1800  
1801   out:
1802  	io_uring_cmd_done(cmd, ret, 0, issue_flags);
1803  	pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n",
1804  			__func__, cmd_op, tag, ret, io->flags);
1805  	return -EIOCBQUEUED;
1806  }
1807  
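/*
 * Look up the request backing a user-copy read/write on the char device
 * and take a reference on it, verifying that the request has started,
 * carries data, and that @offset lies within it.  The caller must drop
 * the reference with ublk_put_req_ref() when done.
 */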
1808  static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
1809  		struct ublk_queue *ubq, int tag, size_t offset)
1810  {
1811  	struct request *req;
1812  
1813  	if (!ublk_need_req_ref(ubq))
1814  		return NULL;
1815  
1816  	req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
1817  	if (!req)
1818  		return NULL;
1819  
1820  	if (!ublk_get_req_ref(ubq, req))
1821  		return NULL;
1822  
1823  	if (unlikely(!blk_mq_request_started(req) || req->tag != tag))
1824  		goto fail_put;
1825  
1826  	if (!ublk_rq_has_data(req))
1827  		goto fail_put;
1828  
1829  	if (offset > blk_rq_bytes(req))
1830  		goto fail_put;
1831  
1832  	return req;
1833  fail_put:
1834  	ublk_put_req_ref(ubq, req);
1835  	return NULL;
1836  }
1837  
1838  static inline int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
1839  		unsigned int issue_flags)
1840  {
1841  	/*
1842  	 * Not necessary for async retry, but let's keep it simple and always
1843  	 * copy the values to avoid any potential reuse.
1844  	 */
1845  	const struct ublksrv_io_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe);
1846  	const struct ublksrv_io_cmd ub_cmd = {
1847  		.q_id = READ_ONCE(ub_src->q_id),
1848  		.tag = READ_ONCE(ub_src->tag),
1849  		.result = READ_ONCE(ub_src->result),
1850  		.addr = READ_ONCE(ub_src->addr)
1851  	};
1852  
1853  	WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED);
1854  
1855  	return __ublk_ch_uring_cmd(cmd, issue_flags, &ub_cmd);
1856  }
1857  
1858  static void ublk_ch_uring_cmd_cb(struct io_uring_cmd *cmd,
1859  		unsigned int issue_flags)
1860  {
1861  	ublk_ch_uring_cmd_local(cmd, issue_flags);
1862  }
1863  
1864  static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
1865  {
1866  	if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
1867  		ublk_uring_cmd_cancel_fn(cmd, issue_flags);
1868  		return 0;
1869  	}
1870  
1871  	/* a well-implemented server won't run into the unlocked path */
1872  	if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
1873  		io_uring_cmd_complete_in_task(cmd, ublk_ch_uring_cmd_cb);
1874  		return -EIOCBQUEUED;
1875  	}
1876  
1877  	return ublk_ch_uring_cmd_local(cmd, issue_flags);
1878  }
1879  
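/*
 * The user buffer direction has to match the request direction: userspace
 * writes (ITER_SOURCE) supply data for READ/DRV_IN requests, and userspace
 * reads (ITER_DEST) fetch data from WRITE/ZONE_APPEND requests.
 */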
1880  static inline bool ublk_check_ubuf_dir(const struct request *req,
1881  		int ubuf_dir)
1882  {
1883  	/* copy ubuf to request pages */
1884  	if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) &&
1885  	    ubuf_dir == ITER_SOURCE)
1886  		return true;
1887  
1888  	/* copy request pages to ubuf */
1889  	if ((req_op(req) == REQ_OP_WRITE ||
1890  	     req_op(req) == REQ_OP_ZONE_APPEND) &&
1891  	    ubuf_dir == ITER_DEST)
1892  		return true;
1893  
1894  	return false;
1895  }
1896  
1897  static struct request *ublk_check_and_get_req(struct kiocb *iocb,
1898  		struct iov_iter *iter, size_t *off, int dir)
1899  {
1900  	struct ublk_device *ub = iocb->ki_filp->private_data;
1901  	struct ublk_queue *ubq;
1902  	struct request *req;
1903  	size_t buf_off;
1904  	u16 tag, q_id;
1905  
1906  	if (!ub)
1907  		return ERR_PTR(-EACCES);
1908  
1909  	if (!user_backed_iter(iter))
1910  		return ERR_PTR(-EACCES);
1911  
1912  	if (ub->dev_info.state == UBLK_S_DEV_DEAD)
1913  		return ERR_PTR(-EACCES);
1914  
1915  	tag = ublk_pos_to_tag(iocb->ki_pos);
1916  	q_id = ublk_pos_to_hwq(iocb->ki_pos);
1917  	buf_off = ublk_pos_to_buf_off(iocb->ki_pos);
1918  
1919  	if (q_id >= ub->dev_info.nr_hw_queues)
1920  		return ERR_PTR(-EINVAL);
1921  
1922  	ubq = ublk_get_queue(ub, q_id);
1923  	if (!ubq)
1924  		return ERR_PTR(-EINVAL);
1925  
1926  	if (tag >= ubq->q_depth)
1927  		return ERR_PTR(-EINVAL);
1928  
1929  	req = __ublk_check_and_get_req(ub, ubq, tag, buf_off);
1930  	if (!req)
1931  		return ERR_PTR(-EINVAL);
1932  
1933  	if (!req->mq_hctx || !req->mq_hctx->driver_data)
1934  		goto fail;
1935  
1936  	if (!ublk_check_ubuf_dir(req, dir))
1937  		goto fail;
1938  
1939  	*off = buf_off;
1940  	return req;
1941  fail:
1942  	ublk_put_req_ref(ubq, req);
1943  	return ERR_PTR(-EACCES);
1944  }
1945  
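/*
 * read_iter/write_iter implement the UBLK_F_USER_COPY data path: the
 * server moves request payload with pread()/pwrite() on /dev/ublkcN,
 * where the file position starts at UBLKSRV_IO_BUF_OFFSET and encodes
 * the queue id and tag (see the ublk_pos_to_*() decoders used above).
 * A minimal userspace sketch, assuming a hypothetical
 * ublk_user_copy_pos(q_id, tag) helper that mirrors that encoding:
 *
 *	off_t pos = ublk_user_copy_pos(q_id, tag);	// hypothetical helper
 *	pread(cdev_fd, buf, len, pos);		// fetch a WRITE request's payload
 *	pwrite(cdev_fd, buf, len, pos);		// return data for a READ request
 */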
1946  static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to)
1947  {
1948  	struct ublk_queue *ubq;
1949  	struct request *req;
1950  	size_t buf_off;
1951  	size_t ret;
1952  
1953  	req = ublk_check_and_get_req(iocb, to, &buf_off, ITER_DEST);
1954  	if (IS_ERR(req))
1955  		return PTR_ERR(req);
1956  
1957  	ret = ublk_copy_user_pages(req, buf_off, to, ITER_DEST);
1958  	ubq = req->mq_hctx->driver_data;
1959  	ublk_put_req_ref(ubq, req);
1960  
1961  	return ret;
1962  }
1963  
1964  static ssize_t ublk_ch_write_iter(struct kiocb *iocb, struct iov_iter *from)
1965  {
1966  	struct ublk_queue *ubq;
1967  	struct request *req;
1968  	size_t buf_off;
1969  	size_t ret;
1970  
1971  	req = ublk_check_and_get_req(iocb, from, &buf_off, ITER_SOURCE);
1972  	if (IS_ERR(req))
1973  		return PTR_ERR(req);
1974  
1975  	ret = ublk_copy_user_pages(req, buf_off, from, ITER_SOURCE);
1976  	ubq = req->mq_hctx->driver_data;
1977  	ublk_put_req_ref(ubq, req);
1978  
1979  	return ret;
1980  }
1981  
1982  static const struct file_operations ublk_ch_fops = {
1983  	.owner = THIS_MODULE,
1984  	.open = ublk_ch_open,
1985  	.release = ublk_ch_release,
1986  	.read_iter = ublk_ch_read_iter,
1987  	.write_iter = ublk_ch_write_iter,
1988  	.uring_cmd = ublk_ch_uring_cmd,
1989  	.mmap = ublk_ch_mmap,
1990  };
1991  
1992  static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
1993  {
1994  	int size = ublk_queue_cmd_buf_size(ub, q_id);
1995  	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
1996  
1997  	if (ubq->ubq_daemon)
1998  		put_task_struct(ubq->ubq_daemon);
1999  	if (ubq->io_cmd_buf)
2000  		free_pages((unsigned long)ubq->io_cmd_buf, get_order(size));
2001  }
2002  
2003  static int ublk_init_queue(struct ublk_device *ub, int q_id)
2004  {
2005  	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
2006  	gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO;
2007  	void *ptr;
2008  	int size;
2009  
2010  	spin_lock_init(&ubq->cancel_lock);
2011  	ubq->flags = ub->dev_info.flags;
2012  	ubq->q_id = q_id;
2013  	ubq->q_depth = ub->dev_info.queue_depth;
2014  	size = ublk_queue_cmd_buf_size(ub, q_id);
2015  
2016  	ptr = (void *) __get_free_pages(gfp_flags, get_order(size));
2017  	if (!ptr)
2018  		return -ENOMEM;
2019  
2020  	ubq->io_cmd_buf = ptr;
2021  	ubq->dev = ub;
2022  	return 0;
2023  }
2024  
2025  static void ublk_deinit_queues(struct ublk_device *ub)
2026  {
2027  	int nr_queues = ub->dev_info.nr_hw_queues;
2028  	int i;
2029  
2030  	if (!ub->__queues)
2031  		return;
2032  
2033  	for (i = 0; i < nr_queues; i++)
2034  		ublk_deinit_queue(ub, i);
2035  	kfree(ub->__queues);
2036  }
2037  
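/*
 * ub->__queues is a single allocation holding nr_hw_queues slots of
 * queue_size bytes each: a struct ublk_queue immediately followed by its
 * q_depth struct ublk_io entries, indexed per queue id via
 * ublk_get_queue().
 */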
2038  static int ublk_init_queues(struct ublk_device *ub)
2039  {
2040  	int nr_queues = ub->dev_info.nr_hw_queues;
2041  	int depth = ub->dev_info.queue_depth;
2042  	int ubq_size = sizeof(struct ublk_queue) + depth * sizeof(struct ublk_io);
2043  	int i, ret = -ENOMEM;
2044  
2045  	ub->queue_size = ubq_size;
2046  	ub->__queues = kcalloc(nr_queues, ubq_size, GFP_KERNEL);
2047  	if (!ub->__queues)
2048  		return ret;
2049  
2050  	for (i = 0; i < nr_queues; i++) {
2051  		if (ublk_init_queue(ub, i))
2052  			goto fail;
2053  	}
2054  
2055  	init_completion(&ub->completion);
2056  	return 0;
2057  
2058   fail:
2059  	ublk_deinit_queues(ub);
2060  	return ret;
2061  }
2062  
2063  static int ublk_alloc_dev_number(struct ublk_device *ub, int idx)
2064  {
2065  	int i = idx;
2066  	int err;
2067  
2068  	spin_lock(&ublk_idr_lock);
2069  	/* allocate id; if @idx >= 0, we're requesting that specific id */
2070  	if (i >= 0) {
2071  		err = idr_alloc(&ublk_index_idr, ub, i, i + 1, GFP_NOWAIT);
2072  		if (err == -ENOSPC)
2073  			err = -EEXIST;
2074  	} else {
2075  		err = idr_alloc(&ublk_index_idr, ub, 0, UBLK_MAX_UBLKS,
2076  				GFP_NOWAIT);
2077  	}
2078  	spin_unlock(&ublk_idr_lock);
2079  
2080  	if (err >= 0)
2081  		ub->ub_number = err;
2082  
2083  	return err;
2084  }
2085  
2086  static void ublk_free_dev_number(struct ublk_device *ub)
2087  {
2088  	spin_lock(&ublk_idr_lock);
2089  	idr_remove(&ublk_index_idr, ub->ub_number);
2090  	wake_up_all(&ublk_idr_wq);
2091  	spin_unlock(&ublk_idr_lock);
2092  }
2093  
2094  static void ublk_cdev_rel(struct device *dev)
2095  {
2096  	struct ublk_device *ub = container_of(dev, struct ublk_device, cdev_dev);
2097  
2098  	blk_mq_free_tag_set(&ub->tag_set);
2099  	ublk_deinit_queues(ub);
2100  	ublk_free_dev_number(ub);
2101  	mutex_destroy(&ub->mutex);
2102  	kfree(ub);
2103  }
2104  
2105  static int ublk_add_chdev(struct ublk_device *ub)
2106  {
2107  	struct device *dev = &ub->cdev_dev;
2108  	int minor = ub->ub_number;
2109  	int ret;
2110  
2111  	dev->parent = ublk_misc.this_device;
2112  	dev->devt = MKDEV(MAJOR(ublk_chr_devt), minor);
2113  	dev->class = &ublk_chr_class;
2114  	dev->release = ublk_cdev_rel;
2115  	device_initialize(dev);
2116  
2117  	ret = dev_set_name(dev, "ublkc%d", minor);
2118  	if (ret)
2119  		goto fail;
2120  
2121  	cdev_init(&ub->cdev, &ublk_ch_fops);
2122  	ret = cdev_device_add(&ub->cdev, dev);
2123  	if (ret)
2124  		goto fail;
2125  
2126  	ublks_added++;
2127  	return 0;
2128   fail:
2129  	put_device(dev);
2130  	return ret;
2131  }
2132  
2133  static void ublk_stop_work_fn(struct work_struct *work)
2134  {
2135  	struct ublk_device *ub =
2136  		container_of(work, struct ublk_device, stop_work);
2137  
2138  	ublk_stop_dev(ub);
2139  }
2140  
2141  /* align max io buffer size with PAGE_SIZE */
2142  static void ublk_align_max_io_size(struct ublk_device *ub)
2143  {
2144  	unsigned int max_io_bytes = ub->dev_info.max_io_buf_bytes;
2145  
2146  	ub->dev_info.max_io_buf_bytes =
2147  		round_down(max_io_bytes, PAGE_SIZE);
2148  }
2149  
2150  static int ublk_add_tag_set(struct ublk_device *ub)
2151  {
2152  	ub->tag_set.ops = &ublk_mq_ops;
2153  	ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues;
2154  	ub->tag_set.queue_depth = ub->dev_info.queue_depth;
2155  	ub->tag_set.numa_node = NUMA_NO_NODE;
2156  	ub->tag_set.cmd_size = sizeof(struct ublk_rq_data);
2157  	ub->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
2158  	ub->tag_set.driver_data = ub;
2159  	return blk_mq_alloc_tag_set(&ub->tag_set);
2160  }
2161  
2162  static void ublk_remove(struct ublk_device *ub)
2163  {
2164  	ublk_stop_dev(ub);
2165  	cancel_work_sync(&ub->stop_work);
2166  	cancel_work_sync(&ub->quiesce_work);
2167  	cdev_device_del(&ub->cdev, &ub->cdev_dev);
2168  	ublk_put_device(ub);
2169  	ublks_added--;
2170  }
2171  
2172  static struct ublk_device *ublk_get_device_from_id(int idx)
2173  {
2174  	struct ublk_device *ub = NULL;
2175  
2176  	if (idx < 0)
2177  		return NULL;
2178  
2179  	spin_lock(&ublk_idr_lock);
2180  	ub = idr_find(&ublk_index_idr, idx);
2181  	if (ub)
2182  		ub = ublk_get_device(ub);
2183  	spin_unlock(&ublk_idr_lock);
2184  
2185  	return ub;
2186  }
2187  
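/*
 * START_DEV: wait until every queue has fetched all of its io commands,
 * then build the queue limits from the validated parameters, allocate the
 * gendisk (ublkb%d) and add it.  Partition scan is suppressed unless every
 * ubq daemon is privileged.
 */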
2188  static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
2189  {
2190  	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
2191  	const struct ublk_param_basic *p = &ub->params.basic;
2192  	int ublksrv_pid = (int)header->data[0];
2193  	struct queue_limits lim = {
2194  		.logical_block_size	= 1 << p->logical_bs_shift,
2195  		.physical_block_size	= 1 << p->physical_bs_shift,
2196  		.io_min			= 1 << p->io_min_shift,
2197  		.io_opt			= 1 << p->io_opt_shift,
2198  		.max_hw_sectors		= p->max_sectors,
2199  		.chunk_sectors		= p->chunk_sectors,
2200  		.virt_boundary_mask	= p->virt_boundary_mask,
2201  		.max_segments		= USHRT_MAX,
2202  		.max_segment_size	= UINT_MAX,
2203  		.dma_alignment		= 3,
2204  	};
2205  	struct gendisk *disk;
2206  	int ret = -EINVAL;
2207  
2208  	if (ublksrv_pid <= 0)
2209  		return -EINVAL;
2210  	if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC))
2211  		return -EINVAL;
2212  
2213  	if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
2214  		const struct ublk_param_discard *pd = &ub->params.discard;
2215  
2216  		lim.discard_alignment = pd->discard_alignment;
2217  		lim.discard_granularity = pd->discard_granularity;
2218  		lim.max_hw_discard_sectors = pd->max_discard_sectors;
2219  		lim.max_write_zeroes_sectors = pd->max_write_zeroes_sectors;
2220  		lim.max_discard_segments = pd->max_discard_segments;
2221  	}
2222  
2223  	if (ub->params.types & UBLK_PARAM_TYPE_ZONED) {
2224  		const struct ublk_param_zoned *p = &ub->params.zoned;
2225  
2226  		if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
2227  			return -EOPNOTSUPP;
2228  
2229  		lim.features |= BLK_FEAT_ZONED;
2230  		lim.max_active_zones = p->max_active_zones;
2231  		lim.max_open_zones =  p->max_open_zones;
2232  		lim.max_zone_append_sectors = p->max_zone_append_sectors;
2233  	}
2234  
2235  	if (ub->params.basic.attrs & UBLK_ATTR_VOLATILE_CACHE) {
2236  		lim.features |= BLK_FEAT_WRITE_CACHE;
2237  		if (ub->params.basic.attrs & UBLK_ATTR_FUA)
2238  			lim.features |= BLK_FEAT_FUA;
2239  	}
2240  
2241  	if (ub->params.basic.attrs & UBLK_ATTR_ROTATIONAL)
2242  		lim.features |= BLK_FEAT_ROTATIONAL;
2243  
2244  	if (wait_for_completion_interruptible(&ub->completion) != 0)
2245  		return -EINTR;
2246  
2247  	mutex_lock(&ub->mutex);
2248  	if (ub->dev_info.state == UBLK_S_DEV_LIVE ||
2249  	    test_bit(UB_STATE_USED, &ub->state)) {
2250  		ret = -EEXIST;
2251  		goto out_unlock;
2252  	}
2253  
2254  	disk = blk_mq_alloc_disk(&ub->tag_set, &lim, NULL);
2255  	if (IS_ERR(disk)) {
2256  		ret = PTR_ERR(disk);
2257  		goto out_unlock;
2258  	}
2259  	sprintf(disk->disk_name, "ublkb%d", ub->ub_number);
2260  	disk->fops = &ub_fops;
2261  	disk->private_data = ub;
2262  
2263  	ub->dev_info.ublksrv_pid = ublksrv_pid;
2264  	ub->ub_disk = disk;
2265  
2266  	ublk_apply_params(ub);
2267  
2268  	/* don't probe partitions if any ubq daemon is untrusted */
2269  	if (ub->nr_privileged_daemon != ub->nr_queues_ready)
2270  		set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
2271  
2272  	ublk_get_device(ub);
2273  	ub->dev_info.state = UBLK_S_DEV_LIVE;
2274  
2275  	if (ublk_dev_is_zoned(ub)) {
2276  		ret = ublk_revalidate_disk_zones(ub);
2277  		if (ret)
2278  			goto out_put_cdev;
2279  	}
2280  
2281  	ret = add_disk(disk);
2282  	if (ret)
2283  		goto out_put_cdev;
2284  
2285  	set_bit(UB_STATE_USED, &ub->state);
2286  
2287  out_put_cdev:
2288  	if (ret) {
2289  		ub->dev_info.state = UBLK_S_DEV_DEAD;
2290  		ublk_put_device(ub);
2291  	}
2292  	if (ret)
2293  		put_disk(disk);
2294  out_unlock:
2295  	mutex_unlock(&ub->mutex);
2296  	return ret;
2297  }
2298  
2299  static int ublk_ctrl_get_queue_affinity(struct ublk_device *ub,
2300  		struct io_uring_cmd *cmd)
2301  {
2302  	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
2303  	void __user *argp = (void __user *)(unsigned long)header->addr;
2304  	cpumask_var_t cpumask;
2305  	unsigned long queue;
2306  	unsigned int retlen;
2307  	unsigned int i;
2308  	int ret;
2309  
2310  	if (header->len * BITS_PER_BYTE < nr_cpu_ids)
2311  		return -EINVAL;
2312  	if (header->len & (sizeof(unsigned long)-1))
2313  		return -EINVAL;
2314  	if (!header->addr)
2315  		return -EINVAL;
2316  
2317  	queue = header->data[0];
2318  	if (queue >= ub->dev_info.nr_hw_queues)
2319  		return -EINVAL;
2320  
2321  	if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
2322  		return -ENOMEM;
2323  
2324  	for_each_possible_cpu(i) {
2325  		if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[i] == queue)
2326  			cpumask_set_cpu(i, cpumask);
2327  	}
2328  
2329  	ret = -EFAULT;
2330  	retlen = min_t(unsigned short, header->len, cpumask_size());
2331  	if (copy_to_user(argp, cpumask, retlen))
2332  		goto out_free_cpumask;
2333  	if (retlen != header->len &&
2334  	    clear_user(argp + retlen, header->len - retlen))
2335  		goto out_free_cpumask;
2336  
2337  	ret = 0;
2338  out_free_cpumask:
2339  	free_cpumask_var(cpumask);
2340  	return ret;
2341  }
2342  
2343  static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info)
2344  {
2345  	pr_devel("%s: dev id %d flags %llx\n", __func__,
2346  			info->dev_id, info->flags);
2347  	pr_devel("\t nr_hw_queues %d queue_depth %d\n",
2348  			info->nr_hw_queues, info->queue_depth);
2349  }
2350  
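/*
 * ADD_DEV: copy ublksrv_ctrl_dev_info from userspace, sanitize the feature
 * flags, allocate a device number plus the queues and tag set, and finally
 * create the ublkc%d char device that the server will drive.
 *
 * A hedged userspace sketch of issuing this control command (variable
 * names are illustrative; the ring must be created with
 * IORING_SETUP_SQE128 so the ublksrv_ctrl_cmd payload fits into the SQE's
 * command area):
 *
 *	struct ublksrv_ctrl_dev_info info = { ... };
 *	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
 *	struct ublksrv_ctrl_cmd *c = (void *)sqe->cmd;
 *
 *	sqe->opcode = IORING_OP_URING_CMD;
 *	sqe->fd = ctrl_fd;			// /dev/ublk-control
 *	sqe->cmd_op = UBLK_U_CMD_ADD_DEV;
 *	c->dev_id = info.dev_id = -1;		// let the driver pick an id
 *	c->queue_id = -1;
 *	c->addr = (__u64)(uintptr_t)&info;
 *	c->len = sizeof(info);
 *	io_uring_submit(&ring);
 */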
2351  static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
2352  {
2353  	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
2354  	void __user *argp = (void __user *)(unsigned long)header->addr;
2355  	struct ublksrv_ctrl_dev_info info;
2356  	struct ublk_device *ub;
2357  	int ret = -EINVAL;
2358  
2359  	if (header->len < sizeof(info) || !header->addr)
2360  		return -EINVAL;
2361  	if (header->queue_id != (u16)-1) {
2362  		pr_warn("%s: queue_id is wrong %x\n",
2363  			__func__, header->queue_id);
2364  		return -EINVAL;
2365  	}
2366  
2367  	if (copy_from_user(&info, argp, sizeof(info)))
2368  		return -EFAULT;
2369  
2370  	if (capable(CAP_SYS_ADMIN))
2371  		info.flags &= ~UBLK_F_UNPRIVILEGED_DEV;
2372  	else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV))
2373  		return -EPERM;
2374  
2375  	/*
2376  	 * An unprivileged device can't be trusted, and RECOVERY and
2377  	 * RECOVERY_REISSUE may still hang error handling, so recovery
2378  	 * features can't be supported for unprivileged ublk for now.
2379  	 *
2380  	 * TODO: provide forward progress for the RECOVERY handler, so that
2381  	 * unprivileged devices can benefit from it
2382  	 */
2383  	if (info.flags & UBLK_F_UNPRIVILEGED_DEV) {
2384  		info.flags &= ~(UBLK_F_USER_RECOVERY_REISSUE |
2385  				UBLK_F_USER_RECOVERY);
2386  
2387  		/*
2388  		 * For USER_COPY, we depend on userspace to fill the request
2389  		 * buffer via pwrite() to the ublk char device, which can't be
2390  		 * used for an unprivileged device
2391  		 */
2392  		if (info.flags & UBLK_F_USER_COPY)
2393  			return -EINVAL;
2394  	}
2395  
2396  	/* the created device is always owned by current user */
2397  	ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid);
2398  
2399  	if (header->dev_id != info.dev_id) {
2400  		pr_warn("%s: dev id not match %u %u\n",
2401  			__func__, header->dev_id, info.dev_id);
2402  		return -EINVAL;
2403  	}
2404  
2405  	if (header->dev_id != U32_MAX && header->dev_id >= UBLK_MAX_UBLKS) {
2406  		pr_warn("%s: dev id is too large. Max supported is %d\n",
2407  			__func__, UBLK_MAX_UBLKS - 1);
2408  		return -EINVAL;
2409  	}
2410  
2411  	ublk_dump_dev_info(&info);
2412  
2413  	ret = mutex_lock_killable(&ublk_ctl_mutex);
2414  	if (ret)
2415  		return ret;
2416  
2417  	ret = -EACCES;
2418  	if (ublks_added >= ublks_max)
2419  		goto out_unlock;
2420  
2421  	ret = -ENOMEM;
2422  	ub = kzalloc(sizeof(*ub), GFP_KERNEL);
2423  	if (!ub)
2424  		goto out_unlock;
2425  	mutex_init(&ub->mutex);
2426  	spin_lock_init(&ub->lock);
2427  	INIT_WORK(&ub->quiesce_work, ublk_quiesce_work_fn);
2428  	INIT_WORK(&ub->stop_work, ublk_stop_work_fn);
2429  
2430  	ret = ublk_alloc_dev_number(ub, header->dev_id);
2431  	if (ret < 0)
2432  		goto out_free_ub;
2433  
2434  	memcpy(&ub->dev_info, &info, sizeof(info));
2435  
2436  	/* update device id */
2437  	ub->dev_info.dev_id = ub->ub_number;
2438  
2439  	/*
2440  	 * The 64-bit flags are copied back to userspace as the feature
2441  	 * negotiation result, so clear any flags the driver doesn't
2442  	 * support yet; userspace then gets the correct set of flags
2443  	 * (features) to handle.
2444  	 */
2445  	ub->dev_info.flags &= UBLK_F_ALL;
2446  
2447  	ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE |
2448  		UBLK_F_URING_CMD_COMP_IN_TASK;
2449  
2450  	/* GET_DATA isn't needed any more with USER_COPY */
2451  	if (ublk_dev_is_user_copy(ub))
2452  		ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
2453  
2454  	/* Zoned storage support requires user copy feature */
2455  	if (ublk_dev_is_zoned(ub) &&
2456  	    (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !ublk_dev_is_user_copy(ub))) {
2457  		ret = -EINVAL;
2458  		goto out_free_dev_number;
2459  	}
2460  
2461  	/* We are not ready to support zero copy */
2462  	ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;
2463  
2464  	ub->dev_info.nr_hw_queues = min_t(unsigned int,
2465  			ub->dev_info.nr_hw_queues, nr_cpu_ids);
2466  	ublk_align_max_io_size(ub);
2467  
2468  	ret = ublk_init_queues(ub);
2469  	if (ret)
2470  		goto out_free_dev_number;
2471  
2472  	ret = ublk_add_tag_set(ub);
2473  	if (ret)
2474  		goto out_deinit_queues;
2475  
2476  	ret = -EFAULT;
2477  	if (copy_to_user(argp, &ub->dev_info, sizeof(info)))
2478  		goto out_free_tag_set;
2479  
2480  	/*
2481  	 * Add the char dev so that ublksrv daemon can be setup.
2482  	 * ublk_add_chdev() will cleanup everything if it fails.
2483  	 */
2484  	ret = ublk_add_chdev(ub);
2485  	goto out_unlock;
2486  
2487  out_free_tag_set:
2488  	blk_mq_free_tag_set(&ub->tag_set);
2489  out_deinit_queues:
2490  	ublk_deinit_queues(ub);
2491  out_free_dev_number:
2492  	ublk_free_dev_number(ub);
2493  out_free_ub:
2494  	mutex_destroy(&ub->mutex);
2495  	kfree(ub);
2496  out_unlock:
2497  	mutex_unlock(&ublk_ctl_mutex);
2498  	return ret;
2499  }
2500  
2501  static inline bool ublk_idr_freed(int id)
2502  {
2503  	void *ptr;
2504  
2505  	spin_lock(&ublk_idr_lock);
2506  	ptr = idr_find(&ublk_index_idr, id);
2507  	spin_unlock(&ublk_idr_lock);
2508  
2509  	return ptr == NULL;
2510  }
2511  
2512  static int ublk_ctrl_del_dev(struct ublk_device **p_ub, bool wait)
2513  {
2514  	struct ublk_device *ub = *p_ub;
2515  	int idx = ub->ub_number;
2516  	int ret;
2517  
2518  	ret = mutex_lock_killable(&ublk_ctl_mutex);
2519  	if (ret)
2520  		return ret;
2521  
2522  	if (!test_bit(UB_STATE_DELETED, &ub->state)) {
2523  		ublk_remove(ub);
2524  		set_bit(UB_STATE_DELETED, &ub->state);
2525  	}
2526  
2527  	/* Mark the reference as consumed */
2528  	*p_ub = NULL;
2529  	ublk_put_device(ub);
2530  	mutex_unlock(&ublk_ctl_mutex);
2531  
2532  	/*
2533  	 * Wait until the idr entry is removed, so the index can be reused
2534  	 * after the DEL_DEV command returns.
2535  	 *
2536  	 * If we return because of a user interrupt, a future delete command
2537  	 * may come:
2538  	 *
2539  	 * - the device number isn't freed, this device won't or needn't
2540  	 *   be deleted again, since UB_STATE_DELETED is set, and device
2541  	 *   will be released after the last reference is dropped
2542  	 *
2543  	 * - the device number is freed already, we will not find this
2544  	 *   device via ublk_get_device_from_id()
2545  	 */
2546  	if (wait && wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx)))
2547  		return -EINTR;
2548  	return 0;
2549  }
2550  
2551  static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd)
2552  {
2553  	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
2554  
2555  	pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n",
2556  			__func__, cmd->cmd_op, header->dev_id, header->queue_id,
2557  			header->data[0], header->addr, header->len);
2558  }
2559  
2560  static int ublk_ctrl_stop_dev(struct ublk_device *ub)
2561  {
2562  	ublk_stop_dev(ub);
2563  	cancel_work_sync(&ub->stop_work);
2564  	cancel_work_sync(&ub->quiesce_work);
2565  
2566  	return 0;
2567  }
2568  
2569  static int ublk_ctrl_get_dev_info(struct ublk_device *ub,
2570  		struct io_uring_cmd *cmd)
2571  {
2572  	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
2573  	void __user *argp = (void __user *)(unsigned long)header->addr;
2574  
2575  	if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr)
2576  		return -EINVAL;
2577  
2578  	if (copy_to_user(argp, &ub->dev_info, sizeof(ub->dev_info)))
2579  		return -EFAULT;
2580  
2581  	return 0;
2582  }
2583  
2584  /* TYPE_DEVT is readonly, so fill it up before returning to userspace */
2585  static void ublk_ctrl_fill_params_devt(struct ublk_device *ub)
2586  {
2587  	ub->params.devt.char_major = MAJOR(ub->cdev_dev.devt);
2588  	ub->params.devt.char_minor = MINOR(ub->cdev_dev.devt);
2589  
2590  	if (ub->ub_disk) {
2591  		ub->params.devt.disk_major = MAJOR(disk_devt(ub->ub_disk));
2592  		ub->params.devt.disk_minor = MINOR(disk_devt(ub->ub_disk));
2593  	} else {
2594  		ub->params.devt.disk_major = 0;
2595  		ub->params.devt.disk_minor = 0;
2596  	}
2597  	ub->params.types |= UBLK_PARAM_TYPE_DEVT;
2598  }
2599  
2600  static int ublk_ctrl_get_params(struct ublk_device *ub,
2601  		struct io_uring_cmd *cmd)
2602  {
2603  	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
2604  	void __user *argp = (void __user *)(unsigned long)header->addr;
2605  	struct ublk_params_header ph;
2606  	int ret;
2607  
2608  	if (header->len <= sizeof(ph) || !header->addr)
2609  		return -EINVAL;
2610  
2611  	if (copy_from_user(&ph, argp, sizeof(ph)))
2612  		return -EFAULT;
2613  
2614  	if (ph.len > header->len || !ph.len)
2615  		return -EINVAL;
2616  
2617  	if (ph.len > sizeof(struct ublk_params))
2618  		ph.len = sizeof(struct ublk_params);
2619  
2620  	mutex_lock(&ub->mutex);
2621  	ublk_ctrl_fill_params_devt(ub);
2622  	if (copy_to_user(argp, &ub->params, ph.len))
2623  		ret = -EFAULT;
2624  	else
2625  		ret = 0;
2626  	mutex_unlock(&ub->mutex);
2627  
2628  	return ret;
2629  }
2630  
2631  static int ublk_ctrl_set_params(struct ublk_device *ub,
2632  		struct io_uring_cmd *cmd)
2633  {
2634  	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
2635  	void __user *argp = (void __user *)(unsigned long)header->addr;
2636  	struct ublk_params_header ph;
2637  	int ret = -EFAULT;
2638  
2639  	if (header->len <= sizeof(ph) || !header->addr)
2640  		return -EINVAL;
2641  
2642  	if (copy_from_user(&ph, argp, sizeof(ph)))
2643  		return -EFAULT;
2644  
2645  	if (ph.len > header->len || !ph.len || !ph.types)
2646  		return -EINVAL;
2647  
2648  	if (ph.len > sizeof(struct ublk_params))
2649  		ph.len = sizeof(struct ublk_params);
2650  
2651  	/* parameters can only be changed when device isn't live */
2652  	mutex_lock(&ub->mutex);
2653  	if (ub->dev_info.state == UBLK_S_DEV_LIVE) {
2654  		ret = -EACCES;
2655  	} else if (copy_from_user(&ub->params, argp, ph.len)) {
2656  		ret = -EFAULT;
2657  	} else {
2658  		/* clear all we don't support yet */
2659  		ub->params.types &= UBLK_PARAM_TYPE_ALL;
2660  		ret = ublk_validate_params(ub);
2661  		if (ret)
2662  			ub->params.types = 0;
2663  	}
2664  	mutex_unlock(&ub->mutex);
2665  
2666  	return ret;
2667  }
2668  
2669  static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
2670  {
2671  	int i;
2672  
2673  	WARN_ON_ONCE(!(ubq->ubq_daemon && ubq_daemon_is_dying(ubq)));
2674  
2675  	/* All old ioucmds have to be completed */
2676  	ubq->nr_io_ready = 0;
2677  	/* old daemon is PF_EXITING, put it now */
2678  	put_task_struct(ubq->ubq_daemon);
2679  	/* We have to reset it to NULL, otherwise ub won't accept new FETCH_REQ */
2680  	ubq->ubq_daemon = NULL;
2681  	ubq->timeout = false;
2682  	ubq->canceling = false;
2683  
2684  	for (i = 0; i < ubq->q_depth; i++) {
2685  		struct ublk_io *io = &ubq->ios[i];
2686  
2687  		/* forget everything now and be ready for new FETCH_REQ */
2688  		io->flags = 0;
2689  		io->cmd = NULL;
2690  		io->addr = 0;
2691  	}
2692  }
2693  
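/*
 * START_USER_RECOVERY: only allowed once the old daemon has exited
 * (UB_STATE_OPEN cleared) and the device is in UBLK_S_DEV_QUIESCED state;
 * reset every queue so a new daemon can re-issue FETCH_REQ for all tags.
 */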
2694  static int ublk_ctrl_start_recovery(struct ublk_device *ub,
2695  		struct io_uring_cmd *cmd)
2696  {
2697  	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
2698  	int ret = -EINVAL;
2699  	int i;
2700  
2701  	mutex_lock(&ub->mutex);
2702  	if (!ublk_can_use_recovery(ub))
2703  		goto out_unlock;
2704  	if (!ub->nr_queues_ready)
2705  		goto out_unlock;
2706  	/*
2707  	 * START_RECOVERY is only allowed after:
2708  	 *
2709  	 * (1) UB_STATE_OPEN is not set, which means the dying process has exited
2710  	 *     and the related io_uring ctx is freed, so the file struct of
2711  	 *     /dev/ublkcX is released.
2712  	 *
2713  	 * (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work:
2714  	 *     (a) has quiesced the request queue
2715  	 *     (b) has requeued every inflight rq whose io_flags is ACTIVE
2716  	 *     (c) has requeued/aborted every inflight rq whose io_flags is NOT ACTIVE
2717  	 *     (d) has completed/canceled all ioucmds owned by the dying process
2718  	 */
2719  	if (test_bit(UB_STATE_OPEN, &ub->state) ||
2720  			ub->dev_info.state != UBLK_S_DEV_QUIESCED) {
2721  		ret = -EBUSY;
2722  		goto out_unlock;
2723  	}
2724  	pr_devel("%s: start recovery for dev id %d.\n", __func__, header->dev_id);
2725  	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2726  		ublk_queue_reinit(ub, ublk_get_queue(ub, i));
2727  	/* set to NULL, otherwise new ubq_daemon cannot mmap the io_cmd_buf */
2728  	ub->mm = NULL;
2729  	ub->nr_queues_ready = 0;
2730  	ub->nr_privileged_daemon = 0;
2731  	init_completion(&ub->completion);
2732  	ret = 0;
2733   out_unlock:
2734  	mutex_unlock(&ub->mutex);
2735  	return ret;
2736  }
2737  
2738  static int ublk_ctrl_end_recovery(struct ublk_device *ub,
2739  		struct io_uring_cmd *cmd)
2740  {
2741  	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
2742  	int ublksrv_pid = (int)header->data[0];
2743  	int ret = -EINVAL;
2744  
2745  	pr_devel("%s: Waiting for new ubq_daemons(nr: %d) are ready, dev id %d...\n",
2746  			__func__, ub->dev_info.nr_hw_queues, header->dev_id);
2747  	/* wait until the new ubq_daemons have sent all FETCH_REQs */
2748  	if (wait_for_completion_interruptible(&ub->completion))
2749  		return -EINTR;
2750  
2751  	pr_devel("%s: All new ubq_daemons(nr: %d) are ready, dev id %d\n",
2752  			__func__, ub->dev_info.nr_hw_queues, header->dev_id);
2753  
2754  	mutex_lock(&ub->mutex);
2755  	if (!ublk_can_use_recovery(ub))
2756  		goto out_unlock;
2757  
2758  	if (ub->dev_info.state != UBLK_S_DEV_QUIESCED) {
2759  		ret = -EBUSY;
2760  		goto out_unlock;
2761  	}
2762  	ub->dev_info.ublksrv_pid = ublksrv_pid;
2763  	pr_devel("%s: new ublksrv_pid %d, dev id %d\n",
2764  			__func__, ublksrv_pid, header->dev_id);
2765  	blk_mq_unquiesce_queue(ub->ub_disk->queue);
2766  	pr_devel("%s: queue unquiesced, dev id %d.\n",
2767  			__func__, header->dev_id);
2768  	blk_mq_kick_requeue_list(ub->ub_disk->queue);
2769  	ub->dev_info.state = UBLK_S_DEV_LIVE;
2770  	ret = 0;
2771   out_unlock:
2772  	mutex_unlock(&ub->mutex);
2773  	return ret;
2774  }
2775  
2776  static int ublk_ctrl_get_features(struct io_uring_cmd *cmd)
2777  {
2778  	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
2779  	void __user *argp = (void __user *)(unsigned long)header->addr;
2780  	u64 features = UBLK_F_ALL & ~UBLK_F_SUPPORT_ZERO_COPY;
2781  
2782  	if (header->len != UBLK_FEATURES_LEN || !header->addr)
2783  		return -EINVAL;
2784  
2785  	if (copy_to_user(argp, &features, UBLK_FEATURES_LEN))
2786  		return -EFAULT;
2787  
2788  	return 0;
2789  }
2790  
2791  /*
2792   * All control commands are sent via /dev/ublk-control, so we have to check
2793   * the destination device's permission
2794   */
2795  static int ublk_char_dev_permission(struct ublk_device *ub,
2796  		const char *dev_path, int mask)
2797  {
2798  	int err;
2799  	struct path path;
2800  	struct kstat stat;
2801  
2802  	err = kern_path(dev_path, LOOKUP_FOLLOW, &path);
2803  	if (err)
2804  		return err;
2805  
2806  	err = vfs_getattr(&path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT);
2807  	if (err)
2808  		goto exit;
2809  
2810  	err = -EPERM;
2811  	if (stat.rdev != ub->cdev_dev.devt || !S_ISCHR(stat.mode))
2812  		goto exit;
2813  
2814  	err = inode_permission(&nop_mnt_idmap,
2815  			d_backing_inode(path.dentry), mask);
2816  exit:
2817  	path_put(&path);
2818  	return err;
2819  }
2820  
2821  static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
2822  		struct io_uring_cmd *cmd)
2823  {
2824  	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)io_uring_sqe_cmd(cmd->sqe);
2825  	bool unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
2826  	void __user *argp = (void __user *)(unsigned long)header->addr;
2827  	char *dev_path = NULL;
2828  	int ret = 0;
2829  	int mask;
2830  
2831  	if (!unprivileged) {
2832  		if (!capable(CAP_SYS_ADMIN))
2833  			return -EPERM;
2834  		/*
2835  		 * The newly added UBLK_CMD_GET_DEV_INFO2 command includes
2836  		 * char_dev_path in its payload too, since userspace may not
2837  		 * know whether the specified device was created in
2838  		 * unprivileged mode.
2839  		 */
2840  		if (_IOC_NR(cmd->cmd_op) != UBLK_CMD_GET_DEV_INFO2)
2841  			return 0;
2842  	}
2843  
2844  	/*
2845  	 * User has to provide the char device path for unprivileged ublk
2846  	 *
2847  	 * header->addr always points to the dev path buffer, and
2848  	 * header->dev_path_len records the length of the dev path buffer.
2849  	 */
2850  	if (!header->dev_path_len || header->dev_path_len > PATH_MAX)
2851  		return -EINVAL;
2852  
2853  	if (header->len < header->dev_path_len)
2854  		return -EINVAL;
2855  
2856  	dev_path = memdup_user_nul(argp, header->dev_path_len);
2857  	if (IS_ERR(dev_path))
2858  		return PTR_ERR(dev_path);
2859  
2860  	ret = -EINVAL;
2861  	switch (_IOC_NR(cmd->cmd_op)) {
2862  	case UBLK_CMD_GET_DEV_INFO:
2863  	case UBLK_CMD_GET_DEV_INFO2:
2864  	case UBLK_CMD_GET_QUEUE_AFFINITY:
2865  	case UBLK_CMD_GET_PARAMS:
2866  	case (_IOC_NR(UBLK_U_CMD_GET_FEATURES)):
2867  		mask = MAY_READ;
2868  		break;
2869  	case UBLK_CMD_START_DEV:
2870  	case UBLK_CMD_STOP_DEV:
2871  	case UBLK_CMD_ADD_DEV:
2872  	case UBLK_CMD_DEL_DEV:
2873  	case UBLK_CMD_SET_PARAMS:
2874  	case UBLK_CMD_START_USER_RECOVERY:
2875  	case UBLK_CMD_END_USER_RECOVERY:
2876  		mask = MAY_READ | MAY_WRITE;
2877  		break;
2878  	default:
2879  		goto exit;
2880  	}
2881  
2882  	ret = ublk_char_dev_permission(ub, dev_path, mask);
2883  	if (!ret) {
2884  		header->len -= header->dev_path_len;
2885  		header->addr += header->dev_path_len;
2886  	}
2887  	pr_devel("%s: dev id %d cmd_op %x uid %d gid %d path %s ret %d\n",
2888  			__func__, ub->ub_number, cmd->cmd_op,
2889  			ub->dev_info.owner_uid, ub->dev_info.owner_gid,
2890  			dev_path, ret);
2891  exit:
2892  	kfree(dev_path);
2893  	return ret;
2894  }
2895  
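/*
 * Dispatcher for /dev/ublk-control commands: requires an SQE128 ring,
 * resolves the target device from header->dev_id for everything except
 * ADD_DEV and GET_FEATURES, and enforces the per-device permission model
 * before calling the individual command handlers.
 */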
2896  static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
2897  		unsigned int issue_flags)
2898  {
2899  	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
2900  	struct ublk_device *ub = NULL;
2901  	u32 cmd_op = cmd->cmd_op;
2902  	int ret = -EINVAL;
2903  
2904  	if (issue_flags & IO_URING_F_NONBLOCK)
2905  		return -EAGAIN;
2906  
2907  	ublk_ctrl_cmd_dump(cmd);
2908  
2909  	if (!(issue_flags & IO_URING_F_SQE128))
2910  		goto out;
2911  
2912  	ret = ublk_check_cmd_op(cmd_op);
2913  	if (ret)
2914  		goto out;
2915  
2916  	if (cmd_op == UBLK_U_CMD_GET_FEATURES) {
2917  		ret = ublk_ctrl_get_features(cmd);
2918  		goto out;
2919  	}
2920  
2921  	if (_IOC_NR(cmd_op) != UBLK_CMD_ADD_DEV) {
2922  		ret = -ENODEV;
2923  		ub = ublk_get_device_from_id(header->dev_id);
2924  		if (!ub)
2925  			goto out;
2926  
2927  		ret = ublk_ctrl_uring_cmd_permission(ub, cmd);
2928  		if (ret)
2929  			goto put_dev;
2930  	}
2931  
2932  	switch (_IOC_NR(cmd_op)) {
2933  	case UBLK_CMD_START_DEV:
2934  		ret = ublk_ctrl_start_dev(ub, cmd);
2935  		break;
2936  	case UBLK_CMD_STOP_DEV:
2937  		ret = ublk_ctrl_stop_dev(ub);
2938  		break;
2939  	case UBLK_CMD_GET_DEV_INFO:
2940  	case UBLK_CMD_GET_DEV_INFO2:
2941  		ret = ublk_ctrl_get_dev_info(ub, cmd);
2942  		break;
2943  	case UBLK_CMD_ADD_DEV:
2944  		ret = ublk_ctrl_add_dev(cmd);
2945  		break;
2946  	case UBLK_CMD_DEL_DEV:
2947  		ret = ublk_ctrl_del_dev(&ub, true);
2948  		break;
2949  	case UBLK_CMD_DEL_DEV_ASYNC:
2950  		ret = ublk_ctrl_del_dev(&ub, false);
2951  		break;
2952  	case UBLK_CMD_GET_QUEUE_AFFINITY:
2953  		ret = ublk_ctrl_get_queue_affinity(ub, cmd);
2954  		break;
2955  	case UBLK_CMD_GET_PARAMS:
2956  		ret = ublk_ctrl_get_params(ub, cmd);
2957  		break;
2958  	case UBLK_CMD_SET_PARAMS:
2959  		ret = ublk_ctrl_set_params(ub, cmd);
2960  		break;
2961  	case UBLK_CMD_START_USER_RECOVERY:
2962  		ret = ublk_ctrl_start_recovery(ub, cmd);
2963  		break;
2964  	case UBLK_CMD_END_USER_RECOVERY:
2965  		ret = ublk_ctrl_end_recovery(ub, cmd);
2966  		break;
2967  	default:
2968  		ret = -ENOTSUPP;
2969  		break;
2970  	}
2971  
2972   put_dev:
2973  	if (ub)
2974  		ublk_put_device(ub);
2975   out:
2976  	io_uring_cmd_done(cmd, ret, 0, issue_flags);
2977  	pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n",
2978  			__func__, ret, cmd->cmd_op, header->dev_id, header->queue_id);
2979  	return -EIOCBQUEUED;
2980  }
2981  
2982  static const struct file_operations ublk_ctl_fops = {
2983  	.open		= nonseekable_open,
2984  	.uring_cmd      = ublk_ctrl_uring_cmd,
2985  	.owner		= THIS_MODULE,
2986  	.llseek		= noop_llseek,
2987  };
2988  
2989  static struct miscdevice ublk_misc = {
2990  	.minor		= MISC_DYNAMIC_MINOR,
2991  	.name		= "ublk-control",
2992  	.fops		= &ublk_ctl_fops,
2993  };
2994  
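/*
 * Module init: register the ublk-control misc device first, then reserve
 * the ublk-char device number region and register the char device class
 * used for the per-device ublkc%d nodes.
 */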
2995  static int __init ublk_init(void)
2996  {
2997  	int ret;
2998  
2999  	BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET +
3000  			UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET);
3001  
3002  	init_waitqueue_head(&ublk_idr_wq);
3003  
3004  	ret = misc_register(&ublk_misc);
3005  	if (ret)
3006  		return ret;
3007  
3008  	ret = alloc_chrdev_region(&ublk_chr_devt, 0, UBLK_MINORS, "ublk-char");
3009  	if (ret)
3010  		goto unregister_mis;
3011  
3012  	ret = class_register(&ublk_chr_class);
3013  	if (ret)
3014  		goto free_chrdev_region;
3015  
3016  	return 0;
3017  
3018  free_chrdev_region:
3019  	unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
3020  unregister_mis:
3021  	misc_deregister(&ublk_misc);
3022  	return ret;
3023  }
3024  
3025  static void __exit ublk_exit(void)
3026  {
3027  	struct ublk_device *ub;
3028  	int id;
3029  
3030  	idr_for_each_entry(&ublk_index_idr, ub, id)
3031  		ublk_remove(ub);
3032  
3033  	class_unregister(&ublk_chr_class);
3034  	misc_deregister(&ublk_misc);
3035  
3036  	idr_destroy(&ublk_index_idr);
3037  	unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
3038  }
3039  
3040  module_init(ublk_init);
3041  module_exit(ublk_exit);
3042  
3043  static int ublk_set_max_ublks(const char *buf, const struct kernel_param *kp)
3044  {
3045  	return param_set_uint_minmax(buf, kp, 0, UBLK_MAX_UBLKS);
3046  }
3047  
3048  static int ublk_get_max_ublks(char *buf, const struct kernel_param *kp)
3049  {
3050  	return sysfs_emit(buf, "%u\n", ublks_max);
3051  }
3052  
3053  static const struct kernel_param_ops ublk_max_ublks_ops = {
3054  	.set = ublk_set_max_ublks,
3055  	.get = ublk_get_max_ublks,
3056  };
3057  
3058  module_param_cb(ublks_max, &ublk_max_ublks_ops, &ublks_max, 0644);
3059  MODULE_PARM_DESC(ublks_max, "max number of ublk devices allowed to add(default: 64)");
3060  
3061  MODULE_AUTHOR("Ming Lei <ming.lei@redhat.com>");
3062  MODULE_DESCRIPTION("Userspace block device");
3063  MODULE_LICENSE("GPL");
3064