// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/hugetlb.h>
#include <linux/compat.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "alloc_cache.h"
#include "openclose.h"
#include "rsrc.h"
#include "memmap.h"
#include "register.h"

struct io_rsrc_update {
	struct file			*file;
	u64				arg;
	u32				nr_args;
	u32				offset;
};

static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage);

/* only define max */
#define IORING_MAX_FIXED_FILES	(1U << 20)
#define IORING_MAX_REG_BUFFERS	(1U << 14)

static const struct io_mapped_ubuf dummy_ubuf = {
	/* set invalid range, so io_import_fixed() fails meeting it */
	.ubuf = -1UL,
	.len = UINT_MAX,
};

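/*
 * Charge @nr_pages of pinned memory against the user's RLIMIT_MEMLOCK.
 * Accounting uses a lockless cmpxchg loop on ->locked_vm and fails with
 * -ENOMEM if the limit would be exceeded.
 */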
int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
	unsigned long page_limit, cur_pages, new_pages;

	if (!nr_pages)
		return 0;

	/* Don't allow more pages than we can safely lock */
	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	cur_pages = atomic_long_read(&user->locked_vm);
	do {
		new_pages = cur_pages + nr_pages;
		if (new_pages > page_limit)
			return -ENOMEM;
	} while (!atomic_long_try_cmpxchg(&user->locked_vm,
					  &cur_pages, new_pages));
	return 0;
}

static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	if (ctx->user)
		__io_unaccount_mem(ctx->user, nr_pages);

	if (ctx->mm_account)
		atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
}

static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	int ret;

	if (ctx->user) {
		ret = __io_account_mem(ctx->user, nr_pages);
		if (ret)
			return ret;
	}

	if (ctx->mm_account)
		atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);

	return 0;
}

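/*
 * Basic sanity checks on a buffer registration iovec. A NULL base with a
 * zero length is accepted (sparse entry); otherwise reject zero-length
 * buffers, buffers larger than 1G, and ranges that would wrap the address
 * space.
 */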
static int io_buffer_validate(struct iovec *iov)
{
	unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);

	/*
	 * Don't impose further limits on the size and buffer
	 * constraints here, we'll -EINVAL later when IO is
	 * submitted if they are wrong.
	 */
	if (!iov->iov_base)
		return iov->iov_len ? -EFAULT : 0;
	if (!iov->iov_len)
		return -EFAULT;

	/* arbitrary limit, but we need something */
	if (iov->iov_len > SZ_1G)
		return -EFAULT;

	if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
		return -EOVERFLOW;

	return 0;
}

static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
{
	struct io_mapped_ubuf *imu = *slot;
	unsigned int i;

	*slot = NULL;
	if (imu != &dummy_ubuf) {
		if (!refcount_dec_and_test(&imu->refs))
			return;
		for (i = 0; i < imu->nr_bvecs; i++)
			unpin_user_page(imu->bvec[i].bv_page);
		if (imu->acct_pages)
			io_unaccount_mem(ctx, imu->acct_pages);
		kvfree(imu);
	}
}

static void io_rsrc_put_work(struct io_rsrc_node *node)
{
	struct io_rsrc_put *prsrc = &node->item;

	if (prsrc->tag)
		io_post_aux_cqe(node->ctx, prsrc->tag, 0, 0);

	switch (node->type) {
	case IORING_RSRC_FILE:
		fput(prsrc->file);
		break;
	case IORING_RSRC_BUFFER:
		io_rsrc_buf_put(node->ctx, prsrc);
		break;
	default:
		WARN_ON_ONCE(1);
		break;
	}
}

void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
	if (!io_alloc_cache_put(&ctx->rsrc_node_cache, node))
		kfree(node);
}

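/*
 * A node dropped its last reference. Nodes complete strictly in the order
 * they were queued, so walk the ref list from the head and put every node
 * whose refcount has hit zero, stopping at the first one still in use.
 * Wake up quiesce waiters once the list has drained.
 */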
void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
	__must_hold(&node->ctx->uring_lock)
{
	struct io_ring_ctx *ctx = node->ctx;

	while (!list_empty(&ctx->rsrc_ref_list)) {
		node = list_first_entry(&ctx->rsrc_ref_list,
					    struct io_rsrc_node, node);
		/* recycle ref nodes in order */
		if (node->refs)
			break;
		list_del(&node->node);

		if (likely(!node->empty))
			io_rsrc_put_work(node);
		io_rsrc_node_destroy(ctx, node);
	}
	if (list_empty(&ctx->rsrc_ref_list) && unlikely(ctx->rsrc_quiesce))
		wake_up_all(&ctx->rsrc_quiesce_wq);
}

struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
{
	struct io_rsrc_node *ref_node;

	ref_node = io_alloc_cache_get(&ctx->rsrc_node_cache);
	if (!ref_node) {
		ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
		if (!ref_node)
			return NULL;
	}

	ref_node->ctx = ctx;
	ref_node->empty = 0;
	ref_node->refs = 1;
	return ref_node;
}

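/*
 * Wait for all in-flight rsrc nodes to be dropped. The current node is
 * retired as an empty marker and replaced with a fresh one, then we sleep
 * on rsrc_quiesce_wq (dropping ->uring_lock while waiting) until the ref
 * list is empty or task_work/signal handling returns an error.
 */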
__cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
				      struct io_ring_ctx *ctx)
{
	struct io_rsrc_node *backup;
	DEFINE_WAIT(we);
	int ret;

	/* As we may drop ->uring_lock, another task may have started quiesce */
	if (data->quiesce)
		return -ENXIO;

	backup = io_rsrc_node_alloc(ctx);
	if (!backup)
		return -ENOMEM;
	ctx->rsrc_node->empty = true;
	ctx->rsrc_node->type = -1;
	list_add_tail(&ctx->rsrc_node->node, &ctx->rsrc_ref_list);
	io_put_rsrc_node(ctx, ctx->rsrc_node);
	ctx->rsrc_node = backup;

	if (list_empty(&ctx->rsrc_ref_list))
		return 0;

	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
		atomic_set(&ctx->cq_wait_nr, 1);
		smp_mb();
	}

	ctx->rsrc_quiesce++;
	data->quiesce = true;
	do {
		prepare_to_wait(&ctx->rsrc_quiesce_wq, &we, TASK_INTERRUPTIBLE);
		mutex_unlock(&ctx->uring_lock);

		ret = io_run_task_work_sig(ctx);
		if (ret < 0) {
			finish_wait(&ctx->rsrc_quiesce_wq, &we);
			mutex_lock(&ctx->uring_lock);
			if (list_empty(&ctx->rsrc_ref_list))
				ret = 0;
			break;
		}

		schedule();
		mutex_lock(&ctx->uring_lock);
		ret = 0;
	} while (!list_empty(&ctx->rsrc_ref_list));

	finish_wait(&ctx->rsrc_quiesce_wq, &we);
	data->quiesce = false;
	ctx->rsrc_quiesce--;

	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
		atomic_set(&ctx->cq_wait_nr, 0);
		smp_mb();
	}
	return ret;
}

static void io_free_page_table(void **table, size_t size)
{
	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);

	for (i = 0; i < nr_tables; i++)
		kfree(table[i]);
	kfree(table);
}

static void io_rsrc_data_free(struct io_rsrc_data *data)
{
	size_t size = data->nr * sizeof(data->tags[0][0]);

	if (data->tags)
		io_free_page_table((void **)data->tags, size);
	kfree(data);
}

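/*
 * Allocate a table of @size bytes as an array of page sized chunks, so
 * that large tag tables don't require high order allocations.
 */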
static __cold void **io_alloc_page_table(size_t size)
{
	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
	size_t init_size = size;
	void **table;

	table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
	if (!table)
		return NULL;

	for (i = 0; i < nr_tables; i++) {
		unsigned int this_size = min_t(size_t, size, PAGE_SIZE);

		table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
		if (!table[i]) {
			io_free_page_table(table, init_size);
			return NULL;
		}
		size -= this_size;
	}
	return table;
}

__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, int type,
				     u64 __user *utags,
				     unsigned nr, struct io_rsrc_data **pdata)
{
	struct io_rsrc_data *data;
	int ret = 0;
	unsigned i;

	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return -ENOMEM;
	data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
	if (!data->tags) {
		kfree(data);
		return -ENOMEM;
	}

	data->nr = nr;
	data->ctx = ctx;
	data->rsrc_type = type;
	if (utags) {
		ret = -EFAULT;
		for (i = 0; i < nr; i++) {
			u64 *tag_slot = io_get_tag_slot(data, i);

			if (copy_from_user(tag_slot, &utags[i],
					   sizeof(*tag_slot)))
				goto fail;
		}
	}
	*pdata = data;
	return 0;
fail:
	io_rsrc_data_free(data);
	return ret;
}

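/*
 * Apply a registered-file update: for each entry, queue removal of the
 * file currently occupying the slot (if any), then install the new fd.
 * An fd of -1 leaves the slot empty, IORING_REGISTER_FILES_SKIP leaves it
 * untouched. Returns the number of entries processed, or an error if none
 * were.
 */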
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_rsrc_update2 *up,
				 unsigned nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	__s32 __user *fds = u64_to_user_ptr(up->data);
	struct io_rsrc_data *data = ctx->file_data;
	struct io_fixed_file *file_slot;
	int fd, i, err = 0;
	unsigned int done;

	if (!ctx->file_data)
		return -ENXIO;
	if (up->offset + nr_args > ctx->nr_user_files)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		u64 tag = 0;

		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
			err = -EFAULT;
			break;
		}
		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
			err = -EINVAL;
			break;
		}
		if (fd == IORING_REGISTER_FILES_SKIP)
			continue;

		i = array_index_nospec(up->offset + done, ctx->nr_user_files);
		file_slot = io_fixed_file_slot(&ctx->file_table, i);

		if (file_slot->file_ptr) {
			err = io_queue_rsrc_removal(data, i,
						    io_slot_file(file_slot));
			if (err)
				break;
			file_slot->file_ptr = 0;
			io_file_bitmap_clear(&ctx->file_table, i);
		}
		if (fd != -1) {
			struct file *file = fget(fd);

			if (!file) {
				err = -EBADF;
				break;
			}
			/*
			 * Don't allow io_uring instances to be registered.
			 */
			if (io_is_uring_fops(file)) {
				fput(file);
				err = -EBADF;
				break;
			}
			*io_get_tag_slot(data, i) = tag;
			io_fixed_file_set(file_slot, file);
			io_file_bitmap_set(&ctx->file_table, i);
		}
	}
	return done ? done : err;
}

static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
				   struct io_uring_rsrc_update2 *up,
				   unsigned int nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	struct iovec fast_iov, *iov;
	struct page *last_hpage = NULL;
	struct iovec __user *uvec;
	u64 user_data = up->data;
	__u32 done;
	int i, err;

	if (!ctx->buf_data)
		return -ENXIO;
	if (up->offset + nr_args > ctx->nr_user_bufs)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		struct io_mapped_ubuf *imu;
		u64 tag = 0;

		uvec = u64_to_user_ptr(user_data);
		iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
		if (IS_ERR(iov)) {
			err = PTR_ERR(iov);
			break;
		}
		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
			err = -EFAULT;
			break;
		}
		err = io_buffer_validate(iov);
		if (err)
			break;
		if (!iov->iov_base && tag) {
			err = -EINVAL;
			break;
		}
		err = io_sqe_buffer_register(ctx, iov, &imu, &last_hpage);
		if (err)
			break;

		i = array_index_nospec(up->offset + done, ctx->nr_user_bufs);
		if (ctx->user_bufs[i] != &dummy_ubuf) {
			err = io_queue_rsrc_removal(ctx->buf_data, i,
						    ctx->user_bufs[i]);
			if (unlikely(err)) {
				io_buffer_unmap(ctx, &imu);
				break;
			}
			ctx->user_bufs[i] = (struct io_mapped_ubuf *)&dummy_ubuf;
		}

		ctx->user_bufs[i] = imu;
		*io_get_tag_slot(ctx->buf_data, i) = tag;
		if (ctx->compat)
			user_data += sizeof(struct compat_iovec);
		else
			user_data += sizeof(struct iovec);
	}
	return done ? done : err;
}

static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,
				     unsigned nr_args)
{
	__u32 tmp;

	lockdep_assert_held(&ctx->uring_lock);

	if (check_add_overflow(up->offset, nr_args, &tmp))
		return -EOVERFLOW;

	switch (type) {
	case IORING_RSRC_FILE:
		return __io_sqe_files_update(ctx, up, nr_args);
	case IORING_RSRC_BUFFER:
		return __io_sqe_buffers_update(ctx, up, nr_args);
	}
	return -EINVAL;
}

int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
			     unsigned nr_args)
{
	struct io_uring_rsrc_update2 up;

	if (!nr_args)
		return -EINVAL;
	memset(&up, 0, sizeof(up));
	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
		return -EFAULT;
	if (up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}

int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned size, unsigned type)
{
	struct io_uring_rsrc_update2 up;

	if (size != sizeof(up))
		return -EINVAL;
	if (copy_from_user(&up, arg, sizeof(up)))
		return -EFAULT;
	if (!up.nr || up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, type, &up, up.nr);
}

__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int size, unsigned int type)
{
	struct io_uring_rsrc_register rr;

	/* keep it extendible */
	if (size != sizeof(rr))
		return -EINVAL;

	memset(&rr, 0, sizeof(rr));
	if (copy_from_user(&rr, arg, size))
		return -EFAULT;
	if (!rr.nr || rr.resv2)
		return -EINVAL;
	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
		return -EINVAL;

	switch (type) {
	case IORING_RSRC_FILE:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
					     rr.nr, u64_to_user_ptr(rr.tags));
	case IORING_RSRC_BUFFER:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
					       rr.nr, u64_to_user_ptr(rr.tags));
	}
	return -EINVAL;
}

int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);

	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
		return -EINVAL;
	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	up->offset = READ_ONCE(sqe->off);
	up->nr_args = READ_ONCE(sqe->len);
	if (!up->nr_args)
		return -EINVAL;
	up->arg = READ_ONCE(sqe->addr);
	return 0;
}

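/*
 * IORING_FILE_INDEX_ALLOC variant of the file update: install each fd
 * into a free fixed-file slot and copy the allocated index back to
 * userspace. If that copy fails, the just-installed slot is closed again.
 */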
static int io_files_update_with_index_alloc(struct io_kiocb *req,
					    unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	__s32 __user *fds = u64_to_user_ptr(up->arg);
	unsigned int done;
	struct file *file;
	int ret, fd;

	if (!req->ctx->file_data)
		return -ENXIO;

	for (done = 0; done < up->nr_args; done++) {
		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
			ret = -EFAULT;
			break;
		}

		file = fget(fd);
		if (!file) {
			ret = -EBADF;
			break;
		}
		ret = io_fixed_fd_install(req, issue_flags, file,
					  IORING_FILE_INDEX_ALLOC);
		if (ret < 0)
			break;
		if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
			__io_close_fixed(req->ctx, issue_flags, ret);
			ret = -EFAULT;
			break;
		}
	}

	if (done)
		return done;
	return ret;
}

int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_uring_rsrc_update2 up2;
	int ret;

	up2.offset = up->offset;
	up2.data = up->arg;
	up2.nr = 0;
	up2.tags = 0;
	up2.resv = 0;
	up2.resv2 = 0;

	if (up->offset == IORING_FILE_INDEX_ALLOC) {
		ret = io_files_update_with_index_alloc(req, issue_flags);
	} else {
		io_ring_submit_lock(ctx, issue_flags);
		ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
						&up2, up->nr_args);
		io_ring_submit_unlock(ctx, issue_flags);
	}

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

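/*
 * Queue a resource for deferred put. The current rsrc node takes
 * ownership of @rsrc and its tag and is moved to the ref list, and a
 * fresh node is installed as ctx->rsrc_node for subsequent requests.
 */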
int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc)
{
	struct io_ring_ctx *ctx = data->ctx;
	struct io_rsrc_node *node = ctx->rsrc_node;
	u64 *tag_slot = io_get_tag_slot(data, idx);

	ctx->rsrc_node = io_rsrc_node_alloc(ctx);
	if (unlikely(!ctx->rsrc_node)) {
		ctx->rsrc_node = node;
		return -ENOMEM;
	}

	node->item.rsrc = rsrc;
	node->type = data->rsrc_type;
	node->item.tag = *tag_slot;
	*tag_slot = 0;
	list_add_tail(&node->node, &ctx->rsrc_ref_list);
	io_put_rsrc_node(ctx, node);
	return 0;
}

void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	int i;

	for (i = 0; i < ctx->nr_user_files; i++) {
		struct file *file = io_file_from_index(&ctx->file_table, i);

		if (!file)
			continue;
		io_file_bitmap_clear(&ctx->file_table, i);
		fput(file);
	}

	io_free_file_tables(&ctx->file_table);
	io_file_table_set_alloc_range(ctx, 0, 0);
	io_rsrc_data_free(ctx->file_data);
	ctx->file_data = NULL;
	ctx->nr_user_files = 0;
}

int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	unsigned nr = ctx->nr_user_files;
	int ret;

	if (!ctx->file_data)
		return -ENXIO;

	/*
	 * Quiesce may unlock ->uring_lock, and while it's not held
	 * prevent new requests using the table.
	 */
	ctx->nr_user_files = 0;
	ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
	ctx->nr_user_files = nr;
	if (!ret)
		__io_sqe_files_unregister(ctx);
	return ret;
}

int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
			  unsigned nr_args, u64 __user *tags)
{
	__s32 __user *fds = (__s32 __user *) arg;
	struct file *file;
	int fd, ret;
	unsigned i;

	if (ctx->file_data)
		return -EBUSY;
	if (!nr_args)
		return -EINVAL;
	if (nr_args > IORING_MAX_FIXED_FILES)
		return -EMFILE;
	if (nr_args > rlimit(RLIMIT_NOFILE))
		return -EMFILE;
	ret = io_rsrc_data_alloc(ctx, IORING_RSRC_FILE, tags, nr_args,
				 &ctx->file_data);
	if (ret)
		return ret;

	if (!io_alloc_file_tables(&ctx->file_table, nr_args)) {
		io_rsrc_data_free(ctx->file_data);
		ctx->file_data = NULL;
		return -ENOMEM;
	}

	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
		struct io_fixed_file *file_slot;

		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) {
			ret = -EFAULT;
			goto fail;
		}
		/* allow sparse sets */
		if (!fds || fd == -1) {
			ret = -EINVAL;
			if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
				goto fail;
			continue;
		}

		file = fget(fd);
		ret = -EBADF;
		if (unlikely(!file))
			goto fail;

		/*
		 * Don't allow io_uring instances to be registered.
		 */
		if (io_is_uring_fops(file)) {
			fput(file);
			goto fail;
		}
		file_slot = io_fixed_file_slot(&ctx->file_table, i);
		io_fixed_file_set(file_slot, file);
		io_file_bitmap_set(&ctx->file_table, i);
	}

	/* default it to the whole table */
	io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files);
	return 0;
fail:
	__io_sqe_files_unregister(ctx);
	return ret;
}

static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
	io_buffer_unmap(ctx, &prsrc->buf);
	prsrc->buf = NULL;
}

void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	unsigned int i;

	for (i = 0; i < ctx->nr_user_bufs; i++)
		io_buffer_unmap(ctx, &ctx->user_bufs[i]);
	kfree(ctx->user_bufs);
	io_rsrc_data_free(ctx->buf_data);
	ctx->user_bufs = NULL;
	ctx->buf_data = NULL;
	ctx->nr_user_bufs = 0;
}

int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	unsigned nr = ctx->nr_user_bufs;
	int ret;

	if (!ctx->buf_data)
		return -ENXIO;

	/*
	 * Quiesce may unlock ->uring_lock, and while it's not held
	 * prevent new requests using the table.
	 */
	ctx->nr_user_bufs = 0;
	ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
	ctx->nr_user_bufs = nr;
	if (!ret)
		__io_sqe_buffers_unregister(ctx);
	return ret;
}

/*
 * Not super efficient, but this only happens at registration time. And we
 * do cache the last compound head, so generally we'll only do a full
 * search if we don't match that one.
 *
 * We check if the given compound head page has already been accounted, to
 * avoid double accounting it. This allows us to account the full size of
 * the page, not just the constituent pages of a huge page.
 */
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
				  int nr_pages, struct page *hpage)
{
	int i, j;

	/* check current page array */
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i]))
			continue;
		if (compound_head(pages[i]) == hpage)
			return true;
	}

	/* check previously registered pages */
	for (i = 0; i < ctx->nr_user_bufs; i++) {
		struct io_mapped_ubuf *imu = ctx->user_bufs[i];

		for (j = 0; j < imu->nr_bvecs; j++) {
			if (!PageCompound(imu->bvec[j].bv_page))
				continue;
			if (compound_head(imu->bvec[j].bv_page) == hpage)
				return true;
		}
	}

	return false;
}

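/*
 * Figure out how many pages to account for this buffer. Normal pages are
 * counted individually, while a compound (huge) page is accounted in full
 * exactly once, and only if neither this buffer nor a previously
 * registered one has accounted it already.
 */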
static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
				 int nr_pages, struct io_mapped_ubuf *imu,
				 struct page **last_hpage)
{
	int i, ret;

	imu->acct_pages = 0;
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i])) {
			imu->acct_pages++;
		} else {
			struct page *hpage;

			hpage = compound_head(pages[i]);
			if (hpage == *last_hpage)
				continue;
			*last_hpage = hpage;
			if (headpage_already_acct(ctx, pages, i, hpage))
				continue;
			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
		}
	}

	if (!imu->acct_pages)
		return 0;

	ret = io_account_mem(ctx, imu->acct_pages);
	if (ret)
		imu->acct_pages = 0;
	return ret;
}

static bool io_do_coalesce_buffer(struct page ***pages, int *nr_pages,
				struct io_imu_folio_data *data, int nr_folios)
{
	struct page **page_array = *pages, **new_array = NULL;
	int nr_pages_left = *nr_pages, i, j;

	/* Store head pages only */
	new_array = kvmalloc_array(nr_folios, sizeof(struct page *),
					GFP_KERNEL);
	if (!new_array)
		return false;

	new_array[0] = compound_head(page_array[0]);
	/*
	 * The pages are bound to the folio, it doesn't
	 * actually unpin them but drops all but one reference,
	 * which is usually put down by io_buffer_unmap().
	 * Note, needs a better helper.
	 */
	if (data->nr_pages_head > 1)
		unpin_user_pages(&page_array[1], data->nr_pages_head - 1);

	j = data->nr_pages_head;
	nr_pages_left -= data->nr_pages_head;
	for (i = 1; i < nr_folios; i++) {
		unsigned int nr_unpin;

		new_array[i] = page_array[j];
		nr_unpin = min_t(unsigned int, nr_pages_left - 1,
					data->nr_pages_mid - 1);
		if (nr_unpin)
			unpin_user_pages(&page_array[j+1], nr_unpin);
		j += data->nr_pages_mid;
		nr_pages_left -= data->nr_pages_mid;
	}
	kvfree(page_array);
	*pages = new_array;
	*nr_pages = nr_folios;
	return true;
}

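/*
 * See if the pinned pages can be collapsed into one entry per folio. That
 * requires the pages to be consecutive within each folio and, apart from
 * the head and tail, every folio to be fully covered and start at page
 * index 0. On success the page array is replaced with one head page per
 * folio.
 */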
static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages,
					 struct io_imu_folio_data *data)
{
	struct page **page_array = *pages;
	struct folio *folio = page_folio(page_array[0]);
	unsigned int count = 1, nr_folios = 1;
	int i;

	if (*nr_pages <= 1)
		return false;

	data->nr_pages_mid = folio_nr_pages(folio);
	if (data->nr_pages_mid == 1)
		return false;

	data->folio_shift = folio_shift(folio);
	/*
	 * Check if pages are contiguous inside a folio, and all folios have
	 * the same page count except for the head and tail.
	 */
	for (i = 1; i < *nr_pages; i++) {
		if (page_folio(page_array[i]) == folio &&
			page_array[i] == page_array[i-1] + 1) {
			count++;
			continue;
		}

		if (nr_folios == 1) {
			if (folio_page_idx(folio, page_array[i-1]) !=
				data->nr_pages_mid - 1)
				return false;

			data->nr_pages_head = count;
		} else if (count != data->nr_pages_mid) {
			return false;
		}

		folio = page_folio(page_array[i]);
		if (folio_size(folio) != (1UL << data->folio_shift) ||
			folio_page_idx(folio, page_array[i]) != 0)
			return false;

		count = 1;
		nr_folios++;
	}
	if (nr_folios == 1)
		data->nr_pages_head = count;

	return io_do_coalesce_buffer(pages, nr_pages, data, nr_folios);
}

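/*
 * Register a single user buffer: pin its pages, try to coalesce them into
 * folio-sized segments, account the pinned memory and build the bvec
 * array used by io_import_fixed(). A NULL iov_base registers the dummy
 * (sparse) entry.
 */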
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage)
{
	struct io_mapped_ubuf *imu = NULL;
	struct page **pages = NULL;
	unsigned long off;
	size_t size;
	int ret, nr_pages, i;
	struct io_imu_folio_data data;
	bool coalesced;

	*pimu = (struct io_mapped_ubuf *)&dummy_ubuf;
	if (!iov->iov_base)
		return 0;

	ret = -ENOMEM;
	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
				&nr_pages);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		pages = NULL;
		goto done;
	}

	/* If it's huge page(s), try to coalesce them into fewer bvec entries */
	coalesced = io_try_coalesce_buffer(&pages, &nr_pages, &data);

	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
	if (!imu)
		goto done;

	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
	if (ret) {
		unpin_user_pages(pages, nr_pages);
		goto done;
	}

	size = iov->iov_len;
	/* store original address for later verification */
	imu->ubuf = (unsigned long) iov->iov_base;
	imu->len = iov->iov_len;
	imu->nr_bvecs = nr_pages;
	imu->folio_shift = PAGE_SHIFT;
	if (coalesced)
		imu->folio_shift = data.folio_shift;
	refcount_set(&imu->refs, 1);
	off = (unsigned long) iov->iov_base & ((1UL << imu->folio_shift) - 1);
	*pimu = imu;
	ret = 0;

	for (i = 0; i < nr_pages; i++) {
		size_t vec_len;

		vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off);
		bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
		off = 0;
		size -= vec_len;
	}
done:
	if (ret)
		kvfree(imu);
	kvfree(pages);
	return ret;
}

static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
{
	ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
	return ctx->user_bufs ? 0 : -ENOMEM;
}

int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int nr_args, u64 __user *tags)
{
	struct page *last_hpage = NULL;
	struct io_rsrc_data *data;
	struct iovec fast_iov, *iov = &fast_iov;
	const struct iovec __user *uvec;
	int i, ret;

	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

	if (ctx->user_bufs)
		return -EBUSY;
	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, tags, nr_args, &data);
	if (ret)
		return ret;
	ret = io_buffers_map_alloc(ctx, nr_args);
	if (ret) {
		io_rsrc_data_free(data);
		return ret;
	}

	if (!arg)
		memset(iov, 0, sizeof(*iov));

	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
		if (arg) {
			uvec = (struct iovec __user *) arg;
			iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
			if (IS_ERR(iov)) {
				ret = PTR_ERR(iov);
				break;
			}
			ret = io_buffer_validate(iov);
			if (ret)
				break;
			if (ctx->compat)
				arg += sizeof(struct compat_iovec);
			else
				arg += sizeof(struct iovec);
		}

		if (!iov->iov_base && *io_get_tag_slot(data, i)) {
			ret = -EINVAL;
			break;
		}

		ret = io_sqe_buffer_register(ctx, iov, &ctx->user_bufs[i],
					     &last_hpage);
		if (ret)
			break;
	}

	WARN_ON_ONCE(ctx->buf_data);

	ctx->buf_data = data;
	if (ret)
		__io_sqe_buffers_unregister(ctx);
	return ret;
}

int io_import_fixed(int ddir, struct iov_iter *iter,
			   struct io_mapped_ubuf *imu,
			   u64 buf_addr, size_t len)
{
	u64 buf_end;
	size_t offset;

	if (WARN_ON_ONCE(!imu))
		return -EFAULT;
	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
		return -EFAULT;
	/* not inside the mapped region */
	if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))
		return -EFAULT;

	/*
	 * Might not be a start of buffer, set size appropriately
	 * and advance us to the beginning.
	 */
	offset = buf_addr - imu->ubuf;
	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);

	if (offset) {
		/*
		 * Don't use iov_iter_advance() here, as it's really slow for
		 * using the latter parts of a big fixed buffer - it iterates
		 * over each segment manually. We can cheat a bit here, because
		 * we know that:
		 *
		 * 1) it's a BVEC iter, we set it up
		 * 2) all bvecs are the same in size, except potentially the
		 *    first and last bvec
		 *
		 * So just find our index, and adjust the iterator afterwards.
		 * If the offset is within the first bvec (or is the whole
		 * first bvec), just use iov_iter_advance(). This makes it easier
		 * since we can just skip the first segment, which may not
		 * be folio_size aligned.
		 */
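		/*
		 * Example (assuming a page-aligned buffer with 4K folios):
		 * for offset == 6K the first bvec (4K) is skipped, leaving
		 * offset == 2K, so seg_skip == 1 and iteration starts at
		 * bvec[1] with iov_offset == 2K.
		 */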
		const struct bio_vec *bvec = imu->bvec;

		if (offset < bvec->bv_len) {
			iter->bvec = bvec;
			iter->count -= offset;
			iter->iov_offset = offset;
		} else {
			unsigned long seg_skip;

			/* skip first vec */
			offset -= bvec->bv_len;
			seg_skip = 1 + (offset >> imu->folio_shift);

			iter->bvec = bvec + seg_skip;
			iter->nr_segs -= seg_skip;
			iter->count -= bvec->bv_len + offset;
			iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1);
		}
	}

	return 0;
}

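/*
 * Take a reference on every buffer registered in the source ring and
 * install the resulting table in @ctx. The destination's ->uring_lock is
 * dropped while the source ring is locked, so the destination is
 * re-checked for concurrently registered buffers before committing.
 */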
static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx)
{
	struct io_mapped_ubuf **user_bufs;
	struct io_rsrc_data *data;
	int i, ret, nbufs;

	/*
	 * Drop our own lock here. We'll setup the data we need and reference
	 * the source buffers, then re-grab, check, and assign at the end.
	 */
	mutex_unlock(&ctx->uring_lock);

	mutex_lock(&src_ctx->uring_lock);
	ret = -ENXIO;
	nbufs = src_ctx->nr_user_bufs;
	if (!nbufs)
		goto out_unlock;
	ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, NULL, nbufs, &data);
	if (ret)
		goto out_unlock;

	ret = -ENOMEM;
	user_bufs = kcalloc(nbufs, sizeof(*ctx->user_bufs), GFP_KERNEL);
	if (!user_bufs)
		goto out_free_data;

	for (i = 0; i < nbufs; i++) {
		struct io_mapped_ubuf *src = src_ctx->user_bufs[i];

		if (src != &dummy_ubuf)
			refcount_inc(&src->refs);
		user_bufs[i] = src;
	}

	/* Have a ref on the bufs now, drop src lock and re-grab our own lock */
	mutex_unlock(&src_ctx->uring_lock);
	mutex_lock(&ctx->uring_lock);
	if (!ctx->user_bufs) {
		ctx->user_bufs = user_bufs;
		ctx->buf_data = data;
		ctx->nr_user_bufs = nbufs;
		return 0;
	}

	/* someone raced setting up buffers, dump ours */
	for (i = 0; i < nbufs; i++)
		io_buffer_unmap(ctx, &user_bufs[i]);
	io_rsrc_data_free(data);
	kfree(user_bufs);
	return -EBUSY;
out_free_data:
	io_rsrc_data_free(data);
out_unlock:
	mutex_unlock(&src_ctx->uring_lock);
	mutex_lock(&ctx->uring_lock);
	return ret;
}

/*
 * Copy the registered buffers from the source ring whose file descriptor
 * is given in the src_fd to the current ring. This is identical to registering
 * the buffers with ctx, except faster as mappings already exist.
 *
 * Since the memory is already accounted once, don't account it again.
 */
int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_clone_buffers buf;
	bool registered_src;
	struct file *file;
	int ret;

	if (ctx->user_bufs || ctx->nr_user_bufs)
		return -EBUSY;
	if (copy_from_user(&buf, arg, sizeof(buf)))
		return -EFAULT;
	if (buf.flags & ~IORING_REGISTER_SRC_REGISTERED)
		return -EINVAL;
	if (memchr_inv(buf.pad, 0, sizeof(buf.pad)))
		return -EINVAL;

	registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0;
	file = io_uring_register_get_file(buf.src_fd, registered_src);
	if (IS_ERR(file))
		return PTR_ERR(file);
	ret = io_clone_buffers(ctx, file->private_data);
	if (!registered_src)
		fput(file);
	return ret;
}