// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
 *
 * The io_pagetable is the top of the data structure that maps IOVAs to PFNs.
 * The PFNs can be placed into an iommu_domain, or returned to the caller as a
 * page list for access by an in-kernel user.
 *
 * The data structure uses the iopt_pages to optimize the storage of the PFNs
 * between the domains and the xarray.
 */
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/iommu.h>
#include <linux/iommufd.h>
#include <linux/lockdep.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <uapi/linux/iommufd.h>

#include "double_span.h"
#include "io_pagetable.h"

struct iopt_pages_list {
	struct iopt_pages *pages;
	struct iopt_area *area;
	struct list_head next;
	unsigned long start_byte;
	unsigned long length;
};

struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter,
					struct io_pagetable *iopt,
					unsigned long iova,
					unsigned long last_iova)
{
	lockdep_assert_held(&iopt->iova_rwsem);

	iter->cur_iova = iova;
	iter->last_iova = last_iova;
	iter->area = iopt_area_iter_first(iopt, iova, iova);
	if (!iter->area)
		return NULL;
	if (!iter->area->pages) {
		iter->area = NULL;
		return NULL;
	}
	return iter->area;
}

struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter)
{
	unsigned long last_iova;

	if (!iter->area)
		return NULL;
	last_iova = iopt_area_last_iova(iter->area);
	if (iter->last_iova <= last_iova)
		return NULL;

	iter->cur_iova = last_iova + 1;
	iter->area = iopt_area_iter_next(iter->area, iter->cur_iova,
					 iter->last_iova);
	if (!iter->area)
		return NULL;
	if (iter->cur_iova != iopt_area_iova(iter->area) ||
	    !iter->area->pages) {
		iter->area = NULL;
		return NULL;
	}
	return iter->area;
}
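
/*
 * Example (editor's sketch, not part of the driver): these two helpers back
 * the iopt_for_each_contig_area() iteration used further down in this file.
 * A caller that must visit every area covering [iova, last_iova] while
 * holding the iova_rwsem typically looks like:
 *
 *	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
 *		unsigned long last = min(last_iova,
 *					 iopt_area_last_iova(area));
 *
 *		(act on the slice [iter.cur_iova, last] of this area)
 *	}
 *	if (!iopt_area_contig_done(&iter))
 *		(fail: the requested range had a hole)
 */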

static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span,
				    unsigned long length,
				    unsigned long iova_alignment,
				    unsigned long page_offset)
{
	if (span->is_used || span->last_hole - span->start_hole < length - 1)
		return false;

	span->start_hole = ALIGN(span->start_hole, iova_alignment) |
			   page_offset;
	if (span->start_hole > span->last_hole ||
	    span->last_hole - span->start_hole < length - 1)
		return false;
	return true;
}

static bool __alloc_iova_check_used(struct interval_tree_span_iter *span,
				    unsigned long length,
				    unsigned long iova_alignment,
				    unsigned long page_offset)
{
	if (span->is_hole || span->last_used - span->start_used < length - 1)
		return false;

	span->start_used = ALIGN(span->start_used, iova_alignment) |
			   page_offset;
	if (span->start_used > span->last_used ||
	    span->last_used - span->start_used < length - 1)
		return false;
	return true;
}

/*
 * Automatically find a block of IOVA that is not being used and not reserved.
 * Does not return a 0 IOVA even if it is valid.
 */
static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
			   unsigned long uptr, unsigned long length)
{
	unsigned long page_offset = uptr % PAGE_SIZE;
	struct interval_tree_double_span_iter used_span;
	struct interval_tree_span_iter allowed_span;
	unsigned long max_alignment = PAGE_SIZE;
	unsigned long iova_alignment;

	lockdep_assert_held(&iopt->iova_rwsem);

	/* Protect roundup_pow_of_two() from overflow */
	if (length == 0 || length >= ULONG_MAX / 2)
		return -EOVERFLOW;

	/*
	 * Keep the alignment present in the uptr when building the IOVA; this
	 * increases the chance we can map a THP.
	 */
	if (!uptr)
		iova_alignment = roundup_pow_of_two(length);
	else
		iova_alignment = min_t(unsigned long,
				       roundup_pow_of_two(length),
				       1UL << __ffs64(uptr));

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	max_alignment = HPAGE_SIZE;
#endif
	/* Protect against ALIGN() overflow */
	if (iova_alignment >= max_alignment)
		iova_alignment = max_alignment;

	if (iova_alignment < iopt->iova_alignment)
		return -EINVAL;

	interval_tree_for_each_span(&allowed_span, &iopt->allowed_itree,
				    PAGE_SIZE, ULONG_MAX - PAGE_SIZE) {
		if (RB_EMPTY_ROOT(&iopt->allowed_itree.rb_root)) {
			allowed_span.start_used = PAGE_SIZE;
			allowed_span.last_used = ULONG_MAX - PAGE_SIZE;
			allowed_span.is_hole = false;
		}

		if (!__alloc_iova_check_used(&allowed_span, length,
					     iova_alignment, page_offset))
			continue;

		interval_tree_for_each_double_span(
			&used_span, &iopt->reserved_itree, &iopt->area_itree,
			allowed_span.start_used, allowed_span.last_used) {
			if (!__alloc_iova_check_hole(&used_span, length,
						     iova_alignment,
						     page_offset))
				continue;

			*iova = used_span.start_hole;
			return 0;
		}
	}
	return -ENOSPC;
}
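
/*
 * Worked example (editor's illustration, assuming CONFIG_TRANSPARENT_HUGEPAGE):
 * for uptr = 0x7f1234568000 and length = 0x3000, roundup_pow_of_two(length)
 * is 0x4000 and 1UL << __ffs64(uptr) is 0x8000, so iova_alignment becomes
 * 0x4000, well under the HPAGE_SIZE cap. Preserving the alignment already
 * present in uptr this way raises the chance that a later domain mapping can
 * use a huge page.
 */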

static int iopt_check_iova(struct io_pagetable *iopt, unsigned long iova,
			   unsigned long length)
{
	unsigned long last;

	lockdep_assert_held(&iopt->iova_rwsem);

	if ((iova & (iopt->iova_alignment - 1)))
		return -EINVAL;

	if (check_add_overflow(iova, length - 1, &last))
		return -EOVERFLOW;

	/* No reserved IOVA intersects the range */
	if (iopt_reserved_iter_first(iopt, iova, last))
		return -EINVAL;

	/* Check that there is not already a mapping in the range */
	if (iopt_area_iter_first(iopt, iova, last))
		return -EEXIST;
	return 0;
}

/*
 * The area takes a slice of the pages from start_byte to start_byte + length
 */
static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area,
			    struct iopt_pages *pages, unsigned long iova,
			    unsigned long start_byte, unsigned long length,
			    int iommu_prot)
{
	lockdep_assert_held_write(&iopt->iova_rwsem);

	if ((iommu_prot & IOMMU_WRITE) && !pages->writable)
		return -EPERM;

	area->iommu_prot = iommu_prot;
	area->page_offset = start_byte % PAGE_SIZE;
	if (area->page_offset & (iopt->iova_alignment - 1))
		return -EINVAL;

	area->node.start = iova;
	if (check_add_overflow(iova, length - 1, &area->node.last))
		return -EOVERFLOW;

	area->pages_node.start = start_byte / PAGE_SIZE;
	if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
		return -EOVERFLOW;
	area->pages_node.last = area->pages_node.last / PAGE_SIZE;
	if (WARN_ON(area->pages_node.last >= pages->npages))
		return -EOVERFLOW;

	/*
	 * The area is inserted with a NULL pages indicating it is not fully
	 * initialized yet.
	 */
	area->iopt = iopt;
	interval_tree_insert(&area->node, &iopt->area_itree);
	return 0;
}

static struct iopt_area *iopt_area_alloc(void)
{
	struct iopt_area *area;

	area = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
	if (!area)
		return NULL;
	RB_CLEAR_NODE(&area->node.rb);
	RB_CLEAR_NODE(&area->pages_node.rb);
	return area;
}

static int iopt_alloc_area_pages(struct io_pagetable *iopt,
				 struct list_head *pages_list,
				 unsigned long length, unsigned long *dst_iova,
				 int iommu_prot, unsigned int flags)
{
	struct iopt_pages_list *elm;
	unsigned long iova;
	int rc = 0;

	list_for_each_entry(elm, pages_list, next) {
		elm->area = iopt_area_alloc();
		if (!elm->area)
			return -ENOMEM;
	}

	down_write(&iopt->iova_rwsem);
	if ((length & (iopt->iova_alignment - 1)) || !length) {
		rc = -EINVAL;
		goto out_unlock;
	}

	if (flags & IOPT_ALLOC_IOVA) {
		/* Use the first entry to guess the ideal IOVA alignment */
		elm = list_first_entry(pages_list, struct iopt_pages_list,
				       next);
		rc = iopt_alloc_iova(
			iopt, dst_iova,
			(uintptr_t)elm->pages->uptr + elm->start_byte, length);
		if (rc)
			goto out_unlock;
		if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
		    WARN_ON(iopt_check_iova(iopt, *dst_iova, length))) {
			rc = -EINVAL;
			goto out_unlock;
		}
	} else {
		rc = iopt_check_iova(iopt, *dst_iova, length);
		if (rc)
			goto out_unlock;
	}

	/*
	 * Areas are created with a NULL pages so that the IOVA space is
	 * reserved and we can unlock the iova_rwsem.
	 */
	iova = *dst_iova;
	list_for_each_entry(elm, pages_list, next) {
		rc = iopt_insert_area(iopt, elm->area, elm->pages, iova,
				      elm->start_byte, elm->length, iommu_prot);
		if (rc)
			goto out_unlock;
		iova += elm->length;
	}

out_unlock:
	up_write(&iopt->iova_rwsem);
	return rc;
}

static void iopt_abort_area(struct iopt_area *area)
{
	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
		WARN_ON(area->pages);
	if (area->iopt) {
		down_write(&area->iopt->iova_rwsem);
		interval_tree_remove(&area->node, &area->iopt->area_itree);
		up_write(&area->iopt->iova_rwsem);
	}
	kfree(area);
}

void iopt_free_pages_list(struct list_head *pages_list)
{
	struct iopt_pages_list *elm;

	while ((elm = list_first_entry_or_null(pages_list,
					       struct iopt_pages_list, next))) {
		if (elm->area)
			iopt_abort_area(elm->area);
		if (elm->pages)
			iopt_put_pages(elm->pages);
		list_del(&elm->next);
		kfree(elm);
	}
}

static int iopt_fill_domains_pages(struct list_head *pages_list)
{
	struct iopt_pages_list *undo_elm;
	struct iopt_pages_list *elm;
	int rc;

	list_for_each_entry(elm, pages_list, next) {
		rc = iopt_area_fill_domains(elm->area, elm->pages);
		if (rc)
			goto err_undo;
	}
	return 0;

err_undo:
	list_for_each_entry(undo_elm, pages_list, next) {
		if (undo_elm == elm)
			break;
		iopt_area_unfill_domains(undo_elm->area, undo_elm->pages);
	}
	return rc;
}

int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
		   unsigned long length, unsigned long *dst_iova,
		   int iommu_prot, unsigned int flags)
{
	struct iopt_pages_list *elm;
	int rc;

	rc = iopt_alloc_area_pages(iopt, pages_list, length, dst_iova,
				   iommu_prot, flags);
	if (rc)
		return rc;

	down_read(&iopt->domains_rwsem);
	rc = iopt_fill_domains_pages(pages_list);
	if (rc)
		goto out_unlock_domains;

	down_write(&iopt->iova_rwsem);
	list_for_each_entry(elm, pages_list, next) {
		/*
		 * area->pages must be set inside the domains_rwsem to ensure
		 * any newly added domains will get filled. Moves the reference
		 * in from the list.
		 */
		elm->area->pages = elm->pages;
		elm->pages = NULL;
		elm->area = NULL;
	}
	up_write(&iopt->iova_rwsem);
out_unlock_domains:
	up_read(&iopt->domains_rwsem);
	return rc;
}

/**
 * iopt_map_user_pages() - Map a user VA to an iova in the io page table
 * @ictx: iommufd_ctx the iopt is part of
 * @iopt: io_pagetable to act on
 * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
 *        the chosen iova on output. Otherwise it is the iova to map to on
 *        input.
 * @uptr: User VA to map
 * @length: Number of bytes to map
 * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
 * @flags: IOPT_ALLOC_IOVA or zero
 *
 * iova, uptr, and length must be aligned to iova_alignment. For domain backed
 * page tables this will pin the pages and load them into the domain at iova.
 * For non-domain page tables this will only set up a lazy reference and the
 * caller must use iopt_access_pages() to touch them.
 *
 * iopt_unmap_iova() must be called to undo this before the io_pagetable can be
 * destroyed.
 */
int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
			unsigned long *iova, void __user *uptr,
			unsigned long length, int iommu_prot,
			unsigned int flags)
{
	struct iopt_pages_list elm = {};
	LIST_HEAD(pages_list);
	int rc;

	elm.pages = iopt_alloc_pages(uptr, length, iommu_prot & IOMMU_WRITE);
	if (IS_ERR(elm.pages))
		return PTR_ERR(elm.pages);
	if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM &&
	    elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER)
		elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM;
	elm.start_byte = uptr - elm.pages->uptr;
	elm.length = length;
	list_add(&elm.next, &pages_list);

	rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags);
	if (rc) {
		if (elm.area)
			iopt_abort_area(elm.area);
		if (elm.pages)
			iopt_put_pages(elm.pages);
		return rc;
	}
	return 0;
}
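
/*
 * Usage sketch (editor's illustration, not a caller in this file): map a
 * user buffer and let the allocator pick the IOVA. ictx, iopt, uptr and
 * length stand in for whatever the real caller has at hand.
 *
 *	unsigned long iova = 0;
 *	int rc;
 *
 *	rc = iopt_map_user_pages(ictx, iopt, &iova, uptr, length,
 *				 IOMMU_READ | IOMMU_WRITE, IOPT_ALLOC_IOVA);
 *	if (rc)
 *		return rc;
 *
 * On success iova holds the address chosen by iopt_alloc_iova().
 */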

struct iova_bitmap_fn_arg {
	unsigned long flags;
	struct io_pagetable *iopt;
	struct iommu_domain *domain;
	struct iommu_dirty_bitmap *dirty;
};

static int __iommu_read_and_clear_dirty(struct iova_bitmap *bitmap,
					unsigned long iova, size_t length,
					void *opaque)
{
	struct iopt_area *area;
	struct iopt_area_contig_iter iter;
	struct iova_bitmap_fn_arg *arg = opaque;
	struct iommu_domain *domain = arg->domain;
	struct iommu_dirty_bitmap *dirty = arg->dirty;
	const struct iommu_dirty_ops *ops = domain->dirty_ops;
	unsigned long last_iova = iova + length - 1;
	unsigned long flags = arg->flags;
	int ret;

	iopt_for_each_contig_area(&iter, area, arg->iopt, iova, last_iova) {
		unsigned long last = min(last_iova, iopt_area_last_iova(area));

		ret = ops->read_and_clear_dirty(domain, iter.cur_iova,
						last - iter.cur_iova + 1, flags,
						dirty);
		if (ret)
			return ret;
	}

	if (!iopt_area_contig_done(&iter))
		return -EINVAL;
	return 0;
}

static int
iommu_read_and_clear_dirty(struct iommu_domain *domain,
			   struct io_pagetable *iopt, unsigned long flags,
			   struct iommu_hwpt_get_dirty_bitmap *bitmap)
{
	const struct iommu_dirty_ops *ops = domain->dirty_ops;
	struct iommu_iotlb_gather gather;
	struct iommu_dirty_bitmap dirty;
	struct iova_bitmap_fn_arg arg;
	struct iova_bitmap *iter;
	int ret = 0;

	if (!ops || !ops->read_and_clear_dirty)
		return -EOPNOTSUPP;

	iter = iova_bitmap_alloc(bitmap->iova, bitmap->length,
				 bitmap->page_size,
				 u64_to_user_ptr(bitmap->data));
	if (IS_ERR(iter))
		return -ENOMEM;

	iommu_dirty_bitmap_init(&dirty, iter, &gather);

	arg.flags = flags;
	arg.iopt = iopt;
	arg.domain = domain;
	arg.dirty = &dirty;
	iova_bitmap_for_each(iter, &arg, __iommu_read_and_clear_dirty);

	if (!(flags & IOMMU_DIRTY_NO_CLEAR))
		iommu_iotlb_sync(domain, &gather);

	iova_bitmap_free(iter);

	return ret;
}

int iommufd_check_iova_range(struct io_pagetable *iopt,
			     struct iommu_hwpt_get_dirty_bitmap *bitmap)
{
	size_t iommu_pgsize = iopt->iova_alignment;
	u64 last_iova;

	if (check_add_overflow(bitmap->iova, bitmap->length - 1, &last_iova))
		return -EOVERFLOW;

	if (bitmap->iova > ULONG_MAX || last_iova > ULONG_MAX)
		return -EOVERFLOW;

	if ((bitmap->iova & (iommu_pgsize - 1)) ||
	    ((last_iova + 1) & (iommu_pgsize - 1)))
		return -EINVAL;

	if (!bitmap->page_size)
		return -EINVAL;

	if ((bitmap->iova & (bitmap->page_size - 1)) ||
	    ((last_iova + 1) & (bitmap->page_size - 1)))
		return -EINVAL;

	return 0;
}

int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt,
				   struct iommu_domain *domain,
				   unsigned long flags,
				   struct iommu_hwpt_get_dirty_bitmap *bitmap)
{
	int ret;

	ret = iommufd_check_iova_range(iopt, bitmap);
	if (ret)
		return ret;

	down_read(&iopt->iova_rwsem);
	ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap);
	up_read(&iopt->iova_rwsem);

	return ret;
}

static int iopt_clear_dirty_data(struct io_pagetable *iopt,
				 struct iommu_domain *domain)
{
	const struct iommu_dirty_ops *ops = domain->dirty_ops;
	struct iommu_iotlb_gather gather;
	struct iommu_dirty_bitmap dirty;
	struct iopt_area *area;
	int ret = 0;

	lockdep_assert_held_read(&iopt->iova_rwsem);

	iommu_dirty_bitmap_init(&dirty, NULL, &gather);

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		if (!area->pages)
			continue;

		ret = ops->read_and_clear_dirty(domain, iopt_area_iova(area),
						iopt_area_length(area), 0,
						&dirty);
		if (ret)
			break;
	}

	iommu_iotlb_sync(domain, &gather);
	return ret;
}

int iopt_set_dirty_tracking(struct io_pagetable *iopt,
			    struct iommu_domain *domain, bool enable)
{
	const struct iommu_dirty_ops *ops = domain->dirty_ops;
	int ret = 0;

	if (!ops)
		return -EOPNOTSUPP;

	down_read(&iopt->iova_rwsem);

	/* Clear dirty bits from PTEs to ensure a clean snapshot */
	if (enable) {
		ret = iopt_clear_dirty_data(iopt, domain);
		if (ret)
			goto out_unlock;
	}

	ret = ops->set_dirty_tracking(domain, enable);

out_unlock:
	up_read(&iopt->iova_rwsem);
	return ret;
}
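
/*
 * Flow sketch (editor's illustration): a plausible dirty-tracking sequence
 * is to enable tracking, let DMA run, then harvest the bitmap. The struct
 * fields mirror the ones consumed above; user_bitmap is assumed to be a
 * __u64 encoding of the userspace bitmap pointer.
 *
 *	struct iommu_hwpt_get_dirty_bitmap bitmap = {
 *		.iova = iova,
 *		.length = length,
 *		.page_size = PAGE_SIZE,
 *		.data = user_bitmap,
 *	};
 *
 *	rc = iopt_set_dirty_tracking(iopt, domain, true);
 *	if (!rc)
 *		rc = iopt_read_and_clear_dirty_data(iopt, domain, 0, &bitmap);
 */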

int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
		   unsigned long length, struct list_head *pages_list)
{
	struct iopt_area_contig_iter iter;
	unsigned long last_iova;
	struct iopt_area *area;
	int rc;

	if (!length)
		return -EINVAL;
	if (check_add_overflow(iova, length - 1, &last_iova))
		return -EOVERFLOW;

	down_read(&iopt->iova_rwsem);
	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
		struct iopt_pages_list *elm;
		unsigned long last = min(last_iova, iopt_area_last_iova(area));

		elm = kzalloc(sizeof(*elm), GFP_KERNEL_ACCOUNT);
		if (!elm) {
			rc = -ENOMEM;
			goto err_free;
		}
		elm->start_byte = iopt_area_start_byte(area, iter.cur_iova);
		elm->pages = area->pages;
		elm->length = (last - iter.cur_iova) + 1;
		kref_get(&elm->pages->kref);
		list_add_tail(&elm->next, pages_list);
	}
	if (!iopt_area_contig_done(&iter)) {
		rc = -ENOENT;
		goto err_free;
	}
	up_read(&iopt->iova_rwsem);
	return 0;
err_free:
	up_read(&iopt->iova_rwsem);
	iopt_free_pages_list(pages_list);
	return rc;
}

static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
				 unsigned long last, unsigned long *unmapped)
{
	struct iopt_area *area;
	unsigned long unmapped_bytes = 0;
	unsigned int tries = 0;
	int rc = -ENOENT;

	/*
	 * The domains_rwsem must be held in read mode any time any area->pages
	 * is NULL. This prevents domain attach/detach from running
	 * concurrently with cleaning up the area.
	 */
again:
	down_read(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	while ((area = iopt_area_iter_first(iopt, start, last))) {
		unsigned long area_last = iopt_area_last_iova(area);
		unsigned long area_first = iopt_area_iova(area);
		struct iopt_pages *pages;

		/* Userspace should not race map/unmaps of the same area */
		if (!area->pages) {
			rc = -EBUSY;
			goto out_unlock_iova;
		}

		if (area_first < start || area_last > last) {
			rc = -ENOENT;
			goto out_unlock_iova;
		}

		if (area_first != start)
			tries = 0;

		/*
		 * num_accesses writers must hold the iova_rwsem too, so we can
		 * safely read it under the write side of the iova_rwsem
		 * without the pages->mutex.
		 */
		if (area->num_accesses) {
			size_t length = iopt_area_length(area);

			start = area_first;
			area->prevent_access = true;
			up_write(&iopt->iova_rwsem);
			up_read(&iopt->domains_rwsem);

			iommufd_access_notify_unmap(iopt, area_first, length);
			/* Something is not responding to unmap requests. */
			tries++;
			if (WARN_ON(tries > 100))
				return -EDEADLOCK;
			goto again;
		}

		pages = area->pages;
		area->pages = NULL;
		up_write(&iopt->iova_rwsem);

		iopt_area_unfill_domains(area, pages);
		iopt_abort_area(area);
		iopt_put_pages(pages);

		unmapped_bytes += area_last - area_first + 1;

		down_write(&iopt->iova_rwsem);
	}
	if (unmapped_bytes)
		rc = 0;

out_unlock_iova:
	up_write(&iopt->iova_rwsem);
	up_read(&iopt->domains_rwsem);
	if (unmapped)
		*unmapped = unmapped_bytes;
	return rc;
}

/**
 * iopt_unmap_iova() - Remove a range of iova
 * @iopt: io_pagetable to act on
 * @iova: Starting iova to unmap
 * @length: Number of bytes to unmap
 * @unmapped: Return number of bytes unmapped
 *
 * The requested range must be a superset of existing ranges.
 * Splitting/truncating IOVA mappings is not allowed.
 */
int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
		    unsigned long length, unsigned long *unmapped)
{
	unsigned long iova_last;

	if (!length)
		return -EINVAL;

	if (check_add_overflow(iova, length - 1, &iova_last))
		return -EOVERFLOW;

	return iopt_unmap_iova_range(iopt, iova, iova_last, unmapped);
}
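
/*
 * Usage sketch (editor's illustration): tear down a previously mapped range
 * in full; a range that partially covers an area fails rather than splitting
 * the mapping.
 *
 *	unsigned long unmapped = 0;
 *	int rc;
 *
 *	rc = iopt_unmap_iova(iopt, iova, length, &unmapped);
 *
 * On success unmapped reports how many bytes were actually torn down.
 */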

int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped)
{
	int rc;

	rc = iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped);
	/* If the IOVAs are empty then unmap all succeeds */
	if (rc == -ENOENT)
		return 0;
	return rc;
}

/* The caller must always free all the nodes in the allowed_iova rb_root. */
int iopt_set_allow_iova(struct io_pagetable *iopt,
			struct rb_root_cached *allowed_iova)
{
	struct iopt_allowed *allowed;

	down_write(&iopt->iova_rwsem);
	swap(*allowed_iova, iopt->allowed_itree);

	for (allowed = iopt_allowed_iter_first(iopt, 0, ULONG_MAX); allowed;
	     allowed = iopt_allowed_iter_next(allowed, 0, ULONG_MAX)) {
		if (iopt_reserved_iter_first(iopt, allowed->node.start,
					     allowed->node.last)) {
			swap(*allowed_iova, iopt->allowed_itree);
			up_write(&iopt->iova_rwsem);
			return -EADDRINUSE;
		}
	}
	up_write(&iopt->iova_rwsem);
	return 0;
}
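
/*
 * Usage sketch (editor's illustration, assuming struct iopt_allowed as
 * declared in io_pagetable.h, error handling elided): the caller builds a
 * temporary interval tree of permitted ranges, hands it over, and remains
 * responsible for freeing every node afterwards.
 *
 *	struct rb_root_cached allowed = RB_ROOT_CACHED;
 *	struct iopt_allowed *elm = kzalloc(sizeof(*elm), GFP_KERNEL);
 *
 *	elm->node.start = start;
 *	elm->node.last = last;
 *	interval_tree_insert(&elm->node, &allowed);
 *	rc = iopt_set_allow_iova(iopt, &allowed);
 */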

int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start,
		      unsigned long last, void *owner)
{
	struct iopt_reserved *reserved;

	lockdep_assert_held_write(&iopt->iova_rwsem);

	if (iopt_area_iter_first(iopt, start, last) ||
	    iopt_allowed_iter_first(iopt, start, last))
		return -EADDRINUSE;

	reserved = kzalloc(sizeof(*reserved), GFP_KERNEL_ACCOUNT);
	if (!reserved)
		return -ENOMEM;
	reserved->node.start = start;
	reserved->node.last = last;
	reserved->owner = owner;
	interval_tree_insert(&reserved->node, &iopt->reserved_itree);
	return 0;
}
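
/*
 * Usage sketch (editor's illustration): iopt_reserve_iova() expects the
 * iova_rwsem held for write, as iopt_table_add_domain() below does when it
 * reserves the space outside a domain's aperture.
 *
 *	down_write(&iopt->iova_rwsem);
 *	rc = iopt_reserve_iova(iopt, start, last, owner);
 *	up_write(&iopt->iova_rwsem);
 */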

static void __iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
{
	struct iopt_reserved *reserved, *next;

	lockdep_assert_held_write(&iopt->iova_rwsem);

	for (reserved = iopt_reserved_iter_first(iopt, 0, ULONG_MAX); reserved;
	     reserved = next) {
		next = iopt_reserved_iter_next(reserved, 0, ULONG_MAX);

		if (reserved->owner == owner) {
			interval_tree_remove(&reserved->node,
					     &iopt->reserved_itree);
			kfree(reserved);
		}
	}
}

void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
{
	down_write(&iopt->iova_rwsem);
	__iopt_remove_reserved_iova(iopt, owner);
	up_write(&iopt->iova_rwsem);
}

void iopt_init_table(struct io_pagetable *iopt)
{
	init_rwsem(&iopt->iova_rwsem);
	init_rwsem(&iopt->domains_rwsem);
	iopt->area_itree = RB_ROOT_CACHED;
	iopt->allowed_itree = RB_ROOT_CACHED;
	iopt->reserved_itree = RB_ROOT_CACHED;
	xa_init_flags(&iopt->domains, XA_FLAGS_ACCOUNT);
	xa_init_flags(&iopt->access_list, XA_FLAGS_ALLOC);

	/*
	 * iopts start as SW tables that can use the entire size_t IOVA space
	 * due to the use of size_t in the APIs. They have no alignment
	 * restriction.
	 */
	iopt->iova_alignment = 1;
}
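
/*
 * Lifecycle sketch (editor's illustration): an io_pagetable is initialized
 * once, populated and drained through the map/unmap entry points above, and
 * destroyed only after everything has been unmapped.
 *
 *	iopt_init_table(iopt);
 *	... iopt_map_user_pages() / iopt_unmap_iova() ...
 *	rc = iopt_unmap_all(iopt, NULL);
 *	iopt_destroy_table(iopt);
 */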

void iopt_destroy_table(struct io_pagetable *iopt)
{
	struct interval_tree_node *node;

	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
		iopt_remove_reserved_iova(iopt, NULL);

	while ((node = interval_tree_iter_first(&iopt->allowed_itree, 0,
						ULONG_MAX))) {
		interval_tree_remove(node, &iopt->allowed_itree);
		kfree(container_of(node, struct iopt_allowed, node));
	}

	WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_itree.rb_root));
	WARN_ON(!xa_empty(&iopt->domains));
	WARN_ON(!xa_empty(&iopt->access_list));
	WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root));
}

/**
 * iopt_unfill_domain() - Unfill a domain with PFNs
 * @iopt: io_pagetable to act on
 * @domain: domain to unfill
 *
 * This is used when removing a domain from the iopt. Every area in the iopt
 * will be unmapped from the domain. The domain must already be removed from
 * the domains xarray.
 */
static void iopt_unfill_domain(struct io_pagetable *iopt,
			       struct iommu_domain *domain)
{
	struct iopt_area *area;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held_write(&iopt->domains_rwsem);

	/*
	 * Some other domain is holding all the pfns still, rapidly unmap this
	 * domain.
	 */
	if (iopt->next_domain_id != 0) {
		/* Pick an arbitrary remaining domain to act as storage */
		struct iommu_domain *storage_domain =
			xa_load(&iopt->domains, 0);

		for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
		     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
			struct iopt_pages *pages = area->pages;

			if (!pages)
				continue;

			mutex_lock(&pages->mutex);
			if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
				WARN_ON(!area->storage_domain);
			if (area->storage_domain == domain)
				area->storage_domain = storage_domain;
			mutex_unlock(&pages->mutex);

			iopt_area_unmap_domain(area, domain);
		}
		return;
	}

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (!pages)
			continue;

		mutex_lock(&pages->mutex);
		interval_tree_remove(&area->pages_node, &pages->domains_itree);
		WARN_ON(area->storage_domain != domain);
		area->storage_domain = NULL;
		iopt_area_unfill_domain(area, pages, domain);
		mutex_unlock(&pages->mutex);
	}
}

/**
 * iopt_fill_domain() - Fill a domain with PFNs
 * @iopt: io_pagetable to act on
 * @domain: domain to fill
 *
 * Fill the domain with PFNs from every area in the iopt. On failure the domain
 * is left unchanged.
 */
static int iopt_fill_domain(struct io_pagetable *iopt,
			    struct iommu_domain *domain)
{
	struct iopt_area *end_area;
	struct iopt_area *area;
	int rc;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held_write(&iopt->domains_rwsem);

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (!pages)
			continue;

		mutex_lock(&pages->mutex);
		rc = iopt_area_fill_domain(area, domain);
		if (rc) {
			mutex_unlock(&pages->mutex);
			goto out_unfill;
		}
		if (!area->storage_domain) {
			WARN_ON(iopt->next_domain_id != 0);
			area->storage_domain = domain;
			interval_tree_insert(&area->pages_node,
					     &pages->domains_itree);
		}
		mutex_unlock(&pages->mutex);
	}
	return 0;

out_unfill:
	end_area = area;
	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (area == end_area)
			break;
		if (!pages)
			continue;
		mutex_lock(&pages->mutex);
		if (iopt->next_domain_id == 0) {
			interval_tree_remove(&area->pages_node,
					     &pages->domains_itree);
			area->storage_domain = NULL;
		}
		iopt_area_unfill_domain(area, pages, domain);
		mutex_unlock(&pages->mutex);
	}
	return rc;
}

/* All existing areas must conform to an increased page size */
static int iopt_check_iova_alignment(struct io_pagetable *iopt,
				     unsigned long new_iova_alignment)
{
	unsigned long align_mask = new_iova_alignment - 1;
	struct iopt_area *area;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held(&iopt->domains_rwsem);

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX))
		if ((iopt_area_iova(area) & align_mask) ||
		    (iopt_area_length(area) & align_mask) ||
		    (area->page_offset & align_mask))
			return -EADDRINUSE;

	if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) {
		struct iommufd_access *access;
		unsigned long index;

		xa_for_each(&iopt->access_list, index, access)
			if (WARN_ON(access->iova_alignment >
				    new_iova_alignment))
				return -EADDRINUSE;
	}
	return 0;
}

int iopt_table_add_domain(struct io_pagetable *iopt,
			  struct iommu_domain *domain)
{
	const struct iommu_domain_geometry *geometry = &domain->geometry;
	struct iommu_domain *iter_domain;
	unsigned int new_iova_alignment;
	unsigned long index;
	int rc;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);

	xa_for_each(&iopt->domains, index, iter_domain) {
		if (WARN_ON(iter_domain == domain)) {
			rc = -EEXIST;
			goto out_unlock;
		}
	}

	/*
	 * The io page size drives the iova_alignment. Internally the iopt_pages
	 * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE
	 * objects into the iommu_domain.
	 *
	 * An iommu_domain must always be able to accept PAGE_SIZE to be
	 * compatible as we can't guarantee higher contiguity.
	 */
	new_iova_alignment = max_t(unsigned long,
				   1UL << __ffs(domain->pgsize_bitmap),
				   iopt->iova_alignment);
	if (new_iova_alignment > PAGE_SIZE) {
		rc = -EINVAL;
		goto out_unlock;
	}
	if (new_iova_alignment != iopt->iova_alignment) {
		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
		if (rc)
			goto out_unlock;
	}

	/* No area exists that is outside the allowed domain aperture */
	if (geometry->aperture_start != 0) {
		rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1,
				       domain);
		if (rc)
			goto out_reserved;
	}
	if (geometry->aperture_end != ULONG_MAX) {
		rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1,
				       ULONG_MAX, domain);
		if (rc)
			goto out_reserved;
	}

	rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL);
	if (rc)
		goto out_reserved;

	rc = iopt_fill_domain(iopt, domain);
	if (rc)
		goto out_release;

	iopt->iova_alignment = new_iova_alignment;
	xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL);
	iopt->next_domain_id++;
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return 0;
out_release:
	xa_release(&iopt->domains, iopt->next_domain_id);
out_reserved:
	__iopt_remove_reserved_iova(iopt, domain);
out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return rc;
}

static int iopt_calculate_iova_alignment(struct io_pagetable *iopt)
{
	unsigned long new_iova_alignment;
	struct iommufd_access *access;
	struct iommu_domain *domain;
	unsigned long index;

	lockdep_assert_held_write(&iopt->iova_rwsem);
	lockdep_assert_held(&iopt->domains_rwsem);

	/* See batch_iommu_map_small() */
	if (iopt->disable_large_pages)
		new_iova_alignment = PAGE_SIZE;
	else
		new_iova_alignment = 1;

	xa_for_each(&iopt->domains, index, domain)
		new_iova_alignment = max_t(unsigned long,
					   1UL << __ffs(domain->pgsize_bitmap),
					   new_iova_alignment);
	xa_for_each(&iopt->access_list, index, access)
		new_iova_alignment = max_t(unsigned long,
					   access->iova_alignment,
					   new_iova_alignment);

	if (new_iova_alignment > iopt->iova_alignment) {
		int rc;

		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
		if (rc)
			return rc;
	}
	iopt->iova_alignment = new_iova_alignment;
	return 0;
}

void iopt_table_remove_domain(struct io_pagetable *iopt,
			      struct iommu_domain *domain)
{
	struct iommu_domain *iter_domain = NULL;
	unsigned long index;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);

	xa_for_each(&iopt->domains, index, iter_domain)
		if (iter_domain == domain)
			break;
	if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id)
		goto out_unlock;

	/*
	 * Compress the xarray to keep it linear by swapping the entry to erase
	 * with the tail entry and shrinking the tail.
	 */
	iopt->next_domain_id--;
	iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id);
	if (index != iopt->next_domain_id)
		xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL);

	iopt_unfill_domain(iopt, domain);
	__iopt_remove_reserved_iova(iopt, domain);

	WARN_ON(iopt_calculate_iova_alignment(iopt));
out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
}

/**
 * iopt_area_split - Split an area into two parts at iova
 * @area: The area to split
 * @iova: Becomes the last of a new area
 *
 * This splits an area into two. It is part of the VFIO compatibility to allow
 * poking a hole in the mapping. The two areas continue to point at the same
 * iopt_pages, just with different starting bytes.
 */
static int iopt_area_split(struct iopt_area *area, unsigned long iova)
{
	unsigned long alignment = area->iopt->iova_alignment;
	unsigned long last_iova = iopt_area_last_iova(area);
	unsigned long start_iova = iopt_area_iova(area);
	unsigned long new_start = iova + 1;
	struct io_pagetable *iopt = area->iopt;
	struct iopt_pages *pages = area->pages;
	struct iopt_area *lhs;
	struct iopt_area *rhs;
	int rc;

	lockdep_assert_held_write(&iopt->iova_rwsem);

	if (iova == start_iova || iova == last_iova)
		return 0;

	if (!pages || area->prevent_access)
		return -EBUSY;

	if (new_start & (alignment - 1) ||
	    iopt_area_start_byte(area, new_start) & (alignment - 1))
		return -EINVAL;

	lhs = iopt_area_alloc();
	if (!lhs)
		return -ENOMEM;

	rhs = iopt_area_alloc();
	if (!rhs) {
		rc = -ENOMEM;
		goto err_free_lhs;
	}

	mutex_lock(&pages->mutex);
	/*
	 * Splitting is not permitted if an access exists; we don't track
	 * enough information to split existing accesses.
	 */
	if (area->num_accesses) {
		rc = -EINVAL;
		goto err_unlock;
	}

	/*
	 * Splitting is not permitted if a domain could have been mapped with
	 * huge pages.
	 */
	if (area->storage_domain && !iopt->disable_large_pages) {
		rc = -EINVAL;
		goto err_unlock;
	}

	interval_tree_remove(&area->node, &iopt->area_itree);
	rc = iopt_insert_area(iopt, lhs, area->pages, start_iova,
			      iopt_area_start_byte(area, start_iova),
			      (new_start - 1) - start_iova + 1,
			      area->iommu_prot);
	if (WARN_ON(rc))
		goto err_insert;

	rc = iopt_insert_area(iopt, rhs, area->pages, new_start,
			      iopt_area_start_byte(area, new_start),
			      last_iova - new_start + 1, area->iommu_prot);
	if (WARN_ON(rc))
		goto err_remove_lhs;

	/*
	 * If the original area has filled a domain, domains_itree has to be
	 * updated.
	 */
	if (area->storage_domain) {
		interval_tree_remove(&area->pages_node, &pages->domains_itree);
		interval_tree_insert(&lhs->pages_node, &pages->domains_itree);
		interval_tree_insert(&rhs->pages_node, &pages->domains_itree);
	}

	lhs->storage_domain = area->storage_domain;
	lhs->pages = area->pages;
	rhs->storage_domain = area->storage_domain;
	rhs->pages = area->pages;
	kref_get(&rhs->pages->kref);
	kfree(area);
	mutex_unlock(&pages->mutex);

	/*
	 * No change to domains or accesses because the pages haven't been
	 * changed.
	 */
	return 0;

err_remove_lhs:
	interval_tree_remove(&lhs->node, &iopt->area_itree);
err_insert:
	interval_tree_insert(&area->node, &iopt->area_itree);
err_unlock:
	mutex_unlock(&pages->mutex);
	kfree(rhs);
err_free_lhs:
	kfree(lhs);
	return rc;
}

int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas,
		  size_t num_iovas)
{
	int rc = 0;
	int i;

	down_write(&iopt->iova_rwsem);
	for (i = 0; i < num_iovas; i++) {
		struct iopt_area *area;

		area = iopt_area_iter_first(iopt, iovas[i], iovas[i]);
		if (!area)
			continue;
		rc = iopt_area_split(area, iovas[i]);
		if (rc)
			break;
	}
	up_write(&iopt->iova_rwsem);
	return rc;
}
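
/*
 * Usage sketch (editor's illustration): the VFIO compatibility path passes
 * the cut points of an unmap that lands inside an existing mapping, so the
 * middle piece becomes whole areas that iopt_unmap_iova() can then remove.
 * To carve out [unmap_iova, unmap_last]:
 *
 *	unsigned long iovas[] = { unmap_iova - 1, unmap_last };
 *
 *	rc = iopt_cut_iova(iopt, iovas, ARRAY_SIZE(iovas));
 *
 * (skipping the unmap_iova - 1 entry when unmap_iova is zero)
 */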

void iopt_enable_large_pages(struct io_pagetable *iopt)
{
	int rc;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	WRITE_ONCE(iopt->disable_large_pages, false);
	rc = iopt_calculate_iova_alignment(iopt);
	WARN_ON(rc);
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
}

int iopt_disable_large_pages(struct io_pagetable *iopt)
{
	int rc = 0;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	if (iopt->disable_large_pages)
		goto out_unlock;

	/* Won't do it if domains already have pages mapped in them */
	if (!xa_empty(&iopt->domains) &&
	    !RB_EMPTY_ROOT(&iopt->area_itree.rb_root)) {
		rc = -EINVAL;
		goto out_unlock;
	}

	WRITE_ONCE(iopt->disable_large_pages, true);
	rc = iopt_calculate_iova_alignment(iopt);
	if (rc)
		WRITE_ONCE(iopt->disable_large_pages, false);
out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return rc;
}

int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access)
{
	u32 new_id;
	int rc;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	rc = xa_alloc(&iopt->access_list, &new_id, access, xa_limit_16b,
		      GFP_KERNEL_ACCOUNT);

	if (rc)
		goto out_unlock;

	rc = iopt_calculate_iova_alignment(iopt);
	if (rc) {
		xa_erase(&iopt->access_list, new_id);
		goto out_unlock;
	}
	access->iopt_access_list_id = new_id;

out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return rc;
}

void iopt_remove_access(struct io_pagetable *iopt,
			struct iommufd_access *access,
			u32 iopt_access_list_id)
{
	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	WARN_ON(xa_erase(&iopt->access_list, iopt_access_list_id) != access);
	WARN_ON(iopt_calculate_iova_alignment(iopt));
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
}

/* Narrow the valid_iova_itree to include reserved ranges from a device. */
int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt,
					struct device *dev,
					phys_addr_t *sw_msi_start)
{
	struct iommu_resv_region *resv;
	LIST_HEAD(resv_regions);
	unsigned int num_hw_msi = 0;
	unsigned int num_sw_msi = 0;
	int rc;

	if (iommufd_should_fail())
		return -EINVAL;

	down_write(&iopt->iova_rwsem);
	/* FIXME: drivers allocate memory but there is no failure propagated */
	iommu_get_resv_regions(dev, &resv_regions);

	list_for_each_entry(resv, &resv_regions, list) {
		if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
			continue;

		if (sw_msi_start && resv->type == IOMMU_RESV_MSI)
			num_hw_msi++;
		if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) {
			*sw_msi_start = resv->start;
			num_sw_msi++;
		}

		rc = iopt_reserve_iova(iopt, resv->start,
				       resv->length - 1 + resv->start, dev);
		if (rc)
			goto out_reserved;
	}

	/* Drivers must offer sane combinations of regions */
	if (WARN_ON(num_sw_msi && num_hw_msi) || WARN_ON(num_sw_msi > 1)) {
		rc = -EINVAL;
		goto out_reserved;
	}

	rc = 0;
	goto out_free_resv;

out_reserved:
	__iopt_remove_reserved_iova(iopt, dev);
out_free_resv:
	iommu_put_resv_regions(dev, &resv_regions);
	up_write(&iopt->iova_rwsem);
	return rc;
}