1  // SPDX-License-Identifier: GPL-2.0-only
2  /*
3   * VFIO: IOMMU DMA mapping support for Type1 IOMMU
4   *
5   * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
6   *     Author: Alex Williamson <alex.williamson@redhat.com>
7   *
8   * Derived from original vfio:
9   * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
10   * Author: Tom Lyon, pugs@cisco.com
11   *
12   * We arbitrarily define a Type1 IOMMU as one matching the below code.
13   * It could be called the x86 IOMMU as it's designed for AMD-Vi & Intel
14   * VT-d, but that makes it harder to re-use as theoretically anyone
15   * implementing a similar IOMMU could make use of this.  We expect the
16   * IOMMU to support the IOMMU API and have few to no restrictions around
17   * the IOVA range that can be mapped.  The Type1 IOMMU is currently
18   * optimized for relatively static mappings of a userspace process with
19   * userspace pages pinned into memory.  We also assume devices and IOMMU
20   * domains are PCI based as the IOMMU API is still centered around a
21   * device/bus interface rather than a group interface.
22   */
23  
24  #include <linux/compat.h>
25  #include <linux/device.h>
26  #include <linux/fs.h>
27  #include <linux/highmem.h>
28  #include <linux/iommu.h>
29  #include <linux/module.h>
30  #include <linux/mm.h>
31  #include <linux/kthread.h>
32  #include <linux/rbtree.h>
33  #include <linux/sched/signal.h>
34  #include <linux/sched/mm.h>
35  #include <linux/slab.h>
36  #include <linux/uaccess.h>
37  #include <linux/vfio.h>
38  #include <linux/workqueue.h>
39  #include <linux/notifier.h>
40  #include "vfio.h"
41  
42  #define DRIVER_VERSION  "0.2"
43  #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
44  #define DRIVER_DESC     "Type1 IOMMU driver for VFIO"
45  
46  static bool allow_unsafe_interrupts;
47  module_param_named(allow_unsafe_interrupts,
48  		   allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
49  MODULE_PARM_DESC(allow_unsafe_interrupts,
50  		 "Enable VFIO IOMMU support on platforms without interrupt remapping support.");
51  
52  static bool disable_hugepages;
53  module_param_named(disable_hugepages,
54  		   disable_hugepages, bool, S_IRUGO | S_IWUSR);
55  MODULE_PARM_DESC(disable_hugepages,
56  		 "Disable VFIO IOMMU support for IOMMU hugepages.");
57  
58  static unsigned int dma_entry_limit __read_mostly = U16_MAX;
59  module_param_named(dma_entry_limit, dma_entry_limit, uint, 0644);
60  MODULE_PARM_DESC(dma_entry_limit,
61  		 "Maximum number of user DMA mappings per container (default 65535).");
62  
63  struct vfio_iommu {
64  	struct list_head	domain_list;
65  	struct list_head	iova_list;
66  	struct mutex		lock;
67  	struct rb_root		dma_list;
68  	struct list_head	device_list;
69  	struct mutex		device_list_lock;
70  	unsigned int		dma_avail;
71  	unsigned int		vaddr_invalid_count;
72  	uint64_t		pgsize_bitmap;
73  	uint64_t		num_non_pinned_groups;
74  	bool			v2;
75  	bool			nesting;
76  	bool			dirty_page_tracking;
77  	struct list_head	emulated_iommu_groups;
78  };
79  
80  struct vfio_domain {
81  	struct iommu_domain	*domain;
82  	struct list_head	next;
83  	struct list_head	group_list;
84  	bool			fgsp : 1;	/* Fine-grained super pages */
85  	bool			enforce_cache_coherency : 1;
86  };
87  
88  struct vfio_dma {
89  	struct rb_node		node;
90  	dma_addr_t		iova;		/* Device address */
91  	unsigned long		vaddr;		/* Process virtual addr */
92  	size_t			size;		/* Map size (bytes) */
93  	int			prot;		/* IOMMU_READ/WRITE */
94  	bool			iommu_mapped;
95  	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
96  	bool			vaddr_invalid;
97  	struct task_struct	*task;
98  	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
99  	unsigned long		*bitmap;
100  	struct mm_struct	*mm;
101  	size_t			locked_vm;
102  };
103  
104  struct vfio_batch {
105  	struct page		**pages;	/* for pin_user_pages_remote */
106  	struct page		*fallback_page; /* if pages alloc fails */
107  	int			capacity;	/* length of pages array */
108  	int			size;		/* of batch currently */
109  	int			offset;		/* of next entry in pages */
110  };
111  
112  struct vfio_iommu_group {
113  	struct iommu_group	*iommu_group;
114  	struct list_head	next;
115  	bool			pinned_page_dirty_scope;
116  };
117  
118  struct vfio_iova {
119  	struct list_head	list;
120  	dma_addr_t		start;
121  	dma_addr_t		end;
122  };
123  
124  /*
125   * Guest RAM pinning working set or DMA target
126   */
127  struct vfio_pfn {
128  	struct rb_node		node;
129  	dma_addr_t		iova;		/* Device address */
130  	unsigned long		pfn;		/* Host pfn */
131  	unsigned int		ref_count;
132  };
133  
134  struct vfio_regions {
135  	struct list_head list;
136  	dma_addr_t iova;
137  	phys_addr_t phys;
138  	size_t len;
139  };
140  
141  #define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
142  
143  /*
144   * The number-of-bits argument to bitmap_set() is an unsigned int, which is
145   * further cast to a signed int for the unaligned multi-bit operation,
146   * __bitmap_set().
147   * The maximum supported bitmap size is therefore 2^31 bits, i.e.
148   * 2^31 / 2^3 bits-per-byte = 2^28 bytes (256 MB) of bitmap, which covers
149   * 2^31 * 2^12 = 2^43 bytes (8 TB) of IOVA space on a 4K page system.
150   */
151  #define DIRTY_BITMAP_PAGES_MAX	 ((u64)INT_MAX)
152  #define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
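
/*
 * Worked example (illustrative only): dirty-tracking a 1 GiB vfio_dma at a
 * 4 KiB page size gives npages = 2^30 / 2^12 = 262144 bits, so
 * DIRTY_BITMAP_BYTES(262144) = ALIGN(262144, 64) / 8 = 32768 bytes (32 KiB)
 * of bitmap, well below DIRTY_BITMAP_SIZE_MAX.
 */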
153  
154  static int put_pfn(unsigned long pfn, int prot);
155  
156  static struct vfio_iommu_group*
157  vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
158  			    struct iommu_group *iommu_group);
159  
160  /*
161   * This code handles mapping and unmapping of user data buffers
162   * into DMA'ble space using the IOMMU
163   */
164  
165  static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
166  				      dma_addr_t start, size_t size)
167  {
168  	struct rb_node *node = iommu->dma_list.rb_node;
169  
170  	while (node) {
171  		struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
172  
173  		if (start + size <= dma->iova)
174  			node = node->rb_left;
175  		else if (start >= dma->iova + dma->size)
176  			node = node->rb_right;
177  		else
178  			return dma;
179  	}
180  
181  	return NULL;
182  }
183  
184  static struct rb_node *vfio_find_dma_first_node(struct vfio_iommu *iommu,
185  						dma_addr_t start, u64 size)
186  {
187  	struct rb_node *res = NULL;
188  	struct rb_node *node = iommu->dma_list.rb_node;
189  	struct vfio_dma *dma_res = NULL;
190  
191  	while (node) {
192  		struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
193  
194  		if (start < dma->iova + dma->size) {
195  			res = node;
196  			dma_res = dma;
197  			if (start >= dma->iova)
198  				break;
199  			node = node->rb_left;
200  		} else {
201  			node = node->rb_right;
202  		}
203  	}
204  	if (res && size && dma_res->iova >= start + size)
205  		res = NULL;
206  	return res;
207  }
208  
209  static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
210  {
211  	struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
212  	struct vfio_dma *dma;
213  
214  	while (*link) {
215  		parent = *link;
216  		dma = rb_entry(parent, struct vfio_dma, node);
217  
218  		if (new->iova + new->size <= dma->iova)
219  			link = &(*link)->rb_left;
220  		else
221  			link = &(*link)->rb_right;
222  	}
223  
224  	rb_link_node(&new->node, parent, link);
225  	rb_insert_color(&new->node, &iommu->dma_list);
226  }
227  
228  static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
229  {
230  	rb_erase(&old->node, &iommu->dma_list);
231  }
232  
233  
234  static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize)
235  {
236  	uint64_t npages = dma->size / pgsize;
237  
238  	if (npages > DIRTY_BITMAP_PAGES_MAX)
239  		return -EINVAL;
240  
241  	/*
242  	 * Allocate an extra 64 bits so that bitmap_shift_left() has room to
243  	 * shift and merge an unaligned number of pages with the bitmap of an
244  	 * adjacent vfio_dma range.
245  	 */
246  	dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages) + sizeof(u64),
247  			       GFP_KERNEL);
248  	if (!dma->bitmap)
249  		return -ENOMEM;
250  
251  	return 0;
252  }
253  
254  static void vfio_dma_bitmap_free(struct vfio_dma *dma)
255  {
256  	kvfree(dma->bitmap);
257  	dma->bitmap = NULL;
258  }
259  
260  static void vfio_dma_populate_bitmap(struct vfio_dma *dma, size_t pgsize)
261  {
262  	struct rb_node *p;
263  	unsigned long pgshift = __ffs(pgsize);
264  
265  	for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
266  		struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);
267  
268  		bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) >> pgshift, 1);
269  	}
270  }
271  
272  static void vfio_iommu_populate_bitmap_full(struct vfio_iommu *iommu)
273  {
274  	struct rb_node *n;
275  	unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
276  
277  	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
278  		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
279  
280  		bitmap_set(dma->bitmap, 0, dma->size >> pgshift);
281  	}
282  }
283  
284  static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, size_t pgsize)
285  {
286  	struct rb_node *n;
287  
288  	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
289  		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
290  		int ret;
291  
292  		ret = vfio_dma_bitmap_alloc(dma, pgsize);
293  		if (ret) {
294  			struct rb_node *p;
295  
296  			for (p = rb_prev(n); p; p = rb_prev(p)) {
297  				struct vfio_dma *dma = rb_entry(p,
298  							struct vfio_dma, node);
299  
300  				vfio_dma_bitmap_free(dma);
301  			}
302  			return ret;
303  		}
304  		vfio_dma_populate_bitmap(dma, pgsize);
305  	}
306  	return 0;
307  }
308  
309  static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
310  {
311  	struct rb_node *n;
312  
313  	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
314  		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
315  
316  		vfio_dma_bitmap_free(dma);
317  	}
318  }
319  
320  /*
321   * Helper Functions for host iova-pfn list
322   */
323  static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova)
324  {
325  	struct vfio_pfn *vpfn;
326  	struct rb_node *node = dma->pfn_list.rb_node;
327  
328  	while (node) {
329  		vpfn = rb_entry(node, struct vfio_pfn, node);
330  
331  		if (iova < vpfn->iova)
332  			node = node->rb_left;
333  		else if (iova > vpfn->iova)
334  			node = node->rb_right;
335  		else
336  			return vpfn;
337  	}
338  	return NULL;
339  }
340  
341  static void vfio_link_pfn(struct vfio_dma *dma,
342  			  struct vfio_pfn *new)
343  {
344  	struct rb_node **link, *parent = NULL;
345  	struct vfio_pfn *vpfn;
346  
347  	link = &dma->pfn_list.rb_node;
348  	while (*link) {
349  		parent = *link;
350  		vpfn = rb_entry(parent, struct vfio_pfn, node);
351  
352  		if (new->iova < vpfn->iova)
353  			link = &(*link)->rb_left;
354  		else
355  			link = &(*link)->rb_right;
356  	}
357  
358  	rb_link_node(&new->node, parent, link);
359  	rb_insert_color(&new->node, &dma->pfn_list);
360  }
361  
362  static void vfio_unlink_pfn(struct vfio_dma *dma, struct vfio_pfn *old)
363  {
364  	rb_erase(&old->node, &dma->pfn_list);
365  }
366  
367  static int vfio_add_to_pfn_list(struct vfio_dma *dma, dma_addr_t iova,
368  				unsigned long pfn)
369  {
370  	struct vfio_pfn *vpfn;
371  
372  	vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL);
373  	if (!vpfn)
374  		return -ENOMEM;
375  
376  	vpfn->iova = iova;
377  	vpfn->pfn = pfn;
378  	vpfn->ref_count = 1;
379  	vfio_link_pfn(dma, vpfn);
380  	return 0;
381  }
382  
383  static void vfio_remove_from_pfn_list(struct vfio_dma *dma,
384  				      struct vfio_pfn *vpfn)
385  {
386  	vfio_unlink_pfn(dma, vpfn);
387  	kfree(vpfn);
388  }
389  
390  static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma,
391  					       unsigned long iova)
392  {
393  	struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
394  
395  	if (vpfn)
396  		vpfn->ref_count++;
397  	return vpfn;
398  }
399  
400  static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
401  {
402  	int ret = 0;
403  
404  	vpfn->ref_count--;
405  	if (!vpfn->ref_count) {
406  		ret = put_pfn(vpfn->pfn, dma->prot);
407  		vfio_remove_from_pfn_list(dma, vpfn);
408  	}
409  	return ret;
410  }
411  
412  static int mm_lock_acct(struct task_struct *task, struct mm_struct *mm,
413  			bool lock_cap, long npage)
414  {
415  	int ret = mmap_write_lock_killable(mm);
416  
417  	if (ret)
418  		return ret;
419  
420  	ret = __account_locked_vm(mm, abs(npage), npage > 0, task, lock_cap);
421  	mmap_write_unlock(mm);
422  	return ret;
423  }
424  
425  static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
426  {
427  	struct mm_struct *mm;
428  	int ret;
429  
430  	if (!npage)
431  		return 0;
432  
433  	mm = dma->mm;
434  	if (async && !mmget_not_zero(mm))
435  		return -ESRCH; /* process exited */
436  
437  	ret = mm_lock_acct(dma->task, mm, dma->lock_cap, npage);
438  	if (!ret)
439  		dma->locked_vm += npage;
440  
441  	if (async)
442  		mmput(mm);
443  
444  	return ret;
445  }
446  
447  /*
448   * Some mappings aren't backed by a struct page, for example an mmap'd
449   * MMIO range for our own or another device.  These use a different
450   * pfn conversion and shouldn't be tracked as locked pages.
451   * For compound pages, any driver that sets the reserved bit in the head
452   * page needs to set the reserved bit in all subpages to be safe.
453   */
454  static bool is_invalid_reserved_pfn(unsigned long pfn)
455  {
456  	if (pfn_valid(pfn))
457  		return PageReserved(pfn_to_page(pfn));
458  
459  	return true;
460  }
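
/*
 * Example (illustrative): a pfn obtained from a VM_PFNMAP vma backing an
 * mmap'd PCI BAR typically has no struct page, so pfn_valid() is false and
 * the pfn is treated as reserved: it is neither reference-counted by
 * put_pfn() nor charged to the user's locked memory.
 */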
461  
462  static int put_pfn(unsigned long pfn, int prot)
463  {
464  	if (!is_invalid_reserved_pfn(pfn)) {
465  		struct page *page = pfn_to_page(pfn);
466  
467  		unpin_user_pages_dirty_lock(&page, 1, prot & IOMMU_WRITE);
468  		return 1;
469  	}
470  	return 0;
471  }
472  
473  #define VFIO_BATCH_MAX_CAPACITY (PAGE_SIZE / sizeof(struct page *))
474  
475  static void vfio_batch_init(struct vfio_batch *batch)
476  {
477  	batch->size = 0;
478  	batch->offset = 0;
479  
480  	if (unlikely(disable_hugepages))
481  		goto fallback;
482  
483  	batch->pages = (struct page **) __get_free_page(GFP_KERNEL);
484  	if (!batch->pages)
485  		goto fallback;
486  
487  	batch->capacity = VFIO_BATCH_MAX_CAPACITY;
488  	return;
489  
490  fallback:
491  	batch->pages = &batch->fallback_page;
492  	batch->capacity = 1;
493  }
494  
495  static void vfio_batch_unpin(struct vfio_batch *batch, struct vfio_dma *dma)
496  {
497  	while (batch->size) {
498  		unsigned long pfn = page_to_pfn(batch->pages[batch->offset]);
499  
500  		put_pfn(pfn, dma->prot);
501  		batch->offset++;
502  		batch->size--;
503  	}
504  }
505  
506  static void vfio_batch_fini(struct vfio_batch *batch)
507  {
508  	if (batch->capacity == VFIO_BATCH_MAX_CAPACITY)
509  		free_page((unsigned long)batch->pages);
510  }
511  
512  static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm,
513  			    unsigned long vaddr, unsigned long *pfn,
514  			    bool write_fault)
515  {
516  	struct follow_pfnmap_args args = { .vma = vma, .address = vaddr };
517  	int ret;
518  
519  	ret = follow_pfnmap_start(&args);
520  	if (ret) {
521  		bool unlocked = false;
522  
523  		ret = fixup_user_fault(mm, vaddr,
524  				       FAULT_FLAG_REMOTE |
525  				       (write_fault ?  FAULT_FLAG_WRITE : 0),
526  				       &unlocked);
527  		if (unlocked)
528  			return -EAGAIN;
529  
530  		if (ret)
531  			return ret;
532  
533  		ret = follow_pfnmap_start(&args);
534  		if (ret)
535  			return ret;
536  	}
537  
538  	if (write_fault && !args.writable)
539  		ret = -EFAULT;
540  	else
541  		*pfn = args.pfn;
542  
543  	follow_pfnmap_end(&args);
544  	return ret;
545  }
546  
547  /*
548   * Returns the positive number of pfns successfully obtained or a negative
549   * error code.
550   */
551  static int vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr,
552  			  long npages, int prot, unsigned long *pfn,
553  			  struct page **pages)
554  {
555  	struct vm_area_struct *vma;
556  	unsigned int flags = 0;
557  	int ret;
558  
559  	if (prot & IOMMU_WRITE)
560  		flags |= FOLL_WRITE;
561  
562  	mmap_read_lock(mm);
563  	ret = pin_user_pages_remote(mm, vaddr, npages, flags | FOLL_LONGTERM,
564  				    pages, NULL);
565  	if (ret > 0) {
566  		*pfn = page_to_pfn(pages[0]);
567  		goto done;
568  	}
569  
570  	vaddr = untagged_addr_remote(mm, vaddr);
571  
572  retry:
573  	vma = vma_lookup(mm, vaddr);
574  
575  	if (vma && vma->vm_flags & VM_PFNMAP) {
576  		ret = follow_fault_pfn(vma, mm, vaddr, pfn, prot & IOMMU_WRITE);
577  		if (ret == -EAGAIN)
578  			goto retry;
579  
580  		if (!ret) {
581  			if (is_invalid_reserved_pfn(*pfn))
582  				ret = 1;
583  			else
584  				ret = -EFAULT;
585  		}
586  	}
587  done:
588  	mmap_read_unlock(mm);
589  	return ret;
590  }
591  
592  /*
593   * Attempt to pin pages.  We really don't want to track all the pfns and
594   * the iommu can only map chunks of consecutive pfns anyway, so get the
595   * first page and all consecutive pages with the same locking.
596   */
597  static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
598  				  long npage, unsigned long *pfn_base,
599  				  unsigned long limit, struct vfio_batch *batch)
600  {
601  	unsigned long pfn;
602  	struct mm_struct *mm = current->mm;
603  	long ret, pinned = 0, lock_acct = 0;
604  	bool rsvd;
605  	dma_addr_t iova = vaddr - dma->vaddr + dma->iova;
606  
607  	/* This code path is only user initiated */
608  	if (!mm)
609  		return -ENODEV;
610  
611  	if (batch->size) {
612  		/* Leftover pages in batch from an earlier call. */
613  		*pfn_base = page_to_pfn(batch->pages[batch->offset]);
614  		pfn = *pfn_base;
615  		rsvd = is_invalid_reserved_pfn(*pfn_base);
616  	} else {
617  		*pfn_base = 0;
618  	}
619  
620  	while (npage) {
621  		if (!batch->size) {
622  			/* Empty batch, so refill it. */
623  			long req_pages = min_t(long, npage, batch->capacity);
624  
625  			ret = vaddr_get_pfns(mm, vaddr, req_pages, dma->prot,
626  					     &pfn, batch->pages);
627  			if (ret < 0)
628  				goto unpin_out;
629  
630  			batch->size = ret;
631  			batch->offset = 0;
632  
633  			if (!*pfn_base) {
634  				*pfn_base = pfn;
635  				rsvd = is_invalid_reserved_pfn(*pfn_base);
636  			}
637  		}
638  
639  		/*
640  		 * pfn is preset for the first iteration of this inner loop and
641  		 * updated at the end to handle a VM_PFNMAP pfn.  In that case,
642  		 * batch->pages isn't valid (there's no struct page), so allow
643  		 * batch->pages to be touched only when there's more than one
644  		 * pfn to check, which guarantees the pfns are from a
645  		 * !VM_PFNMAP vma.
646  		 */
647  		while (true) {
648  			if (pfn != *pfn_base + pinned ||
649  			    rsvd != is_invalid_reserved_pfn(pfn))
650  				goto out;
651  
652  			/*
653  			 * Reserved pages aren't counted against the user, and
654  			 * externally pinned pages are already counted against
655  			 * the user.
656  			 */
657  			if (!rsvd && !vfio_find_vpfn(dma, iova)) {
658  				if (!dma->lock_cap &&
659  				    mm->locked_vm + lock_acct + 1 > limit) {
660  					pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
661  						__func__, limit << PAGE_SHIFT);
662  					ret = -ENOMEM;
663  					goto unpin_out;
664  				}
665  				lock_acct++;
666  			}
667  
668  			pinned++;
669  			npage--;
670  			vaddr += PAGE_SIZE;
671  			iova += PAGE_SIZE;
672  			batch->offset++;
673  			batch->size--;
674  
675  			if (!batch->size)
676  				break;
677  
678  			pfn = page_to_pfn(batch->pages[batch->offset]);
679  		}
680  
681  		if (unlikely(disable_hugepages))
682  			break;
683  	}
684  
685  out:
686  	ret = vfio_lock_acct(dma, lock_acct, false);
687  
688  unpin_out:
689  	if (batch->size == 1 && !batch->offset) {
690  		/* May be a VM_PFNMAP pfn, which the batch can't remember. */
691  		put_pfn(pfn, dma->prot);
692  		batch->size = 0;
693  	}
694  
695  	if (ret < 0) {
696  		if (pinned && !rsvd) {
697  			for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
698  				put_pfn(pfn, dma->prot);
699  		}
700  		vfio_batch_unpin(batch, dma);
701  
702  		return ret;
703  	}
704  
705  	return pinned;
706  }
707  
708  static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
709  				    unsigned long pfn, long npage,
710  				    bool do_accounting)
711  {
712  	long unlocked = 0, locked = 0;
713  	long i;
714  
715  	for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
716  		if (put_pfn(pfn++, dma->prot)) {
717  			unlocked++;
718  			if (vfio_find_vpfn(dma, iova))
719  				locked++;
720  		}
721  	}
722  
723  	if (do_accounting)
724  		vfio_lock_acct(dma, locked - unlocked, true);
725  
726  	return unlocked;
727  }
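
/*
 * Illustrative sketch only, not used by the driver: the intended lifecycle
 * of a vfio_batch around the pin/unpin helpers above, mirroring what
 * vfio_pin_map_dma() does further below.  The function name and the
 * single-page request size are hypothetical.
 */
static int __maybe_unused vfio_batch_usage_sketch(struct vfio_dma *dma)
{
	unsigned long pfn, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	struct vfio_batch batch;
	long npage;

	vfio_batch_init(&batch);

	/* Pin one contiguous chunk of pages starting at dma->vaddr. */
	npage = vfio_pin_pages_remote(dma, dma->vaddr, 1, &pfn, limit, &batch);
	if (npage > 0)
		/* Drop the pins and the locked-memory accounting again. */
		vfio_unpin_pages_remote(dma, dma->iova, pfn, npage, true);

	vfio_batch_fini(&batch);

	return npage < 0 ? (int)npage : 0;
}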
728  
729  static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
730  				  unsigned long *pfn_base, bool do_accounting)
731  {
732  	struct page *pages[1];
733  	struct mm_struct *mm;
734  	int ret;
735  
736  	mm = dma->mm;
737  	if (!mmget_not_zero(mm))
738  		return -ENODEV;
739  
740  	ret = vaddr_get_pfns(mm, vaddr, 1, dma->prot, pfn_base, pages);
741  	if (ret != 1)
742  		goto out;
743  
744  	ret = 0;
745  
746  	if (do_accounting && !is_invalid_reserved_pfn(*pfn_base)) {
747  		ret = vfio_lock_acct(dma, 1, false);
748  		if (ret) {
749  			put_pfn(*pfn_base, dma->prot);
750  			if (ret == -ENOMEM)
751  				pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK "
752  					"(%ld) exceeded\n", __func__,
753  					dma->task->comm, task_pid_nr(dma->task),
754  					task_rlimit(dma->task, RLIMIT_MEMLOCK));
755  		}
756  	}
757  
758  out:
759  	mmput(mm);
760  	return ret;
761  }
762  
763  static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
764  				    bool do_accounting)
765  {
766  	int unlocked;
767  	struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
768  
769  	if (!vpfn)
770  		return 0;
771  
772  	unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);
773  
774  	if (do_accounting)
775  		vfio_lock_acct(dma, -unlocked, true);
776  
777  	return unlocked;
778  }
779  
780  static int vfio_iommu_type1_pin_pages(void *iommu_data,
781  				      struct iommu_group *iommu_group,
782  				      dma_addr_t user_iova,
783  				      int npage, int prot,
784  				      struct page **pages)
785  {
786  	struct vfio_iommu *iommu = iommu_data;
787  	struct vfio_iommu_group *group;
788  	int i, j, ret;
789  	unsigned long remote_vaddr;
790  	struct vfio_dma *dma;
791  	bool do_accounting;
792  
793  	if (!iommu || !pages)
794  		return -EINVAL;
795  
796  	/* Supported for v2 version only */
797  	if (!iommu->v2)
798  		return -EACCES;
799  
800  	mutex_lock(&iommu->lock);
801  
802  	if (WARN_ONCE(iommu->vaddr_invalid_count,
803  		      "vfio_pin_pages not allowed with VFIO_UPDATE_VADDR\n")) {
804  		ret = -EBUSY;
805  		goto pin_done;
806  	}
807  
808  	/* Fail if no dma_unmap notifier is registered */
809  	if (list_empty(&iommu->device_list)) {
810  		ret = -EINVAL;
811  		goto pin_done;
812  	}
813  
814  	/*
815  	 * If an IOMMU-capable domain exists in the container then all pages are
816  	 * already pinned and accounted.  Accounting is only needed when there is
817  	 * no IOMMU-capable domain in the container.
818  	 */
819  	do_accounting = list_empty(&iommu->domain_list);
820  
821  	for (i = 0; i < npage; i++) {
822  		unsigned long phys_pfn;
823  		dma_addr_t iova;
824  		struct vfio_pfn *vpfn;
825  
826  		iova = user_iova + PAGE_SIZE * i;
827  		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
828  		if (!dma) {
829  			ret = -EINVAL;
830  			goto pin_unwind;
831  		}
832  
833  		if ((dma->prot & prot) != prot) {
834  			ret = -EPERM;
835  			goto pin_unwind;
836  		}
837  
838  		vpfn = vfio_iova_get_vfio_pfn(dma, iova);
839  		if (vpfn) {
840  			pages[i] = pfn_to_page(vpfn->pfn);
841  			continue;
842  		}
843  
844  		remote_vaddr = dma->vaddr + (iova - dma->iova);
845  		ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn,
846  					     do_accounting);
847  		if (ret)
848  			goto pin_unwind;
849  
850  		if (!pfn_valid(phys_pfn)) {
851  			ret = -EINVAL;
852  			goto pin_unwind;
853  		}
854  
855  		ret = vfio_add_to_pfn_list(dma, iova, phys_pfn);
856  		if (ret) {
857  			if (put_pfn(phys_pfn, dma->prot) && do_accounting)
858  				vfio_lock_acct(dma, -1, true);
859  			goto pin_unwind;
860  		}
861  
862  		pages[i] = pfn_to_page(phys_pfn);
863  
864  		if (iommu->dirty_page_tracking) {
865  			unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
866  
867  			/*
868  			 * Bitmap populated with the smallest supported page
869  			 * size
870  			 */
871  			bitmap_set(dma->bitmap,
872  				   (iova - dma->iova) >> pgshift, 1);
873  		}
874  	}
875  	ret = i;
876  
877  	group = vfio_iommu_find_iommu_group(iommu, iommu_group);
878  	if (!group->pinned_page_dirty_scope) {
879  		group->pinned_page_dirty_scope = true;
880  		iommu->num_non_pinned_groups--;
881  	}
882  
883  	goto pin_done;
884  
885  pin_unwind:
886  	pages[i] = NULL;
887  	for (j = 0; j < i; j++) {
888  		dma_addr_t iova;
889  
890  		iova = user_iova + PAGE_SIZE * j;
891  		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
892  		vfio_unpin_page_external(dma, iova, do_accounting);
893  		pages[j] = NULL;
894  	}
895  pin_done:
896  	mutex_unlock(&iommu->lock);
897  	return ret;
898  }
899  
900  static void vfio_iommu_type1_unpin_pages(void *iommu_data,
901  					 dma_addr_t user_iova, int npage)
902  {
903  	struct vfio_iommu *iommu = iommu_data;
904  	bool do_accounting;
905  	int i;
906  
907  	/* Supported for v2 version only */
908  	if (WARN_ON(!iommu->v2))
909  		return;
910  
911  	mutex_lock(&iommu->lock);
912  
913  	do_accounting = list_empty(&iommu->domain_list);
914  	for (i = 0; i < npage; i++) {
915  		dma_addr_t iova = user_iova + PAGE_SIZE * i;
916  		struct vfio_dma *dma;
917  
918  		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
919  		if (!dma)
920  			break;
921  
922  		vfio_unpin_page_external(dma, iova, do_accounting);
923  	}
924  
925  	mutex_unlock(&iommu->lock);
926  
927  	WARN_ON(i != npage);
928  }
929  
930  static long vfio_sync_unpin(struct vfio_dma *dma, struct vfio_domain *domain,
931  			    struct list_head *regions,
932  			    struct iommu_iotlb_gather *iotlb_gather)
933  {
934  	long unlocked = 0;
935  	struct vfio_regions *entry, *next;
936  
937  	iommu_iotlb_sync(domain->domain, iotlb_gather);
938  
939  	list_for_each_entry_safe(entry, next, regions, list) {
940  		unlocked += vfio_unpin_pages_remote(dma,
941  						    entry->iova,
942  						    entry->phys >> PAGE_SHIFT,
943  						    entry->len >> PAGE_SHIFT,
944  						    false);
945  		list_del(&entry->list);
946  		kfree(entry);
947  	}
948  
949  	cond_resched();
950  
951  	return unlocked;
952  }
953  
954  /*
955   * Generally, VFIO needs to unpin remote pages after each IOTLB flush.
956   * Therefore, when using the IOTLB flush sync interface, VFIO needs to keep
957   * track of these regions (currently using a list).
958   *
959   * This value specifies the maximum number of regions for each IOTLB flush sync.
960   */
961  #define VFIO_IOMMU_TLB_SYNC_MAX		512
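
/*
 * For example (illustrative): bounding the deferred-unpin list at 512
 * entries caps the temporary vfio_regions allocations at
 * 512 * sizeof(struct vfio_regions) per sync, while still letting a single
 * iommu_iotlb_sync() cover many unmapped regions.
 */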
962  
963  static size_t unmap_unpin_fast(struct vfio_domain *domain,
964  			       struct vfio_dma *dma, dma_addr_t *iova,
965  			       size_t len, phys_addr_t phys, long *unlocked,
966  			       struct list_head *unmapped_list,
967  			       int *unmapped_cnt,
968  			       struct iommu_iotlb_gather *iotlb_gather)
969  {
970  	size_t unmapped = 0;
971  	struct vfio_regions *entry = kzalloc(sizeof(*entry), GFP_KERNEL);
972  
973  	if (entry) {
974  		unmapped = iommu_unmap_fast(domain->domain, *iova, len,
975  					    iotlb_gather);
976  
977  		if (!unmapped) {
978  			kfree(entry);
979  		} else {
980  			entry->iova = *iova;
981  			entry->phys = phys;
982  			entry->len  = unmapped;
983  			list_add_tail(&entry->list, unmapped_list);
984  
985  			*iova += unmapped;
986  			(*unmapped_cnt)++;
987  		}
988  	}
989  
990  	/*
991  	 * Sync if the number of fast-unmap regions hits the limit
992  	 * or in case of errors.
993  	 */
994  	if (*unmapped_cnt >= VFIO_IOMMU_TLB_SYNC_MAX || !unmapped) {
995  		*unlocked += vfio_sync_unpin(dma, domain, unmapped_list,
996  					     iotlb_gather);
997  		*unmapped_cnt = 0;
998  	}
999  
1000  	return unmapped;
1001  }
1002  
1003  static size_t unmap_unpin_slow(struct vfio_domain *domain,
1004  			       struct vfio_dma *dma, dma_addr_t *iova,
1005  			       size_t len, phys_addr_t phys,
1006  			       long *unlocked)
1007  {
1008  	size_t unmapped = iommu_unmap(domain->domain, *iova, len);
1009  
1010  	if (unmapped) {
1011  		*unlocked += vfio_unpin_pages_remote(dma, *iova,
1012  						     phys >> PAGE_SHIFT,
1013  						     unmapped >> PAGE_SHIFT,
1014  						     false);
1015  		*iova += unmapped;
1016  		cond_resched();
1017  	}
1018  	return unmapped;
1019  }
1020  
1021  static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
1022  			     bool do_accounting)
1023  {
1024  	dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
1025  	struct vfio_domain *domain, *d;
1026  	LIST_HEAD(unmapped_region_list);
1027  	struct iommu_iotlb_gather iotlb_gather;
1028  	int unmapped_region_cnt = 0;
1029  	long unlocked = 0;
1030  
1031  	if (!dma->size)
1032  		return 0;
1033  
1034  	if (list_empty(&iommu->domain_list))
1035  		return 0;
1036  
1037  	/*
1038  	 * We use the IOMMU to track the physical addresses, otherwise we'd
1039  	 * need a much more complicated tracking system.  Unfortunately that
1040  	 * means we need to use one of the iommu domains to figure out the
1041  	 * pfns to unpin.  The rest need to be unmapped in advance so we have
1042  	 * no iommu translations remaining when the pages are unpinned.
1043  	 */
1044  	domain = d = list_first_entry(&iommu->domain_list,
1045  				      struct vfio_domain, next);
1046  
1047  	list_for_each_entry_continue(d, &iommu->domain_list, next) {
1048  		iommu_unmap(d->domain, dma->iova, dma->size);
1049  		cond_resched();
1050  	}
1051  
1052  	iommu_iotlb_gather_init(&iotlb_gather);
1053  	while (iova < end) {
1054  		size_t unmapped, len;
1055  		phys_addr_t phys, next;
1056  
1057  		phys = iommu_iova_to_phys(domain->domain, iova);
1058  		if (WARN_ON(!phys)) {
1059  			iova += PAGE_SIZE;
1060  			continue;
1061  		}
1062  
1063  		/*
1064  		 * To optimize for fewer iommu_unmap() calls, each of which
1065  		 * may require hardware cache flushing, try to find the
1066  		 * largest contiguous physical memory chunk to unmap.
1067  		 */
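		/*
		 * For example (illustrative): if six consecutive IOVA pages
		 * translate to physically contiguous memory, len grows to six
		 * pages and a single unmap call below covers all of them.
		 * When domain->fgsp is set this search is skipped entirely.
		 */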
1068  		for (len = PAGE_SIZE;
1069  		     !domain->fgsp && iova + len < end; len += PAGE_SIZE) {
1070  			next = iommu_iova_to_phys(domain->domain, iova + len);
1071  			if (next != phys + len)
1072  				break;
1073  		}
1074  
1075  		/*
1076  		 * First, try to use fast unmap/unpin. In case of failure,
1077  		 * switch to slow unmap/unpin path.
1078  		 */
1079  		unmapped = unmap_unpin_fast(domain, dma, &iova, len, phys,
1080  					    &unlocked, &unmapped_region_list,
1081  					    &unmapped_region_cnt,
1082  					    &iotlb_gather);
1083  		if (!unmapped) {
1084  			unmapped = unmap_unpin_slow(domain, dma, &iova, len,
1085  						    phys, &unlocked);
1086  			if (WARN_ON(!unmapped))
1087  				break;
1088  		}
1089  	}
1090  
1091  	dma->iommu_mapped = false;
1092  
1093  	if (unmapped_region_cnt) {
1094  		unlocked += vfio_sync_unpin(dma, domain, &unmapped_region_list,
1095  					    &iotlb_gather);
1096  	}
1097  
1098  	if (do_accounting) {
1099  		vfio_lock_acct(dma, -unlocked, true);
1100  		return 0;
1101  	}
1102  	return unlocked;
1103  }
1104  
1105  static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
1106  {
1107  	WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list));
1108  	vfio_unmap_unpin(iommu, dma, true);
1109  	vfio_unlink_dma(iommu, dma);
1110  	put_task_struct(dma->task);
1111  	mmdrop(dma->mm);
1112  	vfio_dma_bitmap_free(dma);
1113  	if (dma->vaddr_invalid)
1114  		iommu->vaddr_invalid_count--;
1115  	kfree(dma);
1116  	iommu->dma_avail++;
1117  }
1118  
1119  static void vfio_update_pgsize_bitmap(struct vfio_iommu *iommu)
1120  {
1121  	struct vfio_domain *domain;
1122  
1123  	iommu->pgsize_bitmap = ULONG_MAX;
1124  
1125  	list_for_each_entry(domain, &iommu->domain_list, next)
1126  		iommu->pgsize_bitmap &= domain->domain->pgsize_bitmap;
1127  
1128  	/*
1129  	 * If the IOMMU supports page sizes smaller than PAGE_SIZE, we pretend
1130  	 * that only PAGE_SIZE and larger sizes are supported and hide the
1131  	 * sub-PAGE_SIZE sizes.  That way the user can map/unmap buffers whose
1132  	 * size and start address are aligned to PAGE_SIZE.  The pinning code
1133  	 * uses that granularity, while the IOMMU driver may still use a
1134  	 * sub-PAGE_SIZE size to map the buffer.
1135  	 */
1136  	if (iommu->pgsize_bitmap & ~PAGE_MASK) {
1137  		iommu->pgsize_bitmap &= PAGE_MASK;
1138  		iommu->pgsize_bitmap |= PAGE_SIZE;
1139  	}
1140  }
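
/*
 * Worked example (illustrative, hypothetical bitmaps): with one domain
 * supporting 4K|2M|1G (0x40201000) and another supporting 4K|64K|2M
 * (0x00211000), the intersection is 4K|2M (0x00201000).  On a 64K PAGE_SIZE
 * kernel the 4K bit is below PAGE_SIZE, so it is masked off and PAGE_SIZE is
 * set instead, leaving 64K|2M (0x00210000).
 */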
1141  
1142  static int update_user_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
1143  			      struct vfio_dma *dma, dma_addr_t base_iova,
1144  			      size_t pgsize)
1145  {
1146  	unsigned long pgshift = __ffs(pgsize);
1147  	unsigned long nbits = dma->size >> pgshift;
1148  	unsigned long bit_offset = (dma->iova - base_iova) >> pgshift;
1149  	unsigned long copy_offset = bit_offset / BITS_PER_LONG;
1150  	unsigned long shift = bit_offset % BITS_PER_LONG;
1151  	unsigned long leftover;
1152  
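	/*
	 * Worked example (illustrative): with pgsize = 4K (pgshift = 12),
	 * base_iova = 0 and dma->iova = 0x5000, bit_offset = 5, so
	 * copy_offset = 0 and shift = 5 on a 64-bit host.  dma->bitmap is
	 * shifted left by 5 bits and OR'd with the low 5 bits already present
	 * in the user's first bitmap word before being copied out.
	 */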
1153  	/*
1154  	 * Mark all pages dirty if any IOMMU-capable device is not able
1155  	 * to report dirty pages and all pages are pinned and mapped.
1156  	 */
1157  	if (iommu->num_non_pinned_groups && dma->iommu_mapped)
1158  		bitmap_set(dma->bitmap, 0, nbits);
1159  
1160  	if (shift) {
1161  		bitmap_shift_left(dma->bitmap, dma->bitmap, shift,
1162  				  nbits + shift);
1163  
1164  		if (copy_from_user(&leftover,
1165  				   (void __user *)(bitmap + copy_offset),
1166  				   sizeof(leftover)))
1167  			return -EFAULT;
1168  
1169  		bitmap_or(dma->bitmap, dma->bitmap, &leftover, shift);
1170  	}
1171  
1172  	if (copy_to_user((void __user *)(bitmap + copy_offset), dma->bitmap,
1173  			 DIRTY_BITMAP_BYTES(nbits + shift)))
1174  		return -EFAULT;
1175  
1176  	return 0;
1177  }
1178  
1179  static int vfio_iova_dirty_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
1180  				  dma_addr_t iova, size_t size, size_t pgsize)
1181  {
1182  	struct vfio_dma *dma;
1183  	struct rb_node *n;
1184  	unsigned long pgshift = __ffs(pgsize);
1185  	int ret;
1186  
1187  	/*
1188  	 * GET_BITMAP request must fully cover vfio_dma mappings.  Multiple
1189   * vfio_dma mappings may be combined by specifying large ranges, but
1190  	 * there must not be any previous mappings bisected by the range.
1191  	 * An error will be returned if these conditions are not met.
1192  	 */
1193  	dma = vfio_find_dma(iommu, iova, 1);
1194  	if (dma && dma->iova != iova)
1195  		return -EINVAL;
1196  
1197  	dma = vfio_find_dma(iommu, iova + size - 1, 0);
1198  	if (dma && dma->iova + dma->size != iova + size)
1199  		return -EINVAL;
1200  
1201  	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
1202  		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
1203  
1204  		if (dma->iova < iova)
1205  			continue;
1206  
1207  		if (dma->iova > iova + size - 1)
1208  			break;
1209  
1210  		ret = update_user_bitmap(bitmap, iommu, dma, iova, pgsize);
1211  		if (ret)
1212  			return ret;
1213  
1214  		/*
1215  		 * Re-populate the bitmap to include all pinned pages, which are
1216  		 * considered dirty, but exclude pages which have been unpinned
1217  		 * and pages which were only marked dirty by vfio_dma_rw().
1218  		 */
1219  		bitmap_clear(dma->bitmap, 0, dma->size >> pgshift);
1220  		vfio_dma_populate_bitmap(dma, pgsize);
1221  	}
1222  	return 0;
1223  }
1224  
1225  static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
1226  {
1227  	if (!npages || !bitmap_size || (bitmap_size > DIRTY_BITMAP_SIZE_MAX) ||
1228  	    (bitmap_size < DIRTY_BITMAP_BYTES(npages)))
1229  		return -EINVAL;
1230  
1231  	return 0;
1232  }
1233  
1234  /*
1235   * Notify VFIO drivers using vfio_register_emulated_iommu_dev() to invalidate
1236   * and unmap iovas within the range we're about to unmap. Drivers MUST unpin
1237   * pages in response to an invalidation.
1238   */
1239  static void vfio_notify_dma_unmap(struct vfio_iommu *iommu,
1240  				  struct vfio_dma *dma)
1241  {
1242  	struct vfio_device *device;
1243  
1244  	if (list_empty(&iommu->device_list))
1245  		return;
1246  
1247  	/*
1248  	 * The device is expected to call vfio_unpin_pages() for any IOVA it has
1249  	 * pinned within the range. Since vfio_unpin_pages() will eventually
1250  	 * call back down to this code and try to obtain the iommu->lock we must
1251  	 * drop it.
1252  	 */
1253  	mutex_lock(&iommu->device_list_lock);
1254  	mutex_unlock(&iommu->lock);
1255  
1256  	list_for_each_entry(device, &iommu->device_list, iommu_entry)
1257  		device->ops->dma_unmap(device, dma->iova, dma->size);
1258  
1259  	mutex_unlock(&iommu->device_list_lock);
1260  	mutex_lock(&iommu->lock);
1261  }
1262  
1263  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
1264  			     struct vfio_iommu_type1_dma_unmap *unmap,
1265  			     struct vfio_bitmap *bitmap)
1266  {
1267  	struct vfio_dma *dma, *dma_last = NULL;
1268  	size_t unmapped = 0, pgsize;
1269  	int ret = -EINVAL, retries = 0;
1270  	unsigned long pgshift;
1271  	dma_addr_t iova = unmap->iova;
1272  	u64 size = unmap->size;
1273  	bool unmap_all = unmap->flags & VFIO_DMA_UNMAP_FLAG_ALL;
1274  	bool invalidate_vaddr = unmap->flags & VFIO_DMA_UNMAP_FLAG_VADDR;
1275  	struct rb_node *n, *first_n;
1276  
1277  	mutex_lock(&iommu->lock);
1278  
1279  	/* Cannot update vaddr if mdev is present. */
1280  	if (invalidate_vaddr && !list_empty(&iommu->emulated_iommu_groups)) {
1281  		ret = -EBUSY;
1282  		goto unlock;
1283  	}
1284  
1285  	pgshift = __ffs(iommu->pgsize_bitmap);
1286  	pgsize = (size_t)1 << pgshift;
1287  
1288  	if (iova & (pgsize - 1))
1289  		goto unlock;
1290  
1291  	if (unmap_all) {
1292  		if (iova || size)
1293  			goto unlock;
1294  		size = U64_MAX;
1295  	} else if (!size || size & (pgsize - 1) ||
1296  		   iova + size - 1 < iova || size > SIZE_MAX) {
1297  		goto unlock;
1298  	}
1299  
1300  	/* When dirty tracking is enabled, allow only min supported pgsize */
1301  	if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
1302  	    (!iommu->dirty_page_tracking || (bitmap->pgsize != pgsize))) {
1303  		goto unlock;
1304  	}
1305  
1306  	WARN_ON((pgsize - 1) & PAGE_MASK);
1307  again:
1308  	/*
1309  	 * vfio-iommu-type1 (v1) - User mappings were coalesced together to
1310  	 * avoid tracking individual mappings.  This means that the granularity
1311  	 * of the original mapping was lost and the user was allowed to attempt
1312  	 * to unmap any range.  Depending on the contiguousness of physical
1313  	 * memory and page sizes supported by the IOMMU, arbitrary unmaps may
1314  	 * or may not have worked.  We only guaranteed unmap granularity
1315  	 * matching the original mapping; even though it was untracked here,
1316  	 * the original mappings are reflected in IOMMU mappings.  This
1317  	 * resulted in a couple unusual behaviors.  First, if a range is not
1318  	 * able to be unmapped, ex. a set of 4k pages that was mapped as a
1319  	 * 2M hugepage into the IOMMU, the unmap ioctl returns success but with
1320  	 * a zero sized unmap.  Also, if an unmap request overlaps the first
1321  	 * address of a hugepage, the IOMMU will unmap the entire hugepage.
1322  	 * This also returns success and the returned unmap size reflects the
1323  	 * actual size unmapped.
1324  	 *
1325  	 * We attempt to maintain compatibility with this "v1" interface, but
1326  	 * we take control out of the hands of the IOMMU.  Therefore, an unmap
1327  	 * request offset from the beginning of the original mapping will
1328  	 * return success with zero sized unmap.  And an unmap request covering
1329  	 * the first iova of mapping will unmap the entire range.
1330  	 *
1331  	 * The v2 version of this interface intends to be more deterministic.
1332  	 * Unmap requests must fully cover previous mappings.  Multiple
1333  	 * mappings may still be unmapped by specifying large ranges, but there
1334  	 * must not be any previous mappings bisected by the range.  An error
1335  	 * will be returned if these conditions are not met.  The v2 interface
1336  	 * will only return success and a size of zero if there were no
1337  	 * mappings within the range.
1338  	 */
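	/*
	 * Illustrative example of the v2 rules: with a single 2 MB mapping at
	 * iova 0x100000, unmapping [0x100000, +2 MB) succeeds and reports 2 MB,
	 * unmapping [0x100000, +1 MB) fails with -EINVAL because it would
	 * bisect the mapping, and unmapping [0, +16 MB) succeeds and still
	 * reports 2 MB since only that one mapping fell within the range.
	 */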
1339  	if (iommu->v2 && !unmap_all) {
1340  		dma = vfio_find_dma(iommu, iova, 1);
1341  		if (dma && dma->iova != iova)
1342  			goto unlock;
1343  
1344  		dma = vfio_find_dma(iommu, iova + size - 1, 0);
1345  		if (dma && dma->iova + dma->size != iova + size)
1346  			goto unlock;
1347  	}
1348  
1349  	ret = 0;
1350  	n = first_n = vfio_find_dma_first_node(iommu, iova, size);
1351  
1352  	while (n) {
1353  		dma = rb_entry(n, struct vfio_dma, node);
1354  		if (dma->iova >= iova + size)
1355  			break;
1356  
1357  		if (!iommu->v2 && iova > dma->iova)
1358  			break;
1359  
1360  		if (invalidate_vaddr) {
1361  			if (dma->vaddr_invalid) {
1362  				struct rb_node *last_n = n;
1363  
1364  				for (n = first_n; n != last_n; n = rb_next(n)) {
1365  					dma = rb_entry(n,
1366  						       struct vfio_dma, node);
1367  					dma->vaddr_invalid = false;
1368  					iommu->vaddr_invalid_count--;
1369  				}
1370  				ret = -EINVAL;
1371  				unmapped = 0;
1372  				break;
1373  			}
1374  			dma->vaddr_invalid = true;
1375  			iommu->vaddr_invalid_count++;
1376  			unmapped += dma->size;
1377  			n = rb_next(n);
1378  			continue;
1379  		}
1380  
1381  		if (!RB_EMPTY_ROOT(&dma->pfn_list)) {
1382  			if (dma_last == dma) {
1383  				BUG_ON(++retries > 10);
1384  			} else {
1385  				dma_last = dma;
1386  				retries = 0;
1387  			}
1388  
1389  			vfio_notify_dma_unmap(iommu, dma);
1390  			goto again;
1391  		}
1392  
1393  		if (unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
1394  			ret = update_user_bitmap(bitmap->data, iommu, dma,
1395  						 iova, pgsize);
1396  			if (ret)
1397  				break;
1398  		}
1399  
1400  		unmapped += dma->size;
1401  		n = rb_next(n);
1402  		vfio_remove_dma(iommu, dma);
1403  	}
1404  
1405  unlock:
1406  	mutex_unlock(&iommu->lock);
1407  
1408  	/* Report how much was unmapped */
1409  	unmap->size = unmapped;
1410  
1411  	return ret;
1412  }
1413  
1414  static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
1415  			  unsigned long pfn, long npage, int prot)
1416  {
1417  	struct vfio_domain *d;
1418  	int ret;
1419  
1420  	list_for_each_entry(d, &iommu->domain_list, next) {
1421  		ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT,
1422  				npage << PAGE_SHIFT, prot | IOMMU_CACHE,
1423  				GFP_KERNEL_ACCOUNT);
1424  		if (ret)
1425  			goto unwind;
1426  
1427  		cond_resched();
1428  	}
1429  
1430  	return 0;
1431  
1432  unwind:
1433  	list_for_each_entry_continue_reverse(d, &iommu->domain_list, next) {
1434  		iommu_unmap(d->domain, iova, npage << PAGE_SHIFT);
1435  		cond_resched();
1436  	}
1437  
1438  	return ret;
1439  }
1440  
1441  static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
1442  			    size_t map_size)
1443  {
1444  	dma_addr_t iova = dma->iova;
1445  	unsigned long vaddr = dma->vaddr;
1446  	struct vfio_batch batch;
1447  	size_t size = map_size;
1448  	long npage;
1449  	unsigned long pfn, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
1450  	int ret = 0;
1451  
1452  	vfio_batch_init(&batch);
1453  
1454  	while (size) {
1455  		/* Pin a contiguous chunk of memory */
1456  		npage = vfio_pin_pages_remote(dma, vaddr + dma->size,
1457  					      size >> PAGE_SHIFT, &pfn, limit,
1458  					      &batch);
1459  		if (npage <= 0) {
1460  			WARN_ON(!npage);
1461  			ret = (int)npage;
1462  			break;
1463  		}
1464  
1465  		/* Map it! */
1466  		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage,
1467  				     dma->prot);
1468  		if (ret) {
1469  			vfio_unpin_pages_remote(dma, iova + dma->size, pfn,
1470  						npage, true);
1471  			vfio_batch_unpin(&batch, dma);
1472  			break;
1473  		}
1474  
1475  		size -= npage << PAGE_SHIFT;
1476  		dma->size += npage << PAGE_SHIFT;
1477  	}
1478  
1479  	vfio_batch_fini(&batch);
1480  	dma->iommu_mapped = true;
1481  
1482  	if (ret)
1483  		vfio_remove_dma(iommu, dma);
1484  
1485  	return ret;
1486  }
1487  
1488  /*
1489   * Check that a DMA map request is within a valid iova range
1490   */
1491  static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
1492  				      dma_addr_t start, dma_addr_t end)
1493  {
1494  	struct list_head *iova = &iommu->iova_list;
1495  	struct vfio_iova *node;
1496  
1497  	list_for_each_entry(node, iova, list) {
1498  		if (start >= node->start && end <= node->end)
1499  			return true;
1500  	}
1501  
1502  	/*
1503  	 * Check for list_empty() as well since a container with
1504  	 * a single mdev device will have an empty list.
1505  	 */
1506  	return list_empty(iova);
1507  }
1508  
1509  static int vfio_change_dma_owner(struct vfio_dma *dma)
1510  {
1511  	struct task_struct *task = current->group_leader;
1512  	struct mm_struct *mm = current->mm;
1513  	long npage = dma->locked_vm;
1514  	bool lock_cap;
1515  	int ret;
1516  
1517  	if (mm == dma->mm)
1518  		return 0;
1519  
1520  	lock_cap = capable(CAP_IPC_LOCK);
1521  	ret = mm_lock_acct(task, mm, lock_cap, npage);
1522  	if (ret)
1523  		return ret;
1524  
1525  	if (mmget_not_zero(dma->mm)) {
1526  		mm_lock_acct(dma->task, dma->mm, dma->lock_cap, -npage);
1527  		mmput(dma->mm);
1528  	}
1529  
1530  	if (dma->task != task) {
1531  		put_task_struct(dma->task);
1532  		dma->task = get_task_struct(task);
1533  	}
1534  	mmdrop(dma->mm);
1535  	dma->mm = mm;
1536  	mmgrab(dma->mm);
1537  	dma->lock_cap = lock_cap;
1538  	return 0;
1539  }
1540  
1541  static int vfio_dma_do_map(struct vfio_iommu *iommu,
1542  			   struct vfio_iommu_type1_dma_map *map)
1543  {
1544  	bool set_vaddr = map->flags & VFIO_DMA_MAP_FLAG_VADDR;
1545  	dma_addr_t iova = map->iova;
1546  	unsigned long vaddr = map->vaddr;
1547  	size_t size = map->size;
1548  	int ret = 0, prot = 0;
1549  	size_t pgsize;
1550  	struct vfio_dma *dma;
1551  
1552  	/* Verify that none of our __u64 fields overflow */
1553  	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
1554  		return -EINVAL;
1555  
1556  	/* READ/WRITE from device perspective */
1557  	if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
1558  		prot |= IOMMU_WRITE;
1559  	if (map->flags & VFIO_DMA_MAP_FLAG_READ)
1560  		prot |= IOMMU_READ;
1561  
1562  	if ((prot && set_vaddr) || (!prot && !set_vaddr))
1563  		return -EINVAL;
1564  
1565  	mutex_lock(&iommu->lock);
1566  
1567  	pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap);
1568  
1569  	WARN_ON((pgsize - 1) & PAGE_MASK);
1570  
1571  	if (!size || (size | iova | vaddr) & (pgsize - 1)) {
1572  		ret = -EINVAL;
1573  		goto out_unlock;
1574  	}
1575  
1576  	/* Don't allow IOVA or virtual address wrap */
1577  	if (iova + size - 1 < iova || vaddr + size - 1 < vaddr) {
1578  		ret = -EINVAL;
1579  		goto out_unlock;
1580  	}
1581  
1582  	dma = vfio_find_dma(iommu, iova, size);
1583  	if (set_vaddr) {
1584  		if (!dma) {
1585  			ret = -ENOENT;
1586  		} else if (!dma->vaddr_invalid || dma->iova != iova ||
1587  			   dma->size != size) {
1588  			ret = -EINVAL;
1589  		} else {
1590  			ret = vfio_change_dma_owner(dma);
1591  			if (ret)
1592  				goto out_unlock;
1593  			dma->vaddr = vaddr;
1594  			dma->vaddr_invalid = false;
1595  			iommu->vaddr_invalid_count--;
1596  		}
1597  		goto out_unlock;
1598  	} else if (dma) {
1599  		ret = -EEXIST;
1600  		goto out_unlock;
1601  	}
1602  
1603  	if (!iommu->dma_avail) {
1604  		ret = -ENOSPC;
1605  		goto out_unlock;
1606  	}
1607  
1608  	if (!vfio_iommu_iova_dma_valid(iommu, iova, iova + size - 1)) {
1609  		ret = -EINVAL;
1610  		goto out_unlock;
1611  	}
1612  
1613  	dma = kzalloc(sizeof(*dma), GFP_KERNEL);
1614  	if (!dma) {
1615  		ret = -ENOMEM;
1616  		goto out_unlock;
1617  	}
1618  
1619  	iommu->dma_avail--;
1620  	dma->iova = iova;
1621  	dma->vaddr = vaddr;
1622  	dma->prot = prot;
1623  
1624  	/*
1625  	 * We need to be able to both add to a task's locked memory and test
1626  	 * against the locked memory limit and we need to be able to do both
1627  	 * outside of this call path as pinning can be asynchronous via the
1628  	 * external interfaces for mdev devices.  RLIMIT_MEMLOCK requires a
1629  	 * task_struct. Save the group_leader so that all DMA tracking uses
1630  	 * the same task, to make debugging easier.  VM locked page accounting
1631  	 * requires an mm_struct, so grab the mm in case the task dies.
1632  	 */
1633  	get_task_struct(current->group_leader);
1634  	dma->task = current->group_leader;
1635  	dma->lock_cap = capable(CAP_IPC_LOCK);
1636  	dma->mm = current->mm;
1637  	mmgrab(dma->mm);
1638  
1639  	dma->pfn_list = RB_ROOT;
1640  
1641  	/* Insert zero-sized and grow as we map chunks of it */
1642  	vfio_link_dma(iommu, dma);
1643  
1644  	/* Don't pin and map if container doesn't contain IOMMU capable domain */
1645  	if (list_empty(&iommu->domain_list))
1646  		dma->size = size;
1647  	else
1648  		ret = vfio_pin_map_dma(iommu, dma, size);
1649  
1650  	if (!ret && iommu->dirty_page_tracking) {
1651  		ret = vfio_dma_bitmap_alloc(dma, pgsize);
1652  		if (ret)
1653  			vfio_remove_dma(iommu, dma);
1654  	}
1655  
1656  out_unlock:
1657  	mutex_unlock(&iommu->lock);
1658  	return ret;
1659  }
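
/*
 * Userspace sketch (illustrative only, error handling omitted): the ioctl
 * path that reaches vfio_dma_do_map() above.  "container_fd", "buf" and the
 * addresses are hypothetical; buf, iova and size must be aligned to the
 * minimum IOMMU page size.
 *
 *	struct vfio_iommu_type1_dma_map map = {
 *		.argsz = sizeof(map),
 *		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
 *		.vaddr = (__u64)(uintptr_t)buf,
 *		.iova  = 0x100000,
 *		.size  = 0x200000,
 *	};
 *
 *	ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map);
 */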
1660  
1661  static int vfio_iommu_replay(struct vfio_iommu *iommu,
1662  			     struct vfio_domain *domain)
1663  {
1664  	struct vfio_batch batch;
1665  	struct vfio_domain *d = NULL;
1666  	struct rb_node *n;
1667  	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
1668  	int ret;
1669  
1670  	/* Arbitrarily pick the first domain in the list for lookups */
1671  	if (!list_empty(&iommu->domain_list))
1672  		d = list_first_entry(&iommu->domain_list,
1673  				     struct vfio_domain, next);
1674  
1675  	vfio_batch_init(&batch);
1676  
1677  	n = rb_first(&iommu->dma_list);
1678  
1679  	for (; n; n = rb_next(n)) {
1680  		struct vfio_dma *dma;
1681  		dma_addr_t iova;
1682  
1683  		dma = rb_entry(n, struct vfio_dma, node);
1684  		iova = dma->iova;
1685  
1686  		while (iova < dma->iova + dma->size) {
1687  			phys_addr_t phys;
1688  			size_t size;
1689  
1690  			if (dma->iommu_mapped) {
1691  				phys_addr_t p;
1692  				dma_addr_t i;
1693  
1694  				if (WARN_ON(!d)) { /* mapped w/o a domain?! */
1695  					ret = -EINVAL;
1696  					goto unwind;
1697  				}
1698  
1699  				phys = iommu_iova_to_phys(d->domain, iova);
1700  
1701  				if (WARN_ON(!phys)) {
1702  					iova += PAGE_SIZE;
1703  					continue;
1704  				}
1705  
1706  				size = PAGE_SIZE;
1707  				p = phys + size;
1708  				i = iova + size;
1709  				while (i < dma->iova + dma->size &&
1710  				       p == iommu_iova_to_phys(d->domain, i)) {
1711  					size += PAGE_SIZE;
1712  					p += PAGE_SIZE;
1713  					i += PAGE_SIZE;
1714  				}
1715  			} else {
1716  				unsigned long pfn;
1717  				unsigned long vaddr = dma->vaddr +
1718  						     (iova - dma->iova);
1719  				size_t n = dma->iova + dma->size - iova;
1720  				long npage;
1721  
1722  				npage = vfio_pin_pages_remote(dma, vaddr,
1723  							      n >> PAGE_SHIFT,
1724  							      &pfn, limit,
1725  							      &batch);
1726  				if (npage <= 0) {
1727  					WARN_ON(!npage);
1728  					ret = (int)npage;
1729  					goto unwind;
1730  				}
1731  
1732  				phys = pfn << PAGE_SHIFT;
1733  				size = npage << PAGE_SHIFT;
1734  			}
1735  
1736  			ret = iommu_map(domain->domain, iova, phys, size,
1737  					dma->prot | IOMMU_CACHE,
1738  					GFP_KERNEL_ACCOUNT);
1739  			if (ret) {
1740  				if (!dma->iommu_mapped) {
1741  					vfio_unpin_pages_remote(dma, iova,
1742  							phys >> PAGE_SHIFT,
1743  							size >> PAGE_SHIFT,
1744  							true);
1745  					vfio_batch_unpin(&batch, dma);
1746  				}
1747  				goto unwind;
1748  			}
1749  
1750  			iova += size;
1751  		}
1752  	}
1753  
1754  	/* All dmas are now mapped, defer to second tree walk for unwind */
1755  	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
1756  		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
1757  
1758  		dma->iommu_mapped = true;
1759  	}
1760  
1761  	vfio_batch_fini(&batch);
1762  	return 0;
1763  
1764  unwind:
1765  	for (; n; n = rb_prev(n)) {
1766  		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
1767  		dma_addr_t iova;
1768  
1769  		if (dma->iommu_mapped) {
1770  			iommu_unmap(domain->domain, dma->iova, dma->size);
1771  			continue;
1772  		}
1773  
1774  		iova = dma->iova;
1775  		while (iova < dma->iova + dma->size) {
1776  			phys_addr_t phys, p;
1777  			size_t size;
1778  			dma_addr_t i;
1779  
1780  			phys = iommu_iova_to_phys(domain->domain, iova);
1781  			if (!phys) {
1782  				iova += PAGE_SIZE;
1783  				continue;
1784  			}
1785  
1786  			size = PAGE_SIZE;
1787  			p = phys + size;
1788  			i = iova + size;
1789  			while (i < dma->iova + dma->size &&
1790  			       p == iommu_iova_to_phys(domain->domain, i)) {
1791  				size += PAGE_SIZE;
1792  				p += PAGE_SIZE;
1793  				i += PAGE_SIZE;
1794  			}
1795  
1796  			iommu_unmap(domain->domain, iova, size);
1797  			vfio_unpin_pages_remote(dma, iova, phys >> PAGE_SHIFT,
1798  						size >> PAGE_SHIFT, true);
1799  		}
1800  	}
1801  
1802  	vfio_batch_fini(&batch);
1803  	return ret;
1804  }
1805  
1806  /*
1807   * We change our unmap behavior slightly depending on whether the IOMMU
1808   * supports fine-grained superpages.  IOMMUs like AMD-Vi will use a superpage
1809   * for practically any contiguous power-of-two mapping we give it.  This means
1810   * we don't need to look for contiguous chunks ourselves to make unmapping
1811   * more efficient.  On IOMMUs with coarse-grained super pages, like Intel VT-d
1812   * with discrete 2M/1G/512G/1T superpages, identifying contiguous chunks
1813   * significantly boosts non-hugetlbfs mappings and doesn't seem to hurt when
1814   * hugetlbfs is in use.
1815   */
1816  static void vfio_test_domain_fgsp(struct vfio_domain *domain, struct list_head *regions)
1817  {
1818  	int ret, order = get_order(PAGE_SIZE * 2);
1819  	struct vfio_iova *region;
1820  	struct page *pages;
1821  	dma_addr_t start;
1822  
1823  	pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
1824  	if (!pages)
1825  		return;
1826  
1827  	list_for_each_entry(region, regions, list) {
1828  		start = ALIGN(region->start, PAGE_SIZE * 2);
1829  		if (start >= region->end || (region->end - start < PAGE_SIZE * 2))
1830  			continue;
1831  
1832  		ret = iommu_map(domain->domain, start, page_to_phys(pages), PAGE_SIZE * 2,
1833  				IOMMU_READ | IOMMU_WRITE | IOMMU_CACHE,
1834  				GFP_KERNEL_ACCOUNT);
1835  		if (!ret) {
1836  			size_t unmapped = iommu_unmap(domain->domain, start, PAGE_SIZE);
1837  
1838  			if (unmapped == PAGE_SIZE)
1839  				iommu_unmap(domain->domain, start + PAGE_SIZE, PAGE_SIZE);
1840  			else
1841  				domain->fgsp = true;
1842  		}
1843  		break;
1844  	}
1845  
1846  	__free_pages(pages, order);
1847  }
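
/*
 * Worked example of the probe above (illustrative only, not driver code):
 * two contiguous pages are mapped at a 2-page-aligned IOVA and only the
 * first PAGE_SIZE is then unmapped.  An IOMMU that backs arbitrary
 * power-of-two ranges with superpages (e.g. AMD-Vi) cannot split the
 * two-page superpage, so the partial unmap tears down the whole mapping
 * and iommu_unmap() reports more than the requested PAGE_SIZE; that is
 * what sets domain->fgsp.  An IOMMU limited to discrete superpage sizes
 * (e.g. VT-d) maps the range as two normal PTEs, the unmap returns
 * exactly PAGE_SIZE, and the remaining page is unmapped explicitly
 * before the test buffer is freed.
 */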
1848  
1849  static struct vfio_iommu_group *find_iommu_group(struct vfio_domain *domain,
1850  						 struct iommu_group *iommu_group)
1851  {
1852  	struct vfio_iommu_group *g;
1853  
1854  	list_for_each_entry(g, &domain->group_list, next) {
1855  		if (g->iommu_group == iommu_group)
1856  			return g;
1857  	}
1858  
1859  	return NULL;
1860  }
1861  
1862  static struct vfio_iommu_group*
1863  vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
1864  			    struct iommu_group *iommu_group)
1865  {
1866  	struct vfio_iommu_group *group;
1867  	struct vfio_domain *domain;
1868  
1869  	list_for_each_entry(domain, &iommu->domain_list, next) {
1870  		group = find_iommu_group(domain, iommu_group);
1871  		if (group)
1872  			return group;
1873  	}
1874  
1875  	list_for_each_entry(group, &iommu->emulated_iommu_groups, next)
1876  		if (group->iommu_group == iommu_group)
1877  			return group;
1878  	return NULL;
1879  }
1880  
1881  static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions,
1882  				  phys_addr_t *base)
1883  {
1884  	struct iommu_resv_region *region;
1885  	bool ret = false;
1886  
1887  	list_for_each_entry(region, group_resv_regions, list) {
1888  		/*
1889  		 * The presence of any 'real' MSI regions should take
1890  		 * precedence over the software-managed one if the
1891  		 * IOMMU driver happens to advertise both types.
1892  		 */
1893  		if (region->type == IOMMU_RESV_MSI) {
1894  			ret = false;
1895  			break;
1896  		}
1897  
1898  		if (region->type == IOMMU_RESV_SW_MSI) {
1899  			*base = region->start;
1900  			ret = true;
1901  		}
1902  	}
1903  
1904  	return ret;
1905  }
1906  
1907  /*
1908   * This is a helper function to insert an address range into the iova list.
1909   * The list is initially created with a single entry corresponding to
1910   * the IOMMU domain geometry to which the device group is attached.
1911   * The list aperture gets modified when a new domain is added to the
1912   * container if the new aperture doesn't conflict with the current one
1913   * or with any existing dma mappings. The list is also modified to
1914   * exclude any reserved regions associated with the device group.
1915   */
1916  static int vfio_iommu_iova_insert(struct list_head *head,
1917  				  dma_addr_t start, dma_addr_t end)
1918  {
1919  	struct vfio_iova *region;
1920  
1921  	region = kmalloc(sizeof(*region), GFP_KERNEL);
1922  	if (!region)
1923  		return -ENOMEM;
1924  
1925  	INIT_LIST_HEAD(&region->list);
1926  	region->start = start;
1927  	region->end = end;
1928  
1929  	list_add_tail(&region->list, head);
1930  	return 0;
1931  }
1932  
1933  /*
1934   * Check whether the new iommu aperture conflicts with the existing
1935   * aperture or with any existing dma mappings.
1936   */
1937  static bool vfio_iommu_aper_conflict(struct vfio_iommu *iommu,
1938  				     dma_addr_t start, dma_addr_t end)
1939  {
1940  	struct vfio_iova *first, *last;
1941  	struct list_head *iova = &iommu->iova_list;
1942  
1943  	if (list_empty(iova))
1944  		return false;
1945  
1946  	/* Disjoint sets, return conflict */
1947  	first = list_first_entry(iova, struct vfio_iova, list);
1948  	last = list_last_entry(iova, struct vfio_iova, list);
1949  	if (start > last->end || end < first->start)
1950  		return true;
1951  
1952  	/* Check for any existing dma mappings below the new start */
1953  	if (start > first->start) {
1954  		if (vfio_find_dma(iommu, first->start, start - first->start))
1955  			return true;
1956  	}
1957  
1958  	/* Check for any existing dma mappings beyond the new end */
1959  	if (end < last->end) {
1960  		if (vfio_find_dma(iommu, end + 1, last->end - end))
1961  			return true;
1962  	}
1963  
1964  	return false;
1965  }
1966  
1967  /*
1968   * Resize iommu iova aperture window. This is called only if the new
1969   * aperture has no conflict with existing aperture and dma mappings.
1970   */
1971  static int vfio_iommu_aper_resize(struct list_head *iova,
1972  				  dma_addr_t start, dma_addr_t end)
1973  {
1974  	struct vfio_iova *node, *next;
1975  
1976  	if (list_empty(iova))
1977  		return vfio_iommu_iova_insert(iova, start, end);
1978  
1979  	/* Adjust iova list start */
1980  	list_for_each_entry_safe(node, next, iova, list) {
1981  		if (start < node->start)
1982  			break;
1983  		if (start >= node->start && start < node->end) {
1984  			node->start = start;
1985  			break;
1986  		}
1987  		/* Delete nodes before new start */
1988  		list_del(&node->list);
1989  		kfree(node);
1990  	}
1991  
1992  	/* Adjust iova list end */
1993  	list_for_each_entry_safe(node, next, iova, list) {
1994  		if (end > node->end)
1995  			continue;
1996  		if (end > node->start && end <= node->end) {
1997  			node->end = end;
1998  			continue;
1999  		}
2000  		/* Delete nodes after new end */
2001  		list_del(&node->list);
2002  		kfree(node);
2003  	}
2004  
2005  	return 0;
2006  }
2007  
2008  /*
2009   * Check reserved region conflicts with existing dma mappings
2010   */
2011  static bool vfio_iommu_resv_conflict(struct vfio_iommu *iommu,
2012  				     struct list_head *resv_regions)
2013  {
2014  	struct iommu_resv_region *region;
2015  
2016  	/* Check for conflict with existing dma mappings */
2017  	list_for_each_entry(region, resv_regions, list) {
2018  		if (region->type == IOMMU_RESV_DIRECT_RELAXABLE)
2019  			continue;
2020  
2021  		if (vfio_find_dma(iommu, region->start, region->length))
2022  			return true;
2023  	}
2024  
2025  	return false;
2026  }
2027  
2028  /*
2029   * Check iova regions for overlap with reserved regions and
2030   * exclude the reserved regions from the iommu iova range
2031   */
2032  static int vfio_iommu_resv_exclude(struct list_head *iova,
2033  				   struct list_head *resv_regions)
2034  {
2035  	struct iommu_resv_region *resv;
2036  	struct vfio_iova *n, *next;
2037  
2038  	list_for_each_entry(resv, resv_regions, list) {
2039  		phys_addr_t start, end;
2040  
2041  		if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
2042  			continue;
2043  
2044  		start = resv->start;
2045  		end = resv->start + resv->length - 1;
2046  
2047  		list_for_each_entry_safe(n, next, iova, list) {
2048  			int ret = 0;
2049  
2050  			/* No overlap */
2051  			if (start > n->end || end < n->start)
2052  				continue;
2053  			/*
2054  			 * If the current node overlaps the reserved region, insert
2055  			 * new node(s) covering the non-overlapping portion(s) so the
2056  			 * reserved range is excluded from the valid iova range.  The
2057  			 * new nodes are inserted before the current node, which is
2058  			 * then deleted, keeping the list updated and sorted.
2059  			 */
2060  			if (start > n->start)
2061  				ret = vfio_iommu_iova_insert(&n->list, n->start,
2062  							     start - 1);
2063  			if (!ret && end < n->end)
2064  				ret = vfio_iommu_iova_insert(&n->list, end + 1,
2065  							     n->end);
2066  			if (ret)
2067  				return ret;
2068  
2069  			list_del(&n->list);
2070  			kfree(n);
2071  		}
2072  	}
2073  
2074  	if (list_empty(iova))
2075  		return -EINVAL;
2076  
2077  	return 0;
2078  }
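
/*
 * Worked example (illustrative values): after vfio_iommu_aper_resize()
 * the copy list holds a single node covering the domain aperture, say
 * [0x0 - 0xffffffff].  If the group advertises a reserved MSI window of
 * [0xfee00000 - 0xfeefffff], the loop above inserts [0x0 - 0xfedfffff]
 * and [0xfef00000 - 0xffffffff] before the original node and then
 * deletes it, leaving two sorted nodes with the reserved range excluded.
 * If the exclusions consume every node, the list ends up empty and
 * -EINVAL is returned, since a group with no usable IOVA space cannot
 * be attached.
 */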
2079  
2080  static void vfio_iommu_resv_free(struct list_head *resv_regions)
2081  {
2082  	struct iommu_resv_region *n, *next;
2083  
2084  	list_for_each_entry_safe(n, next, resv_regions, list) {
2085  		list_del(&n->list);
2086  		kfree(n);
2087  	}
2088  }
2089  
2090  static void vfio_iommu_iova_free(struct list_head *iova)
2091  {
2092  	struct vfio_iova *n, *next;
2093  
2094  	list_for_each_entry_safe(n, next, iova, list) {
2095  		list_del(&n->list);
2096  		kfree(n);
2097  	}
2098  }
2099  
2100  static int vfio_iommu_iova_get_copy(struct vfio_iommu *iommu,
2101  				    struct list_head *iova_copy)
2102  {
2103  	struct list_head *iova = &iommu->iova_list;
2104  	struct vfio_iova *n;
2105  	int ret;
2106  
2107  	list_for_each_entry(n, iova, list) {
2108  		ret = vfio_iommu_iova_insert(iova_copy, n->start, n->end);
2109  		if (ret)
2110  			goto out_free;
2111  	}
2112  
2113  	return 0;
2114  
2115  out_free:
2116  	vfio_iommu_iova_free(iova_copy);
2117  	return ret;
2118  }
2119  
2120  static void vfio_iommu_iova_insert_copy(struct vfio_iommu *iommu,
2121  					struct list_head *iova_copy)
2122  {
2123  	struct list_head *iova = &iommu->iova_list;
2124  
2125  	vfio_iommu_iova_free(iova);
2126  
2127  	list_splice_tail(iova_copy, iova);
2128  }
2129  
2130  static int vfio_iommu_domain_alloc(struct device *dev, void *data)
2131  {
2132  	struct iommu_domain **domain = data;
2133  
2134  	*domain = iommu_paging_domain_alloc(dev);
2135  	return 1; /* Don't iterate */
2136  }
2137  
2138  static int vfio_iommu_type1_attach_group(void *iommu_data,
2139  		struct iommu_group *iommu_group, enum vfio_group_type type)
2140  {
2141  	struct vfio_iommu *iommu = iommu_data;
2142  	struct vfio_iommu_group *group;
2143  	struct vfio_domain *domain, *d;
2144  	bool resv_msi;
2145  	phys_addr_t resv_msi_base = 0;
2146  	struct iommu_domain_geometry *geo;
2147  	LIST_HEAD(iova_copy);
2148  	LIST_HEAD(group_resv_regions);
2149  	int ret = -EBUSY;
2150  
2151  	mutex_lock(&iommu->lock);
2152  
2153  	/* Attach could require pinning, so disallow while vaddr is invalid. */
2154  	if (iommu->vaddr_invalid_count)
2155  		goto out_unlock;
2156  
2157  	/* Check for duplicates */
2158  	ret = -EINVAL;
2159  	if (vfio_iommu_find_iommu_group(iommu, iommu_group))
2160  		goto out_unlock;
2161  
2162  	ret = -ENOMEM;
2163  	group = kzalloc(sizeof(*group), GFP_KERNEL);
2164  	if (!group)
2165  		goto out_unlock;
2166  	group->iommu_group = iommu_group;
2167  
2168  	if (type == VFIO_EMULATED_IOMMU) {
2169  		list_add(&group->next, &iommu->emulated_iommu_groups);
2170  		/*
2171  		 * An emulated IOMMU group cannot dirty memory directly; it can
2172  		 * only use interfaces that provide dirty tracking.
2173  		 * The iommu scope can only be promoted with the addition of a
2174  		 * dirty tracking group.
2175  		 */
2176  		group->pinned_page_dirty_scope = true;
2177  		ret = 0;
2178  		goto out_unlock;
2179  	}
2180  
2181  	ret = -ENOMEM;
2182  	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
2183  	if (!domain)
2184  		goto out_free_group;
2185  
2186  	/*
2187  	 * Going via the iommu_group iterator avoids races, and trivially gives
2188  	 * us a representative device for the IOMMU API call. We don't actually
2189  	 * want to iterate beyond the first device (if any).
2190  	 */
2191  	iommu_group_for_each_dev(iommu_group, &domain->domain,
2192  				 vfio_iommu_domain_alloc);
2193  	if (IS_ERR(domain->domain)) {
2194  		ret = PTR_ERR(domain->domain);
2195  		goto out_free_domain;
2196  	}
2197  
2198  	if (iommu->nesting) {
2199  		ret = iommu_enable_nesting(domain->domain);
2200  		if (ret)
2201  			goto out_domain;
2202  	}
2203  
2204  	ret = iommu_attach_group(domain->domain, group->iommu_group);
2205  	if (ret)
2206  		goto out_domain;
2207  
2208  	/* Get aperture info */
2209  	geo = &domain->domain->geometry;
2210  	if (vfio_iommu_aper_conflict(iommu, geo->aperture_start,
2211  				     geo->aperture_end)) {
2212  		ret = -EINVAL;
2213  		goto out_detach;
2214  	}
2215  
2216  	ret = iommu_get_group_resv_regions(iommu_group, &group_resv_regions);
2217  	if (ret)
2218  		goto out_detach;
2219  
2220  	if (vfio_iommu_resv_conflict(iommu, &group_resv_regions)) {
2221  		ret = -EINVAL;
2222  		goto out_detach;
2223  	}
2224  
2225  	/*
2226  	 * We don't want to work on the original iova list as the list
2227  	 * gets modified and in case of failure we have to retain the
2228  	 * original list. Get a copy here.
2229  	 */
2230  	ret = vfio_iommu_iova_get_copy(iommu, &iova_copy);
2231  	if (ret)
2232  		goto out_detach;
2233  
2234  	ret = vfio_iommu_aper_resize(&iova_copy, geo->aperture_start,
2235  				     geo->aperture_end);
2236  	if (ret)
2237  		goto out_detach;
2238  
2239  	ret = vfio_iommu_resv_exclude(&iova_copy, &group_resv_regions);
2240  	if (ret)
2241  		goto out_detach;
2242  
2243  	resv_msi = vfio_iommu_has_sw_msi(&group_resv_regions, &resv_msi_base);
2244  
2245  	INIT_LIST_HEAD(&domain->group_list);
2246  	list_add(&group->next, &domain->group_list);
2247  
2248  	if (!allow_unsafe_interrupts &&
2249  	    !iommu_group_has_isolated_msi(iommu_group)) {
2250  		pr_warn("%s: No interrupt remapping support.  Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n",
2251  		       __func__);
2252  		ret = -EPERM;
2253  		goto out_detach;
2254  	}
2255  
2256  	/*
2257  	 * If the IOMMU can block non-coherent operations (ie PCIe TLPs with
2258  	 * no-snoop set) then VFIO always turns this feature on because on Intel
2259  	 * platforms it optimizes KVM to disable wbinvd emulation.
2260  	 */
2261  	if (domain->domain->ops->enforce_cache_coherency)
2262  		domain->enforce_cache_coherency =
2263  			domain->domain->ops->enforce_cache_coherency(
2264  				domain->domain);
2265  
2266  	/*
2267  	 * Try to match an existing compatible domain.  We don't want to
2268  	 * preclude an IOMMU driver supporting multiple bus_types and being
2269  	 * able to include different bus_types in the same IOMMU domain, so
2270  	 * we test whether the domains use the same iommu_ops rather than
2271  	 * testing if they're on the same bus_type.
2272  	 */
2273  	list_for_each_entry(d, &iommu->domain_list, next) {
2274  		if (d->domain->ops == domain->domain->ops &&
2275  		    d->enforce_cache_coherency ==
2276  			    domain->enforce_cache_coherency) {
2277  			iommu_detach_group(domain->domain, group->iommu_group);
2278  			if (!iommu_attach_group(d->domain,
2279  						group->iommu_group)) {
2280  				list_add(&group->next, &d->group_list);
2281  				iommu_domain_free(domain->domain);
2282  				kfree(domain);
2283  				goto done;
2284  			}
2285  
2286  			ret = iommu_attach_group(domain->domain,
2287  						 group->iommu_group);
2288  			if (ret)
2289  				goto out_domain;
2290  		}
2291  	}
2292  
2293  	vfio_test_domain_fgsp(domain, &iova_copy);
2294  
2295  	/* replay mappings on new domains */
2296  	ret = vfio_iommu_replay(iommu, domain);
2297  	if (ret)
2298  		goto out_detach;
2299  
2300  	if (resv_msi) {
2301  		ret = iommu_get_msi_cookie(domain->domain, resv_msi_base);
2302  		if (ret && ret != -ENODEV)
2303  			goto out_detach;
2304  	}
2305  
2306  	list_add(&domain->next, &iommu->domain_list);
2307  	vfio_update_pgsize_bitmap(iommu);
2308  done:
2309  	/* Delete the old iova list and insert the new one */
2310  	vfio_iommu_iova_insert_copy(iommu, &iova_copy);
2311  
2312  	/*
2313  	 * An iommu backed group can dirty memory directly and therefore
2314  	 * demotes the iommu scope until it declares itself dirty tracking
2315  	 * capable via the page pinning interface.
2316  	 */
2317  	iommu->num_non_pinned_groups++;
2318  	mutex_unlock(&iommu->lock);
2319  	vfio_iommu_resv_free(&group_resv_regions);
2320  
2321  	return 0;
2322  
2323  out_detach:
2324  	iommu_detach_group(domain->domain, group->iommu_group);
2325  out_domain:
2326  	iommu_domain_free(domain->domain);
2327  	vfio_iommu_iova_free(&iova_copy);
2328  	vfio_iommu_resv_free(&group_resv_regions);
2329  out_free_domain:
2330  	kfree(domain);
2331  out_free_group:
2332  	kfree(group);
2333  out_unlock:
2334  	mutex_unlock(&iommu->lock);
2335  	return ret;
2336  }
2337  
2338  static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
2339  {
2340  	struct rb_node *node;
2341  
2342  	while ((node = rb_first(&iommu->dma_list)))
2343  		vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
2344  }
2345  
2346  static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
2347  {
2348  	struct rb_node *n, *p;
2349  
2350  	n = rb_first(&iommu->dma_list);
2351  	for (; n; n = rb_next(n)) {
2352  		struct vfio_dma *dma;
2353  		long locked = 0, unlocked = 0;
2354  
2355  		dma = rb_entry(n, struct vfio_dma, node);
2356  		unlocked += vfio_unmap_unpin(iommu, dma, false);
2357  		p = rb_first(&dma->pfn_list);
2358  		for (; p; p = rb_next(p)) {
2359  			struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
2360  							 node);
2361  
2362  			if (!is_invalid_reserved_pfn(vpfn->pfn))
2363  				locked++;
2364  		}
2365  		vfio_lock_acct(dma, locked - unlocked, true);
2366  	}
2367  }
2368  
2369  /*
2370   * Called when a domain is removed during detach. The removed domain
2371   * may have determined the iova aperture window. Recompute the iova
2372   * aperture using the smallest window among the remaining domains.
2373   */
2374  static void vfio_iommu_aper_expand(struct vfio_iommu *iommu,
2375  				   struct list_head *iova_copy)
2376  {
2377  	struct vfio_domain *domain;
2378  	struct vfio_iova *node;
2379  	dma_addr_t start = 0;
2380  	dma_addr_t end = (dma_addr_t)~0;
2381  
2382  	if (list_empty(iova_copy))
2383  		return;
2384  
2385  	list_for_each_entry(domain, &iommu->domain_list, next) {
2386  		struct iommu_domain_geometry *geo = &domain->domain->geometry;
2387  
2388  		if (geo->aperture_start > start)
2389  			start = geo->aperture_start;
2390  		if (geo->aperture_end < end)
2391  			end = geo->aperture_end;
2392  	}
2393  
2394  	/* Modify the aperture limits. The new aperture is either the same or bigger */
2395  	node = list_first_entry(iova_copy, struct vfio_iova, list);
2396  	node->start = start;
2397  	node = list_last_entry(iova_copy, struct vfio_iova, list);
2398  	node->end = end;
2399  }
2400  
2401  /*
2402   * Called when a group is detached. The reserved regions for that
2403   * group can be part of valid iova now. But since reserved regions
2404   * may be duplicated among groups, populate the iova valid regions
2405   * list again.
2406   */
2407  static int vfio_iommu_resv_refresh(struct vfio_iommu *iommu,
2408  				   struct list_head *iova_copy)
2409  {
2410  	struct vfio_domain *d;
2411  	struct vfio_iommu_group *g;
2412  	struct vfio_iova *node;
2413  	dma_addr_t start, end;
2414  	LIST_HEAD(resv_regions);
2415  	int ret;
2416  
2417  	if (list_empty(iova_copy))
2418  		return -EINVAL;
2419  
2420  	list_for_each_entry(d, &iommu->domain_list, next) {
2421  		list_for_each_entry(g, &d->group_list, next) {
2422  			ret = iommu_get_group_resv_regions(g->iommu_group,
2423  							   &resv_regions);
2424  			if (ret)
2425  				goto done;
2426  		}
2427  	}
2428  
2429  	node = list_first_entry(iova_copy, struct vfio_iova, list);
2430  	start = node->start;
2431  	node = list_last_entry(iova_copy, struct vfio_iova, list);
2432  	end = node->end;
2433  
2434  	/* Purge the iova list and create a new one */
2435  	vfio_iommu_iova_free(iova_copy);
2436  
2437  	ret = vfio_iommu_aper_resize(iova_copy, start, end);
2438  	if (ret)
2439  		goto done;
2440  
2441  	/* Exclude current reserved regions from iova ranges */
2442  	ret = vfio_iommu_resv_exclude(iova_copy, &resv_regions);
2443  done:
2444  	vfio_iommu_resv_free(&resv_regions);
2445  	return ret;
2446  }
2447  
2448  static void vfio_iommu_type1_detach_group(void *iommu_data,
2449  					  struct iommu_group *iommu_group)
2450  {
2451  	struct vfio_iommu *iommu = iommu_data;
2452  	struct vfio_domain *domain;
2453  	struct vfio_iommu_group *group;
2454  	bool update_dirty_scope = false;
2455  	LIST_HEAD(iova_copy);
2456  
2457  	mutex_lock(&iommu->lock);
2458  	list_for_each_entry(group, &iommu->emulated_iommu_groups, next) {
2459  		if (group->iommu_group != iommu_group)
2460  			continue;
2461  		update_dirty_scope = !group->pinned_page_dirty_scope;
2462  		list_del(&group->next);
2463  		kfree(group);
2464  
2465  		if (list_empty(&iommu->emulated_iommu_groups) &&
2466  		    list_empty(&iommu->domain_list)) {
2467  			WARN_ON(!list_empty(&iommu->device_list));
2468  			vfio_iommu_unmap_unpin_all(iommu);
2469  		}
2470  		goto detach_group_done;
2471  	}
2472  
2473  	/*
2474  	 * Get a copy of iova list. This will be used to update
2475  	 * and to replace the current one later. Please note that
2476  	 * we will leave the original list as it is if update fails.
2477  	 */
2478  	vfio_iommu_iova_get_copy(iommu, &iova_copy);
2479  
2480  	list_for_each_entry(domain, &iommu->domain_list, next) {
2481  		group = find_iommu_group(domain, iommu_group);
2482  		if (!group)
2483  			continue;
2484  
2485  		iommu_detach_group(domain->domain, group->iommu_group);
2486  		update_dirty_scope = !group->pinned_page_dirty_scope;
2487  		list_del(&group->next);
2488  		kfree(group);
2489  		/*
2490  		 * Group ownership provides privilege; if the group list is
2491  		 * empty, the domain goes away. If it's the last iommu-backed
2492  		 * domain and no emulated groups exist, then all the mappings
2493  		 * go away too. If it's the last iommu-backed domain and
2494  		 * emulated groups do exist, update accounting.
2495  		 */
2496  		if (list_empty(&domain->group_list)) {
2497  			if (list_is_singular(&iommu->domain_list)) {
2498  				if (list_empty(&iommu->emulated_iommu_groups)) {
2499  					WARN_ON(!list_empty(
2500  						&iommu->device_list));
2501  					vfio_iommu_unmap_unpin_all(iommu);
2502  				} else {
2503  					vfio_iommu_unmap_unpin_reaccount(iommu);
2504  				}
2505  			}
2506  			iommu_domain_free(domain->domain);
2507  			list_del(&domain->next);
2508  			kfree(domain);
2509  			vfio_iommu_aper_expand(iommu, &iova_copy);
2510  			vfio_update_pgsize_bitmap(iommu);
2511  		}
2512  		break;
2513  	}
2514  
2515  	if (!vfio_iommu_resv_refresh(iommu, &iova_copy))
2516  		vfio_iommu_iova_insert_copy(iommu, &iova_copy);
2517  	else
2518  		vfio_iommu_iova_free(&iova_copy);
2519  
2520  detach_group_done:
2521  	/*
2522  	 * Removal of a group without dirty tracking may allow the iommu scope
2523  	 * to be promoted.
2524  	 */
2525  	if (update_dirty_scope) {
2526  		iommu->num_non_pinned_groups--;
2527  		if (iommu->dirty_page_tracking)
2528  			vfio_iommu_populate_bitmap_full(iommu);
2529  	}
2530  	mutex_unlock(&iommu->lock);
2531  }
2532  
2533  static void *vfio_iommu_type1_open(unsigned long arg)
2534  {
2535  	struct vfio_iommu *iommu;
2536  
2537  	iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
2538  	if (!iommu)
2539  		return ERR_PTR(-ENOMEM);
2540  
2541  	switch (arg) {
2542  	case VFIO_TYPE1_IOMMU:
2543  		break;
2544  	case VFIO_TYPE1_NESTING_IOMMU:
2545  		iommu->nesting = true;
2546  		fallthrough;
2547  	case VFIO_TYPE1v2_IOMMU:
2548  		iommu->v2 = true;
2549  		break;
2550  	default:
2551  		kfree(iommu);
2552  		return ERR_PTR(-EINVAL);
2553  	}
2554  
2555  	INIT_LIST_HEAD(&iommu->domain_list);
2556  	INIT_LIST_HEAD(&iommu->iova_list);
2557  	iommu->dma_list = RB_ROOT;
2558  	iommu->dma_avail = dma_entry_limit;
2559  	mutex_init(&iommu->lock);
2560  	mutex_init(&iommu->device_list_lock);
2561  	INIT_LIST_HEAD(&iommu->device_list);
2562  	iommu->pgsize_bitmap = PAGE_MASK;
2563  	INIT_LIST_HEAD(&iommu->emulated_iommu_groups);
2564  
2565  	return iommu;
2566  }
2567  
2568  static void vfio_release_domain(struct vfio_domain *domain)
2569  {
2570  	struct vfio_iommu_group *group, *group_tmp;
2571  
2572  	list_for_each_entry_safe(group, group_tmp,
2573  				 &domain->group_list, next) {
2574  		iommu_detach_group(domain->domain, group->iommu_group);
2575  		list_del(&group->next);
2576  		kfree(group);
2577  	}
2578  
2579  	iommu_domain_free(domain->domain);
2580  }
2581  
2582  static void vfio_iommu_type1_release(void *iommu_data)
2583  {
2584  	struct vfio_iommu *iommu = iommu_data;
2585  	struct vfio_domain *domain, *domain_tmp;
2586  	struct vfio_iommu_group *group, *next_group;
2587  
2588  	list_for_each_entry_safe(group, next_group,
2589  			&iommu->emulated_iommu_groups, next) {
2590  		list_del(&group->next);
2591  		kfree(group);
2592  	}
2593  
2594  	vfio_iommu_unmap_unpin_all(iommu);
2595  
2596  	list_for_each_entry_safe(domain, domain_tmp,
2597  				 &iommu->domain_list, next) {
2598  		vfio_release_domain(domain);
2599  		list_del(&domain->next);
2600  		kfree(domain);
2601  	}
2602  
2603  	vfio_iommu_iova_free(&iommu->iova_list);
2604  
2605  	kfree(iommu);
2606  }
2607  
2608  static int vfio_domains_have_enforce_cache_coherency(struct vfio_iommu *iommu)
2609  {
2610  	struct vfio_domain *domain;
2611  	int ret = 1;
2612  
2613  	mutex_lock(&iommu->lock);
2614  	list_for_each_entry(domain, &iommu->domain_list, next) {
2615  		if (!(domain->enforce_cache_coherency)) {
2616  			ret = 0;
2617  			break;
2618  		}
2619  	}
2620  	mutex_unlock(&iommu->lock);
2621  
2622  	return ret;
2623  }
2624  
2625  static bool vfio_iommu_has_emulated(struct vfio_iommu *iommu)
2626  {
2627  	bool ret;
2628  
2629  	mutex_lock(&iommu->lock);
2630  	ret = !list_empty(&iommu->emulated_iommu_groups);
2631  	mutex_unlock(&iommu->lock);
2632  	return ret;
2633  }
2634  
2635  static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu,
2636  					    unsigned long arg)
2637  {
2638  	switch (arg) {
2639  	case VFIO_TYPE1_IOMMU:
2640  	case VFIO_TYPE1v2_IOMMU:
2641  	case VFIO_TYPE1_NESTING_IOMMU:
2642  	case VFIO_UNMAP_ALL:
2643  		return 1;
2644  	case VFIO_UPDATE_VADDR:
2645  		/*
2646  		 * Disable this feature if mdevs are present.  They cannot
2647  		 * safely pin/unpin/rw while vaddrs are being updated.
2648  		 */
2649  		return iommu && !vfio_iommu_has_emulated(iommu);
2650  	case VFIO_DMA_CC_IOMMU:
2651  		if (!iommu)
2652  			return 0;
2653  		return vfio_domains_have_enforce_cache_coherency(iommu);
2654  	default:
2655  		return 0;
2656  	}
2657  }
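
/*
 * Illustrative userspace sketch (not part of the driver; fds, paths and
 * error handling are assumptions of the example) showing how these
 * extensions are typically probed and the Type1 backend selected:
 *
 *	#include <fcntl.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/vfio.h>
 *
 *	int container = open("/dev/vfio/vfio", O_RDWR);
 *
 *	if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION)
 *		return -1;
 *	if (!ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU))
 *		return -1;
 *
 *	(after binding a group fd with VFIO_GROUP_SET_CONTAINER)
 *
 *	if (ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1v2_IOMMU))
 *		return -1;
 */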
2658  
2659  static int vfio_iommu_iova_add_cap(struct vfio_info_cap *caps,
2660  		 struct vfio_iommu_type1_info_cap_iova_range *cap_iovas,
2661  		 size_t size)
2662  {
2663  	struct vfio_info_cap_header *header;
2664  	struct vfio_iommu_type1_info_cap_iova_range *iova_cap;
2665  
2666  	header = vfio_info_cap_add(caps, size,
2667  				   VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE, 1);
2668  	if (IS_ERR(header))
2669  		return PTR_ERR(header);
2670  
2671  	iova_cap = container_of(header,
2672  				struct vfio_iommu_type1_info_cap_iova_range,
2673  				header);
2674  	iova_cap->nr_iovas = cap_iovas->nr_iovas;
2675  	memcpy(iova_cap->iova_ranges, cap_iovas->iova_ranges,
2676  	       cap_iovas->nr_iovas * sizeof(*cap_iovas->iova_ranges));
2677  	return 0;
2678  }
2679  
2680  static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu,
2681  				      struct vfio_info_cap *caps)
2682  {
2683  	struct vfio_iommu_type1_info_cap_iova_range *cap_iovas;
2684  	struct vfio_iova *iova;
2685  	size_t size;
2686  	int iovas = 0, i = 0, ret;
2687  
2688  	list_for_each_entry(iova, &iommu->iova_list, list)
2689  		iovas++;
2690  
2691  	if (!iovas) {
2692  		/*
2693  		 * Return 0 as a container with a single mdev device
2694  		 * will have an empty list
2695  		 */
2696  		return 0;
2697  	}
2698  
2699  	size = struct_size(cap_iovas, iova_ranges, iovas);
2700  
2701  	cap_iovas = kzalloc(size, GFP_KERNEL);
2702  	if (!cap_iovas)
2703  		return -ENOMEM;
2704  
2705  	cap_iovas->nr_iovas = iovas;
2706  
2707  	list_for_each_entry(iova, &iommu->iova_list, list) {
2708  		cap_iovas->iova_ranges[i].start = iova->start;
2709  		cap_iovas->iova_ranges[i].end = iova->end;
2710  		i++;
2711  	}
2712  
2713  	ret = vfio_iommu_iova_add_cap(caps, cap_iovas, size);
2714  
2715  	kfree(cap_iovas);
2716  	return ret;
2717  }
2718  
2719  static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu,
2720  					   struct vfio_info_cap *caps)
2721  {
2722  	struct vfio_iommu_type1_info_cap_migration cap_mig = {};
2723  
2724  	cap_mig.header.id = VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION;
2725  	cap_mig.header.version = 1;
2726  
2727  	cap_mig.flags = 0;
2728  	/* support minimum pgsize */
2729  	cap_mig.pgsize_bitmap = (size_t)1 << __ffs(iommu->pgsize_bitmap);
2730  	cap_mig.max_dirty_bitmap_size = DIRTY_BITMAP_SIZE_MAX;
2731  
2732  	return vfio_info_add_capability(caps, &cap_mig.header, sizeof(cap_mig));
2733  }
2734  
2735  static int vfio_iommu_dma_avail_build_caps(struct vfio_iommu *iommu,
2736  					   struct vfio_info_cap *caps)
2737  {
2738  	struct vfio_iommu_type1_info_dma_avail cap_dma_avail;
2739  
2740  	cap_dma_avail.header.id = VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL;
2741  	cap_dma_avail.header.version = 1;
2742  
2743  	cap_dma_avail.avail = iommu->dma_avail;
2744  
2745  	return vfio_info_add_capability(caps, &cap_dma_avail.header,
2746  					sizeof(cap_dma_avail));
2747  }
2748  
2749  static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu,
2750  				     unsigned long arg)
2751  {
2752  	struct vfio_iommu_type1_info info = {};
2753  	unsigned long minsz;
2754  	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
2755  	int ret;
2756  
2757  	minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
2758  
2759  	if (copy_from_user(&info, (void __user *)arg, minsz))
2760  		return -EFAULT;
2761  
2762  	if (info.argsz < minsz)
2763  		return -EINVAL;
2764  
2765  	minsz = min_t(size_t, info.argsz, sizeof(info));
2766  
2767  	mutex_lock(&iommu->lock);
2768  	info.flags = VFIO_IOMMU_INFO_PGSIZES;
2769  
2770  	info.iova_pgsizes = iommu->pgsize_bitmap;
2771  
2772  	ret = vfio_iommu_migration_build_caps(iommu, &caps);
2773  
2774  	if (!ret)
2775  		ret = vfio_iommu_dma_avail_build_caps(iommu, &caps);
2776  
2777  	if (!ret)
2778  		ret = vfio_iommu_iova_build_caps(iommu, &caps);
2779  
2780  	mutex_unlock(&iommu->lock);
2781  
2782  	if (ret)
2783  		return ret;
2784  
2785  	if (caps.size) {
2786  		info.flags |= VFIO_IOMMU_INFO_CAPS;
2787  
2788  		if (info.argsz < sizeof(info) + caps.size) {
2789  			info.argsz = sizeof(info) + caps.size;
2790  		} else {
2791  			vfio_info_cap_shift(&caps, sizeof(info));
2792  			if (copy_to_user((void __user *)arg +
2793  					sizeof(info), caps.buf,
2794  					caps.size)) {
2795  				kfree(caps.buf);
2796  				return -EFAULT;
2797  			}
2798  			info.cap_offset = sizeof(info);
2799  		}
2800  
2801  		kfree(caps.buf);
2802  	}
2803  
2804  	return copy_to_user((void __user *)arg, &info, minsz) ?
2805  			-EFAULT : 0;
2806  }
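
/*
 * Illustrative userspace sketch (an assumption of the example, not driver
 * code): querying the page-size bitmap reported above.  Capability chain
 * parsing (IOVA ranges, migration, DMA availability) is omitted; a caller
 * would follow info.cap_offset when VFIO_IOMMU_INFO_CAPS is set.
 *
 *	struct vfio_iommu_type1_info info = { .argsz = sizeof(info) };
 *
 *	if (ioctl(container, VFIO_IOMMU_GET_INFO, &info))
 *		return -1;
 *	if (info.flags & VFIO_IOMMU_INFO_PGSIZES)
 *		printf("IOVA page sizes: 0x%llx\n",
 *		       (unsigned long long)info.iova_pgsizes);
 */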
2807  
2808  static int vfio_iommu_type1_map_dma(struct vfio_iommu *iommu,
2809  				    unsigned long arg)
2810  {
2811  	struct vfio_iommu_type1_dma_map map;
2812  	unsigned long minsz;
2813  	uint32_t mask = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE |
2814  			VFIO_DMA_MAP_FLAG_VADDR;
2815  
2816  	minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
2817  
2818  	if (copy_from_user(&map, (void __user *)arg, minsz))
2819  		return -EFAULT;
2820  
2821  	if (map.argsz < minsz || map.flags & ~mask)
2822  		return -EINVAL;
2823  
2824  	return vfio_dma_do_map(iommu, &map);
2825  }
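
/*
 * Illustrative userspace sketch of the ioctl handled above (buffer,
 * length and IOVA are assumptions of the example): map an anonymous
 * buffer for device DMA.  The vaddr must remain valid for the life of
 * the mapping and the IOVA must lie within an allowed IOVA range.
 *
 *	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	struct vfio_iommu_type1_dma_map map = {
 *		.argsz = sizeof(map),
 *		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
 *		.vaddr = (__u64)(uintptr_t)buf,
 *		.iova  = 0x100000,
 *		.size  = len,
 *	};
 *
 *	if (ioctl(container, VFIO_IOMMU_MAP_DMA, &map))
 *		return -1;
 */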
2826  
2827  static int vfio_iommu_type1_unmap_dma(struct vfio_iommu *iommu,
2828  				      unsigned long arg)
2829  {
2830  	struct vfio_iommu_type1_dma_unmap unmap;
2831  	struct vfio_bitmap bitmap = { 0 };
2832  	uint32_t mask = VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP |
2833  			VFIO_DMA_UNMAP_FLAG_VADDR |
2834  			VFIO_DMA_UNMAP_FLAG_ALL;
2835  	unsigned long minsz;
2836  	int ret;
2837  
2838  	minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
2839  
2840  	if (copy_from_user(&unmap, (void __user *)arg, minsz))
2841  		return -EFAULT;
2842  
2843  	if (unmap.argsz < minsz || unmap.flags & ~mask)
2844  		return -EINVAL;
2845  
2846  	if ((unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
2847  	    (unmap.flags & (VFIO_DMA_UNMAP_FLAG_ALL |
2848  			    VFIO_DMA_UNMAP_FLAG_VADDR)))
2849  		return -EINVAL;
2850  
2851  	if (unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
2852  		unsigned long pgshift;
2853  
2854  		if (unmap.argsz < (minsz + sizeof(bitmap)))
2855  			return -EINVAL;
2856  
2857  		if (copy_from_user(&bitmap,
2858  				   (void __user *)(arg + minsz),
2859  				   sizeof(bitmap)))
2860  			return -EFAULT;
2861  
2862  		if (!access_ok((void __user *)bitmap.data, bitmap.size))
2863  			return -EINVAL;
2864  
2865  		pgshift = __ffs(bitmap.pgsize);
2866  		ret = verify_bitmap_size(unmap.size >> pgshift,
2867  					 bitmap.size);
2868  		if (ret)
2869  			return ret;
2870  	}
2871  
2872  	ret = vfio_dma_do_unmap(iommu, &unmap, &bitmap);
2873  	if (ret)
2874  		return ret;
2875  
2876  	return copy_to_user((void __user *)arg, &unmap, minsz) ?
2877  			-EFAULT : 0;
2878  }
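
/*
 * Illustrative userspace sketch of the ioctl handled above (values are
 * assumptions of the example): tear down the mapping created earlier.
 * With the v2 interface the range may not bisect a prior mapping; every
 * mapping wholly inside the range is removed and, on return, size
 * reports how many bytes were actually unmapped.
 *
 *	struct vfio_iommu_type1_dma_unmap unmap = {
 *		.argsz = sizeof(unmap),
 *		.iova  = 0x100000,
 *		.size  = len,
 *	};
 *
 *	if (ioctl(container, VFIO_IOMMU_UNMAP_DMA, &unmap))
 *		return -1;
 */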
2879  
2880  static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
2881  					unsigned long arg)
2882  {
2883  	struct vfio_iommu_type1_dirty_bitmap dirty;
2884  	uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
2885  			VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
2886  			VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
2887  	unsigned long minsz;
2888  	int ret = 0;
2889  
2890  	if (!iommu->v2)
2891  		return -EACCES;
2892  
2893  	minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap, flags);
2894  
2895  	if (copy_from_user(&dirty, (void __user *)arg, minsz))
2896  		return -EFAULT;
2897  
2898  	if (dirty.argsz < minsz || dirty.flags & ~mask)
2899  		return -EINVAL;
2900  
2901  	/* only one flag should be set at a time */
2902  	if (__ffs(dirty.flags) != __fls(dirty.flags))
2903  		return -EINVAL;
2904  
2905  	if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
2906  		size_t pgsize;
2907  
2908  		mutex_lock(&iommu->lock);
2909  		pgsize = 1 << __ffs(iommu->pgsize_bitmap);
2910  		if (!iommu->dirty_page_tracking) {
2911  			ret = vfio_dma_bitmap_alloc_all(iommu, pgsize);
2912  			if (!ret)
2913  				iommu->dirty_page_tracking = true;
2914  		}
2915  		mutex_unlock(&iommu->lock);
2916  		return ret;
2917  	} else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
2918  		mutex_lock(&iommu->lock);
2919  		if (iommu->dirty_page_tracking) {
2920  			iommu->dirty_page_tracking = false;
2921  			vfio_dma_bitmap_free_all(iommu);
2922  		}
2923  		mutex_unlock(&iommu->lock);
2924  		return 0;
2925  	} else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
2926  		struct vfio_iommu_type1_dirty_bitmap_get range;
2927  		unsigned long pgshift;
2928  		size_t data_size = dirty.argsz - minsz;
2929  		size_t iommu_pgsize;
2930  
2931  		if (!data_size || data_size < sizeof(range))
2932  			return -EINVAL;
2933  
2934  		if (copy_from_user(&range, (void __user *)(arg + minsz),
2935  				   sizeof(range)))
2936  			return -EFAULT;
2937  
2938  		if (range.iova + range.size < range.iova)
2939  			return -EINVAL;
2940  		if (!access_ok((void __user *)range.bitmap.data,
2941  			       range.bitmap.size))
2942  			return -EINVAL;
2943  
2944  		pgshift = __ffs(range.bitmap.pgsize);
2945  		ret = verify_bitmap_size(range.size >> pgshift,
2946  					 range.bitmap.size);
2947  		if (ret)
2948  			return ret;
2949  
2950  		mutex_lock(&iommu->lock);
2951  
2952  		iommu_pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap);
2953  
2954  		/* allow only smallest supported pgsize */
2955  		if (range.bitmap.pgsize != iommu_pgsize) {
2956  			ret = -EINVAL;
2957  			goto out_unlock;
2958  		}
2959  		if (range.iova & (iommu_pgsize - 1)) {
2960  			ret = -EINVAL;
2961  			goto out_unlock;
2962  		}
2963  		if (!range.size || range.size & (iommu_pgsize - 1)) {
2964  			ret = -EINVAL;
2965  			goto out_unlock;
2966  		}
2967  
2968  		if (iommu->dirty_page_tracking)
2969  			ret = vfio_iova_dirty_bitmap(range.bitmap.data,
2970  						     iommu, range.iova,
2971  						     range.size,
2972  						     range.bitmap.pgsize);
2973  		else
2974  			ret = -EINVAL;
2975  out_unlock:
2976  		mutex_unlock(&iommu->lock);
2977  
2978  		return ret;
2979  	}
2980  
2981  	return -EINVAL;
2982  }
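
/*
 * Illustrative userspace sketch of the dirty-tracking ioctl above (a
 * minimal start/stop sequence; error handling is an assumption of the
 * example).  Only one flag may be set per call, so start and stop are
 * issued separately; VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP carries a
 * struct vfio_iommu_type1_dirty_bitmap_get payload after the header.
 *
 *	struct vfio_iommu_type1_dirty_bitmap dirty = {
 *		.argsz = sizeof(dirty),
 *		.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START,
 *	};
 *
 *	if (ioctl(container, VFIO_IOMMU_DIRTY_PAGES, &dirty))
 *		return -1;
 *
 *	(migration copies memory, periodically fetching the bitmap)
 *
 *	dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP;
 *	ioctl(container, VFIO_IOMMU_DIRTY_PAGES, &dirty);
 */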
2983  
2984  static long vfio_iommu_type1_ioctl(void *iommu_data,
2985  				   unsigned int cmd, unsigned long arg)
2986  {
2987  	struct vfio_iommu *iommu = iommu_data;
2988  
2989  	switch (cmd) {
2990  	case VFIO_CHECK_EXTENSION:
2991  		return vfio_iommu_type1_check_extension(iommu, arg);
2992  	case VFIO_IOMMU_GET_INFO:
2993  		return vfio_iommu_type1_get_info(iommu, arg);
2994  	case VFIO_IOMMU_MAP_DMA:
2995  		return vfio_iommu_type1_map_dma(iommu, arg);
2996  	case VFIO_IOMMU_UNMAP_DMA:
2997  		return vfio_iommu_type1_unmap_dma(iommu, arg);
2998  	case VFIO_IOMMU_DIRTY_PAGES:
2999  		return vfio_iommu_type1_dirty_pages(iommu, arg);
3000  	default:
3001  		return -ENOTTY;
3002  	}
3003  }
3004  
3005  static void vfio_iommu_type1_register_device(void *iommu_data,
3006  					     struct vfio_device *vdev)
3007  {
3008  	struct vfio_iommu *iommu = iommu_data;
3009  
3010  	if (!vdev->ops->dma_unmap)
3011  		return;
3012  
3013  	/*
3014  	 * list_empty(&iommu->device_list) is tested under iommu->lock, while
3015  	 * iterating the list for dma_unmap must be done under device_list_lock.
3016  	 * Holding both locks here allows avoiding the device_list_lock in
3017  	 * several fast paths. See vfio_notify_dma_unmap()
3018  	 */
3019  	mutex_lock(&iommu->lock);
3020  	mutex_lock(&iommu->device_list_lock);
3021  	list_add(&vdev->iommu_entry, &iommu->device_list);
3022  	mutex_unlock(&iommu->device_list_lock);
3023  	mutex_unlock(&iommu->lock);
3024  }
3025  
3026  static void vfio_iommu_type1_unregister_device(void *iommu_data,
3027  					       struct vfio_device *vdev)
3028  {
3029  	struct vfio_iommu *iommu = iommu_data;
3030  
3031  	if (!vdev->ops->dma_unmap)
3032  		return;
3033  
3034  	mutex_lock(&iommu->lock);
3035  	mutex_lock(&iommu->device_list_lock);
3036  	list_del(&vdev->iommu_entry);
3037  	mutex_unlock(&iommu->device_list_lock);
3038  	mutex_unlock(&iommu->lock);
3039  }
3040  
3041  static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
3042  					 dma_addr_t user_iova, void *data,
3043  					 size_t count, bool write,
3044  					 size_t *copied)
3045  {
3046  	struct mm_struct *mm;
3047  	unsigned long vaddr;
3048  	struct vfio_dma *dma;
3049  	bool kthread = current->mm == NULL;
3050  	size_t offset;
3051  
3052  	*copied = 0;
3053  
3054  	dma = vfio_find_dma(iommu, user_iova, 1);
3055  	if (!dma)
3056  		return -EINVAL;
3057  
3058  	if ((write && !(dma->prot & IOMMU_WRITE)) ||
3059  			!(dma->prot & IOMMU_READ))
3060  		return -EPERM;
3061  
3062  	mm = dma->mm;
3063  	if (!mmget_not_zero(mm))
3064  		return -EPERM;
3065  
3066  	if (kthread)
3067  		kthread_use_mm(mm);
3068  	else if (current->mm != mm)
3069  		goto out;
3070  
3071  	offset = user_iova - dma->iova;
3072  
3073  	if (count > dma->size - offset)
3074  		count = dma->size - offset;
3075  
3076  	vaddr = dma->vaddr + offset;
3077  
3078  	if (write) {
3079  		*copied = copy_to_user((void __user *)vaddr, data,
3080  					 count) ? 0 : count;
3081  		if (*copied && iommu->dirty_page_tracking) {
3082  			unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
3083  			/*
3084  			 * Bitmap populated with the smallest supported page
3085  			 * size
3086  			 */
3087  			bitmap_set(dma->bitmap, offset >> pgshift,
3088  				   ((offset + *copied - 1) >> pgshift) -
3089  				   (offset >> pgshift) + 1);
3090  		}
3091  	} else
3092  		*copied = copy_from_user(data, (void __user *)vaddr,
3093  					   count) ? 0 : count;
3094  	if (kthread)
3095  		kthread_unuse_mm(mm);
3096  out:
3097  	mmput(mm);
3098  	return *copied ? 0 : -EFAULT;
3099  }
3100  
3101  static int vfio_iommu_type1_dma_rw(void *iommu_data, dma_addr_t user_iova,
3102  				   void *data, size_t count, bool write)
3103  {
3104  	struct vfio_iommu *iommu = iommu_data;
3105  	int ret = 0;
3106  	size_t done;
3107  
3108  	mutex_lock(&iommu->lock);
3109  
3110  	if (WARN_ONCE(iommu->vaddr_invalid_count,
3111  		      "vfio_dma_rw not allowed with VFIO_UPDATE_VADDR\n")) {
3112  		ret = -EBUSY;
3113  		goto out;
3114  	}
3115  
3116  	while (count > 0) {
3117  		ret = vfio_iommu_type1_dma_rw_chunk(iommu, user_iova, data,
3118  						    count, write, &done);
3119  		if (ret)
3120  			break;
3121  
3122  		count -= done;
3123  		data += done;
3124  		user_iova += done;
3125  	}
3126  
3127  out:
3128  	mutex_unlock(&iommu->lock);
3129  	return ret;
3130  }
3131  
3132  static struct iommu_domain *
3133  vfio_iommu_type1_group_iommu_domain(void *iommu_data,
3134  				    struct iommu_group *iommu_group)
3135  {
3136  	struct iommu_domain *domain = ERR_PTR(-ENODEV);
3137  	struct vfio_iommu *iommu = iommu_data;
3138  	struct vfio_domain *d;
3139  
3140  	if (!iommu || !iommu_group)
3141  		return ERR_PTR(-EINVAL);
3142  
3143  	mutex_lock(&iommu->lock);
3144  	list_for_each_entry(d, &iommu->domain_list, next) {
3145  		if (find_iommu_group(d, iommu_group)) {
3146  			domain = d->domain;
3147  			break;
3148  		}
3149  	}
3150  	mutex_unlock(&iommu->lock);
3151  
3152  	return domain;
3153  }
3154  
3155  static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
3156  	.name			= "vfio-iommu-type1",
3157  	.owner			= THIS_MODULE,
3158  	.open			= vfio_iommu_type1_open,
3159  	.release		= vfio_iommu_type1_release,
3160  	.ioctl			= vfio_iommu_type1_ioctl,
3161  	.attach_group		= vfio_iommu_type1_attach_group,
3162  	.detach_group		= vfio_iommu_type1_detach_group,
3163  	.pin_pages		= vfio_iommu_type1_pin_pages,
3164  	.unpin_pages		= vfio_iommu_type1_unpin_pages,
3165  	.register_device	= vfio_iommu_type1_register_device,
3166  	.unregister_device	= vfio_iommu_type1_unregister_device,
3167  	.dma_rw			= vfio_iommu_type1_dma_rw,
3168  	.group_iommu_domain	= vfio_iommu_type1_group_iommu_domain,
3169  };
3170  
3171  static int __init vfio_iommu_type1_init(void)
3172  {
3173  	return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1);
3174  }
3175  
3176  static void __exit vfio_iommu_type1_cleanup(void)
3177  {
3178  	vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_type1);
3179  }
3180  
3181  module_init(vfio_iommu_type1_init);
3182  module_exit(vfio_iommu_type1_cleanup);
3183  
3184  MODULE_VERSION(DRIVER_VERSION);
3185  MODULE_LICENSE("GPL v2");
3186  MODULE_AUTHOR(DRIVER_AUTHOR);
3187  MODULE_DESCRIPTION(DRIVER_DESC);
3188