1  // SPDX-License-Identifier: GPL-2.0-only
2  /*
3   * VFIO core
4   *
5   * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
6   *     Author: Alex Williamson <alex.williamson@redhat.com>
7   *
8   * Derived from original vfio:
9   * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
10   * Author: Tom Lyon, pugs@cisco.com
11   */
12  
13  #include <linux/cdev.h>
14  #include <linux/compat.h>
15  #include <linux/device.h>
16  #include <linux/fs.h>
17  #include <linux/idr.h>
18  #include <linux/iommu.h>
19  #if IS_ENABLED(CONFIG_KVM)
20  #include <linux/kvm_host.h>
21  #endif
22  #include <linux/list.h>
23  #include <linux/miscdevice.h>
24  #include <linux/module.h>
25  #include <linux/mount.h>
26  #include <linux/mutex.h>
27  #include <linux/pci.h>
28  #include <linux/pseudo_fs.h>
29  #include <linux/rwsem.h>
30  #include <linux/sched.h>
31  #include <linux/slab.h>
32  #include <linux/stat.h>
33  #include <linux/string.h>
34  #include <linux/uaccess.h>
35  #include <linux/vfio.h>
36  #include <linux/wait.h>
37  #include <linux/sched/signal.h>
38  #include <linux/pm_runtime.h>
39  #include <linux/interval_tree.h>
40  #include <linux/iova_bitmap.h>
41  #include <linux/iommufd.h>
42  #include "vfio.h"
43  
44  #define DRIVER_VERSION	"0.3"
45  #define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
46  #define DRIVER_DESC	"VFIO - User Level meta-driver"
47  
48  #define VFIO_MAGIC 0x5646494f /* "VFIO" */
49  
50  static struct vfio {
51  	struct class			*device_class;
52  	struct ida			device_ida;
53  	struct vfsmount			*vfs_mount;
54  	int				fs_count;
55  } vfio;
56  
57  #ifdef CONFIG_VFIO_NOIOMMU
58  bool vfio_noiommu __read_mostly;
59  module_param_named(enable_unsafe_noiommu_mode,
60  		   vfio_noiommu, bool, S_IRUGO | S_IWUSR);
61  MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
62  #endif
63  
64  static DEFINE_XARRAY(vfio_device_set_xa);
65  
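/*
 * Look up or create the vfio_device_set keyed by @set_id and join @device
 * to it.  The xarray provides the atomic find-or-create; the per-set mutex
 * serializes membership changes.  A minimal caller sketch (driver object
 * names are illustrative only):
 *
 *	err = vfio_assign_device_set(&my->vdev, shared_parent);
 *	if (err)
 *		return err;
 */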
66  int vfio_assign_device_set(struct vfio_device *device, void *set_id)
67  {
68  	unsigned long idx = (unsigned long)set_id;
69  	struct vfio_device_set *new_dev_set;
70  	struct vfio_device_set *dev_set;
71  
72  	if (WARN_ON(!set_id))
73  		return -EINVAL;
74  
75  	/*
76  	 * Atomically acquire a singleton object in the xarray for this set_id
77  	 */
78  	xa_lock(&vfio_device_set_xa);
79  	dev_set = xa_load(&vfio_device_set_xa, idx);
80  	if (dev_set)
81  		goto found_get_ref;
82  	xa_unlock(&vfio_device_set_xa);
83  
84  	new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
85  	if (!new_dev_set)
86  		return -ENOMEM;
87  	mutex_init(&new_dev_set->lock);
88  	INIT_LIST_HEAD(&new_dev_set->device_list);
89  	new_dev_set->set_id = set_id;
90  
91  	xa_lock(&vfio_device_set_xa);
92  	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
93  			       GFP_KERNEL);
94  	if (!dev_set) {
95  		dev_set = new_dev_set;
96  		goto found_get_ref;
97  	}
98  
99  	kfree(new_dev_set);
100  	if (xa_is_err(dev_set)) {
101  		xa_unlock(&vfio_device_set_xa);
102  		return xa_err(dev_set);
103  	}
104  
105  found_get_ref:
106  	dev_set->device_count++;
107  	xa_unlock(&vfio_device_set_xa);
108  	mutex_lock(&dev_set->lock);
109  	device->dev_set = dev_set;
110  	list_add_tail(&device->dev_set_list, &dev_set->device_list);
111  	mutex_unlock(&dev_set->lock);
112  	return 0;
113  }
114  EXPORT_SYMBOL_GPL(vfio_assign_device_set);
115  
116  static void vfio_release_device_set(struct vfio_device *device)
117  {
118  	struct vfio_device_set *dev_set = device->dev_set;
119  
120  	if (!dev_set)
121  		return;
122  
123  	mutex_lock(&dev_set->lock);
124  	list_del(&device->dev_set_list);
125  	mutex_unlock(&dev_set->lock);
126  
127  	xa_lock(&vfio_device_set_xa);
128  	if (!--dev_set->device_count) {
129  		__xa_erase(&vfio_device_set_xa,
130  			   (unsigned long)dev_set->set_id);
131  		mutex_destroy(&dev_set->lock);
132  		kfree(dev_set);
133  	}
134  	xa_unlock(&vfio_device_set_xa);
135  }
136  
137  unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set)
138  {
139  	struct vfio_device *cur;
140  	unsigned int open_count = 0;
141  
142  	lockdep_assert_held(&dev_set->lock);
143  
144  	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
145  		open_count += cur->open_count;
146  	return open_count;
147  }
148  EXPORT_SYMBOL_GPL(vfio_device_set_open_count);
149  
150  struct vfio_device *
151  vfio_find_device_in_devset(struct vfio_device_set *dev_set,
152  			   struct device *dev)
153  {
154  	struct vfio_device *cur;
155  
156  	lockdep_assert_held(&dev_set->lock);
157  
158  	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
159  		if (cur->dev == dev)
160  			return cur;
161  	return NULL;
162  }
163  EXPORT_SYMBOL_GPL(vfio_find_device_in_devset);
164  
165  /*
166   * Device objects - create, release, get, put, search
167   */
168  /* Device reference always implies a group reference */
169  void vfio_device_put_registration(struct vfio_device *device)
170  {
171  	if (refcount_dec_and_test(&device->refcount))
172  		complete(&device->comp);
173  }
174  
175  bool vfio_device_try_get_registration(struct vfio_device *device)
176  {
177  	return refcount_inc_not_zero(&device->refcount);
178  }
179  
180  /*
181   * VFIO driver API
182   */
183  /* Release helper called by vfio_put_device() */
184  static void vfio_device_release(struct device *dev)
185  {
186  	struct vfio_device *device =
187  			container_of(dev, struct vfio_device, device);
188  
189  	vfio_release_device_set(device);
190  	ida_free(&vfio.device_ida, device->index);
191  
192  	if (device->ops->release)
193  		device->ops->release(device);
194  
195  	iput(device->inode);
196  	simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
197  	kvfree(device);
198  }
199  
200  static int vfio_init_device(struct vfio_device *device, struct device *dev,
201  			    const struct vfio_device_ops *ops);
202  
203  /*
204   * Allocate and initialize vfio_device so it can be registered to vfio
205   * core.
206   *
207   * Drivers should use the wrapper vfio_alloc_device() for allocation.
208   * @size is the size of the structure to be allocated, including any
209   * private data used by the driver.
210   *
211   * Driver may provide an @init callback to cover device private data.
212   *
213   * Use vfio_put_device() to release the structure after success return.
214   */
215  struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
216  				       const struct vfio_device_ops *ops)
217  {
218  	struct vfio_device *device;
219  	int ret;
220  
221  	if (WARN_ON(size < sizeof(struct vfio_device)))
222  		return ERR_PTR(-EINVAL);
223  
224  	device = kvzalloc(size, GFP_KERNEL);
225  	if (!device)
226  		return ERR_PTR(-ENOMEM);
227  
228  	ret = vfio_init_device(device, dev, ops);
229  	if (ret)
230  		goto out_free;
231  	return device;
232  
233  out_free:
234  	kvfree(device);
235  	return ERR_PTR(ret);
236  }
237  EXPORT_SYMBOL_GPL(_vfio_alloc_device);
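/*
 * A minimal allocation sketch using the vfio_alloc_device() wrapper (the
 * driver structure, member and ops names below are illustrative, not part
 * of this file; the embedded vfio_device must be the first member):
 *
 *	struct my_vfio_dev {
 *		struct vfio_device vdev;
 *		void *priv;
 *	};
 *
 *	my = vfio_alloc_device(my_vfio_dev, vdev, dev, &my_ops);
 *	if (IS_ERR(my))
 *		return PTR_ERR(my);
 *	...
 *	vfio_put_device(&my->vdev);
 */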
238  
239  static int vfio_fs_init_fs_context(struct fs_context *fc)
240  {
241  	return init_pseudo(fc, VFIO_MAGIC) ? 0 : -ENOMEM;
242  }
243  
244  static struct file_system_type vfio_fs_type = {
245  	.name = "vfio",
246  	.owner = THIS_MODULE,
247  	.init_fs_context = vfio_fs_init_fs_context,
248  	.kill_sb = kill_anon_super,
249  };
250  
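/*
 * Every vfio_device is backed by an anonymous inode on this "vfio" pseudo
 * filesystem.  This gives the device a stable address_space that drivers
 * can use to track and zap userspace mappings of device memory (for
 * example around reset), independent of any particular open file.
 */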
251  static struct inode *vfio_fs_inode_new(void)
252  {
253  	struct inode *inode;
254  	int ret;
255  
256  	ret = simple_pin_fs(&vfio_fs_type, &vfio.vfs_mount, &vfio.fs_count);
257  	if (ret)
258  		return ERR_PTR(ret);
259  
260  	inode = alloc_anon_inode(vfio.vfs_mount->mnt_sb);
261  	if (IS_ERR(inode))
262  		simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
263  
264  	return inode;
265  }
266  
267  /*
268   * Initialize a vfio_device so it can be registered to vfio core.
269   */
270  static int vfio_init_device(struct vfio_device *device, struct device *dev,
271  			    const struct vfio_device_ops *ops)
272  {
273  	int ret;
274  
275  	ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
276  	if (ret < 0) {
277  		dev_dbg(dev, "Error to alloc index\n");
278  		return ret;
279  	}
280  
281  	device->index = ret;
282  	init_completion(&device->comp);
283  	device->dev = dev;
284  	device->ops = ops;
285  	device->inode = vfio_fs_inode_new();
286  	if (IS_ERR(device->inode)) {
287  		ret = PTR_ERR(device->inode);
288  		goto out_inode;
289  	}
290  
291  	if (ops->init) {
292  		ret = ops->init(device);
293  		if (ret)
294  			goto out_uninit;
295  	}
296  
297  	device_initialize(&device->device);
298  	device->device.release = vfio_device_release;
299  	device->device.class = vfio.device_class;
300  	device->device.parent = device->dev;
301  	return 0;
302  
303  out_uninit:
304  	iput(device->inode);
305  	simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
306  out_inode:
307  	vfio_release_device_set(device);
308  	ida_free(&vfio.device_ida, device->index);
309  	return ret;
310  }
311  
312  static int __vfio_register_dev(struct vfio_device *device,
313  			       enum vfio_group_type type)
314  {
315  	int ret;
316  
317  	if (WARN_ON(IS_ENABLED(CONFIG_IOMMUFD) &&
318  		    (!device->ops->bind_iommufd ||
319  		     !device->ops->unbind_iommufd ||
320  		     !device->ops->attach_ioas ||
321  		     !device->ops->detach_ioas)))
322  		return -EINVAL;
323  
324  	/*
325  	 * If the driver doesn't specify a set then the device is added to a
326  	 * singleton set just for itself.
327  	 */
328  	if (!device->dev_set)
329  		vfio_assign_device_set(device, device);
330  
331  	ret = dev_set_name(&device->device, "vfio%d", device->index);
332  	if (ret)
333  		return ret;
334  
335  	ret = vfio_device_set_group(device, type);
336  	if (ret)
337  		return ret;
338  
339  	/*
340  	 * VFIO always sets IOMMU_CACHE because we offer no way for userspace to
341  	 * restore cache coherency. It has to be checked here because it is only
342  	 * valid for cases where we are using iommu groups.
343  	 */
344  	if (type == VFIO_IOMMU && !vfio_device_is_noiommu(device) &&
345  	    !device_iommu_capable(device->dev, IOMMU_CAP_CACHE_COHERENCY)) {
346  		ret = -EINVAL;
347  		goto err_out;
348  	}
349  
350  	ret = vfio_device_add(device);
351  	if (ret)
352  		goto err_out;
353  
354  	/* Refcounting can't start until the driver calls register */
355  	refcount_set(&device->refcount, 1);
356  
357  	vfio_device_group_register(device);
358  	vfio_device_debugfs_init(device);
359  
360  	return 0;
361  err_out:
362  	vfio_device_remove_group(device);
363  	return ret;
364  }
365  
366  int vfio_register_group_dev(struct vfio_device *device)
367  {
368  	return __vfio_register_dev(device, VFIO_IOMMU);
369  }
370  EXPORT_SYMBOL_GPL(vfio_register_group_dev);
371  
372  /*
373   * Register a virtual device without IOMMU backing.  The user of this
374   * device must not be able to directly trigger unmediated DMA.
375   */
376  int vfio_register_emulated_iommu_dev(struct vfio_device *device)
377  {
378  	return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
379  }
380  EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
381  
382  /*
383   * Decrement the device reference count and wait for the device to be
384   * removed.  Open file descriptors for the device... */
385  void vfio_unregister_group_dev(struct vfio_device *device)
386  {
387  	unsigned int i = 0;
388  	bool interrupted = false;
389  	long rc;
390  
391  	/*
392  	 * Prevent new devices from being opened by userspace via the
393  	 * VFIO_GROUP_GET_DEVICE_FD ioctl in the group path.
394  	 */
395  	vfio_device_group_unregister(device);
396  
397  	/*
398  	 * Balances vfio_device_add() in the register path and also prevents
399  	 * new devices from being opened by userspace in the cdev path.
400  	 */
401  	vfio_device_del(device);
402  
403  	vfio_device_put_registration(device);
404  	rc = try_wait_for_completion(&device->comp);
405  	while (rc <= 0) {
406  		if (device->ops->request)
407  			device->ops->request(device, i++);
408  
409  		if (interrupted) {
410  			rc = wait_for_completion_timeout(&device->comp,
411  							 HZ * 10);
412  		} else {
413  			rc = wait_for_completion_interruptible_timeout(
414  				&device->comp, HZ * 10);
415  			if (rc < 0) {
416  				interrupted = true;
417  				dev_warn(device->dev,
418  					 "Device is currently in use, task"
419  					 " \"%s\" (%d) "
420  					 "blocked until device is released",
421  					 current->comm, task_pid_nr(current));
422  			}
423  		}
424  	}
425  
426  	vfio_device_debugfs_exit(device);
427  	/* Balances vfio_device_set_group in register path */
428  	vfio_device_remove_group(device);
429  }
430  EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
431  
432  #if IS_ENABLED(CONFIG_KVM)
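/*
 * Take a reference on @kvm only if it is still alive.  The kvm symbols are
 * resolved with symbol_get() so that vfio does not carry a hard module
 * dependency on kvm; on success device->kvm and device->put_kvm are set,
 * and vfio_device_put_kvm() undoes both.
 */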
433  void vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm)
434  {
435  	void (*pfn)(struct kvm *kvm);
436  	bool (*fn)(struct kvm *kvm);
437  	bool ret;
438  
439  	lockdep_assert_held(&device->dev_set->lock);
440  
441  	if (!kvm)
442  		return;
443  
444  	pfn = symbol_get(kvm_put_kvm);
445  	if (WARN_ON(!pfn))
446  		return;
447  
448  	fn = symbol_get(kvm_get_kvm_safe);
449  	if (WARN_ON(!fn)) {
450  		symbol_put(kvm_put_kvm);
451  		return;
452  	}
453  
454  	ret = fn(kvm);
455  	symbol_put(kvm_get_kvm_safe);
456  	if (!ret) {
457  		symbol_put(kvm_put_kvm);
458  		return;
459  	}
460  
461  	device->put_kvm = pfn;
462  	device->kvm = kvm;
463  }
464  
465  void vfio_device_put_kvm(struct vfio_device *device)
466  {
467  	lockdep_assert_held(&device->dev_set->lock);
468  
469  	if (!device->kvm)
470  		return;
471  
472  	if (WARN_ON(!device->put_kvm))
473  		goto clear;
474  
475  	device->put_kvm(device->kvm);
476  	device->put_kvm = NULL;
477  	symbol_put(kvm_put_kvm);
478  
479  clear:
480  	device->kvm = NULL;
481  }
482  #endif
483  
484  /* true if the vfio_device has open_device() called but not close_device() */
485  static bool vfio_assert_device_open(struct vfio_device *device)
486  {
487  	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
488  }
489  
490  struct vfio_device_file *
491  vfio_allocate_device_file(struct vfio_device *device)
492  {
493  	struct vfio_device_file *df;
494  
495  	df = kzalloc(sizeof(*df), GFP_KERNEL_ACCOUNT);
496  	if (!df)
497  		return ERR_PTR(-ENOMEM);
498  
499  	df->device = device;
500  	spin_lock_init(&df->kvm_ref_lock);
501  
502  	return df;
503  }
504  
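/*
 * First open of a device: take a reference on the bound driver module,
 * attach the device to its IOMMU backing (iommufd bind in the cdev path,
 * the legacy group/container otherwise), and only then invoke the driver's
 * open_device() callback.
 */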
505  static int vfio_df_device_first_open(struct vfio_device_file *df)
506  {
507  	struct vfio_device *device = df->device;
508  	struct iommufd_ctx *iommufd = df->iommufd;
509  	int ret;
510  
511  	lockdep_assert_held(&device->dev_set->lock);
512  
513  	if (!try_module_get(device->dev->driver->owner))
514  		return -ENODEV;
515  
516  	if (iommufd)
517  		ret = vfio_df_iommufd_bind(df);
518  	else
519  		ret = vfio_device_group_use_iommu(device);
520  	if (ret)
521  		goto err_module_put;
522  
523  	if (device->ops->open_device) {
524  		ret = device->ops->open_device(device);
525  		if (ret)
526  			goto err_unuse_iommu;
527  	}
528  	return 0;
529  
530  err_unuse_iommu:
531  	if (iommufd)
532  		vfio_df_iommufd_unbind(df);
533  	else
534  		vfio_device_group_unuse_iommu(device);
535  err_module_put:
536  	module_put(device->dev->driver->owner);
537  	return ret;
538  }
539  
540  static void vfio_df_device_last_close(struct vfio_device_file *df)
541  {
542  	struct vfio_device *device = df->device;
543  	struct iommufd_ctx *iommufd = df->iommufd;
544  
545  	lockdep_assert_held(&device->dev_set->lock);
546  
547  	if (device->ops->close_device)
548  		device->ops->close_device(device);
549  	if (iommufd)
550  		vfio_df_iommufd_unbind(df);
551  	else
552  		vfio_device_group_unuse_iommu(device);
553  	module_put(device->dev->driver->owner);
554  }
555  
556  int vfio_df_open(struct vfio_device_file *df)
557  {
558  	struct vfio_device *device = df->device;
559  	int ret = 0;
560  
561  	lockdep_assert_held(&device->dev_set->lock);
562  
563  	/*
564  	 * Only the group path allows the device to be opened multiple
565  	 * times.  The device cdev path has no secure way to support this.
566  	 */
567  	if (device->open_count != 0 && !df->group)
568  		return -EINVAL;
569  
570  	device->open_count++;
571  	if (device->open_count == 1) {
572  		ret = vfio_df_device_first_open(df);
573  		if (ret)
574  			device->open_count--;
575  	}
576  
577  	return ret;
578  }
579  
580  void vfio_df_close(struct vfio_device_file *df)
581  {
582  	struct vfio_device *device = df->device;
583  
584  	lockdep_assert_held(&device->dev_set->lock);
585  
586  	vfio_assert_device_open(device);
587  	if (device->open_count == 1)
588  		vfio_df_device_last_close(df);
589  	device->open_count--;
590  }
591  
592  /*
593   * Wrapper around pm_runtime_resume_and_get().
594   * Return error code on failure or 0 on success.
595   */
596  static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
597  {
598  	struct device *dev = device->dev;
599  
600  	if (dev->driver && dev->driver->pm) {
601  		int ret;
602  
603  		ret = pm_runtime_resume_and_get(dev);
604  		if (ret) {
605  			dev_info_ratelimited(dev,
606  				"vfio: runtime resume failed %d\n", ret);
607  			return -EIO;
608  		}
609  	}
610  
611  	return 0;
612  }
613  
614  /*
615   * Wrapper around pm_runtime_put().
616   */
617  static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
618  {
619  	struct device *dev = device->dev;
620  
621  	if (dev->driver && dev->driver->pm)
622  		pm_runtime_put(dev);
623  }
624  
625  /*
626   * VFIO Device fd
627   */
628  static int vfio_device_fops_release(struct inode *inode, struct file *filep)
629  {
630  	struct vfio_device_file *df = filep->private_data;
631  	struct vfio_device *device = df->device;
632  
633  	if (df->group)
634  		vfio_df_group_close(df);
635  	else
636  		vfio_df_unbind_iommufd(df);
637  
638  	vfio_device_put_registration(device);
639  
640  	kfree(df);
641  
642  	return 0;
643  }
644  
645  /*
646   * vfio_mig_get_next_state - Compute the next step in the FSM
647   * @cur_fsm - The current state the device is in
648   * @new_fsm - The target state to reach
649   * @next_fsm - Pointer to the next step to get to new_fsm
650   *
651   * Return 0 upon success, otherwise -errno
652   * Upon success the next step in the state progression between cur_fsm and
653   * new_fsm will be set in next_fsm.
654   *
655   * This breaks down requests for combination transitions into smaller steps and
656   * returns the next step to get to new_fsm. The function may need to be called
657   * multiple times before reaching new_fsm.
658   *
659   */
660  int vfio_mig_get_next_state(struct vfio_device *device,
661  			    enum vfio_device_mig_state cur_fsm,
662  			    enum vfio_device_mig_state new_fsm,
663  			    enum vfio_device_mig_state *next_fsm)
664  {
665  	enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
666  	/*
667  	 * The coding in this table requires the driver to implement the
668  	 * following FSM arcs:
669  	 *         RESUMING -> STOP
670  	 *         STOP -> RESUMING
671  	 *         STOP -> STOP_COPY
672  	 *         STOP_COPY -> STOP
673  	 *
674  	 * If P2P is supported then the driver must also implement these FSM
675  	 * arcs:
676  	 *         RUNNING -> RUNNING_P2P
677  	 *         RUNNING_P2P -> RUNNING
678  	 *         RUNNING_P2P -> STOP
679  	 *         STOP -> RUNNING_P2P
680  	 *
681  	 * If precopy is supported then the driver must support these additional
682  	 * FSM arcs:
683  	 *         RUNNING -> PRE_COPY
684  	 *         PRE_COPY -> RUNNING
685  	 *         PRE_COPY -> STOP_COPY
686  	 * However, if precopy and P2P are supported together then the driver
687  	 * must support these additional arcs beyond the P2P arcs above:
688  	 *         PRE_COPY -> RUNNING
689  	 *         PRE_COPY -> PRE_COPY_P2P
690  	 *         PRE_COPY_P2P -> PRE_COPY
691  	 *         PRE_COPY_P2P -> RUNNING_P2P
692  	 *         PRE_COPY_P2P -> STOP_COPY
693  	 *         RUNNING -> PRE_COPY
694  	 *         RUNNING_P2P -> PRE_COPY_P2P
695  	 *
696  	 * Without P2P and precopy the driver must implement:
697  	 *         RUNNING -> STOP
698  	 *         STOP -> RUNNING
699  	 *
700  	 * The coding will step through multiple states for some combination
701  	 * transitions; if all optional features are supported, this means the
702  	 * following ones:
703  	 *         PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
704  	 *         PRE_COPY -> RUNNING -> RUNNING_P2P
705  	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
706  	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
707  	 *         PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
708  	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP
709  	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
710  	 *         RESUMING -> STOP -> RUNNING_P2P
711  	 *         RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
712  	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
713  	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
714  	 *         RESUMING -> STOP -> STOP_COPY
715  	 *         RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
716  	 *         RUNNING -> RUNNING_P2P -> STOP
717  	 *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
718  	 *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
719  	 *         RUNNING_P2P -> RUNNING -> PRE_COPY
720  	 *         RUNNING_P2P -> STOP -> RESUMING
721  	 *         RUNNING_P2P -> STOP -> STOP_COPY
722  	 *         STOP -> RUNNING_P2P -> PRE_COPY_P2P
723  	 *         STOP -> RUNNING_P2P -> RUNNING
724  	 *         STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
725  	 *         STOP_COPY -> STOP -> RESUMING
726  	 *         STOP_COPY -> STOP -> RUNNING_P2P
727  	 *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
728  	 *
729  	 *  The following transitions are blocked:
730  	 *         STOP_COPY -> PRE_COPY
731  	 *         STOP_COPY -> PRE_COPY_P2P
732  	 */
733  	static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
734  		[VFIO_DEVICE_STATE_STOP] = {
735  			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
736  			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
737  			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
738  			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
739  			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
740  			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
741  			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
742  			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
743  		},
744  		[VFIO_DEVICE_STATE_RUNNING] = {
745  			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
746  			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
747  			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
748  			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
749  			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
750  			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
751  			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
752  			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
753  		},
754  		[VFIO_DEVICE_STATE_PRE_COPY] = {
755  			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
756  			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
757  			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
758  			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
759  			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
760  			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
761  			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
762  			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
763  		},
764  		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
765  			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
766  			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
767  			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
768  			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
769  			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
770  			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
771  			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
772  			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
773  		},
774  		[VFIO_DEVICE_STATE_STOP_COPY] = {
775  			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
776  			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
777  			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
778  			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
779  			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
780  			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
781  			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
782  			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
783  		},
784  		[VFIO_DEVICE_STATE_RESUMING] = {
785  			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
786  			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
787  			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
788  			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
789  			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
790  			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
791  			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
792  			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
793  		},
794  		[VFIO_DEVICE_STATE_RUNNING_P2P] = {
795  			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
796  			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
797  			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
798  			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
799  			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
800  			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
801  			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
802  			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
803  		},
804  		[VFIO_DEVICE_STATE_ERROR] = {
805  			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
806  			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
807  			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
808  			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
809  			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
810  			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
811  			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
812  			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
813  		},
814  	};
815  
816  	static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
817  		[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
818  		[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
819  		[VFIO_DEVICE_STATE_PRE_COPY] =
820  			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
821  		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
822  						   VFIO_MIGRATION_P2P |
823  						   VFIO_MIGRATION_PRE_COPY,
824  		[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
825  		[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
826  		[VFIO_DEVICE_STATE_RUNNING_P2P] =
827  			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
828  		[VFIO_DEVICE_STATE_ERROR] = ~0U,
829  	};
830  
831  	if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
832  		    (state_flags_table[cur_fsm] & device->migration_flags) !=
833  			state_flags_table[cur_fsm]))
834  		return -EINVAL;
835  
836  	if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
837  	   (state_flags_table[new_fsm] & device->migration_flags) !=
838  			state_flags_table[new_fsm])
839  		return -EINVAL;
840  
841  	/*
842  	 * Arcs touching optional and unsupported states are skipped over. The
843  	 * driver will instead see an arc from the original state to the next
844  	 * logical state, as per the above comment.
845  	 */
846  	*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
847  	while ((state_flags_table[*next_fsm] & device->migration_flags) !=
848  			state_flags_table[*next_fsm])
849  		*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];
850  
851  	return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
852  }
853  EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
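/*
 * A typical driver-side sketch steps the FSM one arc at a time until the
 * requested state is reached (error handling omitted, names illustrative):
 *
 *	while (cur != new) {
 *		ret = vfio_mig_get_next_state(vdev, cur, new, &next);
 *		if (ret)
 *			break;
 *		... perform the cur -> next arc on the device ...
 *		cur = next;
 *	}
 */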
854  
855  /*
856   * Convert the driver's struct file into an FD number and return it to userspace
857   */
858  static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
859  				   struct vfio_device_feature_mig_state *mig)
860  {
861  	int ret;
862  	int fd;
863  
864  	fd = get_unused_fd_flags(O_CLOEXEC);
865  	if (fd < 0) {
866  		ret = fd;
867  		goto out_fput;
868  	}
869  
870  	mig->data_fd = fd;
871  	if (copy_to_user(arg, mig, sizeof(*mig))) {
872  		ret = -EFAULT;
873  		goto out_put_unused;
874  	}
875  	fd_install(fd, filp);
876  	return 0;
877  
878  out_put_unused:
879  	put_unused_fd(fd);
880  out_fput:
881  	fput(filp);
882  	return ret;
883  }
884  
885  static int
886  vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
887  					   u32 flags, void __user *arg,
888  					   size_t argsz)
889  {
890  	size_t minsz =
891  		offsetofend(struct vfio_device_feature_mig_state, data_fd);
892  	struct vfio_device_feature_mig_state mig;
893  	struct file *filp = NULL;
894  	int ret;
895  
896  	if (!device->mig_ops)
897  		return -ENOTTY;
898  
899  	ret = vfio_check_feature(flags, argsz,
900  				 VFIO_DEVICE_FEATURE_SET |
901  				 VFIO_DEVICE_FEATURE_GET,
902  				 sizeof(mig));
903  	if (ret != 1)
904  		return ret;
905  
906  	if (copy_from_user(&mig, arg, minsz))
907  		return -EFAULT;
908  
909  	if (flags & VFIO_DEVICE_FEATURE_GET) {
910  		enum vfio_device_mig_state curr_state;
911  
912  		ret = device->mig_ops->migration_get_state(device,
913  							   &curr_state);
914  		if (ret)
915  			return ret;
916  		mig.device_state = curr_state;
917  		goto out_copy;
918  	}
919  
920  	/* Handle the VFIO_DEVICE_FEATURE_SET */
921  	filp = device->mig_ops->migration_set_state(device, mig.device_state);
922  	if (IS_ERR(filp) || !filp)
923  		goto out_copy;
924  
925  	return vfio_ioct_mig_return_fd(filp, arg, &mig);
926  out_copy:
927  	mig.data_fd = -1;
928  	if (copy_to_user(arg, &mig, sizeof(mig)))
929  		return -EFAULT;
930  	if (IS_ERR(filp))
931  		return PTR_ERR(filp);
932  	return 0;
933  }
934  
935  static int
936  vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
937  					      u32 flags, void __user *arg,
938  					      size_t argsz)
939  {
940  	struct vfio_device_feature_mig_data_size data_size = {};
941  	unsigned long stop_copy_length;
942  	int ret;
943  
944  	if (!device->mig_ops)
945  		return -ENOTTY;
946  
947  	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
948  				 sizeof(data_size));
949  	if (ret != 1)
950  		return ret;
951  
952  	ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length);
953  	if (ret)
954  		return ret;
955  
956  	data_size.stop_copy_length = stop_copy_length;
957  	if (copy_to_user(arg, &data_size, sizeof(data_size)))
958  		return -EFAULT;
959  
960  	return 0;
961  }
962  
963  static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
964  					       u32 flags, void __user *arg,
965  					       size_t argsz)
966  {
967  	struct vfio_device_feature_migration mig = {
968  		.flags = device->migration_flags,
969  	};
970  	int ret;
971  
972  	if (!device->mig_ops)
973  		return -ENOTTY;
974  
975  	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
976  				 sizeof(mig));
977  	if (ret != 1)
978  		return ret;
979  	if (copy_to_user(arg, &mig, sizeof(mig)))
980  		return -EFAULT;
981  	return 0;
982  }
983  
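/*
 * Reduce the interval tree at @root from @cur_nodes to at most @req_nodes
 * ranges by repeatedly merging the two neighbouring ranges separated by the
 * smallest gap.  Drivers use this when userspace requests more DMA logging
 * ranges than the device can track natively.
 */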
984  void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes,
985  			      u32 req_nodes)
986  {
987  	struct interval_tree_node *prev, *curr, *comb_start, *comb_end;
988  	unsigned long min_gap, curr_gap;
989  
990  	/* Special shortcut when a single range is required */
991  	if (req_nodes == 1) {
992  		unsigned long last;
993  
994  		comb_start = interval_tree_iter_first(root, 0, ULONG_MAX);
995  
996  		/* Empty list */
997  		if (WARN_ON_ONCE(!comb_start))
998  			return;
999  
1000  		curr = comb_start;
1001  		while (curr) {
1002  			last = curr->last;
1003  			prev = curr;
1004  			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
1005  			if (prev != comb_start)
1006  				interval_tree_remove(prev, root);
1007  		}
1008  		comb_start->last = last;
1009  		return;
1010  	}
1011  
1012  	/* Combine ranges which have the smallest gap */
1013  	while (cur_nodes > req_nodes) {
1014  		prev = NULL;
1015  		min_gap = ULONG_MAX;
1016  		curr = interval_tree_iter_first(root, 0, ULONG_MAX);
1017  		while (curr) {
1018  			if (prev) {
1019  				curr_gap = curr->start - prev->last;
1020  				if (curr_gap < min_gap) {
1021  					min_gap = curr_gap;
1022  					comb_start = prev;
1023  					comb_end = curr;
1024  				}
1025  			}
1026  			prev = curr;
1027  			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
1028  		}
1029  
1030  		/* Empty list or no nodes to combine */
1031  		if (WARN_ON_ONCE(min_gap == ULONG_MAX))
1032  			break;
1033  
1034  		comb_start->last = comb_end->last;
1035  		interval_tree_remove(comb_end, root);
1036  		cur_nodes--;
1037  	}
1038  }
1039  EXPORT_SYMBOL_GPL(vfio_combine_iova_ranges);
1040  
1041  /* Ranges should fit into a single kernel page */
1042  #define LOG_MAX_RANGES \
1043  	(PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))
1044  
1045  static int
1046  vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
1047  					u32 flags, void __user *arg,
1048  					size_t argsz)
1049  {
1050  	size_t minsz =
1051  		offsetofend(struct vfio_device_feature_dma_logging_control,
1052  			    ranges);
1053  	struct vfio_device_feature_dma_logging_range __user *ranges;
1054  	struct vfio_device_feature_dma_logging_control control;
1055  	struct vfio_device_feature_dma_logging_range range;
1056  	struct rb_root_cached root = RB_ROOT_CACHED;
1057  	struct interval_tree_node *nodes;
1058  	u64 iova_end;
1059  	u32 nnodes;
1060  	int i, ret;
1061  
1062  	if (!device->log_ops)
1063  		return -ENOTTY;
1064  
1065  	ret = vfio_check_feature(flags, argsz,
1066  				 VFIO_DEVICE_FEATURE_SET,
1067  				 sizeof(control));
1068  	if (ret != 1)
1069  		return ret;
1070  
1071  	if (copy_from_user(&control, arg, minsz))
1072  		return -EFAULT;
1073  
1074  	nnodes = control.num_ranges;
1075  	if (!nnodes)
1076  		return -EINVAL;
1077  
1078  	if (nnodes > LOG_MAX_RANGES)
1079  		return -E2BIG;
1080  
1081  	ranges = u64_to_user_ptr(control.ranges);
1082  	nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
1083  			      GFP_KERNEL);
1084  	if (!nodes)
1085  		return -ENOMEM;
1086  
1087  	for (i = 0; i < nnodes; i++) {
1088  		if (copy_from_user(&range, &ranges[i], sizeof(range))) {
1089  			ret = -EFAULT;
1090  			goto end;
1091  		}
1092  		if (!IS_ALIGNED(range.iova, control.page_size) ||
1093  		    !IS_ALIGNED(range.length, control.page_size)) {
1094  			ret = -EINVAL;
1095  			goto end;
1096  		}
1097  
1098  		if (check_add_overflow(range.iova, range.length, &iova_end) ||
1099  		    iova_end > ULONG_MAX) {
1100  			ret = -EOVERFLOW;
1101  			goto end;
1102  		}
1103  
1104  		nodes[i].start = range.iova;
1105  		nodes[i].last = range.iova + range.length - 1;
1106  		if (interval_tree_iter_first(&root, nodes[i].start,
1107  					     nodes[i].last)) {
1108  			/* Range overlapping */
1109  			ret = -EINVAL;
1110  			goto end;
1111  		}
1112  		interval_tree_insert(nodes + i, &root);
1113  	}
1114  
1115  	ret = device->log_ops->log_start(device, &root, nnodes,
1116  					 &control.page_size);
1117  	if (ret)
1118  		goto end;
1119  
1120  	if (copy_to_user(arg, &control, sizeof(control))) {
1121  		ret = -EFAULT;
1122  		device->log_ops->log_stop(device);
1123  	}
1124  
1125  end:
1126  	kfree(nodes);
1127  	return ret;
1128  }
1129  
1130  static int
1131  vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
1132  				       u32 flags, void __user *arg,
1133  				       size_t argsz)
1134  {
1135  	int ret;
1136  
1137  	if (!device->log_ops)
1138  		return -ENOTTY;
1139  
1140  	ret = vfio_check_feature(flags, argsz,
1141  				 VFIO_DEVICE_FEATURE_SET, 0);
1142  	if (ret != 1)
1143  		return ret;
1144  
1145  	return device->log_ops->log_stop(device);
1146  }
1147  
1148  static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
1149  					  unsigned long iova, size_t length,
1150  					  void *opaque)
1151  {
1152  	struct vfio_device *device = opaque;
1153  
1154  	return device->log_ops->log_read_and_clear(device, iova, length, iter);
1155  }
1156  
1157  static int
1158  vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
1159  					 u32 flags, void __user *arg,
1160  					 size_t argsz)
1161  {
1162  	size_t minsz =
1163  		offsetofend(struct vfio_device_feature_dma_logging_report,
1164  			    bitmap);
1165  	struct vfio_device_feature_dma_logging_report report;
1166  	struct iova_bitmap *iter;
1167  	u64 iova_end;
1168  	int ret;
1169  
1170  	if (!device->log_ops)
1171  		return -ENOTTY;
1172  
1173  	ret = vfio_check_feature(flags, argsz,
1174  				 VFIO_DEVICE_FEATURE_GET,
1175  				 sizeof(report));
1176  	if (ret != 1)
1177  		return ret;
1178  
1179  	if (copy_from_user(&report, arg, minsz))
1180  		return -EFAULT;
1181  
1182  	if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
1183  		return -EINVAL;
1184  
1185  	if (check_add_overflow(report.iova, report.length, &iova_end) ||
1186  	    iova_end > ULONG_MAX)
1187  		return -EOVERFLOW;
1188  
1189  	iter = iova_bitmap_alloc(report.iova, report.length,
1190  				 report.page_size,
1191  				 u64_to_user_ptr(report.bitmap));
1192  	if (IS_ERR(iter))
1193  		return PTR_ERR(iter);
1194  
1195  	ret = iova_bitmap_for_each(iter, device,
1196  				   vfio_device_log_read_and_clear);
1197  
1198  	iova_bitmap_free(iter);
1199  	return ret;
1200  }
1201  
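/*
 * VFIO_DEVICE_FEATURE multiplexer: userspace passes a struct
 * vfio_device_feature header (argsz + flags) immediately followed by the
 * feature-specific payload.  GET and SET are mutually exclusive unless
 * PROBE is set; unrecognized features fall through to the driver's
 * device_feature() callback.
 */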
1202  static int vfio_ioctl_device_feature(struct vfio_device *device,
1203  				     struct vfio_device_feature __user *arg)
1204  {
1205  	size_t minsz = offsetofend(struct vfio_device_feature, flags);
1206  	struct vfio_device_feature feature;
1207  
1208  	if (copy_from_user(&feature, arg, minsz))
1209  		return -EFAULT;
1210  
1211  	if (feature.argsz < minsz)
1212  		return -EINVAL;
1213  
1214  	/* Check unknown flags */
1215  	if (feature.flags &
1216  	    ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
1217  	      VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
1218  		return -EINVAL;
1219  
1220  	/* GET & SET are mutually exclusive except with PROBE */
1221  	if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
1222  	    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
1223  	    (feature.flags & VFIO_DEVICE_FEATURE_GET))
1224  		return -EINVAL;
1225  
1226  	switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
1227  	case VFIO_DEVICE_FEATURE_MIGRATION:
1228  		return vfio_ioctl_device_feature_migration(
1229  			device, feature.flags, arg->data,
1230  			feature.argsz - minsz);
1231  	case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
1232  		return vfio_ioctl_device_feature_mig_device_state(
1233  			device, feature.flags, arg->data,
1234  			feature.argsz - minsz);
1235  	case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
1236  		return vfio_ioctl_device_feature_logging_start(
1237  			device, feature.flags, arg->data,
1238  			feature.argsz - minsz);
1239  	case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
1240  		return vfio_ioctl_device_feature_logging_stop(
1241  			device, feature.flags, arg->data,
1242  			feature.argsz - minsz);
1243  	case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
1244  		return vfio_ioctl_device_feature_logging_report(
1245  			device, feature.flags, arg->data,
1246  			feature.argsz - minsz);
1247  	case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
1248  		return vfio_ioctl_device_feature_migration_data_size(
1249  			device, feature.flags, arg->data,
1250  			feature.argsz - minsz);
1251  	default:
1252  		if (unlikely(!device->ops->device_feature))
1253  			return -EINVAL;
1254  		return device->ops->device_feature(device, feature.flags,
1255  						   arg->data,
1256  						   feature.argsz - minsz);
1257  	}
1258  }
1259  
1260  static long vfio_device_fops_unl_ioctl(struct file *filep,
1261  				       unsigned int cmd, unsigned long arg)
1262  {
1263  	struct vfio_device_file *df = filep->private_data;
1264  	struct vfio_device *device = df->device;
1265  	void __user *uptr = (void __user *)arg;
1266  	int ret;
1267  
1268  	if (cmd == VFIO_DEVICE_BIND_IOMMUFD)
1269  		return vfio_df_ioctl_bind_iommufd(df, uptr);
1270  
1271  	/* Paired with smp_store_release() following vfio_df_open() */
1272  	if (!smp_load_acquire(&df->access_granted))
1273  		return -EINVAL;
1274  
1275  	ret = vfio_device_pm_runtime_get(device);
1276  	if (ret)
1277  		return ret;
1278  
1279  	/* cdev only ioctls */
1280  	if (IS_ENABLED(CONFIG_VFIO_DEVICE_CDEV) && !df->group) {
1281  		switch (cmd) {
1282  		case VFIO_DEVICE_ATTACH_IOMMUFD_PT:
1283  			ret = vfio_df_ioctl_attach_pt(df, uptr);
1284  			goto out;
1285  
1286  		case VFIO_DEVICE_DETACH_IOMMUFD_PT:
1287  			ret = vfio_df_ioctl_detach_pt(df, uptr);
1288  			goto out;
1289  		}
1290  	}
1291  
1292  	switch (cmd) {
1293  	case VFIO_DEVICE_FEATURE:
1294  		ret = vfio_ioctl_device_feature(device, uptr);
1295  		break;
1296  
1297  	default:
1298  		if (unlikely(!device->ops->ioctl))
1299  			ret = -EINVAL;
1300  		else
1301  			ret = device->ops->ioctl(device, cmd, arg);
1302  		break;
1303  	}
1304  out:
1305  	vfio_device_pm_runtime_put(device);
1306  	return ret;
1307  }
1308  
1309  static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1310  				     size_t count, loff_t *ppos)
1311  {
1312  	struct vfio_device_file *df = filep->private_data;
1313  	struct vfio_device *device = df->device;
1314  
1315  	/* Paired with smp_store_release() following vfio_df_open() */
1316  	if (!smp_load_acquire(&df->access_granted))
1317  		return -EINVAL;
1318  
1319  	if (unlikely(!device->ops->read))
1320  		return -EINVAL;
1321  
1322  	return device->ops->read(device, buf, count, ppos);
1323  }
1324  
1325  static ssize_t vfio_device_fops_write(struct file *filep,
1326  				      const char __user *buf,
1327  				      size_t count, loff_t *ppos)
1328  {
1329  	struct vfio_device_file *df = filep->private_data;
1330  	struct vfio_device *device = df->device;
1331  
1332  	/* Paired with smp_store_release() following vfio_df_open() */
1333  	if (!smp_load_acquire(&df->access_granted))
1334  		return -EINVAL;
1335  
1336  	if (unlikely(!device->ops->write))
1337  		return -EINVAL;
1338  
1339  	return device->ops->write(device, buf, count, ppos);
1340  }
1341  
1342  static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1343  {
1344  	struct vfio_device_file *df = filep->private_data;
1345  	struct vfio_device *device = df->device;
1346  
1347  	/* Paired with smp_store_release() following vfio_df_open() */
1348  	if (!smp_load_acquire(&df->access_granted))
1349  		return -EINVAL;
1350  
1351  	if (unlikely(!device->ops->mmap))
1352  		return -EINVAL;
1353  
1354  	return device->ops->mmap(device, vma);
1355  }
1356  
1357  const struct file_operations vfio_device_fops = {
1358  	.owner		= THIS_MODULE,
1359  	.open		= vfio_device_fops_cdev_open,
1360  	.release	= vfio_device_fops_release,
1361  	.read		= vfio_device_fops_read,
1362  	.write		= vfio_device_fops_write,
1363  	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
1364  	.compat_ioctl	= compat_ptr_ioctl,
1365  	.mmap		= vfio_device_fops_mmap,
1366  };
1367  
1368  static struct vfio_device *vfio_device_from_file(struct file *file)
1369  {
1370  	struct vfio_device_file *df = file->private_data;
1371  
1372  	if (file->f_op != &vfio_device_fops)
1373  		return NULL;
1374  	return df->device;
1375  }
1376  
1377  /**
1378   * vfio_file_is_valid - True if the file is valid vfio file
1379   * @file: VFIO group file or VFIO device file
1380   */
1381  bool vfio_file_is_valid(struct file *file)
1382  {
1383  	return vfio_group_from_file(file) ||
1384  	       vfio_device_from_file(file);
1385  }
1386  EXPORT_SYMBOL_GPL(vfio_file_is_valid);
1387  
1388  /**
1389   * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
1390   *        is always CPU cache coherent
1391   * @file: VFIO group file or VFIO device file
1392   *
1393   * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop
1394   * bit in DMA transactions. A return of false indicates that the user has
1395   * rights to access additional instructions such as wbinvd on x86.
1396   */
1397  bool vfio_file_enforced_coherent(struct file *file)
1398  {
1399  	struct vfio_device *device;
1400  	struct vfio_group *group;
1401  
1402  	group = vfio_group_from_file(file);
1403  	if (group)
1404  		return vfio_group_enforced_coherent(group);
1405  
1406  	device = vfio_device_from_file(file);
1407  	if (device)
1408  		return device_iommu_capable(device->dev,
1409  					    IOMMU_CAP_ENFORCE_CACHE_COHERENCY);
1410  
1411  	return true;
1412  }
1413  EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);
1414  
1415  static void vfio_device_file_set_kvm(struct file *file, struct kvm *kvm)
1416  {
1417  	struct vfio_device_file *df = file->private_data;
1418  
1419  	/*
1420  	 * The kvm is first recorded in the vfio_device_file, and will
1421  	 * be propagated to vfio_device::kvm when the file is bound to
1422  	 * iommufd successfully in the vfio device cdev path.
1423  	 */
1424  	spin_lock(&df->kvm_ref_lock);
1425  	df->kvm = kvm;
1426  	spin_unlock(&df->kvm_ref_lock);
1427  }
1428  
1429  /**
1430   * vfio_file_set_kvm - Link a kvm with VFIO drivers
1431   * @file: VFIO group file or VFIO device file
1432   * @kvm: KVM to link
1433   *
1434   * When a VFIO device is first opened the KVM will be available in
1435   * device->kvm if one was associated with the file.
1436   */
1437  void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
1438  {
1439  	struct vfio_group *group;
1440  
1441  	group = vfio_group_from_file(file);
1442  	if (group)
1443  		vfio_group_set_kvm(group, kvm);
1444  
1445  	if (vfio_device_from_file(file))
1446  		vfio_device_file_set_kvm(file, kvm);
1447  }
1448  EXPORT_SYMBOL_GPL(vfio_file_set_kvm);
1449  
1450  /*
1451   * Sub-module support
1452   */
1453  /*
1454   * Helper for managing a buffer of info chain capabilities, allocate or
1455   * reallocate a buffer with additional @size, filling in @id and @version
1456   * of the capability.  A pointer to the new capability is returned.
1457   *
1458   * NB. The chain is based at the head of the buffer, so new entries are
1459   * added to the tail, vfio_info_cap_shift() should be called to fixup the
1460   * next offsets prior to copying to the user buffer.
1461   */
1462  struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1463  					       size_t size, u16 id, u16 version)
1464  {
1465  	void *buf;
1466  	struct vfio_info_cap_header *header, *tmp;
1467  
1468  	/* Ensure that the next capability struct will be aligned */
1469  	size = ALIGN(size, sizeof(u64));
1470  
1471  	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1472  	if (!buf) {
1473  		kfree(caps->buf);
1474  		caps->buf = NULL;
1475  		caps->size = 0;
1476  		return ERR_PTR(-ENOMEM);
1477  	}
1478  
1479  	caps->buf = buf;
1480  	header = buf + caps->size;
1481  
1482  	/* Eventually copied to user buffer, zero */
1483  	memset(header, 0, size);
1484  
1485  	header->id = id;
1486  	header->version = version;
1487  
1488  	/* Add to the end of the capability chain */
1489  	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1490  		; /* nothing */
1491  
1492  	tmp->next = caps->size;
1493  	caps->size += size;
1494  
1495  	return header;
1496  }
1497  EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1498  
1499  void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1500  {
1501  	struct vfio_info_cap_header *tmp;
1502  	void *buf = (void *)caps->buf;
1503  
1504  	/* Capability structs should start with proper alignment */
1505  	WARN_ON(!IS_ALIGNED(offset, sizeof(u64)));
1506  
1507  	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1508  		tmp->next += offset;
1509  }
1510  EXPORT_SYMBOL(vfio_info_cap_shift);
1511  
1512  int vfio_info_add_capability(struct vfio_info_cap *caps,
1513  			     struct vfio_info_cap_header *cap, size_t size)
1514  {
1515  	struct vfio_info_cap_header *header;
1516  
1517  	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1518  	if (IS_ERR(header))
1519  		return PTR_ERR(header);
1520  
1521  	memcpy(header + 1, cap + 1, size - sizeof(*header));
1522  
1523  	return 0;
1524  }
1525  EXPORT_SYMBOL(vfio_info_add_capability);
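/*
 * A minimal sketch of building and returning a capability chain from a
 * driver ioctl (the surrounding "info" structure, capability "cap" and user
 * pointer are illustrative):
 *
 *	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 *
 *	ret = vfio_info_add_capability(&caps, &cap.header, sizeof(cap));
 *	...
 *	if (caps.size) {
 *		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
 *		info.cap_offset = sizeof(info);
 *		vfio_info_cap_shift(&caps, sizeof(info));
 *		if (copy_to_user(arg + sizeof(info), caps.buf, caps.size))
 *			ret = -EFAULT;
 *	}
 *	kfree(caps.buf);
 */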
1526  
1527  int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1528  				       int max_irq_type, size_t *data_size)
1529  {
1530  	unsigned long minsz;
1531  	size_t size;
1532  
1533  	minsz = offsetofend(struct vfio_irq_set, count);
1534  
1535  	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1536  	    (hdr->count >= (U32_MAX - hdr->start)) ||
1537  	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1538  				VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1539  		return -EINVAL;
1540  
1541  	if (data_size)
1542  		*data_size = 0;
1543  
1544  	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1545  		return -EINVAL;
1546  
1547  	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1548  	case VFIO_IRQ_SET_DATA_NONE:
1549  		size = 0;
1550  		break;
1551  	case VFIO_IRQ_SET_DATA_BOOL:
1552  		size = sizeof(uint8_t);
1553  		break;
1554  	case VFIO_IRQ_SET_DATA_EVENTFD:
1555  		size = sizeof(int32_t);
1556  		break;
1557  	default:
1558  		return -EINVAL;
1559  	}
1560  
1561  	if (size) {
1562  		if (hdr->argsz - minsz < hdr->count * size)
1563  			return -EINVAL;
1564  
1565  		if (!data_size)
1566  			return -EINVAL;
1567  
1568  		*data_size = hdr->count * size;
1569  	}
1570  
1571  	return 0;
1572  }
1573  EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
1574  
1575  /*
1576   * Pin contiguous user pages and return their associated host pages for local
1577   * domain only.
1578   * @device [in]  : device
1579   * @iova [in]    : starting IOVA of user pages to be pinned.
1580   * @npage [in]   : count of pages to be pinned.  This count should not
1581   *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1582   * @prot [in]    : protection flags
1583   * @pages[out]   : array of host pages
1584   * Return error or number of pages pinned.
1585   *
1586   * A driver may only call this function if the vfio_device was created
1587   * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages().
1588   */
1589  int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
1590  		   int npage, int prot, struct page **pages)
1591  {
1592  	/* group->container cannot change while a vfio device is open */
1593  	if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
1594  		return -EINVAL;
1595  	if (!device->ops->dma_unmap)
1596  		return -EINVAL;
1597  	if (vfio_device_has_container(device))
1598  		return vfio_device_container_pin_pages(device, iova,
1599  						       npage, prot, pages);
1600  	if (device->iommufd_access) {
1601  		int ret;
1602  
1603  		if (iova > ULONG_MAX)
1604  			return -EINVAL;
1605  		/*
1606  		 * VFIO ignores the sub page offset, npages is from the start of
1607  		 * a PAGE_SIZE chunk of IOVA. The caller is expected to recover
1608  		 * the sub page offset by doing:
1609  		 *     pages[0] + (iova % PAGE_SIZE)
1610  		 */
1611  		ret = iommufd_access_pin_pages(
1612  			device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
1613  			npage * PAGE_SIZE, pages,
1614  			(prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
1615  		if (ret)
1616  			return ret;
1617  		return npage;
1618  	}
1619  	return -EINVAL;
1620  }
1621  EXPORT_SYMBOL(vfio_pin_pages);
1622  
1623  /*
1624   * Unpin contiguous host pages for local domain only.
1625   * @device [in]  : device
1626   * @iova [in]    : starting address of user pages to be unpinned.
1627   * @npage [in]   : count of pages to be unpinned.  This count should not
1628   *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1629   */
1630  void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
1631  {
1632  	if (WARN_ON(!vfio_assert_device_open(device)))
1633  		return;
1634  	if (WARN_ON(!device->ops->dma_unmap))
1635  		return;
1636  
1637  	if (vfio_device_has_container(device)) {
1638  		vfio_device_container_unpin_pages(device, iova, npage);
1639  		return;
1640  	}
1641  	if (device->iommufd_access) {
1642  		if (WARN_ON(iova > ULONG_MAX))
1643  			return;
1644  		iommufd_access_unpin_pages(device->iommufd_access,
1645  					   ALIGN_DOWN(iova, PAGE_SIZE),
1646  					   npage * PAGE_SIZE);
1647  		return;
1648  	}
1649  }
1650  EXPORT_SYMBOL(vfio_unpin_pages);
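/*
 * A pin/unpin sketch for an emulated-IOMMU driver (illustrative only):
 *
 *	struct page *pages[1];
 *	void *va;
 *
 *	ret = vfio_pin_pages(&my->vdev, iova, 1, IOMMU_READ | IOMMU_WRITE,
 *			     pages);
 *	if (ret != 1)
 *		return ret < 0 ? ret : -EFAULT;
 *	va = kmap_local_page(pages[0]);
 *	... access va + (iova % PAGE_SIZE), staying within the page ...
 *	kunmap_local(va);
 *	vfio_unpin_pages(&my->vdev, iova, 1);
 */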
1651  
1652  /*
1653   * This interface allows the CPUs to perform some sort of virtual DMA on
1654   * behalf of the device.
1655   *
1656   * CPUs read/write from/into a range of IOVAs pointing to user space memory
1657   * into/from a kernel buffer.
1658   *
1659   * As the read/write of user space memory is conducted via the CPUs and is
1660   * not a real device DMA, it is not necessary to pin the user space memory.
1661   *
1662   * @device [in]		: VFIO device
1663   * @iova [in]		: base IOVA of a user space buffer
1664   * @data [in]		: pointer to kernel buffer
1665   * @len [in]		: kernel buffer length
1666   * @write		: indicate read or write
1667   * Return error code on failure or 0 on success.
1668   */
1669  int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
1670  		size_t len, bool write)
1671  {
1672  	if (!data || len <= 0 || !vfio_assert_device_open(device))
1673  		return -EINVAL;
1674  
1675  	if (vfio_device_has_container(device))
1676  		return vfio_device_container_dma_rw(device, iova,
1677  						    data, len, write);
1678  
1679  	if (device->iommufd_access) {
1680  		unsigned int flags = 0;
1681  
1682  		if (iova > ULONG_MAX)
1683  			return -EINVAL;
1684  
1685  		/* VFIO historically tries to auto-detect a kthread */
1686  		if (!current->mm)
1687  			flags |= IOMMUFD_ACCESS_RW_KTHREAD;
1688  		if (write)
1689  			flags |= IOMMUFD_ACCESS_RW_WRITE;
1690  		return iommufd_access_rw(device->iommufd_access, iova, data,
1691  					 len, flags);
1692  	}
1693  	return -EINVAL;
1694  }
1695  EXPORT_SYMBOL(vfio_dma_rw);
1696  
1697  /*
1698   * Module/class support
1699   */
1700  static int __init vfio_init(void)
1701  {
1702  	int ret;
1703  
1704  	ida_init(&vfio.device_ida);
1705  
1706  	ret = vfio_group_init();
1707  	if (ret)
1708  		return ret;
1709  
1710  	ret = vfio_virqfd_init();
1711  	if (ret)
1712  		goto err_virqfd;
1713  
1714  	/* /sys/class/vfio-dev/vfioX */
1715  	vfio.device_class = class_create("vfio-dev");
1716  	if (IS_ERR(vfio.device_class)) {
1717  		ret = PTR_ERR(vfio.device_class);
1718  		goto err_dev_class;
1719  	}
1720  
1721  	ret = vfio_cdev_init(vfio.device_class);
1722  	if (ret)
1723  		goto err_alloc_dev_chrdev;
1724  
1725  	vfio_debugfs_create_root();
1726  	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
1727  	return 0;
1728  
1729  err_alloc_dev_chrdev:
1730  	class_destroy(vfio.device_class);
1731  	vfio.device_class = NULL;
1732  err_dev_class:
1733  	vfio_virqfd_exit();
1734  err_virqfd:
1735  	vfio_group_cleanup();
1736  	return ret;
1737  }
1738  
1739  static void __exit vfio_cleanup(void)
1740  {
1741  	vfio_debugfs_remove_root();
1742  	ida_destroy(&vfio.device_ida);
1743  	vfio_cdev_cleanup();
1744  	class_destroy(vfio.device_class);
1745  	vfio.device_class = NULL;
1746  	vfio_virqfd_exit();
1747  	vfio_group_cleanup();
1748  	xa_destroy(&vfio_device_set_xa);
1749  }
1750  
1751  module_init(vfio_init);
1752  module_exit(vfio_cleanup);
1753  
1754  MODULE_IMPORT_NS(IOMMUFD);
1755  MODULE_VERSION(DRIVER_VERSION);
1756  MODULE_LICENSE("GPL v2");
1757  MODULE_AUTHOR(DRIVER_AUTHOR);
1758  MODULE_DESCRIPTION(DRIVER_DESC);
1759  MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");
1760