// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
 */

#include <linux/bug.h>
#include <linux/cpu_pm.h>
#include <linux/entry-kvm.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/kvm_host.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/fs.h>
#include <linux/mman.h>
#include <linux/sched.h>
#include <linux/kvm.h>
#include <linux/kvm_irqfd.h>
#include <linux/irqbypass.h>
#include <linux/sched/stat.h>
#include <linux/psci.h>
#include <trace/events/kvm.h>

#define CREATE_TRACE_POINTS
#include "trace_arm.h"

#include <linux/uaccess.h>
#include <asm/ptrace.h>
#include <asm/mman.h>
#include <asm/tlbflush.h>
#include <asm/cacheflush.h>
#include <asm/cpufeature.h>
#include <asm/virt.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_nested.h>
#include <asm/kvm_pkvm.h>
#include <asm/kvm_ptrauth.h>
#include <asm/sections.h>

#include <kvm/arm_hypercalls.h>
#include <kvm/arm_pmu.h>
#include <kvm/arm_psci.h>

#include "sys_regs.h"

static enum kvm_mode kvm_mode = KVM_MODE_DEFAULT;

enum kvm_wfx_trap_policy {
	KVM_WFX_NOTRAP_SINGLE_TASK, /* Default option */
	KVM_WFX_NOTRAP,
	KVM_WFX_TRAP,
};

static enum kvm_wfx_trap_policy kvm_wfi_trap_policy __read_mostly = KVM_WFX_NOTRAP_SINGLE_TASK;
static enum kvm_wfx_trap_policy kvm_wfe_trap_policy __read_mostly = KVM_WFX_NOTRAP_SINGLE_TASK;

DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector);

DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);

DECLARE_KVM_NVHE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);

static bool vgic_present, kvm_arm_initialised;

static DEFINE_PER_CPU(unsigned char, kvm_hyp_initialized);
DEFINE_STATIC_KEY_FALSE(userspace_irqchip_in_use);

bool is_kvm_arm_initialised(void)
{
	return kvm_arm_initialised;
}

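/*
 * Only IPI the target vCPU if it was still in guest mode when the request
 * was posted: kvm_vcpu_exiting_guest_mode() flips vcpu->mode with a cmpxchg,
 * so a vCPU that has already exited will pick up the request on its next
 * guest entry without needing a kick.
 */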
int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
{
	return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
}

/*
 * This functions as an allow-list of protected VM capabilities.
 * Features not explicitly allowed by this function are denied.
 */
static bool pkvm_ext_allowed(struct kvm *kvm, long ext)
{
	switch (ext) {
	case KVM_CAP_IRQCHIP:
	case KVM_CAP_ARM_PSCI:
	case KVM_CAP_ARM_PSCI_0_2:
	case KVM_CAP_NR_VCPUS:
	case KVM_CAP_MAX_VCPUS:
	case KVM_CAP_MAX_VCPU_ID:
	case KVM_CAP_MSI_DEVID:
	case KVM_CAP_ARM_VM_IPA_SIZE:
	case KVM_CAP_ARM_PMU_V3:
	case KVM_CAP_ARM_SVE:
	case KVM_CAP_ARM_PTRAUTH_ADDRESS:
	case KVM_CAP_ARM_PTRAUTH_GENERIC:
		return true;
	default:
		return false;
	}
}

int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
			    struct kvm_enable_cap *cap)
{
	int r = -EINVAL;

	if (cap->flags)
		return -EINVAL;

	if (kvm_vm_is_protected(kvm) && !pkvm_ext_allowed(kvm, cap->cap))
		return -EINVAL;

	switch (cap->cap) {
	case KVM_CAP_ARM_NISV_TO_USER:
		r = 0;
		set_bit(KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER,
			&kvm->arch.flags);
		break;
	case KVM_CAP_ARM_MTE:
		mutex_lock(&kvm->lock);
		if (system_supports_mte() && !kvm->created_vcpus) {
			r = 0;
			set_bit(KVM_ARCH_FLAG_MTE_ENABLED, &kvm->arch.flags);
		}
		mutex_unlock(&kvm->lock);
		break;
	case KVM_CAP_ARM_SYSTEM_SUSPEND:
		r = 0;
		set_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags);
		break;
	case KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE:
		mutex_lock(&kvm->slots_lock);
		/*
		 * To keep things simple, allow changing the chunk
		 * size only when no memory slots have been created.
		 */
		if (kvm_are_all_memslots_empty(kvm)) {
			u64 new_cap = cap->args[0];

			if (!new_cap || kvm_is_block_size_supported(new_cap)) {
				r = 0;
				kvm->arch.mmu.split_page_chunk_size = new_cap;
			}
		}
		mutex_unlock(&kvm->slots_lock);
		break;
	default:
		break;
	}

	return r;
}

static int kvm_arm_default_max_vcpus(void)
{
	return vgic_present ? kvm_vgic_get_max_vcpus() : KVM_MAX_VCPUS;
}

/**
 * kvm_arch_init_vm - initializes a VM data structure
 * @kvm:	pointer to the KVM struct
 * @type:	kvm device type
 */
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
{
	int ret;

	mutex_init(&kvm->arch.config_lock);

#ifdef CONFIG_LOCKDEP
	/* Clue in lockdep that the config_lock must be taken inside kvm->lock */
	mutex_lock(&kvm->lock);
	mutex_lock(&kvm->arch.config_lock);
	mutex_unlock(&kvm->arch.config_lock);
	mutex_unlock(&kvm->lock);
#endif

	kvm_init_nested(kvm);

	ret = kvm_share_hyp(kvm, kvm + 1);
	if (ret)
		return ret;

	ret = pkvm_init_host_vm(kvm);
	if (ret)
		goto err_unshare_kvm;

	if (!zalloc_cpumask_var(&kvm->arch.supported_cpus, GFP_KERNEL_ACCOUNT)) {
		ret = -ENOMEM;
		goto err_unshare_kvm;
	}
	cpumask_copy(kvm->arch.supported_cpus, cpu_possible_mask);

	ret = kvm_init_stage2_mmu(kvm, &kvm->arch.mmu, type);
	if (ret)
		goto err_free_cpumask;

	kvm_vgic_early_init(kvm);

	kvm_timer_init_vm(kvm);

	/* The maximum number of VCPUs is limited by the host's GIC model */
	kvm->max_vcpus = kvm_arm_default_max_vcpus();

	kvm_arm_init_hypercalls(kvm);

	bitmap_zero(kvm->arch.vcpu_features, KVM_VCPU_MAX_FEATURES);

	return 0;

err_free_cpumask:
	free_cpumask_var(kvm->arch.supported_cpus);
err_unshare_kvm:
	kvm_unshare_hyp(kvm, kvm + 1);
	return ret;
}

vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
{
	return VM_FAULT_SIGBUS;
}

void kvm_arch_create_vm_debugfs(struct kvm *kvm)
{
	kvm_sys_regs_create_debugfs(kvm);
	kvm_s2_ptdump_create_debugfs(kvm);
}

static void kvm_destroy_mpidr_data(struct kvm *kvm)
{
	struct kvm_mpidr_data *data;

	mutex_lock(&kvm->arch.config_lock);

	data = rcu_dereference_protected(kvm->arch.mpidr_data,
					 lockdep_is_held(&kvm->arch.config_lock));
	if (data) {
		rcu_assign_pointer(kvm->arch.mpidr_data, NULL);
		synchronize_rcu();
		kfree(data);
	}

	mutex_unlock(&kvm->arch.config_lock);
}

/**
 * kvm_arch_destroy_vm - destroy the VM data structure
 * @kvm:	pointer to the KVM struct
 */
void kvm_arch_destroy_vm(struct kvm *kvm)
{
	bitmap_free(kvm->arch.pmu_filter);
	free_cpumask_var(kvm->arch.supported_cpus);

	kvm_vgic_destroy(kvm);

	if (is_protected_kvm_enabled())
		pkvm_destroy_hyp_vm(kvm);

	kvm_destroy_mpidr_data(kvm);

	kfree(kvm->arch.sysreg_masks);
	kvm_destroy_vcpus(kvm);

	kvm_unshare_hyp(kvm, kvm + 1);

	kvm_arm_teardown_hypercalls(kvm);
}

static bool kvm_has_full_ptr_auth(void)
{
	bool apa, gpa, api, gpi, apa3, gpa3;
	u64 isar1, isar2, val;

	/*
	 * Check that:
	 *
	 * - both Address and Generic auth are implemented for a given
	 *   algorithm (Q5, IMPDEF or Q3)
	 * - only a single algorithm is implemented.
	 */
	if (!system_has_full_ptr_auth())
		return false;

	isar1 = read_sanitised_ftr_reg(SYS_ID_AA64ISAR1_EL1);
	isar2 = read_sanitised_ftr_reg(SYS_ID_AA64ISAR2_EL1);

	apa = !!FIELD_GET(ID_AA64ISAR1_EL1_APA_MASK, isar1);
	val = FIELD_GET(ID_AA64ISAR1_EL1_GPA_MASK, isar1);
	gpa = (val == ID_AA64ISAR1_EL1_GPA_IMP);

	api = !!FIELD_GET(ID_AA64ISAR1_EL1_API_MASK, isar1);
	val = FIELD_GET(ID_AA64ISAR1_EL1_GPI_MASK, isar1);
	gpi = (val == ID_AA64ISAR1_EL1_GPI_IMP);

	apa3 = !!FIELD_GET(ID_AA64ISAR2_EL1_APA3_MASK, isar2);
	val  = FIELD_GET(ID_AA64ISAR2_EL1_GPA3_MASK, isar2);
	gpa3 = (val == ID_AA64ISAR2_EL1_GPA3_IMP);

	return (apa == gpa && api == gpi && apa3 == gpa3 &&
		(apa + api + apa3) == 1);
}

int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
{
	int r;

	if (kvm && kvm_vm_is_protected(kvm) && !pkvm_ext_allowed(kvm, ext))
		return 0;

	switch (ext) {
	case KVM_CAP_IRQCHIP:
		r = vgic_present;
		break;
	case KVM_CAP_IOEVENTFD:
	case KVM_CAP_USER_MEMORY:
	case KVM_CAP_SYNC_MMU:
	case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
	case KVM_CAP_ONE_REG:
	case KVM_CAP_ARM_PSCI:
	case KVM_CAP_ARM_PSCI_0_2:
	case KVM_CAP_READONLY_MEM:
	case KVM_CAP_MP_STATE:
	case KVM_CAP_IMMEDIATE_EXIT:
	case KVM_CAP_VCPU_EVENTS:
	case KVM_CAP_ARM_IRQ_LINE_LAYOUT_2:
	case KVM_CAP_ARM_NISV_TO_USER:
	case KVM_CAP_ARM_INJECT_EXT_DABT:
	case KVM_CAP_SET_GUEST_DEBUG:
	case KVM_CAP_VCPU_ATTRIBUTES:
	case KVM_CAP_PTP_KVM:
	case KVM_CAP_ARM_SYSTEM_SUSPEND:
	case KVM_CAP_IRQFD_RESAMPLE:
	case KVM_CAP_COUNTER_OFFSET:
		r = 1;
		break;
	case KVM_CAP_SET_GUEST_DEBUG2:
		return KVM_GUESTDBG_VALID_MASK;
	case KVM_CAP_ARM_SET_DEVICE_ADDR:
		r = 1;
		break;
	case KVM_CAP_NR_VCPUS:
		/*
		 * ARM64 treats KVM_CAP_NR_VCPUS differently from all other
		 * architectures, as it does not always bound it to
		 * KVM_CAP_MAX_VCPUS. It should not matter much because
		 * this is just an advisory value.
		 */
		r = min_t(unsigned int, num_online_cpus(),
			  kvm_arm_default_max_vcpus());
		break;
	case KVM_CAP_MAX_VCPUS:
	case KVM_CAP_MAX_VCPU_ID:
		if (kvm)
			r = kvm->max_vcpus;
		else
			r = kvm_arm_default_max_vcpus();
		break;
	case KVM_CAP_MSI_DEVID:
		if (!kvm)
			r = -EINVAL;
		else
			r = kvm->arch.vgic.msis_require_devid;
		break;
	case KVM_CAP_ARM_USER_IRQ:
		/*
		 * 1: EL1_VTIMER, EL1_PTIMER, and PMU.
		 * (bump this number if adding more devices)
		 */
		r = 1;
		break;
	case KVM_CAP_ARM_MTE:
		r = system_supports_mte();
		break;
	case KVM_CAP_STEAL_TIME:
		r = kvm_arm_pvtime_supported();
		break;
	case KVM_CAP_ARM_EL1_32BIT:
		r = cpus_have_final_cap(ARM64_HAS_32BIT_EL1);
		break;
	case KVM_CAP_GUEST_DEBUG_HW_BPS:
		r = get_num_brps();
		break;
	case KVM_CAP_GUEST_DEBUG_HW_WPS:
		r = get_num_wrps();
		break;
	case KVM_CAP_ARM_PMU_V3:
		r = kvm_arm_support_pmu_v3();
		break;
	case KVM_CAP_ARM_INJECT_SERROR_ESR:
		r = cpus_have_final_cap(ARM64_HAS_RAS_EXTN);
		break;
	case KVM_CAP_ARM_VM_IPA_SIZE:
		r = get_kvm_ipa_limit();
		break;
	case KVM_CAP_ARM_SVE:
		r = system_supports_sve();
		break;
	case KVM_CAP_ARM_PTRAUTH_ADDRESS:
	case KVM_CAP_ARM_PTRAUTH_GENERIC:
		r = kvm_has_full_ptr_auth();
		break;
	case KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE:
		if (kvm)
			r = kvm->arch.mmu.split_page_chunk_size;
		else
			r = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT;
		break;
	case KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES:
		r = kvm_supported_block_sizes();
		break;
	case KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES:
		r = BIT(0);
		break;
	default:
		r = 0;
	}

	return r;
}

long kvm_arch_dev_ioctl(struct file *filp,
			unsigned int ioctl, unsigned long arg)
{
	return -EINVAL;
}

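/*
 * Without VHE the VM structure is shared with the EL2 hypervisor and so must
 * live in the kernel linear map, hence kzalloc(); with VHE there is no such
 * constraint and a vmalloc'd allocation is sufficient.
 */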
struct kvm *kvm_arch_alloc_vm(void)
{
	size_t sz = sizeof(struct kvm);

	if (!has_vhe())
		return kzalloc(sz, GFP_KERNEL_ACCOUNT);

	return __vmalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM | __GFP_ZERO);
}

int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
{
	if (irqchip_in_kernel(kvm) && vgic_initialized(kvm))
		return -EBUSY;

	if (id >= kvm->max_vcpus)
		return -EINVAL;

	return 0;
}

int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
{
	int err;

	spin_lock_init(&vcpu->arch.mp_state_lock);

#ifdef CONFIG_LOCKDEP
	/* Inform lockdep that the config_lock is acquired after vcpu->mutex */
	mutex_lock(&vcpu->mutex);
	mutex_lock(&vcpu->kvm->arch.config_lock);
	mutex_unlock(&vcpu->kvm->arch.config_lock);
	mutex_unlock(&vcpu->mutex);
#endif

	/* Force users to call KVM_ARM_VCPU_INIT */
	vcpu_clear_flag(vcpu, VCPU_INITIALIZED);

	vcpu->arch.mmu_page_cache.gfp_zero = __GFP_ZERO;

	/* Set up the timer */
	kvm_timer_vcpu_init(vcpu);

	kvm_pmu_vcpu_init(vcpu);

	kvm_arm_reset_debug_ptr(vcpu);

	kvm_arm_pvtime_vcpu_init(&vcpu->arch);

	vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu;

	/*
	 * This vCPU may have been created after mpidr_data was initialized.
	 * Throw out the pre-computed mappings if that is the case, forcing
	 * KVM to fall back to iteratively searching the vCPUs.
	 */
	kvm_destroy_mpidr_data(vcpu->kvm);

	err = kvm_vgic_vcpu_init(vcpu);
	if (err)
		return err;

	return kvm_share_hyp(vcpu, vcpu + 1);
}

void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
{
}

void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
	if (vcpu_has_run_once(vcpu) && unlikely(!irqchip_in_kernel(vcpu->kvm)))
		static_branch_dec(&userspace_irqchip_in_use);

	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
	kvm_timer_vcpu_terminate(vcpu);
	kvm_pmu_vcpu_destroy(vcpu);
	kvm_vgic_vcpu_destroy(vcpu);
	kvm_arm_vcpu_destroy(vcpu);
}

void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
{

}

void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
{

}

static void vcpu_set_pauth_traps(struct kvm_vcpu *vcpu)
{
	if (vcpu_has_ptrauth(vcpu) && !is_protected_kvm_enabled()) {
		/*
		 * Either we're running an L2 guest, and the API/APK bits come
		 * from L1's HCR_EL2, or API/APK are both set.
		 */
		if (unlikely(vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu))) {
			u64 val;

			val = __vcpu_sys_reg(vcpu, HCR_EL2);
			val &= (HCR_API | HCR_APK);
			vcpu->arch.hcr_el2 &= ~(HCR_API | HCR_APK);
			vcpu->arch.hcr_el2 |= val;
		} else {
			vcpu->arch.hcr_el2 |= (HCR_API | HCR_APK);
		}

		/*
		 * Save the host keys if there is any chance for the guest
		 * to use pauth, as the entry code will reload the guest
		 * keys in that case.
		 */
		if (vcpu->arch.hcr_el2 & (HCR_API | HCR_APK)) {
			struct kvm_cpu_context *ctxt;

			ctxt = this_cpu_ptr_hyp_sym(kvm_hyp_ctxt);
			ptrauth_save_keys(ctxt);
		}
	}
}

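/*
 * Decide whether to let the guest execute WFI natively, i.e. whether to
 * clear HCR_EL2.TWI. With the default policy this is only done when the
 * vCPU thread is the sole runnable task on this CPU (so sitting in guest
 * WFI cannot starve anyone) and the vCPU receives directly-injected
 * interrupts (GICv4 vLPIs or active-less SGIs), which would otherwise need
 * a doorbell to wake a blocked vCPU. The WFE variant below only requires
 * the single-task condition.
 */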
static bool kvm_vcpu_should_clear_twi(struct kvm_vcpu *vcpu)
{
	if (unlikely(kvm_wfi_trap_policy != KVM_WFX_NOTRAP_SINGLE_TASK))
		return kvm_wfi_trap_policy == KVM_WFX_NOTRAP;

	return single_task_running() &&
	       (atomic_read(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count) ||
		vcpu->kvm->arch.vgic.nassgireq);
}

static bool kvm_vcpu_should_clear_twe(struct kvm_vcpu *vcpu)
{
	if (unlikely(kvm_wfe_trap_policy != KVM_WFX_NOTRAP_SINGLE_TASK))
		return kvm_wfe_trap_policy == KVM_WFX_NOTRAP;

	return single_task_running();
}

void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	struct kvm_s2_mmu *mmu;
	int *last_ran;

	if (vcpu_has_nv(vcpu))
		kvm_vcpu_load_hw_mmu(vcpu);

	mmu = vcpu->arch.hw_mmu;
	last_ran = this_cpu_ptr(mmu->last_vcpu_ran);

	/*
	 * We guarantee that both TLBs and I-cache are private to each
	 * vcpu. If we detect that a different vcpu from the same VM has
	 * previously run on this physical CPU, call into the hypervisor
	 * code to nuke the relevant contexts.
	 *
	 * We might get preempted before the vCPU actually runs, but
	 * over-invalidation doesn't affect correctness.
	 */
	if (*last_ran != vcpu->vcpu_idx) {
		kvm_call_hyp(__kvm_flush_cpu_context, mmu);
		*last_ran = vcpu->vcpu_idx;
	}

	vcpu->cpu = cpu;

	kvm_vgic_load(vcpu);
	kvm_timer_vcpu_load(vcpu);
	if (has_vhe())
		kvm_vcpu_load_vhe(vcpu);
	kvm_arch_vcpu_load_fp(vcpu);
	kvm_vcpu_pmu_restore_guest(vcpu);
	if (kvm_arm_is_pvtime_enabled(&vcpu->arch))
		kvm_make_request(KVM_REQ_RECORD_STEAL, vcpu);

	if (kvm_vcpu_should_clear_twe(vcpu))
		vcpu->arch.hcr_el2 &= ~HCR_TWE;
	else
		vcpu->arch.hcr_el2 |= HCR_TWE;

	if (kvm_vcpu_should_clear_twi(vcpu))
		vcpu->arch.hcr_el2 &= ~HCR_TWI;
	else
		vcpu->arch.hcr_el2 |= HCR_TWI;

	vcpu_set_pauth_traps(vcpu);

	kvm_arch_vcpu_load_debug_state_flags(vcpu);

	if (!cpumask_test_cpu(cpu, vcpu->kvm->arch.supported_cpus))
		vcpu_set_on_unsupported_cpu(vcpu);
}

void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
	kvm_arch_vcpu_put_debug_state_flags(vcpu);
	kvm_arch_vcpu_put_fp(vcpu);
	if (has_vhe())
		kvm_vcpu_put_vhe(vcpu);
	kvm_timer_vcpu_put(vcpu);
	kvm_vgic_put(vcpu);
	kvm_vcpu_pmu_restore_host(vcpu);
	if (vcpu_has_nv(vcpu))
		kvm_vcpu_put_hw_mmu(vcpu);
	kvm_arm_vmid_clear_active();

	vcpu_clear_on_unsupported_cpu(vcpu);
	vcpu->cpu = -1;
}

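/* Must be called with vcpu->arch.mp_state_lock held. */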
static void __kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu)
{
	WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_STOPPED);
	kvm_make_request(KVM_REQ_SLEEP, vcpu);
	kvm_vcpu_kick(vcpu);
}

void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu)
{
	spin_lock(&vcpu->arch.mp_state_lock);
	__kvm_arm_vcpu_power_off(vcpu);
	spin_unlock(&vcpu->arch.mp_state_lock);
}

bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu)
{
	return READ_ONCE(vcpu->arch.mp_state.mp_state) == KVM_MP_STATE_STOPPED;
}

static void kvm_arm_vcpu_suspend(struct kvm_vcpu *vcpu)
{
	WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_SUSPENDED);
	kvm_make_request(KVM_REQ_SUSPEND, vcpu);
	kvm_vcpu_kick(vcpu);
}

static bool kvm_arm_vcpu_suspended(struct kvm_vcpu *vcpu)
{
	return READ_ONCE(vcpu->arch.mp_state.mp_state) == KVM_MP_STATE_SUSPENDED;
}

int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
				    struct kvm_mp_state *mp_state)
{
	*mp_state = READ_ONCE(vcpu->arch.mp_state);

	return 0;
}

int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
				    struct kvm_mp_state *mp_state)
{
	int ret = 0;

	spin_lock(&vcpu->arch.mp_state_lock);

	switch (mp_state->mp_state) {
	case KVM_MP_STATE_RUNNABLE:
		WRITE_ONCE(vcpu->arch.mp_state, *mp_state);
		break;
	case KVM_MP_STATE_STOPPED:
		__kvm_arm_vcpu_power_off(vcpu);
		break;
	case KVM_MP_STATE_SUSPENDED:
		kvm_arm_vcpu_suspend(vcpu);
		break;
	default:
		ret = -EINVAL;
	}

	spin_unlock(&vcpu->arch.mp_state_lock);

	return ret;
}

/**
 * kvm_arch_vcpu_runnable - determine if the vcpu can be scheduled
 * @v:		The VCPU pointer
 *
 * If the guest CPU is not waiting for interrupts or an interrupt line is
 * asserted, the CPU is by definition runnable.
 */
int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
{
	bool irq_lines = *vcpu_hcr(v) & (HCR_VI | HCR_VF);
	return ((irq_lines || kvm_vgic_vcpu_pending_irq(v))
		&& !kvm_arm_vcpu_stopped(v) && !v->arch.pause);
}

bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
{
	return vcpu_mode_priv(vcpu);
}

#ifdef CONFIG_GUEST_PERF_EVENTS
unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu)
{
	return *vcpu_pc(vcpu);
}
#endif

static void kvm_init_mpidr_data(struct kvm *kvm)
{
	struct kvm_mpidr_data *data = NULL;
	unsigned long c, mask, nr_entries;
	u64 aff_set = 0, aff_clr = ~0UL;
	struct kvm_vcpu *vcpu;

	mutex_lock(&kvm->arch.config_lock);

	if (rcu_access_pointer(kvm->arch.mpidr_data) ||
	    atomic_read(&kvm->online_vcpus) == 1)
		goto out;

	kvm_for_each_vcpu(c, vcpu, kvm) {
		u64 aff = kvm_vcpu_get_mpidr_aff(vcpu);
		aff_set |= aff;
		aff_clr &= aff;
	}

	/*
	 * A significant bit can be either 0 or 1, and will only appear in
	 * aff_set. Use aff_clr to weed out the useless stuff.
	 */
	mask = aff_set ^ aff_clr;
	nr_entries = BIT_ULL(hweight_long(mask));

	/*
	 * Don't let userspace fool us. If we need more than a single page
	 * to describe the compressed MPIDR array, just fall back to the
	 * iterative method. Single vcpu VMs do not need this either.
	 */
	if (struct_size(data, cmpidr_to_idx, nr_entries) <= PAGE_SIZE)
		data = kzalloc(struct_size(data, cmpidr_to_idx, nr_entries),
			       GFP_KERNEL_ACCOUNT);

	if (!data)
		goto out;

	data->mpidr_mask = mask;

	kvm_for_each_vcpu(c, vcpu, kvm) {
		u64 aff = kvm_vcpu_get_mpidr_aff(vcpu);
		u16 index = kvm_mpidr_index(data, aff);

		data->cmpidr_to_idx[index] = c;
	}

	rcu_assign_pointer(kvm->arch.mpidr_data, data);
out:
	mutex_unlock(&kvm->arch.config_lock);
}

/*
 * Handle both the initialisation that is being done when the vcpu is
 * run for the first time, as well as the updates that must be
 * performed each time we get a new thread dealing with this vcpu.
 */
int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu)
{
	struct kvm *kvm = vcpu->kvm;
	int ret;

	if (!kvm_vcpu_initialized(vcpu))
		return -ENOEXEC;

	if (!kvm_arm_vcpu_is_finalized(vcpu))
		return -EPERM;

	ret = kvm_arch_vcpu_run_map_fp(vcpu);
	if (ret)
		return ret;

	if (likely(vcpu_has_run_once(vcpu)))
		return 0;

	kvm_init_mpidr_data(kvm);

	kvm_arm_vcpu_init_debug(vcpu);

	if (likely(irqchip_in_kernel(kvm))) {
		/*
		 * Map the VGIC hardware resources before running a vcpu the
		 * first time on this VM.
		 */
		ret = kvm_vgic_map_resources(kvm);
		if (ret)
			return ret;
	}

	ret = kvm_finalize_sys_regs(vcpu);
	if (ret)
		return ret;

	/*
	 * This needs to happen after any restriction has been applied
	 * to the feature set.
	 */
	kvm_calculate_traps(vcpu);

	ret = kvm_timer_enable(vcpu);
	if (ret)
		return ret;

	ret = kvm_arm_pmu_v3_enable(vcpu);
	if (ret)
		return ret;

	if (is_protected_kvm_enabled()) {
		ret = pkvm_create_hyp_vm(kvm);
		if (ret)
			return ret;
	}

	if (!irqchip_in_kernel(kvm)) {
		/*
		 * Tell the rest of the code that there are userspace irqchip
		 * VMs in the wild.
		 */
		static_branch_inc(&userspace_irqchip_in_use);
	}

	/*
	 * Initialize traps for protected VMs.
	 * NOTE: Move to run in EL2 directly, rather than via a hypercall, once
	 * the code is in place for first run initialization at EL2.
	 */
	if (kvm_vm_is_protected(kvm))
		kvm_call_hyp_nvhe(__pkvm_vcpu_init_traps, vcpu);

	mutex_lock(&kvm->arch.config_lock);
	set_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags);
	mutex_unlock(&kvm->arch.config_lock);

	return ret;
}

bool kvm_arch_intc_initialized(struct kvm *kvm)
{
	return vgic_initialized(kvm);
}

void kvm_arm_halt_guest(struct kvm *kvm)
{
	unsigned long i;
	struct kvm_vcpu *vcpu;

	kvm_for_each_vcpu(i, vcpu, kvm)
		vcpu->arch.pause = true;
	kvm_make_all_cpus_request(kvm, KVM_REQ_SLEEP);
}

void kvm_arm_resume_guest(struct kvm *kvm)
{
	unsigned long i;
	struct kvm_vcpu *vcpu;

	kvm_for_each_vcpu(i, vcpu, kvm) {
		vcpu->arch.pause = false;
		__kvm_vcpu_wake_up(vcpu);
	}
}

static void kvm_vcpu_sleep(struct kvm_vcpu *vcpu)
{
	struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);

	rcuwait_wait_event(wait,
			   (!kvm_arm_vcpu_stopped(vcpu)) && (!vcpu->arch.pause),
			   TASK_INTERRUPTIBLE);

	if (kvm_arm_vcpu_stopped(vcpu) || vcpu->arch.pause) {
		/* Awaken to handle a signal, request we sleep again later. */
		kvm_make_request(KVM_REQ_SLEEP, vcpu);
	}

	/*
	 * Make sure we will observe a potential reset request if we've
	 * observed a change to the power state. Pairs with the smp_wmb() in
	 * kvm_psci_vcpu_on().
	 */
	smp_rmb();
}

/**
 * kvm_vcpu_wfi - emulate Wait-For-Interrupt behavior
 * @vcpu:	The VCPU pointer
 *
 * Suspend execution of a vCPU until a valid wake event is detected, i.e. until
 * the vCPU is runnable.  The vCPU may or may not be scheduled out, depending
 * on when a wake event arrives, e.g. there may already be a pending wake event.
 */
void kvm_vcpu_wfi(struct kvm_vcpu *vcpu)
{
	/*
	 * Sync back the state of the GIC CPU interface so that we have
	 * the latest PMR and group enables. This ensures that
	 * kvm_arch_vcpu_runnable has up-to-date data to decide whether
	 * we have pending interrupts, e.g. when determining if the
	 * vCPU should block.
	 *
	 * For the same reason, we want to tell GICv4 that we need
	 * doorbells to be signalled, should an interrupt become pending.
	 */
	preempt_disable();
	vcpu_set_flag(vcpu, IN_WFI);
	kvm_vgic_put(vcpu);
	preempt_enable();

	kvm_vcpu_halt(vcpu);
	vcpu_clear_flag(vcpu, IN_WFIT);

	preempt_disable();
	vcpu_clear_flag(vcpu, IN_WFI);
	kvm_vgic_load(vcpu);
	preempt_enable();
}

static int kvm_vcpu_suspend(struct kvm_vcpu *vcpu)
{
	if (!kvm_arm_vcpu_suspended(vcpu))
		return 1;

	kvm_vcpu_wfi(vcpu);

	/*
	 * The suspend state is sticky; we do not leave it until userspace
	 * explicitly marks the vCPU as runnable. Request that we suspend again
	 * later.
	 */
	kvm_make_request(KVM_REQ_SUSPEND, vcpu);

	/*
	 * Check to make sure the vCPU is actually runnable. If so, exit to
	 * userspace informing it of the wakeup condition.
	 */
	if (kvm_arch_vcpu_runnable(vcpu)) {
		memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event));
		vcpu->run->system_event.type = KVM_SYSTEM_EVENT_WAKEUP;
		vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
		return 0;
	}

	/*
	 * Otherwise, we were unblocked to process a different event, such as a
	 * pending signal. Return 1 and allow kvm_arch_vcpu_ioctl_run() to
	 * process the event.
	 */
	return 1;
}

/**
 * check_vcpu_requests - check and handle pending vCPU requests
 * @vcpu:	the VCPU pointer
 *
 * Return: 1 if we should enter the guest
 *	   0 if we should exit to userspace
 *	   < 0 if we should exit to userspace, where the return value indicates
 *	   an error
 */
static int check_vcpu_requests(struct kvm_vcpu *vcpu)
{
	if (kvm_request_pending(vcpu)) {
		if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu))
			return -EIO;

		if (kvm_check_request(KVM_REQ_SLEEP, vcpu))
			kvm_vcpu_sleep(vcpu);

		if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu))
			kvm_reset_vcpu(vcpu);

		/*
		 * Clear IRQ_PENDING requests that were made to guarantee
		 * that a VCPU sees new virtual interrupts.
		 */
		kvm_check_request(KVM_REQ_IRQ_PENDING, vcpu);

		if (kvm_check_request(KVM_REQ_RECORD_STEAL, vcpu))
			kvm_update_stolen_time(vcpu);

		if (kvm_check_request(KVM_REQ_RELOAD_GICv4, vcpu)) {
			/* The distributor enable bits were changed */
			preempt_disable();
			vgic_v4_put(vcpu);
			vgic_v4_load(vcpu);
			preempt_enable();
		}

		if (kvm_check_request(KVM_REQ_RELOAD_PMU, vcpu))
			kvm_vcpu_reload_pmu(vcpu);

		if (kvm_check_request(KVM_REQ_RESYNC_PMU_EL0, vcpu))
			kvm_vcpu_pmu_restore_guest(vcpu);

		if (kvm_check_request(KVM_REQ_SUSPEND, vcpu))
			return kvm_vcpu_suspend(vcpu);

		if (kvm_dirty_ring_check_request(vcpu))
			return 0;

		check_nested_vcpu_requests(vcpu);
	}

	return 1;
}

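/*
 * Catch a vCPU that is executing in AArch32 when it should not be: either
 * nested virt is enabled (which is 64-bit only), or 32-bit EL0 is not
 * supported by the system.
 */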
static bool vcpu_mode_is_bad_32bit(struct kvm_vcpu *vcpu)
{
	if (likely(!vcpu_mode_is_32bit(vcpu)))
		return false;

	if (vcpu_has_nv(vcpu))
		return true;

	return !kvm_supports_32bit_el0();
}

/**
 * kvm_vcpu_exit_request - returns true if the VCPU should *not* enter the guest
 * @vcpu:	The VCPU pointer
 * @ret:	Pointer to write optional return code
 *
 * Returns: true if the VCPU needs to return to a preemptible + interruptible
 *	    kernel context and skip guest entry.
 *
 * This function disambiguates between two different types of exits: exits to a
 * preemptible + interruptible kernel context and exits to userspace. For an
 * exit to userspace, this function will write the return code to ret and return
 * true. For an exit to preemptible + interruptible kernel context (i.e. check
 * for pending work and re-enter), return true without writing to ret.
 */
static bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu, int *ret)
{
	struct kvm_run *run = vcpu->run;

	/*
	 * If we're using a userspace irqchip, then check if we need
	 * to tell a userspace irqchip about timer or PMU level
	 * changes and if so, exit to userspace (the actual level
	 * state gets updated in kvm_timer_update_run and
	 * kvm_pmu_update_run below).
	 */
	if (static_branch_unlikely(&userspace_irqchip_in_use)) {
		if (kvm_timer_should_notify_user(vcpu) ||
		    kvm_pmu_should_notify_user(vcpu)) {
			*ret = -EINTR;
			run->exit_reason = KVM_EXIT_INTR;
			return true;
		}
	}

	if (unlikely(vcpu_on_unsupported_cpu(vcpu))) {
		run->exit_reason = KVM_EXIT_FAIL_ENTRY;
		run->fail_entry.hardware_entry_failure_reason = KVM_EXIT_FAIL_ENTRY_CPU_UNSUPPORTED;
		run->fail_entry.cpu = smp_processor_id();
		*ret = 0;
		return true;
	}

	return kvm_request_pending(vcpu) ||
			xfer_to_guest_mode_work_pending();
}

/*
 * Actually run the vCPU, entering an RCU extended quiescent state (EQS) while
 * the vCPU is running.
 *
 * This must be noinstr as instrumentation may make use of RCU, and this is not
 * safe during the EQS.
 */
static int noinstr kvm_arm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
{
	int ret;

	guest_state_enter_irqoff();
	ret = kvm_call_hyp_ret(__kvm_vcpu_run, vcpu);
	guest_state_exit_irqoff();

	return ret;
}

/**
 * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code
 * @vcpu:	The VCPU pointer
 *
 * This function is called through the VCPU_RUN ioctl called from user space. It
 * will execute VM code in a loop until the time slice for the process is used
 * or some emulation is needed from user space in which case the function will
 * return with return value 0 and with the kvm_run structure filled in with the
 * required data for the requested emulation.
 */
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
{
	struct kvm_run *run = vcpu->run;
	int ret;

	if (run->exit_reason == KVM_EXIT_MMIO) {
		ret = kvm_handle_mmio_return(vcpu);
		if (ret <= 0)
			return ret;
	}

	vcpu_load(vcpu);

	if (!vcpu->wants_to_run) {
		ret = -EINTR;
		goto out;
	}

	kvm_sigset_activate(vcpu);

	ret = 1;
	run->exit_reason = KVM_EXIT_UNKNOWN;
	run->flags = 0;
	while (ret > 0) {
		/*
		 * Check conditions before entering the guest
		 */
		ret = xfer_to_guest_mode_handle_work(vcpu);
		if (!ret)
			ret = 1;

		if (ret > 0)
			ret = check_vcpu_requests(vcpu);

		/*
		 * Preparing the interrupts to be injected also
		 * involves poking the GIC, which must be done in a
		 * non-preemptible context.
		 */
		preempt_disable();

		/*
		 * The VMID allocator only tracks active VMIDs per
		 * physical CPU, and therefore the VMID allocated may not be
		 * preserved on VMID roll-over if the task was preempted,
		 * making a thread's VMID inactive. So we need to call
		 * kvm_arm_vmid_update() in non-preemptible context.
		 */
		if (kvm_arm_vmid_update(&vcpu->arch.hw_mmu->vmid) &&
		    has_vhe())
			__load_stage2(vcpu->arch.hw_mmu,
				      vcpu->arch.hw_mmu->arch);

		kvm_pmu_flush_hwstate(vcpu);

		local_irq_disable();

		kvm_vgic_flush_hwstate(vcpu);

		kvm_pmu_update_vcpu_events(vcpu);

		/*
		 * Ensure we set mode to IN_GUEST_MODE after we disable
		 * interrupts and before the final VCPU requests check.
		 * See the comment in kvm_vcpu_exiting_guest_mode() and
		 * Documentation/virt/kvm/vcpu-requests.rst
		 */
		smp_store_mb(vcpu->mode, IN_GUEST_MODE);

		if (ret <= 0 || kvm_vcpu_exit_request(vcpu, &ret)) {
			vcpu->mode = OUTSIDE_GUEST_MODE;
			isb(); /* Ensure work in x_flush_hwstate is committed */
			kvm_pmu_sync_hwstate(vcpu);
			if (static_branch_unlikely(&userspace_irqchip_in_use))
				kvm_timer_sync_user(vcpu);
			kvm_vgic_sync_hwstate(vcpu);
			local_irq_enable();
			preempt_enable();
			continue;
		}

		kvm_arm_setup_debug(vcpu);
		kvm_arch_vcpu_ctxflush_fp(vcpu);

		/**************************************************************
		 * Enter the guest
		 */
		trace_kvm_entry(*vcpu_pc(vcpu));
		guest_timing_enter_irqoff();

		ret = kvm_arm_vcpu_enter_exit(vcpu);

		vcpu->mode = OUTSIDE_GUEST_MODE;
		vcpu->stat.exits++;
		/*
		 * Back from guest
		 *************************************************************/

		kvm_arm_clear_debug(vcpu);

		/*
		 * We must sync the PMU state before the vgic state so
		 * that the vgic can properly sample the updated state of the
		 * interrupt line.
		 */
		kvm_pmu_sync_hwstate(vcpu);

		/*
		 * Sync the vgic state before syncing the timer state because
		 * the timer code needs to know if the virtual timer
		 * interrupts are active.
		 */
		kvm_vgic_sync_hwstate(vcpu);

		/*
		 * Sync the timer hardware state before enabling interrupts as
		 * we don't want vtimer interrupts to race with syncing the
		 * timer virtual interrupt state.
		 */
		if (static_branch_unlikely(&userspace_irqchip_in_use))
			kvm_timer_sync_user(vcpu);

		kvm_arch_vcpu_ctxsync_fp(vcpu);

		/*
		 * We must ensure that any pending interrupts are taken before
		 * we exit guest timing so that timer ticks are accounted as
		 * guest time. Transiently unmask interrupts so that any
		 * pending interrupts are taken.
		 *
		 * Per ARM DDI 0487G.b section D1.13.4, an ISB (or other
		 * context synchronization event) is necessary to ensure that
		 * pending interrupts are taken.
		 */
		if (ARM_EXCEPTION_CODE(ret) == ARM_EXCEPTION_IRQ) {
			local_irq_enable();
			isb();
			local_irq_disable();
		}

		guest_timing_exit_irqoff();

		local_irq_enable();

		trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));

		/* Exit types that need handling before we can be preempted */
		handle_exit_early(vcpu, ret);

		preempt_enable();

		/*
		 * The ARMv8 architecture doesn't give the hypervisor
		 * a mechanism to prevent a guest from dropping to AArch32 EL0
		 * if implemented by the CPU. If we spot the guest in such a
		 * state and decide it wasn't supposed to do so (as with the
		 * asymmetric AArch32 case), return to userspace with
		 * a fatal error.
		 */
		if (vcpu_mode_is_bad_32bit(vcpu)) {
			/*
			 * As we have caught the guest red-handed, decide that
			 * it isn't fit for purpose anymore by making the vcpu
			 * invalid. The VMM can try and fix it by issuing a
			 * KVM_ARM_VCPU_INIT if it really wants to.
			 */
			vcpu_clear_flag(vcpu, VCPU_INITIALIZED);
			ret = ARM_EXCEPTION_IL;
		}

		ret = handle_exit(vcpu, ret);
	}

	/* Tell userspace about in-kernel device output levels */
	if (unlikely(!irqchip_in_kernel(vcpu->kvm))) {
		kvm_timer_update_run(vcpu);
		kvm_pmu_update_run(vcpu);
	}

	kvm_sigset_deactivate(vcpu);

out:
	/*
	 * In the unlikely event that we are returning to userspace
	 * with pending exceptions or PC adjustment, commit these
	 * adjustments in order to give userspace a consistent view of
	 * the vcpu state. Note that this relies on __kvm_adjust_pc()
	 * being preempt-safe on VHE.
	 */
	if (unlikely(vcpu_get_flag(vcpu, PENDING_EXCEPTION) ||
		     vcpu_get_flag(vcpu, INCREMENT_PC)))
		kvm_call_hyp(__kvm_adjust_pc, vcpu);

	vcpu_put(vcpu);
	return ret;
}

static int vcpu_interrupt_line(struct kvm_vcpu *vcpu, int number, bool level)
{
	int bit_index;
	bool set;
	unsigned long *hcr;

	if (number == KVM_ARM_IRQ_CPU_IRQ)
		bit_index = __ffs(HCR_VI);
	else /* KVM_ARM_IRQ_CPU_FIQ */
		bit_index = __ffs(HCR_VF);

	hcr = vcpu_hcr(vcpu);
	if (level)
		set = test_and_set_bit(bit_index, hcr);
	else
		set = test_and_clear_bit(bit_index, hcr);

	/*
	 * If we didn't change anything, no need to wake up or kick other CPUs
	 */
	if (set == level)
		return 0;

	/*
	 * The vcpu irq_lines field was updated, wake up sleeping VCPUs and
	 * trigger a world-switch round on the running physical CPU to set the
	 * virtual IRQ/FIQ fields in the HCR appropriately.
	 */
	kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
	kvm_vcpu_kick(vcpu);

	return 0;
}

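/*
 * Decode the userspace encoding of an IRQ line: the type (CPU/PPI/SPI),
 * target vCPU and interrupt number are packed into irq_level->irq. The vCPU
 * index is split across two fields, with the VCPU2 field (see
 * KVM_CAP_ARM_IRQ_LINE_LAYOUT_2) extending the addressable range beyond the
 * 256 vCPUs covered by the original layout.
 */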
kvm_vm_ioctl_irq_line(struct kvm * kvm,struct kvm_irq_level * irq_level,bool line_status)1360  int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
1361  			  bool line_status)
1362  {
1363  	u32 irq = irq_level->irq;
1364  	unsigned int irq_type, vcpu_id, irq_num;
1365  	struct kvm_vcpu *vcpu = NULL;
1366  	bool level = irq_level->level;
1367  
1368  	irq_type = (irq >> KVM_ARM_IRQ_TYPE_SHIFT) & KVM_ARM_IRQ_TYPE_MASK;
1369  	vcpu_id = (irq >> KVM_ARM_IRQ_VCPU_SHIFT) & KVM_ARM_IRQ_VCPU_MASK;
1370  	vcpu_id += ((irq >> KVM_ARM_IRQ_VCPU2_SHIFT) & KVM_ARM_IRQ_VCPU2_MASK) * (KVM_ARM_IRQ_VCPU_MASK + 1);
1371  	irq_num = (irq >> KVM_ARM_IRQ_NUM_SHIFT) & KVM_ARM_IRQ_NUM_MASK;
1372  
1373  	trace_kvm_irq_line(irq_type, vcpu_id, irq_num, irq_level->level);
1374  
1375  	switch (irq_type) {
1376  	case KVM_ARM_IRQ_TYPE_CPU:
1377  		if (irqchip_in_kernel(kvm))
1378  			return -ENXIO;
1379  
1380  		vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
1381  		if (!vcpu)
1382  			return -EINVAL;
1383  
1384  		if (irq_num > KVM_ARM_IRQ_CPU_FIQ)
1385  			return -EINVAL;
1386  
1387  		return vcpu_interrupt_line(vcpu, irq_num, level);
1388  	case KVM_ARM_IRQ_TYPE_PPI:
1389  		if (!irqchip_in_kernel(kvm))
1390  			return -ENXIO;
1391  
1392  		vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
1393  		if (!vcpu)
1394  			return -EINVAL;
1395  
1396  		if (irq_num < VGIC_NR_SGIS || irq_num >= VGIC_NR_PRIVATE_IRQS)
1397  			return -EINVAL;
1398  
1399  		return kvm_vgic_inject_irq(kvm, vcpu, irq_num, level, NULL);
1400  	case KVM_ARM_IRQ_TYPE_SPI:
1401  		if (!irqchip_in_kernel(kvm))
1402  			return -ENXIO;
1403  
1404  		if (irq_num < VGIC_NR_PRIVATE_IRQS)
1405  			return -EINVAL;
1406  
1407  		return kvm_vgic_inject_irq(kvm, NULL, irq_num, level, NULL);
1408  	}
1409  
1410  	return -EINVAL;
1411  }
1412  
system_supported_vcpu_features(void)1413  static unsigned long system_supported_vcpu_features(void)
1414  {
1415  	unsigned long features = KVM_VCPU_VALID_FEATURES;
1416  
1417  	if (!cpus_have_final_cap(ARM64_HAS_32BIT_EL1))
1418  		clear_bit(KVM_ARM_VCPU_EL1_32BIT, &features);
1419  
1420  	if (!kvm_arm_support_pmu_v3())
1421  		clear_bit(KVM_ARM_VCPU_PMU_V3, &features);
1422  
1423  	if (!system_supports_sve())
1424  		clear_bit(KVM_ARM_VCPU_SVE, &features);
1425  
1426  	if (!kvm_has_full_ptr_auth()) {
1427  		clear_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, &features);
1428  		clear_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, &features);
1429  	}
1430  
1431  	if (!cpus_have_final_cap(ARM64_HAS_NESTED_VIRT))
1432  		clear_bit(KVM_ARM_VCPU_HAS_EL2, &features);
1433  
1434  	return features;
1435  }
1436  
kvm_vcpu_init_check_features(struct kvm_vcpu * vcpu,const struct kvm_vcpu_init * init)1437  static int kvm_vcpu_init_check_features(struct kvm_vcpu *vcpu,
1438  					const struct kvm_vcpu_init *init)
1439  {
1440  	unsigned long features = init->features[0];
1441  	int i;
1442  
1443  	if (features & ~KVM_VCPU_VALID_FEATURES)
1444  		return -ENOENT;
1445  
1446  	for (i = 1; i < ARRAY_SIZE(init->features); i++) {
1447  		if (init->features[i])
1448  			return -ENOENT;
1449  	}
1450  
1451  	if (features & ~system_supported_vcpu_features())
1452  		return -EINVAL;
1453  
1454  	/*
1455  	 * For now make sure that both address/generic pointer authentication
1456  	 * features are requested by the userspace together.
1457  	 */
1458  	if (test_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, &features) !=
1459  	    test_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, &features))
1460  		return -EINVAL;
1461  
1462  	if (!test_bit(KVM_ARM_VCPU_EL1_32BIT, &features))
1463  		return 0;
1464  
1465  	/* MTE is incompatible with AArch32 */
1466  	if (kvm_has_mte(vcpu->kvm))
1467  		return -EINVAL;
1468  
1469  	/* NV is incompatible with AArch32 */
1470  	if (test_bit(KVM_ARM_VCPU_HAS_EL2, &features))
1471  		return -EINVAL;
1472  
1473  	return 0;
1474  }
1475  
kvm_vcpu_init_changed(struct kvm_vcpu * vcpu,const struct kvm_vcpu_init * init)1476  static bool kvm_vcpu_init_changed(struct kvm_vcpu *vcpu,
1477  				  const struct kvm_vcpu_init *init)
1478  {
1479  	unsigned long features = init->features[0];
1480  
1481  	return !bitmap_equal(vcpu->kvm->arch.vcpu_features, &features,
1482  			     KVM_VCPU_MAX_FEATURES);
1483  }
1484  
kvm_setup_vcpu(struct kvm_vcpu * vcpu)1485  static int kvm_setup_vcpu(struct kvm_vcpu *vcpu)
1486  {
1487  	struct kvm *kvm = vcpu->kvm;
1488  	int ret = 0;
1489  
1490  	/*
1491  	 * When the vCPU has a PMU, but no PMU is set for the guest
1492  	 * yet, set the default one.
1493  	 */
1494  	if (kvm_vcpu_has_pmu(vcpu) && !kvm->arch.arm_pmu)
1495  		ret = kvm_arm_set_default_pmu(kvm);
1496  
1497  	/* Prepare for nested if required */
1498  	if (!ret && vcpu_has_nv(vcpu))
1499  		ret = kvm_vcpu_init_nested(vcpu);
1500  
1501  	return ret;
1502  }
1503  
__kvm_vcpu_set_target(struct kvm_vcpu * vcpu,const struct kvm_vcpu_init * init)1504  static int __kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
1505  				 const struct kvm_vcpu_init *init)
1506  {
1507  	unsigned long features = init->features[0];
1508  	struct kvm *kvm = vcpu->kvm;
1509  	int ret = -EINVAL;
1510  
1511  	mutex_lock(&kvm->arch.config_lock);
1512  
1513  	if (test_bit(KVM_ARCH_FLAG_VCPU_FEATURES_CONFIGURED, &kvm->arch.flags) &&
1514  	    kvm_vcpu_init_changed(vcpu, init))
1515  		goto out_unlock;
1516  
1517  	bitmap_copy(kvm->arch.vcpu_features, &features, KVM_VCPU_MAX_FEATURES);
1518  
1519  	ret = kvm_setup_vcpu(vcpu);
1520  	if (ret)
1521  		goto out_unlock;
1522  
1523  	/* Now we know what it is, we can reset it. */
1524  	kvm_reset_vcpu(vcpu);
1525  
1526  	set_bit(KVM_ARCH_FLAG_VCPU_FEATURES_CONFIGURED, &kvm->arch.flags);
1527  	vcpu_set_flag(vcpu, VCPU_INITIALIZED);
1528  	ret = 0;
1529  out_unlock:
1530  	mutex_unlock(&kvm->arch.config_lock);
1531  	return ret;
1532  }
1533  
kvm_vcpu_set_target(struct kvm_vcpu * vcpu,const struct kvm_vcpu_init * init)1534  static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
1535  			       const struct kvm_vcpu_init *init)
1536  {
1537  	int ret;
1538  
1539  	if (init->target != KVM_ARM_TARGET_GENERIC_V8 &&
1540  	    init->target != kvm_target_cpu())
1541  		return -EINVAL;
1542  
1543  	ret = kvm_vcpu_init_check_features(vcpu, init);
1544  	if (ret)
1545  		return ret;
1546  
1547  	if (!kvm_vcpu_initialized(vcpu))
1548  		return __kvm_vcpu_set_target(vcpu, init);
1549  
1550  	if (kvm_vcpu_init_changed(vcpu, init))
1551  		return -EINVAL;
1552  
1553  	kvm_reset_vcpu(vcpu);
1554  	return 0;
1555  }
1556  
kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu * vcpu,struct kvm_vcpu_init * init)1557  static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
1558  					 struct kvm_vcpu_init *init)
1559  {
1560  	bool power_off = false;
1561  	int ret;
1562  
1563  	/*
1564  	 * Treat the power-off vCPU feature as ephemeral. Clear the bit to avoid
1565  	 * reflecting it in the finalized feature set, thus limiting its scope
1566  	 * to a single KVM_ARM_VCPU_INIT call.
1567  	 */
1568  	if (init->features[0] & BIT(KVM_ARM_VCPU_POWER_OFF)) {
1569  		init->features[0] &= ~BIT(KVM_ARM_VCPU_POWER_OFF);
1570  		power_off = true;
1571  	}
1572  
1573  	ret = kvm_vcpu_set_target(vcpu, init);
1574  	if (ret)
1575  		return ret;
1576  
1577  	/*
1578  	 * Ensure a rebooted VM will fault in RAM pages and detect if the
1579  	 * guest MMU is turned off and flush the caches as needed.
1580  	 *
1581  	 * S2FWB enforces all memory accesses to RAM being cacheable,
1582  	 * ensuring that the data side is always coherent. We still
1583  	 * need to invalidate the I-cache though, as FWB does *not*
1584  	 * imply CTR_EL0.DIC.
1585  	 */
1586  	if (vcpu_has_run_once(vcpu)) {
1587  		if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
1588  			stage2_unmap_vm(vcpu->kvm);
1589  		else
1590  			icache_inval_all_pou();
1591  	}
1592  
1593  	vcpu_reset_hcr(vcpu);
1594  	vcpu->arch.cptr_el2 = kvm_get_reset_cptr_el2(vcpu);
1595  
1596  	/*
1597  	 * Handle the "start in power-off" case.
1598  	 */
1599  	spin_lock(&vcpu->arch.mp_state_lock);
1600  
1601  	if (power_off)
1602  		__kvm_arm_vcpu_power_off(vcpu);
1603  	else
1604  		WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_RUNNABLE);
1605  
1606  	spin_unlock(&vcpu->arch.mp_state_lock);
1607  
1608  	return 0;
1609  }
1610  
kvm_arm_vcpu_set_attr(struct kvm_vcpu * vcpu,struct kvm_device_attr * attr)1611  static int kvm_arm_vcpu_set_attr(struct kvm_vcpu *vcpu,
1612  				 struct kvm_device_attr *attr)
1613  {
1614  	int ret = -ENXIO;
1615  
1616  	switch (attr->group) {
1617  	default:
1618  		ret = kvm_arm_vcpu_arch_set_attr(vcpu, attr);
1619  		break;
1620  	}
1621  
1622  	return ret;
1623  }
1624  
kvm_arm_vcpu_get_attr(struct kvm_vcpu * vcpu,struct kvm_device_attr * attr)1625  static int kvm_arm_vcpu_get_attr(struct kvm_vcpu *vcpu,
1626  				 struct kvm_device_attr *attr)
1627  {
1628  	int ret = -ENXIO;
1629  
1630  	switch (attr->group) {
1631  	default:
1632  		ret = kvm_arm_vcpu_arch_get_attr(vcpu, attr);
1633  		break;
1634  	}
1635  
1636  	return ret;
1637  }
1638  
kvm_arm_vcpu_has_attr(struct kvm_vcpu * vcpu,struct kvm_device_attr * attr)1639  static int kvm_arm_vcpu_has_attr(struct kvm_vcpu *vcpu,
1640  				 struct kvm_device_attr *attr)
1641  {
1642  	int ret = -ENXIO;
1643  
1644  	switch (attr->group) {
1645  	default:
1646  		ret = kvm_arm_vcpu_arch_has_attr(vcpu, attr);
1647  		break;
1648  	}
1649  
1650  	return ret;
1651  }
1652  
kvm_arm_vcpu_get_events(struct kvm_vcpu * vcpu,struct kvm_vcpu_events * events)1653  static int kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu,
1654  				   struct kvm_vcpu_events *events)
1655  {
1656  	memset(events, 0, sizeof(*events));
1657  
1658  	return __kvm_arm_vcpu_get_events(vcpu, events);
1659  }
1660  
kvm_arm_vcpu_set_events(struct kvm_vcpu * vcpu,struct kvm_vcpu_events * events)1661  static int kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
1662  				   struct kvm_vcpu_events *events)
1663  {
1664  	int i;
1665  
1666  	/* check whether the reserved field is zero */
1667  	for (i = 0; i < ARRAY_SIZE(events->reserved); i++)
1668  		if (events->reserved[i])
1669  			return -EINVAL;
1670  
1671  	/* check whether the pad field is zero */
1672  	for (i = 0; i < ARRAY_SIZE(events->exception.pad); i++)
1673  		if (events->exception.pad[i])
1674  			return -EINVAL;
1675  
1676  	return __kvm_arm_vcpu_set_events(vcpu, events);
1677  }
1678  
kvm_arch_vcpu_ioctl(struct file * filp,unsigned int ioctl,unsigned long arg)1679  long kvm_arch_vcpu_ioctl(struct file *filp,
1680  			 unsigned int ioctl, unsigned long arg)
1681  {
1682  	struct kvm_vcpu *vcpu = filp->private_data;
1683  	void __user *argp = (void __user *)arg;
1684  	struct kvm_device_attr attr;
1685  	long r;
1686  
1687  	switch (ioctl) {
1688  	case KVM_ARM_VCPU_INIT: {
1689  		struct kvm_vcpu_init init;
1690  
1691  		r = -EFAULT;
1692  		if (copy_from_user(&init, argp, sizeof(init)))
1693  			break;
1694  
1695  		r = kvm_arch_vcpu_ioctl_vcpu_init(vcpu, &init);
1696  		break;
1697  	}
1698  	case KVM_SET_ONE_REG:
1699  	case KVM_GET_ONE_REG: {
1700  		struct kvm_one_reg reg;
1701  
1702  		r = -ENOEXEC;
1703  		if (unlikely(!kvm_vcpu_initialized(vcpu)))
1704  			break;
1705  
1706  		r = -EFAULT;
1707  		if (copy_from_user(&reg, argp, sizeof(reg)))
1708  			break;
1709  
1710  		/*
1711  		 * We could owe a reset due to PSCI. Handle the pending reset
1712  		 * here to ensure userspace register accesses are ordered after
1713  		 * the reset.
1714  		 */
1715  		if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu))
1716  			kvm_reset_vcpu(vcpu);
1717  
1718  		if (ioctl == KVM_SET_ONE_REG)
1719  			r = kvm_arm_set_reg(vcpu, &reg);
1720  		else
1721  			r = kvm_arm_get_reg(vcpu, &reg);
1722  		break;
1723  	}
1724  	case KVM_GET_REG_LIST: {
1725  		struct kvm_reg_list __user *user_list = argp;
1726  		struct kvm_reg_list reg_list;
1727  		unsigned n;
1728  
1729  		r = -ENOEXEC;
1730  		if (unlikely(!kvm_vcpu_initialized(vcpu)))
1731  			break;
1732  
1733  		r = -EPERM;
1734  		if (!kvm_arm_vcpu_is_finalized(vcpu))
1735  			break;
1736  
1737  		r = -EFAULT;
1738  		if (copy_from_user(&reg_list, user_list, sizeof(reg_list)))
1739  			break;
1740  		n = reg_list.n;
1741  		reg_list.n = kvm_arm_num_regs(vcpu);
1742  		if (copy_to_user(user_list, &reg_list, sizeof(reg_list)))
1743  			break;
1744  		r = -E2BIG;
1745  		if (n < reg_list.n)
1746  			break;
1747  		r = kvm_arm_copy_reg_indices(vcpu, user_list->reg);
1748  		break;
1749  	}
1750  	case KVM_SET_DEVICE_ATTR: {
1751  		r = -EFAULT;
1752  		if (copy_from_user(&attr, argp, sizeof(attr)))
1753  			break;
1754  		r = kvm_arm_vcpu_set_attr(vcpu, &attr);
1755  		break;
1756  	}
1757  	case KVM_GET_DEVICE_ATTR: {
1758  		r = -EFAULT;
1759  		if (copy_from_user(&attr, argp, sizeof(attr)))
1760  			break;
1761  		r = kvm_arm_vcpu_get_attr(vcpu, &attr);
1762  		break;
1763  	}
1764  	case KVM_HAS_DEVICE_ATTR: {
1765  		r = -EFAULT;
1766  		if (copy_from_user(&attr, argp, sizeof(attr)))
1767  			break;
1768  		r = kvm_arm_vcpu_has_attr(vcpu, &attr);
1769  		break;
1770  	}
1771  	case KVM_GET_VCPU_EVENTS: {
1772  		struct kvm_vcpu_events events;
1773  
1774  		if (kvm_arm_vcpu_get_events(vcpu, &events))
1775  			return -EINVAL;
1776  
1777  		if (copy_to_user(argp, &events, sizeof(events)))
1778  			return -EFAULT;
1779  
1780  		return 0;
1781  	}
1782  	case KVM_SET_VCPU_EVENTS: {
1783  		struct kvm_vcpu_events events;
1784  
1785  		if (copy_from_user(&events, argp, sizeof(events)))
1786  			return -EFAULT;
1787  
1788  		return kvm_arm_vcpu_set_events(vcpu, &events);
1789  	}
1790  	case KVM_ARM_VCPU_FINALIZE: {
1791  		int what;
1792  
1793  		if (!kvm_vcpu_initialized(vcpu))
1794  			return -ENOEXEC;
1795  
1796  		if (get_user(what, (const int __user *)argp))
1797  			return -EFAULT;
1798  
1799  		return kvm_arm_vcpu_finalize(vcpu, what);
1800  	}
1801  	default:
1802  		r = -EINVAL;
1803  	}
1804  
1805  	return r;
1806  }
1807  
1808  void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
1809  {
1810  
1811  }
1812  
1813  static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
1814  					struct kvm_arm_device_addr *dev_addr)
1815  {
1816  	switch (FIELD_GET(KVM_ARM_DEVICE_ID_MASK, dev_addr->id)) {
1817  	case KVM_ARM_DEVICE_VGIC_V2:
1818  		if (!vgic_present)
1819  			return -ENXIO;
1820  		return kvm_set_legacy_vgic_v2_addr(kvm, dev_addr);
1821  	default:
1822  		return -ENODEV;
1823  	}
1824  }
1825  
1826  static int kvm_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
1827  {
1828  	switch (attr->group) {
1829  	case KVM_ARM_VM_SMCCC_CTRL:
1830  		return kvm_vm_smccc_has_attr(kvm, attr);
1831  	default:
1832  		return -ENXIO;
1833  	}
1834  }
1835  
1836  static int kvm_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr)
1837  {
1838  	switch (attr->group) {
1839  	case KVM_ARM_VM_SMCCC_CTRL:
1840  		return kvm_vm_smccc_set_attr(kvm, attr);
1841  	default:
1842  		return -ENXIO;
1843  	}
1844  }
1845  
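/*
 * Dispatcher for VM-scoped ioctls. Each command copies a small argument
 * structure from userspace and hands it to the relevant helper; unknown
 * commands fall through to -EINVAL.
 */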
1846  int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
1847  {
1848  	struct kvm *kvm = filp->private_data;
1849  	void __user *argp = (void __user *)arg;
1850  	struct kvm_device_attr attr;
1851  
1852  	switch (ioctl) {
1853  	case KVM_CREATE_IRQCHIP: {
1854  		int ret;
1855  		if (!vgic_present)
1856  			return -ENXIO;
1857  		mutex_lock(&kvm->lock);
1858  		ret = kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
1859  		mutex_unlock(&kvm->lock);
1860  		return ret;
1861  	}
1862  	case KVM_ARM_SET_DEVICE_ADDR: {
1863  		struct kvm_arm_device_addr dev_addr;
1864  
1865  		if (copy_from_user(&dev_addr, argp, sizeof(dev_addr)))
1866  			return -EFAULT;
1867  		return kvm_vm_ioctl_set_device_addr(kvm, &dev_addr);
1868  	}
1869  	case KVM_ARM_PREFERRED_TARGET: {
1870  		struct kvm_vcpu_init init = {
1871  			.target = KVM_ARM_TARGET_GENERIC_V8,
1872  		};
1873  
1874  		if (copy_to_user(argp, &init, sizeof(init)))
1875  			return -EFAULT;
1876  
1877  		return 0;
1878  	}
1879  	case KVM_ARM_MTE_COPY_TAGS: {
1880  		struct kvm_arm_copy_mte_tags copy_tags;
1881  
1882  		if (copy_from_user(&copy_tags, argp, sizeof(copy_tags)))
1883  			return -EFAULT;
1884  		return kvm_vm_ioctl_mte_copy_tags(kvm, &copy_tags);
1885  	}
1886  	case KVM_ARM_SET_COUNTER_OFFSET: {
1887  		struct kvm_arm_counter_offset offset;
1888  
1889  		if (copy_from_user(&offset, argp, sizeof(offset)))
1890  			return -EFAULT;
1891  		return kvm_vm_ioctl_set_counter_offset(kvm, &offset);
1892  	}
1893  	case KVM_HAS_DEVICE_ATTR: {
1894  		if (copy_from_user(&attr, argp, sizeof(attr)))
1895  			return -EFAULT;
1896  
1897  		return kvm_vm_has_attr(kvm, &attr);
1898  	}
1899  	case KVM_SET_DEVICE_ATTR: {
1900  		if (copy_from_user(&attr, argp, sizeof(attr)))
1901  			return -EFAULT;
1902  
1903  		return kvm_vm_set_attr(kvm, &attr);
1904  	}
1905  	case KVM_ARM_GET_REG_WRITABLE_MASKS: {
1906  		struct reg_mask_range range;
1907  
1908  		if (copy_from_user(&range, argp, sizeof(range)))
1909  			return -EFAULT;
1910  		return kvm_vm_ioctl_get_reg_writable_masks(kvm, &range);
1911  	}
1912  	default:
1913  		return -EINVAL;
1914  	}
1915  }
1916  
1917  /* unlocks vcpus from @vcpu_lock_idx and smaller */
1918  static void unlock_vcpus(struct kvm *kvm, int vcpu_lock_idx)
1919  {
1920  	struct kvm_vcpu *tmp_vcpu;
1921  
1922  	for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) {
1923  		tmp_vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx);
1924  		mutex_unlock(&tmp_vcpu->mutex);
1925  	}
1926  }
1927  
1928  void unlock_all_vcpus(struct kvm *kvm)
1929  {
1930  	lockdep_assert_held(&kvm->lock);
1931  
1932  	unlock_vcpus(kvm, atomic_read(&kvm->online_vcpus) - 1);
1933  }
1934  
1935  /* Returns true if all vcpus were locked, false otherwise */
1936  bool lock_all_vcpus(struct kvm *kvm)
1937  {
1938  	struct kvm_vcpu *tmp_vcpu;
1939  	unsigned long c;
1940  
1941  	lockdep_assert_held(&kvm->lock);
1942  
1943  	/*
1944  	 * Any time a vcpu is in an ioctl (including running), the
1945  	 * core KVM code tries to grab the vcpu->mutex.
1946  	 *
1947  	 * By grabbing the vcpu->mutex of all VCPUs we ensure that no
1948  	 * other VCPUs can fiddle with the state while we access it.
1949  	 */
1950  	kvm_for_each_vcpu(c, tmp_vcpu, kvm) {
1951  		if (!mutex_trylock(&tmp_vcpu->mutex)) {
1952  			unlock_vcpus(kvm, c - 1);
1953  			return false;
1954  		}
1955  	}
1956  
1957  	return true;
1958  }
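
/*
 * Rough usage sketch for the helpers above (callers must already hold
 * kvm->lock, as the lockdep assertions require):
 *
 *	mutex_lock(&kvm->lock);
 *	if (lock_all_vcpus(kvm)) {
 *		... mutate VM-wide state the vCPUs must not observe ...
 *		unlock_all_vcpus(kvm);
 *	}
 *	mutex_unlock(&kvm->lock);
 */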
1959  
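/*
 * Size of one copy of the nVHE hypervisor's per-CPU data, as delimited by
 * the __per_cpu_start/__per_cpu_end symbols of the EL2 object. One copy is
 * allocated and populated per possible CPU in init_hyp_mode().
 */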
1960  static unsigned long nvhe_percpu_size(void)
1961  {
1962  	return (unsigned long)CHOOSE_NVHE_SYM(__per_cpu_end) -
1963  		(unsigned long)CHOOSE_NVHE_SYM(__per_cpu_start);
1964  }
1965  
1966  static unsigned long nvhe_percpu_order(void)
1967  {
1968  	unsigned long size = nvhe_percpu_size();
1969  
1970  	return size ? get_order(size) : 0;
1971  }
1972  
1973  static size_t pkvm_host_sve_state_order(void)
1974  {
1975  	return get_order(pkvm_host_sve_state_size());
1976  }
1977  
1978  /* A lookup table holding the hypervisor VA for each vector slot */
1979  static void *hyp_spectre_vector_selector[BP_HARDEN_EL2_SLOTS];
1980  
1981  static void kvm_init_vector_slot(void *base, enum arm64_hyp_spectre_vector slot)
1982  {
1983  	hyp_spectre_vector_selector[slot] = __kvm_vector_slot2addr(base, slot);
1984  }
1985  
1986  static int kvm_init_vector_slots(void)
1987  {
1988  	int err;
1989  	void *base;
1990  
1991  	base = kern_hyp_va(kvm_ksym_ref(__kvm_hyp_vector));
1992  	kvm_init_vector_slot(base, HYP_VECTOR_DIRECT);
1993  
1994  	base = kern_hyp_va(kvm_ksym_ref(__bp_harden_hyp_vecs));
1995  	kvm_init_vector_slot(base, HYP_VECTOR_SPECTRE_DIRECT);
1996  
1997  	if (kvm_system_needs_idmapped_vectors() &&
1998  	    !is_protected_kvm_enabled()) {
1999  		err = create_hyp_exec_mappings(__pa_symbol(__bp_harden_hyp_vecs),
2000  					       __BP_HARDEN_HYP_VECS_SZ, &base);
2001  		if (err)
2002  			return err;
2003  	}
2004  
2005  	kvm_init_vector_slot(base, HYP_VECTOR_INDIRECT);
2006  	kvm_init_vector_slot(base, HYP_VECTOR_SPECTRE_INDIRECT);
2007  	return 0;
2008  }
2009  
2010  static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits)
2011  {
2012  	struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu);
2013  	u64 mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
2014  	unsigned long tcr;
2015  
2016  	/*
2017  	 * Calculate the raw per-cpu offset without a translation from the
2018  	 * kernel's mapping to the linear mapping, and store it in tpidr_el2
2019  	 * so that we can use adr_l to access per-cpu variables in EL2.
2020  	 * Also drop the KASAN tag which gets in the way...
2021  	 */
2022  	params->tpidr_el2 = (unsigned long)kasan_reset_tag(per_cpu_ptr_nvhe_sym(__per_cpu_start, cpu)) -
2023  			    (unsigned long)kvm_ksym_ref(CHOOSE_NVHE_SYM(__per_cpu_start));
2024  
2025  	params->mair_el2 = read_sysreg(mair_el1);
2026  
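	/*
	 * Derive TCR_EL2 from the host's TCR_EL1: for hVHE keep the VHE-style
	 * layout but disable TTBR1_EL2 walks via EPD1, otherwise mask down to
	 * the nVHE TCR_EL2 layout and set its RES1 bits. T0SZ is sized from
	 * hyp_va_bits, PS comes from the sanitised ID_AA64MMFR0_EL1 PARange,
	 * and DS is set when LPA2 is in use.
	 */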
2027  	tcr = read_sysreg(tcr_el1);
2028  	if (cpus_have_final_cap(ARM64_KVM_HVHE)) {
2029  		tcr |= TCR_EPD1_MASK;
2030  	} else {
2031  		tcr &= TCR_EL2_MASK;
2032  		tcr |= TCR_EL2_RES1;
2033  	}
2034  	tcr &= ~TCR_T0SZ_MASK;
2035  	tcr |= TCR_T0SZ(hyp_va_bits);
2036  	tcr &= ~TCR_EL2_PS_MASK;
2037  	tcr |= FIELD_PREP(TCR_EL2_PS_MASK, kvm_get_parange(mmfr0));
2038  	if (kvm_lpa2_is_enabled())
2039  		tcr |= TCR_EL2_DS;
2040  	params->tcr_el2 = tcr;
2041  
2042  	params->pgd_pa = kvm_mmu_get_httbr();
2043  	if (is_protected_kvm_enabled())
2044  		params->hcr_el2 = HCR_HOST_NVHE_PROTECTED_FLAGS;
2045  	else
2046  		params->hcr_el2 = HCR_HOST_NVHE_FLAGS;
2047  	if (cpus_have_final_cap(ARM64_KVM_HVHE))
2048  		params->hcr_el2 |= HCR_E2H;
2049  	params->vttbr = params->vtcr = 0;
2050  
2051  	/*
2052  	 * Flush the init params from the data cache because the struct will
2053  	 * be read while the MMU is off.
2054  	 */
2055  	kvm_flush_dcache_to_poc(params, sizeof(*params));
2056  }
2057  
2058  static void hyp_install_host_vector(void)
2059  {
2060  	struct kvm_nvhe_init_params *params;
2061  	struct arm_smccc_res res;
2062  
2063  	/* Switch from the HYP stub to our own HYP init vector */
2064  	__hyp_set_vectors(kvm_get_idmap_vector());
2065  
2066  	/*
2067  	 * Call initialization code, and switch to the full blown HYP code.
2068  	 * If the cpucaps haven't been finalized yet, something has gone very
2069  	 * wrong, and hyp will crash and burn when it uses any
2070  	 * cpus_have_*_cap() wrapper.
2071  	 */
2072  	BUG_ON(!system_capabilities_finalized());
2073  	params = this_cpu_ptr_nvhe_sym(kvm_init_params);
2074  	arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(__kvm_hyp_init), virt_to_phys(params), &res);
2075  	WARN_ON(res.a0 != SMCCC_RET_SUCCESS);
2076  }
2077  
2078  static void cpu_init_hyp_mode(void)
2079  {
2080  	hyp_install_host_vector();
2081  
2082  	/*
2083  	 * Disabling SSBD on a non-VHE system requires us to enable SSBS
2084  	 * at EL2.
2085  	 */
2086  	if (this_cpu_has_cap(ARM64_SSBS) &&
2087  	    arm64_get_spectre_v4_state() == SPECTRE_VULNERABLE) {
2088  		kvm_call_hyp_nvhe(__kvm_enable_ssbs);
2089  	}
2090  }
2091  
2092  static void cpu_hyp_reset(void)
2093  {
2094  	if (!is_kernel_in_hyp_mode())
2095  		__hyp_reset_vectors();
2096  }
2097  
2098  /*
2099   * EL2 vectors can be mapped and rerouted in a number of ways,
2100   * depending on the kernel configuration and CPU present:
2101   *
2102   * - If the CPU is affected by Spectre-v2, the hardening sequence is
2103   *   placed in one of the vector slots, which is executed before jumping
2104   *   to the real vectors.
2105   *
2106   * - If the CPU also has the ARM64_SPECTRE_V3A cap, the slot
2107   *   containing the hardening sequence is mapped next to the idmap page,
2108   *   and executed before jumping to the real vectors.
2109   *
2110   * - If the CPU only has the ARM64_SPECTRE_V3A cap, then an
2111   *   empty slot is selected, mapped next to the idmap page, and
2112   *   executed before jumping to the real vectors.
2113   *
2114   * Note that ARM64_SPECTRE_V3A is somewhat incompatible with
2115   * VHE, as we don't have hypervisor-specific mappings. If the system
2116   * is VHE and yet selects this capability, it will be ignored.
2117   */
2118  static void cpu_set_hyp_vector(void)
2119  {
2120  	struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data);
2121  	void *vector = hyp_spectre_vector_selector[data->slot];
2122  
2123  	if (!is_protected_kvm_enabled())
2124  		*this_cpu_ptr_hyp_sym(kvm_hyp_vector) = (unsigned long)vector;
2125  	else
2126  		kvm_call_hyp_nvhe(__pkvm_cpu_set_vector, data->slot);
2127  }
2128  
2129  static void cpu_hyp_init_context(void)
2130  {
2131  	kvm_init_host_cpu_context(host_data_ptr(host_ctxt));
2132  
2133  	if (!is_kernel_in_hyp_mode())
2134  		cpu_init_hyp_mode();
2135  }
2136  
2137  static void cpu_hyp_init_features(void)
2138  {
2139  	cpu_set_hyp_vector();
2140  	kvm_arm_init_debug();
2141  
2142  	if (is_kernel_in_hyp_mode())
2143  		kvm_timer_init_vhe();
2144  
2145  	if (vgic_present)
2146  		kvm_vgic_init_cpu_hardware();
2147  }
2148  
2149  static void cpu_hyp_reinit(void)
2150  {
2151  	cpu_hyp_reset();
2152  	cpu_hyp_init_context();
2153  	cpu_hyp_init_features();
2154  }
2155  
2156  static void cpu_hyp_init(void *discard)
2157  {
2158  	if (!__this_cpu_read(kvm_hyp_initialized)) {
2159  		cpu_hyp_reinit();
2160  		__this_cpu_write(kvm_hyp_initialized, 1);
2161  	}
2162  }
2163  
2164  static void cpu_hyp_uninit(void *discard)
2165  {
2166  	if (__this_cpu_read(kvm_hyp_initialized)) {
2167  		cpu_hyp_reset();
2168  		__this_cpu_write(kvm_hyp_initialized, 0);
2169  	}
2170  }
2171  
2172  int kvm_arch_enable_virtualization_cpu(void)
2173  {
2174  	/*
2175  	 * Most calls to this function are made with migration
2176  	 * disabled, but not with preemption disabled. The former is
2177  	 * enough to ensure correctness, but most of the helpers
2178  	 * expect the latter and will throw a tantrum otherwise.
2179  	 */
2180  	preempt_disable();
2181  
2182  	cpu_hyp_init(NULL);
2183  
2184  	kvm_vgic_cpu_up();
2185  	kvm_timer_cpu_up();
2186  
2187  	preempt_enable();
2188  
2189  	return 0;
2190  }
2191  
2192  void kvm_arch_disable_virtualization_cpu(void)
2193  {
2194  	kvm_timer_cpu_down();
2195  	kvm_vgic_cpu_down();
2196  
2197  	if (!is_protected_kvm_enabled())
2198  		cpu_hyp_uninit(NULL);
2199  }
2200  
2201  #ifdef CONFIG_CPU_PM
2202  static int hyp_init_cpu_pm_notifier(struct notifier_block *self,
2203  				    unsigned long cmd,
2204  				    void *v)
2205  {
2206  	/*
2207  	 * kvm_hyp_initialized is left with its old value over
2208  	 * PM_ENTER->PM_EXIT. It is used to indicate PM_EXIT should
2209  	 * re-enable hyp.
2210  	 */
2211  	switch (cmd) {
2212  	case CPU_PM_ENTER:
2213  		if (__this_cpu_read(kvm_hyp_initialized))
2214  			/*
2215  			 * don't update kvm_hyp_initialized here
2216  			 * so that the hyp will be re-enabled
2217  			 * when we resume. See below.
2218  			 */
2219  			cpu_hyp_reset();
2220  
2221  		return NOTIFY_OK;
2222  	case CPU_PM_ENTER_FAILED:
2223  	case CPU_PM_EXIT:
2224  		if (__this_cpu_read(kvm_hyp_initialized))
2225  			/* The hyp was enabled before suspend. */
2226  			cpu_hyp_reinit();
2227  
2228  		return NOTIFY_OK;
2229  
2230  	default:
2231  		return NOTIFY_DONE;
2232  	}
2233  }
2234  
2235  static struct notifier_block hyp_init_cpu_pm_nb = {
2236  	.notifier_call = hyp_init_cpu_pm_notifier,
2237  };
2238  
2239  static void __init hyp_cpu_pm_init(void)
2240  {
2241  	if (!is_protected_kvm_enabled())
2242  		cpu_pm_register_notifier(&hyp_init_cpu_pm_nb);
2243  }
2244  static void __init hyp_cpu_pm_exit(void)
2245  {
2246  	if (!is_protected_kvm_enabled())
2247  		cpu_pm_unregister_notifier(&hyp_init_cpu_pm_nb);
2248  }
2249  #else
2250  static inline void __init hyp_cpu_pm_init(void)
2251  {
2252  }
2253  static inline void __init hyp_cpu_pm_exit(void)
2254  {
2255  }
2256  #endif
2257  
2258  static void __init init_cpu_logical_map(void)
2259  {
2260  	unsigned int cpu;
2261  
2262  	/*
2263  	 * Copy the MPIDR <-> logical CPU ID mapping to hyp.
2264  	 * Only copy the set of online CPUs whose features have been checked
2265  	 * against the finalized system capabilities. The hypervisor will not
2266  	 * allow any other CPUs from the `possible` set to boot.
2267  	 */
2268  	for_each_online_cpu(cpu)
2269  		hyp_cpu_logical_map[cpu] = cpu_logical_map(cpu);
2270  }
2271  
2272  #define init_psci_0_1_impl_state(config, what)	\
2273  	config.psci_0_1_ ## what ## _implemented = psci_ops.what
2274  
2275  static bool __init init_psci_relay(void)
2276  {
2277  	/*
2278  	 * If PSCI has not been initialized, protected KVM cannot install
2279  	 * itself on newly booted CPUs.
2280  	 */
2281  	if (!psci_ops.get_version) {
2282  		kvm_err("Cannot initialize protected mode without PSCI\n");
2283  		return false;
2284  	}
2285  
2286  	kvm_host_psci_config.version = psci_ops.get_version();
2287  	kvm_host_psci_config.smccc_version = arm_smccc_get_version();
2288  
2289  	if (kvm_host_psci_config.version == PSCI_VERSION(0, 1)) {
2290  		kvm_host_psci_config.function_ids_0_1 = get_psci_0_1_function_ids();
2291  		init_psci_0_1_impl_state(kvm_host_psci_config, cpu_suspend);
2292  		init_psci_0_1_impl_state(kvm_host_psci_config, cpu_on);
2293  		init_psci_0_1_impl_state(kvm_host_psci_config, cpu_off);
2294  		init_psci_0_1_impl_state(kvm_host_psci_config, migrate);
2295  	}
2296  	return true;
2297  }
2298  
2299  static int __init init_subsystems(void)
2300  {
2301  	int err = 0;
2302  
2303  	/*
2304  	 * Enable hardware so that subsystem initialisation can access EL2.
2305  	 */
2306  	on_each_cpu(cpu_hyp_init, NULL, 1);
2307  
2308  	/*
2309  	 * Register CPU low-power notifier
2310  	 */
2311  	hyp_cpu_pm_init();
2312  
2313  	/*
2314  	 * Init HYP view of VGIC
2315  	 */
2316  	err = kvm_vgic_hyp_init();
2317  	switch (err) {
2318  	case 0:
2319  		vgic_present = true;
2320  		break;
2321  	case -ENODEV:
2322  	case -ENXIO:
2323  		vgic_present = false;
2324  		err = 0;
2325  		break;
2326  	default:
2327  		goto out;
2328  	}
2329  
2330  	/*
2331  	 * Init HYP architected timer support
2332  	 */
2333  	err = kvm_timer_hyp_init(vgic_present);
2334  	if (err)
2335  		goto out;
2336  
2337  	kvm_register_perf_callbacks(NULL);
2338  
2339  out:
2340  	if (err)
2341  		hyp_cpu_pm_exit();
2342  
2343  	if (err || !is_protected_kvm_enabled())
2344  		on_each_cpu(cpu_hyp_uninit, NULL, 1);
2345  
2346  	return err;
2347  }
2348  
2349  static void __init teardown_subsystems(void)
2350  {
2351  	kvm_unregister_perf_callbacks();
2352  	hyp_cpu_pm_exit();
2353  }
2354  
2355  static void __init teardown_hyp_mode(void)
2356  {
2357  	bool free_sve = system_supports_sve() && is_protected_kvm_enabled();
2358  	int cpu;
2359  
2360  	free_hyp_pgds();
2361  	for_each_possible_cpu(cpu) {
2362  		free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
2363  		free_pages(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu], nvhe_percpu_order());
2364  
2365  		if (free_sve) {
2366  			struct cpu_sve_state *sve_state;
2367  
2368  			sve_state = per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state;
2369  			free_pages((unsigned long) sve_state, pkvm_host_sve_state_order());
2370  		}
2371  	}
2372  }
2373  
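/*
 * Hand control of the reserved hyp memory pool over to EL2. The __pkvm_init
 * hypercall consumes hyp_mem_base/hyp_mem_size and the per-CPU base array to
 * set up the hypervisor's own page tables and allocator; once it returns,
 * the HYP stub vectors are no longer available on this CPU.
 */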
2374  static int __init do_pkvm_init(u32 hyp_va_bits)
2375  {
2376  	void *per_cpu_base = kvm_ksym_ref(kvm_nvhe_sym(kvm_arm_hyp_percpu_base));
2377  	int ret;
2378  
2379  	preempt_disable();
2380  	cpu_hyp_init_context();
2381  	ret = kvm_call_hyp_nvhe(__pkvm_init, hyp_mem_base, hyp_mem_size,
2382  				num_possible_cpus(), kern_hyp_va(per_cpu_base),
2383  				hyp_va_bits);
2384  	cpu_hyp_init_features();
2385  
2386  	/*
2387  	 * The stub hypercalls are now disabled, so set our local flag to
2388  	 * prevent a later re-init attempt in kvm_arch_enable_virtualization_cpu().
2389  	 */
2390  	__this_cpu_write(kvm_hyp_initialized, 1);
2391  	preempt_enable();
2392  
2393  	return ret;
2394  }
2395  
2396  static u64 get_hyp_id_aa64pfr0_el1(void)
2397  {
2398  	/*
2399  	 * Track whether the system isn't affected by spectre/meltdown in the
2400  	 * hypervisor's view of id_aa64pfr0_el1, used for protected VMs.
2401  	 * Although this is per-CPU, we make it global for simplicity, e.g., not
2402  	 * to have to worry about vcpu migration.
2403  	 *
2404  	 * Unlike for non-protected VMs, userspace cannot override this for
2405  	 * protected VMs.
2406  	 */
2407  	u64 val = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
2408  
2409  	val &= ~(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2) |
2410  		 ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3));
2411  
2412  	val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2),
2413  			  arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED);
2414  	val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3),
2415  			  arm64_get_meltdown_state() == SPECTRE_UNAFFECTED);
2416  
2417  	return val;
2418  }
2419  
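/*
 * Mirror the sanitised ID register values (and a few other globals) into the
 * nVHE hypervisor object so that EL2 code can consult them directly, without
 * relying on the kernel's cpufeature infrastructure at run time.
 */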
2420  static void kvm_hyp_init_symbols(void)
2421  {
2422  	kvm_nvhe_sym(id_aa64pfr0_el1_sys_val) = get_hyp_id_aa64pfr0_el1();
2423  	kvm_nvhe_sym(id_aa64pfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1);
2424  	kvm_nvhe_sym(id_aa64isar0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR0_EL1);
2425  	kvm_nvhe_sym(id_aa64isar1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR1_EL1);
2426  	kvm_nvhe_sym(id_aa64isar2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR2_EL1);
2427  	kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
2428  	kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
2429  	kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR2_EL1);
2430  	kvm_nvhe_sym(id_aa64smfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64SMFR0_EL1);
2431  	kvm_nvhe_sym(__icache_flags) = __icache_flags;
2432  	kvm_nvhe_sym(kvm_arm_vmid_bits) = kvm_arm_vmid_bits;
2433  }
2434  
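/*
 * Protected-mode bring-up: map the reserved pool at EL2, run __pkvm_init via
 * do_pkvm_init(), and then free the kernel-managed hyp page tables, since the
 * hypervisor maintains its own copies from this point on.
 */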
2435  static int __init kvm_hyp_init_protection(u32 hyp_va_bits)
2436  {
2437  	void *addr = phys_to_virt(hyp_mem_base);
2438  	int ret;
2439  
2440  	ret = create_hyp_mappings(addr, addr + hyp_mem_size, PAGE_HYP);
2441  	if (ret)
2442  		return ret;
2443  
2444  	ret = do_pkvm_init(hyp_va_bits);
2445  	if (ret)
2446  		return ret;
2447  
2448  	free_hyp_pgds();
2449  
2450  	return 0;
2451  }
2452  
2453  static int init_pkvm_host_sve_state(void)
2454  {
2455  	int cpu;
2456  
2457  	if (!system_supports_sve())
2458  		return 0;
2459  
2460  	/* Allocate pages for host sve state in protected mode. */
2461  	for_each_possible_cpu(cpu) {
2462  		struct page *page = alloc_pages(GFP_KERNEL, pkvm_host_sve_state_order());
2463  
2464  		if (!page)
2465  			return -ENOMEM;
2466  
2467  		per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state = page_address(page);
2468  	}
2469  
2470  	/*
2471  	 * Don't map the pages in hyp since these are only used in protected
2472  	 * mode, which will (re)create its own mapping when initialized.
2473  	 */
2474  
2475  	return 0;
2476  }
2477  
2478  /*
2479   * Finalizes the initialization of hyp mode, once everything else is initialized
2480   * and the initialization process cannot fail.
2481   */
2482  static void finalize_init_hyp_mode(void)
2483  {
2484  	int cpu;
2485  
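	/*
	 * The per-CPU pointers stashed in kvm_host_data are dereferenced at
	 * EL2, so convert them to hyp VAs with kern_hyp_va() now that the
	 * hyp mappings are final.
	 */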
2486  	if (system_supports_sve() && is_protected_kvm_enabled()) {
2487  		for_each_possible_cpu(cpu) {
2488  			struct cpu_sve_state *sve_state;
2489  
2490  			sve_state = per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state;
2491  			per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state =
2492  				kern_hyp_va(sve_state);
2493  		}
2494  	} else {
2495  		for_each_possible_cpu(cpu) {
2496  			struct user_fpsimd_state *fpsimd_state;
2497  
2498  			fpsimd_state = &per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->host_ctxt.fp_regs;
2499  			per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->fpsimd_state =
2500  				kern_hyp_va(fpsimd_state);
2501  		}
2502  	}
2503  }
2504  
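/*
 * Give the (protected) hypervisor its own set of pointer authentication keys,
 * seeded with random values, so they are independent of any host or guest
 * keys.
 */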
2505  static void pkvm_hyp_init_ptrauth(void)
2506  {
2507  	struct kvm_cpu_context *hyp_ctxt;
2508  	int cpu;
2509  
2510  	for_each_possible_cpu(cpu) {
2511  		hyp_ctxt = per_cpu_ptr_nvhe_sym(kvm_hyp_ctxt, cpu);
2512  		hyp_ctxt->sys_regs[APIAKEYLO_EL1] = get_random_long();
2513  		hyp_ctxt->sys_regs[APIAKEYHI_EL1] = get_random_long();
2514  		hyp_ctxt->sys_regs[APIBKEYLO_EL1] = get_random_long();
2515  		hyp_ctxt->sys_regs[APIBKEYHI_EL1] = get_random_long();
2516  		hyp_ctxt->sys_regs[APDAKEYLO_EL1] = get_random_long();
2517  		hyp_ctxt->sys_regs[APDAKEYHI_EL1] = get_random_long();
2518  		hyp_ctxt->sys_regs[APDBKEYLO_EL1] = get_random_long();
2519  		hyp_ctxt->sys_regs[APDBKEYHI_EL1] = get_random_long();
2520  		hyp_ctxt->sys_regs[APGAKEYLO_EL1] = get_random_long();
2521  		hyp_ctxt->sys_regs[APGAKEYHI_EL1] = get_random_long();
2522  	}
2523  }
2524  
2525  /* Inits Hyp-mode on all online CPUs */
2526  static int __init init_hyp_mode(void)
2527  {
2528  	u32 hyp_va_bits;
2529  	int cpu;
2530  	int err = -ENOMEM;
2531  
2532  	/*
2533  	 * The protected Hyp-mode cannot be initialized if the memory pool
2534  	 * allocation has failed.
2535  	 */
2536  	if (is_protected_kvm_enabled() && !hyp_mem_base)
2537  		goto out_err;
2538  
2539  	/*
2540  	 * Allocate Hyp PGD and setup Hyp identity mapping
2541  	 */
2542  	err = kvm_mmu_init(&hyp_va_bits);
2543  	if (err)
2544  		goto out_err;
2545  
2546  	/*
2547  	 * Allocate stack pages for Hypervisor-mode
2548  	 */
2549  	for_each_possible_cpu(cpu) {
2550  		unsigned long stack_page;
2551  
2552  		stack_page = __get_free_page(GFP_KERNEL);
2553  		if (!stack_page) {
2554  			err = -ENOMEM;
2555  			goto out_err;
2556  		}
2557  
2558  		per_cpu(kvm_arm_hyp_stack_page, cpu) = stack_page;
2559  	}
2560  
2561  	/*
2562  	 * Allocate and initialize pages for Hypervisor-mode percpu regions.
2563  	 */
2564  	for_each_possible_cpu(cpu) {
2565  		struct page *page;
2566  		void *page_addr;
2567  
2568  		page = alloc_pages(GFP_KERNEL, nvhe_percpu_order());
2569  		if (!page) {
2570  			err = -ENOMEM;
2571  			goto out_err;
2572  		}
2573  
2574  		page_addr = page_address(page);
2575  		memcpy(page_addr, CHOOSE_NVHE_SYM(__per_cpu_start), nvhe_percpu_size());
2576  		kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu] = (unsigned long)page_addr;
2577  	}
2578  
2579  	/*
2580  	 * Map the Hyp-code called directly from the host
2581  	 */
2582  	err = create_hyp_mappings(kvm_ksym_ref(__hyp_text_start),
2583  				  kvm_ksym_ref(__hyp_text_end), PAGE_HYP_EXEC);
2584  	if (err) {
2585  		kvm_err("Cannot map world-switch code\n");
2586  		goto out_err;
2587  	}
2588  
2589  	err = create_hyp_mappings(kvm_ksym_ref(__hyp_rodata_start),
2590  				  kvm_ksym_ref(__hyp_rodata_end), PAGE_HYP_RO);
2591  	if (err) {
2592  		kvm_err("Cannot map .hyp.rodata section\n");
2593  		goto out_err;
2594  	}
2595  
2596  	err = create_hyp_mappings(kvm_ksym_ref(__start_rodata),
2597  				  kvm_ksym_ref(__end_rodata), PAGE_HYP_RO);
2598  	if (err) {
2599  		kvm_err("Cannot map rodata section\n");
2600  		goto out_err;
2601  	}
2602  
2603  	/*
2604  	 * .hyp.bss is guaranteed to be placed at the beginning of the .bss
2605  	 * section thanks to an assertion in the linker script. Map it RW and
2606  	 * the rest of .bss RO.
2607  	 */
2608  	err = create_hyp_mappings(kvm_ksym_ref(__hyp_bss_start),
2609  				  kvm_ksym_ref(__hyp_bss_end), PAGE_HYP);
2610  	if (err) {
2611  		kvm_err("Cannot map hyp bss section: %d\n", err);
2612  		goto out_err;
2613  	}
2614  
2615  	err = create_hyp_mappings(kvm_ksym_ref(__hyp_bss_end),
2616  				  kvm_ksym_ref(__bss_stop), PAGE_HYP_RO);
2617  	if (err) {
2618  		kvm_err("Cannot map bss section\n");
2619  		goto out_err;
2620  	}
2621  
2622  	/*
2623  	 * Map the Hyp stack pages
2624  	 */
2625  	for_each_possible_cpu(cpu) {
2626  		struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu);
2627  		char *stack_page = (char *)per_cpu(kvm_arm_hyp_stack_page, cpu);
2628  
2629  		err = create_hyp_stack(__pa(stack_page), &params->stack_hyp_va);
2630  		if (err) {
2631  			kvm_err("Cannot map hyp stack\n");
2632  			goto out_err;
2633  		}
2634  
2635  		/*
2636  		 * Save the stack PA in nvhe_init_params. This will be needed
2637  		 * to recreate the stack mapping in protected nVHE mode.
2638  		 * __hyp_pa() won't do the right thing there, since the stack
2639  		 * has been mapped in the flexible private VA space.
2640  		 */
2641  		params->stack_pa = __pa(stack_page);
2642  	}
2643  
2644  	for_each_possible_cpu(cpu) {
2645  		char *percpu_begin = (char *)kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu];
2646  		char *percpu_end = percpu_begin + nvhe_percpu_size();
2647  
2648  		/* Map Hyp percpu pages */
2649  		err = create_hyp_mappings(percpu_begin, percpu_end, PAGE_HYP);
2650  		if (err) {
2651  			kvm_err("Cannot map hyp percpu region\n");
2652  			goto out_err;
2653  		}
2654  
2655  		/* Prepare the CPU initialization parameters */
2656  		cpu_prepare_hyp_mode(cpu, hyp_va_bits);
2657  	}
2658  
2659  	kvm_hyp_init_symbols();
2660  
2661  	if (is_protected_kvm_enabled()) {
2662  		if (IS_ENABLED(CONFIG_ARM64_PTR_AUTH_KERNEL) &&
2663  		    cpus_have_final_cap(ARM64_HAS_ADDRESS_AUTH))
2664  			pkvm_hyp_init_ptrauth();
2665  
2666  		init_cpu_logical_map();
2667  
2668  		if (!init_psci_relay()) {
2669  			err = -ENODEV;
2670  			goto out_err;
2671  		}
2672  
2673  		err = init_pkvm_host_sve_state();
2674  		if (err)
2675  			goto out_err;
2676  
2677  		err = kvm_hyp_init_protection(hyp_va_bits);
2678  		if (err) {
2679  			kvm_err("Failed to init hyp memory protection\n");
2680  			goto out_err;
2681  		}
2682  	}
2683  
2684  	return 0;
2685  
2686  out_err:
2687  	teardown_hyp_mode();
2688  	kvm_err("error initializing Hyp mode: %d\n", err);
2689  	return err;
2690  }
2691  
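/*
 * Resolve an MPIDR affinity value to a vCPU: try the optional mpidr_data
 * lookup table first (when present), and fall back to a linear scan of all
 * vCPUs if the table misses.
 */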
2692  struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr)
2693  {
2694  	struct kvm_vcpu *vcpu = NULL;
2695  	struct kvm_mpidr_data *data;
2696  	unsigned long i;
2697  
2698  	mpidr &= MPIDR_HWID_BITMASK;
2699  
2700  	rcu_read_lock();
2701  	data = rcu_dereference(kvm->arch.mpidr_data);
2702  
2703  	if (data) {
2704  		u16 idx = kvm_mpidr_index(data, mpidr);
2705  
2706  		vcpu = kvm_get_vcpu(kvm, data->cmpidr_to_idx[idx]);
2707  		if (mpidr != kvm_vcpu_get_mpidr_aff(vcpu))
2708  			vcpu = NULL;
2709  	}
2710  
2711  	rcu_read_unlock();
2712  
2713  	if (vcpu)
2714  		return vcpu;
2715  
2716  	kvm_for_each_vcpu(i, vcpu, kvm) {
2717  		if (mpidr == kvm_vcpu_get_mpidr_aff(vcpu))
2718  			return vcpu;
2719  	}
2720  	return NULL;
2721  }
2722  
2723  bool kvm_arch_irqchip_in_kernel(struct kvm *kvm)
2724  {
2725  	return irqchip_in_kernel(kvm);
2726  }
2727  
2728  bool kvm_arch_has_irq_bypass(void)
2729  {
2730  	return true;
2731  }
2732  
2733  int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
2734  				      struct irq_bypass_producer *prod)
2735  {
2736  	struct kvm_kernel_irqfd *irqfd =
2737  		container_of(cons, struct kvm_kernel_irqfd, consumer);
2738  
2739  	return kvm_vgic_v4_set_forwarding(irqfd->kvm, prod->irq,
2740  					  &irqfd->irq_entry);
2741  }
2742  void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
2743  				      struct irq_bypass_producer *prod)
2744  {
2745  	struct kvm_kernel_irqfd *irqfd =
2746  		container_of(cons, struct kvm_kernel_irqfd, consumer);
2747  
2748  	kvm_vgic_v4_unset_forwarding(irqfd->kvm, prod->irq,
2749  				     &irqfd->irq_entry);
2750  }
2751  
2752  void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *cons)
2753  {
2754  	struct kvm_kernel_irqfd *irqfd =
2755  		container_of(cons, struct kvm_kernel_irqfd, consumer);
2756  
2757  	kvm_arm_halt_guest(irqfd->kvm);
2758  }
2759  
2760  void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *cons)
2761  {
2762  	struct kvm_kernel_irqfd *irqfd =
2763  		container_of(cons, struct kvm_kernel_irqfd, consumer);
2764  
2765  	kvm_arm_resume_guest(irqfd->kvm);
2766  }
2767  
2768  /* Initialize Hyp-mode and memory mappings on all CPUs */
2769  static __init int kvm_arm_init(void)
2770  {
2771  	int err;
2772  	bool in_hyp_mode;
2773  
2774  	if (!is_hyp_mode_available()) {
2775  		kvm_info("HYP mode not available\n");
2776  		return -ENODEV;
2777  	}
2778  
2779  	if (kvm_get_mode() == KVM_MODE_NONE) {
2780  		kvm_info("KVM disabled from command line\n");
2781  		return -ENODEV;
2782  	}
2783  
2784  	err = kvm_sys_reg_table_init();
2785  	if (err) {
2786  		kvm_info("Error initializing system register tables");
2787  		return err;
2788  	}
2789  
2790  	in_hyp_mode = is_kernel_in_hyp_mode();
2791  
2792  	if (cpus_have_final_cap(ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE) ||
2793  	    cpus_have_final_cap(ARM64_WORKAROUND_1508412))
2794  		kvm_info("Guests without required CPU erratum workarounds can deadlock system!\n" \
2795  			 "Only trusted guests should be used on this system.\n");
2796  
2797  	err = kvm_set_ipa_limit();
2798  	if (err)
2799  		return err;
2800  
2801  	err = kvm_arm_init_sve();
2802  	if (err)
2803  		return err;
2804  
2805  	err = kvm_arm_vmid_alloc_init();
2806  	if (err) {
2807  		kvm_err("Failed to initialize VMID allocator.\n");
2808  		return err;
2809  	}
2810  
2811  	if (!in_hyp_mode) {
2812  		err = init_hyp_mode();
2813  		if (err)
2814  			goto out_err;
2815  	}
2816  
2817  	err = kvm_init_vector_slots();
2818  	if (err) {
2819  		kvm_err("Cannot initialise vector slots\n");
2820  		goto out_hyp;
2821  	}
2822  
2823  	err = init_subsystems();
2824  	if (err)
2825  		goto out_hyp;
2826  
2827  	kvm_info("%s%sVHE mode initialized successfully\n",
2828  		 in_hyp_mode ? "" : (is_protected_kvm_enabled() ?
2829  				     "Protected " : "Hyp "),
2830  		 in_hyp_mode ? "" : (cpus_have_final_cap(ARM64_KVM_HVHE) ?
2831  				     "h" : "n"));
2832  
2833  	/*
2834  	 * FIXME: Do something reasonable if kvm_init() fails after pKVM
2835  	 * hypervisor protection is finalized.
2836  	 */
2837  	err = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE);
2838  	if (err)
2839  		goto out_subs;
2840  
2841  	/*
2842  	 * This should be called after initialization is done and failure isn't
2843  	 * possible anymore.
2844  	 */
2845  	if (!in_hyp_mode)
2846  		finalize_init_hyp_mode();
2847  
2848  	kvm_arm_initialised = true;
2849  
2850  	return 0;
2851  
2852  out_subs:
2853  	teardown_subsystems();
2854  out_hyp:
2855  	if (!in_hyp_mode)
2856  		teardown_hyp_mode();
2857  out_err:
2858  	kvm_arm_vmid_alloc_free();
2859  	return err;
2860  }
2861  
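/*
 * Parse kvm-arm.mode= from the command line. Accepted values, as handled
 * below: "none", "protected" (nVHE only), "nvhe" and "nested" (VHE only).
 */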
2862  static int __init early_kvm_mode_cfg(char *arg)
2863  {
2864  	if (!arg)
2865  		return -EINVAL;
2866  
2867  	if (strcmp(arg, "none") == 0) {
2868  		kvm_mode = KVM_MODE_NONE;
2869  		return 0;
2870  	}
2871  
2872  	if (!is_hyp_mode_available()) {
2873  		pr_warn_once("KVM is not available. Ignoring kvm-arm.mode\n");
2874  		return 0;
2875  	}
2876  
2877  	if (strcmp(arg, "protected") == 0) {
2878  		if (!is_kernel_in_hyp_mode())
2879  			kvm_mode = KVM_MODE_PROTECTED;
2880  		else
2881  			pr_warn_once("Protected KVM not available with VHE\n");
2882  
2883  		return 0;
2884  	}
2885  
2886  	if (strcmp(arg, "nvhe") == 0 && !WARN_ON(is_kernel_in_hyp_mode())) {
2887  		kvm_mode = KVM_MODE_DEFAULT;
2888  		return 0;
2889  	}
2890  
2891  	if (strcmp(arg, "nested") == 0 && !WARN_ON(!is_kernel_in_hyp_mode())) {
2892  		kvm_mode = KVM_MODE_NV;
2893  		return 0;
2894  	}
2895  
2896  	return -EINVAL;
2897  }
2898  early_param("kvm-arm.mode", early_kvm_mode_cfg);
2899  
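/*
 * Shared parser for kvm-arm.wfi_trap_policy= and kvm-arm.wfe_trap_policy=;
 * only "trap" and "notrap" are accepted, anything else is rejected and the
 * built-in default policy is kept.
 */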
2900  static int __init early_kvm_wfx_trap_policy_cfg(char *arg, enum kvm_wfx_trap_policy *p)
2901  {
2902  	if (!arg)
2903  		return -EINVAL;
2904  
2905  	if (strcmp(arg, "trap") == 0) {
2906  		*p = KVM_WFX_TRAP;
2907  		return 0;
2908  	}
2909  
2910  	if (strcmp(arg, "notrap") == 0) {
2911  		*p = KVM_WFX_NOTRAP;
2912  		return 0;
2913  	}
2914  
2915  	return -EINVAL;
2916  }
2917  
2918  static int __init early_kvm_wfi_trap_policy_cfg(char *arg)
2919  {
2920  	return early_kvm_wfx_trap_policy_cfg(arg, &kvm_wfi_trap_policy);
2921  }
2922  early_param("kvm-arm.wfi_trap_policy", early_kvm_wfi_trap_policy_cfg);
2923  
2924  static int __init early_kvm_wfe_trap_policy_cfg(char *arg)
2925  {
2926  	return early_kvm_wfx_trap_policy_cfg(arg, &kvm_wfe_trap_policy);
2927  }
2928  early_param("kvm-arm.wfe_trap_policy", early_kvm_wfe_trap_policy_cfg);
2929  
2930  enum kvm_mode kvm_get_mode(void)
2931  {
2932  	return kvm_mode;
2933  }
2934  
2935  module_init(kvm_arm_init);
2936