1  // SPDX-License-Identifier: GPL-2.0-only
2  /*
3   *
4   * Copyright 2016 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
5   */
6  
7  #include <linux/types.h>
8  #include <linux/string.h>
9  #include <linux/kvm.h>
10  #include <linux/kvm_host.h>
11  #include <linux/anon_inodes.h>
12  #include <linux/file.h>
13  #include <linux/debugfs.h>
14  #include <linux/pgtable.h>
15  
16  #include <asm/kvm_ppc.h>
17  #include <asm/kvm_book3s.h>
18  #include "book3s_hv.h"
19  #include <asm/page.h>
20  #include <asm/mmu.h>
21  #include <asm/pgalloc.h>
22  #include <asm/pte-walk.h>
23  #include <asm/ultravisor.h>
24  #include <asm/kvm_book3s_uvmem.h>
25  #include <asm/plpar_wrappers.h>
26  #include <asm/firmware.h>
27  
28  /*
29   * Supported radix tree geometry.
30   * Like p9, we support either 5 or 9 bits at the first (lowest) level,
31   * for a page size of 64k or 4k.
32   */
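/*
 * p9_supported_radix_bits[] is indexed by radix level, with level 0 being
 * the lowest (PTE) level and level 3 the root, matching the walk in
 * kvmppc_mmu_walk_radix_tree() below.
 */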
33  static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 };
34  
35  unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
36  					      gva_t eaddr, void *to, void *from,
37  					      unsigned long n)
38  {
39  	int old_pid, old_lpid;
40  	unsigned long quadrant, ret = n;
41  	bool is_load = !!to;
42  
43  	if (kvmhv_is_nestedv2())
44  		return H_UNSUPPORTED;
45  
46  	/* Can't access quadrants 1 or 2 in non-HV mode, call the HV to do it */
47  	if (kvmhv_on_pseries())
48  		return plpar_hcall_norets(H_COPY_TOFROM_GUEST, lpid, pid, eaddr,
49  					  (to != NULL) ? __pa(to): 0,
50  					  (from != NULL) ? __pa(from): 0, n);
51  
52  	if (eaddr & (0xFFFUL << 52))
53  		return ret;
54  
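	/*
	 * Build a quadrant 1 or quadrant 2 effective address (top two EA bits
	 * 0b01 or 0b10).  With the guest's LPID (and, for quadrant 1, PID)
	 * loaded into the SPRs below, accesses through such an address are
	 * translated by the guest's page tables rather than the host's.
	 */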
55  	quadrant = 1;
56  	if (!pid)
57  		quadrant = 2;
58  	if (is_load)
59  		from = (void *) (eaddr | (quadrant << 62));
60  	else
61  		to = (void *) (eaddr | (quadrant << 62));
62  
63  	preempt_disable();
64  
65  	asm volatile("hwsync" ::: "memory");
66  	isync();
67  	/* switch the lpid first to avoid running host with unallocated pid */
68  	old_lpid = mfspr(SPRN_LPID);
69  	if (old_lpid != lpid)
70  		mtspr(SPRN_LPID, lpid);
71  	if (quadrant == 1) {
72  		old_pid = mfspr(SPRN_PID);
73  		if (old_pid != pid)
74  			mtspr(SPRN_PID, pid);
75  	}
76  	isync();
77  
78  	pagefault_disable();
79  	if (is_load)
80  		ret = __copy_from_user_inatomic(to, (const void __user *)from, n);
81  	else
82  		ret = __copy_to_user_inatomic((void __user *)to, from, n);
83  	pagefault_enable();
84  
85  	asm volatile("hwsync" ::: "memory");
86  	isync();
87  	/* switch the pid first to avoid running host with unallocated pid */
88  	if (quadrant == 1 && pid != old_pid)
89  		mtspr(SPRN_PID, old_pid);
90  	if (lpid != old_lpid)
91  		mtspr(SPRN_LPID, old_lpid);
92  	isync();
93  
94  	preempt_enable();
95  
96  	return ret;
97  }
98  
99  static long kvmhv_copy_tofrom_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr,
100  					  void *to, void *from, unsigned long n)
101  {
102  	int lpid = vcpu->kvm->arch.lpid;
103  	int pid;
104  
105  	/* This would cause a data segment interrupt, so don't allow the access */
106  	if (eaddr & (0x3FFUL << 52))
107  		return -EINVAL;
108  
109  	/* Should we be using the nested lpid? */
110  	if (vcpu->arch.nested)
111  		lpid = vcpu->arch.nested->shadow_lpid;
112  
113  	/* If accessing quadrant 3 then pid is expected to be 0 */
114  	if (((eaddr >> 62) & 0x3) == 0x3)
115  		pid = 0;
116  	else
117  		pid = kvmppc_get_pid(vcpu);
118  
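	/* Clear the top EA bits; the quadrant is re-applied in __kvmhv_copy_tofrom_guest_radix() */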
119  	eaddr &= ~(0xFFFUL << 52);
120  
121  	return __kvmhv_copy_tofrom_guest_radix(lpid, pid, eaddr, to, from, n);
122  }
123  
124  long kvmhv_copy_from_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *to,
125  				 unsigned long n)
126  {
127  	long ret;
128  
129  	ret = kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, to, NULL, n);
130  	if (ret > 0)
131  		memset(to + (n - ret), 0, ret);
132  
133  	return ret;
134  }
135  
136  long kvmhv_copy_to_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *from,
137  			       unsigned long n)
138  {
139  	return kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, NULL, from, n);
140  }
141  
142  int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
143  			       struct kvmppc_pte *gpte, u64 root,
144  			       u64 *pte_ret_p)
145  {
146  	struct kvm *kvm = vcpu->kvm;
147  	int ret, level, ps;
148  	unsigned long rts, bits, offset, index;
149  	u64 pte, base, gpa;
150  	__be64 rpte;
151  
152  	rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) |
153  		((root & RTS2_MASK) >> RTS2_SHIFT);
154  	bits = root & RPDS_MASK;
155  	base = root & RPDB_MASK;
156  
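	/* The split RTS field encodes the EA space size as 2^(RTS + 31) bytes */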
157  	offset = rts + 31;
158  
159  	/* Current implementations only support 52-bit space */
160  	if (offset != 52)
161  		return -EINVAL;
162  
163  	/* Walk each level of the radix tree */
164  	for (level = 3; level >= 0; --level) {
165  		u64 addr;
166  		/* Check for a valid size at this level */
167  		if (level && bits != p9_supported_radix_bits[level])
168  			return -EINVAL;
169  		if (level == 0 && !(bits == 5 || bits == 9))
170  			return -EINVAL;
171  		offset -= bits;
172  		index = (eaddr >> offset) & ((1UL << bits) - 1);
173  		/* Check that low bits of page table base are zero */
174  		if (base & ((1UL << (bits + 3)) - 1))
175  			return -EINVAL;
176  		/* Read the entry from guest memory */
177  		addr = base + (index * sizeof(rpte));
178  
179  		kvm_vcpu_srcu_read_lock(vcpu);
180  		ret = kvm_read_guest(kvm, addr, &rpte, sizeof(rpte));
181  		kvm_vcpu_srcu_read_unlock(vcpu);
182  		if (ret) {
183  			if (pte_ret_p)
184  				*pte_ret_p = addr;
185  			return ret;
186  		}
187  		pte = __be64_to_cpu(rpte);
188  		if (!(pte & _PAGE_PRESENT))
189  			return -ENOENT;
190  		/* Check if a leaf entry */
191  		if (pte & _PAGE_PTE)
192  			break;
193  		/* Get ready to walk the next level */
194  		base = pte & RPDB_MASK;
195  		bits = pte & RPDS_MASK;
196  	}
197  
198  	/* Need a leaf at lowest level; 512GB pages not supported */
199  	if (level < 0 || level == 3)
200  		return -EINVAL;
201  
202  	/* We found a valid leaf PTE */
203  	/* Offset is now log base 2 of the page size */
204  	gpa = pte & 0x01fffffffffff000ul;
205  	if (gpa & ((1ul << offset) - 1))
206  		return -EINVAL;
207  	gpa |= eaddr & ((1ul << offset) - 1);
208  	for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps)
209  		if (offset == mmu_psize_defs[ps].shift)
210  			break;
211  	gpte->page_size = ps;
212  	gpte->page_shift = offset;
213  
214  	gpte->eaddr = eaddr;
215  	gpte->raddr = gpa;
216  
217  	/* Work out permissions */
218  	gpte->may_read = !!(pte & _PAGE_READ);
219  	gpte->may_write = !!(pte & _PAGE_WRITE);
220  	gpte->may_execute = !!(pte & _PAGE_EXEC);
221  
222  	gpte->rc = pte & (_PAGE_ACCESSED | _PAGE_DIRTY);
223  
224  	if (pte_ret_p)
225  		*pte_ret_p = pte;
226  
227  	return 0;
228  }
229  
230  /*
231   * Used to walk a partition or process table radix tree in guest memory
232   * Note: We exploit the fact that a partition table and a process
233   * table have the same layout, a partition-scoped page table and a
234   * process-scoped page table have the same layout, and the 2nd
235   * doubleword of a partition table entry has the same layout as
236   * the PTCR register.
237   */
238  int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
239  				     struct kvmppc_pte *gpte, u64 table,
240  				     int table_index, u64 *pte_ret_p)
241  {
242  	struct kvm *kvm = vcpu->kvm;
243  	int ret;
244  	unsigned long size, ptbl, root;
245  	struct prtb_entry entry;
246  
247  	if ((table & PRTS_MASK) > 24)
248  		return -EINVAL;
249  	size = 1ul << ((table & PRTS_MASK) + 12);
250  
251  	/* Is the table big enough to contain this entry? */
252  	if ((table_index * sizeof(entry)) >= size)
253  		return -EINVAL;
254  
255  	/* Read the table to find the root of the radix tree */
256  	ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry));
257  	kvm_vcpu_srcu_read_lock(vcpu);
258  	ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry));
259  	kvm_vcpu_srcu_read_unlock(vcpu);
260  	if (ret)
261  		return ret;
262  
263  	/* Root is stored in the first double word */
264  	root = be64_to_cpu(entry.prtb0);
265  
266  	return kvmppc_mmu_walk_radix_tree(vcpu, eaddr, gpte, root, pte_ret_p);
267  }
268  
269  int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
270  			   struct kvmppc_pte *gpte, bool data, bool iswrite)
271  {
272  	u32 pid;
273  	u64 pte;
274  	int ret;
275  
276  	/* Work out effective PID */
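	/*
	 * Quadrant 0 is translated with the guest's current PID, quadrant 3
	 * (the guest kernel region) with PID 0; quadrants 1 and 2 are not
	 * valid guest effective addresses here.
	 */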
277  	switch (eaddr >> 62) {
278  	case 0:
279  		pid = kvmppc_get_pid(vcpu);
280  		break;
281  	case 3:
282  		pid = 0;
283  		break;
284  	default:
285  		return -EINVAL;
286  	}
287  
288  	ret = kvmppc_mmu_radix_translate_table(vcpu, eaddr, gpte,
289  				vcpu->kvm->arch.process_table, pid, &pte);
290  	if (ret)
291  		return ret;
292  
293  	/* Check privilege (applies only to process scoped translations) */
294  	if (kvmppc_get_msr(vcpu) & MSR_PR) {
295  		if (pte & _PAGE_PRIVILEGED) {
296  			gpte->may_read = 0;
297  			gpte->may_write = 0;
298  			gpte->may_execute = 0;
299  		}
300  	} else {
301  		if (!(pte & _PAGE_PRIVILEGED)) {
302  			/* Check AMR/IAMR to see if strict mode is in force */
303  			if (kvmppc_get_amr_hv(vcpu) & (1ul << 62))
304  				gpte->may_read = 0;
305  			if (kvmppc_get_amr_hv(vcpu) & (1ul << 63))
306  				gpte->may_write = 0;
307  			if (vcpu->arch.iamr & (1ul << 62))
308  				gpte->may_execute = 0;
309  		}
310  	}
311  
312  	return 0;
313  }
314  
315  void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
316  			     unsigned int pshift, u64 lpid)
317  {
318  	unsigned long psize = PAGE_SIZE;
319  	int psi;
320  	long rc;
321  	unsigned long rb;
322  
323  	if (pshift)
324  		psize = 1UL << pshift;
325  	else
326  		pshift = PAGE_SHIFT;
327  
328  	addr &= ~(psize - 1);
329  
330  	if (!kvmhv_on_pseries()) {
331  		radix__flush_tlb_lpid_page(lpid, addr, psize);
332  		return;
333  	}
334  
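	/*
	 * Running as a nested hypervisor: partition-scoped TLB entries must
	 * be invalidated through the L0 hypervisor, either with the older
	 * H_TLB_INVALIDATE hcall or with H_RPT_INVALIDATE when available.
	 */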
335  	psi = shift_to_mmu_psize(pshift);
336  
337  	if (!firmware_has_feature(FW_FEATURE_RPT_INVALIDATE)) {
338  		rb = addr | (mmu_get_ap(psi) << PPC_BITLSHIFT(58));
339  		rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 1),
340  					lpid, rb);
341  	} else {
342  		rc = pseries_rpt_invalidate(lpid, H_RPTI_TARGET_CMMU,
343  					    H_RPTI_TYPE_NESTED |
344  					    H_RPTI_TYPE_TLB,
345  					    psize_to_rpti_pgsize(psi),
346  					    addr, addr + psize);
347  	}
348  
349  	if (rc)
350  		pr_err("KVM: TLB page invalidation hcall failed, rc=%ld\n", rc);
351  }
352  
353  static void kvmppc_radix_flush_pwc(struct kvm *kvm, u64 lpid)
354  {
355  	long rc;
356  
357  	if (!kvmhv_on_pseries()) {
358  		radix__flush_pwc_lpid(lpid);
359  		return;
360  	}
361  
362  	if (!firmware_has_feature(FW_FEATURE_RPT_INVALIDATE))
363  		rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(1, 0, 1),
364  					lpid, TLBIEL_INVAL_SET_LPID);
365  	else
366  		rc = pseries_rpt_invalidate(lpid, H_RPTI_TARGET_CMMU,
367  					    H_RPTI_TYPE_NESTED |
368  					    H_RPTI_TYPE_PWC, H_RPTI_PAGE_ALL,
369  					    0, -1UL);
370  	if (rc)
371  		pr_err("KVM: TLB PWC invalidation hcall failed, rc=%ld\n", rc);
372  }
373  
374  static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
375  				      unsigned long clr, unsigned long set,
376  				      unsigned long addr, unsigned int shift)
377  {
378  	return __radix_pte_update(ptep, clr, set);
379  }
380  
381  static void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr,
382  			     pte_t *ptep, pte_t pte)
383  {
384  	radix__set_pte_at(kvm->mm, addr, ptep, pte, 0);
385  }
386  
387  static struct kmem_cache *kvm_pte_cache;
388  static struct kmem_cache *kvm_pmd_cache;
389  
390  static pte_t *kvmppc_pte_alloc(void)
391  {
392  	pte_t *pte;
393  
394  	pte = kmem_cache_alloc(kvm_pte_cache, GFP_KERNEL);
395  	/* pmd_populate() will only reference _pa(pte). */
396  	kmemleak_ignore(pte);
397  
398  	return pte;
399  }
400  
401  static void kvmppc_pte_free(pte_t *ptep)
402  {
403  	kmem_cache_free(kvm_pte_cache, ptep);
404  }
405  
406  static pmd_t *kvmppc_pmd_alloc(void)
407  {
408  	pmd_t *pmd;
409  
410  	pmd = kmem_cache_alloc(kvm_pmd_cache, GFP_KERNEL);
411  	/* pud_populate() will only reference _pa(pmd). */
412  	kmemleak_ignore(pmd);
413  
414  	return pmd;
415  }
416  
417  static void kvmppc_pmd_free(pmd_t *pmdp)
418  {
419  	kmem_cache_free(kvm_pmd_cache, pmdp);
420  }
421  
422  /* Called with kvm->mmu_lock held */
423  void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
424  		      unsigned int shift,
425  		      const struct kvm_memory_slot *memslot,
426  		      u64 lpid)
427  
428  {
429  	unsigned long old;
430  	unsigned long gfn = gpa >> PAGE_SHIFT;
431  	unsigned long page_size = PAGE_SIZE;
432  	unsigned long hpa;
433  
434  	old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift);
435  	kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid);
436  
437  	/* The following only applies to L1 entries */
438  	if (lpid != kvm->arch.lpid)
439  		return;
440  
441  	if (!memslot) {
442  		memslot = gfn_to_memslot(kvm, gfn);
443  		if (!memslot)
444  			return;
445  	}
446  	if (shift) { /* 1GB or 2MB page */
447  		page_size = 1ul << shift;
448  		if (shift == PMD_SHIFT)
449  			kvm->stat.num_2M_pages--;
450  		else if (shift == PUD_SHIFT)
451  			kvm->stat.num_1G_pages--;
452  	}
453  
454  	gpa &= ~(page_size - 1);
455  	hpa = old & PTE_RPN_MASK;
456  	kvmhv_remove_nest_rmap_range(kvm, memslot, gpa, hpa, page_size);
457  
458  	if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap)
459  		kvmppc_update_dirty_map(memslot, gfn, page_size);
460  }
461  
462  /*
463   * The kvmppc_free_p?d functions are used to free existing page tables; they
464   * recursively descend, clearing and freeing any child tables.
465   * Callers are responsible for flushing the PWC.
466   *
467   * When page tables are being unmapped/freed as part of the page fault path
468   * (full == false), valid ptes are generally not expected; however, there
469   * is one situation where they arise: when dirty page logging is
470   * turned off for a memslot while the VM is running.  The new memslot
471   * becomes visible to page faults before the memslot commit function
472   * gets to flush the memslot, which can lead to a 2MB page mapping being
473   * installed for a guest physical address where there are already 64kB
474   * (or 4kB) mappings (of sub-pages of the same 2MB page).
475   */
476  static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full,
477  				  u64 lpid)
478  {
479  	if (full) {
480  		memset(pte, 0, sizeof(long) << RADIX_PTE_INDEX_SIZE);
481  	} else {
482  		pte_t *p = pte;
483  		unsigned long it;
484  
485  		for (it = 0; it < PTRS_PER_PTE; ++it, ++p) {
486  			if (pte_val(*p) == 0)
487  				continue;
488  			kvmppc_unmap_pte(kvm, p,
489  					 pte_pfn(*p) << PAGE_SHIFT,
490  					 PAGE_SHIFT, NULL, lpid);
491  		}
492  	}
493  
494  	kvmppc_pte_free(pte);
495  }
496  
497  static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full,
498  				  u64 lpid)
499  {
500  	unsigned long im;
501  	pmd_t *p = pmd;
502  
503  	for (im = 0; im < PTRS_PER_PMD; ++im, ++p) {
504  		if (!pmd_present(*p))
505  			continue;
506  		if (pmd_leaf(*p)) {
507  			if (full) {
508  				pmd_clear(p);
509  			} else {
510  				WARN_ON_ONCE(1);
511  				kvmppc_unmap_pte(kvm, (pte_t *)p,
512  					 pte_pfn(*(pte_t *)p) << PAGE_SHIFT,
513  					 PMD_SHIFT, NULL, lpid);
514  			}
515  		} else {
516  			pte_t *pte;
517  
518  			pte = pte_offset_kernel(p, 0);
519  			kvmppc_unmap_free_pte(kvm, pte, full, lpid);
520  			pmd_clear(p);
521  		}
522  	}
523  	kvmppc_pmd_free(pmd);
524  }
525  
526  static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud,
527  				  u64 lpid)
528  {
529  	unsigned long iu;
530  	pud_t *p = pud;
531  
532  	for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++p) {
533  		if (!pud_present(*p))
534  			continue;
535  		if (pud_leaf(*p)) {
536  			pud_clear(p);
537  		} else {
538  			pmd_t *pmd;
539  
540  			pmd = pmd_offset(p, 0);
541  			kvmppc_unmap_free_pmd(kvm, pmd, true, lpid);
542  			pud_clear(p);
543  		}
544  	}
545  	pud_free(kvm->mm, pud);
546  }
547  
548  void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, u64 lpid)
549  {
550  	unsigned long ig;
551  
552  	for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
553  		p4d_t *p4d = p4d_offset(pgd, 0);
554  		pud_t *pud;
555  
556  		if (!p4d_present(*p4d))
557  			continue;
558  		pud = pud_offset(p4d, 0);
559  		kvmppc_unmap_free_pud(kvm, pud, lpid);
560  		p4d_clear(p4d);
561  	}
562  }
563  
564  void kvmppc_free_radix(struct kvm *kvm)
565  {
566  	if (kvm->arch.pgtable) {
567  		kvmppc_free_pgtable_radix(kvm, kvm->arch.pgtable,
568  					  kvm->arch.lpid);
569  		pgd_free(kvm->mm, kvm->arch.pgtable);
570  		kvm->arch.pgtable = NULL;
571  	}
572  }
573  
574  static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
575  					unsigned long gpa, u64 lpid)
576  {
577  	pte_t *pte = pte_offset_kernel(pmd, 0);
578  
579  	/*
580  	 * Clearing the pmd entry then flushing the PWC ensures that the pte
581  	 * page will no longer be cached by the MMU, so it can be freed without
582  	 * flushing the PWC again.
583  	 */
584  	pmd_clear(pmd);
585  	kvmppc_radix_flush_pwc(kvm, lpid);
586  
587  	kvmppc_unmap_free_pte(kvm, pte, false, lpid);
588  }
589  
590  static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
591  					unsigned long gpa, u64 lpid)
592  {
593  	pmd_t *pmd = pmd_offset(pud, 0);
594  
595  	/*
596  	 * Clearing the pud entry then flushing the PWC ensures that the pmd
597  	 * page and any child pte pages will no longer be cached by the MMU,
598  	 * so they can be freed without flushing the PWC again.
599  	 */
600  	pud_clear(pud);
601  	kvmppc_radix_flush_pwc(kvm, lpid);
602  
603  	kvmppc_unmap_free_pmd(kvm, pmd, false, lpid);
604  }
605  
606  /*
607   * A number of bits may differ between different faults on the same
608   * partition-scoped entry: the RC bits change in the course of cleaning
609   * and aging, and the write bit can change because either the access was
610   * upgraded or a read fault raced with a write fault that set those bits
611   * first.
612   */
613  #define PTE_BITS_MUST_MATCH (~(_PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED))
614  
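/*
 * Insert a PTE into the shadow (partition-scoped) tree at the given level:
 * level 0 is a normal page at the PTE level, level 1 a 2MB page at the PMD
 * level, level 2 a 1GB page at the PUD level.  Returns -EAGAIN if the
 * caller should let the guest retry, e.g. after an MMU notifier
 * invalidation raced with this insertion.
 */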
615  int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
616  		      unsigned long gpa, unsigned int level,
617  		      unsigned long mmu_seq, u64 lpid,
618  		      unsigned long *rmapp, struct rmap_nested **n_rmap)
619  {
620  	pgd_t *pgd;
621  	p4d_t *p4d;
622  	pud_t *pud, *new_pud = NULL;
623  	pmd_t *pmd, *new_pmd = NULL;
624  	pte_t *ptep, *new_ptep = NULL;
625  	int ret;
626  
627  	/* Traverse the guest's 2nd-level tree, allocate new levels needed */
628  	pgd = pgtable + pgd_index(gpa);
629  	p4d = p4d_offset(pgd, gpa);
630  
631  	pud = NULL;
632  	if (p4d_present(*p4d))
633  		pud = pud_offset(p4d, gpa);
634  	else
635  		new_pud = pud_alloc_one(kvm->mm, gpa);
636  
637  	pmd = NULL;
638  	if (pud && pud_present(*pud) && !pud_leaf(*pud))
639  		pmd = pmd_offset(pud, gpa);
640  	else if (level <= 1)
641  		new_pmd = kvmppc_pmd_alloc();
642  
643  	if (level == 0 && !(pmd && pmd_present(*pmd) && !pmd_leaf(*pmd)))
644  		new_ptep = kvmppc_pte_alloc();
645  
646  	/* Check if we might have been invalidated; let the guest retry if so */
647  	spin_lock(&kvm->mmu_lock);
648  	ret = -EAGAIN;
649  	if (mmu_invalidate_retry(kvm, mmu_seq))
650  		goto out_unlock;
651  
652  	/* Now traverse again under the lock and change the tree */
653  	ret = -ENOMEM;
654  	if (p4d_none(*p4d)) {
655  		if (!new_pud)
656  			goto out_unlock;
657  		p4d_populate(kvm->mm, p4d, new_pud);
658  		new_pud = NULL;
659  	}
660  	pud = pud_offset(p4d, gpa);
661  	if (pud_leaf(*pud)) {
662  		unsigned long hgpa = gpa & PUD_MASK;
663  
664  		/* Check if we raced and someone else has set the same thing */
665  		if (level == 2) {
666  			if (pud_raw(*pud) == pte_raw(pte)) {
667  				ret = 0;
668  				goto out_unlock;
669  			}
670  			/* Valid 1GB page here already, add our extra bits */
671  			WARN_ON_ONCE((pud_val(*pud) ^ pte_val(pte)) &
672  							PTE_BITS_MUST_MATCH);
673  			kvmppc_radix_update_pte(kvm, (pte_t *)pud,
674  					      0, pte_val(pte), hgpa, PUD_SHIFT);
675  			ret = 0;
676  			goto out_unlock;
677  		}
678  		/*
679  		 * If we raced with another CPU which has just put
680  		 * a 1GB pte in after we saw a pmd page, try again.
681  		 */
682  		if (!new_pmd) {
683  			ret = -EAGAIN;
684  			goto out_unlock;
685  		}
686  		/* Valid 1GB page here already, remove it */
687  		kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT, NULL,
688  				 lpid);
689  	}
690  	if (level == 2) {
691  		if (!pud_none(*pud)) {
692  			/*
693  			 * There's a page table page here, but we wanted to
694  			 * install a large page, so remove and free the page
695  			 * table page.
696  			 */
697  			kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa, lpid);
698  		}
699  		kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
700  		if (rmapp && n_rmap)
701  			kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
702  		ret = 0;
703  		goto out_unlock;
704  	}
705  	if (pud_none(*pud)) {
706  		if (!new_pmd)
707  			goto out_unlock;
708  		pud_populate(kvm->mm, pud, new_pmd);
709  		new_pmd = NULL;
710  	}
711  	pmd = pmd_offset(pud, gpa);
712  	if (pmd_leaf(*pmd)) {
713  		unsigned long lgpa = gpa & PMD_MASK;
714  
715  		/* Check if we raced and someone else has set the same thing */
716  		if (level == 1) {
717  			if (pmd_raw(*pmd) == pte_raw(pte)) {
718  				ret = 0;
719  				goto out_unlock;
720  			}
721  			/* Valid 2MB page here already, add our extra bits */
722  			WARN_ON_ONCE((pmd_val(*pmd) ^ pte_val(pte)) &
723  							PTE_BITS_MUST_MATCH);
724  			kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd),
725  					0, pte_val(pte), lgpa, PMD_SHIFT);
726  			ret = 0;
727  			goto out_unlock;
728  		}
729  
730  		/*
731  		 * If we raced with another CPU which has just put
732  		 * a 2MB pte in after we saw a pte page, try again.
733  		 */
734  		if (!new_ptep) {
735  			ret = -EAGAIN;
736  			goto out_unlock;
737  		}
738  		/* Valid 2MB page here already, remove it */
739  		kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT, NULL,
740  				 lpid);
741  	}
742  	if (level == 1) {
743  		if (!pmd_none(*pmd)) {
744  			/*
745  			 * There's a page table page here, but we wanted to
746  			 * install a large page, so remove and free the page
747  			 * table page.
748  			 */
749  			kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa, lpid);
750  		}
751  		kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
752  		if (rmapp && n_rmap)
753  			kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
754  		ret = 0;
755  		goto out_unlock;
756  	}
757  	if (pmd_none(*pmd)) {
758  		if (!new_ptep)
759  			goto out_unlock;
760  		pmd_populate(kvm->mm, pmd, new_ptep);
761  		new_ptep = NULL;
762  	}
763  	ptep = pte_offset_kernel(pmd, gpa);
764  	if (pte_present(*ptep)) {
765  		/* Check if someone else set the same thing */
766  		if (pte_raw(*ptep) == pte_raw(pte)) {
767  			ret = 0;
768  			goto out_unlock;
769  		}
770  		/* Valid page here already, add our extra bits */
771  		WARN_ON_ONCE((pte_val(*ptep) ^ pte_val(pte)) &
772  							PTE_BITS_MUST_MATCH);
773  		kvmppc_radix_update_pte(kvm, ptep, 0, pte_val(pte), gpa, 0);
774  		ret = 0;
775  		goto out_unlock;
776  	}
777  	kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
778  	if (rmapp && n_rmap)
779  		kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
780  	ret = 0;
781  
782   out_unlock:
783  	spin_unlock(&kvm->mmu_lock);
784  	if (new_pud)
785  		pud_free(kvm->mm, new_pud);
786  	if (new_pmd)
787  		kvmppc_pmd_free(new_pmd);
788  	if (new_ptep)
789  		kvmppc_pte_free(new_ptep);
790  	return ret;
791  }
792  
793  bool kvmppc_hv_handle_set_rc(struct kvm *kvm, bool nested, bool writing,
794  			     unsigned long gpa, u64 lpid)
795  {
796  	unsigned long pgflags;
797  	unsigned int shift;
798  	pte_t *ptep;
799  
800  	/*
801  	 * Need to set an R or C bit in the 2nd-level tables;
802  	 * since we are just helping out the hardware here,
803  	 * it is sufficient to do what the hardware does.
804  	 */
805  	pgflags = _PAGE_ACCESSED;
806  	if (writing)
807  		pgflags |= _PAGE_DIRTY;
808  
809  	if (nested)
810  		ptep = find_kvm_nested_guest_pte(kvm, lpid, gpa, &shift);
811  	else
812  		ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
813  
814  	if (ptep && pte_present(*ptep) && (!writing || pte_write(*ptep))) {
815  		kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, gpa, shift);
816  		return true;
817  	}
818  	return false;
819  }
820  
821  int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
822  				   unsigned long gpa,
823  				   struct kvm_memory_slot *memslot,
824  				   bool writing, bool kvm_ro,
825  				   pte_t *inserted_pte, unsigned int *levelp)
826  {
827  	struct kvm *kvm = vcpu->kvm;
828  	struct page *page = NULL;
829  	unsigned long mmu_seq;
830  	unsigned long hva, gfn = gpa >> PAGE_SHIFT;
831  	bool upgrade_write = false;
832  	bool *upgrade_p = &upgrade_write;
833  	pte_t pte, *ptep;
834  	unsigned int shift, level;
835  	int ret;
836  	bool large_enable;
837  
838  	/* used to check for invalidations in progress */
839  	mmu_seq = kvm->mmu_invalidate_seq;
840  	smp_rmb();
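	/*
	 * The read barrier orders the mmu_invalidate_seq read above before
	 * the page lookups below, so that mmu_invalidate_retry() in
	 * kvmppc_create_pte() can detect a racing invalidation.
	 */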
841  
842  	/*
843  	 * Do a fast check first, since __gfn_to_pfn_memslot doesn't
844  	 * do it with !atomic && !async, which is how we call it.
845  	 * We always ask for write permission since the common case
846  	 * is that the page is writable.
847  	 */
848  	hva = gfn_to_hva_memslot(memslot, gfn);
849  	if (!kvm_ro && get_user_page_fast_only(hva, FOLL_WRITE, &page)) {
850  		upgrade_write = true;
851  	} else {
852  		unsigned long pfn;
853  
854  		/* Call KVM generic code to do the slow-path check */
855  		pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL,
856  					   writing, upgrade_p, NULL);
857  		if (is_error_noslot_pfn(pfn))
858  			return -EFAULT;
859  		page = NULL;
860  		if (pfn_valid(pfn)) {
861  			page = pfn_to_page(pfn);
862  			if (PageReserved(page))
863  				page = NULL;
864  		}
865  	}
866  
867  	/*
868  	 * Read the PTE from the process' radix tree and use that
869  	 * so we get the shift and attribute bits.
870  	 */
871  	spin_lock(&kvm->mmu_lock);
872  	ptep = find_kvm_host_pte(kvm, mmu_seq, hva, &shift);
873  	pte = __pte(0);
874  	if (ptep)
875  		pte = READ_ONCE(*ptep);
876  	spin_unlock(&kvm->mmu_lock);
877  	/*
878  	 * If the PTE disappeared temporarily due to a THP
879  	 * collapse, just return and let the guest try again.
880  	 */
881  	if (!pte_present(pte)) {
882  		if (page)
883  			put_page(page);
884  		return RESUME_GUEST;
885  	}
886  
887  	/* If we're logging dirty pages, always map single pages */
888  	large_enable = !(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES);
889  
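	/*
	 * A 2MB or 1GB mapping can only be used when the guest physical and
	 * host virtual addresses are congruent modulo the large page size,
	 * since the partition-scoped PTE maps the gpa straight onto the host
	 * page.
	 */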
890  	/* Get pte level from shift/size */
891  	if (large_enable && shift == PUD_SHIFT &&
892  	    (gpa & (PUD_SIZE - PAGE_SIZE)) ==
893  	    (hva & (PUD_SIZE - PAGE_SIZE))) {
894  		level = 2;
895  	} else if (large_enable && shift == PMD_SHIFT &&
896  		   (gpa & (PMD_SIZE - PAGE_SIZE)) ==
897  		   (hva & (PMD_SIZE - PAGE_SIZE))) {
898  		level = 1;
899  	} else {
900  		level = 0;
901  		if (shift > PAGE_SHIFT) {
902  			/*
903  			 * If the pte maps more than one page, bring over
904  			 * bits from the virtual address to get the real
905  			 * address of the specific single page we want.
906  			 */
907  			unsigned long rpnmask = (1ul << shift) - PAGE_SIZE;
908  			pte = __pte(pte_val(pte) | (hva & rpnmask));
909  		}
910  	}
911  
912  	pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED);
913  	if (writing || upgrade_write) {
914  		if (pte_val(pte) & _PAGE_WRITE)
915  			pte = __pte(pte_val(pte) | _PAGE_DIRTY);
916  	} else {
917  		pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY));
918  	}
919  
920  	/* Allocate space in the tree and write the PTE */
921  	ret = kvmppc_create_pte(kvm, kvm->arch.pgtable, pte, gpa, level,
922  				mmu_seq, kvm->arch.lpid, NULL, NULL);
923  	if (inserted_pte)
924  		*inserted_pte = pte;
925  	if (levelp)
926  		*levelp = level;
927  
928  	if (page) {
929  		if (!ret && (pte_val(pte) & _PAGE_WRITE))
930  			set_page_dirty_lock(page);
931  		put_page(page);
932  	}
933  
934  	/* Increment number of large pages if we (successfully) inserted one */
935  	if (!ret) {
936  		if (level == 1)
937  			kvm->stat.num_2M_pages++;
938  		else if (level == 2)
939  			kvm->stat.num_1G_pages++;
940  	}
941  
942  	return ret;
943  }
944  
945  int kvmppc_book3s_radix_page_fault(struct kvm_vcpu *vcpu,
946  				   unsigned long ea, unsigned long dsisr)
947  {
948  	struct kvm *kvm = vcpu->kvm;
949  	unsigned long gpa, gfn;
950  	struct kvm_memory_slot *memslot;
951  	long ret;
952  	bool writing = !!(dsisr & DSISR_ISSTORE);
953  	bool kvm_ro = false;
954  
955  	/* Check for unusual errors */
956  	if (dsisr & DSISR_UNSUPP_MMU) {
957  		pr_err("KVM: Got unsupported MMU fault\n");
958  		return -EFAULT;
959  	}
960  	if (dsisr & DSISR_BADACCESS) {
961  		/* Reflect to the guest as DSI */
962  		pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
963  		kvmppc_core_queue_data_storage(vcpu,
964  				kvmppc_get_msr(vcpu) & SRR1_PREFIXED,
965  				ea, dsisr);
966  		return RESUME_GUEST;
967  	}
968  
969  	/* Translate the logical address */
970  	gpa = vcpu->arch.fault_gpa & ~0xfffUL;
971  	gpa &= ~0xF000000000000000ul;
972  	gfn = gpa >> PAGE_SHIFT;
973  	if (!(dsisr & DSISR_PRTABLE_FAULT))
974  		gpa |= ea & 0xfff;
975  
976  	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
977  		return kvmppc_send_page_to_uv(kvm, gfn);
978  
979  	/* Get the corresponding memslot */
980  	memslot = gfn_to_memslot(kvm, gfn);
981  
982  	/* No memslot means it's an emulated MMIO region */
983  	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
984  		if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS |
985  			     DSISR_SET_RC)) {
986  			/*
987  			 * Bad address in guest page table tree, or other
988  			 * unusual error - reflect it to the guest as DSI.
989  			 */
990  			kvmppc_core_queue_data_storage(vcpu,
991  					kvmppc_get_msr(vcpu) & SRR1_PREFIXED,
992  					ea, dsisr);
993  			return RESUME_GUEST;
994  		}
995  		return kvmppc_hv_emulate_mmio(vcpu, gpa, ea, writing);
996  	}
997  
998  	if (memslot->flags & KVM_MEM_READONLY) {
999  		if (writing) {
1000  			/* give the guest a DSI */
1001  			kvmppc_core_queue_data_storage(vcpu,
1002  					kvmppc_get_msr(vcpu) & SRR1_PREFIXED,
1003  					ea, DSISR_ISSTORE | DSISR_PROTFAULT);
1004  			return RESUME_GUEST;
1005  		}
1006  		kvm_ro = true;
1007  	}
1008  
1009  	/* The hardware could not set the reference/change bits; try to set them here */
1010  	if (dsisr & DSISR_SET_RC) {
1011  		spin_lock(&kvm->mmu_lock);
1012  		if (kvmppc_hv_handle_set_rc(kvm, false, writing,
1013  					    gpa, kvm->arch.lpid))
1014  			dsisr &= ~DSISR_SET_RC;
1015  		spin_unlock(&kvm->mmu_lock);
1016  
1017  		if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
1018  			       DSISR_PROTFAULT | DSISR_SET_RC)))
1019  			return RESUME_GUEST;
1020  	}
1021  
1022  	/* Try to insert a pte */
1023  	ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot, writing,
1024  					     kvm_ro, NULL, NULL);
1025  
1026  	if (ret == 0 || ret == -EAGAIN)
1027  		ret = RESUME_GUEST;
1028  	return ret;
1029  }
1030  
1031  /* Called with kvm->mmu_lock held */
1032  void kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
1033  		     unsigned long gfn)
1034  {
1035  	pte_t *ptep;
1036  	unsigned long gpa = gfn << PAGE_SHIFT;
1037  	unsigned int shift;
1038  
1039  	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) {
1040  		uv_page_inval(kvm->arch.lpid, gpa, PAGE_SHIFT);
1041  		return;
1042  	}
1043  
1044  	ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
1045  	if (ptep && pte_present(*ptep))
1046  		kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
1047  				 kvm->arch.lpid);
1048  }
1049  
1050  /* Called with kvm->mmu_lock held */
1051  bool kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
1052  		   unsigned long gfn)
1053  {
1054  	pte_t *ptep;
1055  	unsigned long gpa = gfn << PAGE_SHIFT;
1056  	unsigned int shift;
1057  	bool ref = false;
1058  	unsigned long old, *rmapp;
1059  
1060  	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
1061  		return ref;
1062  
1063  	ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
1064  	if (ptep && pte_present(*ptep) && pte_young(*ptep)) {
1065  		old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0,
1066  					      gpa, shift);
1067  		/* XXX need to flush tlb here? */
1068  		/* Also clear bit in ptes in shadow pgtable for nested guests */
1069  		rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
1070  		kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_ACCESSED, 0,
1071  					       old & PTE_RPN_MASK,
1072  					       1UL << shift);
1073  		ref = true;
1074  	}
1075  	return ref;
1076  }
1077  
1078  /* Called with kvm->mmu_lock held */
1079  bool kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
1080  			unsigned long gfn)
1081  
1082  {
1083  	pte_t *ptep;
1084  	unsigned long gpa = gfn << PAGE_SHIFT;
1085  	unsigned int shift;
1086  	bool ref = false;
1087  
1088  	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
1089  		return ref;
1090  
1091  	ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
1092  	if (ptep && pte_present(*ptep) && pte_young(*ptep))
1093  		ref = true;
1094  	return ref;
1095  }
1096  
1097  /* Returns the number of PAGE_SIZE pages that are dirty */
1098  static int kvm_radix_test_clear_dirty(struct kvm *kvm,
1099  				struct kvm_memory_slot *memslot, int pagenum)
1100  {
1101  	unsigned long gfn = memslot->base_gfn + pagenum;
1102  	unsigned long gpa = gfn << PAGE_SHIFT;
1103  	pte_t *ptep, pte;
1104  	unsigned int shift;
1105  	int ret = 0;
1106  	unsigned long old, *rmapp;
1107  
1108  	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
1109  		return ret;
1110  
1111  	/*
1112  	 * For performance reasons we don't hold kvm->mmu_lock while walking the
1113  	 * partition scoped table.
1114  	 */
1115  	ptep = find_kvm_secondary_pte_unlocked(kvm, gpa, &shift);
1116  	if (!ptep)
1117  		return 0;
1118  
1119  	pte = READ_ONCE(*ptep);
1120  	if (pte_present(pte) && pte_dirty(pte)) {
1121  		spin_lock(&kvm->mmu_lock);
1122  		/*
1123  		 * Recheck the pte now that we hold the mmu_lock
1124  		 */
1125  		if (pte_val(pte) != pte_val(*ptep)) {
1126  			/*
1127  			 * We have KVM_MEM_LOG_DIRTY_PAGES enabled. Hence we can
1128  			 * only find PAGE_SIZE pte entries here. We can continue
1129  			 * to use the pte address returned by the above page table
1130  			 * walk.
1131  			 */
1132  			if (!pte_present(*ptep) || !pte_dirty(*ptep)) {
1133  				spin_unlock(&kvm->mmu_lock);
1134  				return 0;
1135  			}
1136  		}
1137  
1138  		ret = 1;
1139  		VM_BUG_ON(shift);
1140  		old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0,
1141  					      gpa, shift);
1142  		kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid);
1143  		/* Also clear bit in ptes in shadow pgtable for nested guests */
1144  		rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
1145  		kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_DIRTY, 0,
1146  					       old & PTE_RPN_MASK,
1147  					       1UL << shift);
1148  		spin_unlock(&kvm->mmu_lock);
1149  	}
1150  	return ret;
1151  }
1152  
1153  long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
1154  			struct kvm_memory_slot *memslot, unsigned long *map)
1155  {
1156  	unsigned long i, j;
1157  	int npages;
1158  
1159  	for (i = 0; i < memslot->npages; i = j) {
1160  		npages = kvm_radix_test_clear_dirty(kvm, memslot, i);
1161  
1162  		/*
1163  		 * Note that if npages > 0 then i must be a multiple of npages,
1164  		 * since huge pages are only used to back the guest at guest
1165  		 * real addresses that are a multiple of their size.
1166  		 * Since we have at most one PTE covering any given guest
1167  		 * real address, if npages > 1 we can skip to i + npages.
1168  		 */
1169  		j = i + 1;
1170  		if (npages) {
1171  			set_dirty_bits(map, i, npages);
1172  			j = i + npages;
1173  		}
1174  	}
1175  	return 0;
1176  }
1177  
1178  void kvmppc_radix_flush_memslot(struct kvm *kvm,
1179  				const struct kvm_memory_slot *memslot)
1180  {
1181  	unsigned long n;
1182  	pte_t *ptep;
1183  	unsigned long gpa;
1184  	unsigned int shift;
1185  
1186  	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START)
1187  		kvmppc_uvmem_drop_pages(memslot, kvm, true);
1188  
1189  	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
1190  		return;
1191  
1192  	gpa = memslot->base_gfn << PAGE_SHIFT;
1193  	spin_lock(&kvm->mmu_lock);
1194  	for (n = memslot->npages; n; --n) {
1195  		ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
1196  		if (ptep && pte_present(*ptep))
1197  			kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
1198  					 kvm->arch.lpid);
1199  		gpa += PAGE_SIZE;
1200  	}
1201  	/*
1202  	 * Increase the mmu notifier sequence number to prevent any page
1203  	 * fault that read the memslot earlier from writing a PTE.
1204  	 */
1205  	kvm->mmu_invalidate_seq++;
1206  	spin_unlock(&kvm->mmu_lock);
1207  }
1208  
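/*
 * Pack one supported page size into kvm_ppc_rmmu_info: the low bits hold
 * the page shift and bits 29 and up hold the "AP" (actual page size)
 * encoding for that size.
 */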
1209  static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info,
1210  				 int psize, int *indexp)
1211  {
1212  	if (!mmu_psize_defs[psize].shift)
1213  		return;
1214  	info->ap_encodings[*indexp] = mmu_psize_defs[psize].shift |
1215  		(mmu_psize_defs[psize].ap << 29);
1216  	++(*indexp);
1217  }
1218  
1219  int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info)
1220  {
1221  	int i;
1222  
1223  	if (!radix_enabled())
1224  		return -EINVAL;
1225  	memset(info, 0, sizeof(*info));
1226  
1227  	/* 4k page size */
1228  	info->geometries[0].page_shift = 12;
1229  	info->geometries[0].level_bits[0] = 9;
1230  	for (i = 1; i < 4; ++i)
1231  		info->geometries[0].level_bits[i] = p9_supported_radix_bits[i];
1232  	/* 64k page size */
1233  	info->geometries[1].page_shift = 16;
1234  	for (i = 0; i < 4; ++i)
1235  		info->geometries[1].level_bits[i] = p9_supported_radix_bits[i];
1236  
1237  	i = 0;
1238  	add_rmmu_ap_encoding(info, MMU_PAGE_4K, &i);
1239  	add_rmmu_ap_encoding(info, MMU_PAGE_64K, &i);
1240  	add_rmmu_ap_encoding(info, MMU_PAGE_2M, &i);
1241  	add_rmmu_ap_encoding(info, MMU_PAGE_1G, &i);
1242  
1243  	return 0;
1244  }
1245  
1246  int kvmppc_init_vm_radix(struct kvm *kvm)
1247  {
1248  	kvm->arch.pgtable = pgd_alloc(kvm->mm);
1249  	if (!kvm->arch.pgtable)
1250  		return -ENOMEM;
1251  	return 0;
1252  }
1253  
1254  static void pte_ctor(void *addr)
1255  {
1256  	memset(addr, 0, RADIX_PTE_TABLE_SIZE);
1257  }
1258  
1259  static void pmd_ctor(void *addr)
1260  {
1261  	memset(addr, 0, RADIX_PMD_TABLE_SIZE);
1262  }
1263  
1264  struct debugfs_radix_state {
1265  	struct kvm	*kvm;
1266  	struct mutex	mutex;
1267  	unsigned long	gpa;
1268  	int		lpid;
1269  	int		chars_left;
1270  	int		buf_index;
1271  	char		buf[128];
1272  	u8		hdr;
1273  };
1274  
1275  static int debugfs_radix_open(struct inode *inode, struct file *file)
1276  {
1277  	struct kvm *kvm = inode->i_private;
1278  	struct debugfs_radix_state *p;
1279  
1280  	p = kzalloc(sizeof(*p), GFP_KERNEL);
1281  	if (!p)
1282  		return -ENOMEM;
1283  
1284  	kvm_get_kvm(kvm);
1285  	p->kvm = kvm;
1286  	mutex_init(&p->mutex);
1287  	file->private_data = p;
1288  
1289  	return nonseekable_open(inode, file);
1290  }
1291  
1292  static int debugfs_radix_release(struct inode *inode, struct file *file)
1293  {
1294  	struct debugfs_radix_state *p = file->private_data;
1295  
1296  	kvm_put_kvm(p->kvm);
1297  	kfree(p);
1298  	return 0;
1299  }
1300  
1301  static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
1302  				 size_t len, loff_t *ppos)
1303  {
1304  	struct debugfs_radix_state *p = file->private_data;
1305  	ssize_t ret, r;
1306  	unsigned long n;
1307  	struct kvm *kvm;
1308  	unsigned long gpa;
1309  	pgd_t *pgt;
1310  	struct kvm_nested_guest *nested;
1311  	pgd_t *pgdp;
1312  	p4d_t p4d, *p4dp;
1313  	pud_t pud, *pudp;
1314  	pmd_t pmd, *pmdp;
1315  	pte_t *ptep;
1316  	int shift;
1317  	unsigned long pte;
1318  
1319  	kvm = p->kvm;
1320  	if (!kvm_is_radix(kvm))
1321  		return 0;
1322  
1323  	ret = mutex_lock_interruptible(&p->mutex);
1324  	if (ret)
1325  		return ret;
1326  
1327  	if (p->chars_left) {
1328  		n = p->chars_left;
1329  		if (n > len)
1330  			n = len;
1331  		r = copy_to_user(buf, p->buf + p->buf_index, n);
1332  		n -= r;
1333  		p->chars_left -= n;
1334  		p->buf_index += n;
1335  		buf += n;
1336  		len -= n;
1337  		ret = n;
1338  		if (r) {
1339  			if (!n)
1340  				ret = -EFAULT;
1341  			goto out;
1342  		}
1343  	}
1344  
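	/*
	 * Walk the shadow page table of each LPID (the guest's own
	 * partition-scoped table for lpid 0, nested guests' shadow tables
	 * otherwise), emitting one line per valid leaf PTE in the form
	 * " gpa: pte shift".
	 */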
1345  	gpa = p->gpa;
1346  	nested = NULL;
1347  	pgt = NULL;
1348  	while (len != 0 && p->lpid >= 0) {
1349  		if (gpa >= RADIX_PGTABLE_RANGE) {
1350  			gpa = 0;
1351  			pgt = NULL;
1352  			if (nested) {
1353  				kvmhv_put_nested(nested);
1354  				nested = NULL;
1355  			}
1356  			p->lpid = kvmhv_nested_next_lpid(kvm, p->lpid);
1357  			p->hdr = 0;
1358  			if (p->lpid < 0)
1359  				break;
1360  		}
1361  		if (!pgt) {
1362  			if (p->lpid == 0) {
1363  				pgt = kvm->arch.pgtable;
1364  			} else {
1365  				nested = kvmhv_get_nested(kvm, p->lpid, false);
1366  				if (!nested) {
1367  					gpa = RADIX_PGTABLE_RANGE;
1368  					continue;
1369  				}
1370  				pgt = nested->shadow_pgtable;
1371  			}
1372  		}
1373  		n = 0;
1374  		if (!p->hdr) {
1375  			if (p->lpid > 0)
1376  				n = scnprintf(p->buf, sizeof(p->buf),
1377  					      "\nNested LPID %d: ", p->lpid);
1378  			n += scnprintf(p->buf + n, sizeof(p->buf) - n,
1379  				      "pgdir: %lx\n", (unsigned long)pgt);
1380  			p->hdr = 1;
1381  			goto copy;
1382  		}
1383  
1384  		pgdp = pgt + pgd_index(gpa);
1385  		p4dp = p4d_offset(pgdp, gpa);
1386  		p4d = READ_ONCE(*p4dp);
1387  		if (!(p4d_val(p4d) & _PAGE_PRESENT)) {
1388  			gpa = (gpa & P4D_MASK) + P4D_SIZE;
1389  			continue;
1390  		}
1391  
1392  		pudp = pud_offset(&p4d, gpa);
1393  		pud = READ_ONCE(*pudp);
1394  		if (!(pud_val(pud) & _PAGE_PRESENT)) {
1395  			gpa = (gpa & PUD_MASK) + PUD_SIZE;
1396  			continue;
1397  		}
1398  		if (pud_val(pud) & _PAGE_PTE) {
1399  			pte = pud_val(pud);
1400  			shift = PUD_SHIFT;
1401  			goto leaf;
1402  		}
1403  
1404  		pmdp = pmd_offset(&pud, gpa);
1405  		pmd = READ_ONCE(*pmdp);
1406  		if (!(pmd_val(pmd) & _PAGE_PRESENT)) {
1407  			gpa = (gpa & PMD_MASK) + PMD_SIZE;
1408  			continue;
1409  		}
1410  		if (pmd_val(pmd) & _PAGE_PTE) {
1411  			pte = pmd_val(pmd);
1412  			shift = PMD_SHIFT;
1413  			goto leaf;
1414  		}
1415  
1416  		ptep = pte_offset_kernel(&pmd, gpa);
1417  		pte = pte_val(READ_ONCE(*ptep));
1418  		if (!(pte & _PAGE_PRESENT)) {
1419  			gpa += PAGE_SIZE;
1420  			continue;
1421  		}
1422  		shift = PAGE_SHIFT;
1423  	leaf:
1424  		n = scnprintf(p->buf, sizeof(p->buf),
1425  			      " %lx: %lx %d\n", gpa, pte, shift);
1426  		gpa += 1ul << shift;
1427  	copy:
1428  		p->chars_left = n;
1429  		if (n > len)
1430  			n = len;
1431  		r = copy_to_user(buf, p->buf, n);
1432  		n -= r;
1433  		p->chars_left -= n;
1434  		p->buf_index = n;
1435  		buf += n;
1436  		len -= n;
1437  		ret += n;
1438  		if (r) {
1439  			if (!ret)
1440  				ret = -EFAULT;
1441  			break;
1442  		}
1443  	}
1444  	p->gpa = gpa;
1445  	if (nested)
1446  		kvmhv_put_nested(nested);
1447  
1448   out:
1449  	mutex_unlock(&p->mutex);
1450  	return ret;
1451  }
1452  
1453  static ssize_t debugfs_radix_write(struct file *file, const char __user *buf,
1454  			   size_t len, loff_t *ppos)
1455  {
1456  	return -EACCES;
1457  }
1458  
1459  static const struct file_operations debugfs_radix_fops = {
1460  	.owner	 = THIS_MODULE,
1461  	.open	 = debugfs_radix_open,
1462  	.release = debugfs_radix_release,
1463  	.read	 = debugfs_radix_read,
1464  	.write	 = debugfs_radix_write,
1465  	.llseek	 = generic_file_llseek,
1466  };
1467  
1468  void kvmhv_radix_debugfs_init(struct kvm *kvm)
1469  {
1470  	debugfs_create_file("radix", 0400, kvm->debugfs_dentry, kvm,
1471  			    &debugfs_radix_fops);
1472  }
1473  
1474  int kvmppc_radix_init(void)
1475  {
1476  	unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE;
1477  
1478  	kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, 0, pte_ctor);
1479  	if (!kvm_pte_cache)
1480  		return -ENOMEM;
1481  
1482  	size = sizeof(void *) << RADIX_PMD_INDEX_SIZE;
1483  
1484  	kvm_pmd_cache = kmem_cache_create("kvm-pmd", size, size, 0, pmd_ctor);
1485  	if (!kvm_pmd_cache) {
1486  		kmem_cache_destroy(kvm_pte_cache);
1487  		return -ENOMEM;
1488  	}
1489  
1490  	return 0;
1491  }
1492  
1493  void kvmppc_radix_exit(void)
1494  {
1495  	kmem_cache_destroy(kvm_pte_cache);
1496  	kmem_cache_destroy(kvm_pmd_cache);
1497  }
1498