// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine -- Performance Monitoring Unit support
 *
 * Copyright 2015 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@redhat.com>
 *   Gleb Natapov <gleb@redhat.com>
 *   Wei Huang    <wei@redhat.com>
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/types.h>
#include <linux/kvm_host.h>
#include <linux/perf_event.h>
#include <linux/bsearch.h>
#include <linux/sort.h>
#include <asm/perf_event.h>
#include <asm/cpu_device_id.h>
#include "x86.h"
#include "cpuid.h"
#include "lapic.h"
#include "pmu.h"

/* This is enough to filter the vast majority of currently defined events. */
#define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300

struct x86_pmu_capability __read_mostly kvm_pmu_cap;
EXPORT_SYMBOL_GPL(kvm_pmu_cap);

struct kvm_pmu_emulated_event_selectors __read_mostly kvm_pmu_eventsel;
EXPORT_SYMBOL_GPL(kvm_pmu_eventsel);

/* Precise Distribution of Instructions Retired (PDIR) */
static const struct x86_cpu_id vmx_pebs_pdir_cpu[] = {
	X86_MATCH_VFM(INTEL_ICELAKE_D, NULL),
	X86_MATCH_VFM(INTEL_ICELAKE_X, NULL),
	/* Instruction-Accurate PDIR (PDIR++) */
	X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, NULL),
	{}
};

/* Precise Distribution (PDist) */
static const struct x86_cpu_id vmx_pebs_pdist_cpu[] = {
	X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, NULL),
	{}
};

/* NOTE:
 * - Each perf counter is defined as "struct kvm_pmc";
 * - There are two types of perf counters: general purpose (gp) and fixed.
 *   gp counters are stored in gp_counters[] and fixed counters are stored
 *   in fixed_counters[] respectively. Both of them are part of "struct
 *   kvm_pmu";
 * - pmu.c understands the difference between gp counters and fixed counters.
 *   However AMD doesn't support fixed counters;
 * - There are three types of index to access perf counters (PMC):
 *     1. MSR (named msr): For example Intel has MSR_IA32_PERFCTRn and AMD
 *        has MSR_K7_PERFCTRn and, for families 15H and later,
 *        MSR_F15H_PERF_CTRn, where MSR_F15H_PERF_CTR[0-3] are
 *        aliased to MSR_K7_PERFCTRn.
 *     2. MSR Index (named idx): This is normally used by the RDPMC
 *        instruction. For instance AMD RDPMC uses 0000_0003h in ECX to access
 *        C001_0007h (MSR_K7_PERFCTR3). Intel has a similar mechanism, except
 *        that it also supports fixed counters. idx can be used as an index
 *        into the gp and fixed counters.
 *     3. Global PMC Index (named pmc): pmc is an index specific to PMU
 *        code. Each pmc, stored in the kvm_pmc.idx field, is unique across
 *        all perf counters (both gp and fixed). The mapping relationship
 *        between pmc and perf counters is as the following:
 *        * Intel: [0 .. KVM_MAX_NR_INTEL_GP_COUNTERS-1] <=> gp counters
 *                 [KVM_FIXED_PMC_BASE_IDX .. KVM_FIXED_PMC_BASE_IDX + 2] <=> fixed
 *        * AMD:   [0 .. AMD64_NUM_COUNTERS-1] and, for families 15H
 *          and later, [0 .. AMD64_NUM_COUNTERS_CORE-1] <=> gp counters
 */

static struct kvm_pmu_ops kvm_pmu_ops __read_mostly;

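/*
 * Declare a NULL static call for every vendor PMU op listed in
 * asm/kvm-x86-pmu-ops.h; the calls are patched to point at the vendor
 * implementation by kvm_pmu_ops_update() when the vendor module loads.
 */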
#define KVM_X86_PMU_OP(func)					     \
	DEFINE_STATIC_CALL_NULL(kvm_x86_pmu_##func,			     \
				*(((struct kvm_pmu_ops *)0)->func));
#define KVM_X86_PMU_OP_OPTIONAL KVM_X86_PMU_OP
#include <asm/kvm-x86-pmu-ops.h>

void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops)
{
	memcpy(&kvm_pmu_ops, pmu_ops, sizeof(kvm_pmu_ops));

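	/*
	 * Wire up the static calls.  Mandatory ops (KVM_X86_PMU_OP) WARN if
	 * the vendor implementation is missing; optional ops may be NULL.
	 */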
#define __KVM_X86_PMU_OP(func) \
	static_call_update(kvm_x86_pmu_##func, kvm_pmu_ops.func);
#define KVM_X86_PMU_OP(func) \
	WARN_ON(!kvm_pmu_ops.func); __KVM_X86_PMU_OP(func)
#define KVM_X86_PMU_OP_OPTIONAL __KVM_X86_PMU_OP
#include <asm/kvm-x86-pmu-ops.h>
#undef __KVM_X86_PMU_OP
}

static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
{
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
	bool skip_pmi = false;

	if (pmc->perf_event && pmc->perf_event->attr.precise_ip) {
		if (!in_pmi) {
			/*
			 * TODO: KVM is currently _choosing_ to not generate
			 * records for emulated instructions, which avoids a
			 * BUFFER_OVF PMI when there are no records.  Strictly
			 * speaking, records should be generated in the right
			 * context as well to improve sampling accuracy.
			 */
			skip_pmi = true;
		} else {
			/* Indicate PEBS overflow PMI to guest. */
			skip_pmi = __test_and_set_bit(GLOBAL_STATUS_BUFFER_OVF_BIT,
						      (unsigned long *)&pmu->global_status);
		}
	} else {
		__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
	}

	if (pmc->intr && !skip_pmi)
		kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
}

static void kvm_perf_overflow(struct perf_event *perf_event,
			      struct perf_sample_data *data,
			      struct pt_regs *regs)
{
	struct kvm_pmc *pmc = perf_event->overflow_handler_context;

	/*
	 * Ignore asynchronous overflow events for counters that are scheduled
	 * to be reprogrammed, e.g. if a PMI for the previous event races with
	 * KVM's handling of a related guest WRMSR.
	 */
	if (test_and_set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi))
		return;

	__kvm_perf_overflow(pmc, true);

	kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
}

static u64 pmc_get_pebs_precise_level(struct kvm_pmc *pmc)
{
	/*
	 * For some model-specific PEBS counters with special capabilities
	 * (PDIR, PDIR++, PDist), KVM needs to raise the event's precise
	 * level to the maximum value (currently 3, backwards compatible)
	 * so that the perf subsystem assigns a hardware counter with that
	 * capability to the vPMC.
	 */
	if ((pmc->idx == 0 && x86_match_cpu(vmx_pebs_pdist_cpu)) ||
	    (pmc->idx == 32 && x86_match_cpu(vmx_pebs_pdir_cpu)))
		return 3;

	/*
	 * A non-zero precision level turns an ordinary guest event into a
	 * guest PEBS event, and triggers the host PEBS PMI handler to
	 * determine whether the PEBS overflow PMI comes from the host
	 * counters or the guest.
	 */
	return 1;
}

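/*
 * The sample period is the number of events remaining until the counter
 * overflows, i.e. the two's complement of the current counter value,
 * truncated to the counter's width.
 */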
static u64 get_sample_period(struct kvm_pmc *pmc, u64 counter_value)
{
	u64 sample_period = (-counter_value) & pmc_bitmask(pmc);

	if (!sample_period)
		sample_period = pmc_bitmask(pmc) + 1;
	return sample_period;
}

static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config,
				 bool exclude_user, bool exclude_kernel,
				 bool intr)
{
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
	struct perf_event *event;
	struct perf_event_attr attr = {
		.type = type,
		.size = sizeof(attr),
		.pinned = true,
		.exclude_idle = true,
		.exclude_host = 1,
		.exclude_user = exclude_user,
		.exclude_kernel = exclude_kernel,
		.config = config,
	};
	bool pebs = test_bit(pmc->idx, (unsigned long *)&pmu->pebs_enable);

	attr.sample_period = get_sample_period(pmc, pmc->counter);

	if ((attr.config & HSW_IN_TX_CHECKPOINTED) &&
	    (boot_cpu_has(X86_FEATURE_RTM) || boot_cpu_has(X86_FEATURE_HLE))) {
		/*
		 * HSW_IN_TX_CHECKPOINTED is not supported with nonzero
		 * period. Just clear the sample period so at least
		 * allocating the counter doesn't fail.
		 */
		attr.sample_period = 0;
	}
	if (pebs) {
		/*
		 * For most PEBS hardware events, the difference in the software
		 * precision levels of guest and host PEBS events will not affect
		 * the accuracy of the PEBS profiling result, because the "event IP"
		 * in the PEBS record is calibrated on the guest side.
		 */
		attr.precise_ip = pmc_get_pebs_precise_level(pmc);
	}

	event = perf_event_create_kernel_counter(&attr, -1, current,
						 kvm_perf_overflow, pmc);
	if (IS_ERR(event)) {
		pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n",
			    PTR_ERR(event), pmc->idx);
		return PTR_ERR(event);
	}

	pmc->perf_event = event;
	pmc_to_pmu(pmc)->event_count++;
	pmc->is_paused = false;
	pmc->intr = intr || pebs;
	return 0;
}

static bool pmc_pause_counter(struct kvm_pmc *pmc)
{
	u64 counter = pmc->counter;
	u64 prev_counter;

	/* update counter, reset event value to avoid redundant accumulation */
	if (pmc->perf_event && !pmc->is_paused)
		counter += perf_event_pause(pmc->perf_event, true);

	/*
	 * Snapshot the previous counter *after* accumulating state from perf.
	 * If overflow already happened, hardware (via perf) is responsible for
	 * generating a PMI.  KVM just needs to detect overflow on emulated
	 * counter events that haven't yet been processed.
	 */
	prev_counter = counter & pmc_bitmask(pmc);

	counter += pmc->emulated_counter;
	pmc->counter = counter & pmc_bitmask(pmc);

	pmc->emulated_counter = 0;
	pmc->is_paused = true;

	return pmc->counter < prev_counter;
}

static bool pmc_resume_counter(struct kvm_pmc *pmc)
{
	if (!pmc->perf_event)
		return false;

	/* recalibrate sample period and check if it's accepted by perf core */
	if (is_sampling_event(pmc->perf_event) &&
	    perf_event_period(pmc->perf_event,
			      get_sample_period(pmc, pmc->counter)))
		return false;

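	/*
	 * The PEBS state of the vPMC must match the existing perf_event's
	 * configuration, i.e. a counter can't switch between PEBS and
	 * non-PEBS without being reprogrammed from scratch.
	 */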
	if (test_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->pebs_enable) !=
	    (!!pmc->perf_event->attr.precise_ip))
		return false;

	/* reuse perf_event to serve as pmc_reprogram_counter() does */
	perf_event_enable(pmc->perf_event);
	pmc->is_paused = false;

	return true;
}

static void pmc_release_perf_event(struct kvm_pmc *pmc)
{
	if (pmc->perf_event) {
		perf_event_release_kernel(pmc->perf_event);
		pmc->perf_event = NULL;
		pmc->current_config = 0;
		pmc_to_pmu(pmc)->event_count--;
	}
}

static void pmc_stop_counter(struct kvm_pmc *pmc)
{
	if (pmc->perf_event) {
		pmc->counter = pmc_read_counter(pmc);
		pmc_release_perf_event(pmc);
	}
}

static void pmc_update_sample_period(struct kvm_pmc *pmc)
{
	if (!pmc->perf_event || pmc->is_paused ||
	    !is_sampling_event(pmc->perf_event))
		return;

	perf_event_period(pmc->perf_event,
			  get_sample_period(pmc, pmc->counter));
}

void pmc_write_counter(struct kvm_pmc *pmc, u64 val)
{
	/*
	 * Drop any unconsumed accumulated counts, the WRMSR is a write, not a
	 * read-modify-write.  Adjust the counter value so that its value is
	 * relative to the current count, as reading the current count from
	 * perf is faster than pausing and reprogramming the event in order to
	 * reset it to '0'.  Note, this very sneakily offsets the accumulated
	 * emulated count too, by using pmc_read_counter()!
	 */
	pmc->emulated_counter = 0;
	pmc->counter += val - pmc_read_counter(pmc);
	pmc->counter &= pmc_bitmask(pmc);
	pmc_update_sample_period(pmc);
}
EXPORT_SYMBOL_GPL(pmc_write_counter);

static int filter_cmp(const void *pa, const void *pb, u64 mask)
{
	u64 a = *(u64 *)pa & mask;
	u64 b = *(u64 *)pb & mask;

	return (a > b) - (a < b);
}


static int filter_sort_cmp(const void *pa, const void *pb)
{
	return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT |
				   KVM_PMU_MASKED_ENTRY_EXCLUDE));
}

/*
 * For the event filter, searching is done on the 'includes' list and
 * 'excludes' list separately rather than on the 'events' list (which
 * has both).  As a result the exclude bit can be ignored.
 */
static int filter_event_cmp(const void *pa, const void *pb)
{
	return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT));
}

static int find_filter_index(u64 *events, u64 nevents, u64 key)
{
	u64 *fe = bsearch(&key, events, nevents, sizeof(events[0]),
			  filter_event_cmp);

	if (!fe)
		return -1;

	return fe - events;
}

static bool is_filter_entry_match(u64 filter_event, u64 umask)
{
	u64 mask = filter_event >> (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8);
	u64 match = filter_event & KVM_PMU_MASKED_ENTRY_UMASK_MATCH;

	BUILD_BUG_ON((KVM_PMU_ENCODE_MASKED_ENTRY(0, 0xff, 0, false) >>
		     (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8)) !=
		     ARCH_PERFMON_EVENTSEL_UMASK);

	return (umask & mask) == match;
}

static bool filter_contains_match(u64 *events, u64 nevents, u64 eventsel)
{
	u64 event_select = eventsel & kvm_pmu_ops.EVENTSEL_EVENT;
	u64 umask = eventsel & ARCH_PERFMON_EVENTSEL_UMASK;
	int i, index;

	index = find_filter_index(events, nevents, event_select);
	if (index < 0)
		return false;

	/*
	 * Entries are sorted by the event select.  Walk the list in both
	 * directions to process all entries with the targeted event select.
	 */
	for (i = index; i < nevents; i++) {
		if (filter_event_cmp(&events[i], &event_select))
			break;

		if (is_filter_entry_match(events[i], umask))
			return true;
	}

	for (i = index - 1; i >= 0; i--) {
		if (filter_event_cmp(&events[i], &event_select))
			break;

		if (is_filter_entry_match(events[i], umask))
			return true;
	}

	return false;
}

static bool is_gp_event_allowed(struct kvm_x86_pmu_event_filter *f,
				u64 eventsel)
{
	if (filter_contains_match(f->includes, f->nr_includes, eventsel) &&
	    !filter_contains_match(f->excludes, f->nr_excludes, eventsel))
		return f->action == KVM_PMU_EVENT_ALLOW;

	return f->action == KVM_PMU_EVENT_DENY;
}

static bool is_fixed_event_allowed(struct kvm_x86_pmu_event_filter *filter,
				   int idx)
{
	int fixed_idx = idx - KVM_FIXED_PMC_BASE_IDX;

	if (filter->action == KVM_PMU_EVENT_DENY &&
	    test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap))
		return false;
	if (filter->action == KVM_PMU_EVENT_ALLOW &&
	    !test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap))
		return false;

	return true;
}

static bool check_pmu_event_filter(struct kvm_pmc *pmc)
{
	struct kvm_x86_pmu_event_filter *filter;
	struct kvm *kvm = pmc->vcpu->kvm;

	filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
	if (!filter)
		return true;

	if (pmc_is_gp(pmc))
		return is_gp_event_allowed(filter, pmc->eventsel);

	return is_fixed_event_allowed(filter, pmc->idx);
}

static bool pmc_event_is_allowed(struct kvm_pmc *pmc)
{
	return pmc_is_globally_enabled(pmc) && pmc_speculative_in_use(pmc) &&
	       check_pmu_event_filter(pmc);
}

static int reprogram_counter(struct kvm_pmc *pmc)
{
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
	u64 eventsel = pmc->eventsel;
	u64 new_config = eventsel;
	bool emulate_overflow;
	u8 fixed_ctr_ctrl;

	emulate_overflow = pmc_pause_counter(pmc);

	if (!pmc_event_is_allowed(pmc))
		return 0;

	if (emulate_overflow)
		__kvm_perf_overflow(pmc, false);

	if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
		printk_once("kvm pmu: pin control bit is ignored\n");

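	/*
	 * For fixed counters, translate the fixed counter control field into
	 * the equivalent eventsel bits (OS/USR/INT) so that the common
	 * exclude_user/exclude_kernel/intr logic below applies to both GP
	 * and fixed counters, and track the control field as the current
	 * config.
	 */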
	if (pmc_is_fixed(pmc)) {
		fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl,
						  pmc->idx - KVM_FIXED_PMC_BASE_IDX);
		if (fixed_ctr_ctrl & INTEL_FIXED_0_KERNEL)
			eventsel |= ARCH_PERFMON_EVENTSEL_OS;
		if (fixed_ctr_ctrl & INTEL_FIXED_0_USER)
			eventsel |= ARCH_PERFMON_EVENTSEL_USR;
		if (fixed_ctr_ctrl & INTEL_FIXED_0_ENABLE_PMI)
			eventsel |= ARCH_PERFMON_EVENTSEL_INT;
		new_config = (u64)fixed_ctr_ctrl;
	}

	if (pmc->current_config == new_config && pmc_resume_counter(pmc))
		return 0;

	pmc_release_perf_event(pmc);

	pmc->current_config = new_config;

	return pmc_reprogram_counter(pmc, PERF_TYPE_RAW,
				     (eventsel & pmu->raw_event_mask),
				     !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
				     !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
				     eventsel & ARCH_PERFMON_EVENTSEL_INT);
}

void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
{
	DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX);
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	int bit;

	bitmap_copy(bitmap, pmu->reprogram_pmi, X86_PMC_IDX_MAX);

	/*
	 * The reprogramming bitmap can be written asynchronously by something
	 * other than the task that holds vcpu->mutex, take care to clear only
	 * the bits that will actually be processed.
	 */
	BUILD_BUG_ON(sizeof(bitmap) != sizeof(atomic64_t));
	atomic64_andnot(*(s64 *)bitmap, &pmu->__reprogram_pmi);

	kvm_for_each_pmc(pmu, pmc, bit, bitmap) {
		/*
		 * If reprogramming fails, e.g. due to contention, set the
		 * reprogram bit again, i.e. opportunistically try again on
		 * the next PMU refresh.  Don't make a new request as doing so
		 * can stall the guest if reprogramming repeatedly fails.
		 */
		if (reprogram_counter(pmc))
			set_bit(pmc->idx, pmu->reprogram_pmi);
	}

	/*
	 * Release unused perf_events if the corresponding guest MSRs weren't
	 * accessed during the last vCPU time slice (need_cleanup is set when
	 * the vCPU is scheduled back in).
	 */
	if (unlikely(pmu->need_cleanup))
		kvm_pmu_cleanup(vcpu);
}

int kvm_pmu_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx)
{
	/*
	 * On Intel, VMX interception has priority over RDPMC exceptions that
	 * aren't already handled by the emulator, i.e. no additional checks
	 * are needed for Intel PMUs.
	 *
	 * On AMD, _all_ exceptions on RDPMC have priority over SVM intercepts,
	 * i.e. an invalid PMC results in a #GP, not #VMEXIT.
	 */
	if (!kvm_pmu_ops.check_rdpmc_early)
		return 0;

	return kvm_pmu_call(check_rdpmc_early)(vcpu, idx);
}

bool is_vmware_backdoor_pmc(u32 pmc_idx)
{
	switch (pmc_idx) {
	case VMWARE_BACKDOOR_PMC_HOST_TSC:
	case VMWARE_BACKDOOR_PMC_REAL_TIME:
	case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
		return true;
	}
	return false;
}

static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
	u64 ctr_val;

	switch (idx) {
	case VMWARE_BACKDOOR_PMC_HOST_TSC:
		ctr_val = rdtsc();
		break;
	case VMWARE_BACKDOOR_PMC_REAL_TIME:
		ctr_val = ktime_get_boottime_ns();
		break;
	case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
		ctr_val = ktime_get_boottime_ns() +
			vcpu->kvm->arch.kvmclock_offset;
		break;
	default:
		return 1;
	}

	*data = ctr_val;
	return 0;
}

int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	u64 mask = ~0ull;

	if (!pmu->version)
		return 1;

	if (is_vmware_backdoor_pmc(idx))
		return kvm_pmu_rdpmc_vmware(vcpu, idx, data);

	pmc = kvm_pmu_call(rdpmc_ecx_to_pmc)(vcpu, idx, &mask);
	if (!pmc)
		return 1;

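	/*
	 * RDPMC from CPL > 0 in protected mode is allowed only if CR4.PCE is
	 * set; otherwise reject the read.
	 */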
	if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_PCE) &&
	    (kvm_x86_call(get_cpl)(vcpu) != 0) &&
	    kvm_is_cr0_bit_set(vcpu, X86_CR0_PE))
		return 1;

	*data = pmc_read_counter(pmc) & mask;
	return 0;
}

void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
{
	if (lapic_in_kernel(vcpu)) {
		kvm_pmu_call(deliver_pmi)(vcpu);
		kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
	}
}

bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
{
	switch (msr) {
	case MSR_CORE_PERF_GLOBAL_STATUS:
	case MSR_CORE_PERF_GLOBAL_CTRL:
	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
		return kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu));
	default:
		break;
	}
	return kvm_pmu_call(msr_idx_to_pmc)(vcpu, msr) ||
	       kvm_pmu_call(is_valid_msr)(vcpu, msr);
}

static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc = kvm_pmu_call(msr_idx_to_pmc)(vcpu, msr);

	if (pmc)
		__set_bit(pmc->idx, pmu->pmc_in_use);
}

int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	u32 msr = msr_info->index;

	switch (msr) {
	case MSR_CORE_PERF_GLOBAL_STATUS:
	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
		msr_info->data = pmu->global_status;
		break;
	case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
	case MSR_CORE_PERF_GLOBAL_CTRL:
		msr_info->data = pmu->global_ctrl;
		break;
	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
		msr_info->data = 0;
		break;
	default:
		return kvm_pmu_call(get_msr)(vcpu, msr_info);
	}

	return 0;
}

int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	u32 msr = msr_info->index;
	u64 data = msr_info->data;
	u64 diff;

	/*
	 * Note, AMD ignores writes to reserved bits and read-only PMU MSRs,
	 * whereas Intel generates #GP on attempts to write reserved/RO MSRs.
	 */
	switch (msr) {
	case MSR_CORE_PERF_GLOBAL_STATUS:
		if (!msr_info->host_initiated)
			return 1; /* RO MSR */
		fallthrough;
	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
		/* Per PPR, Read-only MSR. Writes are ignored. */
		if (!msr_info->host_initiated)
			break;

		if (data & pmu->global_status_rsvd)
			return 1;

		pmu->global_status = data;
		break;
	case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
		data &= ~pmu->global_ctrl_rsvd;
		fallthrough;
	case MSR_CORE_PERF_GLOBAL_CTRL:
		if (!kvm_valid_perf_global_ctrl(pmu, data))
			return 1;

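		/*
		 * Reprogram only the counters whose global enable bit
		 * actually changed; the XOR yields the set of toggled bits.
		 */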
		if (pmu->global_ctrl != data) {
			diff = pmu->global_ctrl ^ data;
			pmu->global_ctrl = data;
			reprogram_counters(pmu, diff);
		}
		break;
	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
		/*
		 * GLOBAL_OVF_CTRL, a.k.a. GLOBAL_STATUS_RESET, clears bits in
		 * GLOBAL_STATUS, and so the set of reserved bits is the same.
		 */
		if (data & pmu->global_status_rsvd)
			return 1;
		fallthrough;
	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
		if (!msr_info->host_initiated)
			pmu->global_status &= ~data;
		break;
	default:
		kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index);
		return kvm_pmu_call(set_msr)(vcpu, msr_info);
	}

	return 0;
}

static void kvm_pmu_reset(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	int i;

	pmu->need_cleanup = false;

	bitmap_zero(pmu->reprogram_pmi, X86_PMC_IDX_MAX);

	kvm_for_each_pmc(pmu, pmc, i, pmu->all_valid_pmc_idx) {
		pmc_stop_counter(pmc);
		pmc->counter = 0;
		pmc->emulated_counter = 0;

		if (pmc_is_gp(pmc))
			pmc->eventsel = 0;
	}

	pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0;

	kvm_pmu_call(reset)(vcpu);
}

/*
 * Refresh the PMU configuration for the vCPU, e.g. if userspace changes CPUID
 * and/or PERF_CAPABILITIES.
 */
void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

	if (KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm))
		return;

	/*
	 * Stop/release all existing counters/events before realizing the new
	 * vPMU model.
	 */
	kvm_pmu_reset(vcpu);

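	/*
	 * Wipe the vPMU model back to an "empty" state before the vendor
	 * refresh callback rebuilds it from guest CPUID and PERF_CAPABILITIES;
	 * anything the vendor code doesn't opt into stays unsupported.
	 */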
	pmu->version = 0;
	pmu->nr_arch_gp_counters = 0;
	pmu->nr_arch_fixed_counters = 0;
	pmu->counter_bitmask[KVM_PMC_GP] = 0;
	pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
	pmu->reserved_bits = 0xffffffff00200000ull;
	pmu->raw_event_mask = X86_RAW_EVENT_MASK;
	pmu->global_ctrl_rsvd = ~0ull;
	pmu->global_status_rsvd = ~0ull;
	pmu->fixed_ctr_ctrl_rsvd = ~0ull;
	pmu->pebs_enable_rsvd = ~0ull;
	pmu->pebs_data_cfg_rsvd = ~0ull;
	bitmap_zero(pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX);

	if (!vcpu->kvm->arch.enable_pmu)
		return;

	kvm_pmu_call(refresh)(vcpu);

	/*
	 * At RESET, both Intel and AMD CPUs set all enable bits for general
	 * purpose counters in IA32_PERF_GLOBAL_CTRL (so that software that
	 * was written for v1 PMUs doesn't unknowingly leave GP counters
	 * disabled in the global controls).  Emulate that behavior when
	 * refreshing the PMU so that userspace doesn't need to manually set
	 * PERF_GLOBAL_CTRL.
	 */
	if (kvm_pmu_has_perf_global_ctrl(pmu) && pmu->nr_arch_gp_counters)
		pmu->global_ctrl = GENMASK_ULL(pmu->nr_arch_gp_counters - 1, 0);
}

void kvm_pmu_init(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

	memset(pmu, 0, sizeof(*pmu));
	kvm_pmu_call(init)(vcpu);
	kvm_pmu_refresh(vcpu);
}

/* Release perf_events for vPMCs that have been unused for a full time slice. */
void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc = NULL;
	DECLARE_BITMAP(bitmask, X86_PMC_IDX_MAX);
	int i;

	pmu->need_cleanup = false;

	bitmap_andnot(bitmask, pmu->all_valid_pmc_idx,
		      pmu->pmc_in_use, X86_PMC_IDX_MAX);

	kvm_for_each_pmc(pmu, pmc, i, bitmask) {
		if (pmc->perf_event && !pmc_speculative_in_use(pmc))
			pmc_stop_counter(pmc);
	}

	kvm_pmu_call(cleanup)(vcpu);

	bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX);
}

void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
{
	kvm_pmu_reset(vcpu);
}

static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
{
	pmc->emulated_counter++;
	kvm_pmu_request_counter_reprogram(pmc);
}

static inline bool cpl_is_matched(struct kvm_pmc *pmc)
{
	bool select_os, select_user;
	u64 config;

	if (pmc_is_gp(pmc)) {
		config = pmc->eventsel;
		select_os = config & ARCH_PERFMON_EVENTSEL_OS;
		select_user = config & ARCH_PERFMON_EVENTSEL_USR;
	} else {
		config = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl,
					  pmc->idx - KVM_FIXED_PMC_BASE_IDX);
		select_os = config & INTEL_FIXED_0_KERNEL;
		select_user = config & INTEL_FIXED_0_USER;
	}

	/*
	 * Skip the CPL lookup, which isn't free on Intel, if the result will
	 * be the same regardless of the CPL.
	 */
	if (select_os == select_user)
		return select_os;

	return (kvm_x86_call(get_cpl)(pmc->vcpu) == 0) ? select_os :
							 select_user;
}

void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel)
{
	DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX);
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	int i;

	BUILD_BUG_ON(sizeof(pmu->global_ctrl) * BITS_PER_BYTE != X86_PMC_IDX_MAX);

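	/*
	 * If the PMU has a global control MSR, only counters that are enabled
	 * in it can actually count; otherwise consider every valid counter.
	 */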
	if (!kvm_pmu_has_perf_global_ctrl(pmu))
		bitmap_copy(bitmap, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX);
	else if (!bitmap_and(bitmap, pmu->all_valid_pmc_idx,
			     (unsigned long *)&pmu->global_ctrl, X86_PMC_IDX_MAX))
		return;

	kvm_for_each_pmc(pmu, pmc, i, bitmap) {
		/*
		 * Ignore checks for edge detect (all events currently emulated
		 * by KVM are always rising edges), pin control (unsupported
		 * by modern CPUs), and counter mask and its invert flag (KVM
		 * doesn't emulate multiple events in a single clock cycle).
		 *
		 * Note, the uppermost nibble of AMD's mask overlaps Intel's
		 * IN_TX (bit 32) and IN_TXCP (bit 33), as well as two reserved
		 * bits (bits 35:34).  Checking the "in HLE/RTM transaction"
		 * flags is correct as the vCPU can't be in a transaction if
		 * KVM is emulating an instruction.  Checking the reserved bits
		 * might be wrong if they are defined in the future, but so
		 * could ignoring them, so do the simple thing for now.
		 */
		if (((pmc->eventsel ^ eventsel) & AMD64_RAW_EVENT_MASK_NB) ||
		    !pmc_event_is_allowed(pmc) || !cpl_is_matched(pmc))
			continue;

		kvm_pmu_incr_counter(pmc);
	}
}
EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event);

static bool is_masked_filter_valid(const struct kvm_x86_pmu_event_filter *filter)
{
	u64 mask = kvm_pmu_ops.EVENTSEL_EVENT |
		   KVM_PMU_MASKED_ENTRY_UMASK_MASK |
		   KVM_PMU_MASKED_ENTRY_UMASK_MATCH |
		   KVM_PMU_MASKED_ENTRY_EXCLUDE;
	int i;

	for (i = 0; i < filter->nevents; i++) {
		if (filter->events[i] & ~mask)
			return false;
	}

	return true;
}

static void convert_to_masked_filter(struct kvm_x86_pmu_event_filter *filter)
{
	int i, j;

	for (i = 0, j = 0; i < filter->nevents; i++) {
		/*
		 * Skip events that are impossible to match against a guest
		 * event.  When filtering, only the event select + unit mask
		 * of the guest event is used.  To maintain backwards
		 * compatibility, impossible filters can't be rejected :-(
		 */
		if (filter->events[i] & ~(kvm_pmu_ops.EVENTSEL_EVENT |
					  ARCH_PERFMON_EVENTSEL_UMASK))
			continue;
		/*
		 * Convert userspace events to a common in-kernel event so
		 * only one code path is needed to support both events.  For
		 * the in-kernel events use masked events because they are
		 * flexible enough to handle both cases.  To convert to masked
		 * events all that's needed is to add an "all ones" umask_mask
		 * (unmasked filter events don't support EXCLUDE).
		 */
		filter->events[j++] = filter->events[i] |
				      (0xFFULL << KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT);
	}

	filter->nevents = j;
}

static int prepare_filter_lists(struct kvm_x86_pmu_event_filter *filter)
{
	int i;

	if (!(filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS))
		convert_to_masked_filter(filter);
	else if (!is_masked_filter_valid(filter))
		return -EINVAL;

	/*
	 * Sort entries by event select and includes vs. excludes so that all
	 * entries for a given event select can be processed efficiently during
	 * filtering.  The EXCLUDE flag uses a more significant bit than the
	 * event select, and so the sorted list is also effectively split into
	 * includes and excludes sub-lists.
	 */
	sort(&filter->events, filter->nevents, sizeof(filter->events[0]),
	     filter_sort_cmp, NULL);

	i = filter->nevents;
	/* Find the first EXCLUDE event (only supported for masked events). */
	if (filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS) {
		for (i = 0; i < filter->nevents; i++) {
			if (filter->events[i] & KVM_PMU_MASKED_ENTRY_EXCLUDE)
				break;
		}
	}

	filter->nr_includes = i;
	filter->nr_excludes = filter->nevents - filter->nr_includes;
	filter->includes = filter->events;
	filter->excludes = filter->events + filter->nr_includes;

	return 0;
}

int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
{
	struct kvm_pmu_event_filter __user *user_filter = argp;
	struct kvm_x86_pmu_event_filter *filter;
	struct kvm_pmu_event_filter tmp;
	struct kvm_vcpu *vcpu;
	unsigned long i;
	size_t size;
	int r;

	if (copy_from_user(&tmp, user_filter, sizeof(tmp)))
		return -EFAULT;

	if (tmp.action != KVM_PMU_EVENT_ALLOW &&
	    tmp.action != KVM_PMU_EVENT_DENY)
		return -EINVAL;

	if (tmp.flags & ~KVM_PMU_EVENT_FLAGS_VALID_MASK)
		return -EINVAL;

	if (tmp.nevents > KVM_PMU_EVENT_FILTER_MAX_EVENTS)
		return -E2BIG;

	size = struct_size(filter, events, tmp.nevents);
	filter = kzalloc(size, GFP_KERNEL_ACCOUNT);
	if (!filter)
		return -ENOMEM;

	filter->action = tmp.action;
	filter->nevents = tmp.nevents;
	filter->fixed_counter_bitmap = tmp.fixed_counter_bitmap;
	filter->flags = tmp.flags;

	r = -EFAULT;
	if (copy_from_user(filter->events, user_filter->events,
			   sizeof(filter->events[0]) * filter->nevents))
		goto cleanup;

	r = prepare_filter_lists(filter);
	if (r)
		goto cleanup;

	mutex_lock(&kvm->lock);
	filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
				     mutex_is_locked(&kvm->lock));
	mutex_unlock(&kvm->lock);
	synchronize_srcu_expedited(&kvm->srcu);

	BUILD_BUG_ON(sizeof(((struct kvm_pmu *)0)->reprogram_pmi) >
		     sizeof(((struct kvm_pmu *)0)->__reprogram_pmi));

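	/*
	 * Force every vCPU to reprogram all of its counters against the new
	 * filter before its next VM-Entry.
	 */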
	kvm_for_each_vcpu(i, vcpu, kvm)
		atomic64_set(&vcpu_to_pmu(vcpu)->__reprogram_pmi, -1ull);

	kvm_make_all_cpus_request(kvm, KVM_REQ_PMU);

	r = 0;
cleanup:
	kfree(filter);
	return r;
}