author    | Paolo Bonzini <pbonzini@redhat.com> | 2023-02-15 08:23:24 -0500
committer | Paolo Bonzini <pbonzini@redhat.com> | 2023-02-15 08:23:24 -0500
commit    | 157ed9cb04306eae813b7598cf672834cd295b8e (patch)
tree      | 6d46e80adc55e96cc405b81831c54207cf4e7064 /arch/x86/kvm/pmu.c
parent    | 1c5ec0d433b4b2af15e7f54366411fd27d7c40b2 (diff)
parent    | 13738a3647368f7f600b30d241779bcd2a3ebbfd (diff)
Merge tag 'kvm-x86-pmu-6.3' of https://github.com/kvm-x86/linux into HEAD
KVM x86 PMU changes for 6.3:
- Add support for creating masked events for the PMU filter to allow
  userspace to heavily restrict what events the guest can use without
  needing to create an absurd number of events (see the sketch after this list)
- Clean up KVM's handling of "PMU MSRs to save", especially when vPMU
support is disabled
- Add PEBS support for Intel SPR
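
Masked filter entries are programmed through the existing KVM_SET_PMU_EVENT_FILTER
ioctl, with the KVM_PMU_EVENT_FLAG_MASKED_EVENTS flag set and each entry built with
KVM_PMU_ENCODE_MASKED_ENTRY(). Below is a minimal userspace sketch, not taken from
this series: it allows every unit mask of one event select while excluding a single
unit mask. The event select 0xd1 and unit mask 0x20 are placeholder values, vm_fd is
assumed to be an already-open KVM VM file descriptor, and the uapi names require
6.3-level kernel headers.

#include <stdbool.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Allow all unit masks of event select 0xd1 except unit mask 0x20 (placeholders). */
static int set_masked_filter(int vm_fd)
{
	struct kvm_pmu_event_filter *filter;
	size_t size = sizeof(*filter) + 2 * sizeof(__u64);
	int r;

	filter = calloc(1, size);
	if (!filter)
		return -1;

	filter->action = KVM_PMU_EVENT_ALLOW;
	filter->flags = KVM_PMU_EVENT_FLAG_MASKED_EVENTS;
	filter->nevents = 2;
	/* Include: umask_mask = 0x00 matches any unit mask of event 0xd1. */
	filter->events[0] = KVM_PMU_ENCODE_MASKED_ENTRY(0xd1, 0x00, 0x00, false);
	/* Exclude: umask_mask = 0xff, umask_match = 0x20 rejects exactly that umask. */
	filter->events[1] = KVM_PMU_ENCODE_MASKED_ENTRY(0xd1, 0xff, 0x20, true);

	r = ioctl(vm_fd, KVM_SET_PMU_EVENT_FILTER, filter);
	free(filter);
	return r;
}

A guest event matches an entry when (guest unit mask & umask_mask) == umask_match;
with an ALLOW action the event is permitted only if it hits an include entry and no
exclude entry. Filters without the flag still take plain event select + unit mask
entries, which the kernel converts to masked form internally (see
convert_to_masked_filter() in the diff below).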
Diffstat (limited to 'arch/x86/kvm/pmu.c')
-rw-r--r-- | arch/x86/kvm/pmu.c | 286
1 file changed, 236 insertions(+), 50 deletions(-)
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 58e5a456273a..612e6c70ce2e 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -29,9 +29,18 @@
 struct x86_pmu_capability __read_mostly kvm_pmu_cap;
 EXPORT_SYMBOL_GPL(kvm_pmu_cap);
 
-static const struct x86_cpu_id vmx_icl_pebs_cpu[] = {
+/* Precise Distribution of Instructions Retired (PDIR) */
+static const struct x86_cpu_id vmx_pebs_pdir_cpu[] = {
 	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, NULL),
 	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, NULL),
+	/* Instruction-Accurate PDIR (PDIR++) */
+	X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, NULL),
+	{}
+};
+
+/* Precise Distribution (PDist) */
+static const struct x86_cpu_id vmx_pebs_pdist_cpu[] = {
+	X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, NULL),
 	{}
 };
 
@@ -156,6 +165,28 @@ static void kvm_perf_overflow(struct perf_event *perf_event,
 	kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
 }
 
+static u64 pmc_get_pebs_precise_level(struct kvm_pmc *pmc)
+{
+	/*
+	 * For some model specific pebs counters with special capabilities
+	 * (PDIR, PDIR++, PDIST), KVM needs to raise the event precise
+	 * level to the maximum value (currently 3, backwards compatible)
+	 * so that the perf subsystem would assign specific hardware counter
+	 * with that capability for vPMC.
+	 */
+	if ((pmc->idx == 0 && x86_match_cpu(vmx_pebs_pdist_cpu)) ||
+	    (pmc->idx == 32 && x86_match_cpu(vmx_pebs_pdir_cpu)))
+		return 3;
+
+	/*
+	 * The non-zero precision level of guest event makes the ordinary
+	 * guest event becomes a guest PEBS event and triggers the host
+	 * PEBS PMI handler to determine whether the PEBS overflow PMI
+	 * comes from the host counters or the guest.
+	 */
+	return 1;
+}
+
 static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config,
 				  bool exclude_user, bool exclude_kernel,
 				  bool intr)
@@ -187,22 +218,12 @@ static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config,
 	}
 	if (pebs) {
 		/*
-		 * The non-zero precision level of guest event makes the ordinary
-		 * guest event becomes a guest PEBS event and triggers the host
-		 * PEBS PMI handler to determine whether the PEBS overflow PMI
-		 * comes from the host counters or the guest.
-		 *
 		 * For most PEBS hardware events, the difference in the software
 		 * precision levels of guest and host PEBS events will not affect
 		 * the accuracy of the PEBS profiling result, because the "event IP"
 		 * in the PEBS record is calibrated on the guest side.
-		 *
-		 * On Icelake everything is fine. Other hardware (GLC+, TNT+) that
-		 * could possibly care here is unsupported and needs changes.
 		 */
-		attr.precise_ip = 1;
-		if (x86_match_cpu(vmx_icl_pebs_cpu) && pmc->idx == 32)
-			attr.precise_ip = 3;
+		attr.precise_ip = pmc_get_pebs_precise_level(pmc);
 	}
 
 	event = perf_event_create_kernel_counter(&attr, -1, current,
@@ -255,48 +276,128 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc)
 	return true;
 }
 
-static int cmp_u64(const void *pa, const void *pb)
+static int filter_cmp(const void *pa, const void *pb, u64 mask)
 {
-	u64 a = *(u64 *)pa;
-	u64 b = *(u64 *)pb;
+	u64 a = *(u64 *)pa & mask;
+	u64 b = *(u64 *)pb & mask;
 
 	return (a > b) - (a < b);
 }
 
+
+static int filter_sort_cmp(const void *pa, const void *pb)
+{
+	return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT |
+				   KVM_PMU_MASKED_ENTRY_EXCLUDE));
+}
+
+/*
+ * For the event filter, searching is done on the 'includes' list and
+ * 'excludes' list separately rather than on the 'events' list (which
+ * has both).  As a result the exclude bit can be ignored.
+ */
+static int filter_event_cmp(const void *pa, const void *pb)
+{
+	return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT));
+}
+
+static int find_filter_index(u64 *events, u64 nevents, u64 key)
+{
+	u64 *fe = bsearch(&key, events, nevents, sizeof(events[0]),
+			  filter_event_cmp);
+
+	if (!fe)
+		return -1;
+
+	return fe - events;
+}
+
+static bool is_filter_entry_match(u64 filter_event, u64 umask)
+{
+	u64 mask = filter_event >> (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8);
+	u64 match = filter_event & KVM_PMU_MASKED_ENTRY_UMASK_MATCH;
+
+	BUILD_BUG_ON((KVM_PMU_ENCODE_MASKED_ENTRY(0, 0xff, 0, false) >>
+		     (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8)) !=
+		     ARCH_PERFMON_EVENTSEL_UMASK);
+
+	return (umask & mask) == match;
+}
+
+static bool filter_contains_match(u64 *events, u64 nevents, u64 eventsel)
+{
+	u64 event_select = eventsel & kvm_pmu_ops.EVENTSEL_EVENT;
+	u64 umask = eventsel & ARCH_PERFMON_EVENTSEL_UMASK;
+	int i, index;
+
+	index = find_filter_index(events, nevents, event_select);
+	if (index < 0)
+		return false;
+
+	/*
+	 * Entries are sorted by the event select.  Walk the list in both
+	 * directions to process all entries with the targeted event select.
+	 */
+	for (i = index; i < nevents; i++) {
+		if (filter_event_cmp(&events[i], &event_select))
+			break;
+
+		if (is_filter_entry_match(events[i], umask))
+			return true;
+	}
+
+	for (i = index - 1; i >= 0; i--) {
+		if (filter_event_cmp(&events[i], &event_select))
+			break;
+
+		if (is_filter_entry_match(events[i], umask))
+			return true;
+	}
+
+	return false;
+}
+
+static bool is_gp_event_allowed(struct kvm_x86_pmu_event_filter *f,
+				u64 eventsel)
+{
+	if (filter_contains_match(f->includes, f->nr_includes, eventsel) &&
+	    !filter_contains_match(f->excludes, f->nr_excludes, eventsel))
+		return f->action == KVM_PMU_EVENT_ALLOW;
+
+	return f->action == KVM_PMU_EVENT_DENY;
+}
+
+static bool is_fixed_event_allowed(struct kvm_x86_pmu_event_filter *filter,
+				   int idx)
+{
+	int fixed_idx = idx - INTEL_PMC_IDX_FIXED;
+
+	if (filter->action == KVM_PMU_EVENT_DENY &&
+	    test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap))
+		return false;
+	if (filter->action == KVM_PMU_EVENT_ALLOW &&
+	    !test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap))
+		return false;
+
+	return true;
+}
+
 static bool check_pmu_event_filter(struct kvm_pmc *pmc)
 {
-	struct kvm_pmu_event_filter *filter;
+	struct kvm_x86_pmu_event_filter *filter;
 	struct kvm *kvm = pmc->vcpu->kvm;
-	bool allow_event = true;
-	__u64 key;
-	int idx;
 
 	if (!static_call(kvm_x86_pmu_hw_event_available)(pmc))
 		return false;
 
 	filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
 	if (!filter)
-		goto out;
+		return true;
 
-	if (pmc_is_gp(pmc)) {
-		key = pmc->eventsel & AMD64_RAW_EVENT_MASK_NB;
-		if (bsearch(&key, filter->events, filter->nevents,
-			    sizeof(__u64), cmp_u64))
-			allow_event = filter->action == KVM_PMU_EVENT_ALLOW;
-		else
-			allow_event = filter->action == KVM_PMU_EVENT_DENY;
-	} else {
-		idx = pmc->idx - INTEL_PMC_IDX_FIXED;
-		if (filter->action == KVM_PMU_EVENT_DENY &&
-		    test_bit(idx, (ulong *)&filter->fixed_counter_bitmap))
-			allow_event = false;
-		if (filter->action == KVM_PMU_EVENT_ALLOW &&
-		    !test_bit(idx, (ulong *)&filter->fixed_counter_bitmap))
-			allow_event = false;
-	}
+	if (pmc_is_gp(pmc))
+		return is_gp_event_allowed(filter, pmc->eventsel);
 
-out:
-	return allow_event;
+	return is_fixed_event_allowed(filter, pmc->idx);
 }
 
 static void reprogram_counter(struct kvm_pmc *pmc)
@@ -593,43 +694,128 @@ void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id)
 }
 EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event);
 
+static bool is_masked_filter_valid(const struct kvm_x86_pmu_event_filter *filter)
+{
+	u64 mask = kvm_pmu_ops.EVENTSEL_EVENT |
+		   KVM_PMU_MASKED_ENTRY_UMASK_MASK |
+		   KVM_PMU_MASKED_ENTRY_UMASK_MATCH |
+		   KVM_PMU_MASKED_ENTRY_EXCLUDE;
+	int i;
+
+	for (i = 0; i < filter->nevents; i++) {
+		if (filter->events[i] & ~mask)
+			return false;
+	}
+
+	return true;
+}
+
+static void convert_to_masked_filter(struct kvm_x86_pmu_event_filter *filter)
+{
+	int i, j;
+
+	for (i = 0, j = 0; i < filter->nevents; i++) {
+		/*
+		 * Skip events that are impossible to match against a guest
+		 * event.  When filtering, only the event select + unit mask
+		 * of the guest event is used.  To maintain backwards
+		 * compatibility, impossible filters can't be rejected :-(
+		 */
+		if (filter->events[i] & ~(kvm_pmu_ops.EVENTSEL_EVENT |
+					  ARCH_PERFMON_EVENTSEL_UMASK))
+			continue;
+		/*
+		 * Convert userspace events to a common in-kernel event so
+		 * only one code path is needed to support both events.  For
+		 * the in-kernel events use masked events because they are
+		 * flexible enough to handle both cases.  To convert to masked
+		 * events all that's needed is to add an "all ones" umask_mask,
+		 * (unmasked filter events don't support EXCLUDE).
+		 */
+		filter->events[j++] = filter->events[i] |
+				      (0xFFULL << KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT);
+	}
+
+	filter->nevents = j;
+}
+
+static int prepare_filter_lists(struct kvm_x86_pmu_event_filter *filter)
+{
+	int i;
+
+	if (!(filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS))
+		convert_to_masked_filter(filter);
+	else if (!is_masked_filter_valid(filter))
+		return -EINVAL;
+
+	/*
+	 * Sort entries by event select and includes vs. excludes so that all
+	 * entries for a given event select can be processed efficiently during
+	 * filtering.  The EXCLUDE flag uses a more significant bit than the
+	 * event select, and so the sorted list is also effectively split into
+	 * includes and excludes sub-lists.
+	 */
+	sort(&filter->events, filter->nevents, sizeof(filter->events[0]),
+	     filter_sort_cmp, NULL);
+
+	i = filter->nevents;
+	/* Find the first EXCLUDE event (only supported for masked events). */
+	if (filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS) {
+		for (i = 0; i < filter->nevents; i++) {
+			if (filter->events[i] & KVM_PMU_MASKED_ENTRY_EXCLUDE)
+				break;
+		}
+	}
+
+	filter->nr_includes = i;
+	filter->nr_excludes = filter->nevents - filter->nr_includes;
+	filter->includes = filter->events;
+	filter->excludes = filter->events + filter->nr_includes;
+
+	return 0;
+}
+
 int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
 {
-	struct kvm_pmu_event_filter tmp, *filter;
+	struct kvm_pmu_event_filter __user *user_filter = argp;
+	struct kvm_x86_pmu_event_filter *filter;
+	struct kvm_pmu_event_filter tmp;
 	struct kvm_vcpu *vcpu;
 	unsigned long i;
 	size_t size;
 	int r;
 
-	if (copy_from_user(&tmp, argp, sizeof(tmp)))
+	if (copy_from_user(&tmp, user_filter, sizeof(tmp)))
 		return -EFAULT;
 
 	if (tmp.action != KVM_PMU_EVENT_ALLOW &&
 	    tmp.action != KVM_PMU_EVENT_DENY)
 		return -EINVAL;
 
-	if (tmp.flags != 0)
+	if (tmp.flags & ~KVM_PMU_EVENT_FLAGS_VALID_MASK)
 		return -EINVAL;
 
 	if (tmp.nevents > KVM_PMU_EVENT_FILTER_MAX_EVENTS)
 		return -E2BIG;
 
 	size = struct_size(filter, events, tmp.nevents);
-	filter = kmalloc(size, GFP_KERNEL_ACCOUNT);
+	filter = kzalloc(size, GFP_KERNEL_ACCOUNT);
 	if (!filter)
 		return -ENOMEM;
 
+	filter->action = tmp.action;
+	filter->nevents = tmp.nevents;
+	filter->fixed_counter_bitmap = tmp.fixed_counter_bitmap;
+	filter->flags = tmp.flags;
+
 	r = -EFAULT;
-	if (copy_from_user(filter, argp, size))
+	if (copy_from_user(filter->events, user_filter->events,
+			   sizeof(filter->events[0]) * filter->nevents))
 		goto cleanup;
 
-	/* Ensure nevents can't be changed between the user copies. */
-	*filter = tmp;
-
-	/*
-	 * Sort the in-kernel list so that we can search it with bsearch.
-	 */
-	sort(&filter->events, filter->nevents, sizeof(__u64), cmp_u64, NULL);
+	r = prepare_filter_lists(filter);
+	if (r)
+		goto cleanup;
 
 	mutex_lock(&kvm->lock);
 	filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,