LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [PATCH 1/2] KVM: Optimize kvm_make_vcpus_request_mask() a bit
@ 2021-08-20 12:43 Vitaly Kuznetsov
  2021-08-20 12:43 ` [PATCH 2/2] KVM: x86: Fix stack-out-of-bounds memory access from ioapic_write_indirect() Vitaly Kuznetsov
  2021-08-20 18:19 ` [PATCH 1/2] KVM: Optimize kvm_make_vcpus_request_mask() a bit Sean Christopherson
  0 siblings, 2 replies; 4+ messages in thread
From: Vitaly Kuznetsov @ 2021-08-20 12:43 UTC (permalink / raw)
  To: kvm, Paolo Bonzini
  Cc: Sean Christopherson, Wanpeng Li, Jim Mattson,
	Dr. David Alan Gilbert, Nitesh Narayan Lal, linux-kernel

Iterating over set bits in 'vcpu_bitmap' should be faster than going
through all vCPUs, especially when just a few bits are set.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
---
 virt/kvm/kvm_main.c | 49 +++++++++++++++++++++++++++++----------------
 1 file changed, 32 insertions(+), 17 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 3e67c93ca403..0f873c5ed538 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -257,34 +257,49 @@ static inline bool kvm_kick_many_cpus(const struct cpumask *cpus, bool wait)
 	return true;
 }
 
+static void kvm_make_vcpu_request(struct kvm *kvm, struct kvm_vcpu *vcpu,
+				  unsigned int req, cpumask_var_t tmp)
+{
+	int cpu = vcpu->cpu;
+
+	kvm_make_request(req, vcpu);
+
+	if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
+		return;
+
+	if (tmp != NULL && cpu != -1 && cpu != raw_smp_processor_id() &&
+	    kvm_request_needs_ipi(vcpu, req))
+		__cpumask_set_cpu(cpu, tmp);
+}
+
 bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
 				 struct kvm_vcpu *except,
 				 unsigned long *vcpu_bitmap, cpumask_var_t tmp)
 {
-	int i, cpu, me;
+	int i;
 	struct kvm_vcpu *vcpu;
 	bool called;
 
-	me = get_cpu();
-
-	kvm_for_each_vcpu(i, vcpu, kvm) {
-		if ((vcpu_bitmap && !test_bit(i, vcpu_bitmap)) ||
-		    vcpu == except)
-			continue;
-
-		kvm_make_request(req, vcpu);
-		cpu = vcpu->cpu;
-
-		if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
-			continue;
+	preempt_disable();
 
-		if (tmp != NULL && cpu != -1 && cpu != me &&
-		    kvm_request_needs_ipi(vcpu, req))
-			__cpumask_set_cpu(cpu, tmp);
+	if (likely(vcpu_bitmap)) {
+		for_each_set_bit(i, vcpu_bitmap, KVM_MAX_VCPUS) {
+			vcpu = kvm_get_vcpu(kvm, i);
+			if (!vcpu || vcpu == except)
+				continue;
+			kvm_make_vcpu_request(kvm, vcpu, req, tmp);
+		}
+	} else {
+		kvm_for_each_vcpu(i, vcpu, kvm) {
+			if (vcpu == except)
+				continue;
+			kvm_make_vcpu_request(kvm, vcpu, req, tmp);
+		}
 	}
 
 	called = kvm_kick_many_cpus(tmp, !!(req & KVM_REQUEST_WAIT));
-	put_cpu();
+
+	preempt_enable();
 
 	return called;
 }
-- 
2.31.1


^ permalink raw reply	[flat|nested] 4+ messages in thread

* [PATCH 2/2] KVM: x86: Fix stack-out-of-bounds memory access from ioapic_write_indirect()
  2021-08-20 12:43 [PATCH 1/2] KVM: Optimize kvm_make_vcpus_request_mask() a bit Vitaly Kuznetsov
@ 2021-08-20 12:43 ` Vitaly Kuznetsov
  2021-08-20 18:19 ` [PATCH 1/2] KVM: Optimize kvm_make_vcpus_request_mask() a bit Sean Christopherson
  1 sibling, 0 replies; 4+ messages in thread
From: Vitaly Kuznetsov @ 2021-08-20 12:43 UTC (permalink / raw)
  To: kvm, Paolo Bonzini
  Cc: Sean Christopherson, Wanpeng Li, Jim Mattson,
	Dr. David Alan Gilbert, Nitesh Narayan Lal, linux-kernel

KASAN reports the following issue:

 BUG: KASAN: stack-out-of-bounds in kvm_make_vcpus_request_mask+0x174/0x440 [kvm]
 Read of size 8 at addr ffffc9001364f638 by task qemu-kvm/4798

 CPU: 0 PID: 4798 Comm: qemu-kvm Tainted: G               X --------- ---
 Hardware name: AMD Corporation DAYTONA_X/DAYTONA_X, BIOS RYM0081C 07/13/2020
 Call Trace:
  dump_stack+0xa5/0xe6
  print_address_description.constprop.0+0x18/0x130
  ? kvm_make_vcpus_request_mask+0x174/0x440 [kvm]
  __kasan_report.cold+0x7f/0x114
  ? kvm_make_vcpus_request_mask+0x174/0x440 [kvm]
  kasan_report+0x38/0x50
  kasan_check_range+0xf5/0x1d0
  kvm_make_vcpus_request_mask+0x174/0x440 [kvm]
  kvm_make_scan_ioapic_request_mask+0x84/0xc0 [kvm]
  ? kvm_arch_exit+0x110/0x110 [kvm]
  ? sched_clock+0x5/0x10
  ioapic_write_indirect+0x59f/0x9e0 [kvm]
  ? static_obj+0xc0/0xc0
  ? __lock_acquired+0x1d2/0x8c0
  ? kvm_ioapic_eoi_inject_work+0x120/0x120 [kvm]

The problem appears to be that 'vcpu_bitmap' is allocated as a single long
on stack and it should really be KVM_MAX_VCPUS long. We also seem to clear
the lower 16 bits of it with bitmap_zero() for no particular reason (my
guess would be that 'bitmap' and 'vcpu_bitmap' variables in
kvm_bitmap_or_dest_vcpus() caused the confusion: while the later is indeed
16-bit long, the later should accommodate all possible vCPUs).

Fixes: 7ee30bc132c6 ("KVM: x86: deliver KVM IOAPIC scan request to target vCPUs")
Fixes: 9a2ae9f6b6bb ("KVM: x86: Zero the IOAPIC scan request dest vCPUs bitmap")
Reported-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
---
 arch/x86/kvm/ioapic.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index ff005fe738a4..92cd4b02e9ba 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -319,7 +319,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
 	unsigned index;
 	bool mask_before, mask_after;
 	union kvm_ioapic_redirect_entry *e;
-	unsigned long vcpu_bitmap;
+	unsigned long vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)];
 	int old_remote_irr, old_delivery_status, old_dest_id, old_dest_mode;
 
 	switch (ioapic->ioregsel) {
@@ -384,9 +384,9 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
 			irq.shorthand = APIC_DEST_NOSHORT;
 			irq.dest_id = e->fields.dest_id;
 			irq.msi_redir_hint = false;
-			bitmap_zero(&vcpu_bitmap, 16);
+			bitmap_zero(vcpu_bitmap, KVM_MAX_VCPUS);
 			kvm_bitmap_or_dest_vcpus(ioapic->kvm, &irq,
-						 &vcpu_bitmap);
+						 vcpu_bitmap);
 			if (old_dest_mode != e->fields.dest_mode ||
 			    old_dest_id != e->fields.dest_id) {
 				/*
@@ -399,10 +399,10 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
 				    kvm_lapic_irq_dest_mode(
 					!!e->fields.dest_mode);
 				kvm_bitmap_or_dest_vcpus(ioapic->kvm, &irq,
-							 &vcpu_bitmap);
+							 vcpu_bitmap);
 			}
 			kvm_make_scan_ioapic_request_mask(ioapic->kvm,
-							  &vcpu_bitmap);
+							  vcpu_bitmap);
 		} else {
 			kvm_make_scan_ioapic_request(ioapic->kvm);
 		}
-- 
2.31.1


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH 1/2] KVM: Optimize kvm_make_vcpus_request_mask() a bit
  2021-08-20 12:43 [PATCH 1/2] KVM: Optimize kvm_make_vcpus_request_mask() a bit Vitaly Kuznetsov
  2021-08-20 12:43 ` [PATCH 2/2] KVM: x86: Fix stack-out-of-bounds memory access from ioapic_write_indirect() Vitaly Kuznetsov
@ 2021-08-20 18:19 ` Sean Christopherson
  2021-08-23  8:03   ` Vitaly Kuznetsov
  1 sibling, 1 reply; 4+ messages in thread
From: Sean Christopherson @ 2021-08-20 18:19 UTC (permalink / raw)
  To: Vitaly Kuznetsov
  Cc: kvm, Paolo Bonzini, Wanpeng Li, Jim Mattson,
	Dr. David Alan Gilbert, Nitesh Narayan Lal, linux-kernel

On Fri, Aug 20, 2021, Vitaly Kuznetsov wrote:
> Iterating over set bits in 'vcpu_bitmap' should be faster than going
> through all vCPUs, especially when just a few bits are set.
> 
> Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
> ---
>  virt/kvm/kvm_main.c | 49 +++++++++++++++++++++++++++++----------------
>  1 file changed, 32 insertions(+), 17 deletions(-)
> 
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 3e67c93ca403..0f873c5ed538 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -257,34 +257,49 @@ static inline bool kvm_kick_many_cpus(const struct cpumask *cpus, bool wait)
>  	return true;
>  }
>  
> +static void kvm_make_vcpu_request(struct kvm *kvm, struct kvm_vcpu *vcpu,
> +				  unsigned int req, cpumask_var_t tmp)
> +{
> +	int cpu = vcpu->cpu;

This reminds me, syzbot found a data race a while back[*] in kvm_vcpu_kick()
related to reading vcpu->cpu.  That race is benign, but legitimate.  I believe
this code has a similar race, and I'm not as confident that it's benign.

If the target vCPU changes vcpu->cpu after it's read by this code, then the IPI
can sent to the wrong pCPU, e.g. this pCPU gets waylaid by an IRQ and the target
vCPU is migrated to a new pCPU.

The TL;DR is that the race is benign because the target vCPU is still guaranteed
to see the request before entering the guest, even if the IPI goes to the wrong
pCPU.  I believe the same holds true for KVM_REQUEST_WAIT, e.g. if the lockless
shadow PTE walk gets migrated to a new pCPU just before setting vcpu->mode to
READING_SHADOW_PAGE_TABLES, this code can use a stale "cpu" for __cpumask_set_cpu().
The race is benign because the vCPU would have to enter READING_SHADOW_PAGE_TABLES
_after_ the SPTE modifications were made, as vcpu->cpu can't change while the vCPU
is reading SPTEs.  The same logic holds true for the case where the vCPU is migrated
after the call to __cpumask_set_cpu(); the goal is to wait for the vCPU to return to
OUTSIDE_GUEST_MODE, which is guaranteed if the vCPU is migrated even if this path
doesn't wait for an ack from the _new_ pCPU.

I'll send patches to fix the races later today, maybe they can be folded into
v2?  Even though the races are benign, I think they're worth fixing, if only to
provide an opportunity to document why it's ok to send IPIs to the wrong pCPU.

[*] On an upstream kernel, but I don't think the bug report was posted to LKML.

> +
> +	kvm_make_request(req, vcpu);
> +
> +	if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
> +		return;
> +
> +	if (tmp != NULL && cpu != -1 && cpu != raw_smp_processor_id() &&

For large VMs, might be worth keeping get_cpu() in the caller in passing in @me?

> +	    kvm_request_needs_ipi(vcpu, req))
> +		__cpumask_set_cpu(cpu, tmp);
> +}
> +
>  bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
>  				 struct kvm_vcpu *except,
>  				 unsigned long *vcpu_bitmap, cpumask_var_t tmp)
>  {
> -	int i, cpu, me;
> +	int i;
>  	struct kvm_vcpu *vcpu;
>  	bool called;
>  
> -	me = get_cpu();
> -
> -	kvm_for_each_vcpu(i, vcpu, kvm) {
> -		if ((vcpu_bitmap && !test_bit(i, vcpu_bitmap)) ||
> -		    vcpu == except)
> -			continue;
> -
> -		kvm_make_request(req, vcpu);
> -		cpu = vcpu->cpu;
> -
> -		if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
> -			continue;
> +	preempt_disable();
>  
> -		if (tmp != NULL && cpu != -1 && cpu != me &&
> -		    kvm_request_needs_ipi(vcpu, req))
> -			__cpumask_set_cpu(cpu, tmp);
> +	if (likely(vcpu_bitmap)) {

I don't think this is actually "likely".  kvm_make_all_cpus_request() is by far
the most common caller and does not pass in a vcpu_bitmap.  Practically speaking
I highly don't the code organization will matter, but from a documentation
perspective it's wrong.

> +		for_each_set_bit(i, vcpu_bitmap, KVM_MAX_VCPUS) {
> +			vcpu = kvm_get_vcpu(kvm, i);
> +			if (!vcpu || vcpu == except)
> +				continue;
> +			kvm_make_vcpu_request(kvm, vcpu, req, tmp);
> +		}
> +	} else {
> +		kvm_for_each_vcpu(i, vcpu, kvm) {
> +			if (vcpu == except)
> +				continue;
> +			kvm_make_vcpu_request(kvm, vcpu, req, tmp);
> +		}
>  	}
>  
>  	called = kvm_kick_many_cpus(tmp, !!(req & KVM_REQUEST_WAIT));
> -	put_cpu();
> +
> +	preempt_enable();
>  
>  	return called;
>  }
> -- 
> 2.31.1
> 

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH 1/2] KVM: Optimize kvm_make_vcpus_request_mask() a bit
  2021-08-20 18:19 ` [PATCH 1/2] KVM: Optimize kvm_make_vcpus_request_mask() a bit Sean Christopherson
@ 2021-08-23  8:03   ` Vitaly Kuznetsov
  0 siblings, 0 replies; 4+ messages in thread
From: Vitaly Kuznetsov @ 2021-08-23  8:03 UTC (permalink / raw)
  To: Sean Christopherson
  Cc: kvm, Paolo Bonzini, Wanpeng Li, Jim Mattson,
	Dr. David Alan Gilbert, Nitesh Narayan Lal, linux-kernel

Sean Christopherson <seanjc@google.com> writes:

> On Fri, Aug 20, 2021, Vitaly Kuznetsov wrote:
>> Iterating over set bits in 'vcpu_bitmap' should be faster than going
>> through all vCPUs, especially when just a few bits are set.
>> 
>> Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
>> ---
>>  virt/kvm/kvm_main.c | 49 +++++++++++++++++++++++++++++----------------
>>  1 file changed, 32 insertions(+), 17 deletions(-)
>> 
>> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
>> index 3e67c93ca403..0f873c5ed538 100644
>> --- a/virt/kvm/kvm_main.c
>> +++ b/virt/kvm/kvm_main.c
>> @@ -257,34 +257,49 @@ static inline bool kvm_kick_many_cpus(const struct cpumask *cpus, bool wait)
>>  	return true;
>>  }
>>  
>> +static void kvm_make_vcpu_request(struct kvm *kvm, struct kvm_vcpu *vcpu,
>> +				  unsigned int req, cpumask_var_t tmp)
>> +{
>> +	int cpu = vcpu->cpu;
>
> This reminds me, syzbot found a data race a while back[*] in kvm_vcpu_kick()
> related to reading vcpu->cpu.  That race is benign, but legitimate.  I believe
> this code has a similar race, and I'm not as confident that it's benign.
>
> If the target vCPU changes vcpu->cpu after it's read by this code, then the IPI
> can sent to the wrong pCPU, e.g. this pCPU gets waylaid by an IRQ and the target
> vCPU is migrated to a new pCPU.
>
> The TL;DR is that the race is benign because the target vCPU is still guaranteed
> to see the request before entering the guest, even if the IPI goes to the wrong
> pCPU.  I believe the same holds true for KVM_REQUEST_WAIT, e.g. if the lockless
> shadow PTE walk gets migrated to a new pCPU just before setting vcpu->mode to
> READING_SHADOW_PAGE_TABLES, this code can use a stale "cpu" for __cpumask_set_cpu().
> The race is benign because the vCPU would have to enter READING_SHADOW_PAGE_TABLES
> _after_ the SPTE modifications were made, as vcpu->cpu can't change while the vCPU
> is reading SPTEs.  The same logic holds true for the case where the vCPU is migrated
> after the call to __cpumask_set_cpu(); the goal is to wait for the vCPU to return to
> OUTSIDE_GUEST_MODE, which is guaranteed if the vCPU is migrated even if this path
> doesn't wait for an ack from the _new_ pCPU.
>
> I'll send patches to fix the races later today, maybe they can be folded into
> v2?  Even though the races are benign, I think they're worth fixing, if only to
> provide an opportunity to document why it's ok to send IPIs to the
> wrong pCPU.

You're blazingly fast as usual :-) I'll do v2 on top of your patches.

>
> [*] On an upstream kernel, but I don't think the bug report was posted to LKML.
>
>> +
>> +	kvm_make_request(req, vcpu);
>> +
>> +	if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
>> +		return;
>> +
>> +	if (tmp != NULL && cpu != -1 && cpu != raw_smp_processor_id() &&
>
> For large VMs, might be worth keeping get_cpu() in the caller in passing in @me?

The only reason against was that I've tried keeping the newly introduced
kvm_make_vcpu_request()'s interface nicer, like it can be reused some
day. Will get back to get_cpu() in v2.

>
>> +	    kvm_request_needs_ipi(vcpu, req))
>> +		__cpumask_set_cpu(cpu, tmp);
>> +}
>> +
>>  bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
>>  				 struct kvm_vcpu *except,
>>  				 unsigned long *vcpu_bitmap, cpumask_var_t tmp)
>>  {
>> -	int i, cpu, me;
>> +	int i;
>>  	struct kvm_vcpu *vcpu;
>>  	bool called;
>>  
>> -	me = get_cpu();
>> -
>> -	kvm_for_each_vcpu(i, vcpu, kvm) {
>> -		if ((vcpu_bitmap && !test_bit(i, vcpu_bitmap)) ||
>> -		    vcpu == except)
>> -			continue;
>> -
>> -		kvm_make_request(req, vcpu);
>> -		cpu = vcpu->cpu;
>> -
>> -		if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
>> -			continue;
>> +	preempt_disable();
>>  
>> -		if (tmp != NULL && cpu != -1 && cpu != me &&
>> -		    kvm_request_needs_ipi(vcpu, req))
>> -			__cpumask_set_cpu(cpu, tmp);
>> +	if (likely(vcpu_bitmap)) {
>
> I don't think this is actually "likely".  kvm_make_all_cpus_request() is by far
> the most common caller and does not pass in a vcpu_bitmap.  Practically speaking
> I highly don't the code organization will matter, but from a documentation
> perspective it's wrong.

Right, I was thinking more about two other users: IOAPIC and Hyper-V who
call kvm_make_vcpus_request_mask() directly but I agree that
kvm_make_all_cpus_request() is probably much more common.

>
>> +		for_each_set_bit(i, vcpu_bitmap, KVM_MAX_VCPUS) {
>> +			vcpu = kvm_get_vcpu(kvm, i);
>> +			if (!vcpu || vcpu == except)
>> +				continue;
>> +			kvm_make_vcpu_request(kvm, vcpu, req, tmp);
>> +		}
>> +	} else {
>> +		kvm_for_each_vcpu(i, vcpu, kvm) {
>> +			if (vcpu == except)
>> +				continue;
>> +			kvm_make_vcpu_request(kvm, vcpu, req, tmp);
>> +		}
>>  	}
>>  
>>  	called = kvm_kick_many_cpus(tmp, !!(req & KVM_REQUEST_WAIT));
>> -	put_cpu();
>> +
>> +	preempt_enable();
>>  
>>  	return called;
>>  }
>> -- 
>> 2.31.1
>> 
>

-- 
Vitaly


^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2021-08-23  8:03 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-08-20 12:43 [PATCH 1/2] KVM: Optimize kvm_make_vcpus_request_mask() a bit Vitaly Kuznetsov
2021-08-20 12:43 ` [PATCH 2/2] KVM: x86: Fix stack-out-of-bounds memory access from ioapic_write_indirect() Vitaly Kuznetsov
2021-08-20 18:19 ` [PATCH 1/2] KVM: Optimize kvm_make_vcpus_request_mask() a bit Sean Christopherson
2021-08-23  8:03   ` Vitaly Kuznetsov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).