From 92eca8faad2d1b136c939bc122842dcdabd6ff46 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Sun, 20 May 2012 13:13:28 +0900 Subject: KVM: Separate out dirty_bitmap allocation code as kvm_kvzalloc() Will be used for lpage_info allocation later. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- virt/kvm/kvm_main.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7e140683ff14..1148c96a4817 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -516,16 +516,32 @@ out_err_nodisable: return ERR_PTR(r); } +/* + * Avoid using vmalloc for a small buffer. + * Should not be used when the size is statically known. + */ +static void *kvm_kvzalloc(unsigned long size) +{ + if (size > PAGE_SIZE) + return vzalloc(size); + else + return kzalloc(size, GFP_KERNEL); +} + +static void kvm_kvfree(const void *addr) +{ + if (is_vmalloc_addr(addr)) + vfree(addr); + else + kfree(addr); +} + static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) { if (!memslot->dirty_bitmap) return; - if (2 * kvm_dirty_bitmap_bytes(memslot) > PAGE_SIZE) - vfree(memslot->dirty_bitmap); - else - kfree(memslot->dirty_bitmap); - + kvm_kvfree(memslot->dirty_bitmap); memslot->dirty_bitmap = NULL; } @@ -617,11 +633,7 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) #ifndef CONFIG_S390 unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); - if (dirty_bytes > PAGE_SIZE) - memslot->dirty_bitmap = vzalloc(dirty_bytes); - else - memslot->dirty_bitmap = kzalloc(dirty_bytes, GFP_KERNEL); - + memslot->dirty_bitmap = kvm_kvzalloc(dirty_bytes); if (!memslot->dirty_bitmap) return -ENOMEM; -- cgit v1.2.3 From c1a7b32a14138f908df52d7c53b5ce3415ec6b50 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Sun, 20 May 2012 13:15:07 +0900 Subject: KVM: Avoid wasting pages for small lpage_info arrays lpage_info is created for each large level even when the memory slot is not for RAM. This means that when we add one slot for a PCI device, we end up allocating at least KVM_NR_PAGE_SIZES - 1 pages by vmalloc(). To make things worse, there is an increasing number of devices which would result in more pages being wasted this way. This patch mitigates this problem by using kvm_kvzalloc(). Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 4 ++-- include/linux/kvm_host.h | 3 +++ virt/kvm/kvm_main.c | 4 ++-- 3 files changed, 7 insertions(+), 4 deletions(-) (limited to 'virt') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index be6d54929fa7..f12a52408cda 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6304,7 +6304,7 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free, for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) { - vfree(free->arch.lpage_info[i]); + kvm_kvfree(free->arch.lpage_info[i]); free->arch.lpage_info[i] = NULL; } } @@ -6323,7 +6323,7 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) slot->base_gfn, level) + 1; slot->arch.lpage_info[i] = - vzalloc(lpages * sizeof(*slot->arch.lpage_info[i])); + kvm_kvzalloc(lpages * sizeof(*slot->arch.lpage_info[i])); if (!slot->arch.lpage_info[i]) goto out_free; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index c4464356b35b..19b83f6efa49 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -535,6 +535,9 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu); void kvm_free_physmem(struct kvm *kvm); +void *kvm_kvzalloc(unsigned long size); +void kvm_kvfree(const void *addr); + #ifndef __KVM_HAVE_ARCH_VM_ALLOC static inline struct kvm *kvm_arch_alloc_vm(void) { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1148c96a4817..02cb440f802d 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -520,7 +520,7 @@ out_err_nodisable: * Avoid using vmalloc for a small buffer. * Should not be used when the size is statically known. */ -static void *kvm_kvzalloc(unsigned long size) +void *kvm_kvzalloc(unsigned long size) { if (size > PAGE_SIZE) return vzalloc(size); @@ -528,7 +528,7 @@ static void *kvm_kvzalloc(unsigned long size) return kzalloc(size, GFP_KERNEL); } -static void kvm_kvfree(const void *addr) +void kvm_kvfree(const void *addr) { if (is_vmalloc_addr(addr)) vfree(addr); -- cgit v1.2.3 From 9900b4b48b095895cf962054e45aafa49ef70f74 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 15 Jun 2012 15:07:02 -0400 Subject: KVM: use KVM_CAP_IRQ_ROUTING to protect the routing related code The KVM code sometimes uses CONFIG_HAVE_KVM_IRQCHIP to protect code that is related to IRQ routing, which not all in-kernel irqchips may support. Use KVM_CAP_IRQ_ROUTING instead. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall Signed-off-by: Avi Kivity --- include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'virt') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 27ac8a4767fa..c7f77876c9b3 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -802,7 +802,7 @@ static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_se } #endif -#ifdef CONFIG_HAVE_KVM_IRQCHIP +#ifdef KVM_CAP_IRQ_ROUTING #define KVM_MAX_IRQ_ROUTES 1024 diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 02cb440f802d..636bd08bb399 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2225,7 +2225,7 @@ static long kvm_dev_ioctl_check_extension_generic(long arg) case KVM_CAP_SIGNAL_MSI: #endif return 1; -#ifdef CONFIG_HAVE_KVM_IRQCHIP +#ifdef KVM_CAP_IRQ_ROUTING case KVM_CAP_IRQ_ROUTING: return KVM_MAX_IRQ_ROUTES; #endif -- cgit v1.2.3 From 5cfc2aabcb282f4554e7086c9893b386ad6ba9d4 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 19 Jun 2012 16:51:04 -0400 Subject: KVM: handle last_boosted_vcpu = 0 case If last_boosted_vcpu == 0, then we fall through all test cases and may end up with all VCPUs pouncing on vcpu 0. With a large enough guest, this can result in enormous runqueue lock contention, which can prevent vcpu0 from running, leading to a livelock. Changing < to <= makes sure we properly handle that case. Signed-off-by: Rik van Riel Signed-off-by: Marcelo Tosatti --- virt/kvm/kvm_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 636bd08bb399..b3ce91c623e2 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1598,7 +1598,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me) */ for (pass = 0; pass < 2 && !yielded; pass++) { kvm_for_each_vcpu(i, vcpu, kvm) { - if (!pass && i < last_boosted_vcpu) { + if (!pass && i <= last_boosted_vcpu) { i = last_boosted_vcpu; continue; } else if (pass && i > last_boosted_vcpu) -- cgit v1.2.3 From 1a577b72475d161b6677c05abe57301362023bb2 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 19 Jul 2012 13:45:20 +0300 Subject: KVM: fix race with level interrupts When more than 1 source id is in use for the same GSI, we have the following race related to handling irq_states race: CPU 0 clears bit 0. CPU 0 read irq_state as 0. CPU 1 sets level to 1. CPU 1 calls kvm_ioapic_set_irq(1). CPU 0 calls kvm_ioapic_set_irq(0). Now ioapic thinks the level is 0 but irq_state is not 0. Fix by performing all irq_states bitmap handling under pic/ioapic lock. This also removes the need for atomics with irq_states handling. Reported-by: Gleb Natapov Signed-off-by: Michael S. Tsirkin Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 15 ++++++++++++++- arch/x86/kvm/i8259.c | 17 ++++++++++++++--- virt/kvm/ioapic.c | 19 ++++++++++++++++--- virt/kvm/ioapic.h | 4 +++- virt/kvm/irq_comm.c | 31 ++++--------------------------- 5 files changed, 51 insertions(+), 35 deletions(-) (limited to 'virt') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a3e9409e90b6..2c75b400e40c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -816,7 +816,20 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); -int kvm_pic_set_irq(void *opaque, int irq, int level); +static inline int __kvm_irq_line_state(unsigned long *irq_state, + int irq_source_id, int level) +{ + /* Logical OR for level trig interrupt */ + if (level) + __set_bit(irq_source_id, irq_state); + else + __clear_bit(irq_source_id, irq_state); + + return !!(*irq_state); +} + +int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level); +void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id); void kvm_inject_nmi(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 81cf4fa4a2be..1df8fb9e1d5d 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -188,14 +188,15 @@ void kvm_pic_update_irq(struct kvm_pic *s) pic_unlock(s); } -int kvm_pic_set_irq(void *opaque, int irq, int level) +int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level) { - struct kvm_pic *s = opaque; int ret = -1; pic_lock(s); if (irq >= 0 && irq < PIC_NUM_PINS) { - ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); + int irq_level = __kvm_irq_line_state(&s->irq_states[irq], + irq_source_id, level); + ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, irq_level); pic_update_irq(s); trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, s->pics[irq >> 3].imr, ret == 0); @@ -205,6 +206,16 @@ int kvm_pic_set_irq(void *opaque, int irq, int level) return ret; } +void kvm_pic_clear_all(struct kvm_pic *s, int irq_source_id) +{ + int i; + + pic_lock(s); + for (i = 0; i < PIC_NUM_PINS; i++) + __clear_bit(irq_source_id, &s->irq_states[i]); + pic_unlock(s); +} + /* * acknowledge interrupt 'irq' */ diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 26fd54dc459e..ef61d529a6c4 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -191,7 +191,8 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq) return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe); } -int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) +int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, + int level) { u32 old_irr; u32 mask = 1 << irq; @@ -201,9 +202,11 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) spin_lock(&ioapic->lock); old_irr = ioapic->irr; if (irq >= 0 && irq < IOAPIC_NUM_PINS) { + int irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq], + irq_source_id, level); entry = ioapic->redirtbl[irq]; - level ^= entry.fields.polarity; - if (!level) + irq_level ^= entry.fields.polarity; + if (!irq_level) ioapic->irr &= ~mask; else { int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG); @@ -221,6 +224,16 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) return ret; } +void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id) +{ + int i; + + spin_lock(&ioapic->lock); + for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++) + __clear_bit(irq_source_id, &ioapic->irq_states[i]); + spin_unlock(&ioapic->lock); +} + static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector, int trigger_mode) { diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h index 32872a09b63f..a30abfe6ed16 100644 --- a/virt/kvm/ioapic.h +++ b/virt/kvm/ioapic.h @@ -74,7 +74,9 @@ void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode); bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector); int kvm_ioapic_init(struct kvm *kvm); void kvm_ioapic_destroy(struct kvm *kvm); -int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level); +int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, + int level); +void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id); void kvm_ioapic_reset(struct kvm_ioapic *ioapic); int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq); diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index a6a0365475ed..cc59c68da032 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -33,26 +33,12 @@ #include "ioapic.h" -static inline int kvm_irq_line_state(unsigned long *irq_state, - int irq_source_id, int level) -{ - /* Logical OR for level trig interrupt */ - if (level) - set_bit(irq_source_id, irq_state); - else - clear_bit(irq_source_id, irq_state); - - return !!(*irq_state); -} - static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level) { #ifdef CONFIG_X86 struct kvm_pic *pic = pic_irqchip(kvm); - level = kvm_irq_line_state(&pic->irq_states[e->irqchip.pin], - irq_source_id, level); - return kvm_pic_set_irq(pic, e->irqchip.pin, level); + return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level); #else return -1; #endif @@ -62,10 +48,7 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level) { struct kvm_ioapic *ioapic = kvm->arch.vioapic; - level = kvm_irq_line_state(&ioapic->irq_states[e->irqchip.pin], - irq_source_id, level); - - return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, level); + return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level); } inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq) @@ -249,8 +232,6 @@ unlock: void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) { - int i; - ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); mutex_lock(&kvm->irq_lock); @@ -263,14 +244,10 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) if (!irqchip_in_kernel(kvm)) goto unlock; - for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++) { - clear_bit(irq_source_id, &kvm->arch.vioapic->irq_states[i]); - if (i >= 16) - continue; + kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id); #ifdef CONFIG_X86 - clear_bit(irq_source_id, &pic_irqchip(kvm)->irq_states[i]); + kvm_pic_clear_all(pic_irqchip(kvm), irq_source_id); #endif - } unlock: mutex_unlock(&kvm->irq_lock); } -- cgit v1.2.3